From 8874bb2a0a4a8725a57faaf6bc16756a07adbbc8 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Wed, 24 May 2023 10:15:41 +0100
Subject: [PATCH 001/182] [vecz] Move to compiler module

Vecz is conceptually a compiler module. It was previously kept separate
in case we wanted to release it on its own; we no longer intend to do so.
---
 .../compiler_passes/vecz/include/vecz/pass.h  |  133 +
 .../vecz/include/vecz/vecz_choices.h          |  294 ++
 .../vecz/include/vecz/vecz_target_info.h      |  695 ++++
 .../source/analysis/control_flow_analysis.cpp |   99 +
 .../source/analysis/divergence_analysis.cpp   |  806 ++++
 .../analysis/instantiation_analysis.cpp       |  131 +
 .../source/analysis/liveness_analysis.cpp     |  252 ++
 .../analysis/packetization_analysis.cpp       |  180 +
 .../source/analysis/simd_width_analysis.cpp   |  196 +
 .../vecz/source/analysis/stride_analysis.cpp  |   88 +
 .../analysis/uniform_value_analysis.cpp       |  501 +++
 .../vectorizable_function_analysis.cpp        |  131 +
 .../analysis/vectorization_unit_analysis.cpp  |   38 +
 .../vecz/source/control_flow_boscc.cpp        | 1407 +++++++
 .../vecz/source/control_flow_roscc.cpp        |  150 +
 .../compiler_passes/vecz/source/debugging.cpp |   80 +
 .../include/analysis/control_flow_analysis.h  |   98 +
 .../include/analysis/divergence_analysis.h    |  480 +++
 .../include/analysis/instantiation_analysis.h |   36 +
 .../include/analysis/liveness_analysis.h      |  100 +
 .../include/analysis/packetization_analysis.h |  106 +
 .../include/analysis/simd_width_analysis.h    |   68 +
 .../source/include/analysis/stride_analysis.h |  127 +
 .../include/analysis/uniform_value_analysis.h |  188 +
 .../analysis/vectorizable_function_analysis.h |   72 +
 .../analysis/vectorization_unit_analysis.h    |  121 +
 .../vecz/source/include/control_flow_boscc.h  |  273 ++
 .../vecz/source/include/control_flow_roscc.h  |   56 +
 .../vecz/source/include/debugging.h           |  201 +
 .../vecz/source/include/ir_cleanup.h          |   52 +
 .../vecz/source/include/llvm_helpers.h        |   54 +
 .../vecz/source/include/memory_operations.h   |  627 ++++
 .../vecz/source/include/offset_info.h         |  261 ++
 .../vecz/source/include/reachability.h        |  116 +
 .../vecz/source/include/simd_packet.h         |  100 +
 .../transform/common_gep_elimination_pass.h   |   56 +
 .../transform/control_flow_conversion_pass.h  |  153 +
 .../inline_post_vectorization_pass.h          |   49 +
 .../include/transform/instantiation_pass.h    |  113 +
 .../interleaved_group_combine_pass.h          |   94 +
 .../include/transform/packetization_helpers.h |  220 ++
 .../include/transform/packetization_pass.h    |   77 +
 .../source/include/transform/packetizer.h     |  234 ++
 .../vecz/source/include/transform/passes.h    |  209 ++
 .../include/transform/printf_scalarizer.h     |  117 +
 .../include/transform/scalarization_pass.h    |   68 +
 .../source/include/transform/scalarizer.h     |  323 ++
 .../transform/ternary_transform_pass.h        |   49 +
 .../source/include/vectorization_context.h    |  319 ++
 .../source/include/vectorization_helpers.h    |   68 +
 .../source/include/vectorization_heuristics.h |   43 +
 .../vecz/source/include/vectorization_unit.h  |  259 ++
 .../vecz/source/include/vectorizer.h          |   74 +
 .../vecz/source/include/vecz_pass_builder.h   |   68 +
 .../vecz/source/ir_cleanup.cpp                |  192 +
 .../vecz/source/llvm_helpers.cpp              |   73 +
 .../vecz/source/memory_operations.cpp         | 1002 +++++
 .../vecz/source/offset_info.cpp               | 1058 ++++++
 .../compiler_passes/vecz/source/pass.cpp      |  253 ++
 .../compiler_passes/vecz/source/passes.def    |   51 +
 .../vecz/source/reachability.cpp              |  281 ++
 .../vecz/source/simd_packet.cpp               |   55 +
 .../source/transform/basic_mem2reg_pass.cpp   |  248 ++
 .../transform/builtin_inlining_pass.cpp       |  325 ++
 .../transform/common_gep_elimination_pass.cpp |  112 +
 .../control_flow_conversion_pass.cpp          | 3135 ++++++++++++++++
 .../inline_post_vectorization_pass.cpp        |  141 +
 .../source/transform/instantiation_pass.cpp   |  350 ++
 .../interleaved_group_combine_pass.cpp        |  548 +++
 .../transform/loop_rotate_custom_pass.cpp     |   39 +
 .../transform/packetization_helpers.cpp       |  662 ++++
 .../source/transform/packetization_pass.cpp   |   81 +
 .../vecz/source/transform/packetizer.cpp      | 3283 +++++++++++++++++
 .../vecz/source/transform/passes.cpp          |  176 +
 .../source/transform/pre_linearize_pass.cpp   |  351 ++
 .../source/transform/printf_scalarizer.cpp    |  392 ++
 .../source/transform/remove_intptr_pass.cpp   |  127 +
 .../source/transform/scalarization_pass.cpp   |  284 ++
 .../vecz/source/transform/scalarizer.cpp      | 1616 ++++++++
 .../transform/simplify_infinite_loop_pass.cpp |  142 +
 .../transform/squash_small_vectors_pass.cpp   |  277 ++
 .../transform/ternary_transform_pass.cpp      |  234 ++
 .../transform/uniform_reassociation_pass.cpp  |  355 ++
 .../vecz/source/vector_target_info.cpp        | 1379 +++++++
 .../vecz/source/vector_target_info_arm.cpp    |  408 ++
 .../vecz/source/vector_target_info_riscv.cpp  |  752 ++++
 .../vecz/source/vectorization_choices.cpp     |  172 +
 .../vecz/source/vectorization_context.cpp     |  894 +++++
 .../vecz/source/vectorization_helpers.cpp     |  395 ++
 .../vecz/source/vectorization_heuristics.cpp  |  390 ++
 .../vecz/source/vectorization_unit.cpp        |  175 +
 .../vecz/source/vectorizer.cpp                |  364 ++
 .../vecz/source/vecz_pass_builder.cpp         |  274 ++
 .../vecz/test/lit/llvm/AArch64/lit.local.cfg  |   18 +
 .../llvm/AArch64/shuffled_load_aarch64_1.ll   |   55 +
 .../llvm/AArch64/shuffled_load_aarch64_2.ll   |   56 +
 .../llvm/AArch64/shuffled_load_aarch64_3.ll   |   57 +
 .../llvm/AArch64/shuffled_load_aarch64_4.ll   |   57 +
 .../llvm/AArch64/shuffled_load_aarch64_5.ll   |   69 +
 .../llvm/AArch64/shuffled_load_aarch64_6.ll   |   59 +
 .../vecz/test/lit/llvm/Boscc/boscc_killer.ll  |  150 +
 .../vecz/test/lit/llvm/Boscc/boscc_merge.ll   |  298 ++
 .../vecz/test/lit/llvm/Boscc/boscc_merge2.ll  |  173 +
 .../vecz/test/lit/llvm/Boscc/boscc_merge3.ll  |  130 +
 .../lit/llvm/Boscc/duplicate_preheader.ll     |  134 +
 .../vecz/test/lit/llvm/Boscc/nested_loops1.ll |  198 +
 .../vecz/test/lit/llvm/Boscc/nested_loops2.ll |  140 +
 .../vecz/test/lit/llvm/Boscc/nested_loops3.ll |  149 +
 .../vecz/test/lit/llvm/Boscc/nested_loops4.ll |  190 +
 .../vecz/test/lit/llvm/Boscc/nested_loops5.ll |  117 +
 .../lit/llvm/Boscc/partial_linearization0.ll  |  436 +++
 .../lit/llvm/Boscc/partial_linearization1.ll  |  320 ++
 .../lit/llvm/Boscc/partial_linearization10.ll |  568 +++
 .../lit/llvm/Boscc/partial_linearization11.ll |  425 +++
 .../lit/llvm/Boscc/partial_linearization12.ll |  782 ++++
 .../lit/llvm/Boscc/partial_linearization13.ll |  247 ++
 .../lit/llvm/Boscc/partial_linearization14.ll |  356 ++
 .../lit/llvm/Boscc/partial_linearization15.ll |  415 +++
 .../lit/llvm/Boscc/partial_linearization16.ll |  394 ++
 .../lit/llvm/Boscc/partial_linearization17.ll |  468 +++
 .../lit/llvm/Boscc/partial_linearization18.ll |  357 ++
 .../lit/llvm/Boscc/partial_linearization19.ll |  379 ++
 .../lit/llvm/Boscc/partial_linearization2.ll  |  340 ++
 .../lit/llvm/Boscc/partial_linearization20.ll |  288 ++
 .../lit/llvm/Boscc/partial_linearization21.ll |  239 ++
 .../lit/llvm/Boscc/partial_linearization22.ll |  348 ++
 .../lit/llvm/Boscc/partial_linearization3.ll  |  332 ++
 .../lit/llvm/Boscc/partial_linearization4.ll  |  219 ++
 .../lit/llvm/Boscc/partial_linearization5.ll  |  264 ++
 .../lit/llvm/Boscc/partial_linearization6.ll  |  228 ++
 .../lit/llvm/Boscc/partial_linearization7.ll  |  262 ++
 .../lit/llvm/Boscc/partial_linearization8.ll  |  220 ++
 .../lit/llvm/Boscc/partial_linearization9.ll  |  173 +
 .../vecz/test/lit/llvm/Boscc/printf.ll        |  125 +
 .../lit/llvm/Boscc/scalable_linearization.ll  |   25 +
 .../lit/llvm/OpaquePointers/basic_mem2reg.ll  |   63 +
 .../llvm/OpaquePointers/basic_vecz_mem2reg.ll |   73 +
 .../OpaquePointers/builtin_inlining_mem.ll    |  122 +
 .../OpaquePointers/builtin_pointer_return.ll  |   66 +
 .../control_flow_conversion_ptrs.ll           |   52 +
 .../OpaquePointers/interleaved_load_ooo.ll    |   57 +
 .../lit/llvm/OpaquePointers/load_add_store.ll |   45 +
 .../lit/llvm/OpaquePointers/masked_store.ll   |   82 +
 .../lit/llvm/OpaquePointers/remove_intptr.ll  |   54 +
 .../llvm/OpaquePointers/ternary_transform.ll  |  118 +
 .../define_interleaved_store.ll               |   82 +
 .../define_interleaved_store_as_masked.ll     |   82 +
 .../vector_phi_uniform.ll                     |   87 +
 .../vector_phi_varying.ll                     |   94 +
 .../test/lit/llvm/RISCV/broadcast_vector.ll   |  209 ++
 .../lit/llvm/RISCV/define_subgroup_scans.ll   |  183 +
 .../llvm/RISCV/define_subgroup_scans_vp.ll    |  184 +
 .../test/lit/llvm/RISCV/extract_element.ll    |  169 +
 .../test/lit/llvm/RISCV/insert_element.ll     |  137 +
 .../vecz/test/lit/llvm/RISCV/lit.local.cfg    |   23 +
 .../test/lit/llvm/RISCV/packetize_shuffle.ll  |   43 +
 .../lit/llvm/RISCV/packetize_shuffle_bool.ll  |   50 +
 .../llvm/RISCV/packetize_shuffle_concat.ll    |   50 +
 .../llvm/RISCV/packetize_shuffle_narrow.ll    |   44 +
 .../lit/llvm/RISCV/packetize_shuffle_wider.ll |   44 +
 .../lit/llvm/RISCV/select_scalar_vector.ll    |   50 +
 .../vecz/test/lit/llvm/RISCV/vp_memops.ll     |  100 +
 .../vecz/test/lit/llvm/RISCV/vp_vsetvli.ll    |   48 +
 .../llvm/ScalableVectors/broadcast_vector.ll  |  181 +
 .../test/lit/llvm/ScalableVectors/builtins.ll |   40 +
 .../test/lit/llvm/ScalableVectors/cast.ll     |   35 +
 .../define_interleaved_store.ll               |   65 +
 .../define_interleaved_store_as_masked.ll     |   65 +
 .../ScalableVectors/define_masked_load.ll     |   70 +
 .../define_masked_scatter_gather.ll           |   90 +
 .../ScalableVectors/define_subgroup_scans.ll  |  183 +
 .../define_subgroup_scans_vp.ll               |  187 +
 .../llvm/ScalableVectors/extract_element.ll   |  145 +
 .../test/lit/llvm/ScalableVectors/fadd.ll     |   40 +
 .../lit/llvm/ScalableVectors/fail_builtins.ll |   37 +
 .../llvm/ScalableVectors/insert_element.ll    |  121 +
 .../llvm/ScalableVectors/interleaved_load.ll  |   60 +
 .../lit/llvm/ScalableVectors/intrinsics.ll    |  196 +
 .../lit/llvm/ScalableVectors/lit.local.cfg    |   18 +
 .../llvm/ScalableVectors/load_add_store.ll    |   40 +
 .../llvm/ScalableVectors/load_binops_store.ll |   47 +
 .../test/lit/llvm/ScalableVectors/metadata.ll |   41 +
 .../ScalableVectors/packetize_mask_varying.ll |   56 +
 .../lit/llvm/ScalableVectors/scalable_auto.ll |   37 +
 .../test/lit/llvm/ScalableVectors/select.ll   |   67 +
 .../ScalableVectors/select_scalar_vector.ll   |   57 +
 .../test/lit/llvm/ScalableVectors/shuffle.ll  |   62 +
 .../llvm/ScalableVectors/subgroup_builtins.ll |   69 +
 .../llvm/ScalableVectors/subgroup_scans.ll    |  154 +
 ...cans_spv_khr_uniform_group_instructions.ll |  175 +
 ...s_spv_khr_uniform_group_instructions_vp.ll |  175 +
 .../llvm/ScalableVectors/subgroup_scans_vp.ll |  154 +
 .../test/lit/llvm/ScalableVectors/vectors.ll  |   41 +
 .../ScalableVectors/verification_fail_phi.ll  |   49 +
 .../lit/llvm/ScalableVectors/widen_vload.ll   |   35 +
 .../llvm/ScalableVectors/workitem_funcs.ll    |   41 +
 .../llvm/VectorPredication/boscc_reduction.ll |   51 +
 .../test/lit/llvm/VectorPredication/choice.ll |   34 +
 .../compute_vector_length.ll                  |   54 +
 .../define_interleaved_load_store.ll          |   80 +
 .../define_masked_load_store.ll               |   76 +
 .../define_masked_scatter_gather.ll           |   90 +
 .../define_subgroup_scans.ll                  |  184 +
 .../llvm/VectorPredication/load_add_store.ll  |  105 +
 .../packetize_mask_varying.ll                 |   53 +
 .../llvm/VectorPredication/scatter_gather.ll  |   65 +
 .../VectorPredication/subgroup_reductions.ll  |  266 ++
 ...ions_spv_khr_uniform_group_instructions.ll |  218 ++
 .../llvm/VectorPredication/subgroup_scans.ll  |  157 +
 ...cans_spv_khr_uniform_group_instructions.ll |  178 +
 .../test/lit/llvm/VectorPredication/udiv.ll   |   50 +
 .../VectorWidening/define_interleaved_load.ll |   66 +
 .../define_interleaved_load_as_masked.ll      |   66 +
 .../VectorWidening/delete_packetized_memop.ll |   70 +
 .../extractelement_constant_index.ll          |   40 +
 .../extractelement_runtime_index.ll           |   55 +
 .../extractelement_runtime_index2.ll          |   56 +
 .../extractelement_runtime_index3.ll          |   62 +
 .../insertelement_constant_index.ll           |   57 +
 ...rtelement_constant_index_constant_value.ll |   54 +
 .../insertelement_runtime_index.ll            |   60 +
 .../llvm/VectorWidening/interleaved_safety.ll |   98 +
 .../onearg_relationals_isfiniteDv4_d.ll       |   41 +
 .../onearg_relationals_isfiniteDv4_f.ll       |   40 +
 .../onearg_relationals_isinfDv4_d.ll          |   40 +
 .../onearg_relationals_isinfDv4_f.ll          |   40 +
 .../onearg_relationals_isnanDv4_d.ll          |   43 +
 .../onearg_relationals_isnanDv4_f.ll          |   43 +
 .../onearg_relationals_isnormalDv4_d.ll       |   42 +
 .../onearg_relationals_isnormalDv4_f.ll       |   42 +
 .../llvm/VectorWidening/scalar_vector_user.ll |   79 +
 .../lit/llvm/VectorWidening/vector_copy.ll    |   47 +
 .../llvm/VectorWidening/vector_phi_varying.ll |   91 +
 .../test/lit/llvm/VectorWidening/widen_abs.ll |   68 +
 .../lit/llvm/VectorWidening/widen_binops.ll   |   57 +
 .../lit/llvm/VectorWidening/widen_copysign.ll |   76 +
 .../test/lit/llvm/VectorWidening/widen_fma.ll |   57 +
 .../widen_fmin_vector_scalar.ll               |   64 +
 .../lit/llvm/VectorWidening/widen_fmuladd.ll  |   57 +
 .../lit/llvm/VectorWidening/widen_fmuladd2.ll |   91 +
 .../llvm/VectorWidening/widen_fmuladd_phi.ll  |   74 +
 .../lit/llvm/VectorWidening/widen_fshl.ll     |   48 +
 .../lit/llvm/VectorWidening/widen_fshr.ll     |   48 +
 .../VectorWidening/widen_shufflevector.ll     |   43 +
 .../lit/llvm/VectorWidening/widen_sqrt.ll     |   53 +
 .../vecz/test/lit/llvm/alloca_alias.ll        |   69 +
 .../vecz/test/lit/llvm/arm_neon_store.ll      |   65 +
 .../lit/llvm/async_workgroup_copy_uniform.ll  |   60 +
 .../vecz/test/lit/llvm/atomic_cmpxchg.ll      |   83 +
 .../vecz/test/lit/llvm/atomicrmw.ll           |   83 +
 .../vecz/test/lit/llvm/atomicrmw_uniform.ll   |   81 +
 .../vecz/test/lit/llvm/basic_mem2reg.ll       |   64 +
 .../vecz/test/lit/llvm/bitcast_function.ll    |   79 +
 .../test/lit/llvm/branch_splitting_and.ll     |   70 +
 .../vecz/test/lit/llvm/branch_splitting_or.ll |   70 +
 .../test/lit/llvm/builtin_inlining_addsat.ll  |  118 +
 .../test/lit/llvm/builtin_inlining_clamp.ll   |   41 +
 .../test/lit/llvm/builtin_inlining_fmax.ll    |   53 +
 .../test/lit/llvm/builtin_inlining_fmin.ll    |   53 +
 .../test/lit/llvm/builtin_inlining_mem.ll     |   77 +
 .../lit/llvm/builtin_inlining_negative.ll     |   53 +
 .../lit/llvm/builtin_inlining_positive.ll     |   67 +
 .../test/lit/llvm/builtin_pointer_return.ll   |   66 +
 ...all_instantiation_failure_cantduplicate.ll |  128 +
 .../call_instantiation_failure_cantinline.ll  |  128 +
 .../call_instantiation_failure_optnone.ll     |  128 +
 ...ll_instantiation_failure_user_undefined.ll |  128 +
 .../call_instantiation_success_builtin.ll     |  129 +
 .../call_instantiation_success_instrinsic.ll  |  129 +
 ...call_instantiation_success_user_defined.ll |  129 +
 .../vecz/test/lit/llvm/constant_address.ll    |   58 +
 .../lit/llvm/constant_address_with_uniform.ll |   41 +
 .../vecz/test/lit/llvm/contiguous_allocas.ll  |   74 +
 .../control_flow_conversion_nested_loops.ll   |  208 ++
 .../llvm/control_flow_conversion_order_y.ll   |  208 ++
 .../llvm/control_flow_conversion_order_z.ll   |  208 ++
 .../lit/llvm/control_flow_conversion_ptrs.ll  |   73 +
 .../control_flow_conversion_uniform_if.ll     |  168 +
 .../control_flow_conversion_uniform_loop.ll   |  176 +
 .../control_flow_conversion_varying_if.ll     |  166 +
 .../control_flow_conversion_varying_loop.ll   |  185 +
 .../vecz/test/lit/llvm/convert3.ll            |   65 +
 .../vecz/test/lit/llvm/convert4.ll            |   61 +
 .../vecz/test/lit/llvm/convert_contiguity.ll  |   47 +
 .../vecz/test/lit/llvm/define_gather_load.ll  |   43 +
 .../lit/llvm/define_gather_load_as_masked.ll  |   43 +
 .../test/lit/llvm/define_interleaved_load.ll  |   62 +
 .../llvm/define_interleaved_load_as_masked.ll |   79 +
 .../test/lit/llvm/define_interleaved_store.ll |   63 +
 .../define_interleaved_store_as_masked.ll     |   80 +
 .../test/lit/llvm/define_internal_builtins.ll |   32 +
 .../lit/llvm/define_masked_gather_load.ll     |   83 +
 .../vecz/test/lit/llvm/define_masked_load.ll  |   90 +
 .../lit/llvm/define_masked_scatter_store.ll   |   85 +
 .../vecz/test/lit/llvm/define_masked_store.ll |   90 +
 .../test/lit/llvm/define_scatter_store.ll     |   44 +
 .../llvm/define_scatter_store_as_masked.ll    |   44 +
 .../test/lit/llvm/define_subgroup_scans.ll    |   49 +
 .../test/lit/llvm/delete_packetized_memop.ll  |   77 +
 .../vecz/test/lit/llvm/diverging_loop.ll      |   49 +
 .../test/lit/llvm/diverging_nested_loop.ll    |   64 +
 .../vecz/test/lit/llvm/early-cse-mul-swap.ll  |   78 +
 .../vecz/test/lit/llvm/emit_memintrinsics.ll  |  195 +
 .../llvm/emit_no_unaligned_memintrinsics.ll   |   91 +
 .../vecz/test/lit/llvm/expect_assume.ll       |   87 +
 .../lit/llvm/extractelement_constant_index.ll |   40 +
 .../lit/llvm/extractelement_runtime_index.ll  |   50 +
 .../vecz/test/lit/llvm/gep_duplication.ll     |   75 +
 .../vecz/test/lit/llvm/gep_elim_opaque.ll     |   54 +
 .../vecz/test/lit/llvm/indirect_call.ll       |   30 +
 .../lit/llvm/inlined_function_debug_info.ll   |  141 +
 .../lit/llvm/insert_element_debug_info.ll     |  142 +
 .../lit/llvm/insertelement_constant_index.ll  |   48 +
 .../lit/llvm/insertelement_runtime_index.ll   |   56 +
 .../test/lit/llvm/instantiate_constants.ll    |   95 +
 .../llvm/interleaved_defuse_instantiated.ll   |   90 +
 .../vecz/test/lit/llvm/interleaved_load16.ll  |   87 +
 .../test/lit/llvm/interleaved_load_ooo.ll     |   58 +
 .../vecz/test/lit/llvm/interleaved_safety.ll  |   95 +
 .../test/lit/llvm/intrinsics-scalarize.ll     |  207 ++
 .../vecz/test/lit/llvm/intrinsics.ll          |  200 +
 .../vecz/test/lit/llvm/irreducible_loop.ll    |   68 +
 .../test/lit/llvm/loop_call_instantiation.ll  |   63 +
 .../test/lit/llvm/masked_calls_max_builtin.ll |   81 +
 .../vecz/test/lit/llvm/masked_interleaved.ll  |   74 +
 .../lit/llvm/masked_interleaved_as_scatter.ll |   75 +
 .../test/lit/llvm/masked_interleaved_group.ll |   99 +
 .../lit/llvm/masked_interleaved_group2.ll     |  118 +
 .../vecz/test/lit/llvm/masking_exit_blocks.ll |   70 +
 .../test/lit/llvm/mem2reg_alloca_pointer1.ll  |   72 +
 .../test/lit/llvm/mem2reg_alloca_pointer2.ll  |   72 +
 .../vecz/test/lit/llvm/memop_stride.ll        |   34 +
 .../vecz/test/lit/llvm/memop_stride10.ll      |   38 +
 .../vecz/test/lit/llvm/memop_stride11.ll      |   38 +
 .../vecz/test/lit/llvm/memop_stride12.ll      |   38 +
 .../vecz/test/lit/llvm/memop_stride13.ll      |   38 +
 .../vecz/test/lit/llvm/memop_stride14.ll      |   35 +
 .../vecz/test/lit/llvm/memop_stride15.ll      |   38 +
 .../vecz/test/lit/llvm/memop_stride16.ll      |   36 +
 .../vecz/test/lit/llvm/memop_stride17.ll      |   40 +
 .../vecz/test/lit/llvm/memop_stride18.ll      |   37 +
 .../vecz/test/lit/llvm/memop_stride2.ll       |   35 +
 .../vecz/test/lit/llvm/memop_stride3.ll       |   35 +
 .../vecz/test/lit/llvm/memop_stride4.ll       |   37 +
 .../vecz/test/lit/llvm/memop_stride5.ll       |   35 +
 .../vecz/test/lit/llvm/memop_stride6.ll       |   35 +
 .../vecz/test/lit/llvm/memop_stride7.ll       |   35 +
 .../vecz/test/lit/llvm/memop_stride8.ll       |   36 +
 .../vecz/test/lit/llvm/memop_stride9.ll       |   38 +
 .../test/lit/llvm/multiple_exit_blocks.ll     |   63 +
 .../lit/llvm/multiple_kernels_inlining.ll     |   51 +
 .../lit/llvm/multiple_vectorization_flags.ll  |   44 +
 .../test/lit/llvm/multiple_vectorizations.ll  |  133 +
 .../llvm/multiple_vectorizations_nested.ll    |   50 +
 .../lit/llvm/multiple_vectorizations_vp.ll    |   39 +
 .../test/lit/llvm/no_instantiate_memop.ll     |   66 +
 .../test/lit/llvm/no_over_scalarization.ll    |   68 +
 .../test/lit/llvm/no_redundant_bitcasts.ll    |   93 +
 .../vecz/test/lit/llvm/no_vecz1.ll            |   43 +
 .../vecz/test/lit/llvm/no_vecz2.ll            |   57 +
 .../test/lit/llvm/offset_info_analysis.ll     |   55 +
 .../llvm/onearg_relationals_isfiniteDv4_d.ll  |   50 +
 .../llvm/onearg_relationals_isfiniteDv4_f.ll  |   49 +
 .../lit/llvm/onearg_relationals_isfinited.ll  |  268 ++
 .../lit/llvm/onearg_relationals_isfinitef.ll  |  268 ++
 .../lit/llvm/onearg_relationals_isinfDv4_d.ll |   49 +
 .../lit/llvm/onearg_relationals_isinfDv4_f.ll |   49 +
 .../lit/llvm/onearg_relationals_isinfd.ll     |  268 ++
 .../lit/llvm/onearg_relationals_isinff.ll     |  268 ++
 .../lit/llvm/onearg_relationals_isnanDv4_d.ll |   61 +
 .../lit/llvm/onearg_relationals_isnanDv4_f.ll |   61 +
 .../lit/llvm/onearg_relationals_isnand.ll     |  271 ++
 .../lit/llvm/onearg_relationals_isnanf.ll     |  271 ++
 .../llvm/onearg_relationals_isnormalDv4_d.ll  |   53 +
 .../llvm/onearg_relationals_isnormalDv4_f.ll  |   53 +
 .../lit/llvm/onearg_relationals_isnormald.ll  |  269 ++
 .../lit/llvm/onearg_relationals_isnormalf.ll  |  269 ++
 .../vecz/test/lit/llvm/opencl_metadata1.ll    |   77 +
 .../vecz/test/lit/llvm/opencl_metadata2.ll    |   76 +
 .../vecz/test/lit/llvm/overaligned_allocas.ll |   80 +
 .../test/lit/llvm/packetization_branch.ll     |   66 +
 .../test/lit/llvm/packetization_debug_info.ll |  167 +
 .../test/lit/llvm/packetization_nonvarying.ll |   93 +
 .../lit/llvm/packetization_uniform_branch.ll  |  105 +
 .../test/lit/llvm/packetize_struct_gep.ll     |   46 +
 .../lit/llvm/packetize_uniform_conditional.ll |  159 +
 .../packetize_uniform_default_conditional.ll  |  159 +
 .../packetize_uniform_default_noreduce.ll     |  159 +
 .../packetize_uniform_default_noreduce2.ll    |   73 +
 .../llvm/packetize_uniform_default_reduce.ll  |  165 +
 .../packetize_uniform_loops_conditional.ll    |  160 +
 .../llvm/packetize_uniform_loops_noreduce.ll  |  159 +
 .../llvm/packetize_uniform_loops_noreduce2.ll |   73 +
 .../llvm/packetize_uniform_loops_reduce.ll    |   76 +
 .../lit/llvm/packetize_uniform_noreduce.ll    |  159 +
 .../lit/llvm/packetize_uniform_noreduce2.ll   |   73 +
 .../test/lit/llvm/packetize_uniform_reduce.ll |   76 +
 .../test/lit/llvm/partial_linearization0.ll   |  377 ++
 .../test/lit/llvm/partial_linearization1.ll   |  261 ++
 .../test/lit/llvm/partial_linearization10.ll  |  465 +++
 .../test/lit/llvm/partial_linearization11.ll  |  357 ++
 .../test/lit/llvm/partial_linearization12.ll  |  627 ++++
 .../test/lit/llvm/partial_linearization13.ll  |  218 ++
 .../test/lit/llvm/partial_linearization14.ll  |  292 ++
 .../test/lit/llvm/partial_linearization15.ll  |  385 ++
 .../test/lit/llvm/partial_linearization16.ll  |  319 ++
 .../test/lit/llvm/partial_linearization17.ll  |  376 ++
 .../test/lit/llvm/partial_linearization18.ll  |  289 ++
 .../test/lit/llvm/partial_linearization19.ll  |  308 ++
 .../test/lit/llvm/partial_linearization2.ll   |  274 ++
 .../test/lit/llvm/partial_linearization20.ll  |  236 ++
 .../test/lit/llvm/partial_linearization21.ll  |  197 +
 .../test/lit/llvm/partial_linearization22.ll  |  264 ++
 .../test/lit/llvm/partial_linearization23.ll  |  247 ++
 .../test/lit/llvm/partial_linearization3.ll   |  269 ++
 .../test/lit/llvm/partial_linearization4.ll   |  195 +
 .../test/lit/llvm/partial_linearization5.ll   |  221 ++
 .../test/lit/llvm/partial_linearization6.ll   |  200 +
 .../test/lit/llvm/partial_linearization7.ll   |  228 ++
 .../test/lit/llvm/partial_linearization8.ll   |  191 +
 .../test/lit/llvm/partial_linearization9.ll   |  148 +
 .../llvm/partial_linearization_exit_masks.ll  |   65 +
 .../vecz/test/lit/llvm/pass_pipeline.ll       |   48 +
 .../test/lit/llvm/pass_pipeline_printafter.ll |   45 +
 .../vecz/test/lit/llvm/phi_interleaved.ll     |   89 +
 .../vecz/test/lit/llvm/phi_node_debug_info.ll |  135 +
 .../vecz/test/lit/llvm/phi_scatter_gather.ll  |   65 +
 .../test/lit/llvm/phi_scatter_gather_2.ll     |   60 +
 .../test/lit/llvm/predicate_with_switch.ll    |   61 +
 .../vecz/test/lit/llvm/preserve-fast-math.ll  |   35 +
 .../vecz/test/lit/llvm/printf_float.ll        |   88 +
 .../vecz/test/lit/llvm/regression_by_all.ll   |  126 +
 .../vecz/test/lit/llvm/remove_intptr.ll       |   52 +
 .../vecz/test/lit/llvm/remove_intptr_2.ll     |   42 +
 .../vecz/test/lit/llvm/remove_intptr_phi.ll   |   52 +
 .../vecz/test/lit/llvm/roscc_simplify.ll      |   53 +
 .../scalar_load_store_in_varying_branch.ll    |   51 +
 .../vecz/test/lit/llvm/scalar_splat.ll        |   38 +
 ...plat_after_load_store_in_varying_branch.ll |   48 +
 .../llvm/scalar_splat_after_varying_branch.ll |   45 +
 .../llvm/scalar_splat_in_varying_branch.ll    |   55 +
 .../vecz/test/lit/llvm/scalar_vector_user.ll  |   78 +
 .../vecz/test/lit/llvm/scalarization_calls.ll |   86 +
 .../lit/llvm/scalarization_calls_uniform.ll   |   47 +
 .../test/lit/llvm/scalarization_debug_info.ll |  183 +
 .../lit/llvm/scalarization_instructions.ll    |  142 +
 .../scalarization_instructions_uniform.ll     |   67 +
 .../llvm/scalarization_masked_load_store.ll   |   56 +
 .../vecz/test/lit/llvm/scalarize-gather.ll    |   55 +
 .../vecz/test/lit/llvm/scalarize-splat.ll     |   49 +
 .../vecz/test/lit/llvm/scalarize_mixed_gep.ll |   46 +
 .../vecz/test/lit/llvm/scan_fact.ll           |  197 +
 .../lit/llvm/secretly_scalar_load_store.ll    |   49 +
 .../vecz/test/lit/llvm/select-no-crash.ll     |   93 +
 .../vecz/test/lit/llvm/shuffled_load_1.ll     |   54 +
 .../vecz/test/lit/llvm/shuffled_load_2.ll     |   55 +
 .../vecz/test/lit/llvm/shuffled_load_3.ll     |   56 +
 .../vecz/test/lit/llvm/shuffled_load_4.ll     |   56 +
 .../vecz/test/lit/llvm/shuffled_load_5.ll     |   70 +
 .../vecz/test/lit/llvm/shuffled_load_6.ll     |   58 +
 .../vecz/test/lit/llvm/squash_extract_sext.ll |   69 +
 .../lit/llvm/squash_extract_sext_bigendian.ll |   69 +
 .../vecz/test/lit/llvm/squash_extract_zext.ll |   69 +
 .../lit/llvm/squash_extract_zext_bigendian.ll |   69 +
 .../test/lit/llvm/squash_float2_gather.ll     |   56 +
 .../vecz/test/lit/llvm/stride_aligned.ll      |   76 +
 .../lit/llvm/stride_aligned_scalarized.ll     |   63 +
 .../vecz/test/lit/llvm/stride_misaligned.ll   |   64 +
 .../lit/llvm/stride_misaligned_scalarized.ll  |   63 +
 .../vecz/test/lit/llvm/struct_phi.ll          |  107 +
 .../vecz/test/lit/llvm/struct_select.ll       |   49 +
 .../vecz/test/lit/llvm/subgroup_builtins.ll   |   88 +
 .../vecz/test/lit/llvm/subgroup_reductions.ll |  246 ++
 ...ions_spv_khr_uniform_group_instructions.ll |  194 +
 .../vecz/test/lit/llvm/subgroup_scans.ll      |  162 +
 ...cans_spv_khr_uniform_group_instructions.ll |  175 +
 .../ternary_transform_different_strides.ll    |   54 +
 .../llvm/ternary_transform_divergent_gep.ll   |   54 +
 .../ternary_transform_divergent_source.ll     |   54 +
 .../lit/llvm/ternary_transform_negative.ll    |   44 +
 .../lit/llvm/ternary_transform_positive.ll    |   54 +
 ...ary_transform_uniform_cond_diff_strides.ll |   54 +
 .../ternary_transform_uniform_condition.ll    |   52 +
 ..._transform_uniform_condition_packetized.ll |   46 +
 .../llvm/ternary_transform_uniform_source.ll  |   52 +
 .../llvm/ternary_transform_uniform_sources.ll |   52 +
 .../llvm/too_large_simdwidth_packetization.ll |  117 +
 .../llvm/too_large_simdwidth_scalarization.ll |   41 +
 .../vecz/test/lit/llvm/undef_debug_info.ll    |  120 +
 .../vecz/test/lit/llvm/undef_ub.ll            |   47 +
 .../test/lit/llvm/uniform_address_base.ll     |   56 +
 .../test/lit/llvm/uniform_address_index.ll    |   56 +
 .../vecz/test/lit/llvm/uniform_loop.ll        |   45 +
 .../lit/llvm/uniform_loop_contiguous_phi1.ll  |   49 +
 .../lit/llvm/uniform_loop_contiguous_phi2.ll  |   50 +
 .../lit/llvm/uniform_loop_contiguous_phi3.ll  |   51 +
 .../lit/llvm/uniform_loop_contiguous_phi4.ll  |   51 +
 .../test/lit/llvm/uniform_loop_metadata.ll    |   50 +
 .../test/lit/llvm/uniform_reassociation1.ll   |   58 +
 .../test/lit/llvm/uniform_reassociation2.ll   |   59 +
 .../test/lit/llvm/uniform_reassociation3.ll   |   59 +
 .../test/lit/llvm/unmangled_builtin_call.ll   |   67 +
 .../vecz/test/lit/llvm/user_calls.ll          |  113 +
 .../vecz/test/lit/llvm/varying_load1.ll       |   86 +
 .../vecz/test/lit/llvm/varying_load2.ll       |   89 +
 .../llvm/vector_intrinsics_scalarization.ll   |   80 +
 .../vecz/test/lit/llvm/vector_phi_uniform.ll  |   87 +
 .../vecz/test/lit/llvm/vector_phi_varying.ll  |   97 +
 .../vecz/test/lit/llvm/vector_printf.ll       |   92 +
 .../vecz/test/lit/llvm/vector_printf32.ll     |   92 +
 .../vecz/test/lit/llvm/vector_printf64.ll     |   92 +
 .../test/lit/llvm/vector_printf_floats.ll     |  102 +
 .../vector_printf_floats_no_double_support.ll |  100 +
 .../vecz/test/lit/llvm/vecz_blend_div_loop.ll |  154 +
 .../test/lit/llvm/vecz_scalar_gather_load.ll  |  111 +
 .../lit/llvm/vecz_scalar_interleaved_load.ll  |   83 +
 .../vecz/test/lit/llvm/workitem_builtins.ll   |  105 +
 .../vecz/tools/source/veczc.cpp               |  446 +++
 518 files changed, 79698 insertions(+)
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/pass.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_choices.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_target_info.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/control_flow_analysis.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/divergence_analysis.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/instantiation_analysis.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/liveness_analysis.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/packetization_analysis.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/simd_width_analysis.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/stride_analysis.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorizable_function_analysis.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorization_unit_analysis.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_boscc.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_roscc.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/debugging.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/control_flow_analysis.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/divergence_analysis.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/instantiation_analysis.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/liveness_analysis.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/packetization_analysis.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/simd_width_analysis.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/stride_analysis.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/uniform_value_analysis.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorizable_function_analysis.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorization_unit_analysis.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/control_flow_boscc.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/control_flow_roscc.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/debugging.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/ir_cleanup.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/llvm_helpers.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/memory_operations.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/offset_info.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/reachability.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/simd_packet.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/common_gep_elimination_pass.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/control_flow_conversion_pass.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/inline_post_vectorization_pass.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/instantiation_pass.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/interleaved_group_combine_pass.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_helpers.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_pass.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetizer.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/passes.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/printf_scalarizer.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/scalarization_pass.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/scalarizer.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/ternary_transform_pass.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_context.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_helpers.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_heuristics.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_unit.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorizer.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vecz_pass_builder.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/ir_cleanup.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/llvm_helpers.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/memory_operations.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/passes.def
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/reachability.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/simd_packet.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/builtin_inlining_pass.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/common_gep_elimination_pass.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/inline_post_vectorization_pass.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/instantiation_pass.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/interleaved_group_combine_pass.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/loop_rotate_custom_pass.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_pass.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/passes.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/pre_linearize_pass.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/printf_scalarizer.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/remove_intptr_pass.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarization_pass.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/simplify_infinite_loop_pass.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/squash_small_vectors_pass.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/ternary_transform_pass.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/uniform_reassociation_pass.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_arm.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_choices.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_helpers.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_heuristics.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_unit.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorizer.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vecz_pass_builder.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/lit.local.cfg
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_1.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_2.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_3.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_4.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_5.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_6.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_killer.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge2.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge3.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/duplicate_preheader.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops1.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops2.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops3.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops4.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops5.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization0.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization1.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization10.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization11.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization12.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization13.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization14.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization15.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization16.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization17.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization18.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization19.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization2.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization20.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization21.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization22.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization3.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization4.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization5.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization6.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization7.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization8.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization9.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/printf.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/scalable_linearization.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/basic_mem2reg.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/basic_vecz_mem2reg.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/builtin_inlining_mem.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/builtin_pointer_return.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/control_flow_conversion_ptrs.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/interleaved_load_ooo.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/load_add_store.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/masked_store.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/remove_intptr.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/ternary_transform.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store_as_masked.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_uniform.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_varying.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/broadcast_vector.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans_vp.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/extract_element.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/insert_element.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/lit.local.cfg
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_bool.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_concat.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_narrow.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_wider.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/select_scalar_vector.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_memops.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_vsetvli.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/builtins.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/cast.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store_as_masked.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_load.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_scatter_gather.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans_vp.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/extract_element.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/fadd.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/fail_builtins.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/insert_element.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/interleaved_load.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/intrinsics.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/lit.local.cfg
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/load_add_store.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/load_binops_store.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/metadata.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/packetize_mask_varying.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/scalable_auto.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select_scalar_vector.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/shuffle.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_spv_khr_uniform_group_instructions.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_spv_khr_uniform_group_instructions_vp.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_vp.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/vectors.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/verification_fail_phi.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/widen_vload.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/workitem_funcs.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/boscc_reduction.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/choice.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/compute_vector_length.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_interleaved_load_store.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_load_store.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_scatter_gather.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_subgroup_scans.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/load_add_store.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/packetize_mask_varying.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/scatter_gather.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions_spv_khr_uniform_group_instructions.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_scans.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_scans_spv_khr_uniform_group_instructions.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/udiv.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load_as_masked.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/delete_packetized_memop.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_constant_index.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index2.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index3.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_constant_index.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_constant_index_constant_value.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_runtime_index.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/interleaved_safety.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isfiniteDv4_d.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isfiniteDv4_f.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isinfDv4_d.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isinfDv4_f.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnanDv4_d.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnanDv4_f.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnormalDv4_d.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnormalDv4_f.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/scalar_vector_user.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/vector_copy.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/vector_phi_varying.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_abs.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_binops.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_copysign.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fma.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmin_vector_scalar.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd2.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd_phi.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshl.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshr.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_shufflevector.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_sqrt.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/alloca_alias.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/arm_neon_store.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/async_workgroup_copy_uniform.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomic_cmpxchg.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomicrmw.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomicrmw_uniform.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/basic_mem2reg.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/bitcast_function.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/branch_splitting_and.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/branch_splitting_or.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_addsat.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_clamp.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_fmax.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_fmin.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_mem.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_negative.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_positive.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_pointer_return.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_cantduplicate.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_cantinline.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_optnone.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_user_undefined.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_success_builtin.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_success_instrinsic.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_success_user_defined.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address_with_uniform.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/contiguous_allocas.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_nested_loops.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_order_y.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_order_z.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_ptrs.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_uniform_if.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_uniform_loop.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_varying_if.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_varying_loop.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert3.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert4.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert_contiguity.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load_as_masked.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load_as_masked.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store_as_masked.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_internal_builtins.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_gather_load.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_load.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_scatter_store.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_store.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store_as_masked.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_subgroup_scans.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/delete_packetized_memop.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_loop.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_nested_loop.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/early-cse-mul-swap.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_memintrinsics.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_no_unaligned_memintrinsics.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/expect_assume.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/extractelement_constant_index.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/extractelement_runtime_index.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_duplication.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_elim_opaque.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/indirect_call.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/inlined_function_debug_info.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insert_element_debug_info.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insertelement_constant_index.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insertelement_runtime_index.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/instantiate_constants.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_defuse_instantiated.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_load16.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_load_ooo.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_safety.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics-scalarize.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/irreducible_loop.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/loop_call_instantiation.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_calls_max_builtin.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_as_scatter.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group2.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masking_exit_blocks.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/mem2reg_alloca_pointer1.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/mem2reg_alloca_pointer2.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride10.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride11.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride12.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride13.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride14.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride15.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride16.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride17.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride18.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride2.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride3.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride4.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride5.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride6.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride7.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride8.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride9.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_exit_blocks.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_kernels_inlining.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorization_flags.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations_nested.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations_vp.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_instantiate_memop.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_over_scalarization.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_redundant_bitcasts.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_vecz1.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_vecz2.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/offset_info_analysis.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfiniteDv4_d.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfiniteDv4_f.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfinited.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfinitef.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfDv4_d.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfDv4_f.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfd.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinff.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanDv4_d.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanDv4_f.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnand.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanf.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalDv4_d.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalDv4_f.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormald.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalf.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/opencl_metadata1.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/opencl_metadata2.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/overaligned_allocas.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_branch.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_debug_info.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_nonvarying.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_uniform_branch.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_struct_gep.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_conditional.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_conditional.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_noreduce.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_noreduce2.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_reduce.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_conditional.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_noreduce.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_noreduce2.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_reduce.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_noreduce.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_noreduce2.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_reduce.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization0.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization1.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization10.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization11.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization12.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization13.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization14.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization15.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization16.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization17.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization18.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization19.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization2.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization20.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization21.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization22.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization23.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization3.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization4.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization5.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization6.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization7.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization8.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization9.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization_exit_masks.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline_printafter.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_interleaved.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_node_debug_info.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_scatter_gather.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_scatter_gather_2.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/predicate_with_switch.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/preserve-fast-math.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/printf_float.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/regression_by_all.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr_2.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr_phi.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/roscc_simplify.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_load_store_in_varying_branch.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_after_load_store_in_varying_branch.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_after_varying_branch.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_in_varying_branch.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_vector_user.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_calls.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_calls_uniform.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_debug_info.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_instructions.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_instructions_uniform.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_masked_load_store.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-gather.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-splat.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize_mixed_gep.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scan_fact.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/secretly_scalar_load_store.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/select-no-crash.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_1.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_2.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_3.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_4.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_5.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_6.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext_bigendian.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext_bigendian.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_float2_gather.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned_scalarized.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned_scalarized.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_phi.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_select.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions_spv_khr_uniform_group_instructions.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_scans.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_scans_spv_khr_uniform_group_instructions.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_different_strides.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_divergent_gep.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_divergent_source.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_negative.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_positive.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_cond_diff_strides.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_condition.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_condition_packetized.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_source.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_sources.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_packetization.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_scalarization.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/undef_debug_info.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/undef_ub.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_base.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_index.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi1.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi2.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi3.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi4.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_metadata.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation1.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation2.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation3.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/unmangled_builtin_call.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/user_calls.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/varying_load1.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/varying_load2.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_intrinsics_scalarization.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_uniform.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_varying.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf32.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf64.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_floats.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_floats_no_double_support.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_blend_div_loop.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_scalar_gather_load.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_scalar_interleaved_load.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/workitem_builtins.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/pass.h
new file mode 100644
index 0000000000000..d9419af1024e8
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/pass.h
@@ -0,0 +1,133 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Vecz passes header.
+
+#ifndef VECZ_PASS_H
+#define VECZ_PASS_H
+
+#include <compiler/utils/vectorization_factor.h>
+#include <llvm/ADT/Optional.h>
+#include <llvm/IR/PassManager.h>
+
+#include <cstdint>
+
+#include "vecz/vecz_choices.h"
+
+namespace llvm {
+class ModulePass;
+class StringRef;
+class Module;
+class TargetMachine;
+}  // namespace llvm
+
+namespace compiler {
+namespace utils {
+class BuiltinInfo;
+}  // namespace utils
+}  // namespace compiler
+
+namespace vecz {
+/// @addtogroup vecz
+/// @{
+
+struct VeczPassOptions {
+  VeczPassOptions() : vecz_auto(false), vec_dim_idx(0), local_size(0) {}
+
+  /// @brief Boolean choices such as double support, partial scalarization.
+  vecz::VectorizationChoices choices;
+
+  /// @brief Vectorization factor, including known min and scalable flag.
+  compiler::utils::VectorizationFactor factor;
+
+  /// @brief Whether to automatically work out the vectorization factor.
+  bool vecz_auto;
+
+  /// @brief Index of vectorization dimension to use (0 => x, 1 => y, 2 => z).
+  uint32_t vec_dim_idx;
+
+  /// @brief Value specifying the local size for the function (0 means
+  /// unknown). Initialized to 0 by the default constructor.
+  uint64_t local_size;
+};
+
+/// @brief Analysis pass whose result is the callback deciding on which
+/// functions @ref RunVeczPass should operate, and with which options.
+class VeczPassOptionsAnalysis
+    : public llvm::AnalysisInfoMixin<VeczPassOptionsAnalysis> {
+  using VeczPassOptionsCallbackFn =
+      std::function<bool(llvm::Function &, llvm::ModuleAnalysisManager &,
+                         llvm::SmallVectorImpl<VeczPassOptions> &)>;
+  friend AnalysisInfoMixin<VeczPassOptionsAnalysis>;
+  static llvm::AnalysisKey Key;
+  VeczPassOptionsCallbackFn queryFunc =
+      [](llvm::Function &F, llvm::ModuleAnalysisManager &,
+         llvm::SmallVectorImpl<VeczPassOptions> &Opts) -> bool {
+    if (F.getCallingConv() != llvm::CallingConv::SPIR_KERNEL) {
+      return false;
+    }
+    // TODO what are our defaults, here?
+    Opts.emplace_back();
+    return true;
+  };
+
+ public:
+  VeczPassOptionsAnalysis() = default;
+  /// @brief Explicit constructor which uses the given callback to determine
+  /// whether vectorization should be performed on the passed function. If the
+  /// default constructor is used, every function with the SPIR_KERNEL calling
+  /// convention is selected, with one default-constructed VeczPassOptions.
+  explicit VeczPassOptionsAnalysis(VeczPassOptionsCallbackFn queryFunc)
+      : queryFunc(queryFunc) {}
+  using Result = VeczPassOptionsCallbackFn;
+  Result run(llvm::Module &, llvm::ModuleAnalysisManager &) {
+    return queryFunc;
+  }
+};
+
+/// @brief A helper pass which can be used to inspect and test the per-function
+/// vectorization options. Holds the output stream passed at construction.
+class VeczPassOptionsPrinterPass
+    : public llvm::PassInfoMixin<VeczPassOptionsPrinterPass> {
+  llvm::raw_ostream &OS;
+
+ public:
+  explicit VeczPassOptionsPrinterPass(llvm::raw_ostream &OS) : OS(OS) {}
+
+  llvm::PreservedAnalyses run(llvm::Module &, llvm::ModuleAnalysisManager &);
+};
+
+/// @brief A new-style module pass that provides a wrapper for using the
+/// ComputeAorta IR vectorizer. This vectorizes kernels to the vectorization
+/// factor specified when the pass is created. In our case this
+/// is typically the local size in the first dimension but there are other
+/// factors to consider when picking the vectorization factor, like being a
+/// power of 2. This pass queries the @ref VeczPassOptionsAnalysis, so
+/// if you do not wish all kernels to be vectorized, you must ensure your pass
+/// manager's ModuleAnalysisManager is configured with a custom @ref
+/// VeczPassOptionsAnalysis.
+class RunVeczPass : public llvm::PassInfoMixin<RunVeczPass> {
+ public:
+  /// @brief llvm's entry point for the PassManager
+  llvm::PreservedAnalyses run(llvm::Module &, llvm::ModuleAnalysisManager &);
+};
+
+/// @}
+}  // namespace vecz
+
+#endif  // VECZ_PASS_H
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_choices.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_choices.h
new file mode 100644
index 0000000000000..e714e66fee19a
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_choices.h
@@ -0,0 +1,294 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Internal Vecz Choices header.
+
+#ifndef VECZ_VECZ_CHOICES_H_INCLUDED
+#define VECZ_VECZ_CHOICES_H_INCLUDED
+
+#include <llvm/ADT/ArrayRef.h>
+#include <llvm/ADT/SmallSet.h>
+#include <llvm/ADT/StringRef.h>
+
+// Forward declaration
+namespace llvm {
+class StringRef;
+class Twine;
+}  // namespace llvm
+
+namespace vecz {
+
+/// @brief Describes and holds various Vecz choices.
+///
+/// These choices can affect the code generated and are usually optimization
+/// related. Since they are not always the best choice for a given target, they
+/// are controlled at runtime by this class.
+class VectorizationChoices {
+ public:
+  VectorizationChoices();
+  ~VectorizationChoices() = default;
+
+  /// @brief Enumeration with the available choices for Vecz.
+  ///
+  /// These are choices that can affect the code generated, often for
+  /// optimization reasons. The Choices are prefixed by a `e<Category>` prefix,
+  /// where `<Category>` is an arbitrary string to help document the intention
+  /// of the Choice. For example, optimizations are prefixed with
+  /// `eOptimization`.
+  ///
+  /// @note Each Choice has to be uniquely named without taking into account
+  /// its prefix, i.e. there shouldn't be any Choices sharing the same name
+  /// but with different prefixes. Also, Choices names must not start with
+  /// `"no"`, although different capitalizations (e.g. `"No"`) are allowed.
+  /// Additionally, Choices' names should contain only alphanumeric characters.
+  /// These restrictions are in place to allow for a `Choices` string to be
+  /// parsable easily. See, for example, `parseChoicesString` . If you add a
+  /// new Choice here, please also update the parseChoicesString function, as
+  /// well as the two relevant `cl::opt` in `vectorizer.cpp`.
+  enum Choice {
+    /// @brief An invalid Choice ID, useful for error checking etc. Equals 0.
+    eInvalid = 0,
+    /// @brief Packetize uniform instructions instead of using a vector splat.
+    ///
+    /// When going through the packetization process, the default behaviour when
+    /// encountering a uniform instruction is creating a vector splat
+    /// with its value and stopping the packetization there. This option changes
+    /// that behaviour, and instead makes the packetizer packetize even the
+    /// uniform instructions, provided that they are used by a varying
+    /// instruction.
+    eOptimizationPacketizeUniform,
+    /// @brief Packetize uniform instructions, but only in loops.
+    ///
+    /// This is similar to eOptimizationPacketizeUniform, with the difference
+    /// that it only affects uniform values used inside loops.
+    eOptimizationPacketizeUniformInLoops,
+    /// @brief Emit loops for instantiated call instructions
+    ///
+    /// This will emit instantiated call instruction in loops instead of
+    /// actually instantiating them. It only works when the call instruction has
+    /// no users.
+    eOptimizationInstantiateCallsInLoops,
+    /// @brief Use the BOSCC linearization algorithm during Control-Flow
+    /// Conversion.
+    ///
+    /// @note This optimization retains uniform branches by duplicating pieces
+    /// of the code.
+    eLinearizeBOSCC,
+    /// @brief Turn on full scalarization in the Scalarization pass
+    ///
+    /// This is useful for testing the scalarizer, and isn't intended to
+    /// deliver any performance benefits.
+    eFullScalarization,
+    /// @brief Treat division operations as being able to throw CPU exceptions
+    ///
+    /// @note This choice must be enabled for strict correctness on targets that
+    /// support hardware exceptions on division by zero/division overflow, which
+    /// require extra code to prevent traps on inactive vector lanes during
+    /// linearization. However, any trapping behaviour of the input IR may be
+    /// preserved (that is, on positively-executed code paths); it is left to
+    /// the front end to conform to OpenCL spec in this regard.
+    eDivisionExceptions,
+    /// @brief Generate a vector-predicated kernel such that no work-items
+    /// (vector elements) with side effects with IDs beyond the local workgroup
+    /// size are enabled.
+    ///
+    /// @note The exact semantics concerning which operations are
+    /// masked/unmasked are not defined. The guarantee is that the vectorized
+    /// kernel will be safe to execute on workgroups with sizes smaller than
+    /// the vector width. Some architectures may want to predicate beyond that
+    /// remit for performance reasons, even if the vector-predicated operations
+    /// are safe to execute on any input.
+    eVectorPredication,
+    /// @brief Force a default vectorization width, made without
+    /// target-specific knowledge.
+    ///
+    /// @note This is most-commonly used in testing. Packetization may make
+    /// decisions based on the target, which can make testing more difficult.
+    /// This choice forces the default vector register width.
+    eTargetIndependentPacketization,
+  };
+
+  /// @brief Check if a choice is enabled or not
+  /// @param C The choice to check for
+  /// @return true if the choice is enabled, false otherwise
+  bool isEnabled(Choice C) const { return Enabled.count(C) > 0; }
+
+  /// @brief Enable a choice
+  /// @param C The choice to enable
+  /// @return true if the choice was newly enabled, false if already enabled
+  bool enable(Choice C) {
+    auto res = Enabled.insert(C);
+    return res.second;
+  }
+
+  /// @brief Disable a choice
+  /// @param C The choice to disable
+  /// @return true if the choice was enabled, false otherwise
+  bool disable(Choice C) { return Enabled.erase(C); }
+
+  /// @brief Parse a semicolon-separated list of Choices to enable or disable
+  ///
+  /// This function accepts a string of Choices, separated by semicolon, and
+  /// enables or disables them. The Choices are parsed according to the
+  /// following rules:
+  /// - The Choices are separated by a semicolon (';') character
+  /// - Only one separator is allowed between each Choice.
+  /// - Trailing separators are ignored (but only one is allowed still).
+  /// - Choices are specified as they are in their enumerations, without the
+  ///   "e<Category>" prefix.
+  /// - Choices can be prefixed with the "no" prefix (without any whitespace),
+  ///   which specifies that the Choice needs to be disabled instead of being
+  ///   enabled.
+  /// - The "no" prefix only applies to the Choice it is attached to and not to
+  ///   any following Choices.
+  /// - Whitespace between the Choices and the separators, as well as leading
+  ///   and trailing whitespace at the beginning and end of the string, is
+  ///   ignored.
+  ///
+  /// Examples:
+  /// - "PacketizeUniform"
+  /// - "PacketizeUniform;InstantiateCallsInLoops"
+  /// - "PacketizeUniform ;   noInstantiateCallsInLoops \n"
+  /// - " noPacketizeUniform;noInstantiateCallsInLoops; "
+  ///
+  /// @param[in] Str The string containing the Choices to enable/disable
+  /// @return true on success, false if the parsing failed
+  bool parseChoicesString(llvm::StringRef Str);
+
+  /// @brief Convert a Choice name from a string to the matching Choice value
+  ///
+  /// The choices are matched without their e<Category> prefix.
+  ///
+  /// @param[in] Str The string with the Choice name
+  /// @return The matching Choice value, or eInvalid in case of error
+  static Choice fromString(llvm::StringRef Str);
+
+  //
+  // Specific getters and setters for the most commonly used choices
+  //
+
+  /// @brief Check if the eOptimizationPacketizeUniform choice is set
+  /// @return true if the choice is set, false otherwise
+  bool packetizeUniform() const {
+    return isEnabled(eOptimizationPacketizeUniform);
+  }
+  /// @brief Enable the eOptimizationPacketizeUniform choice
+  /// @return true if eOptimizationPacketizeUniform was newly enabled
+  bool enablePacketizeUniform() {
+    return enable(eOptimizationPacketizeUniform);
+  }
+  /// @brief Disable the eOptimizationPacketizeUniform choice
+  /// @return true if eOptimizationPacketizeUniform was enabled
+  bool disablePacketizeUniform() {
+    return disable(eOptimizationPacketizeUniform);
+  }
+
+  /// @brief Check if the eOptimizationPacketizeUniformInLoops choice is set
+  /// @return true if the choice is set, false otherwise
+  bool packetizeUniformInLoops() const {
+    return isEnabled(eOptimizationPacketizeUniformInLoops);
+  }
+  /// @brief Enable the eOptimizationPacketizeUniformInLoops choice
+  /// @return true if eOptimizationPacketizeUniformInLoops was newly enabled
+  bool enablePacketizeUniformInLoops() {
+    return enable(eOptimizationPacketizeUniformInLoops);
+  }
+  /// @brief Disable the eOptimizationPacketizeUniformInLoops choice
+  /// @return true if eOptimizationPacketizeUniformInLoops was enabled
+  bool disablePacketizeUniformInLoops() {
+    return disable(eOptimizationPacketizeUniformInLoops);
+  }
+
+  /// @brief Check if the eOptimizationInstantiateCallsInLoops choice is set
+  /// @return true if the choice is set, false otherwise
+  bool instantiateCallsInLoops() const {
+    return isEnabled(eOptimizationInstantiateCallsInLoops);
+  }
+  /// @brief Enable the eOptimizationInstantiateCallsInLoops choice
+  /// @return true if eOptimizationInstantiateCallsInLoops was newly enabled
+  bool enableInstantiateCallsInLoops() {
+    return enable(eOptimizationInstantiateCallsInLoops);
+  }
+  /// @brief Disable the eOptimizationInstantiateCallsInLoops choice
+  /// @return true if eOptimizationInstantiateCallsInLoops was enabled
+  bool disableInstantiateCallsInLoops() {
+    return disable(eOptimizationInstantiateCallsInLoops);
+  }
+
+  /// @brief Check if the eLinearizeBOSCC choice is set
+  /// @return true if the choice is set, false otherwise
+  bool linearizeBOSCC() const { return isEnabled(eLinearizeBOSCC); }
+  /// @brief Enable the eLinearizeBOSCC choice
+  /// @return true if eLinearizeBOSCC was newly enabled
+  bool enableLinearizeBOSCC() { return enable(eLinearizeBOSCC); }
+  /// @brief Disable the eLinearizeBOSCC choice
+  /// @return true if eLinearizeBOSCC was enabled
+  bool disableLinearizeBOSCC() { return disable(eLinearizeBOSCC); }
+
+  /// @brief Check if the eVectorPredication choice is set
+  /// @return true if the choice is set, false otherwise
+  bool vectorPredication() const { return isEnabled(eVectorPredication); }
+  /// @brief Enable the eVectorPredication choice
+  /// @return true if eVectorPredication was newly enabled
+  bool enableVectorPredication() { return enable(eVectorPredication); }
+  /// @brief Disable the eVectorPredication choice
+  /// @return true if eVectorPredication was enabled
+  bool disableVectorPredication() { return disable(eVectorPredication); }
+
+  /// @brief Check if the eTargetIndependentPacketization choice is set
+  /// @return true if the choice is set, false otherwise
+  bool targetIndependentPacketization() const {
+    return isEnabled(eTargetIndependentPacketization);
+  }
+  /// @brief Enable the eTargetIndependentPacketization choice
+  /// @return true if eTargetIndependentPacketization was newly enabled
+  bool enableTargetIndependentPacketization() {
+    return enable(eTargetIndependentPacketization);
+  }
+  /// @brief Disable the eTargetIndependentPacketization choice
+  /// @return true if eTargetIndependentPacketization was enabled
+  bool disableTargetIndependentPacketization() {
+    return disable(eTargetIndependentPacketization);
+  }
+
+  struct ChoiceInfo {
+    llvm::StringLiteral name;
+    Choice number;
+    llvm::StringLiteral desc;
+  };
+
+  static llvm::ArrayRef<ChoiceInfo> queryAvailableChoices();
+
+ private:
+  /// @brief All the choices enabled
+  llvm::SmallSet<Choice, 2> Enabled;
+
+  /// @brief Print an error message, used by parseChoicesString
+  ///
+  /// The error message will contain the message given as well as the Choices
+  /// string being parsed and the position that the error occurred.
+  ///
+  /// @param[in] Input The Choices string being parsed
+  /// @param[in] Position The position where the parsing error occurred
+  /// @param[in] Msg The accompanying error message
+  static void printChoicesParseError(llvm::StringRef Input, unsigned Position,
+                                     llvm::Twine Msg);
+};
+
+}  // namespace vecz
+#endif  // VECZ_VECZ_CHOICES_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_target_info.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_target_info.h
new file mode 100644
index 0000000000000..74d22c2391b53
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_target_info.h
@@ -0,0 +1,695 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief External vecz header.  Contains the API to the vectorizer.
+
+#ifndef VECZ_VECZ_TARGET_INFO_H_INCLUDED
+#define VECZ_VECZ_TARGET_INFO_H_INCLUDED
+
+#include <llvm/ADT/ArrayRef.h>
+#include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/PassManager.h>
+#include <llvm/Support/TypeSize.h>
+
+namespace llvm {
+class TargetMachine;
+class TargetTransformInfo;
+class Type;
+}  // namespace llvm
+
+namespace vecz {
+class VectorizationContext;
+
+/// @addtogroup vecz
+/// @{
+
+/// @brief Kinds of interleaved memory operations.
+enum InterleavedOperation : int {
+  /// @brief Invalid memory operation.
+  eInterleavedInvalid = 0,
+  /// @brief Store memory operation.
+  eInterleavedStore,
+  /// @brief Load memory operation.
+  eInterleavedLoad,
+  /// @brief Masked Store memory operation.
+  eMaskedInterleavedStore,
+  /// @brief Masked Load memory operation.
+  eMaskedInterleavedLoad
+};
+
+/// @brief Used by the vectorizer to query for target capabilities and
+/// materialize memory intrinsics.
+class TargetInfo {
+ public:
+  /// @brief Create a new vector target info instance.
+  /// @param[in] tm LLVM target machine that will be used for compilation, can
+  /// be NULL if no target data is available.
+  TargetInfo(llvm::TargetMachine *tm);
+
+  virtual ~TargetInfo() = default;
+
+  /// @brief Return the target machine.
+  llvm::TargetMachine *getTargetMachine() const { return TM_; }
+
+  /// @brief Create a vector load. If a stride greater than one is used, the
+  /// load will be interleaved, i.e. lanes are loaded from non-contiguous
+  /// memory.
+  ///
+  /// @note ptr refers to the unwidened element type, not the wide type.
+  ///       ptr needs to be 'element aligned'. The element can itself be a
+  ///       vector.
+  ///
+  /// @param[in] builder Builder used to create IR.
+  /// @param[in] ty Value type to load from memory.
+  /// @param[in] ptr Memory address to load a vector value from.
+  /// @param[in] stride Distance in elements between two lanes in memory.
+  ///                     A stride of one represents a contiguous load.
+  /// @param[in] evl 'effective vector length' of the operation. Must be
+  /// pre-scaled for vector operations. If null, the operation is unpredicated:
+  /// it is executed on all lanes.
+  ///
+  /// @return IR value that results from the vector load.
+  virtual llvm::Value *createLoad(llvm::IRBuilder<> &builder, llvm::Type *ty,
+                                  llvm::Value *ptr, llvm::Value *stride,
+                                  llvm::Value *evl = nullptr) const;
+
+  /// @brief Create a vector store. If a stride greater than one is used, the
+  /// store will be interleaved, i.e. lanes are stored to non-contiguous memory.
+  ///
+  /// @note ptr refers to the unwidened element type, not the wide type.
+  ///       ptr needs to be 'element aligned'. The element can itself be a
+  ///       vector.
+  ///
+  /// @param[in] builder Builder used to create IR.
+  /// @param[in] data Vector value to store to memory.
+  /// @param[in] ptr Memory address to store a vector value to.
+  /// @param[in] stride Distance in elements between two lanes in memory.
+  ///                     A stride of one represents a contiguous store.
+  /// @param[in] alignment The alignment of the store, in bytes
+  /// @param[in] evl 'effective vector length' of the operation. Must be
+  /// pre-scaled for vector operations. If null, the operation is unpredicated:
+  /// it is executed on all lanes.
+  ///
+  /// @return IR value that results from the vector store.
+  virtual llvm::Value *createStore(llvm::IRBuilder<> &builder,
+                                   llvm::Value *data, llvm::Value *ptr,
+                                   llvm::Value *stride, unsigned alignment,
+                                   llvm::Value *evl = nullptr) const;
+
+  /// @brief Create a masked vector load.
+  ///        Only lanes with a non-zero mask will be loaded from the address.
+  ///        Other lanes will contain undefined data.
+  ///
+  /// @note ptr refers to the unwidened element type, not the wide type.
+  ///       ptr needs to be 'element aligned'. The element can itself be a
+  ///       vector.
+  ///
+  /// @param[in] builder Builder used to create IR.
+  /// @param[in] ty Value type to load from memory.
+  /// @param[in] ptr Memory address to load a vector value from.
+  /// @param[in] mask Vector mask used to disable loading certain lanes.
+  /// @param[in] evl 'effective vector length' of the operation. Must be
+  /// pre-scaled for vector operations. If evl is null, the operation is not
+  /// length-predicated: it executes on all lanes but obeys the mask parameter.
+  /// @param[in] alignment Alignment of the load.
+  ///
+  /// @return IR value that results from the masked vector load.
+  virtual llvm::Value *createMaskedLoad(llvm::IRBuilder<> &builder,
+                                        llvm::Type *ty, llvm::Value *ptr,
+                                        llvm::Value *mask, llvm::Value *evl,
+                                        unsigned alignment) const;
+
+  /// @brief Create a masked vector store.
+  ///        Only lanes with a non-zero mask will be stored to the address.
+  ///
+  /// @note ptr refers to the unwidened element type, not the wide type.
+  ///       ptr needs to be 'element aligned'. The element can itself be a
+  ///       vector.
+  ///
+  /// @param[in] builder Builder used to create IR.
+  /// @param[in] data Vector value to store to memory.
+  /// @param[in] ptr Memory address to store a vector value to.
+  /// @param[in] mask Vector mask used to disable storing certain lanes.
+  /// @param[in] evl 'effective vector length' of the operation. Must be
+  /// pre-scaled for vector operations. If evl is null, the operation is not
+  /// length-predicated: it executes on all lanes but obeys the mask parameter.
+  /// @param[in] alignment Alignment of the store.
+  ///
+  /// @return IR value that results from the masked vector store.
+  virtual llvm::Value *createMaskedStore(llvm::IRBuilder<> &builder,
+                                         llvm::Value *data, llvm::Value *ptr,
+                                         llvm::Value *mask, llvm::Value *evl,
+                                         unsigned alignment) const;
+
+  /// @brief Create an interleaved vector load.
+  ///
+  /// @note Pointers are scalar and need to be 'scalar aligned'.
+  ///
+  /// @param[in] builder Builder used to create IR.
+  /// @param[in] ty Value type to load from memory.
+  /// @param[in] ptr Memory address to load a vector value from.
+  /// @param[in] stride Stride for interleaved memory operation.
+  /// @param[in] evl 'effective vector length' of the operation. Must be
+  /// pre-scaled for vector operations. If evl is null, the operation is not
+  /// length-predicated: it executes on all lanes.
+  /// @param[in] alignment  Alignment of the load
+  ///
+  /// @return IR value that results from the interleaved load.
+  virtual llvm::Value *createInterleavedLoad(llvm::IRBuilder<> &builder,
+                                             llvm::Type *ty, llvm::Value *ptr,
+                                             llvm::Value *stride,
+                                             llvm::Value *evl,
+                                             unsigned alignment) const;
+
+  /// @brief Create an interleaved vector store.
+  ///
+  /// @note  Pointers are scalar and need to be 'scalar aligned'.
+  ///
+  /// @param[in] builder Builder used to create IR.
+  /// @param[in] data Vector value to store to memory.
+  /// @param[in] ptr Memory address to store a vector value to.
+  /// @param[in] stride Stride for interleaved memory operation.
+  /// @param[in] evl 'effective vector length' of the operation. Must be
+  /// pre-scaled for vector operations. If evl is null, the operation is not
+  /// length-predicated: it executes on all lanes.
+  /// @param[in] alignment Alignment of the store
+  ///
+  /// @return IR value that results from the interleaved vector store.
+  virtual llvm::Value *createInterleavedStore(
+      llvm::IRBuilder<> &builder, llvm::Value *data, llvm::Value *ptr,
+      llvm::Value *stride, llvm::Value *evl, unsigned alignment) const;
+
+  /// @brief Create a masked interleaved vector load.
+  ///        Only lanes with a non-zero mask will be loaded from the address.
+  ///
+  /// @note  Pointers are scalar and need to be 'scalar aligned'.
+  ///
+  /// @param[in] builder Builder used to create IR.
+  /// @param[in] ty Value type to load from memory.
+  /// @param[in] ptr Memory address to load a vector value from.
+  /// @param[in] mask Vector mask used to disable loading certain lanes.
+  /// @param[in] stride Stride for interleaved memory operation.
+  /// @param[in] evl 'effective vector length' of the operation. Must be
+  /// pre-scaled for vector operations. If evl is null, the operation is not
+  /// length-predicated: it executes on all lanes but obeys the mask parameter.
+  /// @param[in] alignment Alignment of the load
+  ///
+  /// @return IR value that results from the masked interleaved vector load.
+  virtual llvm::Value *createMaskedInterleavedLoad(
+      llvm::IRBuilder<> &builder, llvm::Type *ty, llvm::Value *ptr,
+      llvm::Value *mask, llvm::Value *stride, llvm::Value *evl,
+      unsigned alignment) const;
+
+  /// @brief Create a masked interleaved vector store.
+  ///        Only lanes with a non-zero mask will be stored to the address.
+  ///
+  /// @note  Pointers are scalar and need to be 'scalar aligned'.
+  ///
+  /// @param[in] builder Builder used to create IR.
+  /// @param[in] data Vector value to store to memory.
+  /// @param[in] ptr Memory address to store a vector value to.
+  /// @param[in] mask Vector mask used to disable storing certain lanes.
+  /// @param[in] stride Stride for interleaved memory operation.
+  /// @param[in] evl 'effective vector length' of the operation. Must be
+  /// pre-scaled for vector operations. If evl is null, the operation is not
+  /// length-predicated: it executes on all lanes but obeys the mask parameter.
+  /// @param[in] alignment Alignment of the store
+  ///
+  /// @return IR value that results from the masked interleaved vector store.
+  virtual llvm::Value *createMaskedInterleavedStore(
+      llvm::IRBuilder<> &builder, llvm::Value *data, llvm::Value *ptr,
+      llvm::Value *mask, llvm::Value *stride, llvm::Value *evl,
+      unsigned alignment) const;
+
+  /// @brief Create a gather vector load.
+  ///        Vector lanes are loaded from different memory addresses.
+  ///
+  /// @note  Pointers are scalar and need to be 'scalar aligned'.
+  ///
+  /// @param[in] builder Builder used to create IR.
+  /// @param[in] ty Value type to load from memory.
+  /// @param[in] ptr Memory address to load a vector value from.
+  /// @param[in] evl 'effective vector length' of the operation. Must be
+  /// pre-scaled for vector operations. If evl is null, the operation is not
+  /// length-predicated: it executes on all lanes.
+  /// @param[in] alignment Alignment of the load.
+  ///
+  /// @return IR value that results from the gather vector load.
+  virtual llvm::Value *createGatherLoad(llvm::IRBuilder<> &builder,
+                                        llvm::Type *ty, llvm::Value *ptr,
+                                        llvm::Value *evl,
+                                        unsigned alignment) const;
+
+  /// @brief Create a scatter vector store.
+  ///        Vector lanes are stored to different memory addresses.
+  ///
+  /// @note  Pointers are scalar and need to be 'scalar aligned'.
+  ///
+  /// @param[in] builder Builder used to create IR.
+  /// @param[in] data Vector value to store to memory.
+  /// @param[in] ptr Memory address to store a vector value to.
+  /// @param[in] evl 'effective vector length' of the operation. Must be
+  /// pre-scaled for vector operations. If evl is null, the operation is not
+  /// length-predicated: it executes on all lanes.
+  /// @param[in] alignment Alignment of the store.
+  ///
+  /// @return IR value that results from the scatter vector store.
+  virtual llvm::Value *createScatterStore(llvm::IRBuilder<> &builder,
+                                          llvm::Value *data, llvm::Value *ptr,
+                                          llvm::Value *evl,
+                                          unsigned alignment) const;
+
+  /// @brief Create a masked gather vector load.
+  ///        Only lanes with a non-zero mask will be loaded from different
+  ///        address.
+  ///        Other lanes will contain undefined data.
+  ///
+  /// @note  Pointers are scalar and need to be 'scalar aligned'.
+  ///
+  /// @param[in] builder Builder used to create IR.
+  /// @param[in] ty Value type to load from memory.
+  /// @param[in] ptr Memory address to load a vector value from.
+  /// @param[in] mask Vector mask used to disable loading certain lanes.
+  /// @param[in] evl 'effective vector length' of the operation. Must be
+  /// pre-scaled for vector operations. If evl is null, the operation is not
+  /// length-predicated: it executes on all lanes but obeys the mask parameter.
+  /// @param[in] alignment Alignment of the load.
+  ///
+  /// @return IR value that results from the masked gather vector load.
+  virtual llvm::Value *createMaskedGatherLoad(llvm::IRBuilder<> &builder,
+                                              llvm::Type *ty, llvm::Value *ptr,
+                                              llvm::Value *mask,
+                                              llvm::Value *evl,
+                                              unsigned alignment) const;
+
+  /// @brief Create a masked scatter vector store.
+  ///        Only lanes with a non-zero mask will be stored to the address.
+  ///
+  /// @note  Pointers are scalar and need to be 'scalar aligned'.
+  ///
+  /// @param[in] builder Builder used to create IR.
+  /// @param[in] data Vector value to store to memory.
+  /// @param[in] ptr Memory address to store a vector value to.
+  /// @param[in] mask Vector mask used to disable storing certain lanes.
+  /// @param[in] evl 'effective vector length' of the operation. Must be
+  /// pre-scaled for vector operations. If evl is null, the operation is not
+  /// length-predicated: it executes on all lanes but obeys the mask parameter.
+  /// @param[in] alignment Alignment of the store.
+  ///
+  /// @return IR value that results from the masked scatter vector store.
+  virtual llvm::Value *createMaskedScatterStore(
+      llvm::IRBuilder<> &builder, llvm::Value *data, llvm::Value *ptr,
+      llvm::Value *mask, llvm::Value *evl, unsigned alignment) const;
+
+  /// @brief Create a scalable extractelement instruction. Note that the
+  /// operands are expected to have been pre-packetized before passing to this
+  /// function.
+  ///
+  /// @param[in] builder Builder used to create IR.
+  /// @param[in] Ctx Vectorization context.
+  /// @param[in] extract The original pre-packetized extractelement Instruction
+  /// @param[in] narrowTy Narrowed type of @a extract.
+  /// @param[in] src The packetized source vector
+  /// @param[in] index The packetized extraction index
+  /// @param[in] evl 'Effective vector length' of the operation. Must be
+  /// pre-scaled for vector operations. If evl is null, the operation is not
+  /// length-predicated: it executes on all lanes.
+  ///
+  /// @return A value identical to the requested extractelement
+  virtual llvm::Value *createScalableExtractElement(
+      llvm::IRBuilder<> &builder, vecz::VectorizationContext &Ctx,
+      llvm::Instruction *extract, llvm::Type *narrowTy, llvm::Value *src,
+      llvm::Value *index, llvm::Value *evl) const;
+
+  /// @brief Create an outer broadcast of a vector. An outer broadcast is one
+  /// where a vector with length V is replicated in its entirety N times across
+  /// the lanes of a larger vector with length N x V. The broadcast factor is
+  /// expected to be scalable:
+  ///
+  ///   outer_broadcast(<A,B>, vscale x 1) -> <A,B,A,B,A,B,...>
+  ///
+  /// @param[in] builder Builder used to create IR.
+  /// @param[in] vector Vector to broadcast.
+  /// @param[in] VL Vector length.
+  /// @param[in] factor Broadcast factor.
+  virtual llvm::Value *createOuterScalableBroadcast(
+      llvm::IRBuilder<> &builder, llvm::Value *vector, llvm::Value *VL,
+      llvm::ElementCount factor) const;
+
+  /// @brief Create an inner broadcast of a vector. An inner broadcast is one
+  /// where a vector with length V has its lanes individually and sequentially
+  /// replicated N times to fill a larger vector with length N x V. The
+  /// broadcast factor is expected to be a fixed amount:
+  ///
+  ///   inner_broadcast(<A,B,C,...>, 2) -> <A,A,B,B,C,C, ...>
+  ///
+  /// @param[in] builder Builder used to create IR.
+  /// @param[in] vector Vector to broadcast.
+  /// @param[in] VL Vector length.
+  /// @param[in] factor Broadcast factor.
+  virtual llvm::Value *createInnerScalableBroadcast(
+      llvm::IRBuilder<> &builder, llvm::Value *vector, llvm::Value *VL,
+      llvm::ElementCount factor) const;
+
+  /// @brief Utility function for packetizing an insertelement instruction by a
+  /// scalable factor. Note that the operands are expected to have been
+  /// pre-packetized before passing to this function.
+  ///
+  /// @param[in] builder the builder to create the needed instructions
+  /// @param[in] Ctx Vectorization context.
+  /// @param[in] insert the original pre-packetized insertelement Instruction
+  /// @param[in] elt the packetized element to insert
+  /// @param[in] into the packetized source vector
+  /// @param[in] index the packetized insertion index
+  /// @param[in] evl 'Effective vector length' of the operation. Must be
+  /// pre-scaled for vector operations. If evl is null, the operation is not
+  /// length-predicated: it executes on all lanes.
+  ///
+  /// @return a value identical to the requested insertelement
+  virtual llvm::Value *createScalableInsertElement(
+      llvm::IRBuilder<> &builder, vecz::VectorizationContext &Ctx,
+      llvm::Instruction *insert, llvm::Value *elt, llvm::Value *into,
+      llvm::Value *index, llvm::Value *evl) const;
+
+  /// @brief Function allowing targets to customize the insertion of
+  /// instructions to calculate the vector-predicated kernel width.
+  ///
+  /// Note that this must return an expression equivalent to:
+  ///   i32 = umin(%factor, %remainingIters)
+  /// This is the expression computed if this function returns nullptr.
+  ///
+  /// @param[in] builder the builder to create the needed instructions
+  /// @param[in] remainingIters the remaining number of work-items being
+  /// executed in the work-group in the dimension being vectorized.
+  /// @param[in] widestEltTy an optimization hint indicating the widest (vector
+  /// element) type in the kernel. Must not be relied on for correctness.
+  /// @param[in] factor the vectorization width.
+  virtual llvm::Value *createVPKernelWidth(llvm::IRBuilder<> &builder,
+                                           llvm::Value *remainingIters,
+                                           unsigned widestEltTy,
+                                           llvm::ElementCount factor) const {
+    (void)builder;  // The default implementation ignores its arguments.
+    (void)remainingIters;
+    (void)widestEltTy;
+    (void)factor;
+    return nullptr;  // No custom IR: callers use the default umin expression.
+  }
+
+  /// @brief Create a single-source vector shuffle with a general shuffle mask.
+  /// Can work with dynamic shuffle masks and scalable vectors, and can return
+  /// vectors of a different length to the source.
+  ///
+  /// @param[in] builder the builder to create the needed instructions
+  /// @param[in] src the source vector
+  /// @param[in] mask the shuffle mask
+  /// @param[in] evl 'Effective vector length' of the operation. Must be
+  /// pre-scaled for vector operations. If evl is null, the operation is not
+  /// length-predicated: it executes on all lanes.
+  ///
+  /// @return the result of the shuffle operation
+  virtual llvm::Value *createVectorShuffle(llvm::IRBuilder<> &builder,
+                                           llvm::Value *src, llvm::Value *mask,
+                                           llvm::Value *evl) const;
+
+  /// @brief Create a vector slide-up operation, that moves all vector elements
+  /// up by one place, with the specified element inserted into the zeroth
+  /// position.
+  ///
+  /// @param[in] builder the builder to create the needed instructions
+  /// @param[in] src the source vector
+  /// @param[in] insert the value to slide into the vacant position
+  /// @param[in] evl 'Effective vector length' of the operation. Must be
+  /// pre-scaled for vector operations. If evl is null, the operation is not
+  /// length-predicated: it executes on all lanes.
+  ///
+  /// @return the result of the slide-up operation
+  virtual llvm::Value *createVectorSlideUp(llvm::IRBuilder<> &builder,
+                                           llvm::Value *src,
+                                           llvm::Value *insert,
+                                           llvm::Value *evl) const;
+
+  /// @brief Determine whether the specified group of interleaved memory
+  /// instructions can be optimized or not.
+  ///
+  /// @param[in] val Memory access operation.
+  /// @param[in] kind Kind of interleaved instructions.
+  /// @param[in] stride Stride of the interleaved memory operations.
+  /// @param[in] groupSize Number of interleaved operations in the group.
+  ///
+  /// @return true if the interleaved group can be optimized, false otherwise.
+  virtual bool canOptimizeInterleavedGroup(const llvm::Instruction &val,
+                                           InterleavedOperation kind,
+                                           int stride,
+                                           unsigned groupSize) const;
+
+  /// @brief Try to optimize a group of consecutive interleaved vector memory
+  /// instructions. These instructions collectively access a consecutive chunk
+  /// of memory and are sorted by increasing address.
+  ///
+  /// @note Pointers are scalar and need to be 'scalar aligned'.
+  /// @param[in] builder Builder used to create IR.
+  /// @param[in] Kind Kind of interleaved group to look for.
+  /// @param[in] group List of interleaved operations.
+  /// @param[in] masks List of mask operands.
+  /// @param[in] baseAddress Base pointer for the memory operation.
+  /// @param[in] stride Stride of the interleaved memory operations.
+  ///
+  /// @return Return true if the interleaved group was optimized or false.
+  virtual bool optimizeInterleavedGroup(llvm::IRBuilder<> &builder,
+                                        InterleavedOperation Kind,
+                                        llvm::ArrayRef<llvm::Value *> group,
+                                        llvm::ArrayRef<llvm::Value *> masks,
+                                        llvm::Value *baseAddress,
+                                        int stride) const;
+
+  /// @brief (De-)interleave a list of vectors.
+  ///
+  /// @param[in] builder Builder used to generate new instructions.
+  /// @param[in,out] vectors List of vectors to (de-)interleave.
+  /// @param[in] forward true to interleave, false to deinterleave.
+  ///
+  /// @return true if the vectors were (de-)interleaved, false otherwise.
+  virtual bool interleaveVectors(llvm::IRBuilder<> &builder,
+                                 llvm::MutableArrayRef<llvm::Value *> vectors,
+                                 bool forward) const;
+
+  /// @brief Estimates the widest SIMD width that will fit into registers for a
+  ///        given set of values.
+  ///
+  /// @param[in] TTI the Target Transform Info
+  /// @param[in] vals Set of values to fit into registers
+  /// @param[in] width the widest SIMD width to consider
+  /// @return the widest SIMD width that is expected to fit into registers, or
+  ///         zero if the set can never fit into registers.
+  virtual unsigned estimateSimdWidth(
+      const llvm::TargetTransformInfo &TTI,
+      const llvm::ArrayRef<const llvm::Value *> vals, unsigned width) const;
+
+  /// @brief Get the preferred vector width for the given scalar type
+  ///
+  /// @param[in] TTI the Target Transform Info
+  /// @param[in] Ty the scalar type to get the width for
+  /// @return the preferred vector width
+  virtual unsigned getVectorWidthForType(const llvm::TargetTransformInfo &TTI,
+                                         const llvm::Type &Ty) const;
+
+  /// @brief Return whether the value can be packetized by the given width.
+  ///
+  /// @param[in] Val The value to be packetized
+  /// @param[in] Width The vectorization factor by which to packetize Val
+  /// @return true if the value can be packetized, false otherwise.
+  virtual bool canPacketize(const llvm::Value *Val,
+                            llvm::ElementCount Width) const;
+
+  /// @return Whether a given vector type would be legal as the result of a
+  /// binary vp intrinsic.
+  virtual bool isVPVectorLegal(const llvm::Function &F, llvm::Type *Ty) const;
+
+ protected:
+  /// @brief This type indicates legality of a VP/Masked memory operation in a
+  /// target.
+  class VPMemOpLegality {
+   public:
+    constexpr VPMemOpLegality() = default;
+    constexpr VPMemOpLegality(bool VPLegal, bool MaskLegal)
+        : VPLegal(VPLegal), MaskLegal(MaskLegal) {}
+
+    /// @brief Sets whether the operation is legal as a VP intrinsic.
+    void setVPLegality(bool Legal) { VPLegal = Legal; }
+
+    /// @brief Sets whether the operation is legal as a masked memory
+    /// operation.
+    void setMaskLegality(bool Legal) { MaskLegal = Legal; }
+
+    /// @brief Tests whether the operation is legal as a VP intrinsic.
+    constexpr bool isVPLegal() const { return VPLegal; }
+
+    /// @brief Tests whether the operation is legal as a masked memory
+    /// operation.
+    constexpr bool isMaskLegal() const { return MaskLegal; }
+
+   private:
+    bool VPLegal = false;    // Not legal as a VP intrinsic unless set.
+    bool MaskLegal = false;  // Not legal as a masked operation unless set.
+  };
+
+  /// @brief Create an indices vector to be used in createScalableBroadcast()
+  ///
+  /// @param[in] builder Builder used to create IR.
+  /// @param[in] ty Type of the indices vector.
+  /// @param[in] factor Vectorization factor.
+  /// @param[in] URem Whether to broadcast a fixed-length vector to a scalable
+  /// one or a scalable-vector by a fixed amount.
+  /// @param[in] N Name of the value to produce.
+  static llvm::Value *createBroadcastIndexVector(llvm::IRBuilder<> &builder,
+                                                 llvm::Type *ty,
+                                                 llvm::ElementCount factor,
+                                                 bool URem,
+                                                 const llvm::Twine &N = "");
+
+  /// @return A VPMemOpLegality stating whether we can create a vp.load or
+  /// a masked.load intrinsic.
+  ///
+  /// @param[in] F The function in which the instruction will be created.
+  /// @param[in] Ty Type of the vector to load.
+  /// @param[in] Alignment Alignment of the operation.
+  virtual VPMemOpLegality isVPLoadLegal(const llvm::Function *F, llvm::Type *Ty,
+                                        unsigned Alignment) const;
+
+  /// @return A VPMemOpLegality stating whether we can create a vp.store or
+  /// a masked.store intrinsic.
+  ///
+  /// @param[in] F The function in which the instruction will be created.
+  /// @param[in] Ty Type of the vector to store.
+  /// @param[in] Alignment Alignment of the operation.
+  virtual VPMemOpLegality isVPStoreLegal(const llvm::Function *F,
+                                         llvm::Type *Ty,
+                                         unsigned Alignment) const;
+
+  /// @return A VPMemOpLegality stating whether we can create a vp.gather
+  /// or a masked.gather intrinsic.
+  ///
+  /// @param[in] F The function in which the instruction will be created.
+  /// @param[in] Ty Type of the vector to gather.
+  /// @param[in] Alignment Alignment of the operation.
+  virtual VPMemOpLegality isVPGatherLegal(const llvm::Function *F,
+                                          llvm::Type *Ty,
+                                          unsigned Alignment) const;
+
+  /// @return A VPMemOpLegality stating whether we can create a vp.scatter
+  /// or a masked.scatter intrinsic.
+  ///
+  /// @param[in] F The function in which the instruction will be created.
+  /// @param[in] Ty Type of the vector to scatter.
+  /// @param[in] Alignment Alignment of the operation.
+  virtual VPMemOpLegality isVPScatterLegal(const llvm::Function *F,
+                                           llvm::Type *Ty,
+                                           unsigned Alignment) const;
+
+  /// @brief Function to check whether a given type is valid as the element type
+  /// of a scalable vector used in a VP intrinsic.
+  ///
+  /// @param[in] Ty The type to be checked.
+  virtual bool isLegalVPElementType(llvm::Type *Ty) const;
+
+  /// @brief LLVM target machine that will be used for compilation.
+  llvm::TargetMachine *TM_;
+
+ private:
+  /// @brief Helper function to check legality of memory operations.
+  ///
+  /// @return Illegal in LLVM < 13 and checks legality in LLVM >= 13.
+  VPMemOpLegality checkMemOpLegality(
+      const llvm::Function *F,
+      llvm::function_ref<bool(const llvm::TargetTransformInfo &, llvm::Type *,
+                              unsigned)>
+          Checker,
+      llvm::Type *Ty, unsigned Alignment) const;
+
+  /// @brief Create a broadcast of a vector.
+  ///
+  /// @param[in] builder Builder used to create IR.
+  /// @param[in] vector Vector to broadcast.
+  /// @param[in] VL Vector length.
+  /// @param[in] factor Vectorization factor.
+  /// @param[in] URem Whether to broadcast a fixed-length vector to a scalable
+  /// one or a scalable-vector by a fixed amount
+  llvm::Value *createScalableBroadcast(llvm::IRBuilder<> &builder,
+                                       llvm::Value *vector, llvm::Value *VL,
+                                       llvm::ElementCount factor,
+                                       bool URem) const;
+};
+
+/// @brief Caches and returns the TargetInfo for a Module.
+class TargetInfoAnalysis : public llvm::AnalysisInfoMixin<TargetInfoAnalysis> {
+  friend AnalysisInfoMixin<TargetInfoAnalysis>;
+
+ public:
+  struct Result {
+    Result(std::unique_ptr<TargetInfo> &&I) : Info(std::move(I)) {}
+    /// Handle the invalidation of this information.
+    ///
+    /// When used as a result of TargetInfoAnalysis this method will be called
+    /// when the module this was computed for changes. As it returns false,
+    /// the information is preserved across those changes.
+    bool invalidate(llvm::Module &, const llvm::PreservedAnalyses &,
+                    llvm::ModuleAnalysisManager::Invalidator &) {
+      return false;
+    }
+
+    operator TargetInfo *() { return Info.get(); }
+    operator const TargetInfo *() const { return Info.get(); }
+
+    std::unique_ptr<TargetInfo> Info;
+  };
+
+  using CallbackFn = std::function<Result(const llvm::Module &)>;
+
+  TargetInfoAnalysis();
+  TargetInfoAnalysis(llvm::TargetMachine *TM);
+  /// @brief Construct from a callback that produces the Result for a module.
+  TargetInfoAnalysis(CallbackFn TICallback)
+      : TICallback(std::move(TICallback)) {}
+
+  /// @brief Retrieve the TargetInfo for the requested module.
+  Result run(llvm::Module &M, llvm::ModuleAnalysisManager &) {
+    return TICallback(M);
+  }
+
+  /// @brief Return the name of the pass.
+  static llvm::StringRef name() { return "TargetInfo analysis"; }
+
+ private:
+  /// @brief Unique pass identifier.
+  static llvm::AnalysisKey Key;
+
+  /// @brief Callback function producing a TargetInfo on demand.
+  CallbackFn TICallback;
+};
+
+/// @brief Create a new vector target info instance.
+/// @param[in] tm LLVM target machine that will be used for compilation, can
+/// be NULL if no target data is available.
+/// @return The new TargetInfo instance.
+std::unique_ptr<TargetInfo> createTargetInfoFromTargetMachine(
+    llvm::TargetMachine *tm);
+
+/// @}
+}  // namespace vecz
+
+#endif  // VECZ_VECZ_TARGET_INFO_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/control_flow_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/control_flow_analysis.cpp
new file mode 100644
index 0000000000000..c3c2ab8e4a229
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/control_flow_analysis.cpp
@@ -0,0 +1,99 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "analysis/control_flow_analysis.h"
+
+#include <llvm/ADT/DenseMap.h>
+#include <llvm/ADT/PostOrderIterator.h>
+#include <llvm/Analysis/CFG.h>
+#include <llvm/Analysis/LoopInfo.h>
+#include <llvm/Support/Debug.h>
+#include <llvm/Support/raw_ostream.h>
+
+#include "analysis/uniform_value_analysis.h"
+#include "debugging.h"
+
+#define DEBUG_TYPE "vecz-cf"
+
+using namespace llvm;
+using namespace vecz;
+
+////////////////////////////////////////////////////////////////////////////////
+
+llvm::AnalysisKey CFGAnalysis::Key;
+
+CFGResult CFGAnalysis::run(llvm::Function &F,
+                           llvm::FunctionAnalysisManager &AM) {
+  CFGResult Res;
+
+  LLVM_DEBUG(dbgs() << "CONTROL FLOW ANALYSIS\n");
+
+  UniformValueResult &UVR = AM.getResult<UniformValueAnalysis>(F);
+
+  bool mayDiverge = false;
+  for (BasicBlock &BB : F) {
+    // Classify each block by its terminator: exit, branch or switch.
+    auto *term = BB.getTerminator();
+    if (isa<ReturnInst>(term) || isa<UnreachableInst>(term)) {
+      // an "unreachable" terminator may be generated from an "optimization"
+      // of undefined behaviour in the IR; where a "trap" call has been
+      // introduced, the end of the Basic Block will never be reached.
+      // This should still be regarded as an exit block for our purposes.
+      if (Res.exitBB) {  // An exit block was already recorded.
+        emitVeczRemarkMissed(&F, &F,
+                             "CFG should not have more than one exit block.");
+        Res.setFailed(true);
+        return Res;
+      }
+      Res.exitBB = &BB;
+      LLVM_DEBUG(dbgs() << BB.getName() << " returns\n");
+    } else if (BranchInst *B = dyn_cast<BranchInst>(term)) {
+      if (B->isConditional()) {
+        auto *const cond = B->getCondition();
+        if (cond && UVR.isVarying(cond)) {
+          mayDiverge = true;  // Conditional branch on a varying value.
+        }
+      }
+    } else if (isa<SwitchInst>(term)) {
+      // Control Flow Conversion Pass is not able to handle switch instructions.
+      emitVeczRemarkMissed(&F, &F, "Unexpected Switch instruction.");
+      Res.setFailed(true);
+      return Res;
+    }
+  }
+
+  if (!Res.getExitBlock()) {  // No return/unreachable terminator was found.
+    emitVeczRemarkMissed(&F, &F, "Non-terminating CFG in");
+    Res.setFailed(true);
+    return Res;
+  }
+
+  LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
+  using RPOTraversal = ReversePostOrderTraversal<const Function *>;
+  RPOTraversal FuncRPOT(&F);
+  if (containsIrreducibleCFG<const BasicBlock *, const RPOTraversal,
+                             const LoopInfo>(FuncRPOT, LI)) {
+    emitVeczRemarkMissed(&F, &F, "Irreducible loop detected in");
+    Res.setFailed(true);
+    return Res;
+  }
+
+  if (mayDiverge) {
+    Res.setConversionNeeded(true);  // At least one varying branch was seen.
+  }
+
+  return Res;
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/divergence_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/divergence_analysis.cpp
new file mode 100644
index 0000000000000..15003bbe08b34
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/divergence_analysis.cpp
@@ -0,0 +1,806 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "analysis/divergence_analysis.h"
+
+#include <llvm/ADT/PostOrderIterator.h>
+#include <llvm/Analysis/LoopInfo.h>
+#include <llvm/Analysis/PostDominators.h>
+#include <llvm/IR/Dominators.h>
+#include <llvm/IR/Instructions.h>
+#include <llvm/IR/Module.h>
+#include <llvm/Support/Debug.h>
+#include <multi_llvm/multi_llvm.h>
+
+#include <algorithm>
+#include <memory>
+
+#include "analysis/uniform_value_analysis.h"
+#include "debugging.h"
+
+#define DEBUG_TYPE "vecz"
+
+using namespace vecz;
+using namespace llvm;
+
+namespace {
+using RPOT = ReversePostOrderTraversal<Function *>;
+}  // namespace
+
+BlockQueue::BlockQueue(DivergenceResult const &dr,
+                       DenseSet<BasicBlock *> const &blocks)
+    : DR(dr) {
+  indices.reserve(blocks.size());
+  for (auto *const BB : blocks) {
+    indices.push_back(DR.getTagIndex(BB));
+  }
+
+  // Note that make_heap builds a Max heap, so we use `std::greater` to get a
+  // Min heap.
+  std::make_heap(indices.begin(), indices.end(), std::greater<index_type>());
+}
+
+const BasicBlockTag &BlockQueue::pop() {
+  assert(!indices.empty() && "Trying to pop from an empty BlockQueue");
+  std::pop_heap(indices.begin(), indices.end(), std::greater<index_type>());
+  auto const popped_index = indices.back();
+  indices.pop_back();
+
+  return DR.getBlockTag(popped_index);
+}
+
+void BlockQueue::push(size_t index) {
+  indices.push_back(index);
+  std::push_heap(indices.begin(), indices.end(), std::greater<index_type>());
+}
+
+void BlockQueue::push(BasicBlock const *bb) {
+  // Resolve the block to its tag index, then reuse the index overload.
+  push(DR.getTagIndex(bb));
+}
+
+DivergenceResult::DivergenceResult(Function &F, FunctionAnalysisManager &AM)
+    : F(F), AM(AM) {}
+
+size_t DivergenceResult::getTagIndex(const llvm::BasicBlock *BB) const {
+  assert(BB && "Trying to get the tag of a null BasicBlock");
+  auto iter = BBMap.find(BB);
+  assert(iter != BBMap.end() && "BasicBlock tag is not defined");
+  return iter->second;
+}
+
+BasicBlockTag &DivergenceResult::getOrCreateTag(BasicBlock *BB) {
+  assert(BB && "Trying to get the tag of a null BasicBlock");
+  auto const &result = BBMap.try_emplace(BB, basicBlockTags.size());
+  if (result.second) {
+    // It's a new map entry, so create the new tag and return it.
+    basicBlockTags.emplace_back();
+    auto &tag = basicBlockTags.back();
+    tag.BB = BB;
+    return tag;
+  }
+  // Return the indexed tag.
+  return basicBlockTags[result.first->second];
+}
+
+LoopTag &DivergenceResult::getTag(const Loop *L) const {
+  assert(L && "Trying to get the tag of a null loop");
+  auto iter = LMap.find(L);
+  assert(iter != LMap.end() && "Loop tag is not defined");
+  return *iter->second;
+}
+
+LoopTag &DivergenceResult::getOrCreateTag(Loop *L) {
+  assert(L && "Trying to get or create the tag of a null loop");
+  auto &tag = LMap[L];
+  if (!tag) {
+    tag = std::make_unique<LoopTag>();
+    tag->loop = L;
+  }
+  return *tag;
+}
+
+bool DivergenceResult::hasFlag(const BasicBlock &BB,
+                               BlockDivergenceFlag F) const {
+  return (getTag(&BB).divergenceFlag & F) == F;
+}
+
+BlockDivergenceFlag DivergenceResult::getFlag(const BasicBlock &BB) const {
+  return getTag(&BB).divergenceFlag;
+}
+
+void DivergenceResult::setFlag(const BasicBlock &BB, BlockDivergenceFlag F) {
+  auto &tag = getTag(&BB);
+  tag.divergenceFlag = static_cast<BlockDivergenceFlag>(tag.divergenceFlag | F);
+}
+
+void DivergenceResult::clearFlag(const BasicBlock &BB, BlockDivergenceFlag F) {
+  auto &tag = getTag(&BB);
+  tag.divergenceFlag =
+      static_cast<BlockDivergenceFlag>(tag.divergenceFlag & ~F);
+}
+
+bool DivergenceResult::isDivCausing(const BasicBlock &BB) const {
+  return (hasFlag(BB, BlockDivergenceFlag::eBlockHasDivergentBranch) ||
+          hasFlag(BB, BlockDivergenceFlag::eBlockHasDivergentBranchFake));
+}
+
+bool DivergenceResult::isDivergent(const BasicBlock &BB) const {
+  return hasFlag(BB, BlockDivergenceFlag::eBlockIsDivergent);
+}
+
+bool DivergenceResult::isOptional(const BasicBlock &BB) const {
+  return !isDivergent(BB);
+}
+
+bool DivergenceResult::isByAll(const BasicBlock &BB) const {
+  return hasFlag(BB, BlockDivergenceFlag::eBlockIsByAll);
+}
+
+bool DivergenceResult::isBlend(const BasicBlock &BB) const {
+  return hasFlag(BB, BlockDivergenceFlag::eBlockIsBlend);
+}
+
+bool DivergenceResult::isUniform(const BasicBlock &BB) const {
+  return hasFlag(BB, BlockDivergenceFlag::eBlockIsUniform);
+}
+
+bool DivergenceResult::hasFlag(const Loop &L, LoopDivergenceFlag F) const {
+  return (getTag(&L).divergenceFlag & F) == F;
+}
+
+LoopDivergenceFlag DivergenceResult::getFlag(const Loop &L) const {
+  return getTag(&L).divergenceFlag;
+}
+
+void DivergenceResult::setFlag(const Loop &L, LoopDivergenceFlag F) {
+  auto &tag = getTag(&L);
+  tag.divergenceFlag = static_cast<LoopDivergenceFlag>(tag.divergenceFlag | F);
+}
+
+void DivergenceResult::clearFlag(const Loop &L, LoopDivergenceFlag F) {
+  auto &tag = getTag(&L);
+  tag.divergenceFlag = static_cast<LoopDivergenceFlag>(tag.divergenceFlag & ~F);
+}
+
+bool DivergenceResult::computeBlockOrdering(DominatorTree &DT) {
+  LLVM_DEBUG(dbgs() << "Divergence Analysis: COMPUTE BLOCK ORDERING\n");
+
+  // The DCBI (Dominance Compact Block Indexing) is a topological ordering of
+  // the basic blocks that is also dominance compact, that is, an ordering such
+  // that for any block A, every block that A dominates follows in a contiguous
+  // subsequence in the ordering. To construct this, we gather a reverse post-
+  // order traversal over the CFG, and then a depth-first traversal over the
+  // dominator tree, ordering each node's children according to the previously
+  // calculated reverse post-order. We need to take special care of loop exits,
+  // however, since where a loop exits from some block other than a latch,
+  // the dominator tree traversal can erroneously order it inside of the loop.
+  // To prevent this, we store up exit blocks until we have processed all
+  // the blocks at the current loop level.
+
+  struct DCnode {
+    BasicBlock *BB;
+    unsigned depth = 0;  // Loop depth of BB; stays 0 when its tag has no loop.
+  };
+  std::vector<DCnode> graph;
+  llvm::DenseMap<llvm::BasicBlock *, unsigned> indexMap;
+
+  indexMap.reserve(F.size());
+  {
+    // Note that a post-order traversal of the CFG does not include any blocks
+    // with no predecessors, other than the entry block.
+    unsigned index = 0;
+    for (auto *const BB : RPOT(&F)) {
+      indexMap[BB] = index++;
+      graph.emplace_back();
+      graph.back().BB = BB;
+
+      if (auto const *const LTag = getTag(BB).loop) {
+        graph.back().depth = LTag->loop->getLoopDepth();
+      }
+    }
+  }
+
+  // Do a depth-first traversal of the dominator tree, starting at index 0.
+  SmallVector<unsigned, 16> stack;
+  stack.push_back(0);
+  uint32_t pos = 0;
+  SmallVector<unsigned, 16> children;
+  SmallVector<unsigned, 16> loopExits;
+  while (!stack.empty()) {
+    auto const u = stack.pop_back_val();
+    auto const &uNode = graph[u];
+
+    getTag(uNode.BB).pos = pos++;  // Assign the next DCBI position.
+
+    // Children in the same loop or subloops get added back to the stack.
+    // Children outside of the current loop get stored up until we processed
+    // everything in this loop. Note that we can accumulate exit blocks
+    // from multiple points within the loop, and across multiple depth levels.
+    auto *const DTNode = DT.getNode(uNode.BB);
+    unsigned stacked = 0;
+    for (auto *const childNode : make_range(DTNode->begin(), DTNode->end())) {
+      auto const child = indexMap[childNode->getBlock()];
+      auto &cNode = graph[child];
+      if (cNode.depth >= uNode.depth) {
+        stack.push_back(child);
+        ++stacked;
+      } else {
+        // Note that we can exit across more than one loop level, so we need to
+        // find the right place to insert it.
+        auto insert = loopExits.end();
+        while (insert != loopExits.begin()) {
+          auto scan = insert - 1;
+          if (cNode.depth < graph[*scan].depth) {
+            insert = scan;
+          } else {
+            break;
+          }
+        }
+        loopExits.insert(insert, child);
+      }
+    }
+    // Sort the new children descending, so the lowest index is popped first.
+    std::sort(stack.end() - stacked, stack.end(), std::greater<unsigned>());
+
+    if (!loopExits.empty()) {
+      unsigned const curDepth = stack.empty() ? 0 : graph[stack.back()].depth;
+      unsigned const depth = std::max(curDepth, graph[loopExits.back()].depth);
+      unsigned count = 0;
+      while (!loopExits.empty() && depth == graph[loopExits.back()].depth) {
+        stack.push_back(loopExits.pop_back_val());
+        ++count;
+      }
+
+      // Likewise sort the released loop exits into descending index order.
+      std::sort(stack.end() - count, stack.end(), std::greater<unsigned>());
+    }
+  }
+  assert(pos == graph.size() && "Incomplete DCBI");
+
+  reorderTags(pos);
+  return true;
+}
+
+void DivergenceResult::reorderTags(size_t n) {
+  numOrderedBlocks = n;
+
+  // This is a Cycle Sort. It re-orders the tags in the tag vector according to
+  // their calculated block index. Despite the two nested loops, it is O(n).
+  // Tags whose pos is not a valid index (pos >= numTags) are left where they
+  // are, but a later ordered tag might move them afterwards.
+  for (size_t i = 0, numTags = basicBlockTags.size(); i != numTags; ++i) {
+    auto &tag = basicBlockTags[i];
+    while (tag.pos < numTags && tag.pos != i) {
+      std::swap(tag, basicBlockTags[tag.pos]);
+    }
+  }
+
+  // Rebuild the index map after sorting. Note that we can't absorb this into
+  // the above loop, since an unordered tag might not be in its final position
+  // until all of the ordered tags are in their correct places.
+  for (size_t i = 0, numTags = basicBlockTags.size(); i != numTags; ++i) {
+    BBMap[basicBlockTags[i].BB] = i;
+  }
+}
+
+bool DivergenceResult::computeLoopOrdering() {
+  loopOrdering.clear();  // Rebuild the ordering from scratch.
+  for (auto const &pair : LMap) {
+    loopOrdering.push_back(pair.second.get());
+  }
+  // Order the loop tags by increasing loop depth.
+  std::sort(loopOrdering.begin(), loopOrdering.end(),
+            [](const LoopTag *LHS, const LoopTag *RHS) -> bool {
+              return LHS->loop->getLoopDepth() < RHS->loop->getLoopDepth();
+            });
+
+  return true;
+}
+
+void DivergenceResult::markDivCausing(BasicBlock &BB, DivergenceInfo &DI,
+                                      PostDominatorTree &PDT) {
+  if (isDivCausing(BB)) {
+    return;  // Already flagged as div_causing; nothing more to do.
+  }
+
+  divCausingBlocks.push_back(&BB);
+  setFlag(BB, BlockDivergenceFlag::eBlockHasDivergentBranch);
+  LLVM_DEBUG(dbgs() << "Block " << BB.getName() << " is div_causing\n");
+
+  for (BasicBlock *succ : successors(&BB)) {
+    markDivergent(*succ);
+  }
+
+  // If a block is a join point (blend) of `BB`, then it is divergent (unless
+  // it is the post-dominator of `BB`).
+  auto const &joins = joinPoints(BB);
+  for (BasicBlock *const join : joins) {
+    setFlag(*join, BlockDivergenceFlag::eBlockIsBlend);
+    LLVM_DEBUG(dbgs() << "\tBlock " << join->getName() << " is blend\n");
+
+    if (!PDT.dominates(join, &BB)) {
+      markDivergent(*join);
+    }
+
+    for (BasicBlock *const pred : predecessors(join)) {
+      // If at least 2 successors of `pred` are join points of `BB`, then mark
+      // `pred` as a fake div causing block because its successors may be
+      // executed by multiple work-items.
+      if (std::count_if(
+              succ_begin(pred), succ_end(pred),
+              [&joins](BasicBlock *succ) { return joins.count(succ); }) > 1) {
+        fakeDivCausingBlocks.insert(pred);
+      }
+    }
+
+    // Join points of divergent branches need their PHIs marked varying.
+    DI.insert(join);
+  }
+}
+
+void DivergenceResult::markDivLoopDivBlocks(BasicBlock &BB, Loop &L,
+                                            DivergenceInfo &DI) {
+  markDivergent(L);
+
+  // Find loop exits through which some work-items may leave the loop while
+  // others keep iterating over it. These exit blocks can be reached from the
+  // div_causing block before reaching the latch because the divergent path
+  // cannot fully reconverge before leaving the loop (since the loop is
+  // divergent).
+  SmallVector<BasicBlock *, 1> exits;
+  L.getExitBlocks(exits);
+  auto const &divergentExits = escapePoints(BB, L);
+  for (BasicBlock *E : exits) {
+    if (divergentExits.count(E)) {
+      markDivergent(*E);
+    }
+    // All loop exits of a divergent loop need their PHIs marked varying.
+    DI.insert(E);
+  }
+
+  // The latch of a divergent loop is divergent (presumes a unique latch).
+  markDivergent(*L.getLoopLatch());
+}
+
+void DivergenceResult::markDivergent(const BasicBlock &BB) {
+  if (!isDivergent(BB)) {  // Flag and log only on the first marking.
+    setFlag(BB, BlockDivergenceFlag::eBlockIsDivergent);
+    LLVM_DEBUG(dbgs() << "\tBlock " << BB.getName() << " is divergent\n");
+  }
+}
+
+void DivergenceResult::markDivergent(const Loop &L) {
+  if (!getTag(&L).isLoopDivergent()) {  // Skip loops already flagged.
+    setFlag(L, LoopDivergenceFlag::eLoopIsDivergent);
+    LLVM_DEBUG(dbgs() << "\tLoop " << L.getName() << " is divergent\n");
+  }
+}
+
+void DivergenceResult::markByAll(BasicBlock &src) {
+  Function &F = *src.getParent();
+  DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
+  PostDominatorTree &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
+
+  BlockQueue queue(*this);
+  queue.push(&src);  // The propagation below starts from `src`.
+
+  while (!queue.empty()) {
+    auto &BBTag = queue.pop();
+    auto *const BB = BBTag.BB;
+
+    if (isByAll(*BB)) {
+      continue;  // Already marked; do not propagate from it again.
+    }
+
+    const bool isHeaderDivLoop =
+        BBTag.isLoopHeader() && BBTag.loop->isLoopDivergent();
+    // If BB is a loop header, it can only be marked by_all if its loop does not
+    // diverge.
+    if (!isHeaderDivLoop) {
+      setFlag(*BB, BlockDivergenceFlag::eBlockIsByAll);
+      LLVM_DEBUG(dbgs() << "Block " << BB->getName() << " is by_all\n");
+    }
+
+    SmallVector<BasicBlock *, 2> descendants;
+    DT.getDescendants(BB, descendants);
+
+    // For all descendants `D` of `BB` that post-dominate `BB`, `D` is by_all.
+    for (BasicBlock *D : descendants) {
+      if (D != BB) {
+        if (PDT.dominates(D, BB)) {
+          auto const DIndex = getTagIndex(D);
+          auto const *const DLoopTag = basicBlockTags[DIndex].loop;
+          // Queue `D` if it is not in a loop, or if its loop does not diverge
+          // and either has no parent loop or that parent's header is by_all.
+          Loop *parentLoop;
+          if (!DLoopTag || (!DLoopTag->isLoopDivergent() &&
+                            (!(parentLoop = DLoopTag->loop->getParentLoop()) ||
+                             isByAll(*parentLoop->getHeader())))) {
+            queue.push(DIndex);
+          }
+        }
+      }
+    }
+
+    // For all descendants `D` of `BB` that do not post-dominate `BB`, `D` is
+    // by_all if all predecessors of `D` are by_all.
+    //
+    // If BB is a divergent branch, it cannot propagate by_all to its
+    // successors.
+    if (!isHeaderDivLoop && !isDivCausing(*BB)) {
+      for (BasicBlock *D : descendants) {
+        if (D != BB) {
+          if (!PDT.dominates(D, BB)) {
+            if (std::all_of(
+                    pred_begin(D), pred_end(D),
+                    [this](BasicBlock *pred) { return isByAll(*pred); })) {
+              queue.push(D);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+bool DivergenceResult::isReachable(BasicBlock *src, BasicBlock *dst,
+                                   bool allowLatch) const {
+  DenseSet<BasicBlock *> visited;
+  std::vector<BasicBlock *> worklist;  // Used as a stack (back / pop_back).
+
+  worklist.push_back(src);
+  visited.insert(src);
+
+  while (!worklist.empty()) {
+    BasicBlock *BB = worklist.back();
+    worklist.pop_back();
+
+    if (BB == dst) {
+      return true;  // Also holds trivially when `src == dst`.
+    }
+
+    auto const &BBTag = getTag(BB);
+    for (BasicBlock *succ : successors(BB)) {
+      if (!allowLatch && BBTag.isLoopBackEdge(succ)) {
+        continue;  // Back edges are only followed when `allowLatch` is set.
+      }
+      if (visited.insert(succ).second) {
+        worklist.push_back(succ);
+      }
+    }
+  }
+
+  return false;  // Worklist exhausted without meeting `dst`.
+}
+
+DenseSet<BasicBlock *> DivergenceResult::joinPoints(BasicBlock &src) const {
+  if (src.getTerminator()->getNumSuccessors() < 2) {
+    return {};  // `src` does not branch, so no paths can join.
+  }
+
+  Function &F = *src.getParent();
+  PostDominatorTree &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
+
+  // defMap: maps a reached block to the successor of `src` it came from.
+  DenseMap<BasicBlock const *, BasicBlock const *> defMap;
+  DenseSet<BasicBlock *> joins;
+
+  BlockQueue queue(*this);
+
+  auto schedule = [&defMap, &joins, &queue](BasicBlock *block,
+                                            BasicBlock const *defBlock) {
+    auto defIt = defMap.find(block);
+    // First time we meet this block; not a join (yet).
+    if (defIt == defMap.end()) {
+      queue.push(block);
+      defMap.insert({block, defBlock});
+    } else if (defIt->second != defBlock) {
+      // We've found a block that has two different incoming definitions; it is
+      // a join point.
+      joins.insert(block);
+    }
+  };
+
+  for (BasicBlock *const succ : successors(&src)) {
+    schedule(succ, succ);  // Each successor is its own definition.
+  }
+
+  auto *Node = PDT.getNode(&src);
+  assert(Node && "Could not get node");
+  auto *IDom = Node->getIDom();
+  assert(IDom && "Could not get IDom");
+  BasicBlock *PIDom = IDom->getBlock();
+  assert(PIDom && "Could not get block");
+
+  while (!queue.empty()) {
+    auto &curTag = queue.pop();
+    BasicBlock *cur = curTag.BB;
+
+    if (cur == PIDom) {
+      continue;  // Do not look past the immediate post-dominator of `src`.
+    }
+
+    BasicBlock const *const defBlock = defMap.find(cur)->second;
+
+    auto const *const curLTag = curTag.loop;
+    // If `cur` is the header of its loop, pretend the loop is a single node
+    // with the loop's unique exit blocks as successors.
+    if (curLTag && curLTag->header == cur) {
+      SmallVector<BasicBlock *, 2> exits;
+      curLTag->loop->getUniqueExitBlocks(exits);
+      for (BasicBlock *const exit : exits) {
+        if (exit == &src) {
+          continue;
+        }
+        schedule(exit, defBlock);
+      }
+    } else {
+      // The successors are either on the same loop level or loop exits.
+      for (BasicBlock *const succ : successors(cur)) {
+        if (succ == &src) {
+          continue;
+        }
+        schedule(succ, defBlock);
+      }
+    }
+  }
+
+  return joins;
+}
+
+DenseSet<BasicBlock *> DivergenceResult::escapePoints(BasicBlock const &src,
+                                                      Loop const &L) const {
+  LoopTag const &LTag = getTag(&L);
+
+  DenseSet<BasicBlock *> divergentExits;
+
+  DenseSet<BasicBlock const *> visited;
+  BlockQueue queue(*this);
+
+  queue.push(&src);
+  visited.insert(&src);
+
+  while (!queue.empty()) {
+    auto const &BBTag = queue.pop();
+    auto *const BB = BBTag.BB;
+
+    // We found a divergent loop exit.
+    if (!L.contains(BB)) {
+      divergentExits.insert(BB);
+      continue;  // Blocks outside `L` are recorded but not explored further.
+    }
+
+    bool allowLatch = true;
+    auto *const loopTag = BBTag.loop;
+    // `BB` is the latch of its loop.
+    if (loopTag && loopTag->latch == BB) {
+      if (loopTag == &LTag) {
+        // `BB` is the latch of the current loop; forbid the backedge.
+        allowLatch = false;
+      } else {
+        // Otherwise, forbid the backedge only if none of the remaining blocks
+        // in the queue belong to `L`, in which case no exit block starting
+        // from the header of the nested loop can be divergent.
+        allowLatch =
+            std::any_of(queue.begin(), queue.end(), [this, &L](size_t index) {
+              return L.contains(basicBlockTags[index].BB);
+            });
+      }
+    }
+
+    for (BasicBlock *succ : successors(BB)) {
+      if (BBTag.isLoopBackEdge(succ) && !allowLatch) {
+        continue;
+      }
+      if (visited.insert(succ).second) {
+        queue.push(succ);
+      }
+    }
+  }
+
+  return divergentExits;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+llvm::AnalysisKey DivergenceAnalysis::Key;
+
+DivergenceResult DivergenceAnalysis::run(llvm::Function &F,
+                                         llvm::FunctionAnalysisManager &AM) {
+  DivergenceResult Res(F, AM);
+
+  LLVM_DEBUG(dbgs() << "DIVERGENCE ANALYSIS\n");
+  Res.basicBlockTags.reserve(F.size() * 4);
+
+  // Prepare the BasicBlockTags.
+  LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
+  for (BasicBlock &BB : F) {
+    // Create BB info entries.
+    BasicBlockTag &BBTag = Res.getOrCreateTag(&BB);
+
+    // Update loop info.
+    if (Loop *L = LI.getLoopFor(&BB)) {
+      if (!BBTag.loop) {
+        BBTag.loop = &Res.getOrCreateTag(L);
+        BBTag.loop->latch = L->getLoopLatch();
+        BBTag.loop->header = L->getHeader();
+        BBTag.loop->preheader = L->getLoopPreheader();
+      }
+    }
+  }
+
+  // Find loop live values and update loop exit information.
+  Res.computeLoopOrdering();
+  for (auto *const LTag : Res.loopOrdering) {
+    SmallVector<BasicBlock *, 1> loopExitBlocks;
+    LTag->loop->getExitBlocks(loopExitBlocks);
+    for (BasicBlock *BB : loopExitBlocks) {
+      auto &BBTag = Res.getTag(BB);
+      // If BB already leaves a loop, update it if the previous loop is nested
+      // in the current.
+      if (BBTag.outermostExitedLoop) {
+        if (BBTag.outermostExitedLoop->loop->getLoopDepth() >
+            LTag->loop->getLoopDepth()) {
+          BBTag.outermostExitedLoop = LTag;
+        }
+      } else {
+        BBTag.outermostExitedLoop = LTag;
+      }
+
+      // LoopSimplify pass has already converted SSA form to LCSSA form.
+      // Let's use lcssa phi nodes to find loop live variables like llvm loop
+      // vectorizer.
+      // LoopSimplify pass is added on PreparationPass of vectorizer.cpp.
+      //
+      // See head comment on lib/Transforms/Utils/LCSSA.cpp
+      for (Instruction &I : *BB) {
+        if (PHINode *PHI = dyn_cast<PHINode>(&I)) {
+          // lcssa phi has incoming values defined in the loop.
+          for (Value *incoming : PHI->incoming_values()) {
+            if (Instruction *incomingInst = dyn_cast<Instruction>(incoming)) {
+              if (LTag->loop->contains(incomingInst->getParent())) {
+                LTag->loopLiveValues.insert(incoming);
+                LLVM_DEBUG(dbgs() << *incoming << " is a loop live value of "
+                                  << LTag->loop->getName() << "\n");
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // From the UVA, we know which conditions are varying which allows us to
+  // find divergent branches.
+  // Moreover, from divergent branches - and therefore from divergent paths -
+  // we can find more varying values that are computed on those divergent paths.
+  // The latter allows us to find more divergent branches, and so on...
+  // We take a local copy of the UVR because it is not good to modify one
+  // analysis result from another analysis. However, after Control Flow
+  // Conversion has been run, all control flow divergence is converted into
+  // non-uniform dataflow so any subsequent run of the UVA is still correct.
+  auto UVR = AM.getResult<UniformValueAnalysis>(F);
+  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+  auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
+
+  Res.computeBlockOrdering(DT);
+
+  std::vector<std::pair<BasicBlock *, Value *>> uniformBranches;
+  uniformBranches.reserve(F.size() - 1u);  // NOTE(review): wraps if F is empty.
+  for (BasicBlock &BB : F) {
+    if (BranchInst *B = dyn_cast<BranchInst>(BB.getTerminator())) {
+      if (B->isConditional()) {
+        uniformBranches.push_back({&BB, B->getCondition()});
+      }
+    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB.getTerminator())) {
+      uniformBranches.push_back({&BB, SI->getCondition()});
+    }
+  }
+
+  while (!uniformBranches.empty()) {
+    // Partition the list so all the varying branches are grouped at the end.
+    auto const varyingBranches =
+        std::partition(uniformBranches.begin(), uniformBranches.end(),
+                       [&UVR](std::pair<BasicBlock *, Value *> &p) -> bool {
+                         return !UVR.isVarying(p.second);
+                       });
+
+    // Process all the varying branches.
+    DivergenceInfo divergenceInfo;
+    for (auto it = varyingBranches; it != uniformBranches.end(); ++it) {
+      BasicBlock *BB = it->first;
+
+      // Find blocks diverged by varying branch block.
+      Res.markDivCausing(*BB, divergenceInfo, PDT);
+
+      if (auto const *const LTag = Res.getTag(BB).loop) {
+        Loop *L = LTag->loop;
+        while (L) {
+          // If BB is a varying branch, mark the loop as diverging if any two
+          // instances of a SIMD group can leave the loop over different exit
+          // edges and/or in different iterations. This means that BB cannot
+          // be postdominated by any block of L.
+          auto *Node = PDT.getNode(BB);
+          assert(Node && "Could not get node");
+          auto *IDom = Node->getIDom();
+          assert(IDom && "Could not get IDom");
+          BasicBlock *PIDom = IDom->getBlock();
+          if (!L->contains(PIDom)) {
+            Res.markDivLoopDivBlocks(*BB, *L, divergenceInfo);
+          } else {
+            // If the loop does not diverge because of `BB`, none of its
+            // parent loops can diverge either.
+            break;
+          }
+          L = L->getParentLoop();
+        }
+      }
+    }
+
+    // Remove all the varying branches from the end of the list.
+    uniformBranches.erase(varyingBranches, uniformBranches.end());
+
+    // PHIs defined in join point of divergent branches and in exit blocks of
+    // divergent loops are varying.
+    bool updated = false;
+    for (BasicBlock *BB : divergenceInfo) {
+      bool const exitedLoop = Res.getTag(BB).outermostExitedLoop;
+      for (Instruction &I : *BB) {
+        if (PHINode *PHI = dyn_cast<PHINode>(&I)) {
+          // Loop exits might have constant phi nodes (lcssa value).
+          if (exitedLoop || !PHI->hasConstantOrUndefValue()) {
+            if (!UVR.isVarying(&I)) {
+              updated = true;
+              UVR.markVaryingValues(&I);
+              LLVM_DEBUG(dbgs()
+                         << I.getName() << " is a varying instruction\n");
+            }
+          }
+        } else {
+          break;  // Stop at the first non-PHI instruction.
+        }
+      }
+    }
+    if (!updated) {
+      // We made no updates, so we processed all the varying branches.
+      break;
+    }
+  }
+
+  // All blocks that are predecessors of join points of div causing blocks and
+  // have a uniform condition must be marked as fake div causing blocks because
+  // divergence may have occurred at the div causing block and we must make sure
+  // we execute all paths that lead to the join point.
+  for (BasicBlock *BB : Res.fakeDivCausingBlocks) {
+    if (BB->getTerminator()->getNumSuccessors() > 1 && !Res.isDivCausing(*BB)) {
+      Res.setFlag(*BB, BlockDivergenceFlag::eBlockHasDivergentBranchFake);
+      LLVM_DEBUG(dbgs() << "Found fake div causing block " << BB->getName()
+                        << "\n");
+      // Because we have marked `BB` as a target for linearization, its join
+      // points must be marked as `blend` because they may lose some
+      // predecessors during the rewiring.
+      for (BasicBlock *join : Res.joinPoints(*BB)) {
+        Res.setFlag(*join, BlockDivergenceFlag::eBlockIsBlend);
+        LLVM_DEBUG(dbgs() << "\tBlock " << join->getName() << " is blend\n");
+      }
+    }
+  }
+
+  // By definition, the entry block is by_all.
+  Res.markByAll(F.getEntryBlock());
+
+  return Res;
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/instantiation_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/instantiation_analysis.cpp
new file mode 100644
index 0000000000000..434d427ddde03
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/instantiation_analysis.cpp
@@ -0,0 +1,131 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "analysis/instantiation_analysis.h"
+
+#include <compiler/utils/builtin_info.h>
+#include <multi_llvm/opaque_pointers.h>
+#include <multi_llvm/vector_type_helper.h>
+
+#include "analysis/uniform_value_analysis.h"
+#include "debugging.h"
+#include "memory_operations.h"
+#include "vectorization_context.h"
+
+#define DEBUG_TYPE "vecz-instantiation"
+
+using namespace vecz;
+using namespace llvm;
+
+namespace {
+bool analyzeType(Type *Ty) {  // False for void, vector, element types.
+  return !(Ty->isVoidTy() || Ty->isVectorTy() ||
+           FixedVectorType::isValidElementType(Ty));
+}
+
+bool analyzeMemOp(MemOp &Op) {
+  assert(isa<PointerType>(Op.getPointerType()) &&
+         multi_llvm::isOpaqueOrPointeeTypeMatches(
+             cast<PointerType>(Op.getPointerType()), Op.getDataType()) &&
+         "MemOp inconsistency");
+  return analyzeType(Op.getDataType());  // Decided by the data type alone.
+}
+
+bool analyzeCall(VectorizationContext const &Ctx, CallInst *CI) {
+  Function *Callee = CI->getCalledFunction();
+  VECZ_FAIL_IF(!Callee);  // No known callee to analyze.
+
+  // Handle internal builtins.
+  if (Ctx.isInternalBuiltin(Callee)) {
+    if (auto Op = MemOp::get(CI)) {
+      return analyzeMemOp(*Op);
+    }
+    return false;  // Internal builtins that are not memory operations.
+  }
+
+  // Handle functions that take at least one pointer parameter.
+  if (any_of(Callee->args(),
+             [](const Argument &A) { return A.getType()->isPointerTy(); })) {
+    return true;
+  }
+
+  // Handle masked function calls.
+  if (Ctx.isMaskedFunction(Callee)) {
+    return true;
+  }
+
+  auto const Props = Ctx.builtins().analyzeBuiltin(*Callee).properties;
+
+  // Intrinsics without side-effects can be safely instantiated.
+  if (Callee->isIntrinsic() &&
+      (Props & compiler::utils::eBuiltinPropertyNoSideEffects)) {
+    // If the intrinsic has a vector equivalent, then we can use it directly
+    // instead.
+    if (Props & compiler::utils::eBuiltinPropertyVectorEquivalent) {
+      return analyzeType(CI->getType());
+    }
+    return true;
+  }
+
+  // Functions returning void must have side-effects.
+  // We cannot vectorize them and instead we need to instantiate them.
+  bool HasSideEffects = Callee->getReturnType()->isVoidTy() ||
+                        (Props & compiler::utils::eBuiltinPropertySideEffects);
+  if (HasSideEffects &&
+      (Props & compiler::utils::eBuiltinPropertySupportsInstantiation)) {
+    return true;
+  }
+
+  return analyzeType(CI->getType());  // Otherwise decide on the result type.
+}
+
+bool analyzeAlloca(VectorizationContext const &Ctx, AllocaInst *alloca) {
+  // Possibly, we could packetize by creating a wider array, but for now let's
+  // just let instantiation deal with it.
+  if (alloca->isArrayAllocation()) {
+    return true;  // Array allocations are always instantiated.
+  }
+
+  // We can create an array of anything, however, we need to be careful of
+  // alignment. In the case the alloca has a specific alignment requirement, we
+  // have to be sure it divides the type allocation size, otherwise only the
+  // first vector element would necessarily be correctly aligned.
+  auto *const dataTy = alloca->getAllocatedType();
+  uint64_t const memSize = Ctx.dataLayout()->getTypeAllocSize(dataTy);
+  uint64_t const align = alloca->getAlign().value();
+  return (align != 0 && (memSize % align) != 0);  // Misaligned elements.
+}
+}  // namespace
+
+namespace vecz {
+bool needsInstantiation(VectorizationContext const &Ctx, Instruction &I) {
+  if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+    return analyzeCall(Ctx, CI);
+  } else if (LoadInst *Load = dyn_cast<LoadInst>(&I)) {
+    MemOp Op = *MemOp::get(Load);  // Dereferenced unchecked.
+    return analyzeMemOp(Op);
+  } else if (StoreInst *Store = dyn_cast<StoreInst>(&I)) {
+    MemOp Op = *MemOp::get(Store);  // Dereferenced unchecked.
+    return analyzeMemOp(Op);
+  } else if (AllocaInst *Alloca = dyn_cast<AllocaInst>(&I)) {
+    return analyzeAlloca(Ctx, Alloca);
+  } else if (isa<AtomicRMWInst>(&I) || isa<AtomicCmpXchgInst>(&I)) {
+    return true;  // Atomic RMW / cmpxchg always need instantiation.
+  } else {
+    return analyzeType(I.getType());  // Otherwise decide on the result type.
+  }
+}
+}  // namespace vecz
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/liveness_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/liveness_analysis.cpp
new file mode 100644
index 0000000000000..1dd7e04dbe0bf
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/liveness_analysis.cpp
@@ -0,0 +1,252 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Implementation based on Section 5.2 of the paper:
+// Florian Brandner, Benoit Boissinot, Alain Darte, Benoît Dupont de Dinechin,
+// Fabrice Rastello.
+// Computing Liveness Sets for SSA-Form Programs.
+// [Research Report] RR-7503, INRIA. 2011, pp.25. inria-00558509v2
+//
+// https://hal.inria.fr/inria-00558509v2
+
+#include "analysis/liveness_analysis.h"
+
+#include <llvm/ADT/SmallPtrSet.h>
+#include <llvm/Analysis/Passes.h>
+#include <llvm/IR/BasicBlock.h>
+#include <llvm/IR/CFG.h>
+#include <llvm/IR/Instructions.h>
+
+#include "vectorization_unit.h"
+
+using namespace llvm;
+using namespace vecz;
+
+llvm::AnalysisKey LivenessAnalysis::Key;
+
+namespace {
+
+// Returns true if V defines a variable and is likely to require a register.
+bool definesVariable(const Value &V) {
+  // Constants are likely to be immediate values.
+  if (isa<Constant>(V)) {
+    return false;
+  }
+
+  // If a value isn't used, it can't be live.
+  if (V.user_empty()) {
+    return false;
+  }
+  // Void, label, token and metadata typed values are not counted either.
+  const auto valueType = V.getType();
+  return !valueType->isVoidTy() && !valueType->isLabelTy() &&
+         !valueType->isTokenTy() && !valueType->isMetadataTy();
+}
+
+// Appends V to the set unless it is already the most recent entry.
+// Returns true if the value was appended, false otherwise.
+//
+// The implementation finishes processing one value before starting the
+// next, so a duplicate can only be the last element; only that is checked.
+inline bool pushOnce(BlockLivenessInfo::LiveSet &s, Value *V) {
+  const bool isNew = s.empty() || s.back() != V;
+  if (isNew) {
+    s.push_back(V);
+  }
+  return isNew;
+}
+
+}  // namespace
+
+class LivenessResult::Impl {
+ public:
+  Impl(LivenessResult &lr) : LR(lr) {}
+  // Rebuilds the live sets and register counts for every block of LR.F.
+  void recalculate();
+
+ private:
+  LivenessResult &LR;
+  // NOTE(review): no definition of computeByVar is visible in this chunk.
+  void computeByVar(const BasicBlock &BB);
+  // Adds V to the LiveIn/LiveOut sets of the blocks that use it outside BB.
+  void computeVar(Value *V, const BasicBlock *BB);
+  // Propagates liveness of V backwards from BB, stopping at `parent`.
+  void mark(Value *V, const BasicBlock *parent, const BasicBlock *BB);
+  // Computes BB's peak live-value count into MaxRegistersInBlock.
+  void calculateMaxRegistersInBlock(const llvm::BasicBlock *BB);
+
+  // Private utility: looks up the (already created) liveness info for BB.
+  inline BlockLivenessInfo &info(const BasicBlock *BB) const {
+    auto BIi = LR.BlockInfos.find(BB);
+    assert(BIi != LR.BlockInfos.end() && "Block Liveness Info does not exist!");
+    return BIi->second;
+  }
+};
+
+LivenessResult LivenessAnalysis::run(llvm::Function &F,
+                                     llvm::FunctionAnalysisManager &) {
+  Result R(F);
+  R.recalculate();  // Compute liveness eagerly before returning.
+  return R;
+}
+
+size_t LivenessResult::getMaxLiveVirtualRegisters() const {
+  return maxNumberOfLiveValues;  // Largest per-block MaxRegistersInBlock.
+}
+
+const BlockLivenessInfo &LivenessResult::getBlockInfo(
+    const BasicBlock *BB) const {
+  auto found = BlockInfos.find(BB);  // Asserted below: BB must have an entry.
+  assert(found != BlockInfos.end() && "No liveness information for BasicBlock");
+  return found->second;
+}
+
+void LivenessResult::recalculate() {
+  maxNumberOfLiveValues = 0;
+
+  BlockInfos.clear();  // Drop any previous results before recomputing.
+
+  Impl impl(*this);  // The helper writes its results straight into *this.
+  impl.recalculate();
+}
+
+void LivenessResult::Impl::recalculate() {
+  auto &F = LR.F;
+
+  // Create infos in advance so things don't relocate under our feet.
+  for (auto &BB : F) {
+    (void)LR.BlockInfos[&BB];  // operator[] inserts a default entry.
+  }
+
+  // Arguments are always live-ins of the entry block (if they are used).
+  {
+    auto *BB = &F.getEntryBlock();
+    auto &BI = info(BB);
+    for (auto &arg : F.args()) {
+      if (!arg.use_empty()) {
+        BI.LiveIn.push_back(&arg);
+        computeVar(&arg, BB);
+      }
+    }
+  }
+
+  // Add all other variables to the live sets.
+  for (auto &BB : F) {
+    auto &BI = LR.BlockInfos[&BB];
+    for (auto &I : BB) {
+      if (definesVariable(I)) {
+        if (isa<PHINode>(I)) {
+          // PHI nodes are always live-ins.
+          BI.LiveIn.push_back(&I);
+        }
+        computeVar(&I, &BB);
+      }
+    }
+  }
+
+  // Calculate the maximum number of live values in every block.
+  for (auto &BB : F) {
+    calculateMaxRegistersInBlock(&BB);
+  }
+
+  // Store the largest number of live values in the function.
+  for (const auto &entry : LR.BlockInfos) {
+    LR.maxNumberOfLiveValues = std::max(LR.maxNumberOfLiveValues,
+                                        entry.getSecond().MaxRegistersInBlock);
+  }
+}
+
+void LivenessResult::Impl::computeVar(Value *V, const BasicBlock *BB) {
+  SmallPtrSet<const BasicBlock *, 8> UseBlocks;  // Other blocks using V.
+  for (auto *User : V->users()) {
+    if (auto *UI = dyn_cast<Instruction>(User)) {
+      if (auto *PHI = dyn_cast<PHINode>(UI)) {
+        for (unsigned i = 0, n = PHI->getNumIncomingValues(); i != n; ++i) {
+          if (PHI->getIncomingValue(i) == V) {
+            const auto *Incoming = PHI->getIncomingBlock(i);
+            // A PHI use makes V live-out of the incoming block.
+            if (pushOnce(info(Incoming).LiveOut, V) && Incoming != BB) {
+              UseBlocks.insert(Incoming);
+            }
+          }
+        }
+      } else {
+        const auto *Parent = UI->getParent();
+        if (Parent != BB) {
+          UseBlocks.insert(Parent);
+        }
+      }
+    }
+  }
+  // V is live-in to each collected block; propagate from there upwards.
+  for (auto *UB : UseBlocks) {
+    if (pushOnce(info(UB).LiveIn, V)) {
+      mark(V, BB, UB);
+    }
+  }
+}
+
+void LivenessResult::Impl::mark(Value *V, const BasicBlock *parent,
+                                const BasicBlock *BB) {
+  // Propagate backward; stops at `parent` or when pushOnce() declines.
+  for (const auto *pred : predecessors(BB)) {
+    auto &PBI = info(pred);
+    if (pushOnce(PBI.LiveOut, V) && pred != parent && pushOnce(PBI.LiveIn, V)) {
+      mark(V, parent, pred);
+    }
+  }
+}
+
+void LivenessResult::Impl::calculateMaxRegistersInBlock(const BasicBlock *BB) {
+  auto &BI = LR.BlockInfos[BB];
+  SmallPtrSet<const Value *, 16> liveOut(BI.LiveOut.begin(), BI.LiveOut.end());
+  SmallPtrSet<const Value *, 16> seenButNotInLiveOut;
+
+  auto maxRegistersUsed = liveOut.size();
+  auto registersUsed = liveOut.size();
+
+  // Walk backwards through instructions in a block to count the maximum number
+  // of live values in that block.
+  for (auto &inst : make_range(BB->rbegin(), BB->rend())) {
+    // Phi nodes were in live out or were counted as operands. No need to
+    // decrement registersUsed, as one of the arguments used a register.
+    if (isa<PHINode>(&inst)) {
+      break;
+    }
+
+    // Operands are live so they use a register. Increment registersUsed if not
+    // in live out or already counted.
+    for (const auto *operand : inst.operand_values()) {
+      if (definesVariable(*operand) && !liveOut.count(operand) &&
+          !seenButNotInLiveOut.count(operand)) {
+        registersUsed++;
+        seenButNotInLiveOut.insert(operand);
+      }
+    }
+
+    // If inst defines a variable, one less register was used before it.
+    if (definesVariable(inst)) {
+      registersUsed--;
+    }
+
+    maxRegistersUsed = std::max(registersUsed, maxRegistersUsed);
+  }
+
+  assert(registersUsed == BI.LiveIn.size() &&
+         "Final number of live values inconsistent with live-in");
+
+  BI.MaxRegistersInBlock = maxRegistersUsed;
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/packetization_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/packetization_analysis.cpp
new file mode 100644
index 0000000000000..9613422675414
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/packetization_analysis.cpp
@@ -0,0 +1,180 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "analysis/packetization_analysis.h"
+
+#include <compiler/utils/mangling.h>
+#include <llvm/IR/Instructions.h>
+#include <llvm/IR/IntrinsicInst.h>
+#include <llvm/IR/Module.h>
+#include <llvm/Support/Debug.h>
+#include <multi_llvm/opaque_pointers.h>
+
+#include "analysis/stride_analysis.h"
+#include "analysis/uniform_value_analysis.h"
+#include "debugging.h"
+#include "memory_operations.h"
+#include "offset_info.h"
+#include "vectorization_context.h"
+#include "vectorization_unit.h"
+
+#define DEBUG_TYPE "vecz"
+
+using namespace vecz;
+using namespace llvm;
+
+namespace {
+bool isDivergenceReduction(const Function &F) {
+  compiler::utils::Lexer Name(F.getName());  // Prefix, then "divergence_".
+  return Name.Consume(VectorizationContext::InternalBuiltinPrefix) &&
+         Name.Consume("divergence_");
+}
+}  // namespace
+
+llvm::AnalysisKey PacketizationAnalysis::Key;
+
+PacketizationAnalysisResult::PacketizationAnalysisResult(
+    llvm::Function &f, StrideAnalysisResult &sar)
+    : F(f), SAR(sar), UVR(sar.UVR) {
+  // Mark every varying block terminator for packetization.
+  for (BasicBlock &BB : F) {
+    auto *TI = BB.getTerminator();
+    if (UVR.isVarying(TI)) {
+      markForPacketization(TI);
+    }
+  }
+
+  // Then mark the other instructions, starting at the vector leaves.
+  std::vector<Instruction *> Leaves;
+  UVR.findVectorLeaves(Leaves);
+
+  // Traverse the function from the leaves to find instructions that need to be
+  // packetized.
+  for (Instruction *I : Leaves) {
+    markForPacketization(I);
+  }
+}
+
+void PacketizationAnalysisResult::markForPacketization(Value *V) {
+  if (!toPacketize.insert(V).second) {
+    return;  // Already marked.
+  }
+
+  auto *const I = dyn_cast<Instruction>(V);
+  if (!I) {
+    return;  // Only instructions have their operands traced.
+  }
+
+  if (auto *phi = dyn_cast<PHINode>(I)) {
+    for (unsigned i = 0, n = phi->getNumIncomingValues(); i < n; ++i) {
+      auto *const incoming = phi->getIncomingValue(i);
+      if (UVR.isVarying(incoming)) {
+        markForPacketization(incoming);
+      }
+    }
+    return;
+  }
+
+  auto mo = MemOp::get(I);
+  if (UVR.isMaskVarying(I)) {
+    if (mo) {
+      markForPacketization(mo->getMaskOperand());
+      return;  // Only the mask of a mask-varying memory operation is marked.
+    }
+
+    if (auto *const CI = dyn_cast<CallInst>(I)) {
+      Function *Callee = CI->getCalledFunction();
+      if (Callee && UVR.Ctx.isInternalBuiltin(Callee) &&
+          isDivergenceReduction(*Callee)) {
+        markForPacketization(CI->getOperand(0));
+        return;
+      }
+    }
+  }
+
+  if (mo) {
+    auto *const ptr = mo->getPointerOperand();
+    if (ptr && UVR.isVarying(ptr)) {
+      auto const *info = SAR.getInfo(ptr);
+      assert(info && "markForPacketization: Unable to obtain stride info");
+
+      bool hasValidStride = info->hasStride();
+
+      // Analyse the computed stride to see if the pointer will need to be
+      // packetized. No packetization is necessary where a contiguous or
+      // interleaved memop can be created, since only the pointer to the
+      // first element will be used.
+      if (hasValidStride) {
+        // Get the pointer stride as a number of elements
+        auto *const eltTy = mo->getDataType();
+        assert(multi_llvm::isOpaqueOrPointeeTypeMatches(
+                   cast<PointerType>(ptr->getType()), eltTy) &&
+               "MemOp assumption broken");
+        if (eltTy->isVectorTy() || eltTy->isPointerTy()) {
+          // No interleaved memops exist for vector element types or pointer
+          // types. We can only vectorize pointer loads/stores or widen vector
+          // load/stores if they are contiguous.
+          auto const stride = info->getConstantMemoryStride(
+              eltTy, &F.getParent()->getDataLayout());
+          if (stride != 1) {
+            hasValidStride = false;
+          }
+        } else if (!VectorType::isValidElementType(eltTy)) {
+          hasValidStride = false;
+        }
+      }
+
+      // Only mark the pointer for packetization if it does not have a
+      // valid linear stride
+      if (!hasValidStride) {
+        markForPacketization(ptr);
+      }
+    }
+
+    auto *const data = mo->getDataOperand();
+    auto *const mask = mo->getMaskOperand();
+    if (data && UVR.isVarying(data)) {
+      markForPacketization(data);
+    }
+    if (mask && UVR.isVarying(mask)) {
+      markForPacketization(mask);
+    }
+    return;
+  }
+
+  if (auto *const intrinsic = dyn_cast<llvm::IntrinsicInst>(I)) {
+    auto const intrinsicID = intrinsic->getIntrinsicID();
+    if (intrinsicID == llvm::Intrinsic::lifetime_end ||
+        intrinsicID == llvm::Intrinsic::lifetime_start) {
+      // We don't trace through lifetime intrinsics.
+      return;
+    }
+  }
+
+  // Mark any varying operands for packetization.
+  for (unsigned i = 0, n = I->getNumOperands(); i != n; ++i) {
+    auto *const opI = I->getOperand(i);
+    if (UVR.isVarying(opI)) {
+      markForPacketization(opI);
+    }
+  }
+}
+
+PacketizationAnalysisResult PacketizationAnalysis::run(
+    Function &F, llvm::FunctionAnalysisManager &AM) {
+  auto &SAR = AM.getResult<StrideAnalysis>(F);
+  return Result(F, SAR);  // The constructor performs the whole analysis.
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/simd_width_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/simd_width_analysis.cpp
new file mode 100644
index 0000000000000..ba8eeaa77a09b
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/simd_width_analysis.cpp
@@ -0,0 +1,196 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "analysis/simd_width_analysis.h"
+
+#include <compiler/utils/builtin_info.h>
+#include <llvm/ADT/DenseMap.h>
+#include <llvm/ADT/DepthFirstIterator.h>
+#include <llvm/ADT/SmallPtrSet.h>
+#include <llvm/ADT/SmallSet.h>
+#include <llvm/Analysis/LoopInfo.h>
+#include <llvm/Analysis/TargetTransformInfo.h>
+#include <llvm/IR/CFG.h>
+#include <llvm/IR/Function.h>
+#include <llvm/Support/Debug.h>
+#include <llvm/Support/raw_ostream.h>
+#include <multi_llvm/vector_type_helper.h>
+
+#include <algorithm>
+#include <cmath>
+#include <numeric>
+#include <vector>
+
+#include "analysis/liveness_analysis.h"
+#include "analysis/packetization_analysis.h"
+#include "analysis/vectorization_unit_analysis.h"
+#include "debugging.h"
+#include "vectorization_unit.h"
+#include "vecz/vecz_target_info.h"
+
+#define DEBUG_TYPE "vecz-simd-width"
+
+using namespace llvm;
+using namespace vecz;
+
+llvm::AnalysisKey SimdWidthAnalysis::Key;
+
+namespace {
+bool definedOrUsedInLoop(Value *V, Loop *L) {
+  if (!L) {
+    // There is no enclosing loop, so every value is taken into account.
+    return true;
+  }
+
+  auto const *const Def = dyn_cast<Instruction>(V);
+  if (Def && L->contains(Def)) {
+    // The value is produced by an instruction inside the loop.
+    return true;
+  }
+
+  // Otherwise look for a user inside the loop that is not a PHI node. A value
+  // defined outwith the loop and used only by PHI nodes within it must be a
+  // loop-carried variable's initial value; unless it is also used directly in
+  // the loop, it is not really live inside the loop.
+  for (auto const *const U : V->users()) {
+    auto const *const UserI = dyn_cast<Instruction>(U);
+    if (UserI && !isa<PHINode>(UserI) && L->contains(UserI)) {
+      return true;
+    }
+  }
+  return false;
+}
+}  // namespace
+
+// Avoid Spill implementation. It focuses on avoiding register spills by
+// optimizing register pressure.
+unsigned SimdWidthAnalysis::avoidSpillImpl(Function &F,
+                                           FunctionAnalysisManager &AM,
+                                           unsigned MinWidth) {
+  VectorizationUnit &VU = AM.getResult<VectorizationUnitAnalysis>(F).getVU();
+  const TargetTransformInfo TTI = VU.context().getTargetTransformInfo(F);
+  const auto &Liveness = AM.getResult<LivenessAnalysis>(F);
+  const auto &PAR = AM.getResult<PacketizationAnalysis>(F);
+  const LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
+  // Determine the SIMD width based on a live values register usage estimation.
+  assert(!VU.width().isScalable() && "Can't handle scalable-vectors");
+  unsigned SimdWidth = VU.width().getFixedValue();
+  assert(SimdWidth != 0 && "SimdWidthAnalysis: SimdWidth == 0");
+
+  SmallSet<const Value *, 16> OpenIntervals;
+  SmallVector<const Value *, 16> IntervalArray;
+
+  auto ShouldConsider = [&](const Value *V) -> bool {
+    // Filter out work item builtin calls such as get_local_id()
+    if (auto *const CI = dyn_cast<CallInst>(V)) {
+      const Function *Callee = CI->getCalledFunction();
+      if (Callee &&
+          VU.context().builtins().analyzeBuiltin(*Callee).properties ==
+              compiler::utils::eBuiltinPropertyWorkItem) {
+        return false;
+      }
+    }
+    return true;
+  };
+
+  LLVM_DEBUG(dbgs() << "VEC(REG): Calculating max register usage:\n");
+  for (const auto &BB : F) {
+    // Get the LiveIns for this Basic Block.
+    // The principle of the Loop Aware SIMD Width Analysis is that it is not
+    // acceptable to spill values in the middle of a loop, however it may be
+    // acceptable to spill some values before entering a loop.
+    const auto &BI = Liveness.getBlockInfo(&BB);
+    OpenIntervals.clear();
+    auto *const CurLoop = LI.getLoopFor(&BB);
+    for (auto *V : BI.LiveOut) {
+      if (ShouldConsider(V) && PAR.needsPacketization(V) &&
+          definedOrUsedInLoop(V, CurLoop)) {
+        OpenIntervals.insert(V);
+      }
+    }
+
+    // Walk backwards through instructions in a block to count the maximum
+    // number of live values in that block.
+    for (auto &inst : make_range(BB.rbegin(), BB.rend())) {
+      if (isa<PHINode>(&inst)) {
+        break;
+      }
+
+      // The first instruction in the reverse range will be the terminator,
+      // so we don't really need to consider it. However we do need to consider
+      // the live set at the point before the last (i.e. first) instruction, so
+      // we deal with the operands first and then process the live set.
+      if (PAR.needsPacketization(&inst)) {
+        bool isGEP = isa<GetElementPtrInst>(&inst);
+        for (auto operand : inst.operand_values()) {
+          if (isa<Instruction>(operand) || isa<Argument>(operand)) {
+            if (!isGEP || PAR.needsPacketization(operand)) {
+              OpenIntervals.insert(operand);
+            }
+          }
+        }
+      }
+
+      OpenIntervals.erase(&inst);
+      IntervalArray.assign(OpenIntervals.begin(), OpenIntervals.end());
+      SimdWidth = VU.context().targetInfo().estimateSimdWidth(
+          TTI, IntervalArray, SimdWidth);
+      LLVM_DEBUG(dbgs() << "VEC(REG): Interval # " << OpenIntervals.size()
+                        << " at SIMD Width " << SimdWidth << '\n');
+      LLVM_DEBUG(
+          for (auto OII = OpenIntervals.begin(), OIIE = OpenIntervals.end();
+               OII != OIIE; OII++) { dbgs() << "inst:" << **OII << '\n'; });
+
+      if (SimdWidth < MinWidth) {
+        return 0;
+      }
+    }
+  }
+
+  LLVM_DEBUG(dbgs() << "VEC(REG): Found widest fitting SIMD width: "
+                    << SimdWidth << '\n');
+  return SimdWidth;
+}
+
+SimdWidthAnalysis::Result SimdWidthAnalysis::run(
+    Function &F, llvm::FunctionAnalysisManager &AM) {
+  TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
+  VectorizationUnit &VU = AM.getResult<VectorizationUnitAnalysis>(F).getVU();
+
+  // If the target does not provide vector registers, return 0.
+  MaxVecRegBitWidth = multi_llvm::getFixedValue(
+      TTI.getRegisterBitWidth(llvm::TargetTransformInfo::RGK_FixedWidthVector));
+
+  if (MaxVecRegBitWidth == 0) {
+    return 0;
+  }
+
+  // If the vectorization factor is for scalable vectors, return 0.
+  if (VU.width().isScalable()) {
+    return 0;
+  }
+
+  auto SimdWidth = avoidSpillImpl(F, AM, 1);
+  if (SimdWidth != 0 && SimdWidth < 4) {
+    // We only return 0 (i.e. don't vectorize) in the case that the packetized
+    // values wouldn't fit into vector registers even with a factor of 1. If
+    // the packetized values fit into vector registers for any width, we use
+    // a baseline factor of 4 since this is empirically better than 2.
+    SimdWidth = 4;
+  }
+
+  return SimdWidth;
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/stride_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/stride_analysis.cpp
new file mode 100644
index 0000000000000..e4c61dd941ebc
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/stride_analysis.cpp
@@ -0,0 +1,88 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "analysis/stride_analysis.h"
+
+#include <llvm/IR/Instructions.h>
+#include <llvm/IR/Module.h>
+#include <llvm/Support/Debug.h>
+
+#include "analysis/uniform_value_analysis.h"
+#include "debugging.h"
+#include "memory_operations.h"
+#include "offset_info.h"
+#include "vectorization_context.h"
+#include "vectorization_unit.h"
+
+#define DEBUG_TYPE "vecz"
+
+using namespace vecz;
+using namespace llvm;
+
+llvm::AnalysisKey StrideAnalysis::Key;
+
+OffsetInfo &StrideAnalysisResult::analyze(Value *V) {
+  auto const find = analyzed.find(V);
+  if (find != analyzed.end()) {
+    return find->second;
+  }
+
+  // We construct it on the stack first, and copy it into the map, because
+  // the constructor itself can create more things in the map and constructing
+  // it in-place could result in the storage being re-allocated while the
+  // constructor is still running.
+  auto const OI = OffsetInfo(*this, V);
+  return analyzed.try_emplace(V, OI).first->second;
+}
+
+StrideAnalysisResult::StrideAnalysisResult(llvm::Function &f,
+                                           UniformValueResult &uvr)
+    : F(f), UVR(uvr), assumptions(F) {
+  for (auto &BB : F) {
+    for (auto &I : BB) {
+      if (!UVR.isVarying(&I)) {
+        continue;
+      }
+
+      if (auto mo = MemOp::get(&I)) {
+        auto *const ptr = mo->getPointerOperand();
+        analyze(ptr);
+      }
+    }
+  }
+}
+
+void StrideAnalysisResult::manifestAll(IRBuilder<> &B) {
+  auto const savedIP = B.saveIP();  // block + iterator: valid even at end()
+  for (auto &info : analyzed) {
+    info.second.manifest(B, *this);
+  }
+  B.restoreIP(savedIP);
+}
+
+Value *StrideAnalysisResult::buildMemoryStride(IRBuilder<> &B, llvm::Value *Ptr,
+                                               llvm::Type *EleTy) const {
+  if (auto *const info = getInfo(Ptr)) {
+    return info->buildMemoryStride(B, EleTy, &F.getParent()->getDataLayout());
+  }
+  return nullptr;
+}
+
+StrideAnalysisResult StrideAnalysis::run(llvm::Function &F,
+                                         llvm::FunctionAnalysisManager &AM) {
+  // Stride information is derived from the function's uniform value analysis.
+  return Result(F, AM.getResult<UniformValueAnalysis>(F));
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp
new file mode 100644
index 0000000000000..98d6ae406fae2
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp
@@ -0,0 +1,501 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "analysis/uniform_value_analysis.h"
+
+#include <compiler/utils/builtin_info.h>
+#include <compiler/utils/mangling.h>
+#include <llvm/IR/Instructions.h>
+#include <llvm/IR/Module.h>
+#include <llvm/Support/Debug.h>
+
+#include <cstdlib>
+
+#include "analysis/instantiation_analysis.h"
+#include "analysis/vectorization_unit_analysis.h"
+#include "debugging.h"
+#include "memory_operations.h"
+#include "vectorization_unit.h"
+
+#define DEBUG_TYPE "vecz"
+
+using namespace vecz;
+using namespace llvm;
+
+namespace {
+
+// Find leaves by recursing through an instruction's uses
+bool findStrayLeaves(UniformValueResult &UVR, Instruction &I,
+                     DenseSet<Instruction *> &Visited) {
+  for (Use &U : I.uses()) {
+    auto *User = U.getUser();
+    if (isa<StoreInst>(User) || isa<AtomicRMWInst>(User) ||
+        isa<AtomicCmpXchgInst>(User)) {
+      if (UVR.isValueOrMaskVarying(User)) {
+        return true;
+      }
+    } else if (auto *CI = dyn_cast<CallInst>(User)) {
+      if (CI->use_empty()) {
+        // Any call instruction with no uses is counted as a leaf. This case
+        // should also cover any kind of masked stores, since masked stores are
+        // builtin calls with no uses, there is no need to explicitly check for
+        // masked stores.
+        if (UVR.isValueOrMaskVarying(CI)) {
+          return true;
+        }
+      }
+    } else if (auto *UI = dyn_cast<Instruction>(User)) {
+      if (isa<LoadInst>(User)) {
+        // Don't trace through loads
+      } else if (Visited.insert(UI).second) {
+        if (findStrayLeaves(UVR, *UI, Visited)) {
+          return true;
+        }
+      }
+    }
+  }
+  return false;
+}
+
+bool isDivergenceReduction(const Function &F) {
+  compiler::utils::Lexer L(F.getName());
+  return (L.Consume(VectorizationContext::InternalBuiltinPrefix) &&
+          L.Consume("divergence_"));
+}
+
+}  // namespace
+
+UniformValueResult::UniformValueResult(Function &F, VectorizationUnit &vu)
+    : F(F), VU(vu), Ctx(VU.context()), dimension(VU.dimension()) {}
+
+bool UniformValueResult::isVarying(const Value *V) const {
+  // Values with no entry in the map are reported as not varying; only an
+  // explicit eValueVarying entry makes the value itself varying.
+  auto const it = varying.find(V);
+  bool const recorded = it != varying.end();
+  return recorded && it->second == VaryingKind::eValueVarying;
+}
+
+bool UniformValueResult::isMaskVarying(const Value *V) const {
+  // Values with no entry in the map are reported as not mask-varying; only an
+  // explicit eMaskVarying entry qualifies.
+  auto const it = varying.find(V);
+  bool const recorded = it != varying.end();
+  return recorded && it->second == VaryingKind::eMaskVarying;
+}
+
+bool UniformValueResult::isValueOrMaskVarying(const Value *V) const {
+  // Values with no entry in the map are reported as uniform; any recorded
+  // kind other than eValueUniform counts as varying here.
+  auto const it = varying.find(V);
+  bool const recorded = it != varying.end();
+  return recorded && it->second != VaryingKind::eValueUniform;
+}
+
+/// @brief Utility function to check whether an instruction is a call to a
+/// subgroup reduction or subgroup broadcast operation.
+///
+/// @param[in] I Instruction to check
+/// @param[in] BI BuiltinInfo for platform-specific builtin IDs
+/// @return true if the instruction is a call to a subgroup reduction or
+/// subgroup broadcast builtin.
+static bool isSubgroupBroadcastOrReduction(
+    const Instruction &I, const compiler::utils::BuiltinInfo &BI) {
+  auto *const CI = dyn_cast<CallInst>(&I);
+  if (!CI) {
+    return false;
+  }
+  auto *const Callee = CI->getCalledFunction();
+  if (!Callee) {
+    return false;
+  }
+  auto const Builtin = BI.analyzeBuiltin(*Callee);
+  if (compiler::utils::eBuiltinSubgroupReduceInvalid !=
+      BI.getBuiltinSubgroupReductionKind(Builtin)) {
+    return true;
+  }
+  return Builtin.isValid() && Builtin.ID == BI.getSubgroupBroadcastBuiltin();
+}
+
+void UniformValueResult::findVectorLeaves(
+    std::vector<Instruction *> &Leaves) const {
+  compiler::utils::BuiltinInfo &BI = Ctx.builtins();
+  for (BasicBlock &BB : F) {
+    for (Instruction &I : BB) {
+      // Subgroup reductions and broadcasts are always vector leaves regardless
+      // of uniformity.
+      if (isSubgroupBroadcastOrReduction(I, BI)) {
+        Leaves.push_back(&I);
+        continue;
+      }
+
+      if (!isVarying(&I)) {
+        if (isMaskVarying(&I)) {
+          // it's a leaf if only its mask operand is varying, since the value
+          // itself will be uniform and won't propagate "varying" to its users.
+          Leaves.push_back(&I);
+          continue;
+        }
+        if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+          Function *Callee = CI->getCalledFunction();
+          if (!Callee) {
+            continue;
+          }
+
+          // If its a call to user defined function whose use is empty, and is
+          // uniform then add it to the leaves
+          if (!Callee->isIntrinsic() && CI->use_empty()) {
+            // Try to identify the called function
+            auto const Builtin = BI.analyzeBuiltin(*Callee);
+            if (!Builtin.isValid()) {
+              Leaves.push_back(CI);
+            }
+          }
+        }
+        continue;
+      }
+
+      if (StoreInst *Store = dyn_cast<StoreInst>(&I)) {
+        Instruction *Ptr = dyn_cast<Instruction>(Store->getPointerOperand());
+        if (Ptr && isVarying(Ptr)) {
+          Leaves.push_back(Store);
+        }
+        continue;
+      }
+
+      if (ReturnInst *Ret = dyn_cast<ReturnInst>(&I)) {
+        Leaves.push_back(Ret);
+        continue;
+      }
+
+      if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(&I)) {
+        Leaves.push_back(RMW);
+        continue;
+      } else if (AtomicCmpXchgInst *CmpXchg = dyn_cast<AtomicCmpXchgInst>(&I)) {
+        Leaves.push_back(CmpXchg);
+        continue;
+      }
+
+      // Functions that have no uses are leaves.
+      if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+        bool IsCallLeaf = false;
+        if (CI->use_empty()) {
+          IsCallLeaf = true;
+        } else if (auto Op = MemOp::get(CI)) {
+          // Handle masked stores.
+          if (Op->isStore() &&
+              (Op->isMaskedMemOp() || Op->isMaskedInterleavedMemOp() ||
+               Op->isMaskedScatterGatherMemOp())) {
+            IsCallLeaf = true;
+          }
+        }
+        if (IsCallLeaf) {
+          Leaves.push_back(CI);
+          continue;
+        }
+      }
+    }
+  }
+}
+
+void UniformValueResult::findVectorRoots(std::vector<Value *> &Roots) const {
+  compiler::utils::BuiltinInfo &BI = Ctx.builtins();
+  for (BasicBlock &BB : F) {
+    for (Instruction &I : BB) {
+      CallInst *CI = dyn_cast<CallInst>(&I);
+      if (!CI || !CI->getCalledFunction()) {
+        continue;
+      }
+      auto const Builtin = BI.analyzeBuiltinCall(*CI, dimension);
+      auto const Uniformity = Builtin.uniformity;
+      if (Uniformity == compiler::utils::eBuiltinUniformityInstanceID ||
+          Uniformity == compiler::utils::eBuiltinUniformityMaybeInstanceID) {
+        // Calls to `get_global_id`/`get_local_id` are roots.
+        Roots.push_back(CI);
+      } else if ((Uniformity == compiler::utils::eBuiltinUniformityNever) &&
+                 !CI->getType()->isVoidTy()) {
+        // Non-void builtins with side-effects are also roots.
+        Roots.push_back(CI);
+      }
+    }
+  }
+
+  // Add vectorized arguments to the list of roots.
+  for (const VectorizerTargetArgument &TargetArg : VU.arguments()) {
+    if (!TargetArg.IsVectorized && !TargetArg.PointerRetPointeeTy) {
+      continue;
+    }
+
+    if (&F == VU.scalarFunction()) {
+      Roots.push_back(TargetArg.OldArg);
+    } else if (&F == VU.vectorizedFunction()) {
+      if (TargetArg.Placeholder) {
+        Roots.push_back(TargetArg.Placeholder);
+      } else {
+        Roots.push_back(TargetArg.NewArg);
+      }
+    }
+  }
+}
+
+AllocaInst *UniformValueResult::findAllocaFromPointer(Value *Pointer) {
+  while (Pointer) {
+    if (AllocaInst *Alloca = dyn_cast<AllocaInst>(Pointer)) {
+      return Alloca;
+    } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Pointer)) {
+      Pointer = GEP->getPointerOperand();
+    } else if (BitCastInst *BC = dyn_cast<BitCastInst>(Pointer)) {
+      Pointer = BC->getOperand(0);
+    } else if (LoadInst *Load = dyn_cast<LoadInst>(Pointer)) {
+      Pointer = Load->getPointerOperand();
+    } else {
+      return nullptr;
+    }
+  }
+
+  return nullptr;
+}
+
+void UniformValueResult::markVaryingValues(Value *V, Value *From) {
+  auto &vary = varying[V];
+  // Do not visit values twice.
+  if (vary == VaryingKind::eValueVarying) {
+    return;
+  }
+
+  if (CallInst *CI = dyn_cast<CallInst>(V)) {
+    // Some builtins produce a uniform value regardless of their inputs.
+    Function *Callee = CI->getCalledFunction();
+    if (Callee) {
+      compiler::utils::BuiltinInfo &BI = Ctx.builtins();
+      auto const Builtin = BI.analyzeBuiltinCall(*CI, dimension);
+      auto const Uniformity = Builtin.uniformity;
+      if (Uniformity == compiler::utils::eBuiltinUniformityAlways) {
+        return;
+      }
+
+      if (auto Op = MemOp::get(CI)) {
+        // The mask cannot affect the MemOp value, even though we may still
+        // need to packetize the mask..
+        auto *Mask = Op->getMaskOperand();
+        if (Mask && From == Mask) {
+          vary = VaryingKind::eMaskVarying;
+          return;
+        }
+      } else if (Ctx.isInternalBuiltin(Callee)) {
+        // A divergence reduction builtin's value is uniform even though its
+        // argument is not, since it is a reduction over the SIMD width.
+        if (isDivergenceReduction(*Callee)) {
+          vary = VaryingKind::eMaskVarying;
+          return;
+        }
+      }
+    }
+  }
+
+  // Mark V as being varying.
+  vary = VaryingKind::eValueVarying;
+  LLVM_DEBUG(dbgs() << "vecz: Needs packetization: " << *V << "\n");
+
+  // Visit all users of V, they are varying too.
+  for (Use &Use : V->uses()) {
+    User *User = Use.getUser();
+    markVaryingValues(User, V);
+  }
+
+  // Mark uses of V for certain kinds of values.
+  Instruction *VIns = dyn_cast<Instruction>(V);
+  if (!VIns) {
+    return;
+  }
+
+  if (StoreInst *Store = dyn_cast<StoreInst>(VIns)) {
+    // Find the base address for the store. Storing varying values to an
+    // alloca location requires the alloca to be vectorized.
+    // We don't want to use extractMemOffset here because this requires the
+    // uniform value analysis to be finished.
+    AllocaInst *Alloca = findAllocaFromPointer(Store->getPointerOperand());
+    if (Alloca) {
+      markVaryingValues(Alloca);
+    }
+  } else if (LoadInst *Load = dyn_cast<LoadInst>(VIns)) {
+    AllocaInst *Alloca = findAllocaFromPointer(Load->getPointerOperand());
+    if (Alloca) {
+      markVaryingValues(Alloca);
+    }
+  } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(VIns)) {
+    // Same as with the stores
+    AllocaInst *Alloca = findAllocaFromPointer(GEP->getPointerOperand());
+    if (Alloca) {
+      markVaryingValues(Alloca);
+    }
+  } else if (BitCastInst *BC = dyn_cast<BitCastInst>(VIns)) {
+    // Same as with the stores
+    AllocaInst *Alloca = findAllocaFromPointer(BC->getOperand(0));
+    if (Alloca) {
+      markVaryingValues(Alloca);
+    }
+  } else if (CallInst *CI = dyn_cast<CallInst>(VIns)) {
+    // Stores might be function calls as well
+    // Known MemOps have one known pointer operand which we can check.
+    if (auto Op = MemOp::get(CI)) {
+      if (auto *const Ptr = Op->getPointerOperand()) {
+        if (auto *Alloca = findAllocaFromPointer(Ptr)) {
+          markVaryingValues(Alloca);
+        }
+      }
+    } else {
+      // Check all parameters of unknown calls with pointer arguments.
+      for (auto &A : CI->args()) {
+        if (A->getType()->isPointerTy()) {
+          if (auto *Alloca = findAllocaFromPointer(A)) {
+            markVaryingValues(Alloca);
+          }
+        }
+      }
+    }
+  }
+}
+
+Value *UniformValueResult::extractMemBase(Value *Address) {
+  if (BitCastInst *BCast = dyn_cast<BitCastInst>(Address)) {
+    return extractMemBase(BCast->getOperand(0));
+  } else if (auto *ASCast = dyn_cast<AddrSpaceCastInst>(Address)) {
+    return extractMemBase(ASCast->getOperand(0));
+  } else if (isa<IntToPtrInst>(Address)) {
+    return Address;
+  } else if (isa<Argument>(Address)) {
+    return Address;
+  } else if (isa<GlobalVariable>(Address)) {
+    return Address;
+  } else if (isa<AllocaInst>(Address)) {
+    return Address;
+  } else if (auto *const Phi = dyn_cast<PHINode>(Address)) {
+    // If all the incoming values are the same, we can trace through it. In
+    // the general case, it's not trivial to check that the stride is the same
+    // from every incoming block, and since incoming values may not dominate
+    // the IRBuilder insert point, we might not even be able to build the
+    // offset expression instructions there.
+    if (auto *const CVal = Phi->hasConstantValue()) {
+      return extractMemBase(CVal);
+    }
+
+    // In the simple case of a loop-incremented pointer using a GEP, we can
+    // handle it thus:
+    auto NumIncoming = Phi->getNumIncomingValues();
+    if (NumIncoming != 2) {
+      // Perhaps we can handle more than one loop latch, but not yet.
+      return nullptr;
+    }
+
+    if (auto *const GEP =
+            dyn_cast<GetElementPtrInst>(Phi->getIncomingValue(1))) {
+      // If it's a simple loop iterator, the base can be analyzed from the
+      // initial value.
+      if (GEP->getPointerOperand() == Phi) {
+        for (auto const &index : GEP->indices()) {
+          if (isVarying(index.get())) {
+            return nullptr;
+          }
+        }
+        return extractMemBase(Phi->getIncomingValue(0));
+      }
+    }
+
+    return nullptr;
+  } else if (auto *GEP = dyn_cast<GetElementPtrInst>(Address)) {
+    // Try to recursively extract the base from the GEP base.
+    return extractMemBase(GEP->getPointerOperand());
+  } else if (isVarying(Address)) {
+    // If it's varying we can't analyze it any further.
+    return nullptr;
+  } else {
+    // If it's uniform we can just return the uniform address.
+    return Address;
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+llvm::AnalysisKey UniformValueAnalysis::Key;
+
+UniformValueResult UniformValueAnalysis::run(
+    llvm::Function &F, llvm::FunctionAnalysisManager &AM) {
+  VectorizationUnit &VU = AM.getResult<VectorizationUnitAnalysis>(F).getVU();
+  UniformValueResult Res(F, VU);
+  std::vector<Value *> Roots;
+  Res.findVectorRoots(Roots);
+
+  // Mark all roots and their uses as being varying.
+  for (Value *Root : Roots) {
+    Res.markVaryingValues(Root);
+  }
+
+  compiler::utils::BuiltinInfo &BI = Res.Ctx.builtins();
+  for (BasicBlock &BB : F) {
+    for (Instruction &I : BB) {
+      // Find atomic instructions, these are always varying
+      if (I.isAtomic()) {
+        Res.markVaryingValues(&I);
+        continue;
+      }
+
+      // The same goes for the atomic builtins as well
+      if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+        if (Function *Callee = CI->getCalledFunction()) {
+          auto const Builtin = BI.analyzeBuiltin(*Callee);
+          if (Builtin.properties & compiler::utils::eBuiltinPropertyAtomic) {
+            Res.markVaryingValues(&I);
+            continue;
+          }
+        }
+      }
+    }
+  }
+
+  // If an alloca has been initialized with a uniform value, findVectorLeaves()
+  // will not pick up the store instruction as a leaf, even when that alloca is
+  // used by some other leaves. We have to go through all the allocas and mark
+  // them as varying if any varying instructions use them. This is the case
+  // also for masked stores where only the mask is varying.
+  bool Changed = true;
+  while (Changed) {
+    DenseSet<Instruction *> Visited;
+    Changed = false;
+    bool Remaining = false;
+    for (Instruction &I : F.front()) {
+      if (isa<AllocaInst>(&I)) {
+        if (!Res.isVarying(&I)) {
+          if (findStrayLeaves(Res, I, Visited)) {
+            // We found a varying leaf, so this Alloca is non-uniform.
+            Res.markVaryingValues(&I);
+
+            // Marking an alloca as varying could mark a leaf as varying that
+            // may also depend on a different alloca, so we have to go again.
+            Changed = true;
+          } else {
+            Remaining = true;
+          }
+        }
+      } else {
+        break;
+      }
+    }
+    Changed &= Remaining;
+  }
+
+  return Res;
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorizable_function_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorizable_function_analysis.cpp
new file mode 100644
index 0000000000000..6043238dd4886
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorizable_function_analysis.cpp
@@ -0,0 +1,131 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "analysis/vectorizable_function_analysis.h"
+
+#include <compiler/utils/builtin_info.h>
+#include <llvm/IR/Instructions.h>
+#include <llvm/Support/CommandLine.h>
+#include <llvm/Support/Debug.h>
+
+#include "analysis/vectorization_unit_analysis.h"
+#include "debugging.h"
+#include "vectorization_context.h"
+
+#define DEBUG_TYPE "vecz-function-analysis"
+
+using namespace vecz;
+using namespace llvm;
+
+llvm::AnalysisKey VectorizableFunctionAnalysis::Key;
+
+/// @brief Tell Vecz to go ahead and handle calls to declaration-only functions
+///
+/// This flag is for testing and debugging purposes and it should not be used
+/// for normal code as instantiating undefined functions is not always valid.
+cl::opt<bool> HandleDeclOnlyCalls(
+    "vecz-handle-declaration-only-calls",
+    cl::desc("Go ahead and handle calls to declaration-only functions"));
+
+namespace {
+
+/// @brief Determine whether the instruction can be vectorized or not.
+///
+/// @param[in] I Instruction to check for vectorizability.
+/// @param[in] Ctx VectorizationContext for BuiltinInfo.
+///
+/// @return true if I can be vectorized.
+bool canVectorize(const Instruction &I, const VectorizationContext &Ctx) {
+  // Certain instructions just cannot appear.
+  switch (I.getOpcode()) {
+    default:
+      break;
+    case Instruction::IndirectBr:
+    case Instruction::VAArg:
+    case Instruction::Invoke:
+    case Instruction::Resume:
+    case Instruction::LandingPad:
+      return false;
+  }
+
+  // User function calls.
+  if (const CallInst *CI = dyn_cast<CallInst>(&I)) {
+    if (const Function *Callee = CI->getCalledFunction()) {
+      // We are going to assume that we can handle LLVM intrinsics for now and
+      // let the later passes deal with them
+      if (Callee->isIntrinsic()) {
+        return true;
+      }
+
+      // All builtins should be vectorizable, in principle. "Invalid builtins"
+      // correspond to user functions.
+      const compiler::utils::BuiltinInfo &BI = Ctx.builtins();
+      auto const Builtin = BI.analyzeBuiltin(*Callee);
+      if (!Builtin.isValid()) {
+        // If it is a user function missing a definition, we cannot safely
+        // instantiate it. For example, what if it contains calls to
+        // get_global_id internally?
+        if (Callee->isDeclaration()) {
+          return HandleDeclOnlyCalls;
+        }
+        // The same goes for functions we cannot inline, at least until we have
+        // a way of determining if a function can be safely instantiated or not.
+        if (Callee->hasFnAttribute(Attribute::NoInline)) {
+          return false;
+        }
+      }
+    }
+  }
+
+  return true;
+}
+
+/// @brief Determine whether the function can be vectorized or not.
+///
+/// @param[in] F Function to check for vectorizability.
+/// @param[in] Ctx VectorizationContext for BuiltinInfo.
+///
+/// @return the Instruction that prevents the function from vectorizing, or
+/// nullptr if the function can be vectorized.
+Value const *canVectorize(const Function &F, const VectorizationContext &Ctx) {
+  // Look for things that are not (yet?) supported.
+  for (const BasicBlock &BB : F) {
+    for (const Instruction &I : BB) {
+      if (!canVectorize(I, Ctx)) {
+        return &I;
+      }
+    }
+  }
+  return nullptr;
+}
+
+}  // namespace
+
+VectorizableFunctionAnalysis::Result VectorizableFunctionAnalysis::run(
+    llvm::Function &F, llvm::FunctionAnalysisManager &AM) {
+  Result res;
+  auto &Ctx = AM.getResult<VectorizationContextAnalysis>(F).getContext();
+
+  // Do not vectorize functions with the OptNone attribute
+  if (F.hasFnAttribute(Attribute::OptimizeNone)) {
+    res.canVectorize = false;
+    return res;
+  }
+
+  res.failedAt = canVectorize(F, Ctx);
+  res.canVectorize = !res.failedAt;
+  return res;
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorization_unit_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorization_unit_analysis.cpp
new file mode 100644
index 0000000000000..715d72e4daec0
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorization_unit_analysis.cpp
@@ -0,0 +1,38 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "analysis/vectorization_unit_analysis.h"
+
+#define DEBUG_TYPE "vecz-unit-analysis"
+
+using namespace vecz;
+
+llvm::AnalysisKey VectorizationUnitAnalysis::Key;
+
+VectorizationUnitAnalysis::Result VectorizationUnitAnalysis::run(
+    llvm::Function &F, llvm::FunctionAnalysisManager &) {
+  return Result{Ctx.getActiveVU(&F)};  // Wrap whatever Ctx reports for F.
+}
+
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "vecz-context-analysis"
+
+llvm::AnalysisKey VectorizationContextAnalysis::Key;
+
+VectorizationContextAnalysis::Result VectorizationContextAnalysis::run(
+    llvm::Function &, llvm::FunctionAnalysisManager &) {
+  return Result{Context};  // Same for every function; both args are unused.
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_boscc.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_boscc.cpp
new file mode 100644
index 0000000000000..0eb13795b4b51
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_boscc.cpp
@@ -0,0 +1,1407 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "control_flow_boscc.h"
+
+#include <llvm/ADT/PostOrderIterator.h>
+#include <llvm/ADT/SetOperations.h>
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/Analysis/LoopInfo.h>
+#include <llvm/Support/Debug.h>
+#include <llvm/Support/raw_ostream.h>
+#include <llvm/Transforms/Utils/Cloning.h>
+#include <multi_llvm/multi_llvm.h>
+
+#include <numeric>
+#include <queue>
+#include <utility>
+
+#include "analysis/divergence_analysis.h"
+#include "analysis/liveness_analysis.h"
+#include "analysis/uniform_value_analysis.h"
+#include "debugging.h"
+#include "ir_cleanup.h"
+#include "llvm_helpers.h"
+#include "reachability.h"
+#include "vectorization_context.h"
+#include "vectorization_unit.h"
+#include "vecz/vecz_choices.h"
+
+#define DEBUG_TYPE "vecz-cf"
+
+using namespace llvm;
+using namespace vecz;
+
+namespace {
+using RPOT = ReversePostOrderTraversal<Function *>;
+
+bool isUsedOutsideDefinitionBlock(Value *V) {
+  // Only instructions live in a block; any other value yields false.
+  auto *const Def = dyn_cast<Instruction>(V);
+  auto const isForeign = [Def](User *Usr) {
+    return cast<Instruction>(Usr)->getParent() != Def->getParent();
+  };
+  return Def && std::any_of(Def->user_begin(), Def->user_end(), isForeign);
+}
+
+/// @brief Apply a size-and-effects heuristic to decide if a block is "trivial"
+/// @param[in] BB the Basic Block to inspect
+/// @return true if BB is trivial under the heuristic, false otherwise
+bool isTrivialBlock(const BasicBlock &BB) {
+  // An instruction that is a PHI, may access memory, or may have side effects
+  // makes its block non-trivial.
+  auto const disqualifies = [](const Instruction &Inst) {
+    return isa<PHINode>(&Inst) || Inst.mayReadOrWriteMemory() ||
+           Inst.mayHaveSideEffects();
+  };
+  // Blocks with more than three instructions are never trivial.
+  if (BB.size() > 3) {
+    return false;
+  }
+  return std::none_of(BB.begin(), BB.end(), disqualifies);
+}
+
+}  // namespace
+
+/// @brief Check whether a uniform region is viable and worth keeping.
+/// @param[in] region the region to check
+/// @param[in] noDuplicateBlocks blocks the region is not allowed to contain
+/// @return false iff the region should be discarded.
+
+bool ControlFlowConversionState::BOSCCGadget::duplicateUniformRegions() {
+  LLVM_DEBUG(dbgs() << "DUPLICATE UNIFORM REGIONS\n");
+
+  // Keep track of blocks that contain NoDuplicate calls.
+  DenseSet<BasicBlock *> noDuplicateBlocks;
+  SmallPtrSet<Loop *, 16> noDuplicateLoops;
+  for (BasicBlock &BB : F) {
+    for (Instruction &I : BB) {
+      if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+        if (CI->hasFnAttr(Attribute::NoDuplicate)) {
+          noDuplicateBlocks.insert(&BB);
+          auto *const loop = DR->getTag(&BB).loop;
+          if (loop) {
+            noDuplicateLoops.insert(loop->loop);
+          }
+          break;
+        }
+      }
+    }
+  }
+
+  // First, create the regions.
+  VECZ_FAIL_IF(!createUniformRegions(noDuplicateBlocks));
+
+  // Keep track of blocks that belong to loops. If a whole loop is duplicated,
+  // then a new loop object should be created for the uniform version.
+  SmallVector<Loop *, 16> duplicatedLoops;
+  SmallPtrSet<Loop *, 16> duplicatedLoopSet;
+
+  size_t size = 0;  // Total predicated blocks over all regions, as a size_t.
+  for (const UniformRegion &region : uniformRegions) {
+    size += region.predicatedBlocks.size();
+  }
+  std::vector<BasicBlock *> newBlocks;
+  newBlocks.reserve(size);
+
+  // Conserve the original edges of the CFG.
+  for (BasicBlock &BB : F) {
+    for (BasicBlock *succ : successors(&BB)) {
+      uniformEdges[&BB].push_back(succ);
+    }
+  }
+
+  // Then duplicate them.
+  for (auto &region : uniformRegions) {
+    BasicBlock *entry = region.entryBlock;
+
+    std::vector<BasicBlock *> sortedNewRegionBlocks;
+    sortedNewRegionBlocks.reserve(region.predicatedBlocks.size());
+
+    // Process the region's predicated blocks in DCBI order.
+    // Gather the block indices, then sort them.
+    std::vector<size_t> predicatedBlockIndices;
+    predicatedBlockIndices.reserve(region.predicatedBlocks.size());
+    for (auto *const B : region.predicatedBlocks) {
+      predicatedBlockIndices.push_back(DR->getTagIndex(B));
+    }
+    std::sort(predicatedBlockIndices.begin(), predicatedBlockIndices.end());
+
+    for (auto const index : predicatedBlockIndices) {
+      auto const &BTag = DR->getBlockTag(index);
+      auto *const B = BTag.BB;
+      auto *const LTag = BTag.loop;
+
+      // If the block is the BOSCC entry block, we don't want to duplicate it
+      // unless it is part of a loop.
+      if (B == entry && !LTag) {
+        continue;
+      }
+
+      BasicBlock *newB = nullptr;
+      // If we have already cloned 'B', then we can reuse the cloned version.
+      if (VMap.count(B)) {
+        continue;
+      }
+
+      newB = CloneBasicBlock(B, VMap, ".uniform", &F);
+      VMap.insert({B, newB});
+      region.uniformBlocks.insert(newB);
+      newBlocks.push_back(newB);
+      sortedNewRegionBlocks.push_back(newB);
+
+      // The new blocks will remain uniform
+      BasicBlockTag &newBTag = DR->getOrCreateTag(newB);
+      DR->setFlag(*newB, eBlockIsUniform);
+
+      if (LTag) {
+        auto *const loop = LTag->loop;
+        if (LTag->header == B) {
+          duplicatedLoopSet.insert(loop);
+          duplicatedLoops.push_back(loop);
+        }
+
+        if (!duplicatedLoopSet.count(loop)) {
+          newBTag.loop = LTag;
+          loop->addBasicBlockToLoop(newB, *LI);
+        }
+      }
+    }
+
+    // Splice the newly inserted blocks into the function right before the
+    // first div_causing block.
+    if (!sortedNewRegionBlocks.empty() &&
+        entry->getNextNode() != sortedNewRegionBlocks[0]) {
+#if LLVM_VERSION_MAJOR >= 16
+      F.splice(entry->getNextNode()->getIterator(), &F,
+               sortedNewRegionBlocks[0]->getIterator(), F.end());
+#else
+      F.getBasicBlockList().splice(
+          entry->getNextNode()->getIterator(), F.getBasicBlockList(),
+          sortedNewRegionBlocks[0]->getIterator(), F.end());
+#endif
+    }
+  }
+
+  // Since we added all loops by their headers in DCBI order, inner loops will
+  // always follow outer loops, so there is no need to sort them.
+  for (Loop *L : duplicatedLoops) {
+    if (!LMap.count(L) && !noDuplicateLoops.count(L)) {
+      VECZ_FAIL_IF(!duplicateUniformLoops(L));
+    }
+  }
+
+  // Fix the duplicated instructions arguments.
+  for (BasicBlock *B : newBlocks) {
+    bool const notHeader = !DR->getTag(B).isLoopHeader();
+
+    for (Instruction &I : *B) {
+      RemapInstruction(&I, VMap,
+                       RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+
+      // Update the phi nodes if a uniform block has any incoming blocks*
+      // that are not div causing. In that case, the predicated incoming blocks
+      // will never be rewired to the uniform block so we can remove the
+      // incoming block from the phi node (unless 'B' is a loop header, in which
+      // case its predicated preheader (if any) will be rewired to it while we
+      // connect the regions).
+      //
+      // *NOTE a non-div-causing incoming block may or may not be a predicated
+      // block. A By All block with a non-varying branch can still branch into
+      // a BOSCC region, which would seem to break the SESE criteria.
+      if (notHeader) {
+        if (PHINode *PHI = dyn_cast<PHINode>(&I)) {
+          for (unsigned i = 0; i < PHI->getNumIncomingValues(); ++i) {
+            BasicBlock *PHIB = PHI->getIncomingBlock(i);
+            if (!DR->isUniform(*PHIB) &&
+                !DR->hasFlag(*PHIB,
+                             BlockDivergenceFlag::eBlockHasDivergentBranch)) {
+              PHI->removeIncomingValue(i--);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  return true;
+}
+
+bool ControlFlowConversionState::BOSCCGadget::duplicateUniformLoops(Loop *L) {
+  LoopTag const &LTag = DR->getTag(L);
+  Loop *const uniformL = LI->AllocateLoop();
+
+  // Attach 'uniformL' to the loop nest: under the uniform copy of L's parent
+  // when LMap has one, otherwise under L's (predicated) parent itself. With
+  // no parent at all, 'uniformL' becomes a top level loop.
+  if (Loop *parentL = L->getParentLoop()) {
+    auto it = LMap.find(parentL);
+    if (it != LMap.end()) {
+      it->second->addChildLoop(uniformL);
+    } else {
+      parentL->addChildLoop(uniformL);
+    }
+  } else {
+    LI->addTopLevelLoop(uniformL);
+  }
+
+  LMap.insert({L, uniformL});
+
+  LLVM_DEBUG(dbgs() << "Loop " << L->getName() << " has been duplicated\n");
+
+  // Fill the loop tag of the uniform loop.
+  LoopTag *uniformLTag = &DR->getOrCreateTag(uniformL);
+
+  // The preheader of the loop may not have been duplicated; keep L's if not.
+  BasicBlock *preheader = LTag.preheader;
+  if (BasicBlock *uniformPreheader = getBlock(preheader)) {
+    preheader = uniformPreheader;
+  }
+  uniformLTag->preheader = preheader;
+  uniformLTag->header = getBlock(LTag.header);
+  uniformLTag->latch = getBlock(LTag.latch);
+
+  LLVM_DEBUG(dbgs() << "\tPreheader: " << uniformLTag->preheader->getName()
+                    << "\n");
+  LLVM_DEBUG(dbgs() << "\tHeader: " << uniformLTag->header->getName() << "\n");
+  LLVM_DEBUG(dbgs() << "\tLatch: " << uniformLTag->latch->getName() << "\n");
+
+  // Add the uniform copy of each block whose tag names L itself as its loop.
+  for (BasicBlock *blockL : L->blocks()) {
+    if (DR->getTag(blockL).loop->loop == L) {
+      BasicBlockTag &uniformBlockLTag = DR->getTag(getBlock(blockL));
+      uniformL->addBasicBlockToLoop(uniformBlockLTag.BB, *LI);
+      uniformBlockLTag.loop = uniformLTag;
+    }
+  }
+
+  return true;
+}
+
+bool ControlFlowConversionState::BOSCCGadget::createUniformRegions(
+    DenseSet<BasicBlock *> const &noDuplicateBlocks) {
+  auto discardRegion =
+      [&noDuplicateBlocks](UniformRegion const &region) -> bool {
+    // To determine if it is worth it to duplicate the uniform region, we must
+    // take several elements into account:
+    // - The length of the duplicated code
+    // - branch probability
+    // - TODO: CA-1221
+    // size_t cost =
+    //    std::accumulate(Region->predicatedBlocks.begin(),
+    //    Region->predicatedBlocks.end(), 0,
+    //                    [](int x, BasicBlock *B) { return x +
+    //                    B->size(); });
+    // PercentageOfAllTrue =
+    // runTimeValuesOfVectorPredicateAllTrue /
+    // runTimeValuesOfVectorPredicate;
+    //
+    // It may not be worth to duplicate the whole uniform region but still worth
+    // to duplicate some of the divergent branches in it.
+
+    if (region.predicatedBlocks.empty() /*|| cost > max*/) {
+      return true;
+    }
+
+    // If the region we want to duplicate contains NoDuplicate
+    // function calls, then we cannot duplicate it.
+    if (std::any_of(region.predicatedBlocks.begin(),
+                    region.predicatedBlocks.end(),
+                    [&noDuplicateBlocks](BasicBlock *B) {
+                      return noDuplicateBlocks.count(B);
+                    })) {
+      LLVM_DEBUG(dbgs() << "Region of " << region.entryBlock->getName()
+                        << " cannot be duplicated because of "
+                           "NoDuplicate instructions\n");
+      return true;
+    }
+
+    // It's not worth BOSCCing if all the blocks are trivial
+    if (std::all_of(region.predicatedBlocks.begin(),
+                    region.predicatedBlocks.end(),
+                    [](BasicBlock *B) { return isTrivialBlock(*B); })) {
+      return true;
+    }
+
+    return false;
+  };
+
+  // We wish to identify Single-Entry, Single-Exit regions of the CFG
+  // that contain divergence-causing branches. A SESE region is defined
+  // as a subgraph of the CFG with an entry point at A and an exit point
+  // at B such that:
+  //   1. A dominates B
+  //   2. B post-dominates A
+  //   3. Any loop containing A also contains B, and vice-versa.
+  //
+  // The properties of the Dominance-Compact Block Indexing also happen to
+  // imply SESE-compactness, so once we identify an entry point, we can
+  // construct a SESE region by finding the exit block that post-dominates
+  // everything in a subsequence of the DCBI starting from A.
+  //
+  // We had assumed initially that any divergence-causing block will be the
+  // start of a SESE region. However, certain edge cases have arisen during
+  // testing that demonstrate that this is not the case. In practice, this
+  // doesn't seem to matter, as long as we can fully identify the predicated
+  // subset of the SESE region, so we are really working with Multiple-Entry,
+  // Single-Exit regions here. This was the cause of the BOSCC Back Door bug
+  // that was encountered previously (CA-2711), where the entry block of a
+  // supposed SESE region did not actually dominate everything in the region,
+  // which in this case was caused by an additional non-divergent code path
+  // (the "back door" entry point), but it is equally possible for two
+  // divergence-causing branches to enter a predicated region (CA-3194).
+  //
+  // a)    A*      b)    A       c)    A       d)    A      .
+  //      / \           / \           / \           / \     .
+  //     B   D         B*  D         B*  D*        B*  D*   .
+  //    / \ / \       / \ / \       / \ / \       / \ / \   .
+  //   C   F   E     C   F   E     C   F   E     C   F   E  .
+  //    \  |  /       \  |  /       \  |  /       \ /   /   .
+  //     \ | /         \ | /         \ | /         G   /    .
+  //      \|/           \|/           \|/           \ /     .
+  //       X             X             X             X      .
+  //
+  // Figure 1. CFGs showing SESE regions. Divergence-causing blocks are marked
+  // with an asterisk. Blocks are labelled alphabetically in DCBI order.
+  //
+  // (1a) shows the case of a SESE region with a divergence-causing entry block.
+  //
+  // (1b) shows the "back door" case, where a block inside the predicated
+  // sub-region has a non-divergent predecessor outside of it.
+  //
+  // (1c) shows a SESE region with two divergence-causing entry points into the
+  // predicated sub-region. This will result in two overlapping regions.
+  //
+  // (1d) shows a case where the exit block of the SESE region is not the
+  // immediate post-dominator of B, the first-encountered divergence causing
+  // block. Therefore the two overlapping regions have different exit blocks.
+  //
+  // Another situation can arise (CA-3851) where the SESE region can contain
+  // two completely unconnected predicated subregions. Although the DCBI is
+  // SESE compact, a SESE region can still contain other, nested SESE regions.
+  // Since an entry point into the predicated subregion is not necessarily the
+  // SESE entry point, all predicated blocks may not be reachable from every
+  // entry point. Because of these cases, it is necessary to consider each
+  // divergence causing block that is not part of the predicated subregion of
+  // any other divergence causing block as the entry point of their own SESE
+  // regions, even though this does not strictly satisfy the SESE criteria.
+  //
+  // a)    A      b)       A      Figure 2.
+  //      / \             / \     .
+  //     B*  E*          /   D*   (2a) shows a case of two independent regions
+  //    / \ / \         /   / \   sharing an exit block.
+  //   C  D F  G       B*  E   F  .
+  //    \ | | /       / \   \ /   (2b) shows a case where a SESE subregion will
+  //     \| |/       C   \   G    appear in the middle of the DCBI of the
+  //      \ /         \   \ /     subregion beginning with B. G post-dominates
+  //       X           \   H      D, forming a complete nested SESE region.
+  //                    \ /       .
+  //                     X        .
+
+  struct SESEInfo {
+    BasicBlock *BB = nullptr;
+    bool divCausing = false;
+    bool predicated = false;
+  };
+
+  // Collect all the blocks in the worklist
+  auto const &DCBI = DR->getBlockOrdering();
+  size_t const numBlocks = DCBI.size();
+  SmallVector<SESEInfo, 16> SESE;
+  SESE.reserve(numBlocks);
+  for (auto const &BBTag : DCBI) {
+    SESE.emplace_back();
+    SESE.back().BB = BBTag.BB;
+  }
+
+  // Mark all the divergence-causing blocks
+  for (auto *const BB : DR->getDivCausingBlocks()) {
+    SESE[DR->getTagIndex(BB)].divCausing = true;
+  }
+
+  // Create the BOSCC regions
+  for (size_t i = 0; i != numBlocks;) {
+    auto &info = SESE[i];
+    if (!info.divCausing) {
+      ++i;
+      continue;
+    }
+
+    uniformRegions.emplace_back();
+    auto &region = uniformRegions.back();
+    size_t const entryPos = i;
+    size_t exitPos = 0u;
+    size_t firstPredicated = numBlocks;
+
+    region.entryBlock = info.BB;
+    region.divergentBranches.push_back(info.BB);
+
+    SmallVector<size_t, 16> stack;
+
+    // If we are in a divergent loop, then the whole loop needs a uniform
+    // version.
+    auto const *const entryLoopTag = DR->getTag(info.BB).loop;
+    if (entryLoopTag && entryLoopTag->isLoopDivergent()) {
+      auto *const loop = entryLoopTag->loop;
+      for (BasicBlock *loopB : loop->blocks()) {
+        size_t const pos = DR->getTagIndex(loopB);
+        firstPredicated = std::min(firstPredicated, pos);
+        SESE[pos].predicated = true;
+        region.predicatedBlocks.insert(loopB);
+
+        if (loop->isLoopExiting(loopB)) {
+          stack.push_back(pos);
+        }
+      }
+    }
+
+    // Traverse the CFG from the entry point, marking blocks for predication
+    stack.push_back(entryPos);
+    while (!stack.empty()) {
+      auto *const cur = SESE[stack.pop_back_val()].BB;
+      for (BasicBlock *succ : successors(cur)) {
+        size_t const succPos = DR->getTagIndex(succ);
+
+        auto *const succLoopTag = DR->getBlockTag(succPos).loop;
+        if ((!succLoopTag || !succLoopTag->isLoopDivergent()) &&
+            // The region 'entry' creates contains only blocks that are
+            // contained in its SESE region.
+            PDT->properlyDominates(succ, region.entryBlock)) {
+          VECZ_ERROR_IF(exitPos != 0u && succPos != exitPos,
+                        "SESE region multiple exit blocks identified");
+          exitPos = succPos;
+          continue;
+        }
+
+        auto &succInfo = SESE[succPos];
+        if (!succInfo.predicated) {
+          firstPredicated = std::min(firstPredicated, succPos);
+          stack.push_back(succPos);
+          region.predicatedBlocks.insert(succ);
+          succInfo.predicated = true;
+        }
+      }
+    }
+    VECZ_ERROR_IF(exitPos == 0u, "SESE region exit block not identified");
+    region.exitBlock = SESE[exitPos].BB;
+    i = exitPos;
+
+    // Collect any other divergent branches in the predicated region, and clear
+    // the predication flags so regions can overlap.
+    for (size_t j = firstPredicated; j != exitPos; ++j) {
+      auto &ji = SESE[j];
+      if (ji.divCausing && j > entryPos) {
+        if (ji.predicated) {
+          region.divergentBranches.push_back(ji.BB);
+          ji.divCausing = false;
+        } else if (j < i) {
+          // Found another unpredicated divergent branch between the entry
+          // point and the exit point. Reset the iterator so we can process it.
+          i = j;
+        }
+      }
+      ji.predicated = false;
+    }
+
+    if (discardRegion(region)) {
+      // It's not worth keeping this region.
+      uniformRegions.pop_back();
+    }
+  }
+
+  return true;
+}
+
+bool ControlFlowConversionState::BOSCCGadget::connectBOSCCRegions() {
+  LLVM_DEBUG(dbgs() << "CONNECT BOSCC REGIONS\n");
+
+  // If we have not duplicated a loop but we have duplicated the preheader,
+  // then the loop now has 2 preheaders. We thus need to blend them into one
+  // single preheader.
+  for (auto *const LTag : DR->getLoopOrdering()) {
+    if (!LTag->isLoopDivergent() && !LMap.count(LTag->loop)) {
+      BasicBlock *predicatedPreheader = LTag->preheader;
+      if (BasicBlock *uniformPreheader = getBlock(predicatedPreheader)) {
+        BasicBlock *header = LTag->header;
+
+        LLVM_DEBUG(dbgs() << "Loop " << header->getName()
+                          << " has two preheaders\n");
+
+        // Create a new loop preheader that blends both the uniform and
+        // predicated preheaders, to keep well formed loops (with only one
+        // incoming preheader).
+        BasicBlock *newPreheader = BasicBlock::Create(
+            F.getContext(), predicatedPreheader->getName() + ".blend", &F,
+            header);
+        BranchInst::Create(header, newPreheader);
+
+        // Set the successor of both preheaders to be the new preheader.
+        auto *predicatedPreheaderT = predicatedPreheader->getTerminator();
+        auto *uniformPreheaderT = uniformPreheader->getTerminator();
+        VECZ_ERROR_IF(predicatedPreheaderT->getNumSuccessors() != 1,
+                      "Preheader should have only one successor");
+        VECZ_ERROR_IF(uniformPreheaderT->getNumSuccessors() != 1,
+                      "Preheader should have only one successor");
+        predicatedPreheaderT->setSuccessor(0, newPreheader);
+        uniformPreheaderT->setSuccessor(0, newPreheader);
+
+        // Update the tags.
+        BasicBlockTag &newPreheaderTag = DR->getOrCreateTag(newPreheader);
+        newPreheaderTag.loop = DR->getTag(predicatedPreheader).loop;
+        LTag->preheader = newPreheader;
+
+        DR->setFlag(*newPreheader, DR->getFlag(*predicatedPreheader));
+
+        addInRegions(newPreheader, predicatedPreheader);
+      }
+    }
+  }
+
+  // We must make the outermost non duplicated loop's preheader target the
+  // outermost duplicated uniform and predicated loop's headers. The first
+  // iteration of the loop will necessarily have all lanes activated until it
+  // reaches the first divergent block. Also, once the loop starts diverging,
+  // there is no way to go back to a dynamically uniform loop, so there is no
+  // point allowing the loop to go back and forth between its uniform and
+  // predicated versions. Only going from the uniform to the predicated
+  // version makes sense.
+  for (const auto &pair : LMap) {
+    Loop *uniformL = pair.second;
+    const Loop *L = pair.first;
+
+    if (Loop *parentL = L->getParentLoop()) {
+      if (LMap.count(parentL)) {
+        continue;
+      }
+    }
+
+    auto const &LTag = DR->getTag(L);
+    BasicBlock *preheader = LTag.preheader;
+    if (!VMap.count(preheader)) {
+      auto *T = preheader->getTerminator();
+      VECZ_ERROR_IF(T->getNumSuccessors() != 1,
+                    "Preheader has more than one successor");
+
+      LLVM_DEBUG(dbgs() << "Non duplicated preheader " << preheader->getName()
+                        << " must target uniform loop " << uniformL->getName()
+                        << "\n");
+
+      // Add a path from 'preheader' to the uniform loop header and make it
+      // always branch to it. We want to keep the edge from 'preheader' to the
+      // predicated loop header (even though we will never branch to it) to ease
+      // some needed blendings later on.
+      IRCleanup::deleteInstructionNow(T);
+      BranchInst::Create(DR->getTag(uniformL).header, LTag.header,
+                         ConstantInt::getTrue(F.getContext()), preheader);
+    }
+  }
+
+  DenseSet<BasicBlock *> connectedBlocks;
+  for (auto &region : uniformRegions) {
+    // Each uniform version of div causing blocks need an entry point to the
+    // predicated CFG.
+    for (BasicBlock *B : region.divergentBranches) {
+      if (connectedBlocks.insert(B).second) {
+        if (BasicBlock *uniformB = getBlock(B)) {
+          VECZ_FAIL_IF(!connectUniformRegion(region, B, uniformB));
+        } else {
+          VECZ_FAIL_IF(!connectUniformRegion(region, B, B));
+        }
+      } else {
+        // No other region should have connected the entry block.
+        BasicBlock *entry = region.entryBlock;
+        VECZ_FAIL_IF(B == entry);
+      }
+    }
+  }
+
+  // If a uniform block targets a predicated block, the latter needs its
+  // operands that have a uniform and predicated version blended.
+  for (auto const &predicatedBTag : DR->getBlockOrdering()) {
+    if (BasicBlock *uniformB = getBlock(predicatedBTag.BB)) {
+      for (BasicBlock *succ : successors(uniformB)) {
+        // We've found a uniform block that targets a predicated block prior
+        // to connecting the regions.
+        if (!DR->isUniform(*succ)) {
+          LLVM_DEBUG(dbgs() << "Uniform block " << uniformB->getName()
+                            << " targets predicated block " << succ->getName()
+                            << "\n");
+          VECZ_FAIL_IF(
+              !blendConnectionPoint(succ, {predicatedBTag.BB, uniformB}));
+        }
+      }
+    }
+  }
+
+  // Add all the uniform blocks into the worklist now they got connected.
+  DT->recalculate(F);
+  PDT->recalculate(F);
+  VECZ_ERROR_IF(!DT->verify(), "DominatorTree incorrectly updated");
+  VECZ_ERROR_IF(!PDT->verify(), "PostDominatorTree incorrectly updated");
+  VECZ_FAIL_IF(!computeBlockOrdering());
+
+  // NOTE doing the Liveness Analysis here is potentially dangerous, since we
+  // have yet to fully restore SSA form (CA-3703).
+  liveness = &AM.getResult<LivenessAnalysis>(F);
+  RC->recalculate(F);
+  VECZ_FAIL_IF(!blendFinalize());
+
+  // Sort URVBlender in a post order so that the replaced new values don't
+  // overlap with old ones.
+  if (!URVB.empty()) {
+    std::sort(URVB.begin(), URVB.end(),
+              [this](const URVBlender::value_type &LHS,
+                     const URVBlender::value_type &RHS) {
+                return DR->getTagIndex(LHS.first) > DR->getTagIndex(RHS.first);
+              });
+
+    // Now that the CFG has been fully rewired and every node is correctly
+    // connected, we can replace the blended values uses with their new
+    // value.
+    DenseSet<Instruction *> toDelete;
+    for (URVBlender::value_type &blender : URVB) {
+      BasicBlock *block = blender.first;
+      Value *from = blender.second.first;
+      Instruction *to = blender.second.second;
+      if (!isUsedOutsideDefinitionBlock(from)) {
+        toDelete.insert(to);
+      } else {
+        VECZ_ERROR_IF(!isa<Instruction>(from),
+                      "Trying to replace uses of a value");
+        VECZ_FAIL_IF(
+            !replaceReachableUses(*RC, cast<Instruction>(from), to, block));
+      }
+    }
+
+    for (Instruction *I : toDelete) {
+      IRCleanup::deleteInstructionNow(I);
+    }
+  }
+
+  return true;
+}
+
+bool ControlFlowConversionState::BOSCCGadget::connectUniformRegion(
+    UniformRegion &region, BasicBlock *predicatedB, BasicBlock *uniformB) {
+  auto replaceIncomingBlock = [](BasicBlock *B, BasicBlock *from,
+                                 BasicBlock *to) {
+    for (Instruction &I : *B) {
+      if (PHINode *PHI = dyn_cast<PHINode>(&I)) {
+        int fromIdx = PHI->getBasicBlockIndex(from);
+        if (fromIdx != -1) {
+          PHI->setIncomingBlock(fromIdx, to);
+        }
+      } else {
+        break;
+      }
+    }
+  };
+
+  LLVM_DEBUG(dbgs() << "\tConnect uniform region of " << predicatedB->getName()
+                    << "\n");
+
+  ConstantInt *trueCI = ConstantInt::getTrue(F.getContext());
+
+  auto *T = uniformB->getTerminator();
+
+  BasicBlock *target = predicatedB->getTerminator()->getSuccessor(0);
+
+  // 1. For each pair {taken, fallthrough} of successors of uniformB,
+  //   a. 'taken' is taken if the exit mask towards that edge is full, i.e. if
+  //      it contains all-true values.
+  //   b. otherwise, we branch to a new block, 'boscc_indir'. If the exit mask
+  //      towards 'fallthrough' is full, branch to the latter.
+  //   c. Otherwise, it means the mask is not dynamically uniform, but varying,
+  //      so we need to branch into the varying counterpart of the uniform
+  //      region. The chosen block to branch to is the first successor of
+  //      predicatedB.
+  // 2. When a latch is divergent, we make the uniform latch target the
+  //    predicated header.
+  // 3. We need to feed the last computed uniform values when transitioning to
+  //    the varying version.
+  BasicBlock *runtimeCheckerBlock = uniformB;
+  DR->setFlag(*uniformB, eBlockNeedsAllOfMask);
+
+  // 1.
+  SmallVector<BasicBlock *, 2> succs = uniformEdges[predicatedB];
+  const size_t size = succs.size();
+  VECZ_ERROR_IF(size == 0, "BasicBlock has no successors");
+  for (size_t i = 0; i < size; ++i) {
+    // Not all successors of a BOSCC entry block may be duplicated.
+    if (BasicBlock *uniformSucc = getBlock(succs[i])) {
+      succs[i] = uniformSucc;
+    }
+    LLVM_DEBUG(dbgs() << "\tSuccessor " << i << ": " << succs[i]->getName()
+                      << "\n");
+  }
+
+  for (size_t i = 0; i + 1 < size; ++i) {
+    BasicBlock *succ = succs[i];
+
+    BasicBlock *BOSCCIndir = BasicBlock::Create(
+        uniformB->getContext(), uniformB->getName() + ".boscc_indir", &F,
+        succ->getNextNode());
+
+    region.uniformBlocks.insert(BOSCCIndir);
+
+    BasicBlockTag &BOSCCIndirTag = DR->getOrCreateTag(BOSCCIndir);
+    DR->setFlag(*BOSCCIndir, static_cast<BlockDivergenceFlag>(
+                                 eBlockNeedsAllOfMask | eBlockIsUniform));
+    BOSCCIndirTag.loop = DR->getTag(runtimeCheckerBlock).loop;
+    if (BOSCCIndirTag.loop) {
+      BOSCCIndirTag.loop->loop->addBasicBlockToLoop(BOSCCIndir, *LI);
+    }
+
+    ICmpInst *cond = new ICmpInst(
+        *runtimeCheckerBlock, CmpInst::ICMP_EQ,
+        PassState.getMaskInfo(uniformB).exitMasks.lookup(succ), trueCI);
+    BranchInst::Create(succ, BOSCCIndir, cond, runtimeCheckerBlock);
+
+    if (i > 0) {
+      // Update the incoming block of the phi nodes in 'succ' from 'uniformB'
+      // to 'runtimeCheckerBlock'.
+      replaceIncomingBlock(succ, uniformB, runtimeCheckerBlock);
+    }
+
+    runtimeCheckerBlock = BOSCCIndir;
+  }
+
+  BasicBlock *succ = succs[size - 1];
+  ICmpInst *cond = new ICmpInst(
+      *runtimeCheckerBlock, CmpInst::ICMP_EQ,
+      PassState.getMaskInfo(uniformB).exitMasks.lookup(succ), trueCI);
+
+  BasicBlock *connectionPoint = target;
+
+  auto const *const LTag = DR->getTag(predicatedB).loop;
+  const bool needsStore = LTag && LMap.count(LTag->loop);
+  if (needsStore) {
+    // 'store' is a block that will contain all the uniform versions of the
+    // live in instructions of the predicated target.
+    BasicBlock *store = BasicBlock::Create(
+        target->getContext(), uniformB->getName() + ".boscc_store", &F,
+        runtimeCheckerBlock->getNextNode());
+
+    region.uniformBlocks.insert(store);
+
+    BasicBlockTag &storeTag = DR->getOrCreateTag(store);
+    DR->setFlag(*store, eBlockIsUniform);
+
+    // 2.
+    auto *const uniformLTag = DR->getTag(uniformB).loop;
+    const bool isLoopLatch = uniformLTag && (uniformLTag->latch == uniformB);
+    if (isLoopLatch) {
+      BasicBlock *header = LTag->header;
+      PHINode *entryMask =
+          cast<PHINode>(PassState.getMaskInfo(header).entryMask);
+      Value *latchMask =
+          PassState.getMaskInfo(uniformB).exitMasks.lookup(uniformLTag->header);
+      VECZ_ERROR_IF(!latchMask, "Exit mask does not exist");
+      entryMask->addIncoming(latchMask, store);
+      connectionPoint = header;
+
+      if (succ == uniformLTag->header) {
+        uniformLTag->latch = runtimeCheckerBlock;
+      }
+    }
+
+    BranchInst::Create(connectionPoint, store);
+
+    // 'store' belongs in the first outer loop non duplicated.
+    Loop *parentLoop = LTag->loop->getParentLoop();
+    while (parentLoop && LMap.count(parentLoop)) {
+      parentLoop = parentLoop->getParentLoop();
+    }
+    if (parentLoop) {
+      storeTag.loop = &DR->getTag(parentLoop);
+      parentLoop->addBasicBlockToLoop(store, *LI);
+    }
+
+    target = store;
+  }
+
+  // 1.c. 'uniformB' has a new runtime check, we can remove its old one.
+  IRCleanup::deleteInstructionNow(T);
+  BranchInst::Create(succ, target, cond, runtimeCheckerBlock);
+
+  // Update the incoming block of the new successors of 'runtimeCheckerBlock'.
+  replaceIncomingBlock(succ, uniformB, runtimeCheckerBlock);
+
+  if (uniformB == predicatedB) {
+    replaceIncomingBlock(connectionPoint, predicatedB, runtimeCheckerBlock);
+  } else {
+    // 3.
+    VECZ_FAIL_IF(!blendConnectionPoint(
+        connectionPoint,
+        {predicatedB, needsStore ? target : runtimeCheckerBlock}));
+
+    if (needsStore) {
+      region.storeBlocks.emplace_back();
+      auto &sb = region.storeBlocks.back();
+      sb.connectionPoint = connectionPoint;
+      sb.target = target;
+      sb.runtimeCheckerBlock = runtimeCheckerBlock;
+    }
+  }
+
+  return true;
+}
+
+// Registers 'CP' as a connection point receiving the new edge described by
+// 'incoming', in every uniform region that contains 'CP' or has 'CP' as its
+// entry or exit block.
+//
+// The first time 'CP' is seen for a region, the list of blend points it
+// impacts is computed and cached in 'region.blendPoints'. The connection
+// itself is only recorded in 'region.connections' here; those records are
+// consumed (and cleared) by blendFinalize().
+//
+// 'incoming' is stored verbatim in the recorded ConnectionInfo.
+//
+// This function always returns true.
+bool ControlFlowConversionState::BOSCCGadget::blendConnectionPoint(
+    BasicBlock *CP, const std::pair<BasicBlock *, BasicBlock *> &incoming) {
+  auto const *const CPLTag = DR->getTag(CP).loop;
+  for (auto &region : uniformRegions) {
+    // Create blend instructions at each blend point following 'CP'.
+    if (region.contains(CP) || (CP == region.exitBlock) ||
+        (CP == region.entryBlock)) {
+      // Compute all the blend points that will need to have blend instructions
+      // because of 'CP'. These blocks are all the blocks that have more than
+      // one predecessor, that belong to the same region as 'CP', and that
+      // succeed it.
+      if (!region.blendPoints.count(CP)) {
+        // The first blend point impacted by 'CP' is 'CP' itself.
+        region.blendPoints.insert({CP, {CP}});
+
+        // Breadth-first walk over the successors of 'CP'.
+        DenseSet<BasicBlock *> visited{CP};
+        std::queue<BasicBlock *> queue;
+        queue.push(CP);
+        while (!queue.empty()) {
+          BasicBlock *cur = queue.front();
+          queue.pop();
+          // The region exit block is the delimiter of the region.
+          // Its successors are not explored, although the exit block itself
+          // may have been recorded as a blend point when it was reached.
+          if (cur == region.exitBlock) {
+            continue;
+          }
+          for (BasicBlock *succ : successors(cur)) {
+            if (visited.insert(succ).second) {
+              queue.push(succ);
+              // Only join points (more than one predecessor) can need a blend.
+              if (std::distance(pred_begin(succ), pred_end(succ)) > 1) {
+                // Nested loops are dominated.
+                // Keep 'succ' only if it has the same loop tag as 'CP', or if
+                // 'CP' is in a loop that does not contain 'succ'.
+                if (CPLTag == DR->getTag(succ).loop ||
+                    (CPLTag && !CPLTag->loop->contains(succ))) {
+                  region.blendPoints[CP].push_back(succ);
+                }
+              }
+            }
+          }
+        }
+      }
+
+      // Remember the connection so blendFinalize() can process it.
+      region.connections.push_back(UniformRegion::ConnectionInfo{CP, incoming});
+    }
+  }
+  return true;
+}
+
+// Materializes the blends requested through blendConnectionPoint().
+//
+// The work is done in four phases:
+//  1. For every recorded connection, add an incoming value for the new
+//     predecessor ('incoming.second') to the PHI nodes of the connection
+//     point, then clear the recorded connections.
+//  2. Gather every blend point of every region into a single set.
+//  3. Visit the blocks in the order given by DR->getBlockOrdering() and, for
+//     each blend point, create a '.boscc_blend' PHI node for every live-in
+//     instruction that has to be merged with its uniform counterpart.
+//  4. For every recorded store block, create '.boscc_lcssa' PHI nodes in the
+//     store block for the instruction values the connection point receives
+//     from it.
+//
+// Returns true on completion.
+// NOTE(review): the VECZ_ERROR_IF / VECZ_FAIL_IF macros presumably make this
+// function return early with a failure value - confirm against debugging.h.
+bool ControlFlowConversionState::BOSCCGadget::blendFinalize() {
+  // Phase 1: add the missing incoming values at the connection points.
+  for (auto &region : uniformRegions) {
+    for (const auto &connection : region.connections) {
+      BasicBlock *CP = connection.connectionPoint;
+      auto &incoming = connection.incoming;
+
+      // Create blend instructions at each blend point following 'CP'.
+      for (BasicBlock *blendPoint : region.blendPoints[CP]) {
+        LLVM_DEBUG(dbgs() << "BLEND CONNECTION POINT " << blendPoint->getName()
+                          << "\n");
+
+        for (Instruction &I : *blendPoint) {
+          if (PHINode *PHI = dyn_cast<PHINode>(&I)) {
+            // Only add 'incoming' for 'CP' because for the other blend points
+            // we don't actually add a new edge.
+            // PHI nodes that already have an entry for 'incoming.second' are
+            // skipped as well.
+            if (blendPoint != CP ||
+                PHI->getBasicBlockIndex(incoming.second) != -1) {
+              continue;
+            }
+
+            // 'idx' ends up equal to the number of incoming values when no
+            // suitable uniform value was found and added in the loop below.
+            unsigned idx = 0;
+            for (; idx < PHI->getNumIncomingValues(); ++idx) {
+              // If one incoming block of the phi node is the predicated version
+              // of the new, uniform, incoming block, use its uniform incoming
+              // value version if it exists.
+              if (PHI->getIncomingBlock(idx) == incoming.first) {
+                if (Value *V = getUniformV(PHI->getIncomingValue(idx))) {
+                  if (Instruction *VI = dyn_cast<Instruction>(V)) {
+                    // The uniform value is only usable if its defining block
+                    // can reach the new incoming block.
+                    if (RC->isReachable(VI->getParent(), incoming.second)) {
+                      PHI->addIncoming(VI, incoming.second);
+                      break;
+                    }
+                  }
+                }
+              }
+            }
+            // Fall back to a default value for the new edge.
+            if (idx == PHI->getNumIncomingValues()) {
+              PHI->addIncoming(getDefaultValue(PHI->getType()),
+                               incoming.second);
+            }
+            LLVM_DEBUG(
+                dbgs()
+                << "PHINode " << PHI->getName() << ": Add incoming value "
+                << PHI->getIncomingValueForBlock(incoming.second)->getName()
+                << " from " << incoming.second->getName() << " in "
+                << blendPoint->getName() << "\n");
+          } else {
+            // PHI nodes are grouped at the top of the block.
+            break;
+          }
+        }
+      }
+    }
+    region.connections.clear();
+  }
+
+  // Phase 2: gather the blend points of all the regions.
+  DenseSet<BasicBlock *> blendBlocks;
+  for (const auto &region : uniformRegions) {
+    for (auto &CP : region.blendPoints) {
+      for (BasicBlock *blendPoint : CP.second) {
+        blendBlocks.insert(blendPoint);
+      }
+    }
+  }
+
+  // Phase 3: create the blend PHI nodes, following the block ordering.
+  for (auto const &tag : DR->getBlockOrdering()) {
+    BasicBlock *blendPoint = tag.BB;
+    if (blendBlocks.count(blendPoint) == 0) {
+      continue;
+    }
+
+    // Values that are already an operand of an existing '.boscc_blend' PHI
+    // node of this block do not need to be blended again.
+    DenseSet<Value *> blendedValues;
+    for (Instruction &I : *blendPoint) {
+      if (PHINode *PHI = dyn_cast<PHINode>(&I)) {
+        if (PHI->getName().contains(".boscc_blend")) {
+          for (Value *v : PHI->incoming_values()) {
+            blendedValues.insert(v);
+          }
+        }
+      } else {
+        break;
+      }
+    }
+
+    for (auto *liveInVal : liveness->getBlockInfo(blendPoint).LiveIn) {
+      if (blendedValues.count(liveInVal)) {
+        continue;
+      }
+
+      // Only instructions are candidates for blending.
+      auto *liveIn = dyn_cast<Instruction>(liveInVal);
+      if (!liveIn) {
+        continue;
+      }
+
+      BasicBlock *src = liveIn->getParent();
+
+      // Nothing to be done if the definition block has no uniform
+      // equivalent.
+      BasicBlock *uniformSrc = getBlock(src);
+      if (!uniformSrc) {
+        continue;
+      }
+
+      // Nothing to be done if the instruction:
+      // - dominates the connection point,
+      // - cannot reach 'CP'.
+      if (DT->dominates(src, blendPoint)) {
+        continue;
+      }
+
+      if (!RC->isReachable(src, blendPoint)) {
+        continue;
+      }
+
+      // Use the uniform version of the live-in if one is mapped, otherwise a
+      // default value of the same type.
+      Value *uniformLiveIn = getDefaultValue(liveIn->getType());
+      if (Value *V = getUniformV(liveIn)) {
+        uniformLiveIn = V;
+      }
+
+      LLVM_DEBUG(dbgs() << "Blend live in " << liveIn->getName() << " in "
+                        << blendPoint->getName() << "\n");
+
+      PHINode *blend = PHINode::Create(liveIn->getType(), 2,
+                                       liveIn->getName() + ".boscc_blend",
+                                       &blendPoint->front());
+      // Track whether the uniform / predicated value was actually used as an
+      // incoming value of the blend.
+      bool replaceUniform = false;
+      bool replacePredicate = false;
+      // For each predecessor, if it can reach the instruction, set the
+      // latter as the incoming value, otherwise set a default value.
+      for (BasicBlock *pred : predecessors(blendPoint)) {
+        if (DR->isUniform(*pred)) {
+          // Uniform predecessor: use the uniform value when it is not an
+          // instruction, or when its defining block can reach 'pred'.
+          Instruction *uniformLiveInI = dyn_cast<Instruction>(uniformLiveIn);
+          if (uniformLiveInI &&
+              !RC->isReachable(uniformLiveInI->getParent(), pred)) {
+            blend->addIncoming(getDefaultValue(uniformLiveInI->getType()),
+                               pred);
+          } else {
+            replaceUniform = true;
+            blend->addIncoming(uniformLiveIn, pred);
+          }
+        } else if (DR->getTag(pred).isLoopBackEdge(blendPoint)) {
+          // Back edge: the blend feeds itself.
+          blend->addIncoming(blend, pred);
+        } else {
+          // Any other predecessor: use the original live-in when its
+          // defining block can reach 'pred'.
+          if (!RC->isReachable(liveIn->getParent(), pred)) {
+            blend->addIncoming(getDefaultValue(liveIn->getType()), pred);
+          } else {
+            replacePredicate = true;
+            blend->addIncoming(liveIn, pred);
+          }
+        }
+        LLVM_DEBUG(dbgs() << "\tAdd incoming value "
+                          << blend->getIncomingValueForBlock(pred)->getName()
+                          << " from " << pred->getName() << "\n");
+      }
+
+      // If we have blended 'liveIn' in 'CP', update the uses.
+      if (replacePredicate) {
+        URVB.push_back({blendPoint, {liveIn, blend}});
+        addReference(blend, liveIn);
+      }
+      // If we have blended 'uniformLiveIn' in 'CP', update the uses.
+      if (replaceUniform && isa<Instruction>(uniformLiveIn)) {
+        URVB.push_back({blendPoint, {uniformLiveIn, blend}});
+      }
+
+      // Update the blend instructions in the loop header, if any.
+      VECZ_FAIL_IF(
+          !updateLoopBlendValues(DR->getTag(blendPoint).loop, liveIn, blend));
+      blendedValues.insert(liveIn);
+    }
+  }
+
+  // Phase 4: create the lcssa PHI nodes in the store blocks.
+  for (const auto &region : uniformRegions) {
+    for (auto &sb : region.storeBlocks) {
+      BasicBlock *connectionPoint = sb.connectionPoint;
+      BasicBlock *target = sb.target;
+      BasicBlock *runtimeCheckerBlock = sb.runtimeCheckerBlock;
+
+      // Create a bunch of lcssa instructions into 'store' so that the repair
+      // SSA doesn't have to look for the instructions inside the uniform loop.
+      for (Instruction &I : *connectionPoint) {
+        if (PHINode *PHI = dyn_cast<PHINode>(&I)) {
+          int idx = PHI->getBasicBlockIndex(target);
+          VECZ_ERROR_IF(idx == -1,
+                        "Connection point PHIs must have incoming "
+                        "block from the target");
+          // Only instruction values are routed through an lcssa PHI node.
+          if (Instruction *incoming =
+                  dyn_cast<Instruction>(PHI->getIncomingValue(idx))) {
+            LLVM_DEBUG(dbgs()
+                       << "Create live-in lcssa of " << incoming->getName()
+                       << " in " << target->getName() << "\n");
+
+            // Single-entry PHI node in 'target' fed from
+            // 'runtimeCheckerBlock'; the connection point then reads the
+            // value through it.
+            PHINode *blend = PHINode::Create(
+                incoming->getType(), 1, incoming->getName() + ".boscc_lcssa",
+                &target->front());
+            blend->addIncoming(incoming, runtimeCheckerBlock);
+            PHI->setIncomingValue(idx, blend);
+          }
+        } else {
+          break;
+        }
+      }
+    }
+  }
+  return true;
+}
+
+// Looks 'B' up in VMap and returns the block it is mapped to, or nullptr
+// when 'B' has no entry.
+BasicBlock *ControlFlowConversionState::BOSCCGadget::getBlock(BasicBlock *B) {
+  auto mapped = VMap.find(B);
+  if (mapped == VMap.end()) {
+    return nullptr;
+  }
+  return cast<BasicBlock>(mapped->second);
+}
+
+// Looks 'L' up in LMap and returns the loop it is mapped to, or nullptr when
+// 'L' has no entry.
+Loop *ControlFlowConversionState::BOSCCGadget::getLoop(Loop *L) {
+  auto mapped = LMap.find(L);
+  if (mapped == LMap.end()) {
+    return nullptr;
+  }
+  return mapped->second;
+}
+
+// Appends to 'blocks' the entry block of every uniform region whose entry
+// block has no entry in VMap.
+void ControlFlowConversionState::BOSCCGadget::getUnduplicatedEntryBlocks(
+    SmallVectorImpl<BasicBlock *> &blocks) const {
+  for (auto const &region : uniformRegions) {
+    auto *const entry = region.entryBlock;
+    if (VMap.count(entry) != 0) {
+      // This entry block is mapped in VMap; skip it.
+      continue;
+    }
+    blocks.push_back(entry);
+  }
+}
+
+// Maps 'pred' to 'uni' in VMap, overwriting any previous mapping of 'pred'.
+// Does nothing if either value is null. When 'needsMapping' is set and 'uni'
+// is an instruction, that instruction is remapped through VMap.
+void ControlFlowConversionState::BOSCCGadget::createReference(
+    Value *pred, Value *uni, bool needsMapping) {
+  if (pred == nullptr || uni == nullptr) {
+    return;
+  }
+
+  auto existing = VMap.find(pred);
+  if (existing == VMap.end()) {
+    VMap.insert({pred, uni});
+  } else {
+    existing->second = uni;
+  }
+
+  if (!needsMapping) {
+    return;
+  }
+  if (Instruction *uniInst = dyn_cast<Instruction>(uni)) {
+    RemapInstruction(uniInst, VMap,
+                     RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+  }
+}
+
+// If 'old' is mapped in VMap, inserts a mapping from 'pred' to the same
+// value. Otherwise does nothing.
+void ControlFlowConversionState::BOSCCGadget::addReference(Value *pred,
+                                                           Value *old) {
+  auto mappedOld = VMap.find(old);
+  if (mappedOld == VMap.end()) {
+    return;
+  }
+  VMap.insert({pred, mappedOld->second});
+}
+
+// Adds 'newB' to the predicated blocks of every uniform region that contains
+// 'refB'.
+void ControlFlowConversionState::BOSCCGadget::addInRegions(BasicBlock *newB,
+                                                           BasicBlock *refB) {
+  for (auto &region : uniformRegions) {
+    if (!region.contains(refB)) {
+      continue;
+    }
+    if (!region.predicatedBlocks.insert(newB).second) {
+      // 'newB' was already part of this region.
+      continue;
+    }
+    LLVM_DEBUG(dbgs() << "BasicBlock " << newB->getName()
+                      << " added to BOSCC region: "
+                      << region.entryBlock->getName() << "\n");
+  }
+}
+
+// Looks 'predicatedV' up in VMap and returns the value it is mapped to, or
+// nullptr when it has no entry.
+Value *ControlFlowConversionState::BOSCCGadget::getUniformV(
+    Value *predicatedV) {
+  auto mapped = VMap.find(predicatedV);
+  if (mapped == VMap.end()) {
+    return nullptr;
+  }
+  return mapped->second;
+}
+
+// Moves the VMap entry of 'from' over to 'to': if 'from' is mapped, its entry
+// is erased and 'to' is inserted with the same mapped value. Does nothing
+// when 'from' is not mapped.
+void ControlFlowConversionState::BOSCCGadget::updateValue(Value *from,
+                                                          Value *to) {
+  auto mappedFrom = VMap.find(from);
+  if (mappedFrom == VMap.end()) {
+    return;
+  }
+
+  // Copy the mapped value out before the entry is erased.
+  Value *uniform = mappedFrom->second;
+  VMap.erase(from);
+  VMap.insert({to, uniform});
+}
+
+// For every block in the block ordering that has a mapped duplicate, records
+// references from its entry mask and each of its exit masks to the
+// corresponding masks of the duplicate. Always returns true.
+bool ControlFlowConversionState::BOSCCGadget::linkMasks() {
+  for (auto const &BTag : DR->getBlockOrdering()) {
+    auto *const predicatedB = BTag.BB;
+    auto *const uniformB = getBlock(predicatedB);
+    if (!uniformB) {
+      continue;
+    }
+
+    // Both sets of masks had better exist by this point.
+    auto &predicatedMasks = PassState.getMaskInfo(predicatedB);
+    auto &uniformMasks = PassState.getMaskInfo(uniformB);
+    createReference(predicatedMasks.entryMask, uniformMasks.entryMask);
+
+    for (auto *const succ : successors(predicatedB)) {
+      // A successor without a duplicate is shared by both versions.
+      auto *uniformTarget = getBlock(succ);
+      if (!uniformTarget) {
+        uniformTarget = succ;
+      }
+      createReference(predicatedMasks.exitMasks.lookup(succ),
+                      uniformMasks.exitMasks.lookup(uniformTarget));
+    }
+  }
+  return true;
+}
+
+// Propagates a blend of 'from' around the back edge of the loop described by
+// 'LTag' and of each of its parent loops.
+//
+// For each of these loops, every header PHI node that either is 'to' or
+// receives 'from' from the latch has its latch incoming value replaced by a
+// '.boscc_blend' PHI node located in the latch. An existing latch blend that
+// already has 'from' as an operand is reused; otherwise one is created the
+// first time it is needed.
+//
+// Does nothing when 'LTag' is null.
+// NOTE(review): VECZ_ERROR_IF presumably returns a failure value early -
+// confirm against debugging.h. Otherwise the function returns true.
+bool ControlFlowConversionState::BOSCCGadget::updateLoopBlendValues(
+    LoopTag *LTag, Instruction *from, Instruction *to) {
+  // Builds the blend of 'from' at the top of the current loop's latch.
+  // 'LTag' is captured by reference, so the lambda always targets the loop
+  // currently being processed by the while loop below.
+  auto createLatchIncoming = [&from, &LTag, this] {
+    auto *ret =
+        PHINode::Create(from->getType(), 2, from->getName() + ".boscc_blend",
+                        &LTag->latch->front());
+    Value *uniform = getUniformV(from);
+    Value *default_val = getDefaultValue(from->getType());
+    // One incoming value per predecessor of the latch: 'from' if its block
+    // can reach the predecessor, else the uniform version of 'from' if it is
+    // usable from there, else a default value.
+    for (BasicBlock *pred : predecessors(LTag->latch)) {
+      Value *incoming = default_val;
+      if (RC->isReachable(from->getParent(), pred)) {
+        incoming = from;
+      } else if (uniform) {
+        Instruction *uinst = dyn_cast<Instruction>(uniform);
+        if (!uinst || RC->isReachable(uinst->getParent(), pred)) {
+          incoming = uniform;
+        }
+      }
+      ret->addIncoming(incoming, pred);
+    }
+    // Record the replacement of 'from' by the new blend in the latch, and
+    // give the blend the same uniform mapping as 'from'.
+    URVB.push_back({LTag->latch, {from, ret}});
+    addReference(ret, from);
+    return ret;
+  };
+
+  // Walk from the given loop outwards through its parent loops.
+  while (LTag) {
+    PHINode *latchIncoming = nullptr;
+    // Try looking for an existing `boscc_blend` value for `from` to avoid
+    // creating a new one in the latch.
+    for (Instruction &latchI : *LTag->latch) {
+      if (PHINode *PHI = dyn_cast<PHINode>(&latchI)) {
+        if (PHI->getName().contains(".boscc_blend")) {
+          for (Value *incomingValue : PHI->incoming_values()) {
+            if (incomingValue == from) {
+              latchIncoming = PHI;
+              break;
+            }
+          }
+          if (latchIncoming) {
+            break;
+          }
+        }
+      } else {
+        break;
+      }
+    }
+    // Update all uses of `from` in the header with the blended value from the
+    // latch. Since the CFG is final now, this should cover everything.
+    for (Instruction &headerI : *LTag->header) {
+      if (PHINode *PHI = dyn_cast<PHINode>(&headerI)) {
+        int latchIdx = PHI->getBasicBlockIndex(LTag->latch);
+        VECZ_ERROR_IF(latchIdx == -1,
+                      "Header has no incoming value from the latch");
+        if ((PHI == to) || (PHI->getIncomingValue(latchIdx) == from)) {
+          // Create the latch blend lazily, only once a header PHI needs it.
+          if (!latchIncoming) {
+            latchIncoming = createLatchIncoming();
+          }
+          PHI->setIncomingValue(latchIdx, latchIncoming);
+        }
+      } else {
+        break;
+      }
+    }
+
+    // Move on to the parent loop, if any.
+    if (Loop *L = LTag->loop->getParentLoop()) {
+      LTag = &DR->getTag(L);
+    } else {
+      break;
+    }
+  }
+
+  return true;
+}
+
+// Recomputes the block ordering so that the uniform blocks of each region
+// are placed next to the region's entry block.
+//
+// The existing order is first copied, leaving a gap after each entry block of
+// a region with uniform blocks. The ordering is then recomputed through
+// PassState.computeBlockOrdering(), and the uniform blocks of each region
+// are written into their gap, sorted by their tag index in the recomputed
+// ordering. Finally, each block's tag position is set from the resulting
+// sequence and the tags are reordered.
+//
+// NOTE(review): VECZ_FAIL_IF presumably returns false when
+// PassState.computeBlockOrdering() fails - confirm against debugging.h.
+// Otherwise the function returns true.
+bool ControlFlowConversionState::BOSCCGadget::computeBlockOrdering() {
+  // Create a map from entry blocks to their uniform regions
+  // (only regions that actually have uniform blocks are recorded).
+  DenseMap<BasicBlock *, UniformRegion const *> entryMap;
+  unsigned maxUBlocks = 0;
+  for (const auto &region : uniformRegions) {
+    if (!region.uniformBlocks.empty()) {
+      entryMap[region.entryBlock] = &region;
+    }
+    maxUBlocks = std::max(maxUBlocks, region.uniformBlocks.size());
+  }
+
+  // Gather the blocks outside of the uniform regions according to the already
+  // computed order, leaving gaps for the uniform regions to fill in.
+  // Note that uniform region blocks do not appear in the block ordering yet.
+  // Also note that we can't use pointers to BasicBlockTags here since
+  // `PassState.computeBlockOrdering()` re-orders the tags vector.
+  SmallVector<BasicBlock *, 16> filtered;
+  for (auto const &tag : DR->getBlockOrdering()) {
+    filtered.push_back(tag.BB);
+    auto const found = entryMap.find(tag.BB);
+    if (found != entryMap.end()) {
+      // Reserve one slot per uniform block right after the entry block.
+      auto const *const region = found->second;
+      filtered.resize(filtered.size() + region->uniformBlocks.size());
+    }
+  }
+
+  // Recompute the ordering over the uniform regions
+  VECZ_FAIL_IF(!PassState.computeBlockOrdering());
+
+  // Filter by region and fill in the gaps
+  SmallVector<size_t, 16> uniformBlocks;
+  uniformBlocks.reserve(maxUBlocks);
+  for (auto it = filtered.begin(), ie = filtered.end(); it != ie;) {
+    auto *const BB = *it;
+
+    auto const found = entryMap.find(BB);
+    if (found != entryMap.end()) {
+      // If the entry block of the region is NOT duplicated, add the uniform
+      // blocks after it.
+      bool const entryDupe = getBlock(BB);
+      if (!entryDupe) {
+        ++it;
+      }
+
+      // Gather the indices of the uniform blocks and sort them.
+      auto const &region = *found->second;
+      uniformBlocks.clear();
+      for (auto *const uBB : region.uniformBlocks) {
+        uniformBlocks.push_back(DR->getTagIndex(uBB));
+      }
+      std::sort(uniformBlocks.begin(), uniformBlocks.end());
+
+      // Insert the uniform blocks into the gap.
+      for (auto const uBBi : uniformBlocks) {
+        (*it++) = DR->getBlockTag(uBBi).BB;
+      }
+
+      // If the entry block of the region IS duplicated, add it after the
+      // uniform blocks.
+      // (In that case the uniform blocks were written starting at the entry
+      // block's own slot, so the entry takes the last slot of the gap.)
+      if (entryDupe) {
+        (*it++) = BB;
+      }
+    } else {
+      ++it;
+    }
+  }
+
+  // Assign the final positions and reorder the tags accordingly.
+  uint32_t pos = 0;
+  for (auto *const BB : filtered) {
+    DR->getTag(BB).pos = pos++;
+  }
+  DR->reorderTags(filtered.size());
+
+  return true;
+}
+
+// Removes redundant PHI nodes whose name contains ".boscc_".
+//
+// BOSCC can create a lot of PHI nodes that are not really necessary.
+// LCSSA PHI nodes (in Store Blocks) are only required as an intermediate
+// state and are trivially redundant, and sometimes blends are created that
+// blend the same two values together. Also, sometimes values are blended
+// even though they have no further uses and can be removed as dead code.
+//
+// Always returns true.
+bool ControlFlowConversionState::BOSCCGadget::cleanUp() {
+  RPOT rpot(&F);
+  std::vector<PHINode *> remaining;
+  for (auto *BB : rpot) {
+    auto cursor = BB->begin();
+    while (cursor != BB->end()) {
+      // Step past the instruction before it is possibly deleted.
+      auto *PHI = dyn_cast<PHINode>(&*cursor);
+      ++cursor;
+      if (!PHI) {
+        // Reached the end of the PHI nodes of this block.
+        break;
+      }
+      if (!PHI->getName().contains(".boscc_")) {
+        continue;
+      }
+
+      auto *V = PHI->hasConstantValue();
+      if (!V) {
+        remaining.push_back(PHI);
+        continue;
+      }
+      // The PHI node is redundant: forward its single value.
+      PHI->replaceAllUsesWith(V);
+      IRCleanup::deleteInstructionNow(PHI);
+    }
+  }
+
+  // Visit the surviving blends from last collected to first, deleting the
+  // ones without any use.
+  for (auto it = remaining.rbegin(), ie = remaining.rend(); it != ie; ++it) {
+    PHINode *PHI = *it;
+    if (PHI->use_empty()) {
+      IRCleanup::deleteInstructionNow(PHI);
+    }
+  }
+
+  return true;
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_roscc.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_roscc.cpp
new file mode 100644
index 0000000000000..5302486ba60c6
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_roscc.cpp
@@ -0,0 +1,150 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "control_flow_roscc.h"
+
+#include <llvm/Analysis/LoopInfo.h>
+#include <llvm/Analysis/PostDominators.h>
+#include <llvm/Support/Debug.h>
+#include <llvm/Transforms/Utils/BasicBlockUtils.h>
+
+#include "analysis/uniform_value_analysis.h"
+#include "debugging.h"
+#include "ir_cleanup.h"
+
+#define DEBUG_TYPE "vecz-cf"
+
+// WHAT THIS DOES
+//
+// A common pattern in OpenCL kernels is a line near the start of the program
+// like the following:
+//
+//    if (some_condition) return;
+//
+// Where "some_condition" is non-uniform, the BOSCC control flow optimization
+// can do very well with this. However, without BOSCC, the entire program will
+// have been linearized and the early return will disappear entirely. It is
+// desirable to maintain this sort of early exit branch in order to avoid
+// doing unnecessary work. We can do this by inserting a uniform branch to the
+// return block without the need to duplicate the rest of the kernel into
+// uniform and non-uniform versions, as BOSCC does. This can improve the
+// performance significantly without requiring complex CFG changes.
+
+using namespace llvm;
+using namespace vecz;
+
+namespace {
+/// @brief checks if the given block contains only a return instruction,
+/// looking through blocks that contain nothing but an unconditional branch.
+///
+/// The chain of branch-only blocks is walked iteratively with a cycle check,
+/// so branch-only blocks that loop back on themselves (e.g. an empty infinite
+/// loop) are reported as not being a return block instead of recursing
+/// without bound.
+///
+/// @param[in] BB The block to inspect.
+/// @return true if following the chain from BB ends at a block containing
+/// only a return instruction, false otherwise.
+bool isReturnBlock(const llvm::BasicBlock &BB) {
+  const llvm::BasicBlock *cur = &BB;
+  // 'trail' follows the same chain as 'cur' at half the speed. If 'cur' ever
+  // catches up with it, the chain is cyclic and never reaches a return.
+  const llvm::BasicBlock *trail = &BB;
+  for (bool advanceTrail = false;; advanceTrail = !advanceTrail) {
+    if (cur->size() != 1) {
+      return false;
+    }
+
+    auto *T = cur->getTerminator();
+    auto *const branch = dyn_cast<BranchInst>(T);
+    if (!branch || !branch->isUnconditional()) {
+      return isa<ReturnInst>(T);
+    }
+
+    // We can see straight through a block that only contains a single
+    // unconditional branch.
+    cur = branch->getSuccessor(0);
+    if (advanceTrail) {
+      // Every block behind 'cur' has already been checked to end in an
+      // unconditional branch, so the cast cannot fail.
+      trail = cast<BranchInst>(trail->getTerminator())->getSuccessor(0);
+    }
+    if (cur == trail) {
+      return false;
+    }
+  }
+}
+}  // namespace
+
+// Inserts an extra conditional branch to the return block for each early
+// return guarded by a varying condition.
+//
+// Candidate branches are two-way conditional branches that are not inside a
+// loop, whose condition is varying, and that have exactly one successor for
+// which isReturnBlock() holds. For each candidate, the block is split just
+// before the branch and the first half is terminated by a new conditional
+// branch that targets either the second half or the return block. The
+// dominator and post-dominator trees are updated along the way.
+//
+// Returns true if at least one branch was transformed.
+bool ControlFlowConversionState::ROSCCGadget::run(Function &F) {
+  bool changed = false;
+
+  // Collect the candidates first; the CFG is only modified afterwards.
+  SmallVector<BranchInst *, 4> RetBranches;
+  for (auto &BB : F) {
+    if (LI->getLoopFor(&BB)) {
+      // No need to do this transform on loop exits
+      continue;
+    }
+
+    auto *T = BB.getTerminator();
+    if (auto *Branch = dyn_cast<BranchInst>(T)) {
+      if (Branch->isConditional() && Branch->getNumSuccessors() == 2) {
+        Value *cond = Branch->getCondition();
+        if (UVR->isVarying(cond)) {
+          size_t countReturns = 0;
+          for (auto *succ : Branch->successors()) {
+            if (isReturnBlock(*succ)) {
+              ++countReturns;
+            }
+          }
+
+          // Only consider ROSCC when there is exactly one returning successor.
+          if (countReturns == 1) {
+            RetBranches.push_back(Branch);
+          }
+        }
+      }
+    }
+  }
+
+  ConstantInt *trueCI = ConstantInt::getTrue(F.getContext());
+  ConstantInt *falseCI = ConstantInt::getFalse(F.getContext());
+
+  for (auto *Branch : RetBranches) {
+    BasicBlock *BB = Branch->getParent();
+
+    // Split before the branch: 'newBB' receives the original branch.
+    BasicBlock *newBB = SplitBlock(BB, Branch, DT, LI);
+    newBB->setName(Twine(BB->getName(), ".ROSCC"));
+
+    // update the PostDominatorTree manually..
+    // 'newBB' is added under the immediate post-dominator of 'BB'.
+    auto *Node = PDT->getNode(BB);
+    assert(Node && "Could not get node");
+    auto *IDom = Node->getIDom();
+    assert(IDom && "Could not get IDom");
+    auto *Block = IDom->getBlock();
+    assert(Block && "Could not get Block");
+    PDT->addNewBlock(newBB, Block);
+
+    // Remove the unconditional branch created by splitting..
+    IRCleanup::deleteInstructionNow(BB->getTerminator());
+
+    // Create a new Uniform branch condition to the Return block..
+    // Note that a conditional branch's successors are returned in reverse
+    // order, relative to how they appear in the IR, with the "true" target
+    // last. However, "getSuccessor(n)" also indexes backwards, from the end.
+    BasicBlock *SuccT = Branch->getSuccessor(0);
+    BasicBlock *SuccF = Branch->getSuccessor(1);
+    // 'Which' is true when the returning successor is successor 0.
+    bool Which = isReturnBlock(*SuccT);
+
+    BasicBlock *ReturnBlock = Which ? SuccT : SuccF;
+    Value *Cond = Branch->getCondition();
+    // The new condition compares 'Cond' with the value for which the original
+    // branch takes the non-returning successor. The new branch goes to
+    // 'newBB' when the comparison holds and to the return block otherwise.
+    ICmpInst *newCond =
+        new ICmpInst(*BB, CmpInst::ICMP_EQ, Cond, Which ? falseCI : trueCI);
+    newCond->setName(Twine(Cond->getName(), ".ROSCC"));
+    BranchInst::Create(newBB, ReturnBlock, newCond, BB);
+
+    // Update Dominator and PostDominator trees..
+    DT->insertEdge(BB, ReturnBlock);
+    PDT->insertEdge(BB, ReturnBlock);
+
+    changed = true;
+  }
+
+  // The trees are only verified when something changed.
+  assert((!changed || DT->verify()) &&
+         "ROSCC: Dominator Tree failed verification");
+
+  assert((!changed || PDT->verify()) &&
+         "ROSCC: Post-Dominator Tree failed verification");
+
+  return changed;
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/debugging.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/debugging.cpp
new file mode 100644
index 0000000000000..b8fd3aa2b7756
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/debugging.cpp
@@ -0,0 +1,80 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "debugging.h"
+
+#include <llvm/Analysis/OptimizationRemarkEmitter.h>
+
+using namespace llvm;
+
+namespace vecz {
+
+/// @brief Build the text of a remark: "Vecz: ", the message, an optional
+/// description of a value, and a trailing newline.
+///
+/// @param[in] V The value (can be `nullptr`) to be included in the remark
+/// @param[in] Msg The main remark message
+/// @return The remark message as it is to be printed
+static std::string createRemarkMessage(const Value *V, StringRef Msg) {
+  std::string text("Vecz: ");
+  raw_string_ostream out(text);
+  out << Msg;
+
+  if (const Function *F = V ? dyn_cast<Function>(V) : nullptr) {
+    // Printing a function leads to its whole body being printed, so only
+    // its name is emitted.
+    out << " function \"" << F->getName() << "\"";
+  } else if (V) {
+    // Instructions are already prefixed by two spaces when printed; any
+    // other value needs an explicit separator.
+    if (!isa<Instruction>(V)) {
+      out << " ";
+    }
+    V->print(out, true);
+  }
+
+  out << '\n';
+  return out.str();
+}
+
+// Emits a "missed optimization" remark for function 'F' under the "vecz"
+// pass and remark names.
+//
+// 'V' may be null; it is described in the message text. When 'V' is an
+// instruction, the remark is anchored on that instruction. Otherwise it is
+// anchored on the entry block of 'F' with an empty debug location.
+void emitVeczRemarkMissed(const Function *F, const Value *V, StringRef Msg) {
+  auto RemarkMsg = createRemarkMessage(V, Msg);
+  OptimizationRemarkEmitter ORE(F);
+  if (const Instruction *I = V ? dyn_cast<Instruction>(V) : nullptr) {
+    ORE.emit(OptimizationRemarkMissed("vecz", "vecz", I) << RemarkMsg);
+    return;
+  }
+
+  // There is no instruction to take a debug location from.
+  DebugLoc D;
+  ORE.emit(OptimizationRemarkMissed("vecz", "vecz", D, &(F->getEntryBlock()))
+           << RemarkMsg);
+}
+
+// Convenience overload: emits a "missed optimization" remark for 'F' that is
+// not associated with any particular value.
+void emitVeczRemarkMissed(const Function *F, StringRef Msg) {
+  emitVeczRemarkMissed(F, nullptr, Msg);
+}
+
+// Emits an optimization remark for function 'F' under the "vecz" pass and
+// remark names.
+//
+// 'V' may be null; it only contributes to the message text. The remark
+// itself is always attached to the function 'F'.
+void emitVeczRemark(const Function *F, const Value *V, StringRef Msg) {
+  auto RemarkMsg = createRemarkMessage(V, Msg);
+  OptimizationRemarkEmitter ORE(F);
+  ORE.emit(OptimizationRemark("vecz", "vecz", F) << RemarkMsg);
+}
+
+// Convenience overload: emits an optimization remark for 'F' that is not
+// associated with any particular value.
+void emitVeczRemark(const Function *F, StringRef Msg) {
+  emitVeczRemark(F, nullptr, Msg);
+}
+}  // namespace vecz
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/control_flow_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/control_flow_analysis.h
new file mode 100644
index 0000000000000..3bd7e78538dc5
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/control_flow_analysis.h
@@ -0,0 +1,98 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @brief Analysis of control flow.
+
+#ifndef VECZ_ANALYSIS_CONTROL_FLOW_ANALYSIS_H_INCLUDED
+#define VECZ_ANALYSIS_CONTROL_FLOW_ANALYSIS_H_INCLUDED
+
+#include <llvm/IR/PassManager.h>
+
+namespace llvm {
+class BasicBlock;
+}  // namespace llvm
+
+namespace vecz {
+
+/// @brief Holds the results and state for CFG analysis.
+struct CFGResult {
+  /// @brief true if analysis failed, e.g. CFG conversion cannot be done.
+  bool failed = false;
+  /// @brief true if CFG conversion is needed to vectorize the function.
+  bool convNeeded = false;
+  /// @brief Single basic block that exits the function, or null.
+  llvm::BasicBlock *exitBB = nullptr;
+
+  /// @brief Create empty results: not failed, no conversion, no exit block.
+  CFGResult() = default;
+
+  /// @brief Deleted copy constructor.
+  CFGResult(const CFGResult &) = delete;
+
+  /// @brief Move constructor.
+  ///
+  /// @param[in,out] Res Existing results to move.
+  CFGResult(CFGResult &&Res) = default;
+
+  /// @brief Access the failed flag.
+  /// @return true if analysis failed.
+  bool getFailed() const { return failed; }
+
+  /// @brief Set the failed flag.
+  /// @param[in] newVal New value for the flag.
+  void setFailed(bool newVal) { failed = newVal; }
+
+  /// @brief Determine whether CFG conversion is needed for the function or not.
+  bool isConversionNeeded() const { return convNeeded; }
+  /// @brief Set whether CFG conversion is needed for the function or not.
+  /// @param[in] newVal Whether conversion is needed or not.
+  void setConversionNeeded(bool newVal) { convNeeded = newVal; }
+
+  /// @brief Single block in the function that returns to the caller or null.
+  llvm::BasicBlock *getExitBlock() const { return exitBB; }
+};
+
+/// @brief Analysis that determines whether a function can have divergent
+/// control flow and so whether CFG conversion is needed or not.
+class CFGAnalysis : public llvm::AnalysisInfoMixin<CFGAnalysis> {
+ public:
+  /// @brief Create a new CFG analysis object.
+  CFGAnalysis() = default;
+
+  /// @brief Type of the analysis result.
+  using Result = CFGResult;
+
+  /// @brief Perform CFG analysis on the function to determine whether control
+  /// flow conversion is required and possible or not.
+  ///
+  /// @param[in,out] F Function to analyze.
+  /// @param[in,out] AM FunctionAnalysisManager providing analyses.
+  ///
+  /// @return CFG analysis result.
+  CFGResult run(llvm::Function &F, llvm::FunctionAnalysisManager &AM);
+
+  /// @brief Analysis name.
+  static llvm::StringRef name() { return "CFG analysis"; }
+
+ private:
+  friend llvm::AnalysisInfoMixin<CFGAnalysis>;
+  /// @brief Unique identifier for the analysis.
+  static llvm::AnalysisKey Key;
+};
+
+}  // namespace vecz
+
+#endif  // VECZ_ANALYSIS_CONTROL_FLOW_ANALYSIS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/divergence_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/divergence_analysis.h
new file mode 100644
index 0000000000000..f941eda04a3c0
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/divergence_analysis.h
@@ -0,0 +1,480 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Divergence analysis.
+
+#ifndef VECZ_ANALYSIS_DIVERGENCE_ANALYSIS_H_INCLUDED
+#define VECZ_ANALYSIS_DIVERGENCE_ANALYSIS_H_INCLUDED
+
+#include <llvm/ADT/DenseMap.h>
+#include <llvm/ADT/DenseSet.h>
+#include <llvm/ADT/SmallPtrSet.h>
+#include <llvm/ADT/StringRef.h>
+#include <llvm/Analysis/LoopInfo.h>
+#include <llvm/Analysis/PostDominators.h>
+#include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/PassManager.h>
+
+#include <vector>
+
+namespace llvm {
+class BasicBlock;
+class Loop;
+}  // namespace llvm
+
+namespace vecz {
+struct BasicBlockTag;
+struct LoopTag;
+
+/// @brief Analysis flags that can be attached to LLVM basic blocks.
+enum BlockDivergenceFlag {
+  /// @brief Flag value where no flag is set.
+  eBlockHasNoFlag = 0,
+  /// @brief True if the block has a divergent branch (different paths might be
+  /// taken by different work items).
+  eBlockHasDivergentBranch = (1 << 0),
+  /// @brief True if the block has no divergent branch but has all its
+  /// successors divergent.
+  eBlockHasDivergentBranchFake = (1 << 1),
+  /// @brief True if the block belongs in a diverged path.
+  eBlockIsDivergent = (1 << 2),
+  /// @brief True if the block is an introduced divergent conditional loop exit.
+  /// The operation is performed during the transformation of a divergent loop.
+  eBlockIsVirtualDivergentLoopExit = (1 << 3),
+  /// @brief True if the block is a join point of a divergent branch.
+  eBlockIsBlend = (1 << 4),
+  /// @brief True if no divergence is present when reaching the block.
+  eBlockIsByAll = (1 << 5),
+  /// @brief True if the block is uniform (duplicated version of a predicated
+  /// block from BOSCC).
+  eBlockIsUniform = (1 << 6),
+  /// @brief True if the block needs an all-of mask.
+  eBlockNeedsAllOfMask = (1 << 7)
+};
+
+/// @brief Analysis flags that can be attached to LLVM loops.
+enum LoopDivergenceFlag {
+  /// @brief Flag value where no flag is set.
+  eLoopNoFlag = 0,
+  /// @brief True if the loop may diverge because of a diverging block.
+  eLoopIsDivergent = (1 << 0)
+};
+
+/// @brief Type that maps basic blocks to tags (stored as a size_t tag index).
+using DenseBBMap = llvm::DenseMap<const llvm::BasicBlock *, size_t>;
+/// @brief Type that maps loops to tags.
+using DenseLoopMap =
+    llvm::DenseMap<const llvm::Loop *, std::unique_ptr<LoopTag>>;
+/// @brief Type that maps loop live values to their associated state from the
+///        previous loop iteration.
+using DenseLoopResultPHIsMap =
+    llvm::SmallDenseMap<llvm::Value *, llvm::PHINode *, 32>;
+/// @brief Type that maps loop live values to their updated value.
+using DenseLoopResultUpdatesMap =
+    llvm::SmallDenseMap<llvm::Value *, llvm::SelectInst *, 32>;
+
+class DivergenceResult;
+
+/// @brief Queue that orders blocks by their DCBI (smallest first).
+struct BlockQueue {
+  using index_type = uint32_t;
+  using index_list = std::vector<index_type>;
+  /// @brief Divergence result that the queued DCBI indices refer to.
+  DivergenceResult const &DR;
+
+  /// @brief The DCBI indices of the blocks in the queue, in min-heap order.
+  /// Since we can easily retrieve the BasicBlockTag from the DCBI ordered
+  /// `blockOrdering` vector, and since the queue priority is entirely based on
+  /// the index, it is sufficient to store only the indices to perform the
+  /// queue operations.
+  index_list indices;
+
+  /// @brief Constructs an empty BlockQueue.
+  BlockQueue(DivergenceResult const &dr) : DR(dr) {}
+
+  /// @brief Constructs a BlockQueue from a set of blocks.
+  BlockQueue(DivergenceResult const &dr,
+             llvm::DenseSet<llvm::BasicBlock *> const &blocks);
+
+  /// @brief Returns the number of blocks in the queue.
+  size_t size() const { return indices.size(); }
+
+  /// @brief Returns whether the queue is empty.
+  bool empty() const { return indices.empty(); }
+
+  /// @brief Pushes a block on the queue by its DCBI index.
+  void push(size_t index);
+
+  /// @brief Pushes a block on the queue by pointer.
+  /// Prefer `push(size_t)` if the tag index is available.
+  void push(llvm::BasicBlock const *bb);
+
+  /// @brief Pops a block from the queue and returns it.
+  const BasicBlockTag &pop();
+
+  /// @brief Const iterator to beginning of index list, for inspection.
+  index_list::const_iterator begin() const { return indices.begin(); }
+
+  /// @brief Const iterator to end of index list, for inspection.
+  index_list::const_iterator end() const { return indices.end(); }
+};
+
+/// @brief Describes a loop contained in the function to vectorize.
+struct LoopTag {
+  /// @brief Compiler loop info.
+  llvm::Loop *loop = nullptr;
+  /// @brief Loop preheader (loop entering point).
+  llvm::BasicBlock *preheader = nullptr;
+  /// @brief Loop header (loop entry point).
+  llvm::BasicBlock *header = nullptr;
+  /// @brief Single block that jumps back to the loop header.
+  llvm::BasicBlock *latch = nullptr;
+  /// @brief Loop live values of the loop.
+  llvm::SmallPtrSet<llvm::Value *, 32> loopLiveValues;
+  /// @brief Map between loop live values and their associated state from the
+  ///        previous loop iteration.
+  DenseLoopResultPHIsMap loopResultPrevs;
+  /// @brief Map between loop live values and their updated value.
+  DenseLoopResultUpdatesMap loopResultUpdates;
+  /// @brief Loop exit that has been chosen during partial linearization.
+  llvm::BasicBlock *pureExit = nullptr;
+  /// @brief Divergence flags set on the loop.
+  LoopDivergenceFlag divergenceFlag = LoopDivergenceFlag::eLoopNoFlag;
+  /// @brief Test whether the eLoopIsDivergent flag is set on the loop.
+  bool isLoopDivergent() const {
+    return divergenceFlag & LoopDivergenceFlag::eLoopIsDivergent;
+  }
+};
+
+/// @brief Describes a basic block contained in the function to vectorize.
+struct BasicBlockTag {
+  /// @brief Compiler basic block object.
+  llvm::BasicBlock *BB = nullptr;
+  /// @brief Innermost loop this block belongs to, if any.
+  LoopTag *loop = nullptr;
+  /// @brief Outermost loop left by this block.
+  LoopTag *outermostExitedLoop = nullptr;
+
+  /// @brief Unique sorted block index.
+  uint32_t pos = ~0u;
+
+  /// @brief Create a new basic block tag.
+  BasicBlockTag() = default;
+  /// @brief Deleted address-of operator.
+  BasicBlockTag *operator&() = delete;
+  /// @brief Deleted const address-of operator.
+  BasicBlockTag const *operator&() const = delete;
+  /// @brief Divergence flags set on the block.
+  BlockDivergenceFlag divergenceFlag = BlockDivergenceFlag::eBlockHasNoFlag;
+
+  /// @brief Convenience function for finding the varying property of the branch
+  /// without having to query the Uniform Value Analysis.
+  bool hasVaryingBranch() const {
+    return divergenceFlag & BlockDivergenceFlag::eBlockHasDivergentBranch;
+  }
+
+  /// @brief Determine whether there is a backedge from this tag's basic block
+  /// to the target basic block.
+  ///
+  /// @param[in] toBB Potential target for the backedge.
+  ///
+  /// @return true if there is a backedge, false otherwise.
+  bool isLoopBackEdge(llvm::BasicBlock *toBB) const {
+    return loop && (loop->latch == BB) && (loop->header == toBB);
+  }
+
+  /// @brief Determine whether this block is the header of its loop (if any).
+  /// @return true iff the block is the loop header for its loop.
+  bool isLoopHeader() const { return loop && loop->header == BB; }
+};
+
+/// @brief Divergent blocks whose PHI nodes may vary.
+using DivergenceInfo = llvm::DenseSet<llvm::BasicBlock *>;
+
+/// @brief Holds the result of Divergence Analysis for a given function.
+class DivergenceResult {
+ public:
+  /// @brief Create a new DA result for the function F.
+  /// @param[in] AM FunctionAnalysisManager providing analyses.
+  DivergenceResult(llvm::Function &F, llvm::FunctionAnalysisManager &AM);
+
+  /// @brief Generate a block ordering.
+  ///
+  /// This is based on a dominance-compact block indexing (DCBI) where we
+  /// topologically order blocks that belong to the same dominator tree.
+  ///
+  /// @returns true if no errors occurred.
+  bool computeBlockOrdering(llvm::DominatorTree &DT);
+
+  /// @brief Reorders the tags in the tags vector according to their DCBI
+  /// indices.
+  /// @param[in] n the number of tags in the DCBI
+  void reorderTags(size_t n);
+
+  /// @brief Generate a loop ordering.
+  ///
+  /// This populates the `loopOrdering` vector with loop tags sorted by depth.
+  ///
+  /// @returns true if no errors occurred.
+  bool computeLoopOrdering();
+
+  /// @brief Gets a BasicBlockTag by its DCBI index
+  /// @param[in] index the DCBI index
+  /// @returns reference to the BasicBlockTag
+  BasicBlockTag const &getBlockTag(size_t index) const {
+    return basicBlockTags[index];
+  }
+
+  /// @brief Gets the DCBI ordered range of BasicBlockTags.
+  llvm::ArrayRef<BasicBlockTag> getBlockOrdering() const {
+    return llvm::ArrayRef<BasicBlockTag>(basicBlockTags.data(),
+                                         numOrderedBlocks);
+  }
+  /// @brief Gets the loop tags, ordered by loop depth.
+  llvm::ArrayRef<LoopTag *> getLoopOrdering() { return loopOrdering; }
+  /// @brief Gets the index of the given basic block's tag in the tag storage.
+  size_t getTagIndex(const llvm::BasicBlock *BB) const;
+
+  /// @brief Retrieve a tag for the given basic block.
+  ///
+  /// @param[in] BB Basic block to retrieve a tag for.
+  ///
+  /// @return Basic block tag.
+  BasicBlockTag &getTag(const llvm::BasicBlock *BB) {
+    return basicBlockTags[getTagIndex(BB)];
+  }
+  /// @brief Retrieve a tag for the given basic block (const overload).
+  BasicBlockTag const &getTag(const llvm::BasicBlock *BB) const {
+    return basicBlockTags[getTagIndex(BB)];
+  }
+
+  /// @brief Retrieve or create a tag for the given basic block.
+  ///
+  /// @param[in] BB Basic block to retrieve or create a tag for.
+  ///
+  /// @return Basic block tag.
+  BasicBlockTag &getOrCreateTag(llvm::BasicBlock *BB);
+
+  /// @brief Try to retrieve a tag for the given loop.
+  ///
+  /// @param[in] L Loop to retrieve a tag for.
+  ///
+  /// @return Loop tag.
+  LoopTag &getTag(const llvm::Loop *L) const;
+
+  /// @brief Retrieve or create a tag for the given loop.
+  ///
+  /// @param[in] L Loop to retrieve or create a tag for.
+  ///
+  /// @return Loop tag.
+  LoopTag &getOrCreateTag(llvm::Loop *L);
+
+  /// @brief Determine whether the tag contains the given flags or not.
+  ///
+  /// @param[in] BB Basic block whose flag we check.
+  /// @param[in] F Flags to test.
+  ///
+  /// @return true if the tag contains all the given flags, false otherwise.
+  bool hasFlag(const llvm::BasicBlock &BB, BlockDivergenceFlag F) const;
+  /// @brief Get the flags of the tag.
+  ///
+  /// @param[in] BB Basic block whose flag we want to get.
+  BlockDivergenceFlag getFlag(const llvm::BasicBlock &BB) const;
+  /// @brief Set the given flags for the tag.
+  ///
+  /// @param[in] BB Basic block whose flag we set.
+  /// @param[in] F Flags to set for the tag.
+  void setFlag(const llvm::BasicBlock &BB, BlockDivergenceFlag F);
+  /// @brief Clear the given flags for the tag.
+  ///
+  /// @param[in] BB Basic block whose flag we clear.
+  /// @param[in] F Flags to clear for the tag.
+  void clearFlag(const llvm::BasicBlock &BB, BlockDivergenceFlag F);
+  /// @brief Check whether the basic block contains a div causing flag.
+  ///
+  /// @param[in] BB Basic block whose flag we check.
+  ///
+  /// @return true if the tag is div causing, false otherwise.
+  bool isDivCausing(const llvm::BasicBlock &BB) const;
+  /// @brief Check whether the basic block contains a divergent flag.
+  ///
+  /// @param[in] BB Basic block whose flag we check.
+  ///
+  /// @return true if the tag is divergent, false otherwise.
+  bool isDivergent(const llvm::BasicBlock &BB) const;
+  /// @brief Check whether the basic block contains an optional flag.
+  ///
+  /// @param[in] BB Basic block whose flag we check.
+  ///
+  /// @return true if the tag is optional, false otherwise.
+  bool isOptional(const llvm::BasicBlock &BB) const;
+  /// @brief Check whether the basic block contains a by_all flag.
+  ///
+  /// @param[in] BB Basic block whose flag we check.
+  ///
+  /// @return true if the tag is by_all, false otherwise.
+  bool isByAll(const llvm::BasicBlock &BB) const;
+  /// @brief Check whether the basic block contains a blend flag.
+  ///
+  /// @param[in] BB Basic block whose flag we check.
+  ///
+  /// @return true if the tag is blend, false otherwise.
+  bool isBlend(const llvm::BasicBlock &BB) const;
+  /// @brief Check whether the basic block contains a uniform flag.
+  ///
+  /// @param[in] BB Basic block whose flag we check.
+  ///
+  /// @return true if the tag is uniform, false otherwise.
+  bool isUniform(const llvm::BasicBlock &BB) const;
+
+  /// @brief Determine whether the tag contains the given flags or not.
+  ///
+  /// @param[in] L Loop whose flag we check.
+  /// @param[in] F Flags to test.
+  ///
+  /// @return true if the tag contains all the given flags, false otherwise.
+  bool hasFlag(const llvm::Loop &L, LoopDivergenceFlag F) const;
+  /// @brief Get the flags of the tag.
+  ///
+  /// @param[in] L Loop whose flag we want to get.
+  LoopDivergenceFlag getFlag(const llvm::Loop &L) const;
+  /// @brief Set the given flags for the tag.
+  ///
+  /// @param[in] L Loop whose flag we set.
+  /// @param[in] F Flags to set for the tag.
+  void setFlag(const llvm::Loop &L, LoopDivergenceFlag F);
+  /// @brief Clear the given flags for the tag.
+  ///
+  /// @param[in] L Loop whose flag we clear.
+  /// @param[in] F Flags to clear for the tag.
+  void clearFlag(const llvm::Loop &L, LoopDivergenceFlag F);
+
+  /// @brief Check if a block Src can reach a block Dst, either within the same
+  ///        SESE region, or outside too.
+  /// @param[in] src Source node.
+  /// @param[in] dst Destination node.
+  /// @param[in] allowLatch Whether reachability is computed with latches or
+  /// not.
+  /// @return Whether or not dst is reachable from src.
+  bool isReachable(llvm::BasicBlock *src, llvm::BasicBlock *dst,
+                   bool allowLatch = false) const;
+
+  /// @brief List of blocks having a divergent branch.
+  std::vector<llvm::BasicBlock *> const &getDivCausingBlocks() const {
+    return divCausingBlocks;
+  }
+
+ private:
+  friend class DivergenceAnalysis;
+
+  /// @brief Mark a block div causing and mark blocks that are control dependent
+  ///        to be divergent
+  /// @param[in] BB Div causing block.
+  /// @param[in,out] DI Divergence information of the function.
+  /// @param[in,out] PDT PostDominatorTree of the function.
+  void markDivCausing(llvm::BasicBlock &BB, DivergenceInfo &DI,
+                      llvm::PostDominatorTree &PDT);
+  /// @brief Mark divergent blocks in a loop (loop exits and latch) that are
+  ///        control dependent on a divergent branch.
+  /// @param[in] BB Div causing block.
+  /// @param[in] L Loop that BB diverges.
+  /// @param[in,out] DI Divergence information of the function.
+  void markDivLoopDivBlocks(llvm::BasicBlock &BB, llvm::Loop &L,
+                            DivergenceInfo &DI);
+  /// @brief Mark a block to be divergent.
+  /// @param[in] BB Block to mark.
+  void markDivergent(const llvm::BasicBlock &BB);
+  /// @brief Mark a loop to be divergent.
+  /// @param[in] L Loop to mark.
+  void markDivergent(const llvm::Loop &L);
+  /// @brief Recursively mark a block by_all.
+  /// @param[in] BB Block to mark.
+  void markByAll(llvm::BasicBlock &BB);
+
+  /// @brief Find join points of a block.
+  /// @param[in] src Starting block
+  /// @return List of blocks that have a disjoint path from the starting block.
+  llvm::DenseSet<llvm::BasicBlock *> joinPoints(llvm::BasicBlock &src) const;
+  /// @brief Find escape points of a divergent loop.
+  ///
+  /// Escape points are loop exit blocks through which some work-items may
+  /// leave the loop because of a divergent branch.
+  /// @param[in] src Divergent branch
+  /// @param[in] L Divergent loop
+  /// @return List of exit blocks some work-item may leave through.
+  llvm::DenseSet<llvm::BasicBlock *> escapePoints(llvm::BasicBlock const &src,
+                                                  llvm::Loop const &L) const;
+
+  /// @brief the Function the analysis was run on
+  llvm::Function &F;
+  /// @brief FunctionAnalysisManager providing analyses.
+  llvm::FunctionAnalysisManager &AM;
+
+  /// @brief Basic block tag mappings.
+  DenseBBMap BBMap;
+  /// @brief Loop tag mappings.
+  DenseLoopMap LMap;
+
+  /// @brief Storage for the Basic Block Tags
+  std::vector<BasicBlockTag> basicBlockTags;
+  /// @brief The number of blocks in the DCBI ordering.
+  size_t numOrderedBlocks = 0;
+
+  /// @brief List of Loop Tags ordered by loop depth
+  llvm::SmallVector<LoopTag *, 16> loopOrdering;
+
+  /// @brief Blocks that have a divergent branch.
+  std::vector<llvm::BasicBlock *> divCausingBlocks;
+
+  /// @brief Blocks with uniform conditions that must be considered div causing
+  ///        because they have a join point of a div causing block as their
+  ///        successor.
+  llvm::DenseSet<llvm::BasicBlock *> fakeDivCausingBlocks;
+};
+
+/// @brief Analysis that determines divergent blocks, i.e. program points
+///        that must not be skipped during SIMD execution.
+class DivergenceAnalysis : public llvm::AnalysisInfoMixin<DivergenceAnalysis> {
+  friend llvm::AnalysisInfoMixin<DivergenceAnalysis>;
+
+ public:
+  /// @brief Create a new analysis object.
+  DivergenceAnalysis() = default;
+
+  /// @brief Type of result produced by the analysis.
+  using Result = DivergenceResult;
+
+  /// @brief Run divergence analysis on the function, determining which of its
+  /// blocks (and loops) are divergent.
+  ///
+  /// @param[in] F Function to analyze.
+  /// @param[in] AM FunctionAnalysisManager providing analyses.
+  ///
+  /// @return Analysis result for the function.
+  Result run(llvm::Function &F, llvm::FunctionAnalysisManager &AM);
+
+  /// @brief Return the name of the pass.
+  static llvm::StringRef name() { return "Divergence analysis"; }
+
+ private:
+  /// @brief Unique identifier for the pass.
+  static llvm::AnalysisKey Key;
+};
+}  // namespace vecz
+
+#endif  // VECZ_ANALYSIS_DIVERGENCE_ANALYSIS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/instantiation_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/instantiation_analysis.h
new file mode 100644
index 0000000000000..1a837347d6d39
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/instantiation_analysis.h
@@ -0,0 +1,36 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef VECZ_ANALYSIS_INSTANTIATION_ANALYSIS_H_INCLUDED
+#define VECZ_ANALYSIS_INSTANTIATION_ANALYSIS_H_INCLUDED
+
+namespace llvm {
+class Instruction;
+}  // namespace llvm
+
+namespace vecz {
+class VectorizationContext;
+
+/// @brief Determine whether the given instruction needs to be instantiated.
+///
+/// @param[in] Ctx The vectorization context.
+/// @param[in] I Instruction to analyze.
+///
+/// @return true iff the instruction requires instantiation.
+bool needsInstantiation(VectorizationContext const &Ctx, llvm::Instruction &I);
+}  // namespace vecz
+
+#endif  // VECZ_ANALYSIS_INSTANTIATION_ANALYSIS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/liveness_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/liveness_analysis.h
new file mode 100644
index 0000000000000..0d6ed87f25a31
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/liveness_analysis.h
@@ -0,0 +1,100 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file liveness_analysis.h
+///
+/// @brief Live Variable Set Analysis
+
+#ifndef VECZ_ANALYSIS_LIVENESS_ANALYSIS_H
+#define VECZ_ANALYSIS_LIVENESS_ANALYSIS_H
+
+#include <llvm/ADT/DenseMap.h>
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/ADT/StringRef.h>
+#include <llvm/IR/PassManager.h>
+
+namespace llvm {
+class Loop;
+class LoopInfo;
+class Function;
+class BasicBlock;
+class Value;
+}  // namespace llvm
+
+namespace vecz {
+class VectorizationUnit;
+
+struct BlockLivenessInfo {
+  using LiveSet = llvm::SmallVector<llvm::Value *, 16>;
+  // Live sets of a block; their values are guaranteed to be in program order.
+  LiveSet LiveIn;
+  LiveSet LiveOut;
+  size_t MaxRegistersInBlock = 0;
+};
+
+class LivenessResult {
+ public:
+  LivenessResult(llvm::Function &F) : F(F) {}
+
+  LivenessResult() = delete;
+  LivenessResult(const LivenessResult &) = delete;
+  LivenessResult(LivenessResult &&) = default;
+  ~LivenessResult() = default;
+
+  void recalculate();
+
+  size_t getMaxLiveVirtualRegisters() const;
+  const BlockLivenessInfo &getBlockInfo(const llvm::BasicBlock *) const;
+
+ private:
+  class Impl;
+
+  llvm::Function &F;
+  // Zero-initialized: the constructor does not set it, so it would otherwise
+  // hold an indeterminate value until it is first assigned.
+  size_t maxNumberOfLiveValues = 0;
+  llvm::DenseMap<const llvm::BasicBlock *, BlockLivenessInfo> BlockInfos;
+};
+
+/// @brief Analysis pass to perform liveness analysis and estimate register
+/// pressure by counting the number of live virtual registers in a function.
+///
+/// Values in a basic block's live set are guaranteed to be in program order.
+class LivenessAnalysis : public llvm::AnalysisInfoMixin<LivenessAnalysis> {
+  friend llvm::AnalysisInfoMixin<LivenessAnalysis>;
+
+ public:
+  using Result = LivenessResult;
+
+  LivenessAnalysis() = default;
+
+  /// @brief Return the name of the pass.
+  static llvm::StringRef name() { return "Liveness analysis"; }
+
+  /// @brief Estimate the number of registers needed by F by counting the
+  /// number of live values.
+  ///
+  /// Assumes a reducible CFG. In OpenCL 1.2 whether or not irreducible control
+  /// flow is illegal is implementation defined.
+  Result run(llvm::Function &F, llvm::FunctionAnalysisManager &);
+
+  /// @brief Unique pass identifier.
+  static llvm::AnalysisKey Key;
+};
+
+}  // namespace vecz
+
+#endif  // VECZ_ANALYSIS_LIVENESS_ANALYSIS_H
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/packetization_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/packetization_analysis.h
new file mode 100644
index 0000000000000..5fc33d3857223
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/packetization_analysis.h
@@ -0,0 +1,106 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Packetization analysis.
+
+#ifndef VECZ_ANALYSIS_PACKETIZATION_ANALYSIS_H_INCLUDED
+#define VECZ_ANALYSIS_PACKETIZATION_ANALYSIS_H_INCLUDED
+
+#include <llvm/ADT/DenseSet.h>
+#include <llvm/ADT/StringRef.h>
+#include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/PassManager.h>
+
+namespace llvm {
+class Function;
+class Value;
+}  // namespace llvm
+
+namespace vecz {
+
+class StrideAnalysisResult;
+struct UniformValueResult;
+
+/// @brief Holds the result of Packetization Analysis for a given function.
+class PacketizationAnalysisResult {
+ public:
+  /// @brief The function being analyzed
+  llvm::Function &F;
+  /// @brief The Stride Analysis Result to use during analysis
+  StrideAnalysisResult &SAR;
+  /// @brief The Uniform Value Result to use during analysis
+  UniformValueResult &UVR;
+
+  /// @brief Traverse the function, starting from the vector leaves, and mark
+  /// instructions for packetization where needed. Note that the resulting set
+  /// may not be exhaustive, since it is not always easy to predict where the
+  /// packetizer might fail and fall back on instantiation, in which case
+  /// pointers will need to be packetized regardless of linear stride.
+  PacketizationAnalysisResult(llvm::Function &f, StrideAnalysisResult &sar);
+
+  /// @brief Returns whether the packetization set is empty or not.
+  bool isEmpty() const { return toPacketize.empty(); }
+
+  /// @brief Query whether the given value has been marked for packetization.
+  ///
+  /// @param[in] V the value to query
+  /// @return true if the value was marked for packetization, false otherwise.
+  bool needsPacketization(const llvm::Value *V) const {
+    return toPacketize.count(V) != 0;
+  }
+
+ private:
+  void markForPacketization(llvm::Value *V);
+
+  /// @brief The set of instructions that need to be packetized.
+  /// This equates to all non-uniform values except for values used only in
+  /// address computations with constant linear strides.
+  llvm::DenseSet<const llvm::Value *> toPacketize;
+};
+
+/// @brief Analysis that determines which values of a function need to be
+/// packetized (see PacketizationAnalysisResult).
+class PacketizationAnalysis
+    : public llvm::AnalysisInfoMixin<PacketizationAnalysis> {
+  friend AnalysisInfoMixin<PacketizationAnalysis>;
+
+ public:
+  /// @brief Create a new analysis object.
+  PacketizationAnalysis() = default;
+
+  using Result = PacketizationAnalysisResult;
+
+  /// @brief Run the Packetization Analysis
+  ///
+  /// @param[in] F Function to analyze.
+  /// @param[in] AM FunctionAnalysisManager providing analyses.
+  ///
+  /// @return Analysis result for the function.
+  Result run(llvm::Function &F, llvm::FunctionAnalysisManager &AM);
+
+  /// @brief Return the name of the pass.
+  static llvm::StringRef name() { return "Packetization analysis"; }
+
+ private:
+  /// @brief Unique identifier for the pass.
+  static llvm::AnalysisKey Key;
+};
+
+}  // namespace vecz
+
+#endif  // VECZ_ANALYSIS_PACKETIZATION_ANALYSIS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/simd_width_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/simd_width_analysis.h
new file mode 100644
index 0000000000000..43ea34eb9ed96
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/simd_width_analysis.h
@@ -0,0 +1,68 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief SIMD width analysis.
+
+#ifndef VECZ_ANALYSIS_SIMD_WIDTH_ANALYSIS_H_INCLUDED
+#define VECZ_ANALYSIS_SIMD_WIDTH_ANALYSIS_H_INCLUDED
+
+#include <llvm/ADT/StringRef.h>
+#include <llvm/IR/PassManager.h>
+
+#include "vectorization_unit.h"
+
+namespace vecz {
+
+class LivenessResult;
+
+/// @brief Choose a good SIMD width for the given function.
+class SimdWidthAnalysis : public llvm::AnalysisInfoMixin<SimdWidthAnalysis> {
+  friend AnalysisInfoMixin<SimdWidthAnalysis>;
+
+ public:
+  /// @brief Create a new instance of the pass.
+  SimdWidthAnalysis() = default;
+
+  /// @brief Result of the analysis: wraps the width returned by `run`.
+  struct Result {
+    Result(unsigned value) : value(value) {}
+    unsigned value;
+  };
+
+  /// @brief Run the SIMD width analysis pass on the given function.
+  /// @param[in] F Function to analyze.
+  /// @param[in] AM FunctionAnalysisManager providing analyses.
+  /// @return Preferred SIMD vectorization factor for the function or zero.
+  Result run(llvm::Function &F, llvm::FunctionAnalysisManager &AM);
+
+  /// @brief Return the name of the pass.
+  static llvm::StringRef name() { return "SIMD width analysis"; }
+
+ private:
+  unsigned avoidSpillImpl(llvm::Function &, llvm::FunctionAnalysisManager &,
+                          unsigned MinWidth = 2);
+
+  /// @brief Vector register width from TTI, if available (zero until set).
+  unsigned MaxVecRegBitWidth = 0;
+
+  /// @brief Unique pass identifier.
+  static llvm::AnalysisKey Key;
+};
+}  // namespace vecz
+
+#endif  // VECZ_ANALYSIS_SIMD_WIDTH_ANALYSIS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/stride_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/stride_analysis.h
new file mode 100644
index 0000000000000..f013fd1259c21
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/stride_analysis.h
@@ -0,0 +1,127 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Stride analysis.
+
+#ifndef VECZ_ANALYSIS_STRIDE_ANALYSIS_H_INCLUDED
+#define VECZ_ANALYSIS_STRIDE_ANALYSIS_H_INCLUDED
+
+#include <llvm/ADT/DenseMap.h>
+#include <llvm/ADT/StringRef.h>
+#include <llvm/Analysis/AssumptionCache.h>
+#include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/PassManager.h>
+
+#include "offset_info.h"
+
+namespace llvm {
+class Function;
+class Value;
+}  // namespace llvm
+
+namespace vecz {
+
+struct UniformValueResult;
+
+/// @brief Holds the result of Stride Analysis for a given function.
+class StrideAnalysisResult {
+ public:
+  /// @brief The function being analyzed
+  llvm::Function &F;
+  /// @brief The Uniform Value Result to use during analysis
+  UniformValueResult &UVR;
+  /// @brief AssumptionCache for computing live bits of uniform values
+  llvm::AssumptionCache assumptions;
+
+  StrideAnalysisResult(llvm::Function &f, UniformValueResult &uvr);
+
+  /// @brief generate stride `ConstantInt`s or `Instruction`s for all analyzed
+  /// values.
+  void manifestAll(llvm::IRBuilder<> &B);
+
+  /// @brief gets the OffsetInfo analyzed for V, or nullptr if there is none.
+  OffsetInfo *getInfo(llvm::Value *V) {
+    auto const find = analyzed.find(V);
+    return (find != analyzed.end()) ? &find->second : nullptr;
+  }
+
+  /// @brief const overload: the OffsetInfo for V, or nullptr if there is none.
+  OffsetInfo const *getInfo(llvm::Value *V) const {
+    auto const find = analyzed.find(V);
+    return (find != analyzed.end()) ? &find->second : nullptr;
+  }
+
+  /// @brief construct the offset info for the given value.
+  OffsetInfo &analyze(llvm::Value *V);
+
+  /// @brief build the strides as `Instructions` or `ConstantInts`.
+  /// Strides may be needed as `llvm::Values` by transform passes, but we are
+  /// not allowed to construct them during an analysis pass. However, note that
+  /// information about manifested stride `Value`s will survive until the
+  /// analysis is invalidated. V must already have been analyzed (asserted).
+  OffsetInfo const &manifest(llvm::IRBuilder<> &B, llvm::Value *V) {
+    auto const find = analyzed.find(V);
+    assert(find != analyzed.end() &&
+           "Trying to manifest unanalyzed OffsetInfo");
+    return find->second.manifest(B, *this);
+  }
+
+  /// @brief gets the manifested memory stride for this value, if present.
+  ///
+  /// @param[in] B IRBuilder for creating new instructions/values
+  /// @param[in] Ptr the pointer to calculate the stride for
+  /// @param[in] EleTy the type that the pointer points to
+  /// @returns the stride of the memory operation, in number of elements
+  llvm::Value *buildMemoryStride(llvm::IRBuilder<> &B, llvm::Value *Ptr,
+                                 llvm::Type *EleTy) const;
+
+ private:
+  /// @brief A map of values onto OffsetInfos that were already analyzed.
+  llvm::DenseMap<llvm::Value *, OffsetInfo> analyzed;
+};
+
+/// @brief Analysis that determines whether pointer operands of memory
+/// operations have a linear dependence on the work item ID.
+class StrideAnalysis : public llvm::AnalysisInfoMixin<StrideAnalysis> {
+  friend AnalysisInfoMixin<StrideAnalysis>;
+
+ public:
+  /// @brief Create a new analysis object; there is no state to initialize.
+  StrideAnalysis() {}
+
+  using Result = StrideAnalysisResult;
+
+  /// @brief Run the Stride Analysis
+  ///
+  /// @param[in] F Function to analyze.
+  /// @param[in] AM FunctionAnalysisManager providing analyses.
+  ///
+  /// @return Analysis result for the function.
+  Result run(llvm::Function &F, llvm::FunctionAnalysisManager &AM);
+
+  /// @brief Return the name of the pass.
+  static llvm::StringRef name() { return "Stride analysis"; }
+
+ private:
+  /// @brief Unique identifier for the pass.
+  static llvm::AnalysisKey Key;
+};
+
+}  // namespace vecz
+
+#endif  // VECZ_ANALYSIS_STRIDE_ANALYSIS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/uniform_value_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/uniform_value_analysis.h
new file mode 100644
index 0000000000000..2abd0d396d8d8
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/uniform_value_analysis.h
@@ -0,0 +1,188 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Uniform Value analysis.
+
+#ifndef VECZ_ANALYSIS_UNIFORM_VALUE_RANGE_ANALYSIS_H_INCLUDED
+#define VECZ_ANALYSIS_UNIFORM_VALUE_RANGE_ANALYSIS_H_INCLUDED
+
+#include <llvm/ADT/DenseMap.h>
+#include <llvm/ADT/StringRef.h>
+#include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/PassManager.h>
+
+#include <vector>
+
+namespace llvm {
+class Value;
+class Instruction;
+}  // namespace llvm
+
+namespace vecz {
+
+class VectorizationContext;
+class VectorizationUnit;
+
+/// @brief Holds the result of Uniform Value Analysis for a given function.
+struct UniformValueResult {
+  enum class VaryingKind {
+    /// @brief The value is uniform.
+    eValueUniform,
+    /// @brief The value is varying and lanes may see different values.
+    eValueVarying,
+    /// @brief The value is uniform, but its mask is not.
+    /// Used for masked memory operations with a uniform address but varying
+    /// mask.
+    eMaskVarying,
+  };
+
+  /// @brief The function the analysis was run on.
+  llvm::Function &F;
+  /// @brief Vectorization unit the analysis was run on.
+  VectorizationUnit &VU;
+  /// @brief The Vectorization Context of the analysis.
+  VectorizationContext &Ctx;
+  /// @brief The vectorization dimension
+  unsigned dimension;
+  /// @brief The actual results of the analysis.
+  llvm::DenseMap<const llvm::Value *, VaryingKind> varying;
+
+  /// @brief Create a new UVA result for the given unit.
+  /// @param[in] F Function to analyze.
+  /// @param[in] VU Vectorization unit the analysis is run on.
+  UniformValueResult(llvm::Function &F, VectorizationUnit &VU);
+
+  /// @brief Determine whether the given value needs to be packetized or not.
+  ///
+  /// @param[in] V Value to analyze.
+  ///
+  /// @return true if the value needs to be packetized, false otherwise.
+  bool isVarying(const llvm::Value *V) const;
+
+  /// @brief Determine whether the given value has a varying mask or not.
+  ///
+  /// @param[in] V Value to analyze.
+  ///
+  /// @return true if the value has a varying mask, false otherwise.
+  bool isMaskVarying(const llvm::Value *V) const;
+
+  /// @brief Determine whether the given value is varying or has a varying mask.
+  ///
+  /// @param[in] V Value to analyze.
+  ///
+  /// @return true if the value is varying or has a varying mask, false
+  /// otherwise.
+  bool isValueOrMaskVarying(const llvm::Value *V) const;
+
+  /// @brief Remove the value from the analysis.
+  ///
+  /// @param[in] V Value to remove.
+  void remove(const llvm::Value *V) { varying.erase(V); }
+
+  /// @brief Uncritically set a value to varying.
+  /// This can be used to keep the result valid after expression transforms.
+  /// Use with care, since it does not recursively update value users.
+  ///
+  /// @param[in] V Value to set.
+  void setVarying(const llvm::Value *V) {
+    varying[V] = VaryingKind::eValueVarying;
+  }
+
+  /// @brief Look for vector roots in the function.
+  ///
+  /// Roots are values which are scalar in the original function but are defined
+  /// to be vector in the vectorized function.
+  ///
+  /// Users of roots need to be vectorized too but are not considered roots.
+  /// As such they will not be returned in Roots.
+  ///
+  /// Examples:
+  /// * Calls to get_global_id()
+  /// * Calls to get_local_id()
+  ///
+  /// @param[in,out] Roots List of roots to update.
+  void findVectorRoots(std::vector<llvm::Value *> &Roots) const;
+
+  /// @brief Look for vector leaves in the function.
+  ///
+  /// Leaves are instructions that allow vectorized values to 'escape' from the
+  /// function.
+  ///
+  /// Examples:
+  /// * Store instructions (when the value to store is vectorized)
+  /// * Operands of call instructions (when the call needs to be vectorized)
+  /// * Return instructions
+  ///
+  /// @param[in,out] Leaves List of leaves to update.
+  void findVectorLeaves(std::vector<llvm::Instruction *> &Leaves) const;
+
+  /// @brief Find the alloca that this pointer points to
+  ///
+  /// @param[in] Pointer The pointer that is (potentially) pointing in an alloca
+  ///
+  /// @return the alloca if found, or nullptr otherwise
+  static llvm::AllocaInst *findAllocaFromPointer(llvm::Value *Pointer);
+
+  /// @brief Try to extract the base pointer of the address.
+  ///
+  /// @param[in] Address Address to split into base and offset.
+  ///
+  /// @return Base address.
+  llvm::Value *extractMemBase(llvm::Value *Address);
+
+  // private:
+  /// @brief Mark any value in the function that depends on V as being varying.
+  ///
+  /// @param[in] V Value used to start the vectorization search.
+  /// @param[in] From Optional value being used by `V`.
+  void markVaryingValues(llvm::Value *V, llvm::Value *From = nullptr);
+};
+
+/// @brief Analysis that determines whether values in a function are uniform
+/// or varying.
+class UniformValueAnalysis
+    : public llvm::AnalysisInfoMixin<UniformValueAnalysis> {
+  friend AnalysisInfoMixin<UniformValueAnalysis>;
+
+ public:
+  /// @brief Create a new analysis object; there is no state to initialize.
+  UniformValueAnalysis() {}
+
+  /// @brief Type of result produced by the analysis.
+  using Result = UniformValueResult;
+
+  /// @brief Determine which values in the function are uniform and which are
+  /// potentially varying.
+  ///
+  /// @param[in] F Function to analyze.
+  /// @param[in] AM FunctionAnalysisManager providing analyses.
+  ///
+  /// @return Analysis result for the function.
+  Result run(llvm::Function &F, llvm::FunctionAnalysisManager &AM);
+
+  /// @brief Return the name of the pass.
+  static llvm::StringRef name() { return "Uniform value analysis"; }
+
+ private:
+  /// @brief Unique identifier for the pass.
+  static llvm::AnalysisKey Key;
+};
+
+}  // namespace vecz
+
+#endif  // VECZ_ANALYSIS_UNIFORM_VALUE_RANGE_ANALYSIS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorizable_function_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorizable_function_analysis.h
new file mode 100644
index 0000000000000..bb4f248b121a8
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorizable_function_analysis.h
@@ -0,0 +1,72 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Vectorizable Function analysis.
+
+#ifndef VECZ_ANALYSIS_VECTORIZABLE_FUNCTION_ANALYSIS_H_INCLUDED
+#define VECZ_ANALYSIS_VECTORIZABLE_FUNCTION_ANALYSIS_H_INCLUDED
+
+#include <llvm/ADT/StringRef.h>
+#include <llvm/IR/PassManager.h>
+
+namespace vecz {
+
+/// @brief Determines whether vectorization of a function is possible.
+class VectorizableFunctionAnalysis
+    : public llvm::AnalysisInfoMixin<VectorizableFunctionAnalysis> {
+  friend AnalysisInfoMixin<VectorizableFunctionAnalysis>;
+
+ public:
+  /// @brief Create a new instance of the pass.
+  VectorizableFunctionAnalysis() = default;
+
+  /// @brief Type of result produced by the analysis.
+  struct Result {
+    /// @brief Whether the function can be vectorized.
+    bool canVectorize = false;
+
+    /// @brief If the function can not be vectorized, the value (if any) that
+    /// is the cause of the problem.
+    llvm::Value const *failedAt = nullptr;
+
+   public:
+    /// @brief Handle invalidation events from the new pass manager.
+    ///
+    /// @return false, as this analysis can never be invalidated.
+    bool invalidate(llvm::Function &, const llvm::PreservedAnalyses &,
+                    llvm::FunctionAnalysisManager::Invalidator &) {
+      return false;
+    }
+  };
+
+  /// @brief Determine whether vectorization of a function is possible.
+  /// @param[in] F Function to analyze.
+  /// @return Result stating whether F can be vectorized and, if not, why.
+  Result run(llvm::Function &F, llvm::FunctionAnalysisManager &);
+
+  /// @brief Return the name of the pass.
+  static llvm::StringRef name() { return "Vectorizable Function analysis"; }
+
+ private:
+  /// @brief Unique pass identifier.
+  static llvm::AnalysisKey Key;
+};
+
+}  // namespace vecz
+
+#endif  // VECZ_ANALYSIS_VECTORIZABLE_FUNCTION_ANALYSIS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorization_unit_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorization_unit_analysis.h
new file mode 100644
index 0000000000000..fda2d27c328c0
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorization_unit_analysis.h
@@ -0,0 +1,121 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file vectorization_unit_analysis.h
+///
+/// @brief VectorizationUnit analysis.
+
+#ifndef VECZ_ANALYSIS_VECTORIZATION_UNIT_H_INCLUDED
+#define VECZ_ANALYSIS_VECTORIZATION_UNIT_H_INCLUDED
+
+#include <llvm/ADT/StringRef.h>
+#include <llvm/IR/PassManager.h>
+
+#include <cassert>
+
+#include "vectorization_context.h"
+#include "vectorization_unit.h"
+
+namespace vecz {
+
+/// @brief Caches and returns the VectorizationUnit for a Function.
+class VectorizationUnitAnalysis
+    : public llvm::AnalysisInfoMixin<VectorizationUnitAnalysis> {
+  friend AnalysisInfoMixin<VectorizationUnitAnalysis>;
+
+ public:
+  /// @brief Create a new instance of the pass, keeping a reference to Ctx.
+  VectorizationUnitAnalysis(const VectorizationContext &Ctx) : Ctx(Ctx) {}
+
+  /// @brief Result of the analysis: a possibly-null VectorizationUnit pointer.
+  class Result {
+    VectorizationUnit *VU = nullptr;
+
+   public:
+    Result() = default;
+    Result(VectorizationUnit *VU) : VU(VU) {}
+    VectorizationUnit &getVU() const {
+      assert(hasResult());
+      return *VU;
+    }
+    bool hasResult() const { return VU != nullptr; }
+
+    /// @brief Handle invalidation events from the new pass manager.
+    ///
+    /// @return false, as this analysis can never be invalidated.
+    bool invalidate(llvm::Function &, const llvm::PreservedAnalyses &,
+                    llvm::FunctionAnalysisManager::Invalidator &) {
+      return false;
+    }
+  };
+
+  /// @brief Retrieve the VectorizationUnit for the requested function.
+  /// @param[in] F Function to analyze.
+  /// @return Result holding the VectorizationUnit for this function, if any.
+  Result run(llvm::Function &F, llvm::FunctionAnalysisManager &);
+
+  /// @brief Return the name of the pass.
+  static llvm::StringRef name() { return "VectorizationUnit analysis"; }
+
+ private:
+  const VectorizationContext &Ctx;
+  /// @brief Unique pass identifier.
+  static llvm::AnalysisKey Key;
+};
+
+/// @brief Caches and returns the VectorizationContext for a Function.
+class VectorizationContextAnalysis
+    : public llvm::AnalysisInfoMixin<VectorizationContextAnalysis> {
+  friend AnalysisInfoMixin<VectorizationContextAnalysis>;
+
+ public:
+  /// @brief Create a new instance of the pass, keeping a reference to Ctx.
+  VectorizationContextAnalysis(VectorizationContext &Ctx) : Context(Ctx) {}
+
+  /// @brief Result of the analysis: wraps a VectorizationContext reference.
+  class Result {
+    VectorizationContext &Ctx;
+
+   public:
+    Result(VectorizationContext &Ctx) : Ctx(Ctx) {}
+    VectorizationContext &getContext() { return Ctx; }
+    const VectorizationContext &getContext() const { return Ctx; }
+
+    /// @brief Handle invalidation events from the new pass manager.
+    ///
+    /// @return false, as this analysis can never be invalidated.
+    bool invalidate(llvm::Function &, const llvm::PreservedAnalyses &,
+                    llvm::FunctionAnalysisManager::Invalidator &) {
+      return false;
+    }
+  };
+
+  /// @brief Retrieve the VectorizationContext for the requested function.
+  /// @param[in] F Function to analyze.
+  /// @return Result wrapping the VectorizationContext for this function.
+  Result run(llvm::Function &F, llvm::FunctionAnalysisManager &);
+
+  /// @brief Return the name of the pass.
+  static llvm::StringRef name() { return "VectorizationContext analysis"; }
+
+ private:
+  VectorizationContext &Context;
+  /// @brief Unique pass identifier.
+  static llvm::AnalysisKey Key;
+};
+}  // namespace vecz
+
+#endif  // VECZ_ANALYSIS_VECTORIZATION_UNIT_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/control_flow_boscc.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/control_flow_boscc.h
new file mode 100644
index 0000000000000..d24c0ca6b31bb
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/control_flow_boscc.h
@@ -0,0 +1,273 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief BOSCC control flow transformation.
+///
+/// Style guideline 004 exemption note: This inner class declaration is in its
+/// own header file, because it's quite large.
+
+#ifndef VECZ_CONTROL_FLOW_BOSCC_H_INCLUDED
+#define VECZ_CONTROL_FLOW_BOSCC_H_INCLUDED
+
+#include <llvm/ADT/DenseSet.h>
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/Analysis/LoopInfo.h>
+#include <llvm/Transforms/Utils/ValueMapper.h>
+
+#include <utility>
+#include <vector>
+
+#include "transform/control_flow_conversion_pass.h"
+
+namespace llvm {
+class Instruction;
+class BasicBlock;
+class Function;
+class Loop;
+}  // namespace llvm
+
+namespace vecz {
+
+class LivenessResult;
+
+class ControlFlowConversionState::BOSCCGadget final {
+ public:
+  BOSCCGadget(ControlFlowConversionState &Pass)
+      : PassState(Pass),
+        F(Pass.F),
+        AM(Pass.AM),
+        DT(Pass.DT),
+        PDT(Pass.PDT),
+        LI(Pass.LI),
+        DR(Pass.DR),
+        RC(Pass.RC.get()) {}
+
+  /// @brief Region of code that will remain uniform after vectorization.
+  ///
+  /// Such regions won't have their instructions predicated. A UniformRegion
+  /// is delimited by a single-entry-single-exit region and is represented
+  /// by the blocks it contains.
+  struct UniformRegion final {
+    /// @brief Predicated blocks duplicated in the region.
+    llvm::DenseSet<llvm::BasicBlock *> predicatedBlocks;
+    /// @brief Uniform blocks created in the region.
+    llvm::DenseSet<llvm::BasicBlock *> uniformBlocks;
+    /// @brief Divergent branches that need a connection from the uniform
+    /// region.
+    std::vector<llvm::BasicBlock *> divergentBranches;
+    /// @brief The entry block of the uniform region (null until assigned).
+    llvm::BasicBlock *entryBlock = nullptr;
+    /// @brief The exit block of the uniform region (null until assigned).
+    llvm::BasicBlock *exitBlock = nullptr;
+
+    /// @brief Mapping between a connection point of a predicated region
+    ///        and the blend points of that region impacted by the former.
+    ///
+    /// Said "impacted blocks" are blocks with more than one predecessor that
+    /// need to have blend instructions because instructions defined within
+    /// that region may no longer dominate said "impacted blocks".
+    llvm::DenseMap<llvm::BasicBlock *, llvm::SmallVector<llvm::BasicBlock *, 2>>
+        blendPoints;
+
+    /// @brief It stores up information about the connection points while
+    ///        the CFG is being updated, to be applied afterwards.
+    struct ConnectionInfo {
+      llvm::BasicBlock *connectionPoint;
+      std::pair<llvm::BasicBlock *, llvm::BasicBlock *> incoming;
+    };
+
+    /// @brief The list of ConnectionInfos to be applied at finalization.
+    std::vector<ConnectionInfo> connections;
+
+    /// @brief It stores up information about new blocks created to contain
+    ///        blend LCSSA PHI nodes, so they can be created after the CFG
+    ///        has been updated.
+    struct StoreBlock {
+      llvm::BasicBlock *connectionPoint;
+      llvm::BasicBlock *target;
+      llvm::BasicBlock *runtimeCheckerBlock;
+    };
+
+    /// @brief The list of blend `StoreBlocks` to be applied at finalization.
+    llvm::SmallVector<StoreBlock, 4> storeBlocks;
+
+    /// @brief Find if a predicated block belongs to this region.
+    /// @param[in] B Block to look for in the region
+    /// @return Whether the block belongs to the region or not.
+    bool contains(llvm::BasicBlock *B) const {
+      return predicatedBlocks.count(B);
+    }
+  };
+  /// @brief List of all duplicated uniform regions.
+  using UniformRegions = std::vector<UniformRegion>;
+
+  /// @brief Create uniform regions to duplicate the blocks within such
+  /// regions.
+  ///
+  /// This allows to retain their uniform version to skip divergent branches
+  /// when the entry mask of a div causing block is dynamically uniform (i.e.
+  /// all true or all false). Nested uniform regions need not be duplicated
+  /// multiple times.
+  ///
+  /// @return true if no problem occurred, false otherwise.
+  bool duplicateUniformRegions();
+
+  /// @brief Connect the BOSCC regions.
+  /// @return true if no problem occurred, false otherwise.
+  bool connectBOSCCRegions();
+
+  /// @brief Get the uniform version of 'B'.
+  /// @param[in] B The predicated block whose uniform version we want.
+  /// @return A uniform block if it exists, nullptr otherwise.
+  llvm::BasicBlock *getBlock(llvm::BasicBlock *B);
+  /// @brief Get the uniform version of 'L'.
+  /// @param[in] L The predicated loop whose uniform version we want.
+  /// @return A uniform loop if it exists, nullptr otherwise.
+  llvm::Loop *getLoop(llvm::Loop *L);
+
+  /// @brief Get the region entry blocks that have not been duplicated.
+  /// @param[out] blocks SmallVector to hold the result
+  void getUnduplicatedEntryBlocks(
+      llvm::SmallVectorImpl<llvm::BasicBlock *> &blocks) const;
+
+  /// @brief Create an entry in the VMap so that 'uni' becomes a uniform
+  ///        equivalent of 'pred'.
+  /// @param[in] pred Predicate value
+  /// @param[in] uni Uniform value
+  /// @param[in] needsMapping Whether 'uni' needs to be remapped
+  void createReference(llvm::Value *pred, llvm::Value *uni,
+                       bool needsMapping = false);
+  /// @brief Add an entry in the VMap so that the uniform equivalent of
+  ///        'old' becomes the uniform equivalent of 'pred' as well.
+  /// @param[in] pred Predicate value
+  /// @param[in] old Predicate value whose uniform equivalent we want
+  void addReference(llvm::Value *pred, llvm::Value *old);
+  /// @brief Add a new block to all the regions the reference block is part
+  /// of.
+  /// @param[in] newB New block
+  /// @param[in] refB Reference block
+  void addInRegions(llvm::BasicBlock *newB, llvm::BasicBlock *refB);
+
+  /// @brief Link the masks of the predicated regions to the uniform regions.
+  /// @return true on success, false on failure.
+  bool linkMasks();
+
+  /// @brief Retrieve the uniform version, if one exists, of predicatedV
+  ///        defined in src.
+  /// @param[in] predicatedV The predicated value whose uniform version we
+  ///            want to get.
+  /// @return the uniform version if it exists, null otherwise.
+  llvm::Value *getUniformV(llvm::Value *predicatedV);
+  /// @brief Update the value a uniform value should be a duplicate of.
+  /// @param[in] from The old value
+  /// @param[in] to The new value
+  void updateValue(llvm::Value *from, llvm::Value *to);
+
+  /// @brief Clean up redundant PHI nodes created by BOSCC.
+  /// @return true if no problem occurred, false otherwise.
+  bool cleanUp();
+
+ private:
+  ControlFlowConversionState &PassState;
+  llvm::Function &F;
+  llvm::FunctionAnalysisManager &AM;
+  llvm::DominatorTree *DT = nullptr;
+  llvm::PostDominatorTree *PDT = nullptr;
+  llvm::LoopInfo *LI = nullptr;
+  DivergenceResult *DR = nullptr;
+  Reachability *RC = nullptr;
+
+  /// @brief Mapping between the uniform version and the predicated version
+  ///        of the BOSCC. This is useful to keep information between both
+  ///        versions shared, such as exit masks.
+  llvm::ValueToValueMapTy VMap;
+
+  /// @brief Mapping between the predicated version and the uniform version
+  ///        of the BOSCC loops.
+  llvm::DenseMap<const llvm::Loop *, llvm::Loop *> LMap;
+
+  UniformRegions uniformRegions;
+
+  /// @brief Original edges of the CFG. Used to connect the uniform regions
+  ///        to their predicated version.
+  llvm::DenseMap<llvm::BasicBlock *, llvm::SmallVector<llvm::BasicBlock *, 2>>
+      uniformEdges;
+
+  /// @brief Mapping between a block from which a value should be replaced by
+  ///        its blended value.
+  using URVBlender =
+      std::vector<std::pair<llvm::BasicBlock *,
+                            std::pair<llvm::Value *, llvm::Instruction *>>>;
+
+  URVBlender URVB;
+
+  LivenessResult *liveness = nullptr;
+
+  /// @brief Create uniform regions
+  /// @return true if no problem occurred, false otherwise.
+  bool createUniformRegions(
+      llvm::DenseSet<llvm::BasicBlock *> const &noDuplicateBlocks);
+  /// @brief Duplicate a loop, creating a new looptag and updating all the
+  ///        relevant information.
+  /// @param[in] L The loop to duplicate
+  /// @return true if no problem occurred, false otherwise.
+  bool duplicateUniformLoops(llvm::Loop *L);
+
+  /// @brief Connect the uniform blocks that belong to the uniform region
+  /// @param[in] region Uniform region we are connecting
+  /// @param[in] predicatedB Div causing block in the predicated version
+  /// @param[in] uniformB Div causing block in the uniform version
+  /// @return true if no problem occurred, false otherwise.
+  bool connectUniformRegion(UniformRegion &region,
+                            llvm::BasicBlock *predicatedB,
+                            llvm::BasicBlock *uniformB);
+
+  /// @brief Blend uniform region instructions into the predicated region
+  ///        connection point 'CP'.
+  /// @param[in] CP Connection point between a uniform and predicated region.
+  /// @param[in] incoming Predicated and uniform incoming block of 'CP'.
+  /// @return true if no problem occurred, false otherwise.
+  bool blendConnectionPoint(
+      llvm::BasicBlock *CP,
+      const std::pair<llvm::BasicBlock *, llvm::BasicBlock *> &incoming);
+
+  /// @brief Apply all the changes stored up by `connectUniformRegion`
+  ///        and `blendConnectionPoint` once the CFG has been fully updated.
+  /// @return true if no problem occurred, false otherwise.
+  bool blendFinalize();
+
+  /// @brief Update blend values in loop headers.
+  /// @param[in] LTag Loop whose blend values we update
+  /// @param[in] from The value we want to update
+  /// @param[in] to The value we update 'from' with.
+  /// @return true if no problem occurred, false otherwise.
+  bool updateLoopBlendValues(LoopTag *LTag, llvm::Instruction *from,
+                             llvm::Instruction *to);
+
+  /// @brief Generate a block ordering.
+  ///
+  /// This ordering differs a little bit from the one in
+  /// ControlFlowConversionPass as we must process all the blocks that belong
+  /// in the same uniform region at once.
+  ///
+  /// @returns true if no errors occurred.
+  bool computeBlockOrdering();
+};
+}  // namespace vecz
+
+#endif  // VECZ_CONTROL_FLOW_BOSCC_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/control_flow_roscc.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/control_flow_roscc.h
new file mode 100644
index 0000000000000..6e15810ef16fd
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/control_flow_roscc.h
@@ -0,0 +1,56 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief ROSCC control flow transformation.
+///
+/// Style guideline 004 exemption note: This inner class declaration is in its
+/// own header to match `control_flow_boscc.h`.
+
+#ifndef VECZ_CONTROL_FLOW_ROSCC_H_INCLUDED
+#define VECZ_CONTROL_FLOW_ROSCC_H_INCLUDED
+
+#include "transform/control_flow_conversion_pass.h"
+
+namespace llvm {
+class Instruction;
+class BasicBlock;
+class Loop;
+}  // namespace llvm
+
+namespace vecz {
+
+/// @brief Class that encapsulates the ROSCC transformation, which stands for
+///        "Return On Superword Condition Code" and optimizes non-uniform
+///        branches to the function return block(s).
+class ControlFlowConversionState::ROSCCGadget final {
+ public:
+  explicit ROSCCGadget(ControlFlowConversionState &Pass)
+      : UVR(Pass.UVR), DT(Pass.DT), PDT(Pass.PDT), LI(Pass.LI) {}
+
+  /// @brief Perform the ROSCC transformation on the function `F`.
+  bool run(llvm::Function &F);
+
+ private:
+  UniformValueResult *UVR = nullptr;
+  llvm::DominatorTree *DT = nullptr;
+  llvm::PostDominatorTree *PDT = nullptr;
+  llvm::LoopInfo *LI = nullptr;
+};
+}  // namespace vecz
+
+#endif  // VECZ_CONTROL_FLOW_ROSCC_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/debugging.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/debugging.h
new file mode 100644
index 0000000000000..2911916b2a9da
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/debugging.h
@@ -0,0 +1,201 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Functions, macros, etc used for debugging
+
+#ifndef VECZ_DEBUGGING_H_INCLUDED
+#define VECZ_DEBUGGING_H_INCLUDED
+
+#include <llvm/ADT/Optional.h>
+#include <llvm/ADT/StringRef.h>
+#include <llvm/IR/DiagnosticInfo.h>
+#include <llvm/IR/Function.h>
+#include <llvm/IR/Instruction.h>
+#include <llvm/IR/Module.h>
+#include <llvm/IR/PassManager.h>
+#include <llvm/IR/Value.h>
+#include <llvm/Support/raw_ostream.h>
+#include <multi_llvm/optional_helper.h>
+
+#include <cstdlib>
+#include <memory>
+
+namespace vecz {
+
+/// @brief Namespace used for vecz utils that we don't want to pollute the whole
+/// vecz namespace
+namespace internal {
+/// @brief Helper type for signaling a failure from functions that return either
+/// a pointer or a boolean to indicate if vectorization was successful or not
+struct VeczFailResult {
+  /// @brief For functions that return a boolean value
+  operator bool() const { return false; }
+  /// @brief For functions that return a pointer
+  template <typename T>
+  operator T *() const {
+    return nullptr;
+  }
+  /// @brief For functions that return a std::shared_ptr
+  template <typename T>
+  operator std::shared_ptr<T>() const {
+    return nullptr;
+  }
+  /// @brief For functions that return a std::unique_ptr
+  template <typename T>
+  operator std::unique_ptr<T>() const {
+    return nullptr;
+  }
+  /// @brief For functions that return a multi_llvm::Optional
+  template <typename T>
+  operator multi_llvm::Optional<T>() const {
+    return multi_llvm::None;
+  }
+};
+
+/// @brief Failure result that additionally converts to
+/// llvm::PreservedAnalyses, for functions returning that type.
+struct AnalysisFailResult : VeczFailResult {
+  // A failed optimization had better not have invalidated any analysis, so
+  // the conversion reports every analysis as preserved.
+  operator llvm::PreservedAnalyses() const {
+    return llvm::PreservedAnalyses::all();
+  }
+};
+
+/*
+ * The following macros are available:
+ *
+ * VECZ_FAIL: Return from the function with a failure value (e.g. `false` or
+ * `nullptr`).
+ *
+ * VECZ_FAIL_IF(cond): If (cond == true) then VECZ_FAIL
+ *
+ * VECZ_STAT_FAIL_IF(cond, stat): If (cond == true) then VECZ_FAIL and increment
+ * stat
+ *
+ * VECZ_ERROR_IF(cond, message): Similar to VECZ_FAIL_IF, but when NDEBUG is not
+ * set it aborts instead of returning a failure value.
+ *
+ * VECZ_ERROR(message): Similar to VECZ_ERROR_IF(true, message)
+ *
+ * VECZ_WARN_IF(cond, message): Similar to VECZ_ERROR_IF, but it only warns and
+ * carries on instead of aborting. Expands to nothing when NDEBUG is set.
+ *
+ * VECZ_UNREACHABLE(message): Abort with an error message (no-op under NDEBUG).
+ *
+ * For all the macros, the message is <<'d to llvm::errs(), so it is possible to
+ * print llvm Values etc. Debug builds expand it as `(message)`, so pass one
+ * streamable expression, e.g.: VECZ_WARN_IF(cond, *V)
+ */
+
+#define VECZ_FAIL() return vecz::internal::VeczFailResult()
+
+#define VECZ_FAIL_IF(cond) \
+  do {                     \
+    if (cond) {            \
+      VECZ_FAIL();         \
+    }                      \
+  } while (false)
+
+#define VECZ_STAT_FAIL_IF(cond, stat) \
+  do {                                \
+    if (cond) {                       \
+      ++(stat);                       \
+      VECZ_FAIL();                    \
+    }                                 \
+  } while (false)
+
+#define VECZ_ERROR_IF(cond, message) \
+  do {                               \
+    if (cond) {                      \
+      VECZ_ERROR(message);           \
+    }                                \
+  } while (false)
+
+#ifdef NDEBUG
+
+#define VECZ_ERROR(message)                                             \
+  do {                                                                  \
+    llvm::errs() << "!! Vecz: ERROR in " << __FILE__ << ":" << __LINE__ \
+                 << "\n";                                               \
+    llvm::errs() << "!! Reason: " << (message) << "\n";                 \
+    VECZ_FAIL();                                                        \
+  } while (false)
+
+#define VECZ_WARN_IF(cond, message) /* Nothing */
+#define VECZ_UNREACHABLE(message)   /* Nothing */
+
+#else /* !NDEBUG */
+
+#define VECZ_ERROR(message)                                             \
+  do {                                                                  \
+    llvm::errs() << "!! Vecz: ERROR in " << __FILE__ << ":" << __LINE__ \
+                 << "\n";                                               \
+    llvm::errs() << "!! Reason: " << (message) << "\n";                 \
+    std::abort();                                                       \
+  } while (false)
+
+#define VECZ_WARN_IF(cond, message)                                         \
+  do {                                                                      \
+    if (cond) {                                                             \
+      llvm::errs() << "!! Vecz: WARNING in " << __FILE__ << ":" << __LINE__ \
+                   << "\n";                                                 \
+      llvm::errs() << "!! Reason: " << (message) << "\n";                   \
+    }                                                                       \
+  } while (false)
+
+#define VECZ_UNREACHABLE(message)                                         \
+  do {                                                                    \
+    llvm::errs() << "!! Vecz: UNREACHABLE reached in " << __FILE__ << ":" \
+                 << __LINE__ << "\n";                                     \
+    llvm::errs() << "!! Message: " << (message) << "\n";                  \
+    std::abort();                                                         \
+  } while (false)
+#endif /* NDEBUG */
+}  // namespace internal
+
+#define VECZ_UNUSED(x) ((void)(x))
+
+/// @brief Emit a RemarkMissed message
+///
+/// @param[in] F The function in which we are currently working
+/// @param[in] V The value (can be `nullptr`) to be included in the message
+/// @param[in] Msg The main remark message text
+void emitVeczRemarkMissed(const llvm::Function *F, const llvm::Value *V,
+                          llvm::StringRef Msg);
+/// @brief Emit a RemarkMissed message
+///
+/// @param[in] F The function in which we are currently working
+/// @param[in] Msg The main remark message text
+void emitVeczRemarkMissed(const llvm::Function *F, llvm::StringRef Msg);
+/// @brief Emit a Remark message
+///
+/// @param[in] F The function in which we are currently working
+/// @param[in] V The value (can be `nullptr`) to be included in the message
+/// @param[in] Msg The main remark message text
+void emitVeczRemark(const llvm::Function *F, const llvm::Value *V,
+                    llvm::StringRef Msg);
+/// @brief Emit a Remark message
+///
+/// @param[in] F The function in which we are currently working
+/// @param[in] Msg The main remark message text
+void emitVeczRemark(const llvm::Function *F, llvm::StringRef Msg);
+
+}  // namespace vecz
+
+#endif  // VECZ_DEBUGGING_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/ir_cleanup.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/ir_cleanup.h
new file mode 100644
index 0000000000000..3c90865c7f5f2
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/ir_cleanup.h
@@ -0,0 +1,52 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef VECZ_IR_CLEANUP_H_INCLUDED
+#define VECZ_IR_CLEANUP_H_INCLUDED
+
+#include <llvm/ADT/SmallPtrSet.h>
+
+namespace llvm {
+class Instruction;
+}  // namespace llvm
+
+namespace vecz {
+class IRCleanup {
+ public:
+  /// @brief Mark the instruction as needing deletion. It will only be deleted
+  /// if it is unused. This is used to mark instructions with side-effects
+  /// (e.g. call, load, store and leaves) that have been replaced and are no
+  /// longer needed. Dead Code Elimination will not touch such instructions.
+  ///
+  /// @param[in] I Instruction to mark as needing deletion.
+  void deleteInstructionLater(llvm::Instruction *I);
+
+  /// @brief Get rid of instructions that have been marked for deletion.
+  void deleteInstructions();
+
+  /// @brief Immediately delete an instruction, and replace all uses with undef
+  ///
+  /// @param[in] I Instruction to delete.
+  static void deleteInstructionNow(llvm::Instruction *I);
+
+ private:
+  /// @brief Instructions that have been marked for deletion.
+  llvm::SmallPtrSet<llvm::Instruction *, 16> InstructionsToDelete;
+};
+
+}  // namespace vecz
+
+#endif  // VECZ_IR_CLEANUP_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/llvm_helpers.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/llvm_helpers.h
new file mode 100644
index 0000000000000..10a0f0adf7d94
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/llvm_helpers.h
@@ -0,0 +1,54 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief LLVM helper methods.
+
+#ifndef VECZ_LLVM_HELPERS_H_INCLUDED
+#define VECZ_LLVM_HELPERS_H_INCLUDED
+
+#include <llvm/ADT/ArrayRef.h>
+#include <llvm/IR/IRBuilder.h>
+#include <multi_llvm/llvm_version.h>
+#include <multi_llvm/vector_type_helper.h>
+
+namespace vecz {
+
+/// @brief Determine if the value has vector type, and return it.
+///
+/// @param[in] V Value to analyze.
+///
+/// @return Vector type of V or null.
+llvm::FixedVectorType *getVectorType(llvm::Value *V);
+
+/// @brief Get the default value for a type.
+///
+/// @param[in] T Type to get default value of.
+/// @param[in] V Default value to use for numeric type
+///
+/// @return Default value, which will be undef for non-numeric types
+llvm::Value *getDefaultValue(llvm::Type *T, uint64_t V = 0UL);
+
+/// @brief Get the shuffle mask as sequence of integers.
+///
+/// @param[in] Shuffle Instruction
+///
+/// @return Array of integers representing the Shuffle mask
+llvm::ArrayRef<int> getShuffleVecMask(llvm::ShuffleVectorInst *Shuffle);
+}  // namespace vecz
+
+#endif  // VECZ_LLVM_HELPERS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/memory_operations.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/memory_operations.h
new file mode 100644
index 0000000000000..8ddf484a3c0a6
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/memory_operations.h
@@ -0,0 +1,627 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Manipulation of memory operations like loads and stores.
+
+#ifndef VECZ_MEMORY_OPERATIONS_H_INCLUDED
+#define VECZ_MEMORY_OPERATIONS_H_INCLUDED
+
+#include <inttypes.h>
+#include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/LLVMContext.h>
+#include <multi_llvm/optional_helper.h>
+
+#include <string>
+
+namespace llvm {
+class CallInst;
+class LoadInst;
+class StoreInst;
+class Argument;
+class Function;
+class Instruction;
+class Value;
+class Type;
+}  // namespace llvm
+
+namespace vecz {
+
+class VectorizationContext;
+struct UniformValueResult;
+
+/// @brief Return or declare a masked memory operation builtin function.
+///
+/// @param[in] Ctx Context used to manipulate internal builtins.
+/// @param[in] DataTy Loaded type or stored value type.
+/// @param[in] PtrTy Pointer type. Must either be opaque or have its pointee
+/// type match DataTy.
+/// @param[in] Alignment Alignment of the operation.
+/// @param[in] IsLoad true if defining a masked load, false if a masked store.
+/// @param[in] IsVP true if defining a vector-predicated operation
+///
+/// @return Masked builtin function.
+llvm::Function *getOrCreateMaskedMemOpFn(VectorizationContext &Ctx,
+                                         llvm::Type *DataTy,
+                                         llvm::PointerType *PtrTy,
+                                         unsigned Alignment, bool IsLoad,
+                                         bool IsVP);
+
+/// @brief Create a call to a masked load operation builtin function.
+///
+/// @param[in] Ctx Context used to retrieve the builtin function.
+/// @param[in] Ty Type to load from memory.
+/// @param[in] Ptr Pointer. Internally bitcast to point to Ty.
+/// @param[in] Mask Mask.
+/// @param[in] EVL vector length as i32, else null (full width operation).
+/// @param[in] Alignment Alignment
+/// @param[in] Name Name to give to the call instruction.
+/// @param[in] InsertBefore Insertion point for the call instruction.
+///
+/// @return Call instruction or null on error.
+llvm::CallInst *createMaskedLoad(VectorizationContext &Ctx, llvm::Type *Ty,
+                                 llvm::Value *Ptr, llvm::Value *Mask,
+                                 llvm::Value *EVL, unsigned Alignment,
+                                 llvm::Twine Name = "",
+                                 llvm::Instruction *InsertBefore = nullptr);
+
+/// @brief Create a call to a masked store operation builtin function.
+///
+/// @param[in] Ctx Context used to retrieve the builtin function.
+/// @param[in] Data Stored value.
+/// @param[in] Ptr Pointer. Internally bitcast to pointer to Data's type.
+/// @param[in] Mask Mask.
+/// @param[in] EVL vector length as i32, else null (full width operation).
+/// @param[in] Alignment Alignment
+/// @param[in] Name Name to give to the call instruction.
+/// @param[in] InsertBefore Insertion point for the call instruction.
+///
+/// @return Call instruction or null on error.
+llvm::CallInst *createMaskedStore(VectorizationContext &Ctx, llvm::Value *Data,
+                                  llvm::Value *Ptr, llvm::Value *Mask,
+                                  llvm::Value *EVL, unsigned Alignment,
+                                  llvm::Twine Name = "",
+                                  llvm::Instruction *InsertBefore = nullptr);
+
+/// @brief Return or declare a (masked) interleaved memory operation builtin
+/// function.
+///
+/// @param[in] Ctx Context used to manipulate internal builtins.
+/// @param[in] DataTy Loaded type or stored value type.
+/// @param[in] PtrTy Pointer type. Must either be opaque or have its pointee
+/// type match DataTy's element type.
+/// @param[in] Stride The stride of the access. May be null in which case the
+/// default stride is used.
+/// @param[in] MaskTy The mask type. May be null for an unmasked operation.
+/// @param[in] Alignment Alignment of the operation.
+/// @param[in] IsLoad true if defining a load, false if defining a store.
+/// @param[in] IsVP true if defining a vector-predicated operation
+///
+/// @return (Masked) interleaved builtin function.
+llvm::Function *getOrCreateInterleavedMemOpFn(
+    VectorizationContext &Ctx, llvm::Type *DataTy, llvm::PointerType *PtrTy,
+    llvm::Value *Stride, llvm::Type *MaskTy, unsigned Alignment, bool IsLoad,
+    bool IsVP);
+
+/// @brief Create a call to a (masked) interleaved load builtin function. Also
+/// known as a strided load.
+///
+/// @param[in] Ctx Vectorization Context used to retrieve the builtin info.
+/// @param[in] Ty Type to load from memory
+/// @param[in] Ptr Pointer. Internally bitcast to point to Ty's element type.
+/// @param[in] Stride The stride of the operation. May be null in which case
+/// the default stride is used.
+/// @param[in] Mask The mask controlling the operation. May be null in which
+/// case an unmasked builtin is called.
+/// @param[in] EVL Vector length; likely i32 or null, as for createMaskedLoad.
+/// @param[in] Alignment Alignment of the operation.
+/// @param[in] Name Name to give to the call instruction.
+/// @param[in] InsertBefore Insertion point for the call instruction.
+///
+/// @return Call instruction or null on error.
+llvm::CallInst *createInterleavedLoad(
+    VectorizationContext &Ctx, llvm::Type *Ty, llvm::Value *Ptr,
+    llvm::Value *Stride, llvm::Value *Mask, llvm::Value *EVL,
+    unsigned Alignment, llvm::Twine Name = "",
+    llvm::Instruction *InsertBefore = nullptr);
+
+/// @brief Create a call to a (masked) interleaved store builtin function. Also
+/// known as a strided store.
+///
+/// @param[in] Ctx Vectorization Context used to retrieve the builtin info.
+/// @param[in] Data Data value to store to memory.
+/// @param[in] Ptr Pointer. Internally bitcast to point to Data's element type.
+/// @param[in] Stride The stride of the operation. May be null in which case
+/// the default stride is used.
+/// @param[in] Mask The mask controlling the operation. May be null in which
+/// case an unmasked builtin is called.
+/// @param[in] EVL Vector length; likely i32 or null, as for createMaskedStore.
+/// @param[in] Alignment Alignment of the operation.
+/// @param[in] Name Name to give to the call instruction.
+/// @param[in] InsertBefore Insertion point for the call instruction.
+///
+/// @return Call instruction or null on error.
+llvm::CallInst *createInterleavedStore(
+    VectorizationContext &Ctx, llvm::Value *Data, llvm::Value *Ptr,
+    llvm::Value *Stride, llvm::Value *Mask, llvm::Value *EVL,
+    unsigned Alignment, llvm::Twine Name = "",
+    llvm::Instruction *InsertBefore = nullptr);
+
+/// @brief Return or declare a (masked) scatter/gather memory operation builtin
+/// function.
+///
+/// @param[in] Ctx Context used to manipulate internal builtins.
+/// @param[in] DataTy Loaded type or stored value type.
+/// @param[in] VecPtrTy Pointer type. Must be a vector of pointers, each of
+/// which are either opaque or have a pointee type matching DataTy's element
+/// type.
+/// @param[in] MaskTy The mask type. May be null for an unmasked operation.
+/// @param[in] Alignment Alignment of the operation.
+/// @param[in] IsGather true if defining a gather (load), false if defining a
+/// scatter (store).
+/// @param[in] IsVP true if defining a vector-predicated operation
+///
+/// @return Scatter/gather builtin function.
+llvm::Function *getOrCreateScatterGatherMemOpFn(vecz::VectorizationContext &Ctx,
+                                                llvm::Type *DataTy,
+                                                llvm::VectorType *VecPtrTy,
+                                                llvm::Type *MaskTy,
+                                                unsigned Alignment,
+                                                bool IsGather, bool IsVP);
+
+/// @brief Create a call to a (masked) gather memory operation builtin
+/// function.
+///
+/// @param[in] Ctx Context used to retrieve the builtin function.
+/// @param[in] Ty Type to load from memory.
+/// @param[in] VecPtr Pointer value. Must be a vector of pointers, each of
+/// which are either opaque or have a pointee type matching Ty's element
+/// type.
+/// @param[in] Mask The predicate of the masked instruction. May be null in
+/// which case an unmasked builtin is created.
+/// @param[in] EVL vector length as i32, else null (full width operation).
+/// @param[in] Alignment Alignment of the operation.
+/// @param[in] Name Name to give to the call instruction.
+/// @param[in] InsertBefore Insertion point for the call instruction.
+///
+/// @return Call instruction or null on error.
+llvm::CallInst *createGather(VectorizationContext &Ctx, llvm::Type *Ty,
+                             llvm::Value *VecPtr, llvm::Value *Mask,
+                             llvm::Value *EVL, unsigned Alignment,
+                             llvm::Twine Name = "",
+                             llvm::Instruction *InsertBefore = nullptr);
+
+/// @brief Create a call to a (masked) scatter memory operation builtin
+/// function.
+///
+/// @param[in] Ctx Context used to retrieve the builtin function.
+/// @param[in] VecData Value to store to memory.
+/// @param[in] VecPtr Pointer value. Must be a vector of pointers, each of
+/// which are either opaque or have a pointee type matching VecData's element
+/// type.
+/// @param[in] Mask The predicate of the masked instruction. May be null in
+/// which case an unmasked builtin is created.
+/// @param[in] EVL vector length as i32, else null (full width operation).
+/// @param[in] Alignment Alignment of the operation.
+/// @param[in] Name Name to give to the call instruction.
+/// @param[in] InsertBefore Insertion point for the call instruction.
+///
+/// @return Call instruction or null on error.
+llvm::CallInst *createScatter(VectorizationContext &Ctx, llvm::Value *VecData,
+                              llvm::Value *VecPtr, llvm::Value *Mask,
+                              llvm::Value *EVL, unsigned Alignment,
+                              llvm::Twine Name = "",
+                              llvm::Instruction *InsertBefore = nullptr);
+
+/// @brief An enum to distinguish between loads and stores, and between builtin
+/// memop calls and native IR memop instructions.
+enum class MemOpKind : int {
+  /// @brief The object does not contain a valid memory operation.
+  Invalid = 0,
+  /// @brief The object contains an LLVM load instruction.
+  LoadInstruction,
+  /// @brief The object contains an LLVM store instruction.
+  StoreInstruction,
+  /// @brief The object contains a 'load-like' function call.
+  LoadCall,
+  /// @brief The object contains a 'store-like' function call.
+  StoreCall,
+};
+
+/// @brief an enum to distinguish between different memory access patterns
+enum class MemOpAccessKind : int {
+  /// @brief The object does not represent a vecz memop call
+  Native = 0,
+  /// @brief The object represents a masked memory operation
+  Masked,
+  /// @brief The object represents an interleaved memory operation
+  Interleaved,
+  /// @brief The object represents a masked interleaved memory operation
+  MaskedInterleaved,
+  /// @brief The object represents a scatter/gather memory operation
+  ScatterGather,
+  /// @brief The object represents a masked scatter/gather memory operation
+  MaskedScatterGather,
+};
+
+struct MemOp;
+
+/// @brief Describes a memory operation such as a load or a store.
+class MemOpDesc {
+  /// @brief Type of the data operand for stores, or memory type for loads.
+  llvm::Type *DataTy;
+  /// @brief Type of the pointer used to access memory.
+  llvm::Type *PtrTy;
+  /// @brief In the case of masked operations, type of the mask operand.
+  llvm::Type *MaskTy;
+  /// @brief Identifies the kind of memory operation which is performed.
+  MemOpKind Kind;
+  /// @brief Identifies the kind of memory access pattern
+  MemOpAccessKind AccessKind;
+  /// @brief Whether or not the memory access is vector-length predicated.
+  bool IsVLOp;
+  /// @brief Memory alignment.
+  unsigned Alignment;
+  /// @brief Distance between consecutive elements in memory, in number of
+  /// elements. Zero means uniform access, one means sequential access.
+  /// Negative values mean the access is done in reverse order.
+  llvm::Value *Stride;
+  /// @brief Index of the data operand, for stores, or negative value.
+  int8_t DataOpIdx;
+  /// @brief Index of the pointer operand.
+  int8_t PtrOpIdx;
+  /// @brief Index of the mask operand, for masked operations, or negative
+  /// value.
+  int8_t MaskOpIdx;
+  /// @brief Index of vector length operand, or negative value.
+  int8_t VLOpIdx;
+
+  friend struct MemOp;
+
+ public:
+  /// @brief Create an invalid memory operation.
+  MemOpDesc();
+
+  bool isMaskedMemOp() const { return AccessKind == MemOpAccessKind::Masked; }
+  bool isInterleavedMemOp() const {
+    return AccessKind == MemOpAccessKind::Interleaved;
+  }
+  bool isMaskedInterleavedMemOp() const {
+    return AccessKind == MemOpAccessKind::MaskedInterleaved;
+  }
+  bool isScatterGatherMemOp() const {
+    return AccessKind == MemOpAccessKind::ScatterGather;
+  }
+  bool isMaskedScatterGatherMemOp() const {
+    return AccessKind == MemOpAccessKind::MaskedScatterGather;
+  }
+
+  /// @brief In the case of stores, return the data element being stored.
+  llvm::Value *getDataOperand(llvm::Function *F) const {
+    return getOperand(F, DataOpIdx);
+  }
+
+  /// @brief Return the pointer used by the memory operation.
+  llvm::Value *getPointerOperand(llvm::Function *F) const {
+    return getOperand(F, PtrOpIdx);
+  }
+
+  /// @brief In the case of a masked memory operation, return the mask.
+  llvm::Value *getMaskOperand(llvm::Function *F) const {
+    return getOperand(F, MaskOpIdx);
+  }
+
+  /// @brief In the case of a masked memory operation, return the vector
+  /// length.
+  llvm::Value *getVLOperand(llvm::Function *F) const {
+    return getOperand(F, VLOpIdx);
+  }
+
+  /// @brief Index of the data operand of the MemOp
+  /// @return The index, or -1 if no data operand
+  int8_t getDataOperandIndex() const { return DataOpIdx; }
+  /// @brief Index of the pointer operand of the MemOp
+  /// @return The index, or -1 if no pointer operand
+  int8_t getPointerOperandIndex() const { return PtrOpIdx; }
+  /// @brief Index of the mask operand of the MemOp
+  /// @return The index, or -1 if no mask operand
+  int8_t getMaskOperandIndex() const { return MaskOpIdx; }
+  /// @brief Index of the vector-length operand of the MemOp
+  /// @return The index, or -1 if no vector-length operand
+  int8_t getVLOperandIndex() const { return VLOpIdx; }
+
+  /// @brief Get what kind of memory operation this is.
+  /// @return The kind of the memory operation
+  MemOpKind getKind() const { return Kind; }
+
+  /// @brief Get the alignment of the memory operation.
+  /// @return The alignment in bytes
+  unsigned getAlignment() const { return Alignment; }
+
+  /// @brief In the case of an interleaved memory operation, return the stride.
+  /// @return The Value determining the stride
+  llvm::Value *getStride() const { return Stride; }
+  /// @brief Determine if the stride is an integer whose value can be determined
+  /// at compile time.
+  /// @return True if the stride is a compile time integer constant
+  bool isStrideConstantInt() const;
+  /// @brief Get the stride as a constant int. It assumes that it is possible
+  /// and valid to do so.
+  /// @return The stride in elements
+  int64_t getStrideAsConstantInt() const;
+
+  /// @brief Return the type of data element being accessed in memory.
+  /// @return The type of the data element being accessed in memory.
+  llvm::Type *getDataType() const { return DataTy; }
+
+  /// @brief Return the type of the pointer operand.
+  /// @return The type of the pointer operand
+  llvm::Type *getPointerType() const { return PtrTy; }
+
+  /// @brief Return the specified operand from the function.
+  ///
+  /// @param[in] F Function to retrieve the operand from.
+  /// @param[in] OpIdx Index of the operand to retrieve.
+  ///
+  /// @return Operand or null.
+  llvm::Argument *getOperand(llvm::Function *F, int OpIdx) const;
+
+  /// @brief Determine whether the given function is a memory operation.
+  /// If that's the case, the descriptor is populated and returned.
+  ///
+  /// @param[in] F Function to analyze.
+  ///
+  /// @return A MemOpDesc if the given function is a memory operation.
+  /// llvm::None otherwise.
+  static multi_llvm::Optional<MemOpDesc> analyzeMemOpFunction(
+      llvm::Function &F);
+
+  /// @brief Determine whether the given function is a masked memory operation.
+  /// If that's the case, the descriptor is populated and returned.
+  ///
+  /// @param[in] F Function to analyze.
+  ///
+  /// @return A MemOpDesc if the given function is a masked memory operation.
+  /// llvm::None otherwise.
+  static multi_llvm::Optional<MemOpDesc> analyzeMaskedMemOp(llvm::Function &F);
+
+  /// @brief Determine whether the given function is an interleaved memory
+  /// operation or not. If that's the case, the descriptor is populated and
+  /// returned.
+  ///
+  /// @param[in] F Function to analyze.
+  ///
+  /// @return A MemOpDesc if the given function is an interleaved memory
+  /// operation. llvm::None otherwise.
+  static multi_llvm::Optional<MemOpDesc> analyzeInterleavedMemOp(
+      llvm::Function &F);
+
+  /// @brief Determine whether the given function is a masked interleaved memory
+  /// operation or not. If that's the case, the descriptor is populated and
+  /// returned.
+  ///
+  /// @param[in] F Function to analyze.
+  ///
+  /// @return A MemOpDesc if the given function is a masked interleaved memory
+  /// operation. llvm::None otherwise.
+  static multi_llvm::Optional<MemOpDesc> analyzeMaskedInterleavedMemOp(
+      llvm::Function &F);
+
+  /// @brief Determine whether the given function is a scatter/gather memory
+  /// operation or not. If that's the case, the descriptor is populated and
+  /// returned.
+  ///
+  /// @param[in] F Function to analyze.
+  ///
+  /// @return A MemOpDesc if the given function is a scatter/gather operation.
+  /// llvm::None otherwise.
+  static multi_llvm::Optional<MemOpDesc> analyzeScatterGatherMemOp(
+      llvm::Function &F);
+
+  /// @brief Determine whether the given function is a masked scatter/gather
+  /// memory operation or not. If that's the case, the descriptor is populated
+  /// and returned.
+  ///
+  /// @param[in] F Function to analyze.
+  ///
+  /// @return A MemOpDesc if the given function is a masked scatter/gather
+  /// operation. llvm::None otherwise.
+  static multi_llvm::Optional<MemOpDesc> analyzeMaskedScatterGatherMemOp(
+      llvm::Function &F);
+
+  /// @brief Determine whether the operation is a load or not.
+  ///
+  /// @return true when the kind is LoadInstruction or LoadCall, false for
+  /// every other kind.
+  bool isLoad() const {
+    // Any kind other than the two load kinds yields false.
+    const bool IsLoadInst = Kind == MemOpKind::LoadInstruction;
+    const bool IsLoadCall = Kind == MemOpKind::LoadCall;
+    return IsLoadInst || IsLoadCall;
+  }
+
+  /// @brief Determine whether the operation is a store or not.
+  ///
+  /// @return true when the kind is StoreInstruction or StoreCall, false for
+  /// every other kind.
+  bool isStore() const {
+    // Any kind other than the two store kinds yields false.
+    const bool IsStoreInst = Kind == MemOpKind::StoreInstruction;
+    const bool IsStoreCall = Kind == MemOpKind::StoreCall;
+    return IsStoreInst || IsStoreCall;
+  }
+
+  /// @brief Determine whether the operation is an instruction or not.
+  ///
+  /// @return true when the kind is LoadInstruction or StoreInstruction.
+  bool isLoadStoreInst() const {
+    const bool IsLoadInst = Kind == MemOpKind::LoadInstruction;
+    const bool IsStoreInst = Kind == MemOpKind::StoreInstruction;
+    return IsLoadInst || IsStoreInst;
+  }
+
+  /// @brief Return the IsVLOp flag of this descriptor.
+  /// NOTE(review): presumably marks vector-length-predicated ops; confirm.
+  bool isVLOp() const { return IsVLOp; }
+};
+
+/// @brief Wrapper that combines a memory operation descriptor and instruction.
+/// This allows manipulating different kinds of memory operations (load and
+/// store instructions, vecz builtins) in the same way.
+struct MemOp {
+  /// @brief Create an invalid memory operation with a null instruction.
+  MemOp() {}
+  /// @brief Create a memory operation from an instruction and an existing
+  /// memory operation descriptor.
+  ///
+  /// @param[in] I Memory instruction.
+  /// @param[in] Desc Memory operation descriptor.
+  MemOp(llvm::Instruction *I, const MemOpDesc &Desc);
+  /// @brief Create a memory operation from an instruction.
+  /// @param[in] I Instruction that may be a memory operation.
+  static multi_llvm::Optional<MemOp> get(llvm::Instruction *I);
+  /// @brief Create a memory operation from a memory builtin call instruction
+  /// and the kind of access to consider.
+  ///
+  /// @param[in] CI Memory builtin call instruction.
+  /// @param[in] AccessKind the kind of access to consider
+  static multi_llvm::Optional<MemOp> get(llvm::CallInst *CI,
+                                         MemOpAccessKind AccessKind);
+
+  /// @brief Access the memory operation descriptor.
+  const MemOpDesc &getDesc() const { return Desc; }
+
+  /// @brief Access the memory operation descriptor.
+  MemOpDesc &getDesc() { return Desc; }
+
+  /// @brief Return the instruction that performs the memory operation.
+  llvm::Instruction *getInstr() const { return Ins; }
+
+  /// @brief Return the alignment in bytes.
+  unsigned getAlignment() const { return Desc.getAlignment(); }
+
+  /// @brief In the case of an interleaved memory operation, return the stride.
+  llvm::Value *getStride() const { return Desc.getStride(); }
+
+  /// @brief Return the type of data element being accessed in memory.
+  llvm::Type *getDataType() const { return Desc.getDataType(); }
+
+  /// @brief Return the type of the pointer operand.
+  llvm::Type *getPointerType() const { return Desc.getPointerType(); }
+
+  /// @brief Determine whether the operation is a load or not.
+  bool isLoad() const { return Desc.isLoad(); }
+
+  /// @brief Determine whether the operation is a store or not.
+  bool isStore() const { return Desc.isStore(); }
+
+  /// @brief Determine whether the operation is an instruction or not.
+  bool isLoadStoreInst() const { return Desc.isLoadStoreInst(); }
+
+  /// @brief Determine whether the operation is a masked memop call
+  bool isMaskedMemOp() const { return Desc.isMaskedMemOp(); }
+
+  /// @brief Determine whether the operation is a masked scatter/gather call
+  bool isMaskedScatterGatherMemOp() const {
+    return Desc.isMaskedScatterGatherMemOp();
+  }
+
+  /// @brief Determine whether the operation is a masked interleaved memop call
+  bool isMaskedInterleavedMemOp() const {
+    return Desc.isMaskedInterleavedMemOp();
+  }
+
+  /// @brief In the case of stores, return the data element being stored.
+  /// @return Data operand or null.
+  llvm::Value *getDataOperand() const;
+  /// @brief Return the pointer used by the memory operation.
+  /// @return Pointer used by the memory operation or null for invalid
+  /// operations.
+  llvm::Value *getPointerOperand() const;
+  /// @brief In the case of a masked memory operation, return the mask.
+  /// @return Mask operand or null.
+  llvm::Value *getMaskOperand() const;
+
+  /// @brief In the case of stores, set the data element being stored.
+  /// @return true on success.
+  bool setDataOperand(llvm::Value *V);
+  /// @brief Set the pointer used by the memory operation.
+  /// @return true on success.
+  bool setPointerOperand(llvm::Value *V);
+  /// @brief In the case of a masked memory operation, set the mask.
+  /// @return true on success.
+  bool setMaskOperand(llvm::Value *V);
+
+  /// @brief In the case of a builtin memory operation, return the call.
+  /// @return Call instruction or null.
+  llvm::CallInst *getCall() const;
+
+  /// @brief Determine if the stride is an integer whose value can be determined
+  /// at compile time.
+  /// @return True if the stride is a compile time integer constant
+  bool isStrideConstantInt() const { return Desc.isStrideConstantInt(); }
+  /// @brief Get the stride as a constant int. It assumes that it is possible
+  /// and valid to do so.
+  /// @return The stride in elements
+  int64_t getStrideAsConstantInt() const {
+    return Desc.getStrideAsConstantInt();
+  }
+
+ private:
+  /// @brief Access an operand of the call instruction.
+  ///
+  /// @param[in] OpIdx Index of the operand to access.
+  ///
+  /// @return Specified operand of the call instruction.
+  llvm::Value *getCallOperand(int OpIdx) const;
+
+  /// @brief Set an operand of the call instruction.
+  ///
+  /// @param[in] OpIdx Index of the operand to access.
+  /// @param[in] V the Value to set
+  ///
+  /// @return true on success.
+  bool setCallOperand(int OpIdx, llvm::Value *V);
+
+  /// @brief Instruction that performs the memory operation; null when invalid.
+  llvm::Instruction *Ins = nullptr;
+  /// @brief Describes the memory operation.
+  MemOpDesc Desc;
+};
+
+namespace {
+/// @brief Build a pointer-sized integer constant: i32 when the module's data
+/// layout reports 4-byte pointers, i64 otherwise.
+inline llvm::ConstantInt *getSizeInt(llvm::IRBuilder<> &B, int64_t val) {
+  const auto &DL = B.GetInsertBlock()->getModule()->getDataLayout();
+  return DL.getPointerSize() == 4 ? B.getInt32(val) : B.getInt64(val);
+}
+
+/// @brief Pointer-sized integer type: i32 for 4-byte pointers, else i64.
+inline llvm::IntegerType *getSizeTy(llvm::Module &M) {
+  auto &Ctx = M.getContext();
+  const bool Is32Bit = M.getDataLayout().getPointerSize() == 4;
+  return Is32Bit ? llvm::Type::getInt32Ty(Ctx) : llvm::Type::getInt64Ty(Ctx);
+}
+
+inline llvm::IntegerType *getSizeTy(llvm::IRBuilder<> &B) {
+  return getSizeTy(*B.GetInsertBlock()->getModule());
+}
+}  // namespace
+}  // namespace vecz
+
+#endif  // VECZ_MEMORY_OPERATIONS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/offset_info.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/offset_info.h
new file mode 100644
index 0000000000000..77e2c49da72db
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/offset_info.h
@@ -0,0 +1,261 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Analysis of memory pointer offsets.
+
+#ifndef VECZ_OFFSET_INFO_H_INCLUDED
+#define VECZ_OFFSET_INFO_H_INCLUDED
+
+#include <inttypes.h>
+#include <llvm/IR/IRBuilder.h>
+
+namespace llvm {
+class CallInst;
+class Value;
+class Type;
+}  // namespace llvm
+
+namespace vecz {
+
+struct UniformValueResult;
+class ValueTagMap;
+
+/// @brief Work-item ID dependence kinds that an offset expression can have.
+/// Note that these are all mutually exclusive.
+enum OffsetKind {
+  /// @brief The offset may diverge in unmodelled ways when vectorized. This
+  /// state is to be assumed unless it can be proved otherwise.
+  eOffsetMayDiverge,
+  /// @brief The offset is a compile-time constant.
+  eOffsetConstant,
+  /// @brief The offset is a uniform variable.
+  eOffsetUniformVariable,
+  /// @brief The offset has a work-item ID dependence. The ID might be scaled
+  /// by some stride != 1, in which case loads or stores dependent on it will
+  /// be interleaved.
+  eOffsetLinear
+};
+
+class StrideAnalysisResult;
+
+/// @brief Describes an offset used by a load or store instruction we want to
+/// vectorize.
+struct OffsetInfo {
+  /// @brief Properties of the offset, which may prevent vectorization.
+  OffsetKind Kind;
+  /// @brief The actual value of the analyzed expression.
+  llvm::Value *const ActualValue;
+  /// @brief The difference in this value between two consecutive work items,
+  /// as a constant integer.
+  /// When the stride is a pointer, the difference is in bytes.
+  int64_t StrideInt;
+  /// @brief The difference in this value between two consecutive work items,
+  /// as a uniform value.
+  /// When the stride is a pointer, the difference is in bytes.
+  /// This is nullptr after analysis and is set upon calling `manifest()`.
+  llvm::Value *ManifestStride;
+
+  /// @brief A bit mask indicating which bits of the value it is possible to be
+  /// set, based on the expressions it depends on.
+  uint64_t BitMask;
+
+  /// @brief Construct a new offset information object from a general value
+  /// @param[in] SAR Stride analysis result used to retrieve other OffsetInfos.
+  /// @param[in] V Offset value to analyze.
+  OffsetInfo(StrideAnalysisResult &SAR, llvm::Value *V);
+
+  OffsetInfo() = delete;
+  OffsetInfo(const OffsetInfo &) = default;
+
+  /// @brief Return whether the offset has a non-analytical dependence on work
+  /// item ID.
+  bool mayDiverge() const { return Kind == eOffsetMayDiverge; }
+
+  /// @brief Return whether the offset has a linear dependence on work item ID.
+  bool hasStride() const { return Kind == eOffsetLinear; }
+
+  /// @brief Return whether the offset is a compile-time constant.
+  bool isConstant() const { return Kind == eOffsetConstant; }
+
+  /// @brief Return whether the offset has no dependence on work item ID.
+  bool isUniform() const {
+    return Kind == eOffsetConstant || Kind == eOffsetUniformVariable;
+  }
+
+  /// @brief Returns the actual value of the analyzed offset if it is uniform.
+  ///
+  /// @return The uniform Value or nullptr otherwise
+  llvm::Value *getUniformValue() const;
+  /// @brief Get the offset as a constant int. It assumes that it is possible to
+  /// do so.
+  /// @return The offset as an integer
+  int64_t getValueAsConstantInt() const;
+  /// @brief Get the Stride of the analyzed and manifested value.
+  /// @return The stride in number of elements
+  llvm::Value *getStride() const { return ManifestStride; }
+  /// @brief Determine whether the stride is simply a constant compile time
+  /// integer.
+  /// @return true if the stride is linear and constant, false otherwise.
+  bool isStrideConstantInt() const;
+  /// @brief Get the stride as a constant int.
+  /// @return The stride as an integer, or zero if the stride is not constant.
+  int64_t getStrideAsConstantInt() const;
+
+  /// @brief Convert the bytewise stride into an element-wise stride based on
+  /// the data type and data layout, as an integer.
+  ///
+  /// @param[in] PtrEleTy The element data type.
+  /// @param[in] DL The Data Layout.
+  /// @return The memory stride as number of elements.
+  uint64_t getConstantMemoryStride(llvm::Type *PtrEleTy,
+                                   llvm::DataLayout const *DL) const;
+
+  /// @brief Convert the bytewise stride into an element-wise stride based on
+  /// the data type and data layout, building instructions where needed. Note
+  /// that the stride must be manifest first.
+  ///
+  /// @param[in] B an IRBuilder used for creating constants or instructions.
+  /// @param[in] PtrEleTy The element data type.
+  /// @param[in] DL The Data Layout.
+  /// @return The memory stride as number of elements.
+  llvm::Value *buildMemoryStride(llvm::IRBuilder<> &B, llvm::Type *PtrEleTy,
+                                 llvm::DataLayout const *DL) const;
+
+  /// @brief Create Values that represent or compute strides.
+  ///
+  /// @param[in] B an IRBuilder used for creating constants or instructions.
+  /// @param[in] SAR Result of the stride analysis.
+  /// @return Reference to the current object for chaining.
+  OffsetInfo &manifest(llvm::IRBuilder<> &B, StrideAnalysisResult &SAR);
+
+ private:
+  /// @brief Mark this offset with the given flag.
+  /// @return Reference to the current object for chaining.
+  OffsetInfo &setKind(OffsetKind Kind);
+  /// @brief Mark this offset as having a stride component.
+  /// @param[in] Stride Stride component applied to the item ID.
+  /// @return Reference to the current object for chaining.
+  OffsetInfo &setStride(llvm::Value *Stride);
+  /// @brief Mark this offset as having a stride component.
+  /// @param[in] Stride Stride component applied to the item ID.
+  /// @return Reference to the current object for chaining.
+  OffsetInfo &setStride(int64_t Stride);
+  /// @brief Mark this offset as possibly diverging.
+  /// @return Reference to the current object for chaining.
+  OffsetInfo &setMayDiverge();
+
+  /// @brief Analyse the given integer offset for properties that we need to
+  /// know in order to vectorize loads and stores. In particular we are
+  /// interested in knowing whether the offset can diverge (be different for
+  /// different items) or not. We can handle divergence in several cases but not
+  /// all.
+  ///
+  /// @param[in] Offset Offset value to analyze.
+  /// @param[in] SAR Result of the stride analysis.
+  ///
+  /// @return Reference to the current object for chaining.
+  OffsetInfo &analyze(llvm::Value *Offset, StrideAnalysisResult &SAR);
+
+  /// @brief Analyse the given pointer for properties that we need to
+  /// know in order to vectorize loads and stores. In particular we are
+  /// interested in knowing whether the offset can diverge (be different for
+  /// different items) or not. We can handle divergence in several cases but not
+  /// all.
+  ///
+  /// @param[in] Address Pointer to analyze.
+  /// @param[in] SAR Result of the stride analysis.
+  ///
+  /// @return Reference to the current object for chaining.
+  OffsetInfo &analyzePtr(llvm::Value *Address, StrideAnalysisResult &SAR);
+
+  /// @brief Combine the offset info of LHS and RHS operands of an add
+  /// operation.
+  /// @param[in] LHS Offset info for the LHS operand.
+  /// @param[in] RHS Offset info for the RHS operand.
+  /// @return Reference to the current object for chaining.
+  OffsetInfo &combineAdd(const OffsetInfo &LHS, const OffsetInfo &RHS);
+  OffsetInfo &manifestAdd(llvm::IRBuilder<> &B, const OffsetInfo &LHS,
+                          const OffsetInfo &RHS);
+
+  /// @brief Combine the offset info of LHS and RHS operands of a sub operation.
+  /// @param[in] LHS Offset info for the LHS operand.
+  /// @param[in] RHS Offset info for the RHS operand.
+  /// @return Reference to the current object for chaining.
+  OffsetInfo &combineSub(const OffsetInfo &LHS, const OffsetInfo &RHS);
+  OffsetInfo &manifestSub(llvm::IRBuilder<> &B, const OffsetInfo &LHS,
+                          const OffsetInfo &RHS);
+
+  /// @brief Combine the offset info of LHS and RHS operands of an and
+  /// operation.
+  /// @param[in] LHS Offset info for the LHS operand.
+  /// @param[in] RHS Offset info for the RHS operand.
+  /// @return Reference to the current object for chaining.
+  OffsetInfo &combineAnd(const OffsetInfo &LHS, const OffsetInfo &RHS);
+  OffsetInfo &manifestAnd(llvm::IRBuilder<> &B, const OffsetInfo &LHS,
+                          const OffsetInfo &RHS);
+
+  /// @brief Combine the offset info of LHS and RHS operands of an or operation.
+  /// @param[in] LHS Offset info for the LHS operand.
+  /// @param[in] RHS Offset info for the RHS operand.
+  /// @return Reference to the current object for chaining.
+  OffsetInfo &combineOr(const OffsetInfo &LHS, const OffsetInfo &RHS);
+  OffsetInfo &manifestOr(llvm::IRBuilder<> &B, const OffsetInfo &LHS,
+                         const OffsetInfo &RHS);
+
+  /// @brief Combine the offset info of LHS and RHS operands of a xor operation.
+  /// @param[in] LHS Offset info for the LHS operand.
+  /// @param[in] RHS Offset info for the RHS operand.
+  /// @return Reference to the current object for chaining.
+  OffsetInfo &combineXor(const OffsetInfo &LHS, const OffsetInfo &RHS);
+  OffsetInfo &manifestXor(llvm::IRBuilder<> &B, const OffsetInfo &LHS,
+                          const OffsetInfo &RHS);
+
+  /// @brief Combine the offset info of LHS and RHS operands of a shl operation.
+  /// @param[in] LHS Offset info for the LHS operand.
+  /// @param[in] RHS Offset info for the RHS operand.
+  /// @return Reference to the current object for chaining.
+  OffsetInfo &combineShl(const OffsetInfo &LHS, const OffsetInfo &RHS);
+  OffsetInfo &manifestShl(llvm::IRBuilder<> &B, const OffsetInfo &LHS,
+                          const OffsetInfo &RHS);
+
+  /// @brief Combine the offset info of LHS and RHS operands of a ashr
+  /// operation.
+  /// @param[in] LHS Offset info for the LHS operand.
+  /// @param[in] RHS Offset info for the RHS operand.
+  /// @return Reference to the current object for chaining.
+  OffsetInfo &combineAShr(const OffsetInfo &LHS, const OffsetInfo &RHS);
+  OffsetInfo &manifestAShr(llvm::IRBuilder<> &B, const OffsetInfo &LHS,
+                           const OffsetInfo &RHS);
+
+  /// @brief Combine the offset info of LHS and RHS operands of a mul operation.
+  /// @param[in] LHS Offset info for the LHS operand.
+  /// @param[in] RHS Offset info for the RHS operand.
+  /// @return Reference to the current object for chaining.
+  OffsetInfo &combineMul(const OffsetInfo &LHS, const OffsetInfo &RHS);
+  OffsetInfo &manifestMul(llvm::IRBuilder<> &B, const OffsetInfo &LHS,
+                          const OffsetInfo &RHS);
+
+  /// @brief Copies the stride information from another OffsetInfo into this one
+  /// @param[in] Other the other OffsetInfo to copy from
+  /// @return Reference to the current object for chaining.
+  OffsetInfo &copyStrideFrom(const OffsetInfo &Other);
+};
+
+}  // namespace vecz
+
+#endif  // VECZ_OFFSET_INFO_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/reachability.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/reachability.h
new file mode 100644
index 0000000000000..402913d912925
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/reachability.h
@@ -0,0 +1,116 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief A utility class to speed up reachability queries on a CFG
+
+#ifndef VECZ_REACHABILITY_H_INCLUDED
+#define VECZ_REACHABILITY_H_INCLUDED
+
+#include <llvm/ADT/DenseMap.h>
+#include <llvm/ADT/SmallVector.h>
+
+#include <vector>
+
+namespace llvm {
+class BasicBlock;
+class DominatorTree;
+class Function;
+class LoopInfo;
+class PostDominatorTree;
+}  // namespace llvm
+
+namespace vecz {
+
+/// @brief A data structure to handle reachability queries
+class Reachability {
+ public:
+  /// @brief Construct the Reachability computation from a Dominator Tree
+  ///        and a Post-Dominator Tree, that are used to speed up the queries.
+  /// @param[in] DT the Dominator Tree
+  /// @param[in] PDT the Post-Dominator Tree
+  /// @param[in] LI the Loop Info
+  Reachability(llvm::DominatorTree &DT, llvm::PostDominatorTree &PDT,
+               llvm::LoopInfo &LI);
+
+  /// @brief Destructor
+  ~Reachability() = default;
+
+  /// @brief Computes a new data structure from the CFG of the given function,
+  ///        overwriting any data that was already present.
+  ///
+  /// Back edges are disregarded during this process.
+  void recalculate(llvm::Function &F);
+
+  /// @brief Computes a new data structure from the CFG of the given function,
+  ///        only if the structure is currently empty. Otherwise, does nothing.
+  void update(llvm::Function &F);
+
+  /// @brief Clears the data structure.
+  ///
+  /// Updating the underlying CFG invalidates the Reachability computations,
+  /// so it is required to clear the data ready to accept a new CFG.
+  void clear();
+
+  /// @brief Checks the internal consistency of the computed data structure.
+  bool validate() const;
+
+  /// @brief Check if a block is reachable from another.
+  ///
+  /// @param[in] from the BasicBlock to start from
+  /// @param[in] to the BasicBlock we are trying to reach
+  ///
+  /// @return True if "to" is reachable from "from"
+  bool isReachable(llvm::BasicBlock *from, llvm::BasicBlock *to) const;
+
+ private:
+  /// @brief Internal implementation of isReachable
+  ///
+  /// @param[in] from the graph node index to start from
+  /// @param[in] to the graph node index we are trying to reach
+  ///
+  /// @return True if "to" is reachable from "from"
+  bool isReachableImpl(size_t from, size_t to) const;
+
+  /// @brief The Dominator Tree
+  llvm::DominatorTree &DT;
+  /// @brief The Post-Dominator Tree
+  llvm::PostDominatorTree &PDT;
+  /// @brief The Loop Info, used to determine back-edges
+  llvm::LoopInfo &LI;
+
+  /// @brief Node structure containing implementation details
+  ///        computed and used by the algorithm.
+  struct Rnode {
+    size_t X = 0;
+    size_t Y = 0;
+    size_t dom = 0;
+    size_t postDom = 0;
+    unsigned predTmp = 0;
+    unsigned predecessors = 0;
+    llvm::SmallVector<size_t, 2> successors;
+  };
+
+  /// @brief The list of graph nodes that encode the graph.
+  std::vector<Rnode> graph;
+
+  /// @brief A mapping between BasicBlock pointers and graph node indices.
+  llvm::DenseMap<llvm::BasicBlock *, size_t> indexMap;
+};
+}  // namespace vecz
+
+#endif  // VECZ_REACHABILITY_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/simd_packet.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/simd_packet.h
new file mode 100644
index 0000000000000..1c7bf61e4e2ab
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/simd_packet.h
@@ -0,0 +1,100 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief SIMD packets hold a value for each lane.
+
+#ifndef VECZ_SIMD_PACKET_H_INCLUDED
+#define VECZ_SIMD_PACKET_H_INCLUDED
+
+#include "debugging.h"
+
+namespace llvm {
+class Value;
+}
+
+namespace vecz {
+
+/// @brief Represents the status of lanes within a packet. The most common
+/// status would be that a lane can be either enabled or disabled.
+struct PacketMask {
+  /// @brief Build a mask with every lane disabled.
+  explicit PacketMask() : Value{0} {}
+  /// @brief Build a mask from an already-populated bit field.
+  explicit PacketMask(uint64_t Mask) : Value{Mask} {}
+
+  /// @brief Query the state of a single lane.
+  /// @param[in] Lane Index of the lane to test.
+  /// @return true if the lane's bit is set, false otherwise.
+  bool isEnabled(unsigned Lane) const {
+    assert(Lane < CHAR_BIT * sizeof(Value) &&
+           "Invalid lane, possible mask overflow");
+    return ((Value >> Lane) & 1ull) != 0ull;
+  }
+
+  /// @brief Set the bit of the given lane.
+  /// @param[in] Lane Index of the lane to enable.
+  void enable(unsigned Lane) {
+    assert(Lane < CHAR_BIT * sizeof(Value) &&
+           "Invalid lane, possible mask overflow");
+    Value = Value | (1ull << Lane);
+  }
+
+  /// @brief Clear the bit of the given lane.
+  /// @param[in] Lane Index of the lane to disable.
+  void disable(unsigned Lane) {
+    assert(Lane < CHAR_BIT * sizeof(Value) &&
+           "Invalid lane, possible mask overflow");
+    Value = Value & ~(1ull << Lane);
+  }
+  /// @brief Enable multiple lanes [0: NumLanes)
+  /// @param[in] NumLanes Number of lanes to enable.
+  void enableAll(unsigned NumLanes);
+
+  /// @brief Bit field that describes which lanes are enabled.
+  /// NOTE: The length of bitfield is limited to sizeof(uint64_t) * CHAR_BIT(8)
+  uint64_t Value;
+};
+
+/// @brief Packet of LLVM values (e.g. instructions), one for each SIMD lane.
+struct SimdPacket : public llvm::SmallVector<llvm::Value *, 4> {
+  /// @brief Create a new packet with no values and all lanes disabled.
+  SimdPacket();
+
+  /// @brief Return the value at the given index.
+  /// @param[in] Index Index of the value to return.
+  /// @return Value at the given index or null.
+  llvm::Value *at(unsigned Index) const;
+  /// @brief Set the value at the given index and enable the corresponding lane.
+  /// @param[in] Index Index of the value to set.
+  /// @param[in] V Value to store at the given index.
+  void set(unsigned Index, llvm::Value *V);
+  /// @brief Copy all enabled lanes from the other packet and update the mask.
+  /// @param[in] Other Packet to copy values from.
+  /// @return Reference to the current packet.
+  SimdPacket &update(const SimdPacket &Other);
+
+  /// @brief Bitmask of lanes that are 'enabled' in this packet.
+  /// This can mean different things depending on the context:
+  /// * By default, only lanes that are 'enabled' have a valid value.
+  /// * When scalarizing, only lanes that are 'enabled' will be scalarized.
+  PacketMask Mask;
+};
+
+}  // namespace vecz
+
+#endif  // VECZ_SIMD_PACKET_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/common_gep_elimination_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/common_gep_elimination_pass.h
new file mode 100644
index 0000000000000..be3ab1ac66520
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/common_gep_elimination_pass.h
@@ -0,0 +1,56 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Remove duplicate GEP instructions.
+
+#ifndef VECZ_TRANSFORM_COMMON_GEP_ELIMINATION_PASS_H_INCLUDED
+#define VECZ_TRANSFORM_COMMON_GEP_ELIMINATION_PASS_H_INCLUDED
+
+#include <llvm/ADT/StringRef.h>
+#include <llvm/IR/PassManager.h>
+
+namespace vecz {
+
+class VectorizationUnit;
+
+/// @brief Removes duplicate GEP instructions before the packetization pass.
+class CommonGEPEliminationPass
+    : public llvm::PassInfoMixin<CommonGEPEliminationPass> {
+ public:
+  /// @brief Unique identifier for the pass.
+  static void *ID() { return static_cast<void *>(&PassID); }
+
+  /// @brief Remove duplicate GEP instructions.
+  ///
+  /// @param[in] F Function to optimize.
+  /// @param[in] AM FunctionAnalysisManager providing analyses.
+  ///
+  /// @return Preserved passes.
+  llvm::PreservedAnalyses run(llvm::Function &F,
+                              llvm::FunctionAnalysisManager &AM);
+
+  /// @brief Pass name.
+  static llvm::StringRef name() { return "Common GEP Elimination pass"; }
+
+ private:
+  /// @brief Identifier for the pass.
+  static char PassID;
+};
+}  // namespace vecz
+
+#endif  // VECZ_TRANSFORM_COMMON_GEP_ELIMINATION_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/control_flow_conversion_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/control_flow_conversion_pass.h
new file mode 100644
index 0000000000000..6290119f88df1
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/control_flow_conversion_pass.h
@@ -0,0 +1,153 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Control flow partial linearization transform.
+
+#ifndef VECZ_TRANSFORM_CONTROL_FLOW_CONVERSION_PASS_H_INCLUDED
+#define VECZ_TRANSFORM_CONTROL_FLOW_CONVERSION_PASS_H_INCLUDED
+
+#include <llvm/ADT/StringRef.h>
+#include <llvm/IR/PassManager.h>
+
+#include <memory>
+
+namespace llvm {
+class Function;
+class Value;
+class DominatorTree;
+class PostDominatorTree;
+class PreservedAnalyses;
+class LoopInfo;
+}  // namespace llvm
+
+namespace vecz {
+struct BasicBlockTag;
+struct LoopTag;
+struct UniformValueResult;
+class DivergenceResult;
+class VectorizationUnit;
+class VectorizationContext;
+class Reachability;
+
+/// \addtogroup cfg-conversion Control Flow Conversion Stage
+/// @{
+/// \ingroup vecz
+
+/// @brief Pass that performs control-flow to data-flow conversion for
+/// a function.
+class ControlFlowConversionPass
+    : public llvm::PassInfoMixin<ControlFlowConversionPass> {
+ public:
+  /// @brief Unique identifier for the pass (the address of PassID).
+  static void *ID() { return (void *)&PassID; }
+
+  /// @brief Perform control-flow to data-flow conversion on the function's CFG.
+  ///
+  /// @param[in] F Function to convert.
+  /// @param[in] AM FunctionAnalysisManager providing analyses.
+  ///
+  /// @return Preserved analyses.
+  llvm::PreservedAnalyses run(llvm::Function &F,
+                              llvm::FunctionAnalysisManager &AM);
+
+  /// @brief Pass name.
+  static llvm::StringRef name() {
+    return "Control flow to data flow conversion";
+  }
+
+ private:
+  /// @brief Storage whose address serves as the pass identifier.
+  static char PassID;
+};
+
+class ControlFlowConversionState {
+ public:
+  /// @brief The actual implementation of this pass
+  class Impl;
+
+ protected:
+  ControlFlowConversionState(llvm::Function &,
+                             llvm::FunctionAnalysisManager &AM);
+
+  /// @brief BOSCC (Branch On Superword Condition Code) data structure that
+  ///        encloses regions of the CFG that contain blocks that need to be
+  ///        duplicated.
+  class BOSCCGadget;
+
+  /// @brief ROSCC (Return On Superword Condition Code) utility class to
+  ///        optimize conditional function return branches.
+  class ROSCCGadget;
+
+  llvm::Function &F;
+  llvm::FunctionAnalysisManager &AM;
+  VectorizationUnit &VU;
+  VectorizationContext &Ctx;
+  llvm::DominatorTree *DT = nullptr;
+  llvm::PostDominatorTree *PDT = nullptr;
+  llvm::LoopInfo *LI = nullptr;
+  DivergenceResult *DR = nullptr;
+  UniformValueResult *UVR = nullptr;
+  std::unique_ptr<BOSCCGadget> BOSCC;
+  std::unique_ptr<Reachability> RC;
+
+ private:
+  struct MaskInfo {
+    /// @brief Mask that describes which lanes have exited the block.
+    llvm::SmallDenseMap<llvm::BasicBlock *, llvm::Value *, 4> exitMasks;
+    /// @brief Mask that describes which lanes are active at the start of the
+    /// basic block.
+    llvm::Value *entryMask = nullptr;
+  };
+  llvm::DenseMap<llvm::BasicBlock *, MaskInfo> MaskInfos;
+
+  /// @brief Get the MaskInfo struct for a Basic Block.
+  /// Note that the returned reference may be invalidated by subsequent calls.
+  ///
+  /// @param[in] BB the BasicBlock
+  /// @returns a reference to the MaskInfo (asserts that one exists for BB)
+  const MaskInfo &getMaskInfo(llvm::BasicBlock *BB) const {
+    auto const found = MaskInfos.find(BB);
+    assert(found != MaskInfos.end() &&
+           "Mask Info not constructed for Basic Block!");
+    return found->second;
+  }
+
+  /// @brief Replaces reachable uses of a value.
+  ///
+  /// @param[in] RC the reachability computation to use
+  /// @param[in] from the value to replace
+  /// @param[in] to the value to substitute
+  /// @param[in] src the basic block from which the value must be reachable
+  ///
+  /// @returns true
+  static bool replaceReachableUses(Reachability &RC, llvm::Instruction *from,
+                                   llvm::Value *to, llvm::BasicBlock *src);
+
+  /// @brief Generate a block ordering.
+  ///
+  /// This is based on a dominance-compact block indexing (DCBI) where we
+  /// topologically order blocks that belong to the same dominator tree.
+  ///
+  /// @returns true if no errors occurred.
+  bool computeBlockOrdering();
+};
+
+/// @}
+}  // namespace vecz
+
+#endif  // VECZ_TRANSFORM_CONTROL_FLOW_CONVERSION_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/inline_post_vectorization_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/inline_post_vectorization_pass.h
new file mode 100644
index 0000000000000..18c9466c55c73
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/inline_post_vectorization_pass.h
@@ -0,0 +1,49 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Replace calls to certain builtins with an inline implementation after
+/// vectorization.
+
+#ifndef VECZ_TRANSFORM_INLINE_POST_VECTORIZATION_PASS_H_INCLUDED
+#define VECZ_TRANSFORM_INLINE_POST_VECTORIZATION_PASS_H_INCLUDED
+
+#include <llvm/IR/PassManager.h>
+
+namespace vecz {
+
+/// @brief This pass replaces calls to builtins that require special attention
+/// after vectorization.
+class InlinePostVectorizationPass
+    : public llvm::PassInfoMixin<InlinePostVectorizationPass> {
+ public:
+  /// @brief Create a new pass object.
+  InlinePostVectorizationPass() {}
+
+  /// @brief The entry point to the pass.
+  /// @param[in,out] F Function to optimize.
+  /// @param[in,out] AM FunctionAnalysisManager providing analyses.
+  /// @returns Preserved analyses.
+  llvm::PreservedAnalyses run(llvm::Function &F,
+                              llvm::FunctionAnalysisManager &AM);
+  /// @brief Retrieve the pass's name.
+  /// @return The pass name as a string reference.
+  static llvm::StringRef name() { return "Inline Post Vectorization pass"; }
+};
+}  // namespace vecz
+
+#endif  // VECZ_TRANSFORM_INLINE_POST_VECTORIZATION_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/instantiation_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/instantiation_pass.h
new file mode 100644
index 0000000000000..9d010d167a49c
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/instantiation_pass.h
@@ -0,0 +1,113 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Function instantiator.
+
+#ifndef VECZ_TRANSFORM_INSTANTIATION_PASS_H_INCLUDED
+#define VECZ_TRANSFORM_INSTANTIATION_PASS_H_INCLUDED
+
+#include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/Instructions.h>
+
+namespace vecz {
+
+class Packetizer;
+class VectorizationContext;
+class PacketRange;
+struct MemOp;
+
+/// @brief Instantiation pass where instructions that need it (vector or not)
+/// are instantiated (i.e. duplicated with lane ID substitution), starting from
+/// the leaves.
+class InstantiationPass {
+ public:
+  /// @brief Create a new instantiation pass.
+  ///
+  /// @param[in] PP The packetizer object to call back to when required.
+  InstantiationPass(Packetizer &PP);
+
+  /// @brief Instantiate the given value from the function.
+  /// The returned value is equivalent to a clone of the V 'expression' with any
+  /// work-item ID (e.g. from get_global_id) adjusted with the lane's ID.
+  ///
+  /// @param[in] V Value to instantiate.
+  ///
+  /// @return Instantiated value.
+  PacketRange instantiate(llvm::Value *V);
+
+ private:
+  /// @brief Duplicates an instruction across all SIMD Lanes.
+  ///
+  /// @param[in] I The instruction to duplicate across lanes
+  ///
+  /// @return The SIMD Packet
+  PacketRange instantiateByCloning(llvm::Instruction *I);
+  /// @brief Broadcasts an instruction across all SIMD Lanes.
+  ///
+  /// @param[in] I The instruction to extract elements from
+  ///
+  /// @return The SIMD Packet
+  PacketRange simdBroadcast(llvm::Instruction *I);
+  /// @brief Instantiate the given value from the function.
+  /// The returned value is equivalent to a clone of the V 'expression' with any
+  /// work-item ID (e.g. from get_global_id) adjusted with the lane's ID.
+  ///
+  /// @param[in] V Value to instantiate.
+  ///
+  /// @return Instantiated value.
+  PacketRange instantiateInternal(llvm::Value *V);
+  /// @brief Instantiate the given instruction from the function.
+  /// The returned value is equivalent to a clone of the instruction Ins with
+  /// any work-item ID (e.g. from get_global_id) adjusted with the lane's ID.
+  ///
+  /// @param[in] Ins Instruction to instantiate.
+  ///
+  /// @return Instantiated value.
+  PacketRange instantiateInstruction(llvm::Instruction *Ins);
+  /// @brief Perform post-instantiation tasks.
+  ///
+  /// @param[in] P Packet that is the result of instantiation or null.
+  /// @param[in] V Value that was instantiated.
+  ///
+  /// @return Instantiated packet or null.
+  PacketRange assignInstance(const PacketRange P, llvm::Value *V);
+  /// @brief Create a packet where all lanes contain the same value.
+  ///
+  /// @param[in] V Value to broadcast.
+  ///
+  /// @return Packet with the broadcasted value.
+  PacketRange broadcast(llvm::Value *V);
+  /// @brief Instantiate a call instruction.
+  ///
+  /// @param[in] CI Instruction to instantiate.
+  ///
+  /// @return Instantiated packet for the given instruction.
+  PacketRange instantiateCall(llvm::CallInst *CI);
+  /// @brief Instantiate an alloca instruction.
+  ///
+  /// @param[in] Alloca Instruction to instantiate.
+  ///
+  /// @return Instantiated packet for the given instruction.
+  PacketRange instantiateAlloca(llvm::AllocaInst *Alloca);
+
+  VectorizationContext &Ctx;
+  Packetizer &packetizer;
+};
+}  // namespace vecz
+
+#endif  // VECZ_TRANSFORM_INSTANTIATION_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/interleaved_group_combine_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/interleaved_group_combine_pass.h
new file mode 100644
index 0000000000000..f73f17c20197d
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/interleaved_group_combine_pass.h
@@ -0,0 +1,94 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Combine groups of interleaved memory operations.
+
+#ifndef VECZ_TRANSFORM_INTERLEAVED_GROUP_COMBINE_PASS_H_INCLUDED
+#define VECZ_TRANSFORM_INTERLEAVED_GROUP_COMBINE_PASS_H_INCLUDED
+
+#include <llvm/IR/PassManager.h>
+
+#include "analysis/uniform_value_analysis.h"
+#include "vecz/vecz_target_info.h"
+
+namespace llvm {
+class ScalarEvolution;
+}
+
+namespace vecz {
+
+class VectorizationUnit;
+
+/// @brief Combine groups of interleaved memory operations.
+class InterleavedGroupCombinePass
+    : public llvm::PassInfoMixin<InterleavedGroupCombinePass> {
+ public:
+  /// @brief Create a new pass object; scalarEvolution starts out null.
+  ///
+  /// @param[in] kind Kind of interleaved operation to combine.
+  InterleavedGroupCombinePass(InterleavedOperation kind)
+      : Kind(kind), scalarEvolution(nullptr) {}
+
+  /// @brief Unique identifier for the pass (the address of PassID).
+  static void *ID() { return (void *)&PassID; }
+
+  /// @brief Combine groups of interleaved operations.
+  ///
+  /// @param[in] F Function whose interleaved operations are combined.
+  /// @param[in] AM FunctionAnalysisManager providing analyses.
+  ///
+  /// @return Preserved analyses.
+  llvm::PreservedAnalyses run(llvm::Function &F,
+                              llvm::FunctionAnalysisManager &AM);
+
+  /// @brief Pass name.
+  static llvm::StringRef name() {
+    return "Combine interleaved memory instructions";
+  }
+
+ private:
+  /// @brief Information about an interleaved operation.
+  struct InterleavedOpInfo;
+
+  /// @brief Information about a group of interleaved operations.
+  struct InterleavedGroupInfo;
+
+  /// @brief Try to find a group of interleaved instructions that have the same
+  /// stride and collectively access a consecutive chunk of memory.
+  ///
+  /// @param[in] Ops List of interleaved operations to analyze.
+  /// @param[in] UVR Result of uniform value analysis.
+  /// @param[out] Info Information about a group of interleaved instructions.
+  ///
+  /// @return true if a group was found or false otherwise.
+  bool findGroup(const std::vector<InterleavedOpInfo> &Ops,
+                 UniformValueResult &UVR, InterleavedGroupInfo &Info);
+
+  /// @brief Storage whose address serves as the pass identifier.
+  static char PassID;
+  /// @brief Kind of interleaved operation to combine.
+  InterleavedOperation Kind;
+
+  /// @brief Scalar Evolution Analysis that allows us to subtract two pointers
+  /// to find any constant offset between them.
+  llvm::ScalarEvolution *scalarEvolution;
+};
+
+}  // namespace vecz
+
+#endif  // VECZ_TRANSFORM_INTERLEAVED_GROUP_COMBINE_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_helpers.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_helpers.h
new file mode 100644
index 0000000000000..1dac719f28eee
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_helpers.h
@@ -0,0 +1,220 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Function packetizer helper classes.
+
+#ifndef VECZ_TRANSFORM_PACKETIZATION_HELPERS_H_INCLUDED
+#define VECZ_TRANSFORM_PACKETIZATION_HELPERS_H_INCLUDED
+
+#include <llvm/ADT/DenseMap.h>
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/IR/IRBuilder.h>
+#include <multi_llvm/llvm_version.h>
+#include <multi_llvm/multi_llvm.h>
+
+#include <memory>
+
+namespace llvm {
+class Value;
+class ShuffleVectorInst;
+class Twine;
+}  // namespace llvm
+
+namespace vecz {
+class TargetInfo;
+struct SimdPacket;
+
+/// @brief Provides the insertion point after the value V. Intended to be used
+/// in IRBuilder constructor. If V has a position in the function, (e.g., an
+/// Instruction), this method will return the next point after that. If V has
+/// no position (e.g., a Constant or an Argument) then this method will return
+/// a suitable insertion point at the beginning of the function.
+///
+/// @param[in] V Value to insert instructions after, if an llvm::Instruction.
+/// @param[in] F Function to insert instructions into, if V is not an
+/// llvm::Instruction.
+/// @param[in] IsPhi true if the instructions to insert are phis, false if the
+/// insertion point should be after all phis in the basic block.
+///
+/// @return Insertion Point.
+llvm::Instruction *buildAfter(llvm::Value *V, llvm::Function &F,
+                              bool IsPhi = false);
+
+/// @brief Utility function for building a shufflevector instruction, absorbing
+/// its operands where possible.
+///
+/// @param[in] B IRBuilder to build any new instruction created
+/// @param[in] srcA the first vector operand of the new shuffle
+/// @param[in] srcB the second vector operand of the new shuffle
+/// @param[in] mask the shuffle mask
+/// @param[in] name the name of the new instruction
+///
+/// @return a value identical to the requested shufflevector
+llvm::Value *createOptimalShuffle(llvm::IRBuilder<> &B, llvm::Value *srcA,
+                                  llvm::Value *srcB,
+                                  const llvm::SmallVectorImpl<int> &mask,
+                                  const llvm::Twine &name = llvm::Twine());
+
+/// @brief Utility function for splatting a vector of scalars to create a
+/// "vector of vectors", being the concatenation of vector splats of its
+/// elements. e.g. subSplat("ABCD", 4) == "AAAABBBBCCCCDDDD"
+///
+/// Only works on fixed vector types.
+///
+/// @param[in] TI TargetInfo for target-dependent optimizations
+/// @param[in] B IRBuilder to build any new instructions created
+/// @param[in,out] srcs The packet of vectors to sub-splat
+/// @param[in] subWidth The width of the individual splats
+///
+/// @return true on success
+bool createSubSplats(const vecz::TargetInfo &TI, llvm::IRBuilder<> &B,
+                     llvm::SmallVectorImpl<llvm::Value *> &srcs,
+                     unsigned subWidth);
+
+/// @brief Utility function for sanitizing the input to a reduction when
+/// vector-predicating. Since VP reduction intrinsics didn't land in LLVM 13,
+/// reductions must ensure that elements past VL don't affect the result.
+///
+/// Only works on RecurKind::And, Or, Add, SMin, SMax, UMin, UMax, FAdd.
+///
+/// @param[in] B IRBuilder to build any new instructions created
+/// @param[in] Val The value to sanitize
+/// @param[in] VL The vector length
+/// @param[in] Kind The kind of reduction to sanitize for
+llvm::Value *sanitizeVPReductionInput(llvm::IRBuilder<> &B, llvm::Value *Val,
+                                      llvm::Value *VL, llvm::RecurKind Kind);
+
+/// @brief Utility function to obtain an indices vector to be used in a gather
+/// operation.
+///
+/// When accessing a vector using an indices vector, this must be
+/// modified taking into account the SIMD width.
+///
+/// @param[in] B IRBuilder to build any new instructions created
+/// @param[in] Indices Original indices vector
+/// @param[in] Ty Type of the output vector
+/// @param[in] FixedVecElts Original vector length
+/// @param[in] N Name of the output variable
+///
+/// @return An indices vector to be used in a gather operation; nullptr for LLVM
+/// version < 13.
+llvm::Value *getGatherIndicesVector(llvm::IRBuilder<> &B, llvm::Value *Indices,
+                                    llvm::Type *Ty, unsigned FixedVecElts,
+                                    const llvm::Twine &N = "");
+
+/// @brief Class that represents a range in a vector of Value pointers.
+/// The range is represented by its integer starting index and length, so that
+/// it remains valid if the vector re-allocates its storage.
+class PacketRange {
+ public:
+  using value_type = llvm::Value *;
+  using iterator = value_type *;
+  using const_iterator = const value_type *;
+  using reference = value_type &;
+  using const_reference = const value_type &;
+
+  /// @brief Construct an empty range
+  constexpr PacketRange(std::vector<llvm::Value *> &d)
+      : data(d), start(0), length(0) {}
+  /// @brief Construct a range with given start index and length
+  constexpr PacketRange(std::vector<llvm::Value *> &d, size_t s, size_t l)
+      : data(d), start(s), length(l) {}
+
+  /// @brief Copy constructor
+  constexpr PacketRange(const PacketRange &) = default;
+  /// @brief Move constructor
+  constexpr PacketRange(PacketRange &&) = default;
+  /// @brief Destructor
+  ~PacketRange() = default;
+
+  /// @brief Return the length of the range
+  size_t size() const { return length; }
+  /// @brief Begin iterator; uses data() so an empty vector is not dereferenced
+  iterator begin() { return data.data() + start; }
+  /// @brief Const begin iterator; same empty-vector-safe form as above
+  const_iterator begin() const { return data.data() + start; }
+  /// @brief Standard container end iterator
+  iterator end() { return begin() + length; }
+  /// @brief Standard container end const iterator
+  const_iterator end() const { return begin() + length; }
+  /// @brief Return a reference to the element at given index (unchecked)
+  reference at(size_t i) { return data[start + i]; }
+  /// @brief Return a const reference to the element at given index (unchecked)
+  const_reference at(size_t i) const { return data[start + i]; }
+  /// @brief Return a reference to the element at given index
+  reference operator[](size_t i) { return at(i); }
+  /// @brief Return a const reference to the element at given index
+  const_reference operator[](size_t i) const { return at(i); }
+  /// @brief Return a reference to the first element in the range
+  reference front() { return data[start]; }
+  /// @brief Return a const reference to the first element in the range
+  const_reference front() const { return data[start]; }
+  /// @brief Return a reference to the last element in the range
+  reference back() { return data[start + length - 1]; }
+  /// @brief Return a const reference to the last element in the range
+  const_reference back() const { return data[start + length - 1]; }
+
+  /// @brief Convert to bool
+  /// @returns false if length is zero, true otherwise
+  operator bool() const { return length != 0; }
+
+ private:
+  std::vector<llvm::Value *> &data;
+  const size_t start;
+  const size_t length;
+};
+
+/// @brief Structure to hold the strategy-agnostic result of packetizing an
+/// instruction (i.e. can represent either a vectorized or an instantiated
+/// value) that enables the result to be converted on demand.
+struct PacketInfo {
+  /// @brief The number of instances created during packetization
+  unsigned numInstances = 0;
+
+  /// @brief Vectorized value. Each element in the vector represents a scalar
+  /// instance (SIMD lane).
+  llvm::Value *vector = nullptr;
+
+  /// @brief Map of vector widths to packet range start indices
+  llvm::SmallDenseMap<unsigned, unsigned, 2> packets;
+
+  /// @brief Default constructor
+  PacketInfo() = default;
+  /// @brief Deleted copy constructor
+  PacketInfo(const PacketInfo &) = delete;
+  /// @brief Move constructor
+  PacketInfo(PacketInfo &&) = default;
+  /// @brief Destructor
+  ~PacketInfo() = default;
+  /// @brief Deleted copy assignment operator
+  PacketInfo &operator=(const PacketInfo &) = delete;
+  /// @brief Move assignment operator
+  PacketInfo &operator=(PacketInfo &&) = default;
+
+  /// @brief Get the range of values in d for a given packet width
+  PacketRange getRange(std::vector<llvm::Value *> &d, unsigned width) const;
+
+  /// @brief Get the range for the original packet (width == numInstances).
+  PacketRange getRange(std::vector<llvm::Value *> &d) const {
+    return getRange(d, numInstances);
+  }
+};
+
+}  // namespace vecz
+
+#endif  // VECZ_TRANSFORM_PACKETIZATION_HELPERS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_pass.h
new file mode 100644
index 0000000000000..f817258a97ef9
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_pass.h
@@ -0,0 +1,77 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Function packetizer.
+
+#ifndef VECZ_TRANSFORM_PACKETIZATION_PASS_H_INCLUDED
+#define VECZ_TRANSFORM_PACKETIZATION_PASS_H_INCLUDED
+
+#include <llvm/IR/PassManager.h>
+
+namespace vecz {
+
+class VectorizationUnit;
+
+/// \addtogroup packetization Packetization Stage
+/// @{
+/// \ingroup vecz
+
+/// @brief Vectorization pass where scalar instructions that need it are
+/// packetized, starting from leaves.
+class PacketizationPass : public llvm::PassInfoMixin<PacketizationPass> {
+ public:
+  /// @brief Create a new packetization pass object.
+  PacketizationPass() = default;
+
+  /// @brief Create a new packetization pass object by moving another.
+  ///
+  /// @param[in] P Pass to move.
+  PacketizationPass(PacketizationPass &&P) = default;
+
+  /// @brief Deleted copy constructor.
+  PacketizationPass(const PacketizationPass &) = delete;
+
+  /// @brief Deleted move assignment operator.
+  ///
+  /// Also deletes the copy assignment operator.
+  PacketizationPass &operator=(PacketizationPass &&) = delete;
+
+  /// @brief Unique identifier for the pass (the address of PassID).
+  static void *ID() { return (void *)&PassID; }
+
+  /// @brief Packetize the given function, duplicating its behaviour (defined
+  /// values and side effects) for each lane of a SIMD packet.
+  ///
+  /// @param[in] F Function to packetize.
+  /// @param[in] AM FunctionAnalysisManager providing analyses.
+  ///
+  /// @return Preserved analyses.
+  llvm::PreservedAnalyses run(llvm::Function &F,
+                              llvm::FunctionAnalysisManager &AM);
+
+  /// @brief Pass name.
+  static llvm::StringRef name() { return "Function packetization"; }
+
+  /// @brief Storage whose address serves as the pass identifier.
+  static char PassID;
+};
+
+/// @}
+}  // namespace vecz
+
+#endif  // VECZ_TRANSFORM_PACKETIZATION_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetizer.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetizer.h
new file mode 100644
index 0000000000000..2c8b76306d3cd
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetizer.h
@@ -0,0 +1,234 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Function packetizer.
+
+#ifndef VECZ_TRANSFORM_PACKETIZER_H_INCLUDED
+#define VECZ_TRANSFORM_PACKETIZER_H_INCLUDED
+
+#include <llvm/ADT/DenseMap.h>
+#include <llvm/IR/Instructions.h>
+#include <llvm/IR/Module.h>
+#include <llvm/IR/PassManager.h>
+#include <llvm/Support/TypeSize.h>
+#include <multi_llvm/llvm_version.h>
+
+#include <memory>
+
+#include "ir_cleanup.h"
+#include "transform/packetization_helpers.h"
+
+namespace vecz {
+
+struct MemOp;
+class InstantiationPass;
+class PacketizationAnalysisResult;
+class StrideAnalysisResult;
+struct UniformValueResult;
+class VectorizationUnit;
+class VectorizationContext;
+class VectorizationChoices;
+
+/// \addtogroup packetization Packetization Stage
+/// @{
+/// \ingroup vecz
+
+/// @brief The implementation of the packetization process
+class Packetizer {
+ public:
+  class Result {
+    friend class Packetizer;
+
+   public:
+    Result() = delete;
+    Result(const Result &) = default;
+    constexpr Result(Result &&) = default;
+
+    Result(Packetizer &p) : packetizer(p), scalar(nullptr), info(nullptr) {}
+    Result(Packetizer &p, llvm::Value *s, PacketInfo *i)
+        : packetizer(p), scalar(s), info(i) {}
+
+    operator bool() const { return info; }
+
+    /// @brief Get a packetized/instantiated instruction as a vector value.
+    /// If the value was instantiated, this will construct and return a gather
+    /// of the SIMD lanes.
+    ///
+    /// @return Packetized value
+    llvm::Value *getAsValue() const;
+
+    /// @brief Get a packetized/instantiated instruction as a SIMD packet.
+    /// If the value was packetized, this will construct a new packet by
+    /// extracting the elements.
+    ///
+    /// @param[in] width the width of the packet to get.
+    ///
+    /// @return Instantiated packet
+    PacketRange getAsPacket(unsigned width) const;
+
+    /// @brief Get a copy of all the Values from the vector or packet, as
+    /// the width it was originally packetized to.
+    ///
+    /// @param[out] vals a vector of Values representing the result.
+    void getPacketValues(llvm::SmallVectorImpl<llvm::Value *> &vals) const;
+
+    /// @brief Get a copy of all the Values from the vector or packet.
+    /// When `width == 1` this will return a length-1 result containing the
+    /// vector valued result. Otherwise, it copies the values from the
+    /// packet of the requested width.
+    ///
+    /// @param[in] width the width of the packet to get.
+    /// @param[out] vals a vector of Values representing the result.
+    void getPacketValues(unsigned width,
+                         llvm::SmallVectorImpl<llvm::Value *> &vals) const;
+
+   private:
+    Packetizer &packetizer;
+    llvm::Value *const scalar;
+    PacketInfo *const info;
+
+    PacketRange createPacket(unsigned width) const;
+    PacketRange getRange(unsigned width) const;
+    PacketRange widen(unsigned width) const;
+    PacketRange narrow(unsigned width) const;
+    const Result &broadcast(unsigned width) const;
+  };
+
+  /// @brief Packetize the given function, duplicating its behaviour (defined
+  /// values and side effects) for each lane of a SIMD packet.
+  ///
+  /// @param[in] F Function to packetize.
+  /// @param[in] AM FunctionAnalysisManager providing analyses.
+  /// @param[in] Width the vectorization factor
+  /// @param[in] Dim the vectorization dimension
+  ///
+  /// @return true if the function was packetized, false otherwise.
+  static bool packetize(llvm::Function &F, llvm::FunctionAnalysisManager &AM,
+                        llvm::ElementCount Width, unsigned Dim);
+
+  /// @brief Packetize the given value from the function.
+  ///
+  /// @param[in] V Value to packetize.
+  ///
+  /// @return Packetized value.
+  Result packetize(llvm::Value *V);
+
+  /// @brief Return an already packetized value.
+  ///
+  /// @param[in] V Value to query.
+  ///
+  /// @return Packetized value or nullptr.
+  Result getPacketized(llvm::Value *V);
+
+  /// @brief Create a new SIMD packet to hold an instantiated value.
+  ///
+  /// @param[in] V the value the packet will represent
+  /// @param[in] width the SIMD width of the packet
+  ///
+  /// @returns a new packet
+  PacketRange createPacket(llvm::Value *V, unsigned width);
+
+  /// @brief Get the Uniform Value Result
+  ///
+  /// @return the Uniform Value Result
+  const UniformValueResult &uniform() const { return UVR; }
+
+  /// @brief get the vectorization factor.
+  llvm::ElementCount width() const { return SimdWidth; }
+
+  /// @brief get the vectorization dimension.
+  unsigned dimension() const { return Dimension; }
+
+  /// @brief get the function being packetized
+  llvm::Function &function() { return F; }
+
+  /// @brief get the Vectorization Context
+  VectorizationContext &context() { return Ctx; }
+
+  /// @brief get the Vectorization Choices
+  const VectorizationChoices &choices() const { return Choices; }
+
+  PacketRange getEmptyRange() { return PacketRange(packetData); }
+
+  /// @brief mark the instruction for deletion when packetization finishes
+  void deleteInstructionLater(llvm::Instruction *I) {
+    IC.deleteInstructionLater(I);
+  }
+
+ private:
+  Packetizer(llvm::Function &, llvm::FunctionAnalysisManager &AM,
+             llvm::ElementCount Width, unsigned Dim);
+  Packetizer() = delete;
+  Packetizer(const Packetizer &) = delete;
+  Packetizer(Packetizer &&) = delete;
+  ~Packetizer() = default;
+
+  llvm::FunctionAnalysisManager &AM;
+  VectorizationUnit &VU;
+  VectorizationContext &Ctx;
+  const VectorizationChoices &Choices;
+  UniformValueResult &UVR;
+  StrideAnalysisResult &SAR;
+  PacketizationAnalysisResult &PAR;
+  llvm::Function &F;
+  IRCleanup IC;
+
+  /// @brief Vectorization factor
+  llvm::ElementCount SimdWidth;
+
+  /// @brief Vectorization dimension
+  unsigned Dimension;
+
+  /// @brief Map onto packetized versions of scalar values
+  llvm::DenseMap<llvm::Value *, PacketInfo> packets;
+
+  /// @brief Central storage for all the packetized values
+  ///
+  /// This vector is a contiguous storage for all the wide packets created
+  /// during the packetization process. New packets get allocated to a
+  /// range at the end of the vector, and are referenced by index so that
+  /// they are not invalidated when the storage is re-allocated. Vector
+  /// elements will never be erased during packetization, and the data will
+  /// not be cleared until the packetizer itself is destroyed.
+  /*
+                 /^ ^\
+     "No take"  / 0 0 \
+                V\ Y /V  */
+  std::vector<llvm::Value *> packetData;
+  /*             |    \
+                 || (__V  "ONLY GROW"
+  */
+
+  /// @brief The value representing the current (dynamic) active vector length
+  /// for this kernel. This value is the *base* vector length for one scalar
+  /// work-item; vector operations must be scaled according to their vector
+  /// width.
+  /// If non-null, packetized operations are required to respect this active
+  /// length if they would produce side effects.
+  llvm::Value *VL = nullptr;
+
+  /// @brief This class contains the private implementation of the packetizer.
+  /// Declaring it as an inner class of the Packetizer class allows it access
+  /// to its private members (including its constructor).
+  class Impl;
+};
+
+/// @}
+}  // namespace vecz
+
+#endif  // VECZ_TRANSFORM_PACKETIZER_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/passes.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/passes.h
new file mode 100644
index 0000000000000..a2d115a939589
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/passes.h
@@ -0,0 +1,209 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Factory functions for some Vecz support passes
+
+#ifndef VECZ_TRANSFORM_PASSES_H_INCLUDED
+#define VECZ_TRANSFORM_PASSES_H_INCLUDED
+
+#include <llvm/IR/PassManager.h>
+#include <llvm/Transforms/Scalar/LoopPassManager.h>
+
+namespace compiler {
+namespace utils {
+class BuiltinInfo;
+}  // namespace utils
+}  // namespace compiler
+
+namespace vecz {
+class SimplifyInfiniteLoopPass
+    : public llvm::PassInfoMixin<SimplifyInfiniteLoopPass> {
+ public:
+  SimplifyInfiniteLoopPass() = default;
+
+  llvm::PreservedAnalyses run(llvm::Loop &L, llvm::LoopAnalysisManager &,
+                              llvm::LoopStandardAnalysisResults &,
+                              llvm::LPMUpdater &);
+};
+
+/// @brief This pass replaces calls to builtins that require special attention
+/// (e.g. there is no scalar or vector equivalent) with inline implementations.
+class BuiltinInliningPass : public llvm::PassInfoMixin<BuiltinInliningPass> {
+ public:
+  /// @brief Create a new pass object.
+  BuiltinInliningPass() = default;
+
+  /// @brief The entry point to the pass.
+  /// @param[in,out] M Module to optimize.
+  /// @param[in,out] AM ModuleAnalysisManager providing analyses.
+  /// @return Preserved analyses.
+  llvm::PreservedAnalyses run(llvm::Module &M, llvm::ModuleAnalysisManager &AM);
+
+  /// @brief Retrieve the pass's name.
+  /// @return pointer to text description.
+  static llvm::StringRef name() { return "OpenCL builtin inlining pass"; }
+
+ private:
+  /// @brief Process a call site, inlining it or marking it as needing inlining
+  /// if required.
+  ///
+  /// @param[in] CI Call site to inspect.
+  /// @param[out] NeedLLVMInline Whether the call site needs LLVM inlining.
+  ///
+  /// @return New return value for the call instruction.
+  llvm::Value *processCallSite(llvm::CallInst *CI, bool &NeedLLVMInline);
+};
+
+/// @brief This pass tries to remove unnecessary allocas that are not optimized
+/// away by LLVM's Mem2Reg pass, for example in the presence of bitcasts. It is
+/// however much simpler than LLVM's.
+class BasicMem2RegPass : public llvm::PassInfoMixin<BasicMem2RegPass> {
+ public:
+  BasicMem2RegPass() = default;
+
+  /// @brief The entry point to the pass.
+  /// @param[in,out] F Function to optimize.
+  /// @param[in,out] AM FunctionAnalysisManager providing analyses.
+  /// @return Preserved analyses.
+  llvm::PreservedAnalyses run(llvm::Function &F,
+                              llvm::FunctionAnalysisManager &AM);
+  /// @brief Retrieve the pass's name.
+  /// @return pointer to text description.
+  static llvm::StringRef name() { return "Basic Mem2Reg Pass"; }
+
+ private:
+  /// @brief Determine whether the alloca can be promoted or not.
+  ///
+  /// This is the case when it is inside the entry block, there is at most one
+  /// store to it and all other users are loads (possibly through bitcasts).
+  /// The store must also be in the entry block and precede all loads.
+  ///
+  /// @param[in] Alloca Alloca instruction to analyze.
+  /// @return true if the alloca can be promoted, false otherwise.
+  bool canPromoteAlloca(llvm::AllocaInst *Alloca) const;
+  /// @brief Try to promote the alloca, removing store users and replacing load
+  /// users by the stored values. The alloca itself isn't touched.
+  /// @param[in] Alloca Alloca instruction to promote.
+  /// @return true if the alloca was promoted, false otherwise.
+  bool promoteAlloca(llvm::AllocaInst *Alloca) const;
+};
+
+class PreLinearizePass : public llvm::PassInfoMixin<PreLinearizePass> {
+ public:
+  PreLinearizePass() = default;
+
+  llvm::PreservedAnalyses run(llvm::Function &F,
+                              llvm::FunctionAnalysisManager &AM);
+
+  static llvm::StringRef name() { return "Prepare for SPMD linearization"; }
+};
+
+/// @brief Wraps llvm's LoopRotatePass but restricts the range of loops on which
+/// it works.
+class VeczLoopRotatePass : public llvm::PassInfoMixin<VeczLoopRotatePass> {
+ public:
+  VeczLoopRotatePass() = default;
+
+  llvm::PreservedAnalyses run(llvm::Loop &L, llvm::LoopAnalysisManager &,
+                              llvm::LoopStandardAnalysisResults &,
+                              llvm::LPMUpdater &);
+
+  static llvm::StringRef name() { return "Vecz Loop Rotation Wrapper"; }
+};
+
+class RemoveIntPtrPass : public llvm::PassInfoMixin<RemoveIntPtrPass> {
+ public:
+  RemoveIntPtrPass() = default;
+
+  static llvm::StringRef name() { return "Remove IntPtr instructions"; }
+
+  llvm::PreservedAnalyses run(llvm::Function &F,
+                              llvm::FunctionAnalysisManager &);
+};
+
+class SquashSmallVectorsPass
+    : public llvm::PassInfoMixin<SquashSmallVectorsPass> {
+ public:
+  SquashSmallVectorsPass() = default;
+
+  static llvm::StringRef name() { return "Squash Small Vectors"; }
+
+  llvm::PreservedAnalyses run(llvm::Function &F,
+                              llvm::FunctionAnalysisManager &);
+};
+
+/// @brief Try to replace or remove masked memory operations that are trivially
+/// not needed or can be converted to non-masked operations.
+class SimplifyMaskedMemOpsPass
+    : public llvm::PassInfoMixin<SimplifyMaskedMemOpsPass> {
+ public:
+  /// @brief Create a new pass object.
+  SimplifyMaskedMemOpsPass() = default;
+
+  /// @brief Replace masked memory operations that use 'all true' masks by
+  /// regular memory operations, and remove masked operations that use 'all
+  /// false' masks.
+  ///
+  /// @param[in] F Function to optimize.
+  /// @param[in] AM FunctionAnalysisManager providing analyses.
+  ///
+  /// @return Preserved analyses.
+  llvm::PreservedAnalyses run(llvm::Function &F,
+                              llvm::FunctionAnalysisManager &AM);
+
+  /// @brief Pass name.
+  static llvm::StringRef name() { return "Simplify masked memory operations"; }
+};
+
+/// @brief reassociate uniform binary operators and split branches
+class UniformReassociationPass
+    : public llvm::PassInfoMixin<UniformReassociationPass> {
+ public:
+  UniformReassociationPass() = default;
+
+  static llvm::StringRef name() { return "Reassociate uniform binops"; }
+
+  llvm::PreservedAnalyses run(llvm::Function &,
+                              llvm::FunctionAnalysisManager &);
+};
+
+/// @brief Removes uniform divergence reductions created by CFG conversion
+class DivergenceCleanupPass
+    : public llvm::PassInfoMixin<DivergenceCleanupPass> {
+ public:
+  /// @brief Create a new pass object.
+  DivergenceCleanupPass() = default;
+
+  /// @brief Remove uniform divergence reductions.
+  ///
+  /// @param[in] F Function to optimize.
+  /// @param[in] AM FunctionAnalysisManager providing analyses.
+  ///
+  /// @return Preserved analyses.
+  llvm::PreservedAnalyses run(llvm::Function &F,
+                              llvm::FunctionAnalysisManager &AM);
+
+  /// @brief Pass name.
+  static llvm::StringRef name() {
+    return "Remove uniform divergence reductions";
+  }
+};
+
+}  // namespace vecz
+
+#endif  // VECZ_TRANSFORM_PASSES_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/printf_scalarizer.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/printf_scalarizer.h
new file mode 100644
index 0000000000000..af4cd7ed67b0b
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/printf_scalarizer.h
@@ -0,0 +1,117 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+
+#ifndef VECZ_TRANSFORM_PRINTF_SCALARIZER_H_INCLUDED
+#define VECZ_TRANSFORM_PRINTF_SCALARIZER_H_INCLUDED
+
+#include <string>
+
+namespace llvm {
+class Module;
+class User;
+class Instruction;
+template <typename T, unsigned N>
+class SmallVector;
+class GlobalVariable;
+class Value;
+class CallInst;
+}  // namespace llvm
+
+namespace vecz {
+
+/// @brief An enumeration of errors that can occur when processing a format
+/// string.
+enum EnumPrintfError {
+  kPrintfError_success,
+  kPrintfError_fail,
+  kPrintfError_invalidFormatString
+};
+
+/// @brief Retrieves a module-level global variable for a printf format string
+/// from a Value.
+/// @param[in] op The value that uses a global variable representing a printf
+/// format string.
+/// @return The module-level global variable for the printf format string.
+llvm::GlobalVariable *GetFormatStringAsValue(llvm::Value *op);
+
+/// @brief Extracts the raw string contents from a module-level global variable
+/// containing a printf format string.
+///
+/// The @p op parameter must be a GlobalVariable with an initializer.
+///
+/// @param[in] op The module-level global variable for a printf format string.
+/// @return The raw string contents of the format string global variable, or ""
+/// if there was an error.
+std::string GetFormatStringAsString(llvm::Value *op);
+
+/// @brief Creates a global variable for a scalarized format string.
+/// @param[in,out] module The parent module given to the pass.
+/// @param[in] string_value The GlobalVariable for the old format string,
+/// used to copy attributes over.
+/// @param[in]  new_format_string The scalarized format string to create a
+/// global variable from.
+/// @return The newly created global variable for the format string.
+llvm::GlobalVariable *GetNewFormatStringAsGlobalVar(
+    llvm::Module &module, llvm::GlobalVariable *const string_value,
+    const std::string &new_format_string);
+
+/// @brief This function transforms an OpenCL printf format string into a
+/// C99-conformant one.
+///
+/// Its main job is to scalarize vector format specifiers into scalarized form.
+/// It does this by taking a vector specifier and determining the specifier
+/// corresponding to each vector element. It then emits the element specifier
+/// into the new format string for each element in the vector, separated by a
+/// comma.
+///
+/// Special care needs to be taken for modifiers that aren't supported by C99
+/// such as the 'hl' length modifier. The new format string will have 'hl'
+/// stripped out.
+///
+/// Examples:
+/// @code{.cpp}
+/// // vector 2, 8-bit sized hexadecimal integers
+/// "%v2hhx"  --> "%hhx,%hhx"
+/// // vector 4, 32-bit sized floats
+/// "%v4hlf"  --> "%f,%f,%f,%f"
+/// @endcode
+///
+/// It also does some checking to ensure the printf string is conformant to the
+/// OpenCL 1.2 specification, and returns an error if it is not.
+/// @param[in] str The format string to scalarize and check.
+/// @param[out] new_str The new, scalarized, format string.
+/// @return The status of the scalarization (kPrintfError_success on success,
+/// otherwise kPrintfError_invalidFormatString if we detected an illegal OpenCL
+/// printf format string).
+EnumPrintfError ScalarizeAndCheckFormatString(const std::string &str,
+                                              std::string &new_str);
+
+/// @brief Builds a new scalarized printf call given an existing call and a new
+/// format string.
+///
+/// @param[in,out] module The parent module given to the pass.
+/// @param[in] old_inst The old call to the printf function.
+/// @param[in] new_format_string_gvar The module-level global variable for the
+/// new format string.
+/// @return A new call instruction to the new printf function.
+llvm::Instruction *BuildNewPrintfCall(
+    llvm::Module &module, llvm::CallInst *const old_inst,
+    llvm::GlobalVariable *const new_format_string_gvar);
+}  // namespace vecz
+
+#endif  // VECZ_TRANSFORM_PRINTF_SCALARIZER_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/scalarization_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/scalarization_pass.h
new file mode 100644
index 0000000000000..e0d1156d72c06
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/scalarization_pass.h
@@ -0,0 +1,68 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Function scalarizer.
+
+#ifndef VECZ_TRANSFORM_SCALARIZATION_PASS_H_INCLUDED
+#define VECZ_TRANSFORM_SCALARIZATION_PASS_H_INCLUDED
+
+#include <llvm/ADT/StringRef.h>
+#include <llvm/IR/PassManager.h>
+
+namespace llvm {
+class Function;
+}  // namespace llvm
+
+namespace vecz {
+
+class VectorizationUnit;
+
+/// \addtogroup scalarization Scalarization Stage
+/// @{
+/// \ingroup vecz
+
+/// @brief Scalarization pass where vector instructions that need it are
+/// scalarized, starting from leaves.
+class ScalarizationPass : public llvm::PassInfoMixin<ScalarizationPass> {
+ public:
+  /// @brief Create a new scalarization pass.
+  ScalarizationPass();
+
+  /// @brief Unique identifier for the pass.
+  static void *ID() { return (void *)&PassID; }
+
+  /// @brief Scalarize the given function.
+  ///
+  /// @param[in] F Function to scalarize.
+  /// @param[in] AM FunctionAnalysisManager providing analyses.
+  ///
+  /// @return Preserved analyses.
+  llvm::PreservedAnalyses run(llvm::Function &F,
+                              llvm::FunctionAnalysisManager &AM);
+
+  /// @brief Name of the pass.
+  static llvm::StringRef name() { return "Function scalarization"; }
+
+ private:
+  static char PassID;
+};
+
+/// @}
+}  // namespace vecz
+
+#endif  // VECZ_TRANSFORM_SCALARIZATION_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/scalarizer.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/scalarizer.h
new file mode 100644
index 0000000000000..224ab9a6cb439
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/scalarizer.h
@@ -0,0 +1,323 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Function scalarizer.
+
+#ifndef VECZ_TRANSFORM_SCALARIZER_H_INCLUDED
+#define VECZ_TRANSFORM_SCALARIZER_H_INCLUDED
+
+#include <llvm/ADT/DenseMap.h>
+#include <llvm/ADT/DenseSet.h>
+#include <multi_llvm/llvm_version.h>
+
+#include <vector>
+
+#include "debugging.h"
+#include "ir_cleanup.h"
+#include "simd_packet.h"
+
+namespace llvm {
+class Instruction;
+class LoadInst;
+class StoreInst;
+class CastInst;
+class BitCastInst;
+class BinaryOperator;
+class FreezeInst;
+class GetElementPtrInst;
+class UnaryOperator;
+class ICmpInst;
+class FCmpInst;
+class SelectInst;
+class CallInst;
+class ShuffleVectorInst;
+class InsertElementInst;
+class PHINode;
+class ExtractElementInst;
+class IntrinsicInst;
+}  // namespace llvm
+
+namespace vecz {
+
+class VectorizationChoices;
+class VectorizationContext;
+struct MemOp;
+struct PacketMask;
+struct SimdPacket;
+
+/// \addtogroup scalarization Scalarization Stage
+/// @{
+/// \ingroup vecz
+
+/// @brief Holds the result of scalarization analysis for a given function.
+class Scalarizer {
+ public:
+  /// @brief Create new scalarization results for the function.
+  ///
+  /// @param[in] F Function to scalarize.
+  /// @param[in] Ctx VectorizationContext for this Function.
+  /// @param[in] DoubleSupport True if double-precision floating point is
+  /// supported
+  Scalarizer(llvm::Function &F, VectorizationContext &Ctx, bool DoubleSupport);
+
+  /// @brief Mark the value as needing scalarization.
+  /// @param[in] V Value that needs scalarization.
+  void setNeedsScalarization(llvm::Value *V);
+
+  /// @brief Scalarize everything that has been marked for scalarization
+  bool scalarizeAll();
+
+  /// @brief A container type for instructions that failed to scalarize
+  using FailureSet = llvm::DenseSet<const llvm::Value *>;
+
+  /// @brief Get the list of instructions that failed to scalarize
+  const FailureSet &failures() const { return Failures; }
+
+ private:
+  /// @brief Vectorization context for the function to scalarize.
+  VectorizationContext &Ctx;
+  llvm::Function &F;
+  IRCleanup IC;
+  bool DoubleSupport;
+
+  /// @brief The values to scalarize, in order
+  std::vector<llvm::Value *> ToScalarize;
+
+  /// @brief The un-ordered set of values to scalarize for fast lookup
+  llvm::DenseSet<llvm::Value *> ScalarizeSet;
+
+  /// @brief Map of values to a gather of their scalarized elements
+  llvm::DenseMap<llvm::Value *, llvm::Value *> Gathers;
+
+  /// @brief Map of values onto packets of their scalarized elements
+  llvm::DenseMap<const llvm::Value *, std::unique_ptr<SimdPacket>> packets;
+
+  /// @brief The set of instructions that failed to scalarize
+  FailureSet Failures;
+
+  /// @brief Transform values that have non-vector types and vector operands
+  /// by scalarizing their operands.
+  ///
+  /// @param[in] I Instruction whose operands to scalarize.
+  ///
+  /// @return A different value than I if the operands were scalarized; null if
+  /// scalarization failed; or I if the value has no vector operand.
+  llvm::Value *scalarizeOperands(llvm::Instruction *I);
+
+  /// @brief Scalarize the given value from the function. Multiple calls to this
+  /// function with the same value should return a cached result.
+  ///
+  /// @param[in] V Value to scalarize.
+  /// @param[in] PM Mask indicating which lanes are required.
+  ///
+  /// @return Packet containing scalarized values or null.
+  SimdPacket *scalarize(llvm::Value *V, PacketMask PM);
+
+  /// @brief Get or create a packet for the given value.
+  ///
+  /// @param[in] V Value to retrieve a packet for.
+  /// @param[in] SimdWidth Number of lanes in the packet.
+  /// @param[in] Create true if a packet should be created if not present.
+  ///
+  /// @return SIMD packet for the given value.
+  SimdPacket *getPacket(const llvm::Value *V, unsigned SimdWidth,
+                        bool Create = true);
+
+  llvm::Value *getGather(llvm::Value *V);
+
+  /// @brief Perform post-scalarization tasks for the given value.
+  ///
+  /// @param[in] P Packet resulting from scalarization or null.
+  /// @param[in] V Value to scalarize.
+  ///
+  /// @return Packet containing scalarized values or null.
+  SimdPacket *assignScalar(SimdPacket *P, llvm::Value *V);
+  /// @brief Extract an element's values, for use by scalarized users
+  ///
+  /// @param[in] V Value to extract.
+  /// @param[in] PM Mask indicating which lanes are required.
+  ///
+  /// @return Packet containing scalarized values or null.
+  SimdPacket *extractLanes(llvm::Value *V, PacketMask PM);
+  /// @brief Scalarize a load instruction.
+  ///
+  /// @param[in] Load Instruction to scalarize.
+  /// @param[in] PM Mask indicating which lanes are required.
+  ///
+  /// @return Packet containing scalarized values or null.
+  SimdPacket *scalarizeLoad(llvm::LoadInst *Load, PacketMask PM);
+  /// @brief Scalarize a store instruction.
+  ///
+  /// @param[in] Store Instruction to scalarize.
+  /// @param[in] PM Mask indicating which lanes are required.
+  ///
+  /// @return Packet containing scalarized values or null.
+  SimdPacket *scalarizeStore(llvm::StoreInst *Store, PacketMask PM);
+  /// @brief Scalarize a cast instruction.
+  ///
+  /// @param[in] CastI Instruction to scalarize.
+  /// @param[in] PM Mask indicating which lanes are required.
+  ///
+  /// @return Packet containing scalarized values or null.
+  SimdPacket *scalarizeCast(llvm::CastInst *CastI, PacketMask PM);
+  /// @brief Scalarize a bitcast instruction.
+  ///
+  /// @param[in] BC Instruction to scalarize.
+  /// @param[in] PM Mask indicating which lanes are required.
+  ///
+  /// @return Packet containing scalarized values or null.
+  SimdPacket *scalarizeBitCast(llvm::BitCastInst *BC, PacketMask PM);
+  /// @brief Scalarize a binary operation instruction.
+  ///
+  /// @param[in] BinOp Instruction to scalarize.
+  /// @param[in] PM Mask indicating which lanes are required.
+  ///
+  /// @return Packet containing scalarized values or null.
+  SimdPacket *scalarizeBinaryOp(llvm::BinaryOperator *BinOp, PacketMask PM);
+// Freeze instruction is not available in LLVM versions prior 10.0
+// and not used in LLVM versions prior to 11.0
+  /// @brief Scalarize a freeze instruction.
+  ///
+  /// @param[in] FreezeI Instruction to scalarize.
+  /// @param[in] PM Mask indicating which lanes are required.
+  ///
+  /// @return Packet containing scalarized values or null.
+  SimdPacket *scalarizeFreeze(llvm::FreezeInst *FreezeI, PacketMask PM);
+  /// @brief Scalarize a unary operation instruction.
+  ///
+  /// @param[in] UnOp Instruction to scalarize.
+  /// @param[in] PM Mask indicating which lanes are required.
+  ///
+  /// @return Packet containing scalarized values or null.
+  SimdPacket *scalarizeUnaryOp(llvm::UnaryOperator *UnOp, PacketMask PM);
+  /// @brief Scalarize an integer compare instruction.
+  ///
+  /// @param[in] ICmp Instruction to scalarize.
+  /// @param[in] PM Mask indicating which lanes are required.
+  ///
+  /// @return Packet containing scalarized values or null.
+  SimdPacket *scalarizeICmp(llvm::ICmpInst *ICmp, PacketMask PM);
+  /// @brief Scalarize a floating-point compare instruction.
+  ///
+  /// @param[in] FCmp Instruction to scalarize.
+  /// @param[in] PM Mask indicating which lanes are required.
+  ///
+  /// @return Packet containing scalarized values or null.
+  SimdPacket *scalarizeFCmp(llvm::FCmpInst *FCmp, PacketMask PM);
+  /// @brief Scalarize a select instruction.
+  ///
+  /// @param[in] Select Instruction to scalarize.
+  /// @param[in] PM Mask indicating which lanes are required.
+  ///
+  /// @return Packet containing scalarized values or null.
+  SimdPacket *scalarizeSelect(llvm::SelectInst *Select, PacketMask PM);
+  /// @brief Scalarize a call instruction.
+  ///
+  /// @param[in] CI Instruction to scalarize.
+  /// @param[in] PM Mask indicating which lanes are required.
+  ///
+  /// @return Packet containing scalarized values or null.
+  SimdPacket *scalarizeCall(llvm::CallInst *CI, PacketMask PM);
+  /// @brief Scalarize a call instruction to a masked mem op.
+  ///
+  /// @param[in] CI Instruction to scalarize.
+  /// @param[in] PM Mask indicating which lanes are required.
+  /// @param[in] MaskedOp Masked memory operation to scalarize.
+  ///
+  /// @return Packet containing scalarized values or null.
+  SimdPacket *scalarizeMaskedMemOp(llvm::CallInst *CI, PacketMask PM,
+                                   MemOp &MaskedOp);
+  /// @brief Scalarize a shuffle vector instruction.
+  ///
+  /// @param[in] Shuffle Instruction to scalarize.
+  /// @param[in] PM Mask indicating which lanes are required.
+  ///
+  /// @return Packet containing scalarized values or null.
+  SimdPacket *scalarizeShuffleVector(llvm::ShuffleVectorInst *Shuffle,
+                                     PacketMask PM);
+  /// @brief Scalarize an insert element instruction.
+  ///
+  /// @param[in] Insert Instruction to scalarize.
+  /// @param[in] PM Mask indicating which lanes are required.
+  ///
+  /// @return Packet containing scalarized values or null.
+  SimdPacket *scalarizeInsertElement(llvm::InsertElementInst *Insert,
+                                     PacketMask PM);
+  /// @brief Scalarize GEPs with vector arguments
+  ///
+  /// @param[in] GEP The GEP to scalarize
+  /// @param[in] PM Mask indicating which lanes are required.
+  ///
+  /// @return The packet containing the scalarized values or null
+  SimdPacket *scalarizeGEP(llvm::GetElementPtrInst *GEP, PacketMask PM);
+  /// @brief Scalarize Phi nodes with vector arguments
+  ///
+  /// @param[in] Phi The Phi node to scalarize
+  /// @param[in] PM Mask indicating which lanes are required.
+  ///
+  /// @return The packet containing the scalarized values or null
+  SimdPacket *scalarizePHI(llvm::PHINode *Phi, PacketMask PM);
+  /// @brief Preserves debug information attached to old instruction
+  ///        we have just scalarized before it is removed.
+  ///
+  /// @param[in] Original Vector instruction which has been scalarized.
+  /// @param[in] Packet Packetized instruction after scalarization.
+  /// @param[in] Width SIMD width of packet.
+  void scalarizeDI(llvm::Instruction *Original, const SimdPacket *Packet,
+                   unsigned Width);
+
+  // These functions work on scalar values that use vector values.
+
+  /// @brief Scalarize the operands of an extract element instruction.
+  ///
+  /// @param[in] Extr Instruction to scalarize.
+  ///
+  /// @return A different value than Extr if the operands were scalarized; null
+  /// if scalarization failed; or Extr if the value has no vector operand.
+  llvm::Value *scalarizeOperandsExtractElement(llvm::ExtractElementInst *Extr);
+  /// @brief Scalarize the operands of a bitcast instruction.
+  ///
+  /// @param[in] BC Instruction to scalarize.
+  ///
+  /// @return A different value than BC if the operands were scalarized; null if
+  /// scalarization failed; or BC if the value has no vector operand.
+  llvm::Value *scalarizeOperandsBitCast(llvm::BitCastInst *BC);
+
+  /// @brief Scalarize the operands of a printf call.
+  ///
+  /// @param[in] CI Instruction to scalarize.
+  ///
+  /// @return A different value than CI if the operands were scalarized;
+  /// null if scalarization failed; or CI if the value has no vector
+  /// operand.
+  llvm::Value *scalarizeOperandsPrintf(llvm::CallInst *CI);
+
+  /// @brief Scalarize the operands of a reduce intrinsic call.
+  ///
+  /// @param[in] Intrin Instruction to scalarize.
+  ///
+  /// @return A different value than Intrin if the operands were scalarized;
+  /// null if scalarization failed; or Intrin if the value has no vector
+  /// operand.
+  llvm::Value *scalarizeReduceIntrinsic(llvm::IntrinsicInst *Intrin);
+};
+
+/// @}
+}  // namespace vecz
+
+#endif  // VECZ_TRANSFORM_SCALARIZER_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/ternary_transform_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/ternary_transform_pass.h
new file mode 100644
index 0000000000000..54baef8617a34
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/ternary_transform_pass.h
@@ -0,0 +1,49 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Transform the pattern generated by ternary operators to a
+/// vectorizable instruction set
+
+#ifndef VECZ_TRANSFORM_TERNARY_TRANSFORM_PASS_H_INCLUDED
+#define VECZ_TRANSFORM_TERNARY_TRANSFORM_PASS_H_INCLUDED
+
+#include <llvm/IR/PassManager.h>
+
+namespace vecz {
+
+/// @brief This pass tries to transform selects with pointer operands into
+/// individual GEPs followed by masked memory operations.
+class TernaryTransformPass : public llvm::PassInfoMixin<TernaryTransformPass> {
+ public:
+  TernaryTransformPass() = default;
+
+  /// @brief The entry point to the pass.
+  ///
+  /// @param[in] F Function to optimize.
+  /// @param[in] AM FunctionAnalysisManager providing analyses.
+  ///
+  /// @return The preserved analyses.
+  llvm::PreservedAnalyses run(llvm::Function &F,
+                              llvm::FunctionAnalysisManager &AM);
+
+  /// @brief Pass name.
+  static llvm::StringRef name() { return "Ternary transform pass"; }
+};
+}  // namespace vecz
+
+#endif  // VECZ_TRANSFORM_TERNARY_TRANSFORM_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_context.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_context.h
new file mode 100644
index 0000000000000..9fc3c35a21d09
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_context.h
@@ -0,0 +1,319 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file vectorization_context.h
+///
+/// @brief Hold global state and objects used for vectorization.
+
+#ifndef VECZ_VECTORIZATION_CONTEXT_H_INCLUDED
+#define VECZ_VECTORIZATION_CONTEXT_H_INCLUDED
+
+#include <llvm/ADT/DenseMap.h>
+#include <llvm/IR/ValueHandle.h>
+#include <llvm/Support/TypeSize.h>
+#include <multi_llvm/multi_llvm.h>
+
+#include <map>
+#include <memory>
+
+namespace llvm {
+class TargetTransformInfo;
+}  // namespace llvm
+
+namespace compiler {
+namespace utils {
+class BuiltinInfo;
+}  // namespace utils
+}  // namespace compiler
+
+namespace vecz {
+class MemOpDesc;
+class TargetInfo;
+struct UniformValueResult;
+class VectorizationChoices;
+struct VectorizationResult;
+class VectorizationUnit;
+
+using ActiveUnitMap = llvm::DenseMap<llvm::AssertingVH<const llvm::Function>,
+                                     VectorizationUnit *>;
+
+/// @brief Holds global (per-module) vectorization state.
+class VectorizationContext {
+ public:
+  /// @brief Create a new vectorization context object.
+  ///
+  /// @param[in] target Module in which vectorization happens.
+  /// @param[in] vti Target information.
+  /// @param[in] bi Builtins information
+  VectorizationContext(llvm::Module &target, TargetInfo &vti,
+                       compiler::utils::BuiltinInfo &bi);
+
+  // Accessors for the module, its data layout and the target information.
+
+  /// @brief Module in which vectorization happens.
+  llvm::Module &module() const { return Module; }
+
+  /// @brief Data layout for the target.
+  const llvm::DataLayout *dataLayout() const { return DL; }
+
+  /// @brief Information about the target.
+  TargetInfo &targetInfo() { return VTI; }
+
+  /// @brief Information about the target.
+  const TargetInfo &targetInfo() const { return VTI; }
+
+  llvm::TargetTransformInfo getTargetTransformInfo(llvm::Function &F) const;
+
+  /// @brief Construct and initialize the PassManager to be used for
+  /// vectorizing.
+  /// @return true if no problem occurred, false otherwise.
+  bool buildPassPipeline();
+  VectorizationUnit *getActiveVU(const llvm::Function *F) const;
+
+  /// @brief Log the Function's VectorizationUnit as the one governing the
+  /// current vectorization.
+  void setActiveVU(llvm::Function *F, VectorizationUnit *VU) {
+    ActiveVUs[F] = VU;
+  }
+  /// @brief Erase the Function's entry from the map of active
+  /// VectorizationUnits.
+  void clearActiveVU(llvm::Function *F) { ActiveVUs.erase(F); }
+
+  /// @brief Builtin database.
+  compiler::utils::BuiltinInfo &builtins();
+
+  /// @brief Builtin database.
+  const compiler::utils::BuiltinInfo &builtins() const;
+
+  /// @brief Determine whether the function is an internal builtin or not.
+  ///
+  /// @param[in] F Function to analyze.
+  ///
+  /// @return true if F is an internal builtin function, false otherwise.
+  static bool isInternalBuiltin(const llvm::Function *F);
+  /// @brief Create a new function with the given name and type, unless it
+  /// already exists in the module. Mark it as an internal builtin.
+  ///
+  /// @param[in] Name Name of the builtin function.
+  /// @param[in] FT Function type for the builtin.
+  ///
+  /// @return Internal builtin function with the given Name.
+  llvm::Function *getOrCreateInternalBuiltin(llvm::StringRef Name,
+                                             llvm::FunctionType *FT = nullptr);
+  /// @brief Define the internal builtin function, i.e. generate its body.
+  ///
+  /// @param[in] F Function declaration to emit a body for.
+  ///
+  /// @return true if the body of the builtin was emitted, false otherwise.
+  bool defineInternalBuiltin(llvm::Function *F);
+  /// @brief Given a scalar builtin function, return a vector equivalent if it
+  /// is an internal builtin.
+  ///
+  /// @param[in] ScalarFn Scalar builtin to map to a vector equivalent.
+  /// @param[in] SimdWidth SIMD width used to determine which vector equivalent
+  /// to select.
+  ///
+  /// @return Equivalent vector builtin function on success, or null.
+  llvm::Function *getInternalVectorEquivalent(llvm::Function *ScalarFn,
+                                              unsigned SimdWidth);
+
+  /// @brief Check if the given function is a masked version of another function
+  ///
+  /// @param[in] F The function to check
+  /// @return true if the function is a masked version, or false otherwise
+  bool isMaskedFunction(const llvm::Function *F) const;
+  /// @brief Get the original non-masked function from a masked function
+  ///
+  /// @param[in] F The masked function
+  /// @return Original non-masked function if it exists, or null
+  llvm::Function *getOriginalMaskedFunction(llvm::Function *F);
+  /// @brief Get (if it exists already) or create the masked version of a
+  /// function
+  ///
+  /// @param[in] CI Call to the function to be masked
+  /// @return The masked version of the function
+  llvm::Function *getOrCreateMaskedFunction(llvm::CallInst *CI);
+
+  /// @brief Create a VectorizationUnit to use to vectorize the given scalar
+  /// function.
+  ///
+  /// The lifetime of the returned VectorizationUnit is managed by the
+  /// VectorizationContext.
+  ///
+  /// @param[in] F Function to vectorize.
+  /// @param[in] VF Vectorization factor to use.
+  /// @param[in] Dimension SIMD dimension to use (0 => x, 1 => y, 2 => z).
+  /// @param[in] Ch Vectorization Choices for the vectorization.
+  VectorizationUnit *createVectorizationUnit(llvm::Function &F,
+                                             llvm::ElementCount VF,
+                                             unsigned Dimension,
+                                             const VectorizationChoices &Ch);
+
+  /// @brief Vectorizes all Vectorization Units in the context
+  void vectorize();
+
+  /// @brief Try to get a vectorization result for the scalar builtin function.
+  ///
+  /// @param[in] F Builtin function to create or retrieve a unit for.
+  /// @param[in] SimdWidth Vectorization factor to use.
+  ///
+  /// @return a VectorizationResult representing the vectorized function.
+  VectorizationResult &getOrCreateBuiltin(llvm::Function &F,
+                                          unsigned SimdWidth);
+
+  /// @brief Vectorize a builtin function by a given factor
+  ///
+  /// @param[in] F the function to vectorize.
+  /// @param[in] factor the vectorization factor.
+  ///
+  /// @return a VectorizationResult representing the vectorized function.
+  VectorizationResult getVectorizedFunction(llvm::Function &F,
+                                            llvm::ElementCount factor);
+
+  /// @brief Determine whether I is a vector instruction or not, i.e. it has any
+  /// vector operand.
+  ///
+  /// @param[in] I Instruction to analyze.
+  ///
+  /// @return true if I is a vector instruction.
+  static bool isVector(const llvm::Instruction &I);
+
+  static const char *InternalBuiltinPrefix;
+
+ private:
+  /// @brief Determine whether this scalar builtin function can be safely
+  /// expanded at vector call sites, i.e. it has no side effects.
+  ///
+  /// @param[in] ScalarFn Builtin function to analyze.
+  ///
+  /// @return true if the function can be expanded.
+  bool canExpandBuiltin(const llvm::Function *ScalarFn) const;
+
+  /// @brief Emit the body for the masked load or store internal builtins
+  ///
+  /// @param[in] F The empty (declaration only) function to emit the body in
+  /// @param[in] Desc The MemOpDesc for the memory operation
+  /// @returns true on success, false otherwise
+  bool emitMaskedMemOpBody(llvm::Function &F, MemOpDesc const &Desc) const;
+  /// @brief Emit the body for the interleaved load or store internal builtins
+  ///
+  /// @param[in] F The empty (declaration only) function to emit the body in
+  /// @param[in] Desc The MemOpDesc for the memory operation
+  /// @returns true on success, false otherwise
+  bool emitInterleavedMemOpBody(llvm::Function &F, MemOpDesc const &Desc) const;
+  /// @brief Emit the body for the masked interleaved load/store internal
+  /// builtins
+  ///
+  /// @param[in] F The empty (declaration only) function to emit the body in
+  /// @param[in] Desc The MemOpDesc for the memory operation
+  /// @returns true on success, false otherwise
+  bool emitMaskedInterleavedMemOpBody(llvm::Function &F,
+                                      MemOpDesc const &Desc) const;
+  /// @brief Emit the body for the scatter or gather internal builtins
+  ///
+  /// @param[in] F The empty (declaration only) function to emit the body in
+  /// @param[in] Desc The MemOpDesc for the memory operation
+  /// @returns true on success, false otherwise
+  bool emitScatterGatherMemOpBody(llvm::Function &F,
+                                  MemOpDesc const &Desc) const;
+  /// @brief Emit the body for the masked scatter or gather internal builtins
+  ///
+  /// @param[in] F The empty (declaration only) function to emit the body in
+  /// @param[in] Desc The MemOpDesc for the memory operation
+  /// @returns true on success, false otherwise
+  bool emitMaskedScatterGatherMemOpBody(llvm::Function &F,
+                                        MemOpDesc const &Desc) const;
+  /// @brief Add the masked function to the tracking set
+  ///
+  /// @param[in] F The function to add
+  /// @param[in] WrappedF The original function being masked
+  /// @return false if the function was already in the set, or true otherwise
+  bool insertMaskedFunction(llvm::Function *F, llvm::Function *WrappedF);
+
+  /// @brief Emit the body for the subgroup scan builtins
+  ///
+  /// @param[in] F The empty (declaration only) function to emit the body in
+  /// @param[in] IsInclusive whether the scan should be inclusive (on true) or
+  /// exclusive (on false).
+  /// @param[in] OpKind the kind of scan to emit. Note: not all values of
+  /// llvm::RecurKind are supported scan operations.
+  /// @param[in] IsVP whether the scan is vector-predicated.
+  /// @returns true on success, false otherwise
+  bool emitSubgroupScanBody(llvm::Function &F, bool IsInclusive,
+                            llvm::RecurKind OpKind, bool IsVP) const;
+
+  /// @brief Helper for non-vectorization tasks.
+  TargetInfo &VTI;
+  /// @brief Module in which the vectorization happens.
+  llvm::Module &Module;
+  /// @brief Builtins database.
+  compiler::utils::BuiltinInfo &BI;
+  /// @brief Data layout object used to determine the size and alignment of
+  /// types.
+  const llvm::DataLayout *DL;
+  /// @brief Persistent storage for Kernel Vectorization Units
+  std::vector<std::unique_ptr<VectorizationUnit>> KernelUnits;
+  /// @brief Mapping between functions in the module and vectorization results.
+  llvm::DenseMap<const llvm::Function *,
+                 llvm::SmallDenseMap<unsigned, VectorizationResult, 1>>
+      VectorizedBuiltins;
+  /// @brief Maps vector functions to their VectorizationUnits
+  ActiveUnitMap ActiveVUs;
+  /// @brief Map of masked functions used in the module to their original
+  /// non-masked function.
+  llvm::ValueToValueMapTy MaskedFunctionsMap;
+  /// @brief All the masked versions of functions generated by Vecz
+  ///
+  /// Keeps track of all the functions we already have masked versions of. We
+  /// use the name of the masked function instead of just the Function pointer
+  /// because vararg functions have different masked versions for different
+  /// argument types.
+  std::map<std::string, llvm::Function *> MaskedVersions;
+};
+
+/// \addtogroup passes Passes
+/// @{
+/// \ingroup vecz
+
+/// @brief Implement internal builtins.
+class DefineInternalBuiltinsPass
+    : public llvm::PassInfoMixin<DefineInternalBuiltinsPass> {
+ public:
+  /// @brief Create a new pass object.
+  DefineInternalBuiltinsPass() {}
+
+  static void *ID() { return (void *)&PassID; }  ///< Address of PassID.
+
+  /// @brief Define all used internal builtins in the module, expanding bodies
+  /// for declaration-only references.
+  ///
+  /// @param[in] M Module in which to define internal builtins.
+  /// @param[in] AM ModuleAnalysisManager providing analyses.
+  ///
+  /// @return Set of preserved analyses (all analyses).
+  llvm::PreservedAnalyses run(llvm::Module &M, llvm::ModuleAnalysisManager &AM);
+
+  static llvm::StringRef name() { return "Define internal builtins"; }
+
+ private:
+  /// @brief Identifier for the DefineInternalBuiltins pass.
+  static char PassID;
+};
+
+/// @}
+}  // namespace vecz
+
+#endif  // VECZ_VECTORIZATION_CONTEXT_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_helpers.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_helpers.h
new file mode 100644
index 0000000000000..adba458a067eb
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_helpers.h
@@ -0,0 +1,68 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef VECZ_VECTORIZATION_HELPERS_H_INCLUDED
+#define VECZ_VECTORIZATION_HELPERS_H_INCLUDED
+
+#include <llvm/Support/TypeSize.h>
+
+#include <string>
+
+namespace llvm {
+class Function;
+class StringRef;
+}  // namespace llvm
+
+namespace vecz {
+class VectorizationUnit;
+class VectorizationChoices;
+
+/// @brief Generate a name for the vectorized function, which depends on the
+/// original function name and SIMD width.
+///
+/// @param[in] ScalarName Name of the original function.
+/// @param[in] VF vectorization factor of the vectorized function.
+/// @param[in] Choices choices used for vectorization
+///
+/// @return Name for the vectorized function.
+std::string getVectorizedFunctionName(llvm::StringRef ScalarName,
+                                      llvm::ElementCount VF,
+                                      VectorizationChoices Choices);
+
+/// @brief Clone the scalar function's body into the function to vectorize,
+/// vectorizing function argument types where required.
+///
+/// @param[in] VU the Vectorization Unit of the scalar function to clone.
+///
+/// @return The cloned function.
+llvm::Function *cloneFunctionToVector(VectorizationUnit const &VU);
+
+/// @brief Create a copy of the scalar function's debug info metadata
+///        nodes and set the scope of the copied DI to the vectorized
+///        function.
+void cloneDebugInfo(VectorizationUnit const &VU);
+
+/// @brief Clone OpenCL related metadata from the scalar kernel to the
+/// vectorized one.
+///
+/// This function will copy any 'opencl.kernels' or
+/// 'opencl.kernel_wg_size_info' metadata from the scalar kernel to the
+/// vectorized one. Obviously, the kernel itself has to be cloned before
+/// calling this function.
+void cloneOpenCLMetadata(VectorizationUnit const &VU);
+}  // namespace vecz
+
+#endif  // VECZ_VECTORIZATION_HELPERS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_heuristics.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_heuristics.h
new file mode 100644
index 0000000000000..f79a058b16acc
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_heuristics.h
@@ -0,0 +1,43 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef VECZ_VECTORIZATION_HEURISTICS_H_INCLUDED
+#define VECZ_VECTORIZATION_HEURISTICS_H_INCLUDED
+
+#include <llvm/Support/TypeSize.h>
+
+namespace llvm {
+class Function;
+}  // namespace llvm
+
+namespace vecz {
+class VectorizationContext;
+
+/// @brief Decide whether a function is worth vectorizing for a given
+/// vectorization factor.
+///
+/// @param[in] F the function to analyze
+/// @param[in] Ctx the vectorization context
+/// @param[in] VF the vectorization factor
+/// @param[in] SimdDimIdx the vectorization dimension
+///
+/// @return Whether we should vectorize the function or not.
+bool shouldVectorize(llvm::Function &F, VectorizationContext &Ctx,
+                     llvm::ElementCount VF, unsigned SimdDimIdx);
+
+}  // namespace vecz
+
+#endif  // VECZ_VECTORIZATION_HEURISTICS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_unit.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_unit.h
new file mode 100644
index 0000000000000..812fc1de48966
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_unit.h
@@ -0,0 +1,259 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef VECZ_VECTORIZATION_UNIT_H_INCLUDED
+#define VECZ_VECTORIZATION_UNIT_H_INCLUDED
+
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/ADT/StringRef.h>
+#include <llvm/Analysis/TargetTransformInfo.h>
+#include <llvm/Support/TypeSize.h>
+#include <llvm/Transforms/Utils/ValueMapper.h>
+
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace llvm {
+class Function;
+class FunctionType;
+class Module;
+class Instruction;
+class Argument;
+}  // namespace llvm
+
+namespace vecz {
+namespace internal {
+struct VeczFailResult;
+struct AnalysisFailResult;
+}  // namespace internal
+
+struct SimdPacket;
+struct UniformValueResult;
+class ValueTagMap;
+class VectorizationContext;
+class VectorizationChoices;
+
+template <typename T>
+class AnalysisWrapper;
+
+/// @brief Describe an argument of a function that needs to be vectorized.
+struct VectorizerTargetArgument {
+  /// @brief Argument of the scalar function.
+  llvm::Argument *OldArg;
+  /// @brief Argument of the vectorized function. Might be scalar or vector.
+  llvm::Argument *NewArg;
+  /// @brief Whether the argument needs to be vectorized or not.
+  bool IsVectorized;
+  /// @brief If the argument is a 'byref' pointer used to return a value, this
+  /// is the type of that value. Else it is null.
+  llvm::Type *PointerRetPointeeTy;
+  /// @brief Placeholder instruction for arguments needing vectorization.
+  llvm::Instruction *Placeholder;
+};
+
+/// @brief Analysis flags that can be attached to LLVM functions.
+enum FunctionFlags {
+  eFunctionNoFlag = 0,  ///< No flag set.
+  /// @brief The function has been analyzed.
+  /// Set by the preliminary vectorization analysis (canVectorize). Set once.
+  eFunctionAnalysisDone = (1 << 0),
+  /// @brief The function can be vectorized.
+  /// Set by the preliminary vectorization analysis (canVectorize). Set once.
+  eFunctionVectorizable = (1 << 1),
+  /// @brief Vectorization of the function failed.
+  /// Can be set by any pass. Set once.
+  eFunctionVectorizationFailed = (1 << 2),
+};
+
+/// @brief struct to hold only the data needed to use a vectorized function
+struct VectorizationResult {
+  struct Arg {
+    enum Kind { SCALAR, VECTORIZED, POINTER_RETURN } kind;  ///< Argument kind.
+    llvm::Type *type;  ///< Argument type.
+    llvm::Type *pointerRetPointeeTy = nullptr;  ///< Defaults to null.
+    constexpr Arg(Kind k, llvm::Type *ty, llvm::Type *ptrRetTy)
+        : kind(k), type(ty), pointerRetPointeeTy(ptrRetTy) {}
+  };
+
+  llvm::Function *func = nullptr;  ///< Vectorized function; null by default.
+  llvm::SmallVector<Arg, 2> args;  ///< Argument descriptors.
+
+  operator bool() const { return func; }  ///< True if func is non-null.
+  llvm::Function *get() const { return func; }  ///< Returns func.
+};
+
+/// @brief Describe a function that needs to be vectorized.
+class VectorizationUnit {
+ public:
+  /// @brief Create a new vectorization unit for the given scalar function.
+  ///
+  /// @param[in] F Function to vectorize.
+  /// @param[in] Width SIMD width (i.e. vectorization factor) to use.
+  /// @param[in] Dimension SIMD dimension to use (0 => x, 1 => y, 2 => z).
+  /// @param[in] Ctx Context for vectorization.
+  /// @param[in] Ch Vectorization Choices for the vectorization.
+  VectorizationUnit(llvm::Function &F, llvm::ElementCount Width,
+                    unsigned Dimension, VectorizationContext &Ctx,
+                    const VectorizationChoices &Ch);
+  /// @brief Free up any resource used by the function.
+  ~VectorizationUnit();
+
+  /// @brief Access the vectorization context linked to this function.
+  VectorizationContext &context() { return Ctx; }
+
+  /// @brief Access the vectorization context linked to this function.
+  const VectorizationContext &context() const { return Ctx; }
+
+  /// @brief Number of available SIMD lanes, i.e. vectorization factor.
+  llvm::ElementCount width() const { return SimdWidth; }
+
+  /// @brief Get the work group size along the vectorization dimension.
+  uint64_t getLocalSize() const { return LocalSize; }
+
+  /// @brief Whether to run the SIMD Width Analysis during vectorization.
+  bool autoWidth() const { return AutoSimdWidth; }
+
+  /// @brief Index of SIMD dimension used in vectorization.
+  unsigned dimension() const { return SimdDimIdx; }
+
+  /// @brief Set the SIMD width, i.e. vectorization factor. After changing this
+  /// value a possible existing vectorized function is looked up in the module.
+  ///
+  /// @param[in] NewWidth New SIMD width.
+  void setWidth(llvm::ElementCount NewWidth);
+
+  /// @brief Set the work group size along the vectorization dimension.
+  ///
+  /// @param[in] LS the local work group size
+  void setLocalSize(uint64_t LS) { LocalSize = LS; }
+
+  /// @brief Set whether to use the SIMD width analysis
+  ///
+  /// @param[in] Auto true to use auto SIMD width, false otherwise
+  void setAutoWidth(bool Auto) { AutoSimdWidth = Auto; }
+
+  /// @brief Determine whether vectorizing the function failed or not.
+  bool failed() const { return hasFlag(eFunctionVectorizationFailed); }
+
+  /// @brief Mark this function as failing vectorization.
+  /// @param[in] Remark Message to print into the optimization remarks
+  /// @param[in] F Function to pass to emitVeczRemarkMissed
+  /// @param[in] V Value to pass to emitVeczRemarkMissed
+  /// @return unconditionally returns an AnalysisFailResult which can be safely
+  /// ignored. This can help cut down on some boilerplate in contexts where
+  /// we'll immediately return, via the following idiom:
+  /// ```
+  ///   if (!thing) {
+  ///     return setFailed("thing wasn't");
+  ///   }
+  /// ```
+  internal::AnalysisFailResult setFailed(const char *Remark,
+                                         const llvm::Function *F = nullptr,
+                                         const llvm::Value *V = nullptr);
+
+  /// @brief Check whether the function has the given flag or not.
+  ///
+  /// @param[in] Flag Flag to check.
+  ///
+  /// @return true if the function has the given flag, false otherwise.
+  bool hasFlag(FunctionFlags Flag) const { return (FnFlags & Flag) == Flag; }
+
+  /// @brief Add the given flag to the flags of the function.
+  ///
+  /// @param[in] Flag Flag to set.
+  void setFlag(FunctionFlags Flag) {
+    FnFlags = static_cast<FunctionFlags>(FnFlags | Flag);
+  }
+
+  /// @brief Remove the given flag from the flags of the function.
+  ///
+  /// @param[in] Flag Flag to clear.
+  void clearFlag(FunctionFlags Flag) {
+    FnFlags = static_cast<FunctionFlags>(FnFlags & ~Flag);
+  }
+
+  /// @brief Access the arguments of the function to vectorize.
+  const llvm::SmallVectorImpl<VectorizerTargetArgument> &arguments() const {
+    return Arguments;
+  }
+
+  /// @brief Return the vectorized function if it exists, otherwise the original
+  /// function.
+  llvm::Function &function();
+
+  /// @brief Return the vectorized function if it exists, otherwise the original
+  /// function.
+  const llvm::Function &function() const;
+
+  /// @brief Original function to vectorize.
+  llvm::Function *scalarFunction() const { return ScalarFn; }
+
+  /// @brief Set the function to vectorize. This updates the function arguments.
+  ///
+  /// @param[in] NewFunction Original function.
+  void setScalarFunction(llvm::Function *NewFunction);
+
+  /// @brief Vectorized function.
+  llvm::Function *vectorizedFunction() const { return VectorizedFn; }
+
+  /// @brief Set the vectorized function. This updates the function arguments.
+  ///
+  /// @param[in] NewFunction Vectorized function.
+  void setVectorizedFunction(llvm::Function *NewFunction);
+
+  /// @brief Name of the current function.
+  llvm::StringRef getName() const { return function().getName(); }
+
+  /// @brief Get the result of the vectorization
+  /// @return The VectorizationResult representing the vectorized function
+  VectorizationResult getResult() const;
+
+  /// @brief Access the Vecz code generation choices held by this object.
+  /// @return Const reference to the stored VectorizationChoices.
+  const VectorizationChoices &choices() const { return Choices; }
+
+ private:
+  /// @brief Context this function is vectorized in.
+  VectorizationContext &Ctx;
+  /// @brief Which Vecz code generation choices are enabled and which not
+  const VectorizationChoices &Choices;
+  /// @brief Function to vectorize.
+  llvm::Function *ScalarFn;
+  /// @brief Target (vectorized) function.
+  llvm::Function *VectorizedFn;
+  /// @brief Arguments of the function to vectorize.
+  llvm::SmallVector<VectorizerTargetArgument, 4> Arguments;
+  /// @brief Vectorization factor to use.
+  llvm::ElementCount SimdWidth;
+  /// @brief The work group size along the vectorization dimension, if known,
+  /// zero otherwise. For our purposes, this only need be an upper bound.
+  uint64_t LocalSize;
+  /// @brief Use the SIMD Width Analysis to determine the SIMD width
+  bool AutoSimdWidth;
+  /// @brief SimdDimIdx Index of vectorization dimension to use.
+  unsigned SimdDimIdx;
+  /// @brief Name of the builtin function, if the function to vectorize is one.
+  std::string BuiltinName;
+  /// @brief Per-function analysis flags.
+  FunctionFlags FnFlags;
+  /// @brief Placeholder instructions for arguments that will be vectorized.
+  llvm::SmallPtrSet<const llvm::Instruction *, 4> ArgumentPlaceholders;
+};
+
+}  // namespace vecz
+
+#endif  // VECZ_VECTORIZATION_UNIT_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorizer.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorizer.h
new file mode 100644
index 0000000000000..d3d42aecbd066
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorizer.h
@@ -0,0 +1,74 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file vectorizer.h
+///
+/// @brief Entry point for the kernel vectorizer.
+
+#ifndef VECZ_VECTORIZER_H_INCLUDED
+#define VECZ_VECTORIZER_H_INCLUDED
+
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/IR/PassManager.h>
+
+namespace llvm {
+class Function;
+}  // namespace llvm
+
+namespace vecz {
+
+/// @brief The maximum number of vectorization dimensions that Vecz can handle.
+///
+/// The current limitation is due to the assumption that work groups are
+/// being represented as 1-, 2- or 3-dimensional arrays of work items.
+const unsigned MAX_SIMD_DIM = 3;
+
+class VectorizationContext;
+class VectorizationUnit;
+struct VeczPassOptions;
+
+/// @brief Try to create a vectorization unit for the given kernel function,
+///        with the given vectorization factor and vectorization options.
+///
+/// @param[in] Ctx VectorizationContext used to perform the vectorization.
+/// @param[in] Kernel kernel function to vectorize.
+/// @param[in] Opts Vecz Pass Options struct for this vectorization.
+/// @param[in] FAM Function Analysis Manager for running analyses
+/// @param[in] Check check for vectorizability before creating the VU
+///
+/// @return Pointer to a vectorization unit on success, or nullptr on failure.
+VectorizationUnit *createVectorizationUnit(VectorizationContext &Ctx,
+                                           llvm::Function *Kernel,
+                                           const VeczPassOptions &Opts,
+                                           llvm::FunctionAnalysisManager &FAM,
+                                           bool Check);
+
+/// @brief Create metadata for the vectorization unit relating the vectorized
+///        function to the scalar function.
+///
+/// @param[in] VU the vectorization unit to create metadata for
+/// @returns true iff vectorization succeeded.
+bool createVectorizedFunctionMetadata(VectorizationUnit &VU);
+
+/// @brief Register failure, success, and update statistics for the given
+/// VectorizationUnit.
+///
+/// @param[in] VU the vectorization unit whose success or failure is
+/// registered.
+void trackVeczSuccessFailure(VectorizationUnit &VU);
+}  // namespace vecz
+
+#endif  // VECZ_VECTORIZER_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vecz_pass_builder.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vecz_pass_builder.h
new file mode 100644
index 0000000000000..9dc3d3559cdcb
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vecz_pass_builder.h
@@ -0,0 +1,68 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file vecz_pass_builder.h
+///
+/// @brief class to initialize a Module Pass Manager to perform vectorization.
+
+#ifndef VECZ_VECZ_PASS_BUILDER_H_INCLUDED
+#define VECZ_VECZ_PASS_BUILDER_H_INCLUDED
+
+#include <compiler/utils/pass_machinery.h>
+#include <llvm/IR/PassManager.h>
+
+namespace llvm {
+class Module;
+class TargetTransformInfo;
+class TargetMachine;
+}  // namespace llvm
+
+namespace vecz {
+class VectorizationContext;
+
+/// @brief A class that manages the lifetime and initialization of all
+/// components required to set up an LLVM pass manager to run Vecz passes.
+class VeczPassMachinery final : public compiler::utils::PassMachinery {
+ public:
+  /// @brief Construct the pass machinery.
+  /// The base class method `initialize(TargetInfo)` must also be called.
+  /// @param[in] llvmCtx the LLVM context to be used.
+  /// @param[in] TM TargetMachine to be used for passes. May be nullptr
+  /// @param[in] ctx the vectorization context object for the module.
+  /// @param[in] verifyEach true if each pass should be verified
+  /// @param[in] debugLogLevel debug logging verbosity.
+  VeczPassMachinery(llvm::LLVMContext &llvmCtx, llvm::TargetMachine *TM,
+                    VectorizationContext &ctx, bool verifyEach,
+                    compiler::utils::DebugLogging debugLogLevel =
+                        compiler::utils::DebugLogging::None);
+
+  virtual void registerPasses() override;
+
+ private:
+  virtual void addClassToPassNames() override;
+  virtual void registerPassCallbacks() override;
+
+  VectorizationContext &Ctx;
+};
+
+/// @brief Add the full Vecz pass pipeline to the given pass manager.
+///
+/// @param[in] PM The Module Pass Manager to build.
+/// @return true on success.
+bool buildPassPipeline(llvm::ModulePassManager &PM);
+}  // namespace vecz
+
+#endif  // VECZ_VECZ_PASS_BUILDER_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/ir_cleanup.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/ir_cleanup.cpp
new file mode 100644
index 0000000000000..3519c5b506897
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/ir_cleanup.cpp
@@ -0,0 +1,192 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "ir_cleanup.h"
+
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/IR/DebugInfo.h>
+#include <llvm/IR/Instructions.h>
+#include <llvm/IR/IntrinsicInst.h>
+#include <llvm/Support/Debug.h>
+#include <llvm/Transforms/Utils/Local.h>
+
+#include "memory_operations.h"
+
+#define DEBUG_TYPE "vecz"
+
+using namespace llvm;
+using namespace vecz;
+
+namespace {
+
+/// @brief Determine whether all instruction users of I are dead. A user is
+/// dead if it is trivially dead or already on a deletion list, and its own
+/// users are (recursively) dead too; visited users are not checked again.
+///
+/// @param[in] I Instruction to check for deletion.
+/// @param[in] DeadList Instructions marked for deletion.
+/// @param[in,out] WorkList Newly detected Instructions marked for deletion.
+/// @param[in,out] Visited Instructions visited for deletion.
+///
+/// @return true if all users of the instructions are dead, false otherwise.
+bool AreUsersDead(Instruction *I,
+                  const SmallPtrSetImpl<Instruction *> &DeadList,
+                  SmallPtrSetImpl<Instruction *> &WorkList,
+                  SmallPtrSetImpl<Instruction *> &Visited) {
+  for (User *CurUser : I->users()) {
+    // Users that are not instructions are ignored.
+    auto *const UInst = dyn_cast<Instruction>(CurUser);
+    if (!UInst) {
+      continue;
+    }
+
+    // Trivially dead users would be removed by DCE later on anyway. Marking
+    // them for deletion here lets us be more aggressive about what we delete.
+    if (isInstructionTriviallyDead(UInst)) {
+      WorkList.insert(UInst);
+    }
+
+    // A user that is on neither deletion list is live and holds on to I.
+    const bool MarkedDead = DeadList.count(UInst) || WorkList.count(UInst);
+    if (!MarkedDead) {
+      return false;
+    }
+    // Users without users, or visited before, need no recursion.
+    if (UInst->user_empty() || !Visited.insert(UInst).second) {
+      continue;
+    }
+    if (!AreUsersDead(UInst, DeadList, WorkList, Visited)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+/// @brief Mark the no longer valid debug intrinsics of DbgUsers for deletion.
+/// A debug intrinsic is kept only if its variable location is a value other
+/// than undef, or an MDNode that has operands; anything else is queued.
+///
+/// @param[in] DbgUsers Debug Intrinsic Instructions.
+/// @param[in,out] WorkList Newly detected Instructions marked for deletion.
+///
+/// @return void
+void DeleteDebugInfoInstructions(
+    const SmallVectorImpl<DbgVariableIntrinsic *> &DbgUsers,
+    SmallPtrSetImpl<Instruction *> &WorkList) {
+  // Tells whether the variable location of a debug intrinsic is still valid.
+  auto HasValidLocation = [](DbgVariableIntrinsic *DII) {
+    // The first operand must be a non-null variable location argument.
+    Value *const Location = DII->getOperand(0);
+    if (!Location) {
+      return false;
+    }
+    auto *const MD = cast<MetadataAsValue>(Location)->getMetadata();
+    // A location wrapping a value is valid unless that value is undef.
+    if (auto *const Wrapped = dyn_cast<ValueAsMetadata>(MD)) {
+      Value *const Var = Wrapped->getValue();
+      if (Var && !isa<UndefValue>(Var)) {
+        return true;
+      }
+    }
+    // Otherwise the location is only valid as a non-empty MDNode.
+    auto *const Node = dyn_cast<MDNode>(MD);
+    return Node && Node->getNumOperands() > 0;
+  };
+
+  for (DbgVariableIntrinsic *DII : DbgUsers) {
+    if (!HasValidLocation(DII)) {
+      WorkList.insert(DII);
+    }
+  }
+}
+
+}  // namespace
+
+void IRCleanup::deleteInstructionLater(llvm::Instruction *I) {
+  const bool NewlyMarked = InstructionsToDelete.insert(I).second;
+  if (!NewlyMarked) return;
+  LLVM_DEBUG(dbgs() << "Marking for deletion: " << *I << "\n");
+}
+
+void IRCleanup::deleteInstructions() {
+  SmallPtrSet<Instruction *, 16> WorkList;
+  SmallPtrSet<Instruction *, 16> VisitedForCycles;
+  SmallVector<DbgVariableIntrinsic *, 1> DbgUsers;
+  // Sweep the list until it is empty or a whole sweep erases nothing.
+  bool progress = true;
+  while (progress && !InstructionsToDelete.empty()) {
+    progress = false;
+    for (Instruction *I : InstructionsToDelete) {
+      WorkList.erase(I);
+      if (I->use_empty()) {
+        // Before we delete the current instruction we save its debug users, to
+        // check for potential loss of debug information after the removal of I.
+        findDbgUsers(DbgUsers, I);
+        I->eraseFromParent();
+        // With I gone, the value of one or more source variables may have
+        // become unavailable: debug intrinsics left without a valid location
+        // are marked for deletion too.
+        DeleteDebugInfoInstructions(DbgUsers, WorkList);
+        DbgUsers.clear();
+        progress = true;
+      } else if (PHINode *Phi = dyn_cast<PHINode>(I)) {
+        if (AreUsersDead(Phi, InstructionsToDelete, WorkList,
+                         VisitedForCycles)) {
+          Phi->replaceAllUsesWith(UndefValue::get(Phi->getType()));
+          Phi->eraseFromParent();
+          progress = true;
+        } else {
+          WorkList.insert(Phi);
+        }
+        VisitedForCycles.clear();
+      } else if (CallInst *CI = dyn_cast<CallInst>(I)) {
+        // MemOps make deleting unnecessary instructions harder, because they
+        // cannot be trivially dead instructions, thus breaking our recursive
+        // deletion. However, if we have packetized a load or a store, we
+        // definitely want to remove the scalar one, as it will be
+        // reading/writing to invalid pointers. So detect internal builtins that
+        // perform loads and erase them here. Stores have no users, so they are
+        // already handled by the use_empty() case above.
+        auto Op = MemOp::get(CI);
+        if (Op && Op->isLoad()) {
+          // Loads are replaced by undef so that their users, which will be
+          // removed later on, still have an operand. Erasing the load counts
+          // as progress: its own operands may have become unused.
+          I->replaceAllUsesWith(UndefValue::get(Op->getDataType()));
+          I->eraseFromParent();
+          progress = true;
+        } else {
+          WorkList.insert(I);
+        }
+      } else {
+        WorkList.insert(I);
+      }
+    }
+    InstructionsToDelete = std::move(WorkList);
+    WorkList.clear();
+  }
+
+  // Remove remaining instructions from the list.
+  LLVM_DEBUG(for (Instruction *I : InstructionsToDelete) {
+    dbgs() << "vecz: could not delete " << *I << "\n";
+  });
+  InstructionsToDelete.clear();
+}
+
+void IRCleanup::deleteInstructionNow(Instruction *I) {
+  I->replaceAllUsesWith(UndefValue::get(I->getType()));
+  I->eraseFromParent();
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/llvm_helpers.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/llvm_helpers.cpp
new file mode 100644
index 0000000000000..57013bebc77ba
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/llvm_helpers.cpp
@@ -0,0 +1,73 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "llvm_helpers.h"
+
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/IR/Module.h>
+#include <multi_llvm/multi_llvm.h>
+
+#include "debugging.h"
+#include "memory_operations.h"
+
+using namespace llvm;
+
+/// @brief Get the fixed vector type of a value. For a store, or a call to a
+/// masked store builtin, the type of the stored data is inspected instead.
+///
+/// @param[in] V Value to analyze.
+/// @return The fixed vector type, or null if the inspected type is not one.
+FixedVectorType *vecz::getVectorType(Value *V) {
+  Type *Ty = V->getType();
+  if (auto *const Store = dyn_cast<StoreInst>(V)) {
+    Value *const Stored = Store->getValueOperand();
+    assert(Stored && "Could not get value operand");
+    Ty = Stored->getType();
+  } else if (auto *const Call = dyn_cast<CallInst>(V)) {
+    auto MaskedOp = MemOp::get(Call, MemOpAccessKind::Masked);
+    if (MaskedOp && MaskedOp->isMaskedMemOp() && MaskedOp->isStore()) {
+      Ty = MaskedOp->getDataType();
+    }
+  }
+  return dyn_cast<FixedVectorType>(Ty);
+}
+
+/// @brief Build a default value of the given type.
+///
+/// @param[in] T Type of the value to create.
+/// @param[in] V Number to use when T is an integer, float or double type.
+///
+/// @return A constant of type T holding V, or undef for any other type.
+Value *vecz::getDefaultValue(Type *T, uint64_t V) {
+  const bool IsInteger = T->isIntegerTy();
+  const bool IsFloatOrDouble = T->isFloatTy() || T->isDoubleTy();
+  if (IsInteger) {
+    return ConstantInt::get(T, V);
+  }
+  if (IsFloatOrDouble) {
+    return ConstantFP::get(T, V);
+  }
+  return UndefValue::get(T);
+}
+
+/// @brief Get the shuffle mask of a shuffle instruction as integers.
+///
+/// @param[in] Shuffle Shuffle instruction to query.
+///
+/// @return Array of integers representing the shuffle mask.
+ArrayRef<int> vecz::getShuffleVecMask(ShuffleVectorInst *Shuffle) {
+  return Shuffle->getShuffleMask();
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/memory_operations.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/memory_operations.cpp
new file mode 100644
index 0000000000000..b4788067249f7
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/memory_operations.cpp
@@ -0,0 +1,1002 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "memory_operations.h"
+
+#include <compiler/utils/mangling.h>
+#include <llvm/IR/Instructions.h>
+#include <llvm/IR/Module.h>
+#include <llvm/Support/raw_ostream.h>
+#include <multi_llvm/opaque_pointers.h>
+#include <multi_llvm/optional_helper.h>
+#include <multi_llvm/vector_type_helper.h>
+
+#include "analysis/instantiation_analysis.h"
+#include "analysis/uniform_value_analysis.h"
+#include "debugging.h"
+#include "vectorization_context.h"
+#include "vectorization_unit.h"
+
+using namespace vecz;
+using namespace llvm;
+
+// Builds the name of the masked load/store internal builtin. Returns an
+// empty string if DataTy is null or if one of the types cannot be mangled.
+static std::string getMaskedMemOpName(Type *DataTy, PointerType *PtrTy,
+                                      Type *MaskTy, unsigned Alignment,
+                                      bool IsLoad, bool IsVP) {
+  assert(multi_llvm::isOpaqueOrPointeeTypeMatches(PtrTy, DataTy) &&
+         "Invalid masked memory operation");
+  if (!DataTy) {
+    return std::string();
+  }
+  compiler::utils::NameMangler Mangler(&DataTy->getContext());
+  compiler::utils::TypeQualifiers DataQuals(compiler::utils::eTypeQualNone);
+  compiler::utils::TypeQualifiers PtrQuals(compiler::utils::eTypeQualNone,
+                                           compiler::utils::eTypeQualNone);
+  compiler::utils::TypeQualifiers MaskQuals(compiler::utils::eTypeQualNone);
+  std::string Name;
+  raw_string_ostream O(Name);
+  O << VectorizationContext::InternalBuiltinPrefix
+    << (IsLoad ? "masked_load" : "masked_store") << Alignment << "_";
+  if (IsVP) {
+    O << "vp_";
+  }
+  if (!(Mangler.mangleType(O, DataTy, DataQuals) &&
+        Mangler.mangleType(O, PtrTy, PtrQuals) &&
+        Mangler.mangleType(O, MaskTy, MaskQuals))) {
+    return std::string();
+  }
+  if (IsVP) {
+    compiler::utils::TypeQualifiers VLQuals(compiler::utils::eTypeQualNone);
+    if (!Mangler.mangleType(O, IntegerType::getInt32Ty(DataTy->getContext()),
+                            VLQuals)) {
+      return std::string();
+    }
+  }
+  O.flush();
+  return Name;
+}
+
+Function *vecz::getOrCreateMaskedMemOpFn(VectorizationContext &Ctx,
+                                         Type *DataTy, PointerType *PtrTy,
+                                         unsigned Alignment, bool IsLoad,
+                                         bool IsVP) {
+  assert(multi_llvm::isOpaqueOrPointeeTypeMatches(PtrTy, DataTy) &&
+         "Invalid masked memory operation");
+  LLVMContext &LLVMCtx = Ctx.module().getContext();
+  // The mask is an i1, or a vector of i1 as wide as a vector data type.
+  Type *MaskTy = IntegerType::getInt1Ty(LLVMCtx);
+  if (auto *VecTy = dyn_cast<VectorType>(DataTy)) {
+    MaskTy = VectorType::get(MaskTy, multi_llvm::getVectorElementCount(VecTy));
+  }
+
+  std::string BuiltinName =
+      getMaskedMemOpName(DataTy, PtrTy, MaskTy, Alignment, IsLoad, IsVP);
+  VECZ_FAIL_IF(BuiltinName.empty());
+  // Reuse the builtin if it already exists.
+  Function *const Existing =
+      Ctx.getOrCreateInternalBuiltin(BuiltinName, nullptr);
+  if (Existing) {
+    return Existing;
+  }
+  // Otherwise declare it: ([data,] pointer, mask[, i32 for VP builtins]).
+  SmallVector<Type *, 4> ParamTys;
+  if (!IsLoad) {
+    ParamTys.push_back(DataTy);
+  }
+  ParamTys.push_back(PtrTy);
+  ParamTys.push_back(MaskTy);
+  if (IsVP) {
+    ParamTys.push_back(IntegerType::getInt32Ty(LLVMCtx));
+  }
+  Type *const RetTy = IsLoad ? DataTy : Type::getVoidTy(LLVMCtx);
+  FunctionType *const FT = FunctionType::get(RetTy, ParamTys, false);
+  return Ctx.getOrCreateInternalBuiltin(BuiltinName, FT);
+}
+
+static CallInst *createMaskedMemOp(VectorizationContext &Ctx, Value *Data,
+                                   Type *DataTy, Value *Ptr, Value *Mask,
+                                   Value *EVL, unsigned Alignment, Twine Name,
+                                   Instruction *InsertBefore) {
+  VECZ_FAIL_IF(!DataTy);
+  VECZ_FAIL_IF(!Ptr || !Ptr->getType()->isPointerTy());
+  VECZ_FAIL_IF(!Mask);
+  assert(!Data || Data->getType() == DataTy);
+  // A missing data operand denotes a load; an EVL operand the VP variant.
+  const bool IsLoad = Data == nullptr;
+  const bool IsVP = EVL != nullptr;
+  // Cast the pointer operand if it does not already have the expected type.
+  const unsigned AddrSpace = Ptr->getType()->getPointerAddressSpace();
+  PointerType *const PtrTy = PointerType::get(DataTy, AddrSpace);
+  if (Ptr->getType() != PtrTy) {
+    Ptr = BitCastInst::CreatePointerCast(Ptr, PtrTy, "", InsertBefore);
+  }
+  assert(multi_llvm::isOpaqueOrPointeeTypeMatches(PtrTy, DataTy));
+  Function *const Builtin =
+      getOrCreateMaskedMemOpFn(Ctx, DataTy, PtrTy, Alignment, IsLoad, IsVP);
+  VECZ_FAIL_IF(!Builtin);
+  // Operands are [data,] pointer, mask[, EVL]; Ptr and Mask are never null.
+  SmallVector<Value *, 4> Args;
+  for (Value *Arg : {Data, Ptr, Mask, EVL}) {
+    if (Arg) {
+      Args.push_back(Arg);
+    }
+  }
+  return CallInst::Create(Builtin, Args, Name, InsertBefore);
+}
+
+CallInst *vecz::createMaskedLoad(VectorizationContext &Ctx, Type *Ty,
+                                 Value *Ptr, Value *Mask, Value *EVL,
+                                 unsigned Alignment, Twine Name,
+                                 Instruction *InsertBefore) {
+  return createMaskedMemOp(Ctx, /*Data*/ nullptr, Ty, Ptr, Mask, EVL, Alignment,
+                           Name, InsertBefore);
+}
+
+CallInst *vecz::createMaskedStore(VectorizationContext &Ctx, Value *Data,
+                                  Value *Ptr, Value *Mask, Value *EVL,
+                                  unsigned Alignment, Twine Name,
+                                  Instruction *InsertBefore) {
+  return createMaskedMemOp(Ctx, Data, Data->getType(), Ptr, Mask, EVL,
+                           Alignment, Name, InsertBefore);
+}
+
+// Builds the name of the (optionally masked) interleaved load/store internal
+// builtin. Returns an empty string if DataTy is null or mangling fails.
+static std::string getInterleavedMemOpName(Type *DataTy, PointerType *PtrTy,
+                                           Value *Stride, Type *MaskTy,
+                                           unsigned Alignment, bool IsLoad,
+                                           bool IsVP) {
+  // DataTy must be checked for null before the assertion dereferences it.
+  if (!DataTy) {
+    return std::string();
+  }
+  assert(multi_llvm::isOpaqueOrPointeeTypeMatches(PtrTy,
+                                                  DataTy->getScalarType()) &&
+         "Invalid interleaved memory operation");
+  compiler::utils::NameMangler Mangler(&DataTy->getContext());
+  const char *BaseName = IsLoad ? "interleaved_load" : "interleaved_store";
+  std::string Name;
+  compiler::utils::TypeQualifiers VecQuals(compiler::utils::eTypeQualNone,
+                                           compiler::utils::eTypeQualNone);
+  compiler::utils::TypeQualifiers PtrQuals(compiler::utils::eTypeQualNone,
+                                           compiler::utils::eTypeQualNone);
+  compiler::utils::TypeQualifiers MaskQuals(compiler::utils::eTypeQualNone);
+  raw_string_ostream O(Name);
+  O << VectorizationContext::InternalBuiltinPrefix;
+  if (MaskTy) {
+    O << "masked_";
+  }
+  O << BaseName << Alignment << "_";
+  if (IsVP) {
+    O << "vp_";
+  }
+  // Constant strides are spelled out in the name, others are marked "V".
+  if (auto *CVal = dyn_cast<ConstantInt>(Stride)) {
+    O << CVal->getSExtValue();
+  } else {
+    O << "V";
+  }
+  O << "_";
+  if (!Mangler.mangleType(O, DataTy, VecQuals) ||
+      !Mangler.mangleType(O, PtrTy, PtrQuals) ||
+      (MaskTy && !Mangler.mangleType(O, MaskTy, MaskQuals))) {
+    return std::string();
+  }
+  if (IsVP) {
+    compiler::utils::TypeQualifiers VLQuals(compiler::utils::eTypeQualNone);
+    if (!Mangler.mangleType(O, IntegerType::getInt32Ty(DataTy->getContext()),
+                            VLQuals)) {
+      return std::string();
+    }
+  }
+  O.flush();
+  return Name;
+}
+
+Function *vecz::getOrCreateInterleavedMemOpFn(VectorizationContext &Ctx,
+                                              Type *DataTy, PointerType *PtrTy,
+                                              Value *Stride, Type *MaskTy,
+                                              unsigned Alignment, bool IsLoad,
+                                              bool IsVP) {
+  // Reject null types up front, before they are used below.
+  VECZ_FAIL_IF(!DataTy);
+  VECZ_FAIL_IF(!PtrTy);
+  assert(
+      multi_llvm::isOpaqueOrPointeeTypeMatches(PtrTy, DataTy->getScalarType()));
+  Module &M = Ctx.module();
+  LLVMContext &LLVMCtx = M.getContext();
+  // Try to retrieve the builtin if it already exists.
+  std::string Name = getInterleavedMemOpName(DataTy, PtrTy, Stride, MaskTy,
+                                             Alignment, IsLoad, IsVP);
+  VECZ_FAIL_IF(Name.empty());
+  Function *F = Ctx.getOrCreateInternalBuiltin(Name, nullptr);
+  if (!F) {
+    // Declare it if it doesn't exist.
+    SmallVector<Type *, 6> Tys;
+    if (!IsLoad) {
+      Tys.push_back(DataTy);
+    }
+    Tys.push_back(PtrTy);
+    if (MaskTy) {
+      Tys.push_back(MaskTy);
+    }
+    if (IsVP) {
+      Tys.push_back(IntegerType::getInt32Ty(LLVMCtx));
+    }
+    if (!isa<ConstantInt>(Stride)) {
+      Tys.push_back(getSizeTy(M));
+    }
+    Type *RetTy = IsLoad ? DataTy : Type::getVoidTy(LLVMCtx);
+    FunctionType *FT = FunctionType::get(RetTy, Tys, false);
+    F = Ctx.getOrCreateInternalBuiltin(Name, FT);
+  }
+  return F;
+}
+
+static CallInst *createInterleavedMemOp(VectorizationContext &Ctx, Value *Data,
+                                        Type *DataTy, Value *Ptr, Value *Stride,
+                                        Value *Mask, Value *EVL,
+                                        unsigned Alignment, llvm::Twine Name,
+                                        llvm::Instruction *InsertBefore) {
+  VECZ_FAIL_IF(!DataTy);
+  VECZ_FAIL_IF(!Ptr || !Ptr->getType()->isPointerTy());
+  assert(!Data || Data->getType() == DataTy);
+  // A missing data operand denotes a load; an EVL operand the VP variant.
+  const bool IsLoad = Data == nullptr;
+  const bool IsVP = EVL != nullptr;
+  // The builtin takes a pointer to the scalar element type of the data.
+  Type *const EltTy = DataTy->getScalarType();
+  const unsigned AddrSpace = Ptr->getType()->getPointerAddressSpace();
+  PointerType *const PtrTy = PointerType::get(EltTy, AddrSpace);
+  if (Ptr->getType() != PtrTy) {
+    Ptr = BitCastInst::CreatePointerCast(Ptr, PtrTy, "", InsertBefore);
+  }
+  assert(multi_llvm::isOpaqueOrPointeeTypeMatches(PtrTy, EltTy));
+
+  Type *const MaskTy = Mask ? Mask->getType() : nullptr;
+  Function *const Builtin = getOrCreateInterleavedMemOpFn(
+      Ctx, DataTy, PtrTy, Stride, MaskTy, Alignment, IsLoad, IsVP);
+  VECZ_FAIL_IF(!Builtin);
+  // Operands are [data,] pointer[, mask][, EVL][, non-constant stride].
+  SmallVector<Value *, 4> Args;
+  for (Value *Arg : {Data, Ptr, Mask, EVL}) {
+    if (Arg) {
+      Args.push_back(Arg);
+    }
+  }
+  if (!isa<ConstantInt>(Stride)) {
+    Args.push_back(Stride);
+  }
+  return CallInst::Create(Builtin, Args, Name, InsertBefore);
+}
+
+CallInst *vecz::createInterleavedLoad(VectorizationContext &Ctx, Type *Ty,
+                                      Value *Ptr, Value *Stride, Value *Mask,
+                                      Value *EVL, unsigned Alignment,  // Mask and EVL may be null.
+                                      Twine Name, Instruction *InsertBefore) {
+  return createInterleavedMemOp(Ctx, /*Data*/ nullptr, Ty, Ptr, Stride, Mask,
+                                EVL, Alignment, Name, InsertBefore);  // Null Data selects the load form.
+}
+
+CallInst *vecz::createInterleavedStore(VectorizationContext &Ctx, Value *Data,
+                                       Value *Ptr, Value *Stride, Value *Mask,
+                                       Value *EVL, unsigned Alignment,  // Mask and EVL may be null.
+                                       Twine Name, Instruction *InsertBefore) {
+  return createInterleavedMemOp(Ctx, Data, Data->getType(), Ptr, Stride, Mask,
+                                EVL, Alignment, Name, InsertBefore);  // Data is dereferenced above: must be non-null.
+}
+
+static std::string getScatterGatherMemOpName(Type *DataTy, VectorType *VecPtrTy,
+                                             Type *MaskTy, unsigned Alignment,
+                                             bool IsGather, bool IsVP) {
+  if (!DataTy) {  // An empty string is this function's failure value.
+    return std::string();
+  }
+  compiler::utils::NameMangler Mangler(&DataTy->getContext());
+  const char *BaseName = IsGather ? "gather_load" : "scatter_store";
+  std::string Name;
+  compiler::utils::TypeQualifiers VecQuals(compiler::utils::eTypeQualNone,
+                                           compiler::utils::eTypeQualNone);
+  compiler::utils::TypeQualifiers PtrQuals(compiler::utils::eTypeQualNone,
+                                           compiler::utils::eTypeQualNone);
+  compiler::utils::TypeQualifiers MaskQuals(compiler::utils::eTypeQualNone);
+  PtrQuals.push_back(compiler::utils::eTypeQualNone);  // One more qualifier than VecQuals.
+  raw_string_ostream O(Name);
+  O << VectorizationContext::InternalBuiltinPrefix;  // <prefix>[masked_]<base><align>_[vp_]<types>
+  if (MaskTy) {
+    O << "masked_";
+  }
+  O << BaseName << Alignment << "_";
+  if (IsVP) {
+    O << "vp_";
+  }
+  if (!Mangler.mangleType(O, DataTy, VecQuals) ||  // Types: data, pointers, [mask], [i32 length].
+      !Mangler.mangleType(O, VecPtrTy, PtrQuals)) {
+    return std::string();
+  }
+  if (MaskTy && !Mangler.mangleType(O, MaskTy, MaskQuals)) {
+    return std::string();
+  }
+  if (IsVP) {
+    compiler::utils::TypeQualifiers VLQuals(compiler::utils::eTypeQualNone);
+    if (!Mangler.mangleType(O, IntegerType::getInt32Ty(DataTy->getContext()),
+                            VLQuals)) {
+      return std::string();
+    }
+  }
+  O.flush();  // Make sure everything streamed so far has reached Name.
+  return Name;
+}
+
+Function *vecz::getOrCreateScatterGatherMemOpFn(vecz::VectorizationContext &Ctx,
+                                                llvm::Type *DataTy,
+                                                llvm::VectorType *VecPtrTy,
+                                                llvm::Type *MaskTy,
+                                                unsigned Alignment,
+                                                bool IsGather, bool IsVP) {
+  Module &M = Ctx.module();
+  LLVMContext &LLVMCtx = M.getContext();
+  assert(VecPtrTy);
+  assert(!MaskTy || multi_llvm::getVectorElementCount(MaskTy) ==  // NOTE(review): reads DataTy before
+                        multi_llvm::getVectorElementCount(DataTy));  // the null-DataTy failure path below.
+
+  // Look the builtin up by name first; an empty name means mangling failed.
+  std::string Name = getScatterGatherMemOpName(DataTy, VecPtrTy, MaskTy,
+                                               Alignment, IsGather, IsVP);
+  VECZ_FAIL_IF(Name.empty());
+  Function *F = Ctx.getOrCreateInternalBuiltin(Name, nullptr);
+  if (!F) {
+    // Not found: declare it as ([Data,] VecPtr, [Mask,] [i32 length]).
+    SmallVector<Type *, 4> Tys;
+    if (!IsGather) {
+      VECZ_FAIL_IF(!DataTy);
+      Tys.push_back(DataTy);
+    }
+    Tys.push_back(VecPtrTy);
+    if (MaskTy) {
+      Tys.push_back(MaskTy);
+    }
+    if (IsVP) {
+      Tys.push_back(IntegerType::getInt32Ty(LLVMCtx));
+    }
+
+    Type *RetTy = IsGather ? DataTy : Type::getVoidTy(LLVMCtx);  // Gathers return data, scatters void.
+    FunctionType *FT = FunctionType::get(RetTy, Tys, false);
+    F = Ctx.getOrCreateInternalBuiltin(Name, FT);
+  }
+  return F;
+}
+
+static CallInst *createScatterGatherMemOp(VectorizationContext &Ctx,
+                                          Value *VecData, Type *DataTy,
+                                          Value *VecPtr, Value *Mask,
+                                          Value *EVL, unsigned Alignment,
+                                          Twine Name,
+                                          Instruction *InsertBefore) {
+  VECZ_FAIL_IF(!DataTy);  // Common builder: null VecData => gather, else scatter.
+  VECZ_FAIL_IF(!VecPtr || !VecPtr->getType()->isVectorTy() ||  // VecPtr must be a vector of pointers.
+               !VecPtr->getType()->getScalarType()->isPointerTy());
+  assert(multi_llvm::isOpaqueOrPointeeTypeMatches(
+      cast<PointerType>(VecPtr->getType()->getScalarType()),
+      DataTy->getScalarType()));
+  Type *MaskTy = Mask ? Mask->getType() : nullptr;  // Null => unmasked.
+  Function *F = getOrCreateScatterGatherMemOpFn(
+      Ctx, DataTy, cast<VectorType>(VecPtr->getType()), MaskTy, Alignment,
+      /*IsGather*/ VecData == nullptr, EVL != nullptr);
+  VECZ_FAIL_IF(!F);
+  SmallVector<Value *, 4> Ops;  // Built as [VecData,] VecPtr, [Mask,] [EVL].
+  if (VecData) {
+    Ops.push_back(VecData);
+  }
+  Ops.push_back(VecPtr);
+  if (Mask) {
+    Ops.push_back(Mask);
+  }
+  if (EVL) {
+    Ops.push_back(EVL);
+  }
+  return CallInst::Create(F, Ops, Name, InsertBefore);
+}
+
+llvm::CallInst *vecz::createGather(VectorizationContext &Ctx, llvm::Type *Ty,
+                                   llvm::Value *VecPtr, llvm::Value *Mask,
+                                   llvm::Value *EVL, unsigned Alignment,  // Mask and EVL may be null.
+                                   llvm::Twine Name,
+                                   llvm::Instruction *InsertBefore) {
+  return createScatterGatherMemOp(Ctx, /*Data*/ nullptr, Ty, VecPtr, Mask, EVL,
+                                  Alignment, Name, InsertBefore);  // Null data selects the gather form.
+}
+
+llvm::CallInst *vecz::createScatter(VectorizationContext &Ctx,
+                                    llvm::Value *VecData, llvm::Value *VecPtr,
+                                    llvm::Value *Mask, llvm::Value *EVL,  // Mask and EVL may be null.
+                                    unsigned Alignment, llvm::Twine Name,
+                                    llvm::Instruction *InsertBefore) {
+  return createScatterGatherMemOp(Ctx, VecData, VecData->getType(), VecPtr,  // VecData must be non-null.
+                                  Mask, EVL, Alignment, Name, InsertBefore);
+}
+
+MemOpDesc::MemOpDesc()  // Default state: an invalid descriptor with no types or operands.
+    : DataTy(nullptr),
+      PtrTy(nullptr),
+      MaskTy(nullptr),
+      Kind(MemOpKind::Invalid),
+      AccessKind(MemOpAccessKind::Native),
+      IsVLOp(false),
+      Alignment(1),
+      Stride(nullptr),
+      DataOpIdx(-1),  // A negative operand index means the operand is absent.
+      PtrOpIdx(-1),
+      MaskOpIdx(-1),
+      VLOpIdx(-1) {}
+
+bool MemOpDesc::isStrideConstantInt() const {
+  return dyn_cast_or_null<ConstantInt>(Stride) != nullptr;  // False for a null stride.
+}
+
+int64_t MemOpDesc::getStrideAsConstantInt() const {
+  return cast<ConstantInt>(Stride)->getSExtValue();  // cast<> requires a ConstantInt: check isStrideConstantInt() first.
+}
+
+Argument *MemOpDesc::getOperand(Function *F, int OpIdx) const {
+  VECZ_FAIL_IF(!F || (OpIdx < 0) || ((size_t)OpIdx >= F->arg_size()));  // Rejects null F and absent/out-of-range indices.
+  return F->getArg(OpIdx);  // The formal argument of F at position OpIdx.
+}
+
+multi_llvm::Optional<MemOpDesc> MemOpDesc::analyzeMemOpFunction(Function &F) {
+  if (auto Op = MemOpDesc::analyzeMaskedMemOp(F)) {  // Try each analyzer in turn; first match wins.
+    return Op;
+  }
+  if (auto Op = MemOpDesc::analyzeInterleavedMemOp(F)) {
+    return Op;
+  }
+  if (auto Op = MemOpDesc::analyzeMaskedInterleavedMemOp(F)) {
+    return Op;
+  }
+  if (auto Op = MemOpDesc::analyzeScatterGatherMemOp(F)) {
+    return Op;
+  }
+  if (auto Op = MemOpDesc::analyzeMaskedScatterGatherMemOp(F)) {
+    return Op;
+  }
+  return multi_llvm::None;  // F is not one of the recognised memory builtins.
+}
+
+multi_llvm::Optional<MemOpDesc> MemOpDesc::analyzeMaskedMemOp(Function &F) {
+  StringRef MangledName = F.getName();  // The descriptor is decoded from F's name and signature.
+  compiler::utils::Lexer L(MangledName);
+  if (!L.Consume(VectorizationContext::InternalBuiltinPrefix)) {
+    return multi_llvm::None;
+  }
+
+  MemOpDesc Desc;
+  if (L.Consume("masked_store")) {  // Name: masked_store<align>_[vp_]...
+    if (!L.ConsumeInteger(Desc.Alignment)) {
+      return multi_llvm::None;
+    }
+    if (!L.Consume("_")) {
+      return multi_llvm::None;
+    }
+    Desc.IsVLOp = L.Consume("vp_");
+    if (F.arg_size() != 3 + (unsigned)Desc.IsVLOp) {  // Args: data, ptr, mask, [length].
+      return multi_llvm::None;
+    }
+
+    Function::arg_iterator Arg = F.arg_begin();
+    Desc.DataTy = Arg->getType();
+    ++Arg;
+    Desc.PtrTy = Arg->getType();
+    Desc.Kind = MemOpKind::StoreCall;
+    Desc.DataOpIdx = 0;
+    Desc.PtrOpIdx = 1;
+    Desc.MaskOpIdx = 2;
+    Desc.MaskTy = F.getArg(Desc.MaskOpIdx)->getType();
+    Desc.VLOpIdx = Desc.IsVLOp ? Desc.MaskOpIdx + 1 : -1;  // The length operand follows the mask.
+    Desc.AccessKind = MemOpAccessKind::Masked;
+    return Desc;
+  }
+
+  if (L.Consume("masked_load")) {  // Name: masked_load<align>_[vp_]...
+    if (!L.ConsumeInteger(Desc.Alignment)) {
+      return multi_llvm::None;
+    }
+    if (!L.Consume("_")) {
+      return multi_llvm::None;
+    }
+    Desc.IsVLOp = L.Consume("vp_");
+    if (F.arg_size() != 2 + (unsigned)Desc.IsVLOp) {  // Args: ptr, mask, [length].
+      return multi_llvm::None;
+    }
+
+    Function::arg_iterator Arg = F.arg_begin();
+    Desc.PtrTy = Arg->getType();
+    Desc.DataTy = F.getReturnType();  // Loaded data is the return value, not an operand.
+    Desc.Kind = MemOpKind::LoadCall;
+    Desc.DataOpIdx = -1;
+    Desc.PtrOpIdx = 0;
+    Desc.MaskOpIdx = 1;
+    Desc.MaskTy = F.getArg(Desc.MaskOpIdx)->getType();
+    Desc.VLOpIdx = Desc.IsVLOp ? Desc.MaskOpIdx + 1 : -1;
+    Desc.AccessKind = MemOpAccessKind::Masked;
+    return Desc;
+  }
+  return multi_llvm::None;
+}
+
+multi_llvm::Optional<MemOpDesc> MemOpDesc::analyzeInterleavedMemOp(
+    Function &F) {
+  StringRef MangledName = F.getName();  // The descriptor is decoded from F's name and signature.
+  compiler::utils::Lexer L(MangledName);
+  if (!L.Consume(VectorizationContext::InternalBuiltinPrefix)) {
+    return multi_llvm::None;
+  }
+  MemOpDesc Desc;
+  int ConstantStride{};
+  if (L.Consume("interleaved_store")) {  // Name: interleaved_store<align>_<stride|V>_...
+    if (!L.ConsumeInteger(Desc.Alignment)) {
+      return multi_llvm::None;
+    }
+    if (!L.Consume("_")) {
+      return multi_llvm::None;
+    }
+    if (L.ConsumeSignedInteger(ConstantStride)) {  // Constant stride is encoded in the name.
+      VECZ_ERROR_IF(F.arg_size() != 2,  // NOTE(review): the masked variant returns None here instead.
+                    "Wrong argument list size for interleaved store");
+      Desc.Stride = ConstantInt::get(getSizeTy(*F.getParent()), ConstantStride);
+    } else if (L.Consume("V")) {  // 'V': the stride is the trailing argument.
+      VECZ_ERROR_IF(F.arg_size() != 3,
+                    "Wrong argument list size for interleaved store");
+      auto ArgIt = F.arg_begin();
+      std::advance(ArgIt, 2);
+      Desc.Stride = &*ArgIt;
+    } else {
+      return multi_llvm::None;
+    }
+    if (!L.Consume("_")) {
+      return multi_llvm::None;
+    }
+
+    Function::arg_iterator Arg = F.arg_begin();
+    Desc.DataTy = Arg->getType();
+    ++Arg;
+    Desc.PtrTy = Arg->getType();
+    Desc.Kind = MemOpKind::StoreCall;
+    Desc.DataOpIdx = 0;
+    Desc.PtrOpIdx = 1;
+    Desc.AccessKind = MemOpAccessKind::Interleaved;
+    return Desc;
+  }
+
+  if (L.Consume("interleaved_load")) {  // Name: interleaved_load<align>_<stride|V>_...
+    if (!L.ConsumeInteger(Desc.Alignment)) {
+      return multi_llvm::None;
+    }
+    if (!L.Consume("_")) {
+      return multi_llvm::None;
+    }
+    if (L.ConsumeSignedInteger(ConstantStride)) {  // Constant stride is encoded in the name.
+      VECZ_ERROR_IF(F.arg_size() != 1,
+                    "Wrong argument list size for interleaved load");
+      Desc.Stride = ConstantInt::get(getSizeTy(*F.getParent()), ConstantStride);
+    } else if (L.Consume("V")) {  // 'V': the stride is the trailing argument.
+      VECZ_ERROR_IF(F.arg_size() != 2,
+                    "Wrong argument list size for interleaved load");
+      auto ArgIt = F.arg_begin();
+      std::advance(ArgIt, 1);
+      Desc.Stride = &*ArgIt;
+    } else {
+      return multi_llvm::None;
+    }
+    if (!L.Consume("_")) {
+      return multi_llvm::None;
+    }
+
+    Function::arg_iterator Arg = F.arg_begin();
+    Desc.PtrTy = Arg->getType();
+    Desc.DataTy = F.getReturnType();  // Loaded data is the return value, not an operand.
+    Desc.Kind = MemOpKind::LoadCall;
+    Desc.DataOpIdx = -1;
+    Desc.PtrOpIdx = 0;
+    Desc.AccessKind = MemOpAccessKind::Interleaved;
+    return Desc;
+  }
+
+  return multi_llvm::None;
+}
+
+multi_llvm::Optional<MemOpDesc> MemOpDesc::analyzeMaskedInterleavedMemOp(
+    Function &F) {
+  StringRef MangledName = F.getName();  // The descriptor is decoded from F's name and signature.
+  compiler::utils::Lexer L(MangledName);
+  if (!L.Consume(VectorizationContext::InternalBuiltinPrefix)) {
+    return multi_llvm::None;
+  }
+  MemOpDesc Desc;
+  if (L.Consume("masked_interleaved_store")) {  // Name: ...store<align>_[vp_]<stride|V>_...
+    if (!L.ConsumeInteger(Desc.Alignment)) {
+      return multi_llvm::None;
+    }
+    if (!L.Consume("_")) {
+      return multi_llvm::None;
+    }
+    Desc.IsVLOp = L.Consume("vp_");
+    // Value-initialised, matching analyzeInterleavedMemOp, so the stride is
+    // never read indeterminate and Klocwork's UNINIT.STACK.MUST stays quiet.
+    // The name carries either a constant stride or the 'V' (variable) marker.
+    int ConstantStride{};
+    if (L.ConsumeSignedInteger(ConstantStride)) {
+      if (F.arg_size() != 3 + (unsigned)Desc.IsVLOp) {  // Args: data, ptr, mask, [length].
+        return multi_llvm::None;
+      }
+      Desc.Stride = ConstantInt::get(getSizeTy(*F.getParent()), ConstantStride);
+    } else if (L.Consume("V")) {
+      if (F.arg_size() != 4 + (unsigned)Desc.IsVLOp) {  // As above plus a trailing stride.
+        return multi_llvm::None;
+      }
+      auto ArgIt = F.arg_begin();
+      std::advance(ArgIt, 3 + Desc.IsVLOp);
+      Desc.Stride = &*ArgIt;
+    } else {
+      return multi_llvm::None;
+    }
+    if (!L.Consume("_")) {
+      return multi_llvm::None;
+    }
+
+    Function::arg_iterator Arg = F.arg_begin();
+    Desc.DataTy = Arg->getType();
+    ++Arg;
+    Desc.PtrTy = Arg->getType();
+    Desc.Kind = MemOpKind::StoreCall;
+    Desc.DataOpIdx = 0;
+    Desc.PtrOpIdx = 1;
+    Desc.MaskOpIdx = 2;
+    Desc.MaskTy = F.getArg(Desc.MaskOpIdx)->getType();
+    Desc.VLOpIdx = Desc.IsVLOp ? Desc.MaskOpIdx + 1 : -1;  // The length operand follows the mask.
+    Desc.AccessKind = MemOpAccessKind::MaskedInterleaved;
+    return Desc;
+  }
+  if (L.Consume("masked_interleaved_load")) {  // Name: ...load<align>_[vp_]<stride|V>_...
+    if (!L.ConsumeInteger(Desc.Alignment)) {
+      return multi_llvm::None;
+    }
+    if (!L.Consume("_")) {
+      return multi_llvm::None;
+    }
+    Desc.IsVLOp = L.Consume("vp_");
+    // Value-initialised, matching analyzeInterleavedMemOp, so the stride is
+    // never read indeterminate and Klocwork's UNINIT.STACK.MUST stays quiet.
+    // The name carries either a constant stride or the 'V' (variable) marker.
+    int ConstantStride{};
+    if (L.ConsumeSignedInteger(ConstantStride)) {
+      if (F.arg_size() != 2 + (unsigned)Desc.IsVLOp) {  // Args: ptr, mask, [length].
+        return multi_llvm::None;
+      }
+      Desc.Stride = ConstantInt::get(getSizeTy(*F.getParent()), ConstantStride);
+    } else if (L.Consume("V")) {
+      if (F.arg_size() != 3 + (unsigned)Desc.IsVLOp) {  // As above plus a trailing stride.
+        return multi_llvm::None;
+      }
+      auto ArgIt = F.arg_begin();
+      std::advance(ArgIt, 2 + Desc.IsVLOp);
+      Desc.Stride = &*ArgIt;
+    } else {
+      return multi_llvm::None;
+    }
+    if (!L.Consume("_")) {
+      return multi_llvm::None;
+    }
+
+    Function::arg_iterator Arg = F.arg_begin();
+    Desc.PtrTy = Arg->getType();
+    Desc.DataTy = F.getReturnType();  // Loaded data is the return value, not an operand.
+    Desc.Kind = MemOpKind::LoadCall;
+    Desc.DataOpIdx = -1;
+    Desc.PtrOpIdx = 0;
+    Desc.MaskOpIdx = 1;
+    Desc.MaskTy = F.getArg(Desc.MaskOpIdx)->getType();
+    Desc.VLOpIdx = Desc.IsVLOp ? Desc.MaskOpIdx + 1 : -1;
+    Desc.AccessKind = MemOpAccessKind::MaskedInterleaved;
+    return Desc;
+  }
+
+  return multi_llvm::None;
+}
+
+multi_llvm::Optional<MemOpDesc> MemOpDesc::analyzeScatterGatherMemOp(
+    Function &F) {
+  StringRef MangledName = F.getName();  // The descriptor is decoded from F's name and signature.
+  compiler::utils::Lexer L(MangledName);
+  if (!L.Consume(VectorizationContext::InternalBuiltinPrefix)) {
+    return multi_llvm::None;
+  }
+  MemOpDesc Desc;
+  if (L.Consume("scatter_store")) {  // Name: scatter_store<align>_...
+    if (!L.ConsumeInteger(Desc.Alignment)) {
+      return multi_llvm::None;
+    }
+    if (!L.Consume("_")) {
+      return multi_llvm::None;
+    }
+    if (F.arg_size() != 2) {  // NOTE(review): an unmasked "vp_" builtin (extra length arg) is rejected - confirm intended.
+      return multi_llvm::None;
+    }
+
+    Function::arg_iterator Arg = F.arg_begin();
+    Desc.DataTy = Arg->getType();
+    ++Arg;
+    Desc.PtrTy = Arg->getType();
+    Desc.Kind = MemOpKind::StoreCall;
+    Desc.DataOpIdx = 0;
+    Desc.PtrOpIdx = 1;
+    Desc.AccessKind = MemOpAccessKind::ScatterGather;
+    return Desc;
+  }
+
+  if (L.Consume("gather_load")) {  // Name: gather_load<align>_...
+    if (!L.ConsumeInteger(Desc.Alignment)) {
+      return multi_llvm::None;
+    }
+    if (!L.Consume("_")) {
+      return multi_llvm::None;
+    }
+    if (F.arg_size() != 1) {  // Args: the vector of pointers only.
+      return multi_llvm::None;
+    }
+
+    Function::arg_iterator Arg = F.arg_begin();
+    Desc.PtrTy = Arg->getType();
+    Desc.DataTy = F.getReturnType();  // Gathered data is the return value, not an operand.
+    Desc.Kind = MemOpKind::LoadCall;
+    Desc.DataOpIdx = -1;
+    Desc.PtrOpIdx = 0;
+    Desc.AccessKind = MemOpAccessKind::ScatterGather;
+    return Desc;
+  }
+
+  return multi_llvm::None;
+}
+
+multi_llvm::Optional<MemOpDesc> MemOpDesc::analyzeMaskedScatterGatherMemOp(
+    Function &F) {
+  StringRef MangledName = F.getName();  // The descriptor is decoded from F's name and signature.
+  compiler::utils::Lexer L(MangledName);
+  if (!L.Consume(VectorizationContext::InternalBuiltinPrefix)) {
+    return multi_llvm::None;
+  }
+
+  MemOpDesc Desc;
+  if (L.Consume("masked_scatter_store")) {  // Name: masked_scatter_store<align>_[vp_]...
+    if (!L.ConsumeInteger(Desc.Alignment)) {
+      return multi_llvm::None;
+    }
+    if (!L.Consume("_")) {
+      return multi_llvm::None;
+    }
+    Desc.IsVLOp = L.Consume("vp_");
+    if (F.arg_size() != 3 + (unsigned)Desc.IsVLOp) {  // Args: data, ptrs, mask, [length].
+      return multi_llvm::None;
+    }
+
+    Function::arg_iterator Arg = F.arg_begin();
+    Desc.DataTy = Arg->getType();
+    ++Arg;
+    Desc.PtrTy = Arg->getType();
+    Desc.Kind = MemOpKind::StoreCall;
+    Desc.DataOpIdx = 0;
+    Desc.PtrOpIdx = 1;
+    Desc.MaskOpIdx = 2;
+    Desc.MaskTy = F.getArg(Desc.MaskOpIdx)->getType();
+    Desc.VLOpIdx = Desc.IsVLOp ? Desc.MaskOpIdx + 1 : -1;  // The length operand follows the mask.
+    Desc.AccessKind = MemOpAccessKind::MaskedScatterGather;
+    return Desc;
+  }
+
+  if (L.Consume("masked_gather_load")) {  // Name: masked_gather_load<align>_[vp_]...
+    if (!L.ConsumeInteger(Desc.Alignment)) {
+      return multi_llvm::None;
+    }
+    if (!L.Consume("_")) {
+      return multi_llvm::None;
+    }
+    Desc.IsVLOp = L.Consume("vp_");
+    if (F.arg_size() != 2 + (unsigned)Desc.IsVLOp) {  // Args: ptrs, mask, [length].
+      return multi_llvm::None;
+    }
+
+    Function::arg_iterator Arg = F.arg_begin();
+    Desc.PtrTy = Arg->getType();
+    Desc.DataTy = F.getReturnType();  // Gathered data is the return value, not an operand.
+    Desc.Kind = MemOpKind::LoadCall;
+    Desc.DataOpIdx = -1;
+    Desc.PtrOpIdx = 0;
+    Desc.MaskOpIdx = 1;
+    Desc.MaskTy = F.getArg(Desc.MaskOpIdx)->getType();
+    Desc.VLOpIdx = Desc.IsVLOp ? Desc.MaskOpIdx + 1 : -1;
+    Desc.AccessKind = MemOpAccessKind::MaskedScatterGather;
+    return Desc;
+  }
+
+  return multi_llvm::None;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+multi_llvm::Optional<MemOp> MemOp::get(llvm::Instruction *I) {
+  if (LoadInst *LI = dyn_cast<LoadInst>(I)) {  // Plain load: describe it from the instruction itself.
+    MemOpDesc Desc;
+    Desc.Kind = MemOpKind::LoadInstruction;
+    Desc.Alignment = LI->getAlign().value();
+    Desc.DataTy = LI->getType();
+    auto *PO = LI->getPointerOperand();
+    assert(PO && "Could not get pointer operand");
+    Desc.PtrTy = PO->getType();
+    return MemOp(I, Desc);
+  }
+  if (StoreInst *SI = dyn_cast<StoreInst>(I)) {  // Plain store: likewise.
+    MemOpDesc Desc;
+    Desc.Kind = MemOpKind::StoreInstruction;
+    Desc.Alignment = SI->getAlign().value();
+    assert(SI->getValueOperand() && "Could not get value operand");
+    Desc.DataTy = SI->getValueOperand()->getType();
+    auto *PO = SI->getPointerOperand();
+    assert(PO && "Could not get pointer operand");
+    Desc.PtrTy = PO->getType();
+    return MemOp(I, Desc);
+  }
+  if (CallInst *CI = dyn_cast<CallInst>(I)) {  // Call: recognised only if the callee is a memory builtin.
+    if (Function *Caller = CI->getCalledFunction()) {  // Despite its name, this is the callee.
+      if (auto FnOp = MemOpDesc::analyzeMemOpFunction(*Caller)) {
+        return MemOp(I, *FnOp);
+      }
+    }
+  }
+  return multi_llvm::None;  // Not a memory operation this class understands.
+}
+
+multi_llvm::Optional<MemOp> MemOp::get(llvm::CallInst *CI,
+                                       MemOpAccessKind AccessKind) {
+  // Without a known callee nothing can be matched; else run one analyzer only.
+  Function *const Callee = CI->getCalledFunction();
+  if (!Callee) {
+    return multi_llvm::None;
+  }
+  multi_llvm::Optional<MemOpDesc> Parsed;
+  switch (AccessKind) {
+    case MemOpAccessKind::Masked:
+      Parsed = MemOpDesc::analyzeMaskedMemOp(*Callee);
+      break;
+    case MemOpAccessKind::Interleaved:
+      Parsed = MemOpDesc::analyzeInterleavedMemOp(*Callee);
+      break;
+    case MemOpAccessKind::MaskedInterleaved:
+      Parsed = MemOpDesc::analyzeMaskedInterleavedMemOp(*Callee);
+      break;
+    case MemOpAccessKind::ScatterGather:
+      Parsed = MemOpDesc::analyzeScatterGatherMemOp(*Callee);
+      break;
+    case MemOpAccessKind::MaskedScatterGather:
+      Parsed = MemOpDesc::analyzeMaskedScatterGatherMemOp(*Callee);
+      break;
+    default:
+      return multi_llvm::None;
+  }
+  if (!Parsed) {
+    return multi_llvm::None;
+  }
+  return MemOp(CI, *Parsed);
+}
+
+MemOp::MemOp(Instruction *I, const MemOpDesc &desc) {
+  Ins = I;  // The instruction being described.
+  Desc = desc;  // A copy of the descriptor is stored.
+}
+
+llvm::Value *MemOp::getCallOperand(int OpIdx) const {
+  VECZ_FAIL_IF((Desc.getKind() != MemOpKind::LoadCall) &&  // Only valid for builtin-call memory operations.
+               (Desc.getKind() != MemOpKind::StoreCall));
+  CallInst *CI = dyn_cast<CallInst>(Ins);
+  VECZ_FAIL_IF(!CI || (OpIdx < 0) || ((unsigned)OpIdx >= CI->arg_size()));  // Negative index = absent operand.
+  return CI->getArgOperand((unsigned)OpIdx);
+}
+
+bool MemOp::setCallOperand(int OpIdx, Value *V) {
+  VECZ_FAIL_IF((Desc.getKind() != MemOpKind::LoadCall) &&  // Only valid for builtin-call memory operations.
+               (Desc.getKind() != MemOpKind::StoreCall));
+  CallInst *CI = dyn_cast<CallInst>(Ins);
+  VECZ_FAIL_IF(!CI || (OpIdx < 0) || ((unsigned)OpIdx >= CI->arg_size()));  // Negative index = absent operand.
+  CI->setArgOperand((unsigned)OpIdx, V);
+  return true;  // True once the operand has been replaced.
+}
+
+llvm::Value *MemOp::getDataOperand() const {
+  switch (Desc.getKind()) {
+    default:  // Every kind other than the two stores has no data operand.
+      return nullptr;
+    case MemOpKind::StoreInstruction:
+      return cast<StoreInst>(Ins)->getValueOperand();
+    case MemOpKind::StoreCall:
+      return getCallOperand(Desc.getDataOperandIndex());
+  }
+}
+
+llvm::Value *MemOp::getPointerOperand() const {
+  switch (Desc.getKind()) {
+    default:  // Any other kind has no pointer operand to return.
+      return nullptr;
+    case MemOpKind::LoadInstruction:
+      return cast<LoadInst>(Ins)->getPointerOperand();
+    case MemOpKind::StoreInstruction:
+      return cast<StoreInst>(Ins)->getPointerOperand();
+    case MemOpKind::LoadCall:
+    case MemOpKind::StoreCall:
+      return getCallOperand(Desc.getPointerOperandIndex());  // Index recorded when the builtin was analysed.
+  }
+}
+
+llvm::Value *MemOp::getMaskOperand() const {
+  switch (Desc.getKind()) {
+    default:  // Plain load/store instructions carry no mask.
+      return nullptr;
+    case MemOpKind::LoadCall:
+    case MemOpKind::StoreCall:
+      return getCallOperand(Desc.getMaskOperandIndex());  // Presumably -1 for unmasked builtins, which getCallOperand rejects.
+  }
+}
+
+bool MemOp::setDataOperand(Value *V) {
+  if (Desc.getKind() == MemOpKind::StoreInstruction) {
+    cast<StoreInst>(Ins)->setOperand(0, V);  // Operand 0 of a StoreInst is the stored value.
+    return true;
+  } else if (Desc.getKind() == MemOpKind::StoreCall) {
+    return setCallOperand(Desc.getDataOperandIndex(), V);
+  } else {
+    return false;  // Every other kind has no data operand to replace.
+  }
+}
+
+bool MemOp::setPointerOperand(Value *V) {
+  switch (Desc.getKind()) {
+    default:  // Any other kind has no pointer operand to replace.
+      return false;
+    case MemOpKind::LoadInstruction:
+      cast<LoadInst>(Ins)->setOperand(0, V);  // A LoadInst's only operand is its pointer.
+      return true;
+    case MemOpKind::StoreInstruction:
+      cast<StoreInst>(Ins)->setOperand(1, V);  // StoreInst operands: 0 = value, 1 = pointer.
+      return true;
+    case MemOpKind::LoadCall:
+    case MemOpKind::StoreCall:
+      return setCallOperand(Desc.getPointerOperandIndex(), V);
+  }
+}
+
+bool MemOp::setMaskOperand(Value *V) {
+  switch (Desc.getKind()) {
+    default:  // Plain load/store instructions carry no mask.
+      return false;
+    case MemOpKind::LoadCall:
+    case MemOpKind::StoreCall:
+      return setCallOperand(Desc.getMaskOperandIndex(), V);  // Fails when the recorded mask index is negative.
+  }
+}
+
+CallInst *MemOp::getCall() const {
+  VECZ_FAIL_IF((Desc.getKind() != MemOpKind::LoadCall) &&  // Only builtin-call memory operations have a call.
+               (Desc.getKind() != MemOpKind::StoreCall));
+  return dyn_cast<CallInst>(Ins);  // Null if Ins is not actually a CallInst.
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp
new file mode 100644
index 0000000000000..1dca4a6f72cd9
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp
@@ -0,0 +1,1058 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "offset_info.h"
+
+#include <compiler/utils/builtin_info.h>
+#include <llvm/Analysis/ValueTracking.h>
+#include <llvm/IR/Constants.h>
+#include <llvm/IR/Instructions.h>
+#include <llvm/IR/Module.h>
+#include <llvm/Support/KnownBits.h>
+
+#include "analysis/instantiation_analysis.h"
+#include "analysis/stride_analysis.h"
+#include "analysis/uniform_value_analysis.h"
+#include "debugging.h"
+#include "memory_operations.h"
+#include "vectorization_context.h"
+#include "vectorization_unit.h"
+
+using namespace vecz;
+using namespace llvm;
+
+namespace {
+inline uint64_t SizeOrZero(TypeSize &&T) {
+  return T.isScalable() ? 0 : multi_llvm::getFixedValue(T);  // Scalable sizes are reported as 0.
+}
+
+uint8_t highbit(const uint32_t x) {
+  assert((x & (x - 1)) == 0 && "Value must be a power of two");
+  // Returns log2(x) for a power-of-two x via a De Bruijn multiply-and-lookup
+  // (x == 0 also passes the assert above and yields 0). For details, see
+  // https://en.wikipedia.org/wiki/De_Bruijn_sequence#Uses
+  static const uint32_t deBruijn_magic = 0x06EB14F9U;
+  static const uint8_t tab[32] = {
+      0,  1,  16, 2,  29, 17, 3,  22, 30, 20, 18, 11, 13, 4, 7,  23,
+      31, 15, 28, 21, 19, 10, 12, 6,  14, 27, 9,  5,  26, 8, 25, 24,
+  };
+  return tab[(uint32_t)(x * deBruijn_magic) >> 27];
+}
+
+// Returns a value extended or truncated to match the size type of the target.
+// This will return the original value if it is already the correct size.
+Value *matchSizeType(IRBuilder<> &B, Value *V, bool sext) {
+  // `sext` picks sign extension over zero extension when V has to be widened;
+  // the result is named "stride_conv" either way.
+  auto *const targetTy = getSizeTy(B);
+  if (!sext) {
+    return B.CreateZExtOrTrunc(V, targetTy, "stride_conv");
+  }
+  return B.CreateSExtOrTrunc(V, targetTy, "stride_conv");
+}
+
+uint64_t getTypeMask(Type *Ty) {
+  auto const width = Ty->getIntegerBitWidth();  // Result has the low `width` bits set.
+  return width >= 64 ? ~uint64_t(0) : ((uint64_t(1) << width) - 1);
+}
+
+// The index size potentially depends on the address space of the pointer,
+// but let's just use the pointer size for now.
+uint64_t getSizeTypeMask(DataLayout const &DL) {
+  auto const width = DL.getPointerSizeInBits();  // Result has the low `width` bits set.
+  return width >= 64 ? ~uint64_t(0) : ((uint64_t(1) << width) - 1);
+}
+
+OffsetKind combineKinds(OffsetKind LHS, OffsetKind RHS) {
+  assert(LHS != eOffsetLinear && RHS != eOffsetLinear &&
+         "OffsetInfo analysis functions should handle all linear cases");
+
+  // Two identical kinds combine to that same kind.
+  if (LHS == RHS) {
+    return LHS;
+  }
+
+  // Divergence on either side wins. Failing that, uniform values are all
+  // that's left, and two different ones make a uniform variable.
+  bool const anyDiverges =
+      LHS == eOffsetMayDiverge || RHS == eOffsetMayDiverge;
+  return anyDiverges ? eOffsetMayDiverge : eOffsetUniformVariable;
+}
+}  // namespace
+
+OffsetInfo::OffsetInfo(StrideAnalysisResult &SAR, Value *V)
+    : Kind(eOffsetMayDiverge),  // Start from "may diverge" until analysis says otherwise.
+      ActualValue(V),
+      StrideInt(0),
+      ManifestStride(nullptr),
+      BitMask(~uint64_t(0)) {  // All bits initially assumed possibly set.
+  auto *const ty = V->getType();
+  if (ty->isIntegerTy()) {  // Integers are analysed as offsets, pointers as addresses.
+    analyze(V, SAR);
+  } else if (ty->isPointerTy()) {
+    analyzePtr(V, SAR);
+  } else {
+    setMayDiverge();  // Any other type is not analysed.
+  }
+}
+
+Value *OffsetInfo::getUniformValue() const {
+  return isUniform() ? ActualValue : nullptr;  // Null when the value is not uniform.
+}
+
+int64_t OffsetInfo::getValueAsConstantInt() const {
+  // cast<> requires ActualValue to be a ConstantInt; it is read sign-extended.
+  return cast<ConstantInt>(ActualValue)->getSExtValue();
+}
+
+bool OffsetInfo::isStrideConstantInt() const {
+  return (Kind == eOffsetLinear && StrideInt != 0);  // StrideInt == 0 means no constant stride is recorded.
+}
+
+int64_t OffsetInfo::getStrideAsConstantInt() const { return StrideInt; }  // 0 when no constant stride is recorded.
+
+OffsetInfo &OffsetInfo::setMayDiverge() { return setKind(eOffsetMayDiverge); }  // Returns *this for chaining.
+
+OffsetInfo &OffsetInfo::setStride(Value *Stride) {
+  // Record the numeric stride when Stride is a ConstantInt; StrideInt stays 0
+  // for a null or non-constant Stride.
+  auto *const constStride = dyn_cast_or_null<ConstantInt>(Stride);
+  StrideInt = constStride ? constStride->getSExtValue() : 0;
+
+  ManifestStride = Stride;
+  Kind = eOffsetLinear;
+  return *this;
+}
+
+OffsetInfo &OffsetInfo::setStride(int64_t Stride) {
+  if (Stride == 0) {  // A zero stride is classed as a uniform variable, not linear.
+    Kind = eOffsetUniformVariable;  // NOTE(review): StrideInt/ManifestStride keep their previous values here.
+  } else {
+    StrideInt = Stride;
+    ManifestStride = nullptr;  // No IR value for the stride is recorded on this path.
+    Kind = eOffsetLinear;
+  }
+  return *this;
+}
+
+OffsetInfo &OffsetInfo::setKind(OffsetKind K) {
+  Kind = K;  // Only the kind changes; stride and bit mask are left as they are.
+  return *this;  // Returned so callers can write `return setKind(...)`.
+}
+
+OffsetInfo &OffsetInfo::analyze(Value *Offset, StrideAnalysisResult &SAR) {
+  Type *OffsetTy = Offset->getType();
+  if (!OffsetTy->isIntegerTy() || OffsetTy->isVectorTy()) {
+    return setMayDiverge();
+  }
+
+  if (auto *const CInt = dyn_cast<ConstantInt>(Offset)) {
+    BitMask = CInt->getZExtValue();
+    return setKind(eOffsetConstant);
+  }
+  BitMask = getTypeMask(OffsetTy);
+
+  if (isa<Argument>(Offset)) {
+    return setKind(eOffsetUniformVariable);
+  }
+
+  Instruction *Ins = dyn_cast<Instruction>(Offset);
+  if (!Ins) {
+    return setMayDiverge();
+  }
+
+  // If we have a uniform value here we don't need to analyse any further.
+  if (!SAR.UVR.isVarying(Ins)) {
+    auto const &KB = computeKnownBits(Ins, SAR.F.getParent()->getDataLayout(),
+                                      0, &SAR.assumptions);
+    auto const bitWidth = OffsetTy->getIntegerBitWidth();
+
+    // We are interested in the bits that are not known to be zero.
+    BitMask &= ~KB.Zero.extractBitsAsZExtValue(bitWidth, 0);
+    return setKind(eOffsetUniformVariable);
+  }
+
+  // Analyse binary instructions.
+  if (BinaryOperator *BOp = dyn_cast<BinaryOperator>(Offset)) {
+    // Copy these values into local variables, because `SAR.analyze()` can
+    // invalidate any previously obtained references.
+    auto const LHS = SAR.analyze(BOp->getOperand(0));
+    auto const RHS = SAR.analyze(BOp->getOperand(1));
+    if (LHS.mayDiverge() || RHS.mayDiverge()) {
+      return setMayDiverge();
+    }
+
+    if (isa<OverflowingBinaryOperator>(BOp) && !BOp->hasNoUnsignedWrap()) {
+      // This operation can over/underflow, therefore all bets are off on
+      // which bits are on. We set it to all ones so a ZExt will catch it.
+      // SExt does not care since overflow is UB.
+      BitMask = ~uint64_t(0);
+    }
+
+    switch (BOp->getOpcode()) {
+      default:
+        return setMayDiverge();
+      case Instruction::Add:
+        return combineAdd(LHS, RHS);
+      case Instruction::Sub:
+        return combineSub(LHS, RHS);
+      case Instruction::And:
+        return combineAnd(LHS, RHS);
+      case Instruction::Or:
+        return combineOr(LHS, RHS);
+      case Instruction::Xor:
+        return combineXor(LHS, RHS);
+      case Instruction::Mul:
+        return combineMul(LHS, RHS);
+      case Instruction::Shl:
+        return combineShl(LHS, RHS);
+      case Instruction::AShr:
+        return combineAShr(LHS, RHS);
+    }
+  }
+
+  // Consider that integer casts cannot scale item IDs.
+  if (CastInst *Cast = dyn_cast<CastInst>(Offset)) {
+    auto const &Src = SAR.analyze(Cast->getOperand(0));
+    if (Src.mayDiverge()) {
+      return setMayDiverge();
+    }
+
+    // However, a Zero-extended offset can underflow.
+    if (isa<ZExtInst>(Cast)) {
+      // A zero-extended offset could underflow and result in an invalid base
+      // address, rendering the entire strided MemOp invalid, even when masked
+      // such that the read from the base address is not meant to execute.
+      // Note that we don't care about overflowing the index type.
+      auto const typeMask = getTypeMask(Cast->getSrcTy());
+      auto const bitMaskSized =
+          Src.BitMask & getSizeTypeMask(Cast->getModule()->getDataLayout());
+      if ((bitMaskSized & typeMask) != bitMaskSized) {
+        return setMayDiverge();
+      }
+      BitMask = Src.BitMask & typeMask;
+    } else if (isa<SExtInst>(Cast)) {
+      uint64_t widthMask = getTypeMask(Cast->getSrcTy());
+      uint64_t signMask = (widthMask >> 1) + 1;
+      if (Src.BitMask & signMask) {
+        // If it's possible for the source value to be negative, all of the
+        // bits in the extended value might be set.
+        BitMask = Src.BitMask | ~widthMask;
+      } else {
+        BitMask = Src.BitMask & widthMask;
+      }
+    } else {
+      // We don't truncate the bitmask here, since we don't know if it's going
+      // to be sign extended or zero extended later, which affects whether we
+      // can ignore overflow or not.
+      BitMask = Src.BitMask;
+    }
+    return copyStrideFrom(Src);
+  }
+
+  if (auto *Select = dyn_cast<SelectInst>(Offset)) {
+    if (SAR.UVR.isVarying(Select->getCondition())) {
+      return setMayDiverge();
+    }
+
+    // If the condition isn't varying and both operands have the same
+    // constant stride, the result will also have the same constant stride.
+    auto const LHS = SAR.analyze(Select->getOperand(1));
+    auto const RHS = SAR.analyze(Select->getOperand(2));
+    if (LHS.hasStride() && RHS.hasStride() && LHS.StrideInt == RHS.StrideInt &&
+        LHS.isStrideConstantInt()) {
+      return copyStrideFrom(LHS);
+    }
+    return setMayDiverge();
+  }
+
+  if (auto *Phi = dyn_cast<PHINode>(Offset)) {
+    if (auto *const CVal = Phi->hasConstantValue()) {
+      return copyStrideFrom(SAR.analyze(CVal));
+    }
+
+    auto NumIncoming = Phi->getNumIncomingValues();
+    if (NumIncoming == 1) {
+      // LCSSA Phi, just go right through it..
+      return copyStrideFrom(SAR.analyze(Phi->getIncomingValue(0)));
+    } else if (NumIncoming == 2) {
+      auto identifyIncrement = [&](Value *incoming) -> bool {
+        if (auto *BOp = dyn_cast<BinaryOperator>(incoming)) {
+          auto Opcode = BOp->getOpcode();
+          // If it's a simple loop iterator, the stride can be analyzed from the
+          // initial value.
+          return ((Opcode == Instruction::Add || Opcode == Instruction::Sub) &&
+                  BOp->getOperand(0) == Phi &&
+                  !SAR.UVR.isVarying(BOp->getOperand(1))) ||
+                 (Opcode == Instruction::Add && BOp->getOperand(1) == Phi &&
+                  !SAR.UVR.isVarying(BOp->getOperand(0)));
+        }
+        return false;
+      };
+
+      // Try the PHI node's incoming values both ways round.
+      if (identifyIncrement(Phi->getIncomingValue(1))) {
+        return copyStrideFrom(SAR.analyze(Phi->getIncomingValue(0)));
+      } else if (identifyIncrement(Phi->getIncomingValue(0))) {
+        return copyStrideFrom(SAR.analyze(Phi->getIncomingValue(1)));
+      }
+    }
+    return setMayDiverge();
+  }
+
+  // Analyse function calls.
+  if (CallInst *CI = dyn_cast<CallInst>(Offset)) {
+    auto const &BI = SAR.UVR.Ctx.builtins();
+    auto const Builtin = BI.analyzeBuiltinCall(*CI, SAR.UVR.dimension);
+    switch (Builtin.uniformity) {
+      default:
+      case compiler::utils::eBuiltinUniformityMaybeInstanceID:
+      case compiler::utils::eBuiltinUniformityNever:
+        return setMayDiverge();
+      case compiler::utils::eBuiltinUniformityLikeInputs:
+        break;
+      case compiler::utils::eBuiltinUniformityAlways:
+        return setKind(eOffsetUniformVariable);
+      case compiler::utils::eBuiltinUniformityInstanceID:
+        if (Builtin.properties & compiler::utils::eBuiltinPropertyLocalID) {
+          // If the local size is unknown (represented by zero), the
+          // resulting mask will be ~0ULL (all ones). Potentially, it is
+          // possible to use the CL_DEVICE_MAX_WORK_ITEM_SIZES
+          // property as an upper bound in this case.
+          uint64_t LocalBitMask = SAR.UVR.VU.getLocalSize() - 1;
+          LocalBitMask |= LocalBitMask >> 32;
+          LocalBitMask |= LocalBitMask >> 16;
+          LocalBitMask |= LocalBitMask >> 8;
+          LocalBitMask |= LocalBitMask >> 4;
+          LocalBitMask |= LocalBitMask >> 2;
+          LocalBitMask |= LocalBitMask >> 1;
+          BitMask = LocalBitMask;
+        }
+        return setStride(1);
+    }
+  }
+
+  return setMayDiverge();
+}
+
+// Classifies the pointer `Address` by how its value changes from one
+// work-item to the next, records the verdict in this object and returns it.
+//
+// - Bitcasts, address space casts and inttoptr take the result of their
+//   operand.
+// - Arguments, global variables and any address the uniform value analysis
+//   does not report as varying are uniform, except "pointer return" arguments
+//   which get a constant stride of the pointee type's alloc size.
+// - Allocas that do not need instantiation get a constant stride of the
+//   allocated type's alloc size.
+// - PHI nodes are traced through when they have a single constant value, or
+//   when they are a two-input loop pointer stepped by a GEP whose indices are
+//   all uniform.
+// - GEPs accumulate the base pointer's stride plus each index's stride scaled
+//   by the alloc size of the type that index steps over.
+// - Selects on a non-varying condition keep a constant stride that is common
+//   to both operands.
+// Anything else is marked as possibly divergent.
+//
+// Address: the pointer value to classify.
+// SAR: the stride analysis result, used to analyze operands and to query the
+//      uniform value analysis (SAR.UVR).
+OffsetInfo &OffsetInfo::analyzePtr(Value *Address, StrideAnalysisResult &SAR) {
+  if (BitCastInst *BCast = dyn_cast<BitCastInst>(Address)) {
+    return copyStrideFrom(SAR.analyze(BCast->getOperand(0)));
+  } else if (auto *ASCast = dyn_cast<AddrSpaceCastInst>(Address)) {
+    return copyStrideFrom(SAR.analyze(ASCast->getOperand(0)));
+  } else if (auto *IntPtr = dyn_cast<IntToPtrInst>(Address)) {
+    return copyStrideFrom(SAR.analyze(IntPtr->getOperand(0)));
+  } else if (auto *Arg = dyn_cast<Argument>(Address)) {
+    // 'Pointer return' arguments should be treated as having an implicit ItemID
+    // offset. This allows memory operations to be packetized instead of
+    // instantiated.
+    if (Arg->getType()->isPointerTy()) {
+      for (const VectorizerTargetArgument &VUArg : SAR.UVR.VU.arguments()) {
+        if (((VUArg.OldArg == Arg) || (VUArg.NewArg == Arg)) &&
+            VUArg.PointerRetPointeeTy) {
+          Type *MemTy = VUArg.PointerRetPointeeTy;
+          uint64_t MemSize = SAR.UVR.Ctx.dataLayout()->getTypeAllocSize(MemTy);
+          return setStride(MemSize);
+        }
+      }
+    }
+    return setKind(eOffsetUniformVariable);
+  } else if (isa<GlobalVariable>(Address)) {
+    return setKind(eOffsetUniformVariable);
+  } else if (!SAR.UVR.isVarying(Address)) {
+    // If it's uniform we can just return the uniform address.
+    // Check this condition before bothering to descend into Phi nodes or GEPs,
+    // since we know stride is zero anyway.
+    return setKind(eOffsetUniformVariable);
+  } else if (auto *const Alloca = dyn_cast<AllocaInst>(Address)) {
+    if (needsInstantiation(SAR.UVR.Ctx, *Alloca)) {
+      // Instantiated allocas result in scatter/gather
+      return setMayDiverge();
+    }
+
+    Type *MemTy = Alloca->getAllocatedType();
+    uint64_t MemSize = SAR.UVR.Ctx.dataLayout()->getTypeAllocSize(MemTy);
+    return setStride(MemSize);
+  } else if (auto *const Phi = dyn_cast<PHINode>(Address)) {
+    // If all the incoming values are the same, we can trace through it. In
+    // the general case, it's not trivial to check that the stride is the same
+    // from every incoming block, and since incoming values may not dominate
+    // the IRBuilder insert point, we might not even be able to build the
+    // offset expression instructions there.
+    if (auto *const CVal = Phi->hasConstantValue()) {
+      return copyStrideFrom(SAR.analyze(CVal));
+    }
+
+    // In the simple case of a loop-incremented pointer using a GEP, we can
+    // handle it thus:
+    auto NumIncoming = Phi->getNumIncomingValues();
+    if (NumIncoming != 2) {
+      // Perhaps we can handle more than one loop latch, but not yet.
+      return setMayDiverge();
+    }
+
+    // Returns `Incoming` as a GEP if it steps this very PHI node, i.e. if it
+    // is the loop increment; returns null otherwise.
+    auto getIncrement = [Phi](Value *Incoming) -> GetElementPtrInst * {
+      auto *const IncGEP = dyn_cast<GetElementPtrInst>(Incoming);
+      return (IncGEP && IncGEP->getPointerOperand() == Phi) ? IncGEP : nullptr;
+    };
+
+    // Try the PHI node's incoming values both ways round. The second incoming
+    // value is tried first. If it is not the increment (even if it happens to
+    // be some unrelated GEP, such as the one computing the initial pointer),
+    // the first incoming value is tried instead.
+    unsigned InitialIdx = 0;
+    auto *Increment = getIncrement(Phi->getIncomingValue(1));
+    if (!Increment) {
+      InitialIdx = 1;
+      Increment = getIncrement(Phi->getIncomingValue(0));
+    }
+    if (!Increment) {
+      return setMayDiverge();
+    }
+
+    // If it's a simple loop iterator, the stride can be analyzed from the
+    // initial value, provided that the step itself does not vary between
+    // work-items.
+    for (auto const &index : Increment->indices()) {
+      if (SAR.UVR.isVarying(index.get())) {
+        return setMayDiverge();
+      }
+    }
+    return copyStrideFrom(SAR.analyze(Phi->getIncomingValue(InitialIdx)));
+  } else if (auto *GEP = dyn_cast<GetElementPtrInst>(Address)) {
+    {
+      auto *const Ptr = GEP->getPointerOperand();
+      auto const &PtrInfo = SAR.analyze(Ptr);
+      if (PtrInfo.mayDiverge()) {
+        if (isa<SelectInst>(Ptr)) {
+          // For the benefit of the Ternary Transform Pass
+          for (Value *idx : GEP->indices()) {
+            SAR.analyze(idx);
+          }
+        }
+        return setMayDiverge();
+      }
+      // Start from the base pointer's result; the indices are added on below.
+      copyStrideFrom(PtrInfo);
+    }
+
+    PointerType *GEPPtrTy = dyn_cast<PointerType>(GEP->getPointerOperandType());
+    if (!GEPPtrTy) {
+      // A GEP base can be a vector of pointers, for instance. (Unexpected!)
+      return setMayDiverge();
+    }
+
+    // Running total of the constant strides, in bytes. A linear base pointer
+    // whose StrideInt is zero has a stride that is not a known constant, which
+    // makes the whole GEP's stride variable.
+    int64_t GEPStrideInt = StrideInt;
+    bool StrideVariable = (hasStride() && StrideInt == 0);
+    SmallVector<Value *, 4> Indices;
+    for (unsigned i = 0; i < GEP->getNumIndices(); i++) {
+      // Analyze each GEP offset.
+      Value *GEPIndex = GEP->getOperand(1 + i);
+      assert(GEPIndex && "Could not get operand from GEP");
+
+      auto const &idxOffset = SAR.analyze(GEPIndex);
+      if (idxOffset.mayDiverge()) {
+        return setMayDiverge();
+      }
+
+      Indices.push_back(GEPIndex);
+      if (!idxOffset.hasStride()) {
+        // Indices without a stride do not contribute to the GEP's stride.
+        continue;
+      }
+
+      // The type reached by the indices seen so far; its alloc size is what
+      // one step of this index is scaled by.
+      Type *MemTy = GetElementPtrInst::getIndexedType(
+          GEP->getSourceElementType(), Indices);
+      if (!MemTy) {
+        // A somewhat unlikely scenario...?
+        return setMayDiverge();
+      }
+
+      if (idxOffset.isStrideConstantInt()) {
+        // Add all the strides together,
+        // since `Base + (A * X) + (B * X) == Base + (A + B) * X`
+        uint64_t MemSize = SizeOrZero(
+            GEP->getModule()->getDataLayout().getTypeAllocSize(MemTy));
+        GEPStrideInt += idxOffset.StrideInt * MemSize;
+      } else {
+        StrideVariable = true;
+      }
+    }
+
+    if (StrideVariable) {
+      // We don't know what the stride is yet,
+      // but we know it's linear and variable.
+      setStride(nullptr);
+    } else {
+      setStride(GEPStrideInt);
+    }
+    return *this;
+  } else if (auto *Select = dyn_cast<SelectInst>(Address)) {
+    auto const LHS = SAR.analyze(Select->getOperand(1));
+    auto const RHS = SAR.analyze(Select->getOperand(2));
+    if (SAR.UVR.isVarying(Select->getCondition())) {
+      // Note that we analyze the operands before returning here, for the
+      // benefit of the Ternary Transform Pass, which does its work ONLY
+      // when the condition is varying.
+      return setMayDiverge();
+    }
+
+    // If the condition isn't varying and both operands have the same
+    // constant stride, the result will also have the same constant stride.
+    if (LHS.hasStride() && RHS.hasStride() && LHS.StrideInt == RHS.StrideInt &&
+        LHS.isStrideConstantInt()) {
+      return copyStrideFrom(LHS);
+    }
+    return setMayDiverge();
+  }
+
+  // If it's varying we can't analyze it any further.
+  return setMayDiverge();
+}
+
+// Materializes the stride of a linear offset as an IR value stored in
+// ManifestStride, and returns *this.
+//
+// Nothing is done if a stride value already exists or if this offset is not
+// linear. A non-zero constant stride becomes a constant directly. Otherwise
+// the stride is rebuilt by walking the instruction that defines the value
+// (binary operator, cast, PHI node or GEP) and manifesting its operands first.
+//
+// B: the builder used to create stride instructions. Note that its insert
+//    point is moved by this function (see the SetInsertPoint calls below).
+// SAR: the stride analysis result used to manifest operand strides.
+OffsetInfo &OffsetInfo::manifest(IRBuilder<> &B, StrideAnalysisResult &SAR) {
+  if (ManifestStride || Kind != eOffsetLinear) {
+    // If we already manifested the stride, or if it's not a linear value,
+    // there is nothing to do.
+    return *this;
+  }
+
+  if (StrideInt != 0) {
+    // It's an integer stride so we can just create a `ConstantInt`.
+    ManifestStride = getSizeInt(B, StrideInt);
+    return *this;
+  }
+
+  // From here on the stride is linear but not a known constant, so it has to
+  // be derived from the defining instruction.
+  Instruction *Offset = cast<Instruction>(ActualValue);
+  // Analyse binary instructions.
+  if (BinaryOperator *BOp = dyn_cast<BinaryOperator>(Offset)) {
+    auto const &LHS = SAR.manifest(B, BOp->getOperand(0));
+    auto const &RHS = SAR.manifest(B, BOp->getOperand(1));
+
+    // Build strides immediately before their instructions
+    B.SetInsertPoint(BOp);
+    switch (BOp->getOpcode()) {
+      default:
+        return *this;
+      case Instruction::Add:
+        return manifestAdd(B, LHS, RHS);
+      case Instruction::Sub:
+        return manifestSub(B, LHS, RHS);
+      case Instruction::And:
+        return manifestAnd(B, LHS, RHS);
+      case Instruction::Or:
+        return manifestOr(B, LHS, RHS);
+      case Instruction::Xor:
+        return manifestXor(B, LHS, RHS);
+      case Instruction::Mul:
+        return manifestMul(B, LHS, RHS);
+      case Instruction::Shl:
+        return manifestShl(B, LHS, RHS);
+      case Instruction::AShr:
+        return manifestAShr(B, LHS, RHS);
+    }
+  }
+
+  // Consider that integer casts cannot scale item IDs.
+  if (CastInst *Cast = dyn_cast<CastInst>(Offset)) {
+    return copyStrideFrom(SAR.manifest(B, Cast->getOperand(0)));
+  }
+
+  if (auto *Phi = dyn_cast<PHINode>(Offset)) {
+    // Find the incoming value that the stride is taken from.
+    auto NumIncoming = Phi->getNumIncomingValues();
+    Value *SrcVal = nullptr;
+    if (NumIncoming == 1) {
+      // LCSSA Phi, just go right through it..
+      SrcVal = Phi->getIncomingValue(0);
+    } else if (auto *const CVal = Phi->hasConstantValue()) {
+      SrcVal = CVal;
+    } else if (NumIncoming == 2) {
+      auto identifyIncrement = [&](Value *incoming) -> bool {
+        if (auto *BOp = dyn_cast<BinaryOperator>(incoming)) {
+          // If this consumes the Phi node, we have found the increment.
+          return BOp->getOperand(0) == Phi || BOp->getOperand(1) == Phi;
+        } else if (auto *GEP = dyn_cast<GetElementPtrInst>(incoming)) {
+          return GEP->getPointerOperand() == Phi;
+        }
+        return false;
+      };
+
+      // Try the PHI node's incoming values both ways round.
+      if (identifyIncrement(Phi->getIncomingValue(1))) {
+        SrcVal = Phi->getIncomingValue(0);
+      } else if (identifyIncrement(Phi->getIncomingValue(0))) {
+        SrcVal = Phi->getIncomingValue(1);
+      }
+    }
+    // NOTE(review): SrcVal is only checked by this assert, so a build with
+    // asserts disabled would pass a null value on to SAR.manifest. This
+    // presumably cannot happen for PHI nodes the analysis accepted - confirm.
+    assert(SrcVal && "Unexpected Phi node during stride manifestation");
+    return copyStrideFrom(SAR.manifest(B, SrcVal));
+  }
+
+  if (auto *GEP = dyn_cast<GetElementPtrInst>(Offset)) {
+    // Start from the base pointer's stride.
+    auto const &Ptr = SAR.manifest(B, GEP->getPointerOperand());
+    copyStrideFrom(Ptr);
+
+    PointerType *GEPPtrTy = dyn_cast<PointerType>(GEP->getPointerOperandType());
+    if (!GEPPtrTy) {
+      // A GEP base can be a vector of pointers, for instance. (Unexpected!)
+      return setMayDiverge();
+    }
+
+    // Sum of the byte strides contributed by the indices; stays null if no
+    // index has a stride.
+    Value *GEPStride = nullptr;
+    SmallVector<Value *, 4> Indices;
+    for (unsigned i = 0; i < GEP->getNumIndices(); i++) {
+      // Analyze each GEP offset.
+      Value *GEPIndex = GEP->getOperand(1 + i);
+      assert(GEPIndex && "Could not get operand from GEP");
+
+      auto const &idxOffset = SAR.manifest(B, GEPIndex);
+
+      Indices.push_back(GEPIndex);
+      if (!idxOffset.hasStride()) {
+        continue;
+      }
+
+      // The type reached by the indices seen so far; one step of this index
+      // covers its alloc size in bytes.
+      // NOTE(review): unlike analyzePtr, the result is not checked for null
+      // here - presumably analysis has already rejected that case; confirm.
+      Type *MemTy = GetElementPtrInst::getIndexedType(
+          GEP->getSourceElementType(), Indices);
+
+      // Build stride instructions immediately before the GEP. Note that the
+      // process of manifesting the indices can change the insert point.
+      B.SetInsertPoint(GEP);
+      Value *idxStride = nullptr;
+      uint64_t MemSize =
+          SizeOrZero(GEP->getModule()->getDataLayout().getTypeAllocSize(MemTy));
+      if (MemSize == 1) {
+        // Don't need to do anything if the size is 1
+        idxStride = idxOffset.ManifestStride;
+      } else {
+        if ((MemSize & (MemSize - 1)) == 0) {
+          // the size is a power of two, so shift to get the offset in bytes
+          auto *const SizeVal = getSizeInt(B, highbit(MemSize));
+          idxStride = B.CreateShl(idxOffset.ManifestStride, SizeVal);
+        } else {
+          // otherwise, multiply
+          auto *const SizeVal = getSizeInt(B, MemSize);
+          idxStride = B.CreateMul(idxOffset.ManifestStride, SizeVal);
+        }
+      }
+
+      // Add all the strides together,
+      // since `Base + (A * X) + (B * X) == Base + (A + B) * X`
+      if (GEPStride) {
+        GEPStride = B.CreateAdd(GEPStride, idxStride);
+      } else {
+        GEPStride = idxStride;
+      }
+    }
+
+    // NOTE(review): only the strides of the indices are summed here; the base
+    // pointer's stride copied above is replaced, not added to. That matches a
+    // uniform base pointer - confirm whether a linear base can reach this.
+    if (GEPStride) {
+      setStride(GEPStride);
+    }
+  }
+
+  return *this;
+}
+
+// Converts the constant byte stride into a stride counted in elements of
+// type `PtrEleTy`.
+//
+// PtrEleTy: the element type accessed through the pointer.
+// DL: the data layout used to obtain the element's alloc size.
+//
+// Returns zero if the byte stride is zero or is not a whole number of
+// elements. A negative stride is returned in two's complement form, i.e. it
+// reads back correctly when converted to a signed 64-bit integer.
+// VECZ_FAIL_IF bails out early if the element size is reported as zero.
+uint64_t OffsetInfo::getConstantMemoryStride(Type *PtrEleTy,
+                                             DataLayout const *DL) const {
+  uint64_t PtrEleSize = SizeOrZero(DL->getTypeAllocSize(PtrEleTy));
+  VECZ_FAIL_IF(!PtrEleSize);
+
+  // Strides can be negative (e.g. after subtracting an item ID), so the
+  // arithmetic must be done on signed values. Mixing the stride with the
+  // unsigned element size would convert a negative stride to a huge unsigned
+  // number and yield a meaningless remainder and quotient.
+  int64_t const ByteStride = static_cast<int64_t>(StrideInt);
+  int64_t const EleSize = static_cast<int64_t>(PtrEleSize);
+
+  // It's not a valid stride if it's not divisible by the element size.
+  // Can't generate a valid interleaved MemOp from it!
+  if (ByteStride != 0 && ByteStride % EleSize != 0) {
+    return 0;
+  }
+  return static_cast<uint64_t>(ByteStride / EleSize);
+}
+
+// Produces an IR value holding the stride counted in elements of type
+// `PtrEleTy`, derived from the manifested byte stride.
+//
+// B: the builder used to create the division or shift, at its current insert
+//    point.
+// PtrEleTy: the element type accessed through the pointer.
+// DL: the data layout used to obtain the element's alloc size.
+//
+// Returns null if no stride has been manifested, or if a constant byte stride
+// is not a whole number of elements. VECZ_FAIL_IF bails out early if the
+// element size is reported as zero.
+Value *OffsetInfo::buildMemoryStride(IRBuilder<> &B, Type *PtrEleTy,
+                                     DataLayout const *DL) const {
+  if (!ManifestStride) {
+    assert(Kind != eOffsetLinear &&
+           "buildMemoryStride: linear stride not manifest");
+    return nullptr;
+  }
+
+  uint64_t PtrEleSize = SizeOrZero(DL->getTypeAllocSize(PtrEleTy));
+  VECZ_FAIL_IF(!PtrEleSize);
+
+  // It's not a valid stride if it's not divisible by the element size.
+  // Can't generate a valid interleaved MemOp from it!
+  // The remainder is taken on signed values: strides can be negative, and
+  // converting a negative stride to unsigned would give a wrong remainder for
+  // element sizes that are not a power of two.
+  if (StrideInt != 0 &&
+      static_cast<int64_t>(StrideInt) % static_cast<int64_t>(PtrEleSize) != 0) {
+    return nullptr;
+  }
+
+  if ((PtrEleSize & (PtrEleSize - 1)) == 0) {
+    // Power-of-two element size: divide by shifting right.
+    auto ShiftVal = highbit(PtrEleSize);
+    // If the byte stride was itself built as `X << ShiftVal`, the element
+    // stride is simply X and no new instruction is needed.
+    if (auto *BinOp = dyn_cast<BinaryOperator>(ManifestStride)) {
+      if (BinOp->getOpcode() == Instruction::Shl) {
+        if (auto *ConstSize = dyn_cast<ConstantInt>(BinOp->getOperand(1))) {
+          if (ConstSize->getZExtValue() == ShiftVal) {
+            return BinOp->getOperand(0);
+          }
+        }
+      }
+    }
+
+    auto *const stride =
+        B.CreateAShr(ManifestStride, ConstantInt::get(getSizeTy(B), ShiftVal));
+    return stride;
+  } else {
+    // Likewise, if the byte stride was built as `X * PtrEleSize`, reuse X.
+    if (auto *BinOp = dyn_cast<BinaryOperator>(ManifestStride)) {
+      if (BinOp->getOpcode() == Instruction::Mul) {
+        if (auto *ConstSize = dyn_cast<ConstantInt>(BinOp->getOperand(1))) {
+          if (ConstSize->getZExtValue() == PtrEleSize) {
+            return BinOp->getOperand(0);
+          }
+        }
+      }
+    }
+
+    auto *const stride = B.CreateSDiv(
+        ManifestStride, ConstantInt::get(getSizeTy(B), PtrEleSize));
+    return stride;
+  }
+}
+
+// Derives the classification of `LHS + RHS`.
+//
+// The set of possibly-set bits is narrowed to the bits of either operand plus
+// the bits of the sum of both masks. Two linear operands give a linear result
+// whose stride is the sum of the strides when both are constant, and an
+// unknown (variable) stride otherwise. One linear operand passes its stride
+// through unchanged. With no linear operand only the kinds are combined.
+OffsetInfo &OffsetInfo::combineAdd(const OffsetInfo &LHS,
+                                   const OffsetInfo &RHS) {
+  auto const ReachableBits =
+      LHS.BitMask | RHS.BitMask | (LHS.BitMask + RHS.BitMask);
+  BitMask &= ReachableBits;
+
+  bool const LinearL = LHS.hasStride();
+  bool const LinearR = RHS.hasStride();
+
+  if (LinearL && LinearR) {
+    // Linear + Linear
+    bool const BothConstant =
+        LHS.isStrideConstantInt() && RHS.isStrideConstantInt();
+    return BothConstant ? setStride(LHS.StrideInt + RHS.StrideInt)
+                        : setStride(nullptr);
+  }
+  if (LinearL) {
+    // Linear + Uniform
+    return copyStrideFrom(LHS);
+  }
+  if (LinearR) {
+    // Uniform + Linear
+    return copyStrideFrom(RHS);
+  }
+
+  Kind = combineKinds(LHS.Kind, RHS.Kind);
+  return *this;
+}
+
+// Builds the stride value of `LHS + RHS` from the manifested operand strides.
+// Two linear operands are added with a new instruction; a single linear
+// operand is passed through; with none, this object is left untouched.
+OffsetInfo &OffsetInfo::manifestAdd(IRBuilder<> &B, const OffsetInfo &LHS,
+                                    const OffsetInfo &RHS) {
+  bool const LinearL = LHS.hasStride();
+  bool const LinearR = RHS.hasStride();
+
+  if (LinearL && LinearR) {
+    // Linear + Linear
+    return setStride(B.CreateAdd(LHS.ManifestStride, RHS.ManifestStride));
+  }
+  if (LinearL) {
+    // Linear + Uniform
+    return copyStrideFrom(LHS);
+  }
+  if (LinearR) {
+    // Uniform + Linear
+    return copyStrideFrom(RHS);
+  }
+  return *this;
+}
+
+// Derives the classification of `LHS - RHS`.
+//
+// A linear left operand minus a uniform one keeps the left stride. Whenever
+// the right operand is linear, the result is linear with the difference of
+// the strides (or the negated right stride if the left operand is uniform),
+// provided every stride involved is a known constant; otherwise the stride is
+// unknown (variable). With no linear operand only the kinds are combined.
+OffsetInfo &OffsetInfo::combineSub(const OffsetInfo &LHS,
+                                   const OffsetInfo &RHS) {
+  bool const LinearL = LHS.hasStride();
+  bool const LinearR = RHS.hasStride();
+
+  if (!LinearL && !LinearR) {
+    Kind = combineKinds(LHS.Kind, RHS.Kind);
+    return *this;
+  }
+
+  if (!LinearR) {
+    // Linear - Uniform
+    return copyStrideFrom(LHS);
+  }
+
+  // The right operand is linear from here on. The result stride is only a
+  // known constant if every linear operand has a constant stride.
+  bool const AllConstant = RHS.isStrideConstantInt() &&
+                           (!LinearL || LHS.isStrideConstantInt());
+  if (!AllConstant) {
+    return setStride(nullptr);
+  }
+
+  if (LinearL) {
+    // Linear - Linear
+    return setStride(LHS.StrideInt - RHS.StrideInt);
+  }
+  // Uniform - Linear
+  // Subtracting an item ID results in a negative stride.
+  return setStride(-RHS.StrideInt);
+}
+
+// Builds the stride value of `LHS - RHS` from the manifested operand strides.
+// A linear right operand yields either the difference of both strides or,
+// when the left operand has no stride, the negated right stride. A linear
+// left operand alone is passed through; with no linear operand this object is
+// left untouched.
+OffsetInfo &OffsetInfo::manifestSub(IRBuilder<> &B, const OffsetInfo &LHS,
+                                    const OffsetInfo &RHS) {
+  bool const LinearL = LHS.hasStride();
+
+  if (!RHS.hasStride()) {
+    // Linear - Uniform, or nothing linear at all
+    return LinearL ? copyStrideFrom(LHS) : *this;
+  }
+
+  if (LinearL) {
+    // Linear - Linear
+    return setStride(B.CreateSub(LHS.ManifestStride, RHS.ManifestStride));
+  }
+
+  // Uniform - Linear
+  // Subtracting an item ID results in a negative stride.
+  return setStride(B.CreateNeg(RHS.ManifestStride));
+}
+
+// Derives the classification of `LHS & RHS`.
+//
+// The possibly-set bits are the intersection of both operand masks. Masking
+// two linear values cannot be analyzed. Masking a linear value with a uniform
+// one keeps the stride only if the mask cannot clear any bit the linear value
+// might have set; otherwise the result may diverge. With no linear operand
+// only the kinds are combined.
+OffsetInfo &OffsetInfo::combineAnd(const OffsetInfo &LHS,
+                                   const OffsetInfo &RHS) {
+  BitMask = LHS.BitMask & RHS.BitMask;
+
+  bool const LinearL = LHS.hasStride();
+  bool const LinearR = RHS.hasStride();
+
+  if (!LinearL && !LinearR) {
+    Kind = combineKinds(LHS.Kind, RHS.Kind);
+    return *this;
+  }
+
+  if (LinearL && LinearR) {
+    // Linear & Linear -> can't analyze
+    return setMayDiverge();
+  }
+
+  // Exactly one operand is linear. If we didn't lose any of its bits, we can
+  // carry its stride over.
+  const OffsetInfo &Linear = LinearL ? LHS : RHS;
+  if (BitMask != Linear.BitMask) {
+    return setMayDiverge();
+  }
+  return copyStrideFrom(Linear);
+}
+
+// Takes the stride of `LHS & RHS` from whichever operand has one, preferring
+// the left operand. No instructions are created, so the builder is unused.
+OffsetInfo &OffsetInfo::manifestAnd(IRBuilder<> &, const OffsetInfo &LHS,
+                                    const OffsetInfo &RHS) {
+  const OffsetInfo *Linear = nullptr;
+  if (LHS.hasStride()) {
+    Linear = &LHS;
+  } else if (RHS.hasStride()) {
+    Linear = &RHS;
+  }
+  return Linear ? copyStrideFrom(*Linear) : *this;
+}
+
+// Derives the classification of `LHS | RHS`.
+//
+// When the operands cannot have any set bit in common the operation is
+// treated exactly like an addition. Otherwise a linear operand makes the
+// result possibly divergent, and without one the masks are merged and the
+// kinds combined.
+OffsetInfo &OffsetInfo::combineOr(const OffsetInfo &LHS,
+                                  const OffsetInfo &RHS) {
+  bool const Disjoint = (LHS.BitMask & RHS.BitMask) == 0;
+  if (Disjoint) {
+    // An Or is equivalent to an Add if the operands have no bits in common.
+    return combineAdd(LHS, RHS);
+  }
+
+  bool const AnyLinear = LHS.hasStride() || RHS.hasStride();
+  if (AnyLinear) {
+    return setMayDiverge();
+  }
+
+  BitMask = LHS.BitMask | RHS.BitMask;
+  Kind = combineKinds(LHS.Kind, RHS.Kind);
+  return *this;
+}
+
+// Builds the stride value of `LHS | RHS`. This is only meaningful when the
+// operands have no possibly-set bits in common, in which case the Or is
+// handled as an Add; otherwise this object is left untouched.
+OffsetInfo &OffsetInfo::manifestOr(IRBuilder<> &B, const OffsetInfo &LHS,
+                                   const OffsetInfo &RHS) {
+  bool const Disjoint = (LHS.BitMask & RHS.BitMask) == 0;
+  // An Or is equivalent to an Add if the operands have no bits in common.
+  return Disjoint ? manifestAdd(B, LHS, RHS) : *this;
+}
+
+// Derives the classification of `LHS ^ RHS`.
+//
+// When the operands cannot have any set bit in common the operation is
+// treated exactly like an addition. Otherwise a linear operand makes the
+// result possibly divergent, and without one the masks are merged and the
+// kinds combined.
+OffsetInfo &OffsetInfo::combineXor(const OffsetInfo &LHS,
+                                   const OffsetInfo &RHS) {
+  bool const Disjoint = (LHS.BitMask & RHS.BitMask) == 0;
+  if (Disjoint) {
+    // An Xor is equivalent to an Add if the operands have no bits in common.
+    return combineAdd(LHS, RHS);
+  }
+
+  bool const AnyLinear = LHS.hasStride() || RHS.hasStride();
+  if (AnyLinear) {
+    return setMayDiverge();
+  }
+
+  BitMask = LHS.BitMask | RHS.BitMask;
+  Kind = combineKinds(LHS.Kind, RHS.Kind);
+  return *this;
+}
+
+// Builds the stride value of `LHS ^ RHS`. This is only meaningful when the
+// operands have no possibly-set bits in common, in which case the Xor is
+// handled as an Add; otherwise this object is left untouched.
+OffsetInfo &OffsetInfo::manifestXor(IRBuilder<> &B, const OffsetInfo &LHS,
+                                    const OffsetInfo &RHS) {
+  bool const Disjoint = (LHS.BitMask & RHS.BitMask) == 0;
+  // An Xor is equivalent to an Add if the operands have no bits in common.
+  return Disjoint ? manifestAdd(B, LHS, RHS) : *this;
+}
+
+// Derives the classification of `LHS << RHS`.
+//
+// A linear shift amount cannot be analyzed. A linear left operand shifted by
+// a uniform amount stays linear: the stride is a known constant only if both
+// the incoming stride and the shift amount are constants, and is unknown
+// (variable) otherwise. With no linear operand only the kinds are combined.
+OffsetInfo &OffsetInfo::combineShl(const OffsetInfo &LHS,
+                                   const OffsetInfo &RHS) {
+  if (RHS.hasStride()) {
+    return setMayDiverge();
+  }
+
+  if (!LHS.hasStride()) {
+    Kind = combineKinds(LHS.Kind, RHS.Kind);
+    return *this;
+  }
+
+  auto *const Shift = RHS.getUniformValue();
+  if (!Shift) {
+    return setMayDiverge();
+  }
+
+  if (ConstantInt *CShift = dyn_cast<ConstantInt>(Shift)) {
+    auto const CVal = CShift->getZExtValue();
+    if (CVal >= 64) {
+      // Shifting a 64-bit mask or stride by 64 or more is undefined in C++,
+      // so there is nothing sensible to compute here.
+      return setMayDiverge();
+    }
+
+    BitMask = LHS.BitMask << CVal;
+    if (LHS.isStrideConstantInt()) {
+      // Shift in the unsigned domain: the stride may be negative, and left
+      // shifting a negative signed value is not portable.
+      auto const Shifted = static_cast<uint64_t>(LHS.StrideInt) << CVal;
+      return setStride(static_cast<int64_t>(Shifted));
+    }
+    // The incoming stride is not a known constant, so neither is the result.
+    // It must not be derived from StrideInt, which carries no information in
+    // that case.
+    return setStride(nullptr);
+  }
+
+  // Unknown shift amount: any bit might end up set.
+  BitMask = ~uint64_t(0);
+  return setStride(nullptr);
+}
+
+// Builds the stride value of `LHS << RHS` by shifting the manifested stride
+// of the left operand by the uniform shift amount, converted with
+// matchSizeType. Leaves this object untouched if the shift amount has no
+// uniform value or the left operand has no stride.
+OffsetInfo &OffsetInfo::manifestShl(IRBuilder<> &B, const OffsetInfo &LHS,
+                                    const OffsetInfo &RHS) {
+  auto *const ShiftAmount = RHS.getUniformValue();
+  if (!ShiftAmount || !LHS.hasStride()) {
+    return *this;
+  }
+
+  auto *const SizedShift = matchSizeType(B, ShiftAmount, false);
+  return setStride(B.CreateShl(LHS.ManifestStride, SizedShift));
+}
+
+// Derives the classification of `LHS >> RHS` (arithmetic shift right).
+//
+// A linear shift amount cannot be analyzed. A linear left operand is only
+// handled for a constant shift amount in the range [0, 64), and only when the
+// shift cannot drop bits that matter: either the constant stride or the
+// value's possibly-set bits must be clear in the positions shifted out.
+// Anything else may diverge. With no linear operand only the kinds are
+// combined.
+OffsetInfo &OffsetInfo::combineAShr(const OffsetInfo &LHS,
+                                    const OffsetInfo &RHS) {
+  if (RHS.hasStride()) {
+    return setMayDiverge();
+  }
+
+  if (!LHS.hasStride()) {
+    Kind = combineKinds(LHS.Kind, RHS.Kind);
+    return *this;
+  }
+
+  // We have to be careful with right shifts, because some bits of the stride
+  // could get shifted out of the right-hand-side, causing it not to be
+  // uniform anymore. Only constant shift amounts are considered.
+  if (!RHS.getUniformValue() || RHS.Kind != eOffsetConstant) {
+    return setMayDiverge();
+  }
+
+  auto const ShiftAmount = RHS.getValueAsConstantInt();
+  if (ShiftAmount < 0 || ShiftAmount >= 64) {
+    // Unlikely, but just in case..
+    return setMayDiverge();
+  }
+
+  // Note that we shift the bitmask as a signed value.
+  // Note also that the BitMask has been initialized to the width of the
+  // integer type, which is how the sign bit position is found.
+  uint64_t const WidthMask = BitMask;
+  uint64_t const SignBit = (WidthMask >> 1) + 1;
+  auto ShiftedMask = LHS.BitMask >> ShiftAmount;
+  if (LHS.BitMask & SignBit) {
+    // If it's possible for the source value to be negative, all of the
+    // bits shifted in from the top might be set.
+    ShiftedMask |= ~(WidthMask >> ShiftAmount);
+  }
+  BitMask &= ShiftedMask;
+
+  // The bit positions that the shift discards.
+  auto const DroppedBits = (uint64_t(1) << ShiftAmount) - 1;
+  bool const ValueHasDroppedBits = (LHS.BitMask & DroppedBits) != 0;
+
+  if (!LHS.isStrideConstantInt()) {
+    return ValueHasDroppedBits ? setMayDiverge() : setStride(nullptr);
+  }
+
+  if ((LHS.StrideInt & DroppedBits) == 0 || !ValueHasDroppedBits) {
+    return setStride(LHS.StrideInt >> ShiftAmount);
+  }
+  return setMayDiverge();
+}
+
+// Builds the stride value of `LHS >> RHS` (arithmetic shift right) for a
+// constant shift amount. A new shift is only created when the left stride is
+// not a known constant and none of the value's possibly-set bits fall in the
+// positions shifted out; otherwise this object is left untouched.
+OffsetInfo &OffsetInfo::manifestAShr(IRBuilder<> &B, const OffsetInfo &LHS,
+                                     const OffsetInfo &RHS) {
+  if (RHS.Kind != eOffsetConstant) {
+    return *this;
+  }
+
+  auto *const ShiftValue = RHS.getUniformValue();
+  auto const ShiftAmount = RHS.getValueAsConstantInt();
+
+  if (LHS.isStrideConstantInt()) {
+    return *this;
+  }
+
+  auto const DroppedBits = (uint64_t(1) << ShiftAmount) - 1;
+  if ((LHS.BitMask & DroppedBits) != 0) {
+    return *this;
+  }
+
+  auto *const SizedShift = matchSizeType(B, ShiftValue, false);
+  return setStride(B.CreateAShr(LHS.ManifestStride, SizedShift));
+}
+
+// Derives the classification of `LHS * RHS`.
+//
+// The product of two linear values is not linear, so it may diverge. A linear
+// value times a uniform one stays linear: the stride is the product of the
+// constant stride and the constant factor when both are known constants, and
+// unknown (variable) otherwise. With no linear operand only the kinds are
+// combined.
+OffsetInfo &OffsetInfo::combineMul(const OffsetInfo &LHS,
+                                   const OffsetInfo &RHS) {
+  bool const LinearL = LHS.hasStride();
+  bool const LinearR = RHS.hasStride();
+
+  if (LinearL && LinearR) {
+    // Linear * Linear = not Linear
+    return setMayDiverge();
+  }
+
+  if (!LinearL && !LinearR) {
+    Kind = combineKinds(LHS.Kind, RHS.Kind);
+    return *this;
+  }
+
+  // Exactly one operand is linear; the other one is the scale factor.
+  const OffsetInfo &Linear = LinearL ? LHS : RHS;
+  const OffsetInfo &Factor = LinearL ? RHS : LHS;
+
+  if (Linear.isStrideConstantInt() && Factor.Kind == eOffsetConstant) {
+    return setStride(Linear.StrideInt * Factor.getValueAsConstantInt());
+  }
+  return setStride(nullptr);
+}
+
+// Builds the stride value of `LHS * RHS` by multiplying the manifested stride
+// of the linear operand (the left one is preferred) by the uniform value of
+// the other operand, converted with matchSizeType. Leaves this object
+// untouched if neither operand has a stride or the other operand has no
+// uniform value.
+OffsetInfo &OffsetInfo::manifestMul(IRBuilder<> &B, const OffsetInfo &LHS,
+                                    const OffsetInfo &RHS) {
+  bool const LinearL = LHS.hasStride();
+  if (!LinearL && !RHS.hasStride()) {
+    return *this;
+  }
+
+  // Linear * Uniform, or Uniform * Linear
+  const OffsetInfo &Linear = LinearL ? LHS : RHS;
+  const OffsetInfo &Factor = LinearL ? RHS : LHS;
+
+  auto *const FactorValue = Factor.getUniformValue();
+  if (!FactorValue) {
+    return *this;
+  }
+
+  auto *const SizedFactor = matchSizeType(B, FactorValue, true);
+  return setStride(B.CreateMul(Linear.ManifestStride, SizedFactor));
+}
+
+// Adopts the stride description of `Other`: its kind, its constant stride and
+// its manifested stride value. Nothing else is copied; in particular the
+// BitMask of this object is left as it is. Returns *this.
+OffsetInfo &OffsetInfo::copyStrideFrom(const OffsetInfo &Other) {
+  ManifestStride = Other.ManifestStride;
+  StrideInt = Other.StrideInt;
+  Kind = Other.Kind;
+  return *this;
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp
new file mode 100644
index 0000000000000..4f532876c1ad1
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp
@@ -0,0 +1,253 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "vecz/pass.h"
+
+#include <compiler/utils/attributes.h>
+#include <compiler/utils/builtin_info.h>
+#include <compiler/utils/device_info.h>
+#include <compiler/utils/metadata.h>
+#include <compiler/utils/vectorization_factor.h>
+#include <llvm/IR/DebugInfoMetadata.h>
+#include <llvm/IR/LLVMContext.h>
+#include <llvm/IR/Module.h>
+#include <llvm/IR/PassManager.h>
+#include <llvm/Support/CommandLine.h>
+#include <llvm/Support/Debug.h>
+
+#include <cstdlib>
+#include <functional>
+#include <tuple>
+
+#include "vectorization_context.h"
+#include "vectorization_helpers.h"
+#include "vectorization_unit.h"
+#include "vectorizer.h"
+#include "vecz/vecz_choices.h"
+#include "vecz/vecz_target_info.h"
+#include "vecz_pass_builder.h"
+
+#define DEBUG_TYPE "vecz"
+
+using namespace llvm;
+
+/// @brief Provide debug logging for Vecz's PassManager
+///
+/// This flag is intended for testing and debugging purposes.
+cl::opt<bool> DebugVeczPipeline(
+    "debug-vecz-pipeline",
+    cl::desc("Enable debug logging of the vecz PassManager"));
+
+/// @brief Override the pass pipeline that Vecz runs
+///
+/// This flag specifies a textual description of the optimization pass pipeline
+/// to run over the kernel. When empty, the built-in pipeline is used instead.
+cl::opt<std::string> VeczPassPipeline(
+    "vecz-passes",
+    cl::desc(
+        "A textual description of the pass pipeline. To have analysis passes "
+        "available before a certain pass, add 'require<foo-analysis>'."));
+
+namespace vecz {
+using FnVectorizationResult =
+    std::pair<Function *, compiler::utils::VectorizationFactor>;
+AnalysisKey VeczPassOptionsAnalysis::Key;
+
+PreservedAnalyses RunVeczPass::run(Module &M, ModuleAnalysisManager &MAM) {
+  // Callback yielding the vectorization options requested for a function.
+  auto getVeczOptions = MAM.getResult<VeczPassOptionsAnalysis>(M);
+  // Cache the current set of functions as the vectorizer will insert new ones,
+  // which we don't want to revisit.
+  SmallVector<std::pair<Function *, llvm::SmallVector<VeczPassOptions, 1>>, 4>
+      FnOpts;
+  for (auto &Fn : M.functions()) {
+    llvm::SmallVector<VeczPassOptions, 1> Opts;
+    if (!getVeczOptions(Fn, MAM, Opts)) {
+      continue;
+    }
+    FnOpts.emplace_back(&Fn, std::move(Opts));
+  }
+
+  ModulePassManager PM;
+
+  auto &device_info = MAM.getResult<compiler::utils::DeviceInfoAnalysis>(M);
+  TargetInfo *target_info = MAM.getResult<vecz::TargetInfoAnalysis>(M);
+  assert(target_info && "Missing TargetInfo");
+  auto &builtin_info = MAM.getResult<compiler::utils::BuiltinInfoAnalysis>(M);
+
+  VectorizationContext Ctx(M, *target_info, builtin_info);
+  VeczPassMachinery Mach(M.getContext(), target_info->getTargetMachine(), Ctx,
+                         /*verifyEach*/ false,
+                         DebugVeczPipeline
+                             ? compiler::utils::DebugLogging::Normal
+                             : compiler::utils::DebugLogging::None);
+  Mach.initializeStart();
+  Mach.getMAM().registerPass([&device_info] {
+    return compiler::utils::DeviceInfoAnalysis(device_info);
+  });
+  Mach.initializeFinish();
+
+  // Forcibly compute the DeviceInfoAnalysis so that cached retrievals work.
+  PM.addPass(
+      RequireAnalysisPass<compiler::utils::DeviceInfoAnalysis, Module>());
+
+  // An empty -vecz-passes string selects the built-in pipeline.
+  bool const Check = VeczPassPipeline.empty();
+  if (Check) {
+    if (!buildPassPipeline(PM)) {
+      return PreservedAnalyses::all();
+    }
+  } else {
+    if (auto Err = Mach.getPB().parsePassPipeline(PM, VeczPassPipeline)) {
+      // NOTE this is a command line user error print, not a debug print.
+      // We may want to hoist this out of Vecz once CA-4134 is resolved.
+      errs() << "vecz pipeline: " << toString(std::move(Err)) << "\n";
+      return PreservedAnalyses::all();
+    }
+  }
+
+  // Create the vectorization units and clone the kernels
+  using ResultTy =
+      SmallVector<std::pair<VectorizationUnit *, VeczPassOptions *>, 2>;
+  SmallDenseMap<Function *, ResultTy, 2> Results;
+  for (auto &P : FnOpts) {
+    Function *Fn = P.first;
+    // Always register Fn, even when none of its units can be created.
+    auto &FnResults = Results[Fn];
+    for (auto &Opts : P.second) {
+      auto *const VU =
+          createVectorizationUnit(Ctx, Fn, Opts, Mach.getFAM(), Check);
+      if (!VU) {
+        LLVM_DEBUG(llvm::dbgs() << Fn->getName() << " was not vectorized\n");
+        continue;
+      }
+      FnResults.emplace_back(VU, &Opts);
+
+      if (auto *const VecFn = vecz::cloneFunctionToVector(*VU)) {
+        VU->setVectorizedFunction(VecFn);
+
+        // Allows the Vectorization Unit Analysis to work on the vector kernel
+        Ctx.setActiveVU(VecFn, VU);
+      } else {
+        LLVM_DEBUG(llvm::dbgs() << Fn->getName() << " could not be cloned\n");
+      }
+    }
+  }
+
+  // Vectorize everything
+  PM.run(M, Mach.getMAM());
+
+  auto AllOnModule = llvm::PreservedAnalyses::allInSet<AllAnalysesOn<Module>>();
+  auto eraseFailed = [&](VectorizationUnit *VU) {
+    Function *VectorizedFn = VU->vectorizedFunction();
+    if (VectorizedFn) {
+      // If we fail to vectorize a function, we still cloned and then
+      // deleted it which affects internal addresses. The module has changed
+      // and we can't cache any analyses.
+      Mach.getFAM().invalidate(*VectorizedFn, llvm::PreservedAnalyses::all());
+      // Remove the partially-vectorized function if something went wrong.
+      Ctx.clearActiveVU(VectorizedFn);
+      VU->setVectorizedFunction(nullptr);
+      VectorizedFn->eraseFromParent();
+    }
+    MAM.invalidate(M, AllOnModule);
+  };
+
+  // Fix up the metadata and clean out any dead kernels
+  for (auto &P : Results) {
+    Function *Fn = P.first;
+    auto &Result = P.second;
+    bool const IsKernel = compiler::utils::isKernel(*Fn);
+    bool DropScalarMDs = IsKernel && !Result.empty();
+    for (auto &R : Result) {
+      VectorizationUnit *VU = R.first;
+      trackVeczSuccessFailure(*VU);
+      if (!createVectorizedFunctionMetadata(*VU)) {
+        // We only drop the metadata from the scalar kernel when the number of
+        // Results is non-zero and they all succeeded
+        DropScalarMDs = false;
+        LLVM_DEBUG(dbgs() << Fn->getName() << " failed to vectorize\n");
+        eraseFailed(VU);
+      }
+    }
+    if (DropScalarMDs) {
+      compiler::utils::dropIsKernel(*Fn);
+    }
+  }
+  return PreservedAnalyses::none();
+}
+
+PreservedAnalyses VeczPassOptionsPrinterPass::run(Module &M,
+                                                  ModuleAnalysisManager &MAM) {
+  auto getVeczOptions = MAM.getResult<VeczPassOptionsAnalysis>(M);
+  for (auto &F : M.functions()) {
+    OS << "Function '" << F.getName() << "'";
+    llvm::SmallVector<VeczPassOptions, 1> Opts;
+    if (!getVeczOptions(F, MAM, Opts)) {
+      OS << " will not be vectorized\n";
+      continue;
+    }
+
+    OS << " will be vectorized {\n";
+    for (auto &O : Opts) {
+      OS << "  VF = ";
+      if (O.factor.isScalable()) {
+        OS << "vscale x ";
+      }
+      OS << O.factor.getKnownMin();
+
+      if (O.vecz_auto) {
+        OS << ", (auto)";
+      }
+
+      OS << ", vec-dim = " << O.vec_dim_idx;
+
+      if (O.local_size) {
+        OS << ", local-size = " << O.local_size;
+      }
+
+      OS << ", choices = [";
+      // List only the choices that are enabled in this set of options.
+      auto AvailChoices = VectorizationChoices::queryAvailableChoices();
+      unsigned NumChoices = 0;
+
+      for (auto &C : AvailChoices) {
+        if (!O.choices.isEnabled(C.number)) {
+          continue;
+        }
+        if (!NumChoices) {
+          OS << "\n    ";
+        } else {
+          OS << ",";
+        }
+        OS << C.name;
+        NumChoices++;
+      }
+      // Pretty-print the list of choices on one line if empty, else formatted
+      // across several lines. Always end with a newline, meaning the options
+      // are closed off with a '}' on the first column.
+      if (NumChoices) {
+        OS << "\n  ]\n";
+      } else {
+        OS << "]\n";
+      }
+    }
+    OS << "}\n";
+  }
+
+  // Printing does not modify the module.
+  return PreservedAnalyses::all();
+}
+
+}  // namespace vecz
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/passes.def b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/passes.def
new file mode 100644
index 0000000000000..4afe6cd9993e3
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/passes.def
@@ -0,0 +1,51 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// This is a simplified version of LLVM's llvm/lib/Passes/PassRegistry.def. It
+// outlines all vecz-specific passes (FIXME: not analyses).
+#ifndef MODULE_PASS
+#define MODULE_PASS(NAME, CREATE_PASS)
+#endif  // MODULE_PASS expands to nothing unless the includer defined it
+MODULE_PASS("builtin-inlining", BuiltinInliningPass())
+MODULE_PASS("define-builtins", DefineInternalBuiltinsPass())
+#undef MODULE_PASS  // must be defined again before the next inclusion
+
+#ifndef FUNCTION_PASS
+#define FUNCTION_PASS(NAME, CREATE_PASS)
+#endif  // FUNCTION_PASS expands to nothing unless the includer defined it
+FUNCTION_PASS("vecz-mem2reg", BasicMem2RegPass())
+FUNCTION_PASS("pre-linearize", PreLinearizePass())
+FUNCTION_PASS("remove-int-ptr", RemoveIntPtrPass())
+FUNCTION_PASS("squash-small-vecs", SquashSmallVectorsPass())
+FUNCTION_PASS("uniform-reassoc", UniformReassociationPass())
+FUNCTION_PASS("ternary-transform", TernaryTransformPass())
+FUNCTION_PASS("cfg-convert", ControlFlowConversionPass())
+FUNCTION_PASS("cleanup-divergence", DivergenceCleanupPass())
+FUNCTION_PASS("gep-elim", CommonGEPEliminationPass())
+FUNCTION_PASS("scalarize", ScalarizationPass())
+FUNCTION_PASS("mask-memops", SimplifyMaskedMemOpsPass())
+FUNCTION_PASS("packetizer", PacketizationPass())
+FUNCTION_PASS("inline-post-vecz", InlinePostVectorizationPass())
+FUNCTION_PASS("interleave-combine-loads", InterleavedGroupCombinePass(eInterleavedLoad))
+FUNCTION_PASS("interleave-combine-stores", InterleavedGroupCombinePass(eInterleavedStore))
+#undef FUNCTION_PASS  // must be defined again before the next inclusion
+
+#ifndef LOOP_PASS
+#define LOOP_PASS(NAME, CREATE_PASS)
+#endif  // LOOP_PASS expands to nothing unless the includer defined it
+LOOP_PASS("simplify-infinite-loops", SimplifyInfiniteLoopPass())
+LOOP_PASS("vecz-loop-rotate", VeczLoopRotatePass())
+#undef LOOP_PASS  // must be defined again before the next inclusion
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/reachability.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/reachability.cpp
new file mode 100644
index 0000000000000..6bc9efac517a8
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/reachability.cpp
@@ -0,0 +1,281 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "reachability.h"
+
+#include <llvm/ADT/DenseSet.h>
+#include <llvm/Analysis/LoopInfo.h>
+#include <llvm/Analysis/PostDominators.h>
+#include <llvm/IR/Dominators.h>
+#include <llvm/IR/Function.h>
+
+#include "debugging.h"
+
+#define DEBUG_TYPE "vecz-reachability"
+
+// HOW IT WORKS
+//
+// It builds two complementary topological sorts of the supplied basic blocks,
+// which it then uses to filter out obviously unreachable blocks as early as
+// possible. Where we have two blocks A and B and B has any topology index
+// less than that of A, then B is definitely not reachable from A. However,
+// if B has a higher index, it might be (but we have to check to be sure).
+//
+// For details on the above approach, see "Reachability Queries in Very Large
+// Graphs: A Fast Refined Online Search Approach" by
+// Renê R. Veloso, Loïc Cerf, Wagner Meira Jr, Mohammed J. Zaki.
+//
+// It also uses data from the Dominator Tree and Post Dominator Tree, in order
+// to skip ahead. If we want to know if B is reachable from A and we know
+// that C dominates B, if A->C is not ruled out by the topology indices then we
+// know there can be no path from A to B that does NOT go through C, therefore
+// we only need to check if C is reachable from A. The same follows in reverse
+// for Post Dominators.
+
+using namespace llvm;
+
+namespace vecz {
+
+Reachability::Reachability(DominatorTree &p_DT, PostDominatorTree &p_PDT,
+                           LoopInfo &p_LI)
+    : DT(p_DT), PDT(p_PDT), LI(p_LI) {}  // no graph is built here
+
+void Reachability::update(Function &F) {
+  // Lazy build: a graph that already has nodes is reused as-is.
+  const bool needsBuild = graph.empty();
+  if (needsBuild) recalculate(F);
+}
+
+void Reachability::clear() {
+  graph.clear();  // an empty graph makes the next update() rebuild it
+  indexMap.clear();
+}
+
+void Reachability::recalculate(Function &F) {
+  clear();  // always rebuild from scratch
+
+  indexMap.reserve(F.size());
+  graph.resize(F.size());
+  {
+    size_t i = 0;  // blocks are numbered in function order
+    for (auto &BB : F) {
+      indexMap[&BB] = i++;
+    }
+  }
+
+  for (auto &BB : F) {
+    auto &node = graph[indexMap[&BB]];
+
+    auto *const loop = LI.getLoopFor(&BB);
+    auto *const header = loop ? loop->getHeader() : nullptr;
+    for (BasicBlock *succ : successors(&BB)) {
+      if (succ == header) {  // skip edges back to the block's loop header
+        continue;
+      }
+
+      size_t succIndex = indexMap[succ];
+
+      node.successors.push_back(succIndex);
+      auto &succNode = graph[succIndex];
+      ++succNode.predecessors;
+    }
+    std::sort(node.successors.begin(), node.successors.end());
+
+    if (auto *DTNode = DT.getNode(&BB)) {
+      if (auto *IDom = DTNode->getIDom()) {
+        size_t dom = indexMap[IDom->getBlock()];
+        node.dom = dom;
+      }
+    }
+    if (auto *PDTNode = PDT.getNode(&BB)) {
+      if (auto *IPDom = PDTNode->getIDom()) {
+        auto *const IPDomBB = IPDom->getBlock();  // null for the virtual root
+        node.postDom = IPDomBB ? indexMap[IPDomBB] : ~size_t(0);
+      }
+    }
+  }
+
+  std::vector<size_t> roots;
+  size_t Xindex = 0;
+  size_t Yindex = 0;
+
+  // It would be surprising in fact if there was more than one root, because
+  // we only expect a single entry block for a function, however we deal with
+  // it for completeness, and in case this is required to be valid for some
+  // intermediate state.
+  {
+    size_t i = 0;
+    for (auto &node : graph) {
+      if (node.successors.empty()) {
+        node.postDom = ~size_t(0);  // ~0 marks "no post-dominator"
+      }
+      node.predTmp = node.predecessors;
+      if (node.predecessors == 0) {
+        roots.push_back(i);
+      }
+      ++i;
+    }
+  }
+  // A copy of the roots vector so we don't need to build it again when we come
+  // to construct the Y index.
+  std::vector<size_t> rootsY = roots;
+
+  // First topological order (X): a node is numbered once all of its
+  // predecessors have been numbered; the ready list is used as a stack.
+  while (!roots.empty()) {
+    size_t u = roots.back();
+    roots.pop_back();
+
+    auto &uNode = graph[u];
+    uNode.X = Xindex++;
+    for (size_t v : uNode.successors) {
+      auto &vNode = graph[v];
+      if (--vNode.predTmp == 0) {
+        roots.push_back(v);
+      }
+    }
+  }
+
+  for (auto &node : graph) {
+    node.predTmp = node.predecessors;  // reset the counters for the Y pass
+  }
+  roots.swap(rootsY);
+
+  // Y heap represents right-most vertices (max X)
+  auto cmpY = [this](size_t lhs, size_t rhs) -> bool {
+    return graph[lhs].X < graph[rhs].X;
+  };
+
+  // The vector of roots has strictly decreasing X index, so it already has
+  // the property of a max heap. No need to make_heap!
+  while (!roots.empty()) {
+    std::pop_heap(roots.begin(), roots.end(), cmpY);
+    size_t u = roots.back();
+    roots.pop_back();
+
+    auto &uNode = graph[u];
+    uNode.Y = Yindex++;
+    for (auto vi = uNode.successors.rbegin(), ve = uNode.successors.rend();
+         vi != ve; ++vi) {
+      size_t v = *vi;
+      auto &vNode = graph[v];
+      if (--vNode.predTmp == 0) {
+        roots.push_back(v);
+        std::push_heap(roots.begin(), roots.end(), cmpY);
+      }
+    }
+  }
+
+  LLVM_DEBUG({
+    size_t i = 0;
+    for (auto &BB : F) {
+      auto &node = graph[i];
+      dbgs() << BB.getName() << ":\n";
+      dbgs() << "[ " << node.X << ", " << node.Y << " ] : ";
+      dbgs() << "( " << node.dom << ", " << node.postDom << " ) : ";
+      for (size_t s : node.successors) {
+        if (graph[s].X <= graph[i].X) {
+          dbgs() << "!x!";
+        }
+        if (graph[s].Y <= graph[i].Y) {
+          dbgs() << "!y!";
+        }
+        dbgs() << s << "; ";
+      }
+      dbgs() << "\n\n";
+      ++i;
+    }
+  });
+
+  assert(validate() && "Topological indices not valid for reachability graph");
+}
+
+bool Reachability::validate() const {
+  // The indices are valid iff X and Y both strictly increase along every
+  // edge of the graph, i.e. no successor is ordered at or before its source.
+  return std::all_of(graph.begin(), graph.end(), [this](const auto &node) {
+    const auto &succs = node.successors;
+    return std::none_of(succs.begin(), succs.end(), [&](size_t succ) {
+      return graph[succ].X <= node.X || graph[succ].Y <= node.Y;
+    });
+  });
+}
+
+bool Reachability::isReachableImpl(size_t from, size_t to) const {
+  DenseSet<size_t> visited;      // successors already queued once
+  std::vector<size_t> worklist;  // candidate 'from' nodes still to explore
+
+  while (true) {
+    auto &nodeFrom = graph[from];
+    auto &nodeTo = graph[to];
+
+    if (nodeFrom.X > nodeTo.X || nodeFrom.Y > nodeTo.Y) {
+      return false;  // ruled out by the topological indices
+    }
+
+    size_t dom = nodeTo.dom;
+    size_t postDom = nodeFrom.postDom;
+    if (dom == from || postDom == to) {
+      return true;  // 'from' dominates 'to', or 'to' post-dominates 'from'
+    }
+
+    auto &nodeDom = graph[dom];
+    if (nodeFrom.X < nodeDom.X && nodeFrom.Y < nodeDom.Y) {
+      to = dom;  // every path to 'to' goes through its dominator
+      continue;
+    }
+
+    if (postDom != ~size_t(0)) {  // ~0 marks "no post-dominator"
+      auto &nodePDom = graph[postDom];
+      if (nodePDom.X < nodeTo.X && nodePDom.Y < nodeTo.Y) {
+        from = postDom;  // every path from 'from' goes through it
+        continue;
+      }
+    }
+
+    // Possible false positive, so search the successors with a worklist.
+    for (size_t succ : nodeFrom.successors) {
+      if (succ == to) {
+        return true;
+      }
+      auto &nodeSucc = graph[succ];
+      if (nodeSucc.X < nodeTo.X && nodeSucc.Y < nodeTo.Y) {
+        if (visited.insert(succ).second) {  // queue each successor only once
+          worklist.push_back(succ);
+        }
+      }
+    }
+    if (worklist.empty()) {
+      return false;  // nothing left to explore
+    }
+    from = worklist.back();
+    worklist.pop_back();
+  }
+  return false;  // not reached: the loop above only exits via return
+}
+
+bool Reachability::isReachable(BasicBlock *from, BasicBlock *to) const {
+  // Blocks that were never indexed are not part of the graph at all.
+  const auto fromIt = indexMap.find(from);
+  const auto toIt = indexMap.find(to);
+  if (fromIt == indexMap.end() || toIt == indexMap.end()) {
+    return false;
+  }
+  // An indexed block trivially reaches itself; otherwise search the graph.
+  if (from == to) {
+    return true;
+  }
+  return isReachableImpl(fromIt->second, toIt->second);
+}
+
+}  // namespace vecz
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/simd_packet.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/simd_packet.cpp
new file mode 100644
index 0000000000000..7c834d3f5cb5d
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/simd_packet.cpp
@@ -0,0 +1,55 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "simd_packet.h"
+
+#define DEBUG_TYPE "vecz-simd"
+
+using namespace llvm;
+using namespace vecz;
+
+SimdPacket::SimdPacket() : Mask(0) {}  // presumably: no lanes enabled yet
+
+llvm::Value *SimdPacket::at(unsigned Index) const {
+  // Bounds-checked lane access: an out-of-range index yields nullptr
+  // instead of indexing past the end of the packet.
+  const bool InRange = Index < size();
+  llvm::Value *const Lane = InRange ? (*this)[Index] : nullptr;
+  return Lane;
+}
+
+void SimdPacket::set(unsigned Index, Value *V) {
+  // Out-of-range lanes are silently ignored.
+  if (Index >= size()) return;
+  (*this)[Index] = V;
+  Mask.enable(Index);
+}
+
+SimdPacket &SimdPacket::update(const SimdPacket &Other) {
+  // Take over every lane that Other has enabled, then merge the masks.
+  for (unsigned Lane = 0, End = size(); Lane != End; ++Lane) {
+    if (!Other.Mask.isEnabled(Lane)) continue;
+    (*this)[Lane] = Other[Lane];
+  }
+  Mask.Value |= Other.Mask.Value;
+  return *this;
+}
+
+void PacketMask::enableAll(unsigned NumLanes) {
+  // Enable lanes [0, NumLanes) one at a time, in increasing order.
+  unsigned Lane = 0;
+  while (Lane < NumLanes) enable(Lane++);
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp
new file mode 100644
index 0000000000000..bfc937f441009
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp
@@ -0,0 +1,248 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <llvm/IR/DIBuilder.h>
+#include <llvm/IR/DebugInfo.h>
+#include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/IntrinsicInst.h>
+#include <llvm/IR/Module.h>
+#include <llvm/Support/Debug.h>
+#include <llvm/Support/raw_ostream.h>
+#include <llvm/Transforms/Utils/Local.h>
+
+#include "debugging.h"
+#include "transform/passes.h"
+
+using namespace llvm;
+using namespace vecz;
+
+#define DEBUG_TYPE "vecz-mem2reg"
+
+PreservedAnalyses BasicMem2RegPass::run(Function &F,
+                                        FunctionAnalysisManager &) {
+  LLVM_DEBUG(dbgs() << "\n\nVECZ MEM2REG on " << F.getName() << "\n");
+  if (F.empty()) {
+    return PreservedAnalyses::all();
+  }
+
+  // Collect the candidates up front: promotion erases instructions, which
+  // must not happen while the entry block is still being iterated.
+  SmallVector<AllocaInst *, 4> Candidates;
+  for (Instruction &Inst : F.getEntryBlock()) {
+    auto *const AI = dyn_cast<AllocaInst>(&Inst);
+    if (AI && canPromoteAlloca(AI)) {
+      Candidates.push_back(AI);
+    }
+  }
+
+  // Promote each candidate, erasing the alloca once its users are rewritten.
+  unsigned NumPromoted = 0;
+  for (AllocaInst *AI : Candidates) {
+    if (!promoteAlloca(AI)) {
+      continue;
+    }
+    LLVM_DEBUG(dbgs() << "VM2R: Promoted :" << *AI << "\n");
+    AI->eraseFromParent();
+    ++NumPromoted;
+  }
+  if (NumPromoted == 0) {
+    return PreservedAnalyses::all();
+  }
+
+  // Only instructions were rewritten, so CFG-only analyses stay valid.
+  PreservedAnalyses PA;
+  PA.preserveSet<CFGAnalyses>();
+  return PA;
+}
+
+bool BasicMem2RegPass::canPromoteAlloca(AllocaInst *Alloca) const {
+  BasicBlock *ParentBB = Alloca->getParent();
+  Function *F = ParentBB->getParent();
+  BasicBlock &EntryBB = F->getEntryBlock();
+  if (&EntryBB != ParentBB) {  // only entry-block allocas qualify
+    return false;
+  }
+
+  const unsigned SrcPointeeBits =
+      Alloca->getAllocatedType()->getPrimitiveSizeInBits();
+
+  if (SrcPointeeBits == 0) {  // allocated type has no primitive size
+    return false;
+  }
+
+  // Validate the alloca's users: one store, plus loads (directly or through
+  // a single-use pointer bitcast) of the same bit width as the alloca.
+  StoreInst *TheStore = nullptr;
+  SmallPtrSet<Value *, 4> NonStoreUsers;
+  for (User *U : Alloca->users()) {
+    if (StoreInst *Store = dyn_cast<StoreInst>(U)) {
+      // There can be at most one store.
+      if (TheStore) {
+        return false;
+      }
+      // Stores must be in the entry block.
+      if (Store->getParent() != &EntryBB) {
+        return false;
+      }
+      // Check if the store is actually storing a value *in* the alloca and not
+      // using the alloca itself as the value to be stored. For example, in the
+      // following IR code, the store can be used to promote p_639 but not
+      // c_640:
+      //
+      // %c_640 = alloca %struct.S2, align 16
+      // %p_639 = alloca %struct.S2*, align 8
+      // store %struct.S2* %c_640, %struct.S2** %p_639, align 8
+      //
+      // Also, if the alloca pointer is stored in some other variable, we can
+      // not promote the alloca as we need the pointer.
+      if (Store->getPointerOperand() != Alloca) {
+        return false;
+      }
+      // Everything is fine, use this store
+      TheStore = Store;
+    } else if (isa<LoadInst>(U)) {  // NOTE(review): volatile is not rejected
+      // The loaded type doesn't necessarily equal the alloca type when opaque
+      // pointers are involved:
+      //   %a = alloca i32
+      //   %v = load i16, ptr %a
+      // We can only promote the alloca if we can bitcast between the two
+      // underlying types as well.
+      // We could probably zero-extend or trunc if we had to? See CA-4382.
+      const unsigned DstPointeeBits = U->getType()->getPrimitiveSizeInBits();
+      if (!DstPointeeBits || SrcPointeeBits != DstPointeeBits) {
+        return false;
+      }
+      NonStoreUsers.insert(U);
+    } else if (BitCastInst *Cast = dyn_cast<BitCastInst>(U)) {
+      // The bitcast must be from one pointer type to another.
+      PointerType *SrcPtrTy = dyn_cast<PointerType>(Cast->getSrcTy());
+      PointerType *DstPtrTy = dyn_cast<PointerType>(Cast->getType());
+      if (!SrcPtrTy || !DstPtrTy) {
+        return false;
+      }
+      // The cast must have exactly one user, and that user must be a load.
+      if (!Cast->hasOneUse()) {
+        return false;
+      }
+      User *CastUser = *Cast->user_begin();
+      if (!isa<LoadInst>(CastUser)) {
+        return false;
+      }
+      // Since this is a bitcast, we can only promote the alloca if we can
+      // bitcast between the two underlying types as well.
+      const unsigned DstPointeeBits =
+          CastUser->getType()->getPrimitiveSizeInBits();
+      if (!DstPointeeBits || SrcPointeeBits != DstPointeeBits) {
+        return false;
+      }
+      NonStoreUsers.insert(U);  // track the cast itself, not its load
+    } else {
+      // Do not allow other kinds of users.
+      return false;
+    }
+  }
+
+  // If the alloca has no value stored into it, then there is no value to get
+  // and we can't promote it.
+  if (!TheStore) {
+    return false;
+  }
+
+  // The store must precede every other user found in the entry block.
+  for (Instruction &I : EntryBB) {
+    if (NonStoreUsers.count(&I)) {
+      return false;
+    } else if (&I == TheStore) {
+      break;
+    }
+  }
+
+  return true;
+}
+
+bool BasicMem2RegPass::promoteAlloca(AllocaInst *Alloca) const {
+  LLVM_DEBUG(dbgs() << "VM2R: NOW AT :" << *Alloca << "\n");
+  // Find the store that provides the value held by the alloca.
+  StoreInst *TheStore = nullptr;
+  for (User *U : Alloca->users()) {
+    if ((TheStore = dyn_cast<StoreInst>(U))) break;
+  }
+  assert(TheStore != nullptr && "Could not find value stored in alloca");
+  Value *const StoredValue = TheStore->getValueOperand();
+
+  // Collect every load (and the bitcast feeding it, if any) and validate it
+  // BEFORE touching the IR: a failed promotion must leave the function
+  // unmodified, since the caller then reports all analyses as preserved.
+  SmallVector<std::pair<LoadInst *, BitCastInst *>, 8> Loads;
+  for (User *U : Alloca->users()) {
+    if (isa<StoreInst>(U)) continue;
+    LoadInst *Load = dyn_cast<LoadInst>(U);
+    BitCastInst *const Cast = dyn_cast<BitCastInst>(U);
+    if (Cast) {
+      // We've already verified that a bitcast must have a load attached.
+      Load = cast<LoadInst>(*Cast->user_begin());
+      LLVM_DEBUG(dbgs() << "VM2R: Cast     :" << *Cast << "\n");
+    }
+    if (!Load) return false;
+    LLVM_DEBUG(dbgs() << "VM2R: Load     :" << *Load << "\n");
+    // We've already verified that the loaded type and the alloca size must be
+    // identical...
+    assert(Load->getType()->getPrimitiveSizeInBits() ==
+           Alloca->getAllocatedType()->getPrimitiveSizeInBits());
+    // ... but not that the stored value is the right size, e.g. an i16 stored
+    // to an `alloca i32` and loaded back as i32. We could do other things if
+    // the type sizes didn't match. See CA-4382.
+    if (Load->getType() != StoredValue->getType() &&
+        Load->getType()->getPrimitiveSizeInBits() !=
+            StoredValue->getType()->getPrimitiveSizeInBits()) {
+      return false;
+    }
+    Loads.emplace_back(Load, Cast);
+  }
+
+  // Promotion can no longer fail: convert the debug declarations.
+  DIBuilder DIB(*Alloca->getModule(), /*AllowUnresolved*/ false);
+  auto DbgIntrinsics = FindDbgAddrUses(Alloca);
+  for (auto oldDII : DbgIntrinsics) {
+    ConvertDebugDeclareToDebugValue(oldDII, TheStore, DIB);
+  }
+
+  // Replace each load with the stored value, cast to the loaded type if needed.
+  SmallVector<Instruction *, 8> ToDelete;
+  ToDelete.push_back(TheStore);
+  for (const auto &LoadAndCast : Loads) {
+    LoadInst *const Load = LoadAndCast.first;
+    Value *NewValue = StoredValue;
+    if (Load->getType() != NewValue->getType()) {
+      NewValue = CastInst::CreateBitOrPointerCast(StoredValue, Load->getType(),
+                                                  "", Load);
+    }
+    LLVM_DEBUG(dbgs() << "VM2R: Replaced :" << *Load << "\n");
+    LLVM_DEBUG(dbgs() << "      |-> with :" << *NewValue << "\n");
+    Load->replaceAllUsesWith(NewValue);
+    if (LoadAndCast.second) ToDelete.push_back(LoadAndCast.second);
+    ToDelete.push_back(Load);
+  }
+
+  // Clean up instructions bottom-up (users first).
+  while (!ToDelete.empty()) {
+    Instruction *I = ToDelete.pop_back_val();
+    if (I->use_empty()) {
+      LLVM_DEBUG(dbgs() << "VM2R: Deleted  :" << *I << "\n");
+      I->eraseFromParent();
+    }
+  }
+  return true;
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/builtin_inlining_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/builtin_inlining_pass.cpp
new file mode 100644
index 0000000000000..e81e2a0f32615
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/builtin_inlining_pass.cpp
@@ -0,0 +1,325 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <compiler/utils/builtin_info.h>
+#include <llvm/IR/Attributes.h>
+#include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/IntrinsicInst.h>
+#include <llvm/IR/LegacyPassManager.h>
+#include <llvm/IR/Module.h>
+#include <llvm/Transforms/IPO.h>
+#include <llvm/Transforms/IPO/AlwaysInliner.h>
+#include <multi_llvm/opaque_pointers.h>
+
+#include "analysis/vectorization_unit_analysis.h"
+#include "debugging.h"
+#include "transform/passes.h"
+
+using namespace llvm;
+using namespace vecz;
+
+PreservedAnalyses BuiltinInliningPass::run(Module &M,
+                                           ModuleAnalysisManager &AM) {
+  // Rewrites inlinable intrinsic calls and tags user calls for LLVM's inliner.
+  bool modified = false;
+  bool needToRunInliner = false;
+  llvm::FunctionAnalysisManager &FAM =
+      AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+  for (Function &F : M.functions()) {
+    // Per-function query, hoisted out of the block loop; skips declarations.
+    if (F.empty() ||
+        !FAM.getResult<VectorizationUnitAnalysis>(F).hasResult()) {
+      continue;
+    }
+    SmallVector<Instruction *, 4> ToDelete;
+    for (BasicBlock &BB : F) {
+      for (Instruction &I : BB) {
+        // Only call instructions can be builtins.
+        CallInst *CI = dyn_cast<CallInst>(&I);
+        if (!CI) {
+          continue;
+        }
+
+        bool NeedLLVMInline = false;
+        Value *NewCI = processCallSite(CI, NeedLLVMInline);
+        needToRunInliner |= NeedLLVMInline;
+        if ((NewCI == CI) || !NewCI) {
+          continue;
+        }
+        // Erasing is deferred so that the instruction iterator stays valid.
+        if (!CI->getType()->isVoidTy()) {
+          CI->replaceAllUsesWith(NewCI);
+        }
+        ToDelete.push_back(CI);
+        modified = true;
+      }
+    }
+    while (!ToDelete.empty()) {
+      ToDelete.pop_back_val()->eraseFromParent();
+    }
+  }
+
+  // Run the LLVM inliner if some calls were marked as needing inlining.
+  if (needToRunInliner) {
+    llvm::legacy::PassManager PM;
+    PM.add(llvm::createAlwaysInlinerLegacyPass());
+    modified |= PM.run(M);
+  }
+
+  // Recursively run the pass to inline any newly introduced functions.
+  if (modified) {
+    run(M, AM);
+  }
+
+  return modified ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
+
+/// @brief Expand a call to @llvm.memset into explicit stores.
+///
+/// Only calls with a constant length, a sufficiently aligned destination and
+/// an opaque or i8 destination pointer are expanded.
+///
+/// @param[in] F Function whose context and data layout are used.
+/// @param[in] B Builder used to emit the replacement instructions.
+/// @param[in] Args Call arguments: destination, value, length, volatile.
+/// @param[in] CB The call itself; must be a MemSetInst.
+/// @return The last store emitted, or nullptr if the call was not expanded.
+static Value *emitBuiltinMemSet(Function *F, IRBuilder<> &B,
+                                ArrayRef<Value *> Args, llvm::CallBase *CB) {
+  LLVMContext &Ctx = F->getContext();
+  auto &Layout = F->getParent()->getDataLayout();
+  const unsigned IndexBits = Layout.getPointerSizeInBits();
+
+  // The destination must be aligned well enough for the 64-bit stores below.
+  // @llvm.memset defines 0 and 1 to both mean no alignment.
+  const auto *MemSet = cast<MemSetInst>(CB);
+  const Align DstAlign = MemSet->getDestAlign().valueOrOne();
+  const Align I64ABIAlign = Layout.getABITypeAlign(B.getInt64Ty());
+  if (DstAlign < std::max(I64ABIAlign, Align(8u))) {
+    return nullptr;
+  }
+
+  Value *const Dst = Args[0];
+  auto *const DstTy = cast<PointerType>(Dst->getType());
+  Type *const I8Ty = B.getInt8Ty();
+  // FIXME: We implicitly assume pointers to i8 by doing byte-wise stores,
+  // below. See CA-4331.
+  const bool IsTypedNonBytePtr =
+      !DstTy->isOpaque() && multi_llvm::getPtrElementType(DstTy) != I8Ty;
+  if (IsTypedNonBytePtr) {
+    return nullptr;
+  }
+
+  Value *const ByteVal = Args[1];
+  const bool Volatile = Args.back() == ConstantInt::getTrue(Ctx);
+  // For nicely named IR instructions
+  const std::string Name = Dst->getName().str();
+
+  // We can only replace memset instructions if they have a constant length
+  auto *const LenC = dyn_cast<ConstantInt>(Args[2]);
+  if (!LenC) {
+    return nullptr;
+  }
+  const int64_t NumBytes = LenC->getValue().getZExtValue();
+
+  // Unlike memcpy, using 64-bit stores in memset requires the 64-bit value
+  // to be constructed from the 8-bit one.
+  Value *Splat64 = nullptr;
+  if (auto *const ByteC = dyn_cast<ConstantInt>(ByteVal)) {
+    // The value is available at compile time, so the 64-bit value is
+    // calculated at compile time as well.
+    const unsigned Raw = ByteC->getZExtValue();
+    APInt Bits(64, Raw);
+    for (int Step = 1; Raw && Step < 8; ++Step) {
+      Bits |= Bits << 8;
+    }
+    Splat64 = ConstantInt::get(Ctx, Bits);
+  } else {
+    // The value is not available at compile time, so instructions have to
+    // be emitted to generate it at runtime.
+    Splat64 = B.CreateZExt(ByteVal, Type::getInt64Ty(Ctx));
+    for (int Step = 1; Step < 8; ++Step) {
+      Value *const Shifted = B.CreateShl(
+          Splat64, llvm::ConstantInt::get(Ctx, llvm::APInt(64, 8)));
+      Splat64 = B.CreateOr(Splat64, Shifted);
+    }
+  }
+  Splat64->setName("ms64val");
+
+  // Emit enough stores to replicate the behaviour of memset. Initially we
+  // use 64bit stores, in order to avoid emitting too many instructions.
+  // We can't just get an Int64PtrTy because we need the correct address space
+  Type *const DstI64PtrTy = B.getInt64Ty()->getPointerTo(
+      cast<PointerType>(Dst->getType())->getAddressSpace());
+  llvm::StoreInst *Last = nullptr;
+  int64_t Off = 0;
+  while (Off <= NumBytes - 8) {
+    Value *const Idx = B.getIntN(IndexBits, Off);
+    Value *const Addr = B.CreateBitCast(
+        B.CreateInBoundsGEP(I8Ty, Dst, Idx), DstI64PtrTy, Name);
+    Last = B.CreateStore(Splat64, Addr, Volatile);
+
+    // The first store takes the intrinsic's alignment; the others take the
+    // minimum of that and what is required for 8 byte stores.
+    Last->setAlignment(Off == 0 ? DstAlign : std::min(Align(8u), DstAlign));
+    Off += 8;
+  }
+  // ...and then we fill in the remaining with 8bit stores.
+  for (; Off < NumBytes; ++Off) {
+    Value *const Idx = B.getIntN(IndexBits, Off);
+    Value *const Addr = B.CreateInBoundsGEP(I8Ty, Dst, Idx, Name);
+    Last = B.CreateStore(ByteVal, Addr, Volatile);
+    Last->setAlignment(llvm::Align(1));
+  }
+
+  return Last;
+}
+
+/// @brief Expand a call to @llvm.memcpy into explicit loads and stores.
+///
+/// @param[in] F Function whose context and data layout are used.
+/// @param[in] B Builder used to emit the replacement instructions.
+/// @param[in] Args Call arguments: destination, source, length, volatile.
+/// @param[in] CB The call itself; must be a MemCpyInst.
+/// @return The last store emitted, or nullptr if the call was not expanded.
+static Value *emitBuiltinMemCpy(Function *F, IRBuilder<> &B,
+                                ArrayRef<Value *> Args, llvm::CallBase *CB) {
+  LLVMContext &Context = F->getContext();
+  auto &DL = F->getParent()->getDataLayout();
+
+  // Both pointers must be aligned well enough for the 64-bit accesses below.
+  const auto &MCI = cast<MemCpyInst>(CB);
+  Align DestAlignment = MCI->getDestAlign().valueOrOne();
+  Align SourceAlignment = MCI->getSourceAlign().valueOrOne();
+  Align MinAlignment = std::max(DL.getABITypeAlign(B.getInt64Ty()), Align(8u));
+  if (DestAlignment < MinAlignment || SourceAlignment < MinAlignment) {
+    return nullptr;
+  }
+
+  unsigned PtrBits = DL.getPointerSizeInBits();
+  Value *DstPtr = Args[0];
+  Value *SrcPtr = Args[1];
+  auto *DstPtrTy = cast<PointerType>(DstPtr->getType());
+  auto *SrcPtrTy = cast<PointerType>(SrcPtr->getType());
+
+  Type *Int8Ty = B.getInt8Ty();
+  // FIXME: We implicitly assume pointers to i8 by doing byte-wise loads and
+  // stores, below. See CA-4331.
+  if ((!DstPtrTy->isOpaque() &&
+       multi_llvm::getPtrElementType(DstPtrTy) != Int8Ty) ||
+      ((!SrcPtrTy->isOpaque() &&
+        multi_llvm::getPtrElementType(SrcPtrTy) != Int8Ty))) {
+    return nullptr;
+  }
+
+  bool IsVolatile = (Args.back() == ConstantInt::getTrue(Context));
+  llvm::StoreInst *MC = nullptr;
+
+  // For nicely named IR instructions
+  const std::string DstName = DstPtr->getName().str();
+  const std::string SrcName = SrcPtr->getName().str();
+
+  // We can only replace memcpy instructions if they have a constant length
+  ConstantInt *CL = dyn_cast<ConstantInt>(Args[2]);
+  if (!CL) {
+    return nullptr;
+  }
+  int64_t Length = CL->getValue().getSExtValue();
+
+  // Emit enough stores to replicate the behaviour of memcpy.
+  int64_t byte = 0;
+  // Initially we use 64bit loads and stores, in order to avoid emitting too
+  // many instructions...
+  // We can't just get an Int64PtrTy because we need the correct address space
+  Type *Int64Ty = B.getInt64Ty();
+  Type *SrcInt64PtrTy = Int64Ty->getPointerTo(
+      cast<PointerType>(SrcPtr->getType())->getAddressSpace());
+  Type *DstInt64PtrTy = Int64Ty->getPointerTo(
+      cast<PointerType>(DstPtr->getType())->getAddressSpace());
+
+  for (; byte <= Length - 8; byte += 8) {
+    Value *Idx = B.getIntN(PtrBits, byte);
+    Value *OffsetSrcPtr = B.CreateBitCast(
+        B.CreateInBoundsGEP(Int8Ty, SrcPtr, Idx), SrcInt64PtrTy);
+    Value *OffsetDstPtr = B.CreateBitCast(
+        B.CreateInBoundsGEP(Int8Ty, DstPtr, Idx), DstInt64PtrTy, DstName);
+    LoadInst *LoadValue =
+        B.CreateLoad(Int64Ty, OffsetSrcPtr, IsVolatile, SrcName);
+    MC = B.CreateStore(LoadValue, OffsetDstPtr, IsVolatile);
+
+    // Each access uses its own pointer's alignment: the full alignment from
+    // the instruction at offset 0, capped at 8 bytes for every later offset.
+    Align StoreAlign =
+        byte == 0 ? DestAlignment : std::min(Align(8u), DestAlignment);
+    MC->setAlignment(StoreAlign);
+    Align LoadAlign =
+        byte == 0 ? SourceAlignment : std::min(Align(8u), SourceAlignment);
+    LoadValue->setAlignment(LoadAlign);
+  }
+  // ...and then we fill in the remaining with 8bit stores.
+  for (; byte < Length; byte += 1) {
+    Value *Idx = B.getIntN(PtrBits, byte);
+    Value *OffsetSrcPtr = B.CreateInBoundsGEP(Int8Ty, SrcPtr, Idx);
+    Value *OffsetDstPtr = B.CreateInBoundsGEP(Int8Ty, DstPtr, Idx, DstName);
+    LoadInst *LoadValue =
+        B.CreateLoad(Int8Ty, OffsetSrcPtr, IsVolatile, SrcName);
+    MC = B.CreateStore(LoadValue, OffsetDstPtr, IsVolatile);
+    LoadValue->setAlignment(llvm::Align(1));
+    MC->setAlignment(llvm::Align(1));
+  }
+
+  return MC;
+}
+
+/// @brief Decide how a single call site is to be inlined.
+///
+/// @param[in] CI Call instruction to process.
+/// @param[out] NeedLLVMInline Set to true if the call was tagged for LLVM's
+/// inliner, false otherwise.
+/// @return The value that replaces the call, or CI if it was left in place.
+Value *BuiltinInliningPass::processCallSite(CallInst *CI,
+                                            bool &NeedLLVMInline) {
+  NeedLLVMInline = false;
+
+  // Without a known callee there is nothing to do.
+  Function *const Target = CI->getCalledFunction();
+  if (!Target) {
+    return CI;
+  }
+
+  // Mark user function as needing inlining by LLVM, unless it has the NoInline
+  // attribute
+  const bool HasBody = !Target->isDeclaration();
+  if (HasBody && !Target->hasFnAttribute(Attribute::NoInline)) {
+    CI->addFnAttr(Attribute::AlwaysInline);
+    NeedLLVMInline = true;
+    return CI;
+  }
+
+  // Specially inline some LLVM intrinsics: only memcpy and memset for now.
+  const Intrinsic::ID ID = Target->getIntrinsicID();
+  const bool IsMemCpy = ID == Intrinsic::memcpy;
+  if (!Target->isIntrinsic() || (!IsMemCpy && ID != Intrinsic::memset)) {
+    return CI;
+  }
+
+  IRBuilder<> B(CI);
+  SmallVector<Value *, 4> Args(CI->args());
+  Value *const Impl = IsMemCpy ? emitBuiltinMemCpy(Target, B, Args, CI)
+                               : emitBuiltinMemSet(Target, B, Args, CI);
+  // A null expansion means the call has to stay as it is.
+  return Impl ? Impl : CI;
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/common_gep_elimination_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/common_gep_elimination_pass.cpp
new file mode 100644
index 0000000000000..af3286a6bb2a1
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/common_gep_elimination_pass.cpp
@@ -0,0 +1,112 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "transform/common_gep_elimination_pass.h"
+
+#include <llvm/Analysis/LoopInfo.h>
+#include <llvm/IR/Dominators.h>
+
+#include <unordered_map>
+
+#include "analysis/control_flow_analysis.h"
+#include "analysis/divergence_analysis.h"
+#include "analysis/vectorization_unit_analysis.h"
+#include "debugging.h"
+#include "ir_cleanup.h"
+#include "vectorization_unit.h"
+
+using namespace llvm;
+using namespace vecz;
+
+char CommonGEPEliminationPass::PassID = 0;
+
+PreservedAnalyses CommonGEPEliminationPass::run(Function &F,
+                                                FunctionAnalysisManager &AM) {
+  // Replaces each GEP that repeats an earlier GEP (same pointer operand,
+  // source element type and indices) with that earlier one, provided the
+  // earlier GEP's block dominates the later GEP's block.
+  const DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
+
+  // Redundant GEPs to remove
+  SmallPtrSet<GetElementPtrInst *, 16> toDelete;
+  // GEPs we come across.
+  std::unordered_multimap<Value *, GetElementPtrInst *> GEPs;
+  for (BasicBlock &BB : F) {
+    for (Instruction &I : BB) {
+      auto *GEP = dyn_cast<GetElementPtrInst>(&I);
+      if (!GEP) {
+        continue;
+      }
+      Value *Ptr = GEP->getPointerOperand();
+
+      // The range of values that have the key `Ptr`. It is empty the first
+      // time we meet the source of the GEP.
+      auto Range = GEPs.equal_range(Ptr);
+      auto it = Range.first;
+      for (; it != Range.second; it++) {
+        auto *trackedGEP = it->second;
+        // GEPs with a different number of indices cannot match.
+        if (GEP->getNumIndices() != trackedGEP->getNumIndices()) {
+          continue;
+        }
+
+        // With opaque pointers, we need to check the element types as well.
+        if (GEP->getSourceElementType() !=
+            trackedGEP->getSourceElementType()) {
+          continue;
+        }
+
+        // Compare the indices pairwise; operand 0 is the pointer operand.
+        unsigned i = 0;
+        for (; i < GEP->getNumIndices(); i++) {
+          if (GEP->getOperand(i + 1) != trackedGEP->getOperand(i + 1)) {
+            break;
+          }
+        }
+
+        // trackedGEP does the same operation as GEP, so replace GEP
+        // with the already tracked GEP.
+        if (i == GEP->getNumIndices() &&
+            DT.dominates(trackedGEP->getParent(), GEP->getParent())) {
+          // trackedGEP now serves GEP's users too, so it may only keep the
+          // flags (e.g. inbounds) that both instructions carry.
+          trackedGEP->andIRFlags(GEP);
+          GEP->replaceAllUsesWith(trackedGEP);
+          toDelete.insert(GEP);
+          break;
+        }
+      }
+      // We iterated over all values whose key is Ptr, but haven't found
+      // a matching GEP, so add the latter to the multimap.
+      if (it == Range.second) {
+        GEPs.emplace(Ptr, GEP);
+      }
+    }
+  }
+
+  // Proceed to remove every duplicate GEP we found.
+  for (auto *GEP : toDelete) {
+    IRCleanup::deleteInstructionNow(GEP);
+  }
+
+  PreservedAnalyses Preserved;
+  Preserved.preserve<DominatorTreeAnalysis>();
+  Preserved.preserve<LoopAnalysis>();
+  Preserved.preserve<CFGAnalysis>();
+  Preserved.preserve<DivergenceAnalysis>();
+
+  return Preserved;
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
new file mode 100644
index 0000000000000..05f3602613001
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
@@ -0,0 +1,3135 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "transform/control_flow_conversion_pass.h"
+
+#include <compiler/utils/builtin_info.h>
+#include <compiler/utils/mangling.h>
+#include <llvm/ADT/DenseSet.h>
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/ADT/Statistic.h>
+#include <llvm/Analysis/InstructionSimplify.h>
+#include <llvm/Analysis/LoopInfo.h>
+#include <llvm/Analysis/PostDominators.h>
+#include <llvm/Analysis/ValueTracking.h>
+#include <llvm/IR/BasicBlock.h>
+#include <llvm/IR/CFG.h>
+#include <llvm/IR/Dominators.h>
+#include <llvm/IR/IRBuilder.h>
+#include <llvm/Support/Debug.h>
+#include <llvm/Support/raw_ostream.h>
+#include <multi_llvm/multi_llvm.h>
+#include <multi_llvm/vector_type_helper.h>
+
+#include <queue>
+#include <utility>
+
+#include "analysis/control_flow_analysis.h"
+#include "analysis/divergence_analysis.h"
+#include "analysis/uniform_value_analysis.h"
+#include "analysis/vectorization_unit_analysis.h"
+#include "control_flow_boscc.h"
+#include "control_flow_roscc.h"
+#include "debugging.h"
+#include "ir_cleanup.h"
+#include "llvm_helpers.h"
+#include "memory_operations.h"
+#include "reachability.h"
+#include "transform/passes.h"
+#include "vecz/vecz_choices.h"
+
+#define DEBUG_TYPE "vecz-cf"
+
+using namespace llvm;
+using namespace vecz;
+
+class ControlFlowConversionState::Impl : public ControlFlowConversionState {
+ public:
+  Impl(Function &F, FunctionAnalysisManager &AM)
+      : ControlFlowConversionState(F, AM) {}
+
+  PreservedAnalyses run(Function &, FunctionAnalysisManager &);
+
+ private:
+  /// @brief utility struct used by LinearizeCFG to allow block retargeting
+  /// info to be stored in a single contiguous vector of variable-length
+  /// subvectors. This avoids having to use a vector of vectors, and all
+  /// the individual heap allocations that would involve. Empirically (based on
+  /// UnitCL) we have approximately one new target per Basic Block overall,
+  /// and never more than 2 (which is not to say more than 2 is impossible).
+  /// Since we iterate over all NewTargetInfos linearly, we only need to record
+  /// the number of targets for each block, and not their starting indices.
+  struct Linearization {
+    struct NewTargetInfo {
+      BasicBlock *BB;
+      size_t numTargets = 0;
+
+      NewTargetInfo(BasicBlock *bb) : BB(bb) {}
+    };
+
+    std::vector<NewTargetInfo> infos;
+    std::vector<BasicBlock *> data;
+
+    void beginBlock(BasicBlock *BB) { infos.emplace_back(BB); }
+    size_t currentSize() const { return infos.back().numTargets; }
+    void push(BasicBlock *BB) {
+      data.push_back(BB);
+      ++infos.back().numTargets;
+    }
+  };
+
+  /// @brief Type that maps exit blocks to exit mask information.
+  using DenseExitPHIMap = SmallDenseMap<BasicBlock const *, PHINode *, 2>;
+  /// @brief Type that maps exiting blocks to update mask information.
+  using DenseExitUpdateMap =
+      SmallDenseMap<BasicBlock const *, BinaryOperator *, 2>;
+
+  struct LoopMasksInfo {
+    /// @brief Keep track of which instances left the loop through which exit
+    ///        (persisted throughout the whole loop).
+    DenseExitPHIMap persistedDivergentExitMasks;
+    /// @brief Divergent loop exit masks updated for the current iteration.
+    DenseExitUpdateMap updatedPersistedDivergentExitMasks;
+    /// @brief Combined divergent loop exit masks of the current iteration.
+    Instruction *combinedDivergentExitMask = nullptr;
+    /// @brief Combined divergent loop exit masks of the whole loop.
+    Instruction *persistedCombinedDivergentExitMask = nullptr;
+  };
+
+  /// @brief Convert the function's CFG to data-flow.
+  /// @return true if the function's CFG was converted, false otherwise.
+  bool convertToDataFlow();
+
+  /// @brief Generate masks needed to do control-flow to data-flow conversion.
+  /// @return true if masks were generated successfully, false otherwise.
+  bool generateMasks();
+
+  /// @brief Generate masks for the given block.
+  /// @param[in] BB Block whose masks we are generating.
+  /// @return true if no problem occurred, false otherwise.
+  bool createMasks(BasicBlock &BB);
+
+  /// @brief Create entry mask for the given block.
+  /// @param[in] BB Block whose masks we are generating.
+  /// @return true if no problem occurred, false otherwise.
+  bool createEntryMasks(BasicBlock &BB);
+
+  /// @brief Create exit mask for the given block.
+  /// @param[in] BB Block whose masks we are generating.
+  /// @param[in] isBOSCCEntry Whether BB creates a uniform region.
+  /// @return true if no problem occurred, false otherwise.
+  bool createExitMasks(BasicBlock &BB, bool isBOSCCEntry = false);
+
+  /// @brief Create loop exit masks for the given loop.
+  /// @param[in,out] LTag Information on the loop we are evaluating.
+  /// @return true if no problem occurred, false otherwise.
+  bool createLoopExitMasks(LoopTag &LTag);
+
+  /// @brief Combine all information about instances that left the loop in the
+  ///        current iteration.
+  /// @param[in,out] LTag Information on the loop we are evaluating.
+  /// @return true if no problem occurred, false otherwise.
+  bool createCombinedLoopExitMask(LoopTag &LTag);
+
+  /// @brief Apply masks to basic blocks in the function, to prevent
+  /// side-effects for inactive instances.
+  ///
+  /// @return true if masks were applied successfully, false otherwise.
+  bool applyMasks();
+
+  /// @brief Apply a mask to the given basic block, to prevent side-effects for
+  /// inactive instances.
+  ///
+  /// @param[in] BB Basic block to apply masks to.
+  /// @param[in] mask Mask to apply.
+  ///
+  /// @return true if masks were applied successfully, false otherwise.
+  bool applyMask(BasicBlock &BB, Value *mask);
+
+  /// @brief Emit a call instruction to the masked version of the called
+  /// function.
+  ///
+  /// @param[in] CI The call instruction to create a masked version of
+  /// @param[in] entryBit The Value that determines if the lane is active or
+  /// not.
+  /// @return The call instruction to the masked version.
+  CallInst *emitMaskedVersion(CallInst *CI, Value *entryBit);
+
+  /// @brief Create a masked version of the given function
+  ///
+  /// The Function (F) to be masked will be extracted from the CallInst and a
+  /// new Function (NewFunction) will be generated. NewFunction takes the same
+  /// arguments as F, plus an additional boolean argument that determines if the
+  /// lane is active or not. If the boolean argument is true, then NewFunction
+  /// will execute F and (if it's not void) return its return value. Vararg
+  /// functions are supported by expanding their arguments.
+  ///
+  /// @param[in] CI The call instruction to create a masked version of
+  /// @return The masked function
+  Function *getOrCreateMaskedVersion(CallInst *CI);
+
+  /// @brief a type that maps unmasked instructions onto masked replacements.
+  using DeletionMap = SmallVector<std::pair<Instruction *, Value *>, 4>;
+
+  /// @brief Attempt to apply a mask to an Instruction as a Binary Operation
+  ///
+  /// @param[in] I The Instruction to apply the mask to, if it is a BinOp
+  /// @param[in] mask The mask to apply to the BinOp
+  /// @param[out] toDelete mapping of deleted unmasked operations
+  /// @param[out] safeDivisors a cache of re-usable known non-zero divisors
+  /// @return true if it was a BinOp, false otherwise
+  bool tryApplyMaskToBinOp(Instruction &I, Value *mask, DeletionMap &toDelete,
+                           DenseMap<Value *, Value *> &safeDivisors);
+
+  /// @brief Attempt to apply a mask to a Memory Operation
+  ///
+  /// @param[in] op The MemOp to apply the mask to
+  /// @param[in] mask The mask to apply to the MemOp
+  /// @param[out] toDelete mapping of deleted unmasked operations
+  /// @return true if the MemOp got masked, false otherwise
+  bool tryApplyMaskToMemOp(MemOp &op, Value *mask, DeletionMap &toDelete);
+
+  /// @brief Attempt to apply a mask to a call instruction
+  ///
+  /// @param[in] CI The call instruction to apply the mask to
+  /// @param[in] mask The mask to apply to the call
+  /// @param[out] toDelete mapping of deleted unmasked operations
+  /// @return true if it is valid to mask this call, false otherwise
+  bool applyMaskToCall(CallInst *CI, Value *mask, DeletionMap &toDelete);
+
+  /// @brief Linearize a CFG.
+  /// @return true if no problem occurred, false otherwise.
+  bool partiallyLinearizeCFG();
+
+  /// @brief Create the reduction functions needed to vectorize the branch
+  /// @return true on success, false otherwise
+  bool createBranchReductions();
+
+  /// @brief Uniformize every divergent loop.
+  ///
+  /// @return true if no problem occurred, false otherwise.
+  bool uniformizeDivergentLoops();
+
+  /// @brief Assign a divergent loop a single loop exit from which all other
+  ///        exits will be rewired.
+  /// @param[in] LTag Tag of the processed loop
+  /// @return true if no problem occurred, false otherwise.
+  bool computeDivergentLoopPureExit(LoopTag &LTag);
+
+  /// @brief Rewire every loop exit block such that the loop can be considered
+  ///        uniform.
+  ///
+  /// @param[in] LTag Tag of the processed loop
+  /// @param[in] exitBlocks List of exit blocks before any transformation
+  /// @return true if no problem occurred, false otherwise.
+  bool rewireDivergentLoopExitBlocks(
+      LoopTag &LTag, const SmallVectorImpl<BasicBlock *> &exitBlocks);
+
+  /// @brief Generate blend operations to discard execution of inactive
+  /// instances.
+  /// @param[in] LTag The loop whose live value is being handled.
+  /// @return true if no problem occurred, false otherwise.
+  bool generateDivergentLoopResults(LoopTag &LTag);
+
+  /// @brief Generate loop live value update instructions.
+  /// @param[in] LLV   The loop live value we want to generate instructions for.
+  /// @param[in] LTag The loop whose live value is being handled.
+  /// @return true if no problem occurred, false otherwise.
+  bool generateDivergentLoopResultUpdates(Value *LLV, LoopTag &LTag);
+
+  /// @brief Generate blend instruction for loop live values at the latch.
+  /// @param[in] LTag The loop whose live values are being handled.
+  /// @param[in] exitBlocks List of exit blocks before any transformation
+  /// @return true if no problem occurred, false otherwise.
+  bool blendDivergentLoopLiveValues(
+      LoopTag &LTag, const SmallVectorImpl<BasicBlock *> &exitBlocks);
+
+  /// @brief Generate blend instruction for loop exit masks at the latch.
+  ///
+  /// @param[in] LTag Tag of the processed loop
+  /// @param[in] exitEdges List of exit edges before any transformation
+  /// @param[in] exitBlocks List of exit blocks before any transformation
+  /// @return true if no problem occurred, false otherwise.
+  bool blendDivergentLoopExitMasks(
+      LoopTag &LTag, const SmallVectorImpl<Loop::Edge> &exitEdges,
+      const SmallVectorImpl<BasicBlock *> &exitBlocks);
+
+  /// @brief Replace uses of loop values outside of a divergent loop.
+  ///
+  /// @param[in] LTag Tag of the processed loop
+  /// @param[in] from Instruction to be replaced.
+  /// @param[in] to Instruction to replace `from` with.
+  /// @param[in] exitBlocks Exit blocks of the loop.
+  /// @return true if no problem occurred, false otherwise.
+  bool replaceUsesOutsideDivergentLoop(
+      LoopTag &LTag, Value *from, Value *to,
+      const SmallVectorImpl<BasicBlock *> &exitBlocks);
+
+  /// @brief Assign new targets to edges based on the dominance-compact
+  ///        ordering.
+  /// @param[out] lin New target information for each BasicBlock
+  /// @return true if no problem occurred, false otherwise.
+  bool computeNewTargets(Linearization &lin);
+
+  /// @brief Linearize the CFG with the new calculated edges.
+  /// @return true if no problem occurred, false otherwise.
+  bool linearizeCFG();
+
+  /// @brief Generate blend operations to discard execution of inactive
+  /// instances.
+  /// @return true if no problem occurred, false otherwise.
+  bool generateSelects();
+
+  /// @brief Split a phi instruction into several select instructions.
+  /// @param[in,out] PHI The PHI node we want to split.
+  /// @param[in]     B  The block PHI belongs to.
+  /// @return true if no problem occurred, false otherwise.
+  bool generateSelectFromPHI(PHINode *PHI, BasicBlock *B);
+
+  /// @brief Repair the SSA form. First blend and create new masks from the
+  ///        new wires, then blend all the instructions that need blending.
+  /// @return true if no errors occurred.
+  bool repairSSA();
+
+  /// @brief Update the incoming blocks of phi nodes whose predecessors have
+  ///        changed whilst rewiring.
+  /// @return true if no errors occurred.
+  bool updatePHIsIncomings();
+
+  /// @brief Blend instructions before their uses if divergence happened
+  ///        inbetween.
+  /// @return true if no errors occurred.
+  bool blendInstructions();
+
+  /// @brief Simplify the mask instructions.
+  /// @return true if no errors occurred.
+  bool simplifyMasks();
+
+  /// @brief Check all blocks have a unique index order.
+  /// @return true if no errors occurred.
+  bool checkBlocksOrder() const;
+
+  /// @brief Upon modifying a mask, we need to update the in-memory masks as
+  ///        well.
+  /// @param[in] src The block whose mask changed
+  /// @param[in] from The old mask
+  /// @param[in] to The new mask
+  void replaceMasks(BasicBlock *src, Value *from, Value *to);
+
+  /// @brief Upon removing an instruction, we need to also update our internal
+  ///        containers.
+  /// @param[in] from The old value
+  /// @param[in] to The new value
+  void updateMaps(Value *from, Value *to);
+
+  BasicBlock *functionExitBlock = nullptr;  // set in run() from CFGAnalysis
+  DenseSet<const Instruction *> blends;
+  DenseMap<Loop *, LoopMasksInfo> LoopMasks;
+};
+
+STATISTIC(VeczCFGFail,
+          "Number of kernels that failed control flow conversion [ID#L80]");
+
+// Set this to enable all-of masks in the latch of divergent loops. This can
+// be interesting if there exists an intrinsic that, when comparing vector
+// instructions, can immediately stop comparing if one of the operands is false.
+// In counterpart, this makes us update two more values per divergent loops
+// (said values allowing to keep track of which instances left the loop).
+//
+// Because no such intrinsic exists to my knowledge, we don't set this by
+// default.
+#undef ALL_OF_DIVERGENT_LOOP_LATCH
+
+namespace {
+Instruction *getInsertionPt(BasicBlock &BB) {
+  // We have to insert instructions after any Allocas
+  auto it = BB.getFirstInsertionPt();
+  for (; isa<AllocaInst>(*it); ++it) {
+  }
+  return &*it;
+}
+
+// ANDs `mask` with getDefaultValue(type, 1) to get a new, named instruction.
+Instruction *copyMask(Value *mask, const Twine &name,
+                      Instruction *insertBefore) {
+  VECZ_ERROR_IF(!mask || !insertBefore,
+                "Trying to copy mask with invalid arguments");
+  return BinaryOperator::CreateAnd(mask, getDefaultValue(mask->getType(), 1),
+                                   name, insertBefore);
+}
+
+Instruction *copyEntryMask(Value *mask, BasicBlock &BB) {
+  VECZ_ERROR_IF(!mask, "Trying to copy entry mask with invalid arguments");
+  return copyMask(mask, BB.getName() + ".entry_mask", getInsertionPt(BB));
+}
+
+Instruction *copyExitMask(Value *mask, StringRef base, BasicBlock &BB) {
+  VECZ_ERROR_IF(!mask, "Trying to copy exit mask with invalid arguments");
+  return copyMask(mask, base + ".exit_mask", BB.getTerminator());
+}
+}  // namespace
+
+////////////////////////////////////////////////////////////////////////////////
+
+char ControlFlowConversionPass::PassID = 0;
+
+PreservedAnalyses ControlFlowConversionPass::run(Function &F,
+                                                 FunctionAnalysisManager &AM) {
+  // All the work is delegated to a per-function conversion state object.
+  return ControlFlowConversionState::Impl(F, AM).run(F, AM);
+}
+
+ControlFlowConversionState::ControlFlowConversionState(
+    Function &F, FunctionAnalysisManager &AM)
+    : F(F),
+      AM(AM),
+      VU(AM.getResult<VectorizationUnitAnalysis>(F).getVU()),
+      Ctx(AM.getResult<VectorizationContextAnalysis>(F).getContext()) {}
+
+PreservedAnalyses ControlFlowConversionState::Impl::run(
+    Function &F, FunctionAnalysisManager &AM) {
+  auto const &cfgResult = AM.getResult<CFGAnalysis>(F);
+  if (cfgResult.getFailed()) {
+    ++VeczCFGFail;
+    return VU.setFailed("Cannot vectorize the CFG for", &F, &F);
+  }
+  // Nothing to convert: report every analysis as preserved.
+  if (!cfgResult.isConversionNeeded()) {
+    return PreservedAnalyses::all();
+  }
+  functionExitBlock = cfgResult.getExitBlock();
+
+  if (convertToDataFlow()) {
+    PreservedAnalyses Preserved;
+    Preserved.preserve<DivergenceAnalysis>();
+    return Preserved;
+  }
+
+  // The conversion may have left the function in an invalid state. Instead
+  // of hoping that later passes don't throw verification failures back at
+  // us, replace the function body with an unreachable statement. Marking
+  // vectorization as having failed will mean the function will later be
+  // deleted.
+  // Note that this is quite coarse-grained; we could be cleverer, e.g., by
+  // returning whether convertToDataFlow has (potentially) left behind an
+  // invalid function.
+  ++VeczCFGFail;
+  VU.setFailed("Control flow conversion failed for", &F, VU.scalarFunction());
+  F.deleteBody();
+  IRBuilder<> IRB(BasicBlock::Create(F.getContext(), "entry", &F));
+  IRB.CreateUnreachable();
+  return PreservedAnalyses::none();
+}
+
+bool ControlFlowConversionState::replaceReachableUses(Reachability &RC,
+                                                      Instruction *from,
+                                                      Value *to,
+                                                      BasicBlock *src) {
+  for (auto it = from->use_begin(); it != from->use_end();) {
+    Use &U = *it++;
+    Instruction *user = cast<Instruction>(U.getUser());
+
+    if (user == to) {
+      continue;
+    }
+
+    BasicBlock *blockUse = user->getParent();
+
+    if (PHINode *PHI = dyn_cast<PHINode>(user)) {
+      // Cannot replace a use in a phi node with another phi node in the same
+      // block.
+      if (blockUse == src) {
+        if (isa<PHINode>(to)) {
+          continue;
+        }
+      } else {
+        // We must also check that 'src' can reach the incoming block to be
+        // allowed to replace the incoming value.
+        BasicBlock *incoming = PHI->getIncomingBlock(U);
+        if (!RC.isReachable(src, incoming)) {
+          continue;
+        }
+      }
+    }
+
+    if (auto toI = dyn_cast<Instruction>(to)) {
+      if (toI->getParent() == blockUse) {
+        for (Instruction &I : *src) {
+          // If we found the user before `to`, then skip this user as it lives
+          // before `to`. NOTE(review): walks `src`, not `blockUse`; confirm.
+          if (&I == user) {
+            break;
+          }
+          if (&I == to) {
+            LLVM_DEBUG(dbgs() << "Replace  " << *from << " with " << *to
+                              << " in " << *user << "\n");
+            U.set(to);
+            break;
+          }
+        }
+        // We've handled all possible cases if `to` lives in the same block as
+        // `user`, so move on to the next use.
+        continue;
+      }
+    }
+
+    // `to` is in a different block than `user` so just check for reachability
+    // across BasicBlocks and not within them.
+    if (RC.isReachable(src, blockUse)) {
+      LLVM_DEBUG(dbgs() << "Replace  " << *from << " with " << *to << " in "
+                        << *user << "\n");
+      U.set(to);
+    }
+  }
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::convertToDataFlow() {
+  DT = &AM.getResult<DominatorTreeAnalysis>(F);
+  PDT = &AM.getResult<PostDominatorTreeAnalysis>(F);
+  LI = &AM.getResult<LoopAnalysis>(F);
+  UVR = &AM.getResult<UniformValueAnalysis>(F);
+
+  // Make sure every loop, nested ones included, has an entry in the masks
+  // table before we start, so that later lookups do not grow the table.
+  for (auto *L : LI->getLoopsInPreorder()) {
+    LoopMasks[L];
+  }
+
+  if (!VU.choices().linearizeBOSCC()) {
+    ROSCCGadget ROSCC(*this);
+    ROSCC.run(F);
+  }
+
+  RC = std::make_unique<Reachability>(*DT, *PDT, *LI);
+
+  // We do this after ROSCC, because it may have modified the CFG.
+  DR = &AM.getResult<DivergenceAnalysis>(F);
+
+  if (VU.choices().linearizeBOSCC()) {
+    BOSCC = std::make_unique<BOSCCGadget>(*this);
+    if (!BOSCC->duplicateUniformRegions()) {
+      emitVeczRemarkMissed(&F, VU.scalarFunction(),
+                           "Could not duplicate uniform regions for");
+      return false;
+    }
+  }
+
+  // Reserve space for the masks table and default-construct all entries, to
+  // avoid re-hashing/element relocation on access.
+  MaskInfos.reserve(F.size());
+  for (auto &BB : F) {
+    MaskInfos[&BB];
+  }
+
+  if (!generateMasks()) {
+    emitVeczRemarkMissed(&F, VU.scalarFunction(),
+                         "Could not generate masks for");
+    return false;
+  }
+  if (!applyMasks()) {
+    emitVeczRemarkMissed(&F, VU.scalarFunction(), "Could not apply masks for");
+    return false;
+  }
+  if (!partiallyLinearizeCFG()) {
+    emitVeczRemarkMissed(&F, VU.scalarFunction(),
+                         "Could not partially linearize the CFG for");
+    return false;
+  }
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::generateMasks() {
+  LLVM_DEBUG(dbgs() << "MASKS GENERATION\n");
+
+  RC->update(F);
+
+  // Masks are created recursively, starting from the function exit block.
+  VECZ_FAIL_IF(!createMasks(*functionExitBlock));
+  if (BOSCC) {
+    // BOSCC entry blocks that were not duplicated still need exit masks
+    // towards uniform blocks.
+    SmallVector<BasicBlock *, 16> unduplicated;
+    BOSCC->getUnduplicatedEntryBlocks(unduplicated);
+    for (auto *const entry : unduplicated) {
+      VECZ_FAIL_IF(!createExitMasks(*entry, /*isBOSCCEntry=*/true));
+    }
+
+    // Link the masks of the predicated regions to the uniform regions.
+    VECZ_FAIL_IF(!BOSCC->linkMasks());
+  }
+
+  for (auto *const LTag : DR->getLoopOrdering()) {
+    VECZ_FAIL_IF(!createLoopExitMasks(*LTag));
+  }
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::createMasks(BasicBlock &BB) {
+  // If we have already set the mask for this block, don't do it again.
+  // Uniform blocks are handled separately because of their lack of context.
+  if (MaskInfos[&BB].entryMask) {
+    return true;
+  }
+
+  auto *const LTag = DR->getTag(&BB).loop;
+  auto *const header = LTag ? LTag->header : nullptr;
+  // If BB is a header, we will need the mask from its preheader.
+  // KLOCWORK "NPD.CHECK.MIGHT" possible false positive
+  // LTag is only dereferenced if it's not nullptr, but Klocwork doesn't follow
+  // the logic.
+  if (header == &BB) {
+    BasicBlock *preheader = LTag->preheader;
+    VECZ_FAIL_IF(!createMasks(*preheader));
+  } else {
+    // Otherwise we will need the mask from every incoming edge.
+    for (BasicBlock *pred : predecessors(&BB)) {
+      VECZ_FAIL_IF(!createMasks(*pred));
+    }
+  }
+
+  VECZ_FAIL_IF(!createEntryMasks(BB));
+  VECZ_FAIL_IF(!createExitMasks(BB));
+
+  // If the block is a loop header, its entry mask is a phi function with
+  // incoming values from the preheader and:
+  //  - the latch for divergent loops,
+  //  - nothing else for uniform loops (because if we enter a uniform loop,
+  //    all instances that were active upon entry remain active upon exit).
+  if (header == &BB) {
+    BasicBlock *latch = LTag->latch;
+    VECZ_FAIL_IF(!createMasks(*latch));
+
+    if (LTag->isLoopDivergent()) {
+      auto *const entryMask = MaskInfos[&BB].entryMask;
+      assert(isa<PHINode>(entryMask) &&
+             "Divergent Loop entry mask must be a PHI Node!");
+      PHINode *phi = cast<PHINode>(entryMask);
+      // If header has two incoming values, we have already processed it.
+      if (phi->getNumIncomingValues() != 2) {
+        Value *latchMask = MaskInfos[latch].exitMasks[header];
+        phi->addIncoming(latchMask, latch);
+
+        LLVM_DEBUG(dbgs() << "Divergent loop header " << header->getName()
+                          << ": entry mask: " << *phi << "\n");
+      }
+    }
+  }
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::createEntryMasks(BasicBlock &BB) {
+  auto &maskInfo = MaskInfos[&BB];
+  if (maskInfo.entryMask) {
+    return true;
+  }
+
+  Type *maskTy = Type::getInt1Ty(BB.getContext());
+
+  // If the block is by_all (i.e. executed by all lanes), it will always be
+  // executed on active masks.
+  // Similarly, if the block is uniform, its mask is true by definition.
+  if (DR->isByAll(BB) || DR->isUniform(BB)) {
+    maskInfo.entryMask = copyEntryMask(getDefaultValue(maskTy, 1), BB);
+    LLVM_DEBUG(dbgs() << BB.getName() << ": entry mask: " << *maskInfo.entryMask
+                      << "\n");
+    return true;
+  }
+
+  // If the block has only one predecessor, set its entry mask to be its
+  // predecessor's exit mask.
+  const unsigned numPreds = std::distance(pred_begin(&BB), pred_end(&BB));
+  if (numPreds == 1) {
+    BasicBlock *pred = *pred_begin(&BB);
+    maskInfo.entryMask = copyEntryMask(MaskInfos[pred].exitMasks[&BB], BB);
+    LLVM_DEBUG(dbgs() << BB.getName()
+                      << ": entry mask: its single predecessor exit mask "
+                      << *maskInfo.entryMask << "\n");
+    return true;
+  }
+
+  // If the block is a loop header, its mask is a phi function with incoming
+  // values from the preheader and:
+  //  - the latch for divergent loops,
+  //  - nothing else for uniform loops (because if we enter a uniform loop,
+  //    all instances that were active upon entry remain active upon exit).
+  //
+  // Here we only use the preheader's exit mask: for divergent loops, the
+  // latch's incoming value is added by the caller (see createMasks).
+  auto const *const LTag = DR->getTag(&BB).loop;
+  if (LTag && LTag->header == &BB) {
+    BasicBlock *preheader = LTag->preheader;
+    VECZ_ERROR_IF(!preheader, "BasicBlock tag is not defined");
+
+    if (LTag->isLoopDivergent()) {
+      PHINode *PHI =
+          PHINode::Create(maskTy, 2, BB.getName() + ".entry_mask", &BB.front());
+      PHI->addIncoming(MaskInfos[preheader].exitMasks[&BB], preheader);
+      maskInfo.entryMask = PHI;
+      LLVM_DEBUG(dbgs() << "Loop divergent loop header " << BB.getName()
+                        << ": entry mask: " << *maskInfo.entryMask << "\n");
+
+    } else {
+      maskInfo.entryMask =
+          copyEntryMask(MaskInfos[preheader].exitMasks[&BB], BB);
+      LLVM_DEBUG(dbgs() << "Uniform loop header " << BB.getName()
+                        << ": entry mask: " << *maskInfo.entryMask << "\n");
+    }
+    return true;
+  }
+
+  // If the dominator of this block is also post-dominated by this block,
+  // then if one is executed, the other must be also. So copy the mask.
+  auto *IDom = DT->getNode(&BB)->getIDom();
+  while (IDom) {
+    BasicBlock *DomBB = IDom->getBlock();
+    if (DR->getTag(DomBB).loop == LTag && PDT->dominates(&BB, DomBB)) {
+      maskInfo.entryMask = copyEntryMask(MaskInfos[DomBB].entryMask, BB);
+      LLVM_DEBUG(dbgs() << "Copied-via-domination " << BB.getName()
+                        << ": entry mask: " << *maskInfo.entryMask << "\n");
+      return true;
+    }
+    IDom = IDom->getIDom();
+  }
+
+  // In any other case, its mask is the disjunction of every incoming edge.
+  // The union of every predecessor if it is a join point of a varying branch.
+  if (DR->isBlend(BB)) {
+    for (auto it = pred_begin(&BB); it != pred_end(&BB); ++it) {
+      if (it == pred_begin(&BB)) {
+        maskInfo.entryMask = copyEntryMask(MaskInfos[*it].exitMasks[&BB], BB);
+        LLVM_DEBUG(dbgs() << "Blend block " << BB.getName()
+                          << ": entry mask: " << *maskInfo.entryMask << "\n");
+      } else {
+        Instruction *insertBefore =
+            cast<Instruction>(maskInfo.entryMask)->getNextNode();
+        maskInfo.entryMask = BinaryOperator::CreateOr(
+            maskInfo.entryMask, MaskInfos[*it].exitMasks[&BB],
+            BB.getName() + ".entry_mask", insertBefore);
+
+        LLVM_DEBUG(dbgs() << "Blend block " << BB.getName()
+                          << ": entry mask: " << *maskInfo.entryMask << "\n");
+      }
+    }
+  } else {
+    // A phi function of the predecessors otherwise.
+    PHINode *PHI = PHINode::Create(maskTy, numPreds,
+                                   BB.getName() + ".entry_mask", &BB.front());
+    for (auto it = pred_begin(&BB); it != pred_end(&BB); ++it) {
+      PHI->addIncoming(MaskInfos[*it].exitMasks[&BB], *it);
+    }
+    maskInfo.entryMask = PHI;
+    LLVM_DEBUG(dbgs() << BB.getName() << ": entry mask: " << *maskInfo.entryMask
+                      << "\n");
+  }
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::createExitMasks(BasicBlock &BB,
+                                                       bool isBOSCCEntry) {
+  assert((!isBOSCCEntry || BOSCC) &&
+         "Creating BOSCC Exit Masks when BOSCC object does not exist!");
+
+  auto &maskInfo = MaskInfos[&BB];
+
+  // If BB is a BOSCC entry, we want to compute the uniform exit masks for
+  // this block.
+  if (!isBOSCCEntry && !maskInfo.exitMasks.empty()) {
+    return true;
+  }
+
+  const unsigned numSucc = std::distance(succ_begin(&BB), succ_end(&BB));
+
+  // If BB has no successor, there is obviously nothing to do.
+  if (numSucc == 0) {
+    return true;
+  }
+
+  // If BB has only one successor, then the exit mask is the entry mask of BB.
+  if (numSucc == 1) {
+    BasicBlock *succ = *succ_begin(&BB);
+    maskInfo.exitMasks[succ] =
+        copyExitMask(maskInfo.entryMask, succ->getName(), BB);
+    LLVM_DEBUG(dbgs() << BB.getName() << ": exit mask to single successor "
+                      << succ->getName() << ": " << *maskInfo.entryMask
+                      << "\n");
+    return true;
+  }
+
+  const bool isVarying = DR->getTag(&BB).hasVaryingBranch();
+
+  // If BB has more than 1 successor, the exit mask of each successor is the
+  // conjunction of the entry mask of BB and the condition to jump to the
+  // successor.
+  auto *T = BB.getTerminator();
+  IRBuilder<> B(T);
+
+  if (BranchInst *BI = dyn_cast<BranchInst>(T)) {
+    BasicBlock *trueBB = BI->getSuccessor(0);
+    BasicBlock *falseBB = BI->getSuccessor(1);
+    assert(trueBB && "Could not get successor 0 of branch");
+    assert(falseBB && "Could not get successor 1 of branch");
+
+    if (isBOSCCEntry) {
+      if (BasicBlock *trueBBUniform = BOSCC->getBlock(trueBB)) {
+        trueBB = trueBBUniform;
+      }
+      if (BasicBlock *falseBBUniform = BOSCC->getBlock(falseBB)) {
+        falseBB = falseBBUniform;
+      }
+    }
+
+    Value *cond = BI->getCondition();
+    if (isVarying) {
+      maskInfo.exitMasks[trueBB] = B.CreateAnd(
+          maskInfo.entryMask, cond, trueBB->getName() + ".exit_mask");
+
+      // For the false edge, we have to negate the condition.
+      Value *falseCond = B.CreateNot(cond, cond->getName() + ".not");
+      maskInfo.exitMasks[falseBB] = B.CreateAnd(
+          maskInfo.entryMask, falseCond, falseBB->getName() + ".exit_mask");
+
+      LLVM_DEBUG(dbgs() << BB.getName() << ": varying exit mask to "
+                        << trueBB->getName() << ": "
+                        << *maskInfo.exitMasks[trueBB] << "\n");
+      LLVM_DEBUG(dbgs() << BB.getName() << ": varying exit mask to "
+                        << falseBB->getName() << ": "
+                        << *maskInfo.exitMasks[falseBB] << "\n");
+    } else {
+      maskInfo.exitMasks[trueBB] = B.CreateSelect(
+          cond, maskInfo.entryMask, getDefaultValue(cond->getType()),
+          trueBB->getName() + ".exit_mask");
+      maskInfo.exitMasks[falseBB] =
+          B.CreateSelect(cond, getDefaultValue(cond->getType()),
+                         maskInfo.entryMask, falseBB->getName() + ".exit_mask");
+
+      LLVM_DEBUG(dbgs() << BB.getName() << ": uniform exit mask to "
+                        << trueBB->getName() << ": "
+                        << *maskInfo.exitMasks[trueBB] << "\n");
+      LLVM_DEBUG(dbgs() << BB.getName() << ": uniform exit mask to "
+                        << falseBB->getName() << ": "
+                        << *maskInfo.exitMasks[falseBB] << "\n");
+    }
+  } else if (SwitchInst *SI = dyn_cast<SwitchInst>(T)) {
+    Value *cond = SI->getCondition();
+    BasicBlock *defaultDest = SI->getDefaultDest();
+
+    if (isBOSCCEntry) {
+      if (BasicBlock *defaultDestUniform = BOSCC->getBlock(defaultDest)) {
+        defaultDest = defaultDestUniform;
+      }
+    }
+
+    // The default condition is the negation of the disjunction of every case
+    // condition, so default is chosen only when no case condition is true.
+    // NOTE(review): cases sharing a successor overwrite its mask; confirm OK.
+    Value *caseConds = nullptr;
+    for (auto c : SI->cases()) {
+      Value *caseCond = B.CreateICmpEQ(cond, c.getCaseValue());
+      caseConds = !caseConds ? caseCond : B.CreateOr(caseConds, caseCond);
+      BasicBlock *caseBlock = c.getCaseSuccessor();
+      if (isBOSCCEntry) {
+        if (BasicBlock *caseBlockUniform = BOSCC->getBlock(caseBlock)) {
+          caseBlock = caseBlockUniform;
+        }
+      }
+
+      if (isVarying) {
+        maskInfo.exitMasks[caseBlock] = B.CreateAnd(
+            maskInfo.entryMask, caseCond, caseBlock->getName() + ".exit_mask");
+        LLVM_DEBUG(dbgs() << BB.getName() << ": varying exit mask to "
+                          << caseBlock->getName() << ": "
+                          << *maskInfo.exitMasks[caseBlock] << "\n");
+      } else {
+        maskInfo.exitMasks[caseBlock] = B.CreateSelect(
+            caseCond, maskInfo.entryMask, getDefaultValue(caseCond->getType()),
+            caseBlock->getName() + ".exit_mask");
+        LLVM_DEBUG(dbgs() << BB.getName() << ": uniform exit mask to "
+                          << caseBlock->getName() << ": "
+                          << *maskInfo.exitMasks[caseBlock] << "\n");
+      }
+    }
+
+    VECZ_ERROR_IF(!caseConds, "No switch condition was found");
+
+    Value *negCond = B.CreateNot(caseConds, caseConds->getName() + ".not");
+    if (isVarying) {
+      maskInfo.exitMasks[defaultDest] = B.CreateAnd(
+          negCond, maskInfo.entryMask, defaultDest->getName() + ".exit_mask");
+      LLVM_DEBUG(dbgs() << BB.getName() << ": varying exit mask to "
+                        << defaultDest->getName() << ": "
+                        << *maskInfo.exitMasks[defaultDest] << "\n");
+    } else {
+      maskInfo.exitMasks[defaultDest] = B.CreateSelect(
+          negCond, maskInfo.entryMask, getDefaultValue(negCond->getType()),
+          defaultDest->getName() + ".exit_mask");
+      LLVM_DEBUG(dbgs() << BB.getName() << ": uniform exit mask to "
+                        << defaultDest->getName() << ": "
+                        << *maskInfo.exitMasks[defaultDest] << "\n");
+    }
+  } else {
+    // We should not have a case where we don't have a BranchInst nor a
+    // SwitchInst but more than 1 successors.
+    return false;
+  }
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::createLoopExitMasks(LoopTag &LTag) {
+  auto &LMask = LoopMasks[LTag.loop];
+  // If the loop already has a combined exit mask, it was already processed.
+  if (LMask.combinedDivergentExitMask) {
+    return true;
+  }
+
+  Type *maskTy = Type::getInt1Ty(F.getContext());
+  SmallVector<Loop::Edge, 1> exitEdges;
+  LTag.loop->getExitEdges(exitEdges);
+  for (Loop::Edge &EE : exitEdges) {
+    auto const *const exitingBlock = EE.first;
+    auto const *const exitBlock = EE.second;
+    // Divergent loops need to keep track of which instances left at which exit.
+    if (LTag.isLoopDivergent() && DR->isDivergent(*exitBlock)) {
+      // The value of the exit mask of a divergent loop is a phi function
+      // between the mask update and the loop exit mask phi.
+      auto *const exitMask =
+          PHINode::Create(maskTy, 2, exitBlock->getName() + ".loop_exit_mask",
+                          LTag.header->getFirstNonPHI());
+      LMask.persistedDivergentExitMasks[exitingBlock] = exitMask;
+      if (BOSCC) {
+        BOSCC->createReference(exitMask, getDefaultValue(maskTy));
+      }
+    }
+  }
+
+  for (Loop *L : LTag.loop->getSubLoops()) {
+    VECZ_FAIL_IF(!createLoopExitMasks(DR->getTag(L)));
+  }
+
+  // If the loop is uniform, all instances that enter the loop will leave it
+  // together.
+  if (!LTag.isLoopDivergent()) {
+    return true;
+  }
+
+  // Check if the exit edge leaves multiple loops, in which case we return the
+  // next inner loop left by it.
+  auto nextInnerLoopLeft = [this, &LTag](BasicBlock *exitingBlock,
+                                         BasicBlock *exitBlock) -> Loop * {
+    Loop *innerLoop = nullptr;
+    Loop *loop = DR->getTag(exitingBlock).loop->loop;
+    // Iterate until we reach the current loop.
+    while (loop && loop != LTag.loop) {
+      // If this is an exit edge.
+      if (loop->contains(exitingBlock) && !loop->contains(exitBlock)) {
+        innerLoop = loop;
+      }
+
+      loop = loop->getParentLoop();
+    }
+
+    return innerLoop;
+  };
+
+  for (Loop::Edge &EE : exitEdges) {
+    BasicBlock *exitingBlock = const_cast<BasicBlock *>(EE.first);
+    BasicBlock *exitBlock = const_cast<BasicBlock *>(EE.second);
+
+    if (DR->isDivergent(*exitBlock)) {
+      PHINode *REM = LMask.persistedDivergentExitMasks[exitingBlock];
+      REM->addIncoming(getDefaultValue(REM->getType()), LTag.preheader);
+
+      auto const *const exitingLTag = DR->getTag(exitingBlock).loop;
+      VECZ_ERROR_IF(!exitingLTag, "Loop tag is not defined");
+
+      // By default, the second operand of the mask update is the exit
+      // condition.
+      auto &exitMasks = MaskInfos[exitingBlock].exitMasks;
+      Value *maskUpdateOperand = exitMasks[exitBlock];
+
+      // If the exit leaves multiple loops and the current loop is not the
+      // innermost left by this exit, set the update mask to be a disjunction
+      // with the exit mask and the accumulated update mask from the next inner
+      // loop left by this exit.
+      if (exitingLTag->loop != LTag.loop) {
+        if (Loop *nestedLoop = nextInnerLoopLeft(exitingBlock, exitBlock)) {
+          maskUpdateOperand =
+              LoopMasks[nestedLoop]
+                  .updatedPersistedDivergentExitMasks[exitingBlock];
+        }
+      }
+
+      BinaryOperator *maskUpdate = BinaryOperator::CreateOr(
+          REM, maskUpdateOperand,
+          exitBlock->getName() + ".loop_exit_mask.update",
+          exitingBlock->getTerminator());
+
+      LMask.updatedPersistedDivergentExitMasks[exitingBlock] = maskUpdate;
+
+      if (BOSCC) {
+        // The uniform version of divergent loop exit masks is the edge's exit
+        // mask.
+        BOSCC->addReference(maskUpdate, exitMasks[exitBlock]);
+      }
+
+      // If this is the outermost loop left by this exit, update the exit
+      // mask.
+      if (DR->getTag(exitBlock).outermostExitedLoop == &LTag) {
+        VECZ_ERROR_IF(!isa<Instruction>(exitMasks[exitBlock]),
+                      "Trying to replace uses of a value");
+        VECZ_FAIL_IF(
+            !replaceReachableUses(*RC, cast<Instruction>(exitMasks[exitBlock]),
+                                  maskUpdate, exitBlock));
+
+        exitMasks[exitBlock] = maskUpdate;
+      }
+
+      REM->addIncoming(maskUpdate, LTag.latch);
+
+      LLVM_DEBUG(dbgs() << "Divergent loop " << LTag.loop->getName()
+                        << ": divergent loop exit edges ["
+                        << exitingBlock->getName() << " -> "
+                        << exitBlock->getName() << "]: exit mask: " << *REM
+                        << "\n");
+      LLVM_DEBUG(dbgs() << "Divergent loop " << LTag.loop->getName()
+                        << ": divergent loop exit edges ["
+                        << exitingBlock->getName() << " -> "
+                        << exitBlock->getName()
+                        << "]: update exit mask: " << *maskUpdate << "\n");
+    }
+  }
+
+  VECZ_FAIL_IF(!createCombinedLoopExitMask(LTag));
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::createCombinedLoopExitMask(
+    LoopTag &LTag) {
+  // Combine, over every divergent exit edge, the instances that left the
+  // loop in the current iteration and those that left it at any point.
+  auto *const L = LTag.loop;
+  SmallVector<Loop::Edge, 1> exitEdges;
+  L->getExitEdges(exitEdges);
+  auto &LMask = LoopMasks[L];
+  for (Loop::Edge &edge : exitEdges) {
+    BasicBlock *exitingBlock = const_cast<BasicBlock *>(edge.first);
+    BasicBlock *exitBlock = const_cast<BasicBlock *>(edge.second);
+    if (!DR->isDivergent(*exitBlock)) {
+      continue;
+    }
+
+    // All the combined masks are built right before the latch terminator.
+    Instruction *const latchTerm = LTag.latch->getTerminator();
+    auto &persistedUpdate =
+        LMask.updatedPersistedDivergentExitMasks[exitingBlock];
+    Value *const iterationMask = persistedUpdate->getOperand(1);
+
+    if (!LMask.combinedDivergentExitMask) {
+      // First divergent exit found: start both accumulators with a copy.
+      LMask.combinedDivergentExitMask = copyMask(
+          iterationMask, L->getName() + ".combined_divergent_exit_mask",
+          latchTerm);
+      LMask.persistedCombinedDivergentExitMask = copyMask(
+          persistedUpdate,
+          L->getName() + ".persisted_combined_divergent_exit_mask", latchTerm);
+    } else {
+      // Any further divergent exit is OR'ed into the accumulators.
+      LMask.combinedDivergentExitMask = BinaryOperator::CreateOr(
+          LMask.combinedDivergentExitMask, iterationMask,
+          L->getName() + ".combined_divergent_exit_mask", latchTerm);
+      LMask.persistedCombinedDivergentExitMask = BinaryOperator::CreateOr(
+          LMask.persistedCombinedDivergentExitMask, persistedUpdate,
+          L->getName() + ".persisted_combined_divergent_exit_mask", latchTerm);
+    }
+  }
+
+  VECZ_ERROR_IF(!LMask.combinedDivergentExitMask ||
+                    !LMask.persistedCombinedDivergentExitMask,
+                "Divergent loop has no loop exit condition");
+
+  LLVM_DEBUG(dbgs() << "Divergent loop " << LTag.loop->getName()
+                    << ": current iteration combine divergent loop exit: "
+                    << *LMask.combinedDivergentExitMask << "\n");
+  LLVM_DEBUG(dbgs() << "Divergent loop " << LTag.loop->getName()
+                    << ": whole loop combine divergent loop exit: "
+                    << *LMask.persistedCombinedDivergentExitMask << "\n");
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::applyMasks() {
+  for (auto &BB : F) {
+    // Mask side-effecting instructions, except in uniform or by_all blocks.
+    if (!(DR->isUniform(BB) || DR->isByAll(BB))) {
+      auto *const entryMask = MaskInfos[&BB].entryMask;
+      VECZ_ERROR_IF(!entryMask, "BasicBlock should have an entry mask");
+      VECZ_FAIL_IF(!applyMask(BB, entryMask));
+    }
+  }
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::applyMask(BasicBlock &BB, Value *mask) {
+  // Packetization hasn't happened yet so this better be a scalar 1 bit int.
+  assert(mask->getType()->isIntegerTy(1) && "CFG mask type should be int1");
+  // Maps each unmasked instruction to its masked replacement.
+  DeletionMap toDelete;
+  DenseMap<Value *, Value *> safeDivisors;
+
+  for (Instruction &I : BB) {
+    if (tryApplyMaskToBinOp(I, mask, toDelete, safeDivisors)) {
+      continue;
+    }
+    Optional<MemOp> memOp = MemOp::get(&I);
+    // Turn loads and stores into masked loads and stores.
+    if (memOp && (memOp->isLoad() || memOp->isStore())) {
+      if (!tryApplyMaskToMemOp(*memOp, mask, toDelete)) {
+        return false;
+      }
+    } else if (auto *CI = dyn_cast<CallInst>(&I)) {
+      // Turn calls into masked calls if possible.
+      if (!applyMaskToCall(CI, mask, toDelete)) {
+        return false;
+      }
+    } else if (I.isAtomic() && !isa<FenceInst>(&I)) {
+      // We need to apply masks to atomic functions, but it is currently not
+      // implemented. See CA-3294.
+      return false;
+    } else if (auto *branch = dyn_cast<BranchInst>(&I)) {
+      // We have to be careful with infinite loops, because if they exist on a
+      // divergent code path, they will always be entered and will hang the
+      // kernel. Therefore, we replace the branch condition with the mask of
+      // the preheader, to ensure they only loop if at least one lane is
+      // actually executed.
+      if (branch->isConditional()) {
+        auto *const cond = dyn_cast<Constant>(branch->getCondition());
+        if (cond && cond->isOneValue()) {
+          auto *const loop = DR->getTag(&BB).loop;
+          if (loop && loop->latch == &BB) {
+            auto *const loopMask = MaskInfos[loop->preheader].entryMask;
+            branch->setCondition(loopMask);
+          }
+        }
+      }
+    }
+  }
+
+  for (auto &pair : toDelete) {
+    Instruction *unmasked = pair.first;
+    Value *masked = pair.second;
+    updateMaps(unmasked, masked);
+    IRCleanup::deleteInstructionNow(unmasked);
+  }
+  return true;
+}
+
+CallInst *ControlFlowConversionState::Impl::emitMaskedVersion(CallInst *CI,
+                                                              Value *entryBit) {
+  // Get the masked function
+  Function *newFunction = Ctx.getOrCreateMaskedFunction(CI);
+  VECZ_FAIL_IF(!newFunction);
+  // Its arguments are the original call's arguments followed by the mask.
+  SmallVector<Value *, 8> maskedArgs;
+  for (const Use &arg : CI->args()) {
+    maskedArgs.push_back(arg.get());
+  }
+  maskedArgs.push_back(entryBit);
+
+  CallInst *maskedCall = CallInst::Create(newFunction, maskedArgs, "", CI);
+  maskedCall->setCallingConv(CI->getCallingConv());
+  maskedCall->setAttributes(CI->getAttributes());
+  return maskedCall;
+}
+
+bool ControlFlowConversionState::Impl::tryApplyMaskToBinOp(
+    Instruction &I, Value *mask, DeletionMap &toDelete,
+    DenseMap<Value *, Value *> &safeDivisors) {
+  auto *const binOp = dyn_cast<BinaryOperator>(&I);
+  if (!binOp) {
+    // Not a binary operator: let the caller try something else.
+    return false;
+  }
+  if (!VU.choices().isEnabled(VectorizationChoices::eDivisionExceptions)) {
+    // we don't need to mask division operations if they don't trap
+    return true;
+  }
+
+  // Only integer divides and remainders might need masking, to avoid division
+  // errors.
+  // NOTE we don't generate any specific error checks ourselves, on the
+  // assumption that the incoming IR is already guarded against these,
+  // so it is sufficient to use the mask generated from the CFG.
+  const auto opcode = binOp->getOpcode();
+  const bool isUnsigned =
+      opcode == Instruction::UDiv || opcode == Instruction::URem;
+  const bool isSigned =
+      opcode == Instruction::SDiv || opcode == Instruction::SRem;
+  if (!isUnsigned && !isSigned) {
+    return true;
+  }
+
+  auto *divisor = binOp->getOperand(1);
+  if (auto *C = dyn_cast<Constant>(divisor)) {
+    // No need to mask divides by a constant. Divides by constant zero can be
+    // a NOP since there is no division by zero exception in OpenCL.
+    if (C->isZeroValue()) {
+      auto *nop = binOp->getOperand(0);
+      I.replaceAllUsesWith(nop);
+      toDelete.emplace_back(&I, nop);
+    }
+    return true;
+  }
+
+  auto &masked = safeDivisors[divisor];
+  if (!masked) {
+    // NOTE the non-zero check below does not look for the pattern
+    // "select (x eq 0) 1, x" or equivalent, so we might want to write it
+    // ourselves, but Instruction Combining cleans it up.
+    // NOTE that for a signed division, we also have to consider the
+    // potential overflow situation, which is not so simple
+    const bool knownSafe =
+        isUnsigned && isKnownNonZero(divisor, F.getParent()->getDataLayout());
+    if (knownSafe) {
+      // Static analysis concluded it can't be zero, so we don't need to do
+      // anything.
+      masked = divisor;
+    } else {
+      masked = SelectInst::Create(
+          mask, divisor, ConstantInt::get(divisor->getType(), 1),
+          divisor->getName() + ".masked", &I);
+    }
+  }
+
+  if (masked != divisor) {
+    binOp->setOperand(1, masked);
+  }
+  return true;
+}
+
+/// Apply `mask` to a memory operation.
+///
+/// Plain load/store instructions are replaced by masked loads/stores; memory
+/// operations that already carry a mask operand get it AND-ed with `mask`.
+///
+/// @param memOp The memory operation to mask; must be a load or a store.
+/// @param mask The scalar mask under which the operation executes.
+/// @param toDelete Receives the original instruction (paired with its
+/// replacement) when a new masked operation is created.
+/// @return true if the mask was applied, false if the operation is neither a
+/// plain load/store instruction nor has a mask operand.
+bool ControlFlowConversionState::Impl::tryApplyMaskToMemOp(
+    MemOp &memOp, Value *mask, DeletionMap &toDelete) {
+  VECZ_FAIL_IF(!memOp.isLoad() && !memOp.isStore());
+  auto *I = memOp.getInstr();
+  VECZ_FAIL_IF(!I);
+
+  // A fixed-width vector mem-op gets the same mask for every element, so the
+  // scalar mask is splatted to the data width. Anything else uses it as is.
+  unsigned numElts = 1;
+  if (auto *vecTy = dyn_cast<FixedVectorType>(memOp.getDataType())) {
+    numElts = vecTy->getNumElements();
+  }
+  Value *laneMask = mask;
+  if (numElts > 1) {
+    IRBuilder<> builder(I);
+    laneMask = builder.CreateVectorSplat(numElts, mask);
+  }
+
+  if (!memOp.isLoadStoreInst()) {
+    // Not a plain load/store: the best we can do is to combine our mask with
+    // a mask the operation already has.
+    auto *const existingMask = memOp.getMaskOperand();
+    if (!existingMask) {
+      return false;
+    }
+    memOp.setMaskOperand(BinaryOperator::CreateAnd(laneMask, existingMask,
+                                                   "composite_mask", I));
+    return true;
+  }
+
+  // Turn the load or store into its masked equivalent: a new mem-op the same
+  // as the original except for the addition of the mask.
+  Value *newVal = nullptr;
+  if (memOp.isLoad()) {
+    newVal = createMaskedLoad(Ctx, memOp.getDataType(),
+                              memOp.getPointerOperand(), laneMask,
+                              /*VL*/ nullptr, memOp.getAlignment(),
+                              I->getName(), I);
+  } else {
+    newVal = createMaskedStore(Ctx, memOp.getDataOperand(),
+                               memOp.getPointerOperand(), laneMask,
+                               /*VL*/ nullptr, memOp.getAlignment(),
+                               I->getName(), I);
+  }
+  VECZ_FAIL_IF(!newVal);
+
+  // Only value-producing instructions have uses to redirect.
+  if (!I->getType()->isVoidTy()) {
+    I->replaceAllUsesWith(newVal);
+  }
+  toDelete.emplace_back(I, newVal);
+  return true;
+}
+
+/// Replace a call by a call to a masked version of its callee, if needed.
+///
+/// Calls to internal builtins, to builtins flagged as side-effect free or as
+/// work-item builtins, and to functions that only read or do not access
+/// memory are left untouched.
+///
+/// @param CI The call instruction to mask.
+/// @param mask The mask under which the call executes.
+/// @param toDelete Receives `CI` (paired with its replacement) when a masked
+/// call is created.
+/// @return true if the call was masked or did not need masking; false if it
+/// is an execution flow builtin, which cannot be masked. The VECZ_FAIL_IF
+/// checks additionally bail out on indirect calls, on callees marked as not
+/// duplicable and when no masked call could be created.
+bool ControlFlowConversionState::Impl::applyMaskToCall(CallInst *CI,
+                                                       Value *mask,
+                                                       DeletionMap &toDelete) {
+  LLVM_DEBUG(dbgs() << "vecz-cf: Now at CallInst " << *CI << "\n");
+
+  // It might be that we need to mask the function call here because we won't
+  // be able to packetize it later on. Start by resolving the callee, looking
+  // through pointer casts when the call is not a direct one.
+  Function *callee = CI->getCalledFunction();
+  if (!callee) {
+    auto *const strippedOperand = CI->getCalledOperand()->stripPointerCasts();
+    callee = dyn_cast<Function>(strippedOperand);
+  }
+  VECZ_FAIL_IF(!callee);  // TODO: CA-1505: Support indirect function calls.
+
+  // A function that cannot be duplicated is one we know we won't be able to
+  // handle in any other way.
+  VECZ_FAIL_IF(callee->cannotDuplicate());
+
+  // Internal builtins are left alone.
+  if (Ctx.isInternalBuiltin(callee)) {
+    LLVM_DEBUG(dbgs() << "vecz-cf: Called function is an internal builtin\n");
+    return true;
+  }
+
+  auto const builtinProps = Ctx.builtins().analyzeBuiltin(*callee).properties;
+
+  // Builtins without side effects do not need to be masked ...
+  if (builtinProps & compiler::utils::eBuiltinPropertyNoSideEffects) {
+    LLVM_DEBUG(dbgs() << "vecz-cf: Called function is an pure builtin\n");
+    return true;
+  }
+  // ... and neither do work-item builtins.
+  if (builtinProps & compiler::utils::eBuiltinPropertyWorkItem) {
+    LLVM_DEBUG(dbgs() << "vecz-cf: Called function is a workitem ID builtin\n");
+    return true;
+  }
+  // Masking an execution flow builtin (a barrier) is not valid.
+  if (builtinProps & compiler::utils::eBuiltinPropertyExecutionFlow) {
+    LLVM_DEBUG(
+        dbgs() << "vecz-cf: Called function is an execution flow builtin\n");
+    return false;
+  }
+
+  // Functions without side-effects do not need to be masked either.
+  if (callee->onlyReadsMemory() || callee->doesNotAccessMemory()) {
+    LLVM_DEBUG(
+        dbgs() << "vecz-cf: Called function does not have any side-effects\n");
+    return true;
+  }
+
+  // Everything else gets a masked call that stands in for the original one.
+  CallInst *newCI = emitMaskedVersion(CI, mask);
+  VECZ_FAIL_IF(!newCI);
+  if (!CI->getType()->isVoidTy()) {
+    CI->replaceAllUsesWith(newCI);
+  }
+  toDelete.emplace_back(CI, newCI);
+
+  LLVM_DEBUG(dbgs() << "vecz-cf: Replaced " << *CI << "\n");
+  LLVM_DEBUG(dbgs() << "          with " << *newCI << "\n");
+
+  return true;
+}
+
+/// Run the partial linearization pipeline on the function.
+///
+/// The phases below run in a fixed order and each one relies on the state
+/// left by the previous one; the first phase that fails aborts the whole
+/// pipeline through VECZ_FAIL_IF. The BOSCC phases only run when a BOSCC
+/// object is present.
+///
+/// @return true if every phase succeeded.
+bool ControlFlowConversionState::Impl::partiallyLinearizeCFG() {
+  // Two methods are possible to transform the divergent loops into uniform
+  // ones:
+  // 1) rewire the exit edges to the single latch, which means the loop live
+  //    masks have to be updated at each exiting block.
+  // 2) delete the divergent loop exit edges and update the loop live masks at
+  //    the latch.
+  //
+  // The former means more overhead when a loop exit is reached because we
+  // always have to update the masks, but it allows to retain the exiting
+  // branches.
+  // The latter means we only blend at the latch, thus less overhead at the
+  // loop exits, but if we reach a divergent loop exit, and it happens that all
+  // lanes have exited the loop, we still have to finish the iteration until we
+  // reach the latch and exit the loop.
+  //
+  // We are currently using the latter.
+  VECZ_FAIL_IF(!uniformizeDivergentLoops());
+
+  // ... and actually rewire them.
+  VECZ_FAIL_IF(!linearizeCFG());
+
+  // Transform phi nodes into selects for blocks that got blended.
+  VECZ_FAIL_IF(!generateSelects());
+
+  // Connect BOSCC regions if it is activated.
+  VECZ_FAIL_IF(BOSCC && !BOSCC->connectBOSCCRegions());
+
+  // Repair the CFG because the rewiring broke it.
+  VECZ_FAIL_IF(!repairSSA());
+
+  // Now we create the opaque calls to builtins that compute the real branch
+  // values. This must come before instruction simplification, otherwise LLVM
+  // can fold branch predicates that appear unreachable now, but would later
+  // become vector masks, thus mangling the control flow..
+  VECZ_FAIL_IF(!createBranchReductions());
+
+  // ... and now we can do instruction simplification on the masks and know they
+  // won't be prematurely folded.
+  VECZ_FAIL_IF(!simplifyMasks());
+
+  // Finally, if we used BOSCC it might want to do some tidying up.
+  VECZ_FAIL_IF(BOSCC && !BOSCC->cleanUp());
+
+  return true;
+}
+
+/// Route the condition of conditional branches through an opaque
+/// "divergence" internal builtin.
+///
+/// For every block that is either not uniform or flagged with
+/// eBlockNeedsAllOfMask, a non-constant branch condition `c` is replaced by a
+/// call to the `<prefix>divergence_all` builtin (when the block is flagged
+/// with eBlockNeedsAllOfMask) or to the `<prefix>divergence_any` builtin
+/// (otherwise), taking `c` as its only argument. Both builtins have the type
+/// `i1 (i1)`.
+///
+/// @return true on success; false if a switch terminates a block that has a
+/// divergent branch, which is not supported. VECZ_FAIL_IF additionally bails
+/// out if a builtin could not be retrieved or created.
+bool ControlFlowConversionState::Impl::createBranchReductions() {
+  const StringRef nameAny = "_any";
+  const StringRef nameAll = "_all";
+
+  // Build both builtin names once, as owning strings. A Twine only references
+  // its operands and is meant to be used as a temporary within a single
+  // statement, so it is materialized right away rather than kept in a local
+  // and re-concatenated for every branch.
+  const auto builtinNameAny =
+      Twine(VectorizationContext::InternalBuiltinPrefix)
+          .concat("divergence")
+          .concat(nameAny)
+          .str();
+  const auto builtinNameAll =
+      Twine(VectorizationContext::InternalBuiltinPrefix)
+          .concat("divergence")
+          .concat(nameAll)
+          .str();
+
+  Type *boolTy = Type::getInt1Ty(F.getContext());
+  FunctionType *FT = FunctionType::get(boolTy, {boolTy}, false);
+
+  for (BasicBlock &BB : F) {
+    const bool needsAllOfMask = DR->hasFlag(BB, eBlockNeedsAllOfMask);
+
+    // If the block is uniform and is not a boscc indirection, all its lanes
+    // are true or false, not both. Thus, we don't need to packetize the
+    // condition.
+    if (!needsAllOfMask && DR->isUniform(BB)) {
+      continue;
+    }
+
+    auto *TI = BB.getTerminator();
+    if (BranchInst *Branch = dyn_cast<BranchInst>(TI)) {
+      if (Branch->isConditional()) {
+        auto *const cond = Branch->getCondition();
+        // A constant condition has nothing to reduce.
+        if (isa<Constant>(cond)) {
+          continue;
+        }
+
+        // The builtin is only retrieved (or created) once a branch actually
+        // needs it. It is deliberately not called 'F', which would shadow
+        // the function being transformed.
+        Function *const reductionFn = Ctx.getOrCreateInternalBuiltin(
+            needsAllOfMask ? builtinNameAll : builtinNameAny, FT);
+        VECZ_FAIL_IF(!reductionFn);
+
+        // The call is named after the condition, with the same suffix as the
+        // builtin, and is inserted right before the branch.
+        const auto &suffix = needsAllOfMask ? nameAll : nameAny;
+        auto *const newCall = CallInst::Create(
+            reductionFn, {cond}, Twine(cond->getName()).concat(suffix),
+            Branch);
+        Branch->setCondition(newCall);
+      }
+    } else if (isa<SwitchInst>(TI) &&
+               DR->hasFlag(BB, eBlockHasDivergentBranch)) {
+      // Not sure what to actually do with switch instructions..
+      return false;
+    }
+  }
+  return true;
+}
+
+/// Turn every divergent loop of the function into a uniform one.
+///
+/// @return true on success. Any failing step aborts through VECZ_FAIL_IF.
+bool ControlFlowConversionState::Impl::uniformizeDivergentLoops() {
+  LLVM_DEBUG(dbgs() << "CFC: UNIFORMIZE DIVERGENT LOOPS\n");
+
+  // For every divergent loop of the function, we want to create a new exit edge
+  // whose source is the latch of the loop. That exit is called "pure". The
+  // target of this edge is a new divergent loop exit that will start a cascade
+  // of if conditions to branch to the original loop exits. The divergent loop
+  // exits will no longer be exits, while the optional loop exits will retain
+  // their branch but they will be rewired to the pure exit.
+  //
+  // Given the following *divergent* loop:
+  //
+  //                           preheader
+  //                               |
+  //                             header <---------.
+  //                              / \             |
+  //                            ... ...           |
+  //                            /     \           |
+  //                     %exit2.o     ...         |
+  //                     /            / \         |
+  //                    %d     %exit1.o ...       |
+  //                           /          \       |
+  //                          %b          ...     |
+  //                                      / \     |
+  //                               %exit2.r ...   |
+  //                               /          \   |
+  //                              %c   %latch.r --'
+  //                                   /
+  //                            %exit1.r
+  //                               |
+  //                               %a
+  //
+  // with:
+  // - %a, %b, %c, %d = a group of non specific basic blocks
+  // - %exit*.*       = loop exits
+  // - *.o            = optional blocks
+  // - *.r            = divergent blocks
+  // - %latch.r       = the latch of the loop. It is necessarily a divergent
+  //                    block because the loop is divergent
+  //
+  // The following transformation is performed:
+  //
+  //                     preheader
+  //                         |
+  //                       header <---------.
+  //                        / \             |
+  //                      ... ...           |
+  //                      /     \           |
+  //        %exit2.split1.o     ...         |
+  //        |                   / \         |
+  //         \    %exit1.split1.o ...       |
+  //          \   |                 \       |
+  //           \   \                ...     |
+  //            \   \                 \     |
+  //             \   \                ...   |
+  //              \   \                 \   |
+  //               \   \         %latch.r --'
+  //                \   \           |
+  //                 `---`-> %loop.pure_exit
+  //                               / |
+  //                        %exit1.r %exit1.else.r
+  //                        /             / |
+  //                       %a      %exit2.r %exit2.else.r
+  //                               /             / |
+  //                              %c            /  |
+  //                                           /   |
+  //                             %exit1.split2.o   %exit1.else.o
+  //                             /                      / |
+  //                            %b        %exit2.split2.o %exit2.else.o
+  //                                      /
+  //                                     %d
+  //
+  // with:
+  // - %exit*.split1.o = the first half of the original %exit*.o with only
+  //                     phi nodes
+  // - %exit*.split2.o = the second half of the original %exit*.o without the
+  //                     phi nodes
+  // - %loop.pure_exit = a new loop exit starting a cascade of ifs towards the
+  //                     original loop exits
+  // - %exit*.else.*   = a new block whose only purpose is to branch to other
+  //                     blocks
+  //
+  // Each introduced conditional branch uses the entry mask of the exit block
+  // as the condition.
+  // Each introduced divergent conditional block is marked as Div causing, thus
+  // linearizing them.
+  // Each introduced optional conditional block is marked as divergent, thus
+  // retaining the branches and branching to the true path only if any of the
+  // lanes that executed the loop left through the exit the true path targets.
+  //
+  // The state of the loop after the transformation is invalid and relies on
+  // the linearizer to correctly rewire the introduced blocks. The result of the
+  // above transformed loop after linearization will be:
+  //
+  //                            preheader
+  //                                |
+  //                              header <---------.
+  //                               / \             |
+  //                             ... ...           |
+  //                             /     \           |
+  //               %exit2.split1.o     ...         |
+  //                      |              \         |
+  //                      |              ...       |
+  //                      |                \       |
+  //                      |                ...     |
+  //                      |                / \     |
+  //                      |  %exit1.split1.o ...   |
+  //                       \        |          \   |
+  //                        \       |   %latch.r --'
+  //                         \      |      |
+  //                          `---> %loop.pure_exit
+  //                                       |
+  //                                    %exit1.r
+  //                                       |
+  //                                       %a
+  //                                       |
+  //                                 %exit1.else.r
+  //                                       |
+  //                                    %exit2.r
+  //                                       |
+  //                                       %c
+  //                                       |
+  //                                 %exit2.else.r
+  //                                      / |
+  //                         %exit1.split.o %exit1.else.o
+  //                         /                   / |
+  //                        %b      %exit2.split.o %exit2.else.o
+  //                                /                   ...
+  //                               %d
+  //
+  // Note that only one branch introduced from an optional loop exit
+  // ('%exit2.else.r' and '%exit1.else.o' in this example) can evaluate to
+  // true because as soon as an optional loop exit is taken, all the active
+  // lanes in the loop leave through it.
+  // However, as many as all the branches introduced from divergent loop exits
+  // may evaluate to true. The '...' at the end of the CFG will be replaced by
+  // whatever would originally succeed the original divergent loop exits.
+
+  // Orders two blocks by increasing tag index (DCBI).
+  auto const hasSmallerTagIndex = [this](BasicBlock *LHS, BasicBlock *RHS) {
+    return DR->getTagIndex(LHS) < DR->getTagIndex(RHS);
+  };
+
+  bool modified = false;
+  for (auto *const LTag : DR->getLoopOrdering()) {
+    // Uniform loops are left untouched.
+    if (!LTag->isLoopDivergent()) {
+      continue;
+    }
+    Loop *const L = LTag->loop;
+
+    // Record the loop exit edges and blocks before doing any modification.
+    SmallVector<Loop::Edge, 2> exitEdges;
+    L->getExitEdges(exitEdges);
+
+    // 1) Retrieve the unique loop exit blocks.
+    SmallVector<BasicBlock *, 2> exitBlocks;
+    if (BOSCC) {
+      // We can't use the `getUniqueExitBlocks' method because the loop may
+      // not be in a canonical form because of BOSCC, so drop the duplicates
+      // by hand, keeping the first occurrence of each block.
+      L->getExitBlocks(exitBlocks);
+      SmallPtrSet<BasicBlock *, 1> seenExitBlocks;
+      for (auto it = exitBlocks.begin(); it != exitBlocks.end();) {
+        if (seenExitBlocks.insert(*it).second) {
+          ++it;
+        } else {
+          it = exitBlocks.erase(it);
+        }
+      }
+    } else {
+      L->getUniqueExitBlocks(exitBlocks);
+    }
+
+    // 2) Remove any loop exit for which 'L' is not the outermost loop left.
+    auto const firstForeignExit =
+        std::remove_if(exitBlocks.begin(), exitBlocks.end(),
+                       [this, LTag](BasicBlock *EB) {
+                         return DR->getTag(EB).outermostExitedLoop != LTag;
+                       });
+    exitBlocks.erase(firstForeignExit, exitBlocks.end());
+
+    // 3) Sort the loop exit blocks such that:
+    //    - divergent loop exits come first
+    //    - smallest DCBI come first
+    auto const firstNonDivergent = std::partition(
+        exitBlocks.begin(), exitBlocks.end(),
+        [this](BasicBlock *BB) { return DR->isDivergent(*BB); });
+    std::sort(exitBlocks.begin(), firstNonDivergent, hasSmallerTagIndex);
+    std::sort(firstNonDivergent, exitBlocks.end(), hasSmallerTagIndex);
+
+    if (exitBlocks.empty()) {
+      LLVM_DEBUG(dbgs() << "Loop " << L->getName()
+                        << " has no loop exits eligible for rewiring.\n");
+      continue;
+    }
+
+    VECZ_FAIL_IF(!computeDivergentLoopPureExit(*LTag));
+    VECZ_FAIL_IF(!rewireDivergentLoopExitBlocks(*LTag, exitBlocks));
+
+    VECZ_FAIL_IF(!generateDivergentLoopResults(*LTag));
+    VECZ_FAIL_IF(!blendDivergentLoopLiveValues(*LTag, exitBlocks));
+    VECZ_FAIL_IF(!blendDivergentLoopExitMasks(*LTag, exitEdges, exitBlocks));
+
+    modified = true;
+  }
+
+  // Nothing was rewired, so every analysis is still up to date.
+  if (!modified) {
+    return true;
+  }
+
+  // We have modified the divergent loops into uniform ones, thus changing the
+  // dominance-compact block ordering. We need to recompute it.
+  DT->recalculate(F);
+  PDT->recalculate(F);
+  // And make sure we correctly updated the DomTrees.
+  VECZ_ERROR_IF(!DT->verify(), "DominatorTree incorrectly updated");
+  VECZ_ERROR_IF(!PDT->verify(), "PostDominatorTree incorrectly updated");
+  VECZ_FAIL_IF(!computeBlockOrdering());
+
+  RC->clear();
+
+  return true;
+}
+
+/// Create the "pure" exit block of a divergent loop and make the latch branch
+/// to it.
+///
+/// A new, empty block named `<loop>.pure_exit` is created right after the
+/// latch in the function's block list, recorded in `LTag.pureExit`, and the
+/// latch terminator is replaced by a conditional branch between the loop
+/// header and that new block.
+///
+/// @param LTag The tag of the divergent loop to process; its `pureExit`
+/// member is set by this function.
+/// @return Always true.
+bool ControlFlowConversionState::Impl::computeDivergentLoopPureExit(
+    LoopTag &LTag) {
+  LLVM_DEBUG(dbgs() << "CFC: COMPUTE PURE EXIT FOR LOOP "
+                    << LTag.loop->getName() << "\n");
+
+  auto *const latchBB = LTag.latch;
+  // The new block is placed immediately after the latch in the function.
+  BasicBlock *pureExit =
+      BasicBlock::Create(F.getContext(), LTag.loop->getName() + ".pure_exit",
+                         &F, latchBB->getNextNode());
+  BasicBlockTag &pureExitTag = DR->getOrCreateTag(pureExit);
+
+  // Set the tags.
+  // The pure exit is entered under the loop's persisted combined divergent
+  // exit mask, and 'LTag' is the outermost loop it exits.
+  auto &LMask = LoopMasks[LTag.loop];
+  MaskInfos[pureExit].entryMask = LMask.persistedCombinedDivergentExitMask;
+  pureExitTag.outermostExitedLoop = &LTag;
+
+  // If the preheader belongs to a loop, the pure exit is added to that same
+  // loop.
+  auto *const preheaderLoopTag = DR->getTag(LTag.preheader).loop;
+  if (preheaderLoopTag) {
+    pureExitTag.loop = preheaderLoopTag;
+    preheaderLoopTag->loop->addBasicBlockToLoop(pureExit, *LI);
+  }
+  DR->setFlag(*pureExit,
+              static_cast<BlockDivergenceFlag>(
+                  BlockDivergenceFlag::eBlockIsVirtualDivergentLoopExit |
+                  BlockDivergenceFlag::eBlockHasDivergentBranch |
+                  BlockDivergenceFlag::eBlockIsDivergent));
+
+  LTag.pureExit = pureExit;
+
+  LLVM_DEBUG(dbgs() << "Pure exit: " << pureExit->getName() << "\n");
+
+  if (BOSCC) {
+    BOSCC->addInRegions(pureExit, latchBB);
+  }
+
+  // The old terminator is fetched before the new branch is appended to the
+  // latch, and is only deleted once the maps have been updated.
+  auto *latchT = latchBB->getTerminator();
+#ifndef ALL_OF_DIVERGENT_LOOP_LATCH
+  // Branch back to the header under the latch-to-header exit mask, and to the
+  // pure exit otherwise.
+  Value *cond = MaskInfos[latchBB].exitMasks[LTag.header];
+  auto *newT = BranchInst::Create(LTag.header, pureExit, cond, latchBB);
+#else
+  // Exit the loop through the single divergent loop exit only if all instances
+  // that entered the loop left it.
+  ICmpInst *cond = new ICmpInst(
+      latchT, CmpInst::ICMP_EQ, LMask.persistedCombinedDivergentExitMask,
+      MaskInfos[LTag.preheader].exitMasks[LTag.header]);
+  auto *newT = BranchInst::Create(pureExit, LTag.header, cond, latchBB);
+  DR->setFlag(*latchBB, eBlockNeedsAllOfMask);
+#endif
+
+  updateMaps(latchT, newT);
+
+  IRCleanup::deleteInstructionNow(latchT);
+
+  // The edge from the latch to the pure exit carries the same mask as the
+  // pure exit's entry mask.
+  MaskInfos[latchBB].exitMasks[pureExit] =
+      LMask.persistedCombinedDivergentExitMask;
+
+  return true;
+}
+
+/// Rewire the exit blocks of a divergent loop into a cascade of conditional
+/// branches that starts at the loop's pure exit.
+///
+/// Optional exit blocks are split at their terminator: the first half keeps
+/// its incoming edge and now branches to the pure exit, while the second half
+/// becomes a target of the cascade. Divergent exit blocks lose their incoming
+/// edges from the loop and become targets of the cascade directly.
+///
+/// @param LTag The tag of the divergent loop; `LTag.pureExit` must have been
+/// set beforehand.
+/// @param exitBlocks The exit blocks to rewire, in the order in which the
+/// cascade should test them.
+/// @return true on success. Failures abort through VECZ_FAIL_IF and
+/// VECZ_ERROR_IF.
+bool ControlFlowConversionState::Impl::rewireDivergentLoopExitBlocks(
+    LoopTag &LTag, const SmallVectorImpl<BasicBlock *> &exitBlocks) {
+  LLVM_DEBUG(dbgs() << "CFC: REWIRE EXIT BLOCKS FOR LOOP "
+                    << LTag.loop->getName() << "\n");
+
+  // Removes the successor at index 'succIdx' from the terminator 'T'.
+  auto removeSuccessor = [this](Instruction *T, unsigned succIdx) {
+    switch (T->getOpcode()) {
+      default:
+        // Any other kind of Terminator cannot be handled and until
+        // proven otherwise, should not.
+        break;
+      case Instruction::Br: {
+        // Replace the branch by an unconditional one to the other successor.
+        unsigned keepIdx = succIdx == 0 ? 1 : 0;
+        auto *newT = BranchInst::Create(T->getSuccessor(keepIdx), T);
+
+        updateMaps(T, newT);
+
+        IRCleanup::deleteInstructionNow(T);
+        break;
+      }
+      case Instruction::Switch: {
+        SwitchInst *SI = cast<SwitchInst>(T);
+        if (succIdx == 0) {
+          // Successor 0 is the default destination: promote the first case
+          // to be the new default.
+          SI->setDefaultDest(SI->getSuccessor(1));
+          SI->removeCase(SI->case_begin());
+        } else {
+          SI->removeCase(std::next(SI->case_begin(), succIdx - 1));
+        }
+        break;
+      }
+      case Instruction::IndirectBr: {
+        IndirectBrInst *IBI = cast<IndirectBrInst>(T);
+        IBI->removeDestination(succIdx);
+        break;
+      }
+    }
+  };
+
+  // 'divergentLE' represents the current virtual divergent loop exit that a
+  // loop exit needs to be rewired to/from.
+  BasicBlock *divergentLE = LTag.pureExit;
+  for (unsigned idx = 0; idx < exitBlocks.size(); ++idx) {
+    BasicBlock *EB = exitBlocks[idx];
+
+    // The target of 'divergentLE'.
+    BasicBlock *target = nullptr;
+
+    // If 'EB' is optional, we split it at the terminator so that the exiting
+    // block keeps its edge towards it. The second half of 'EB' will be targeted
+    // by the cascade if.
+    if (DR->isOptional(*EB)) {
+      LLVM_DEBUG(dbgs() << "Optional loop exit " << EB->getName() << ":\n");
+
+      target =
+          EB->splitBasicBlock(EB->getTerminator(), EB->getName() + ".split");
+      auto &targetTag = DR->getOrCreateTag(target);
+
+      LLVM_DEBUG(dbgs() << "\tSplit " << EB->getName() << " into "
+                        << target->getName() << "\n");
+
+      // Set the tags.
+      // We have to be very careful copying a value from one key to another, in
+      // case one key did not exist, and constructing it caused rehashing.
+      {
+        auto EBmasks = MaskInfos[EB];
+        MaskInfos[target] = std::move(EBmasks);
+      }
+
+      auto *const EBLoopTag = DR->getTag(EB).loop;
+      if (EBLoopTag) {
+        targetTag.loop = EBLoopTag;
+        EBLoopTag->loop->addBasicBlockToLoop(target, *LI);
+      }
+
+      // If 'EB' is the preheader of a loop then 'target' takes its place.
+      for (auto *const ordered : DR->getLoopOrdering()) {
+        if (ordered->preheader == EB) {
+          LLVM_DEBUG(dbgs()
+                     << "\t" << target->getName() << " is now the preheader of "
+                     << ordered->loop->getName() << "\n");
+          ordered->preheader = target;
+        }
+      }
+
+      if (BOSCC) {
+        BOSCC->addReference(target, EB);
+        BOSCC->addInRegions(target, EB);
+      }
+      DR->setFlag(*target, DR->getFlag(*EB));
+
+      // Rewire 'EB' to the pure exit.
+      auto *const pureExit = LTag.pureExit;
+      EB->getTerminator()->setSuccessor(0, pureExit);
+
+      LLVM_DEBUG(dbgs() << "\t" << EB->getName() << " now targets "
+                        << pureExit->getName() << "\n");
+
+      // Retain branch for optional loop exits.
+      DR->clearFlag(*divergentLE,
+                    BlockDivergenceFlag::eBlockHasDivergentBranch);
+      // Set all-of mask because the first successor of 'divergentLE' is taken
+      // if no one exited from the optional loop exit.
+      DR->setFlag(*divergentLE, eBlockNeedsAllOfMask);
+
+      // 'EB' now has only one single exit edge.
+      auto &EBmasks = MaskInfos[EB];
+      EBmasks.exitMasks[pureExit] = EBmasks.entryMask;
+    } else {
+      LLVM_DEBUG(dbgs() << "Divergent loop exit " << EB->getName() << ":\n");
+
+      // Otherwise, the edge exiting-block-to-divergent-exit-block is removed ..
+      {
+        // The predecessors are collected first because removing an edge
+        // changes the predecessor list being iterated.
+        SmallPtrSet<BasicBlock *, 1> predsToRemove;
+        for (BasicBlock *pred : predecessors(EB)) {
+          auto const *const predLTag = DR->getTag(pred).loop;
+          // All predecessors of the divergent loop exit that belong in a loop
+          // contained in the outermost loop left by that exit need their
+          // edge removed.
+          if (predLTag && LTag.loop->contains(predLTag->loop)) {
+            predsToRemove.insert(pred);
+          }
+        }
+        for (BasicBlock *pred : predsToRemove) {
+          auto *predT = pred->getTerminator();
+          for (unsigned succIdx = 0; succIdx < predT->getNumSuccessors();
+               ++succIdx) {
+            if (predT->getSuccessor(succIdx) == EB) {
+              removeSuccessor(predT, succIdx);
+              LLVM_DEBUG(dbgs() << "\tRemove predecessor: " << pred->getName()
+                                << "\n");
+
+              break;
+            }
+          }
+        }
+        // Every PHI node at the top of 'EB' is turned into a select.
+        PHINode *PHI = nullptr;
+        while ((PHI = dyn_cast<PHINode>(&EB->front()))) {
+          VECZ_FAIL_IF(!generateSelectFromPHI(PHI, EB));
+        }
+      }
+
+      // ... and the exit block gets targeted by the current divergent loop
+      // exit.
+      target = EB;
+    }
+
+    VECZ_ERROR_IF(!target, "No target was found");
+
+    // If we are processing the last exit block, and it happens to be divergent
+    // there is no optional exit loop it can branch to, so create an
+    // unconditional branch.
+    if ((idx + 1 == exitBlocks.size()) && DR->isDivergent(*target)) {
+      BranchInst::Create(target, divergentLE);
+      auto &maskInfo = MaskInfos[divergentLE];
+      maskInfo.exitMasks[target] = maskInfo.entryMask;
+
+      LLVM_DEBUG(dbgs() << "\tVirtual Divergent Loop Exit "
+                        << divergentLE->getName()
+                        << ":\n\t\tSuccessor 0: " << target->getName() << "\n");
+    } else {
+      // The DCBI ordering sets the right sibling to be of an index less than
+      // the left sibling if they are on the same level of dominance. For that
+      // reason, we want to set the original loop exit as the right sibling so
+      // that the latter gets processed first while linearizing, and branches
+      // to the left sibling. We thus have to negate the condition to do so.
+      //
+      // The said condition is the entry mask of the exit block, i.e. whether
+      // any exiting block left through it.
+      auto &targetMasks = MaskInfos[target];
+      Instruction *cond = cast<Instruction>(targetMasks.entryMask);
+      // If that entry mask is defined in the loop (if the exit block has only
+      // one predecessor), then we can directly use that mask as the condition.
+      // Otherwise, we must move the latter in the pure exit so that
+      // 'divergentLE' can refer to it.
+      if (cond->getParent() == target) {
+        if (PHINode *PHI = dyn_cast<PHINode>(cond)) {
+          VECZ_FAIL_IF(!generateSelectFromPHI(PHI, target));
+          cond = cast<Instruction>(targetMasks.entryMask);
+        }
+        // Collect the condition and, transitively, every operand of it that
+        // is defined in 'target'.
+        SmallPtrSet<Instruction *, 8> toMove;
+        SmallVector<Instruction *, 8> worklist;
+        worklist.push_back(cond);
+        while (!worklist.empty()) {
+          Instruction *const next = worklist.pop_back_val();
+          if (!toMove.insert(next).second) {
+            // Already visited through another user.
+            continue;
+          }
+          for (Value *op : next->operands()) {
+            if (Instruction *opI = dyn_cast<Instruction>(op)) {
+              if (opI->getParent() == target) {
+                worklist.push_back(opI);
+              }
+            }
+          }
+        }
+        // Move the collected instructions to the top of the pure exit in
+        // their original program order. Moving them one at a time to the top
+        // while walking the operand graph can leave a definition below one
+        // of its users when an operand is shared by several of the moved
+        // instructions; keeping the original order cannot.
+        auto const insertPt = LTag.pureExit->begin();
+        for (auto it = target->begin(); it != target->end();) {
+          // Advance first: moving the instruction unlinks it from 'target'.
+          Instruction &inst = *it++;
+          if (toMove.count(&inst)) {
+            inst.moveBefore(*LTag.pureExit, insertPt);
+          }
+        }
+        // The condition itself always ends up in the pure exit, after its
+        // operands, even if it was not found in 'target' above.
+        if (cond->getParent() != LTag.pureExit) {
+          cond->moveBefore(*LTag.pureExit, insertPt);
+        }
+      }
+
+      auto *negCond = BinaryOperator::CreateNot(cond, cond->getName() + ".not",
+                                                divergentLE);
+      BasicBlock *newDivergentLE = BasicBlock::Create(
+          F.getContext(), EB->getName() + ".else", &F, EB->getNextNode());
+      BranchInst::Create(newDivergentLE, target, negCond, divergentLE);
+
+      // The divergentLE block "ought" to exist in the masks map already, but
+      // it is safer to take a local copy and retire `targetMasks`.
+      auto *const targetEntryMask = targetMasks.entryMask;
+
+      // No use of `targetMasks` after this line
+      auto &divgLEMask = MaskInfos[divergentLE];
+      divgLEMask.exitMasks[target] = targetEntryMask;
+      divgLEMask.exitMasks[newDivergentLE] = negCond;
+
+      LLVM_DEBUG(dbgs() << "\tCreate new virtual divergent loop exit "
+                        << newDivergentLE->getName() << "\n");
+
+      LLVM_DEBUG(
+          dbgs() << "\tVirtual Divergent Loop Exit " << divergentLE->getName()
+                 << ":\n\t\tSuccessor 0: " << target->getName()
+                 << "\n\t\tSuccessor 1: " << newDivergentLE->getName() << "\n");
+
+      auto &newDivergentLETag = DR->getOrCreateTag(newDivergentLE);
+
+      // Set the tags.
+      MaskInfos[newDivergentLE].entryMask = negCond;
+      if (auto *const divLoopTag = DR->getTag(divergentLE).loop) {
+        newDivergentLETag.loop = divLoopTag;
+        newDivergentLETag.loop->loop->addBasicBlockToLoop(newDivergentLE, *LI);
+      }
+
+      // The new virtual exit inherits the flags of the current one.
+      DR->setFlag(*newDivergentLE,
+                  static_cast<BlockDivergenceFlag>(
+                      DR->getFlag(*divergentLE) |
+                      BlockDivergenceFlag::eBlockIsVirtualDivergentLoopExit |
+                      BlockDivergenceFlag::eBlockHasDivergentBranch |
+                      BlockDivergenceFlag::eBlockIsDivergent));
+
+      if (BOSCC) {
+        BOSCC->addInRegions(newDivergentLE, LTag.latch);
+      }
+
+      // The next exit block is tested from the block just created.
+      divergentLE = newDivergentLE;
+    }
+  }
+
+  return true;
+}
+
+/// Create, for every loop live value of a divergent loop, the instructions
+/// that carry its value across iterations.
+///
+/// Each loop live value gets a `.prev` PHI node in the loop header, recorded
+/// in `LTag.loopResultPrevs`, followed by its update instructions, which are
+/// created by generateDivergentLoopResultUpdates. When BOSCC provides a
+/// uniform version of the loop, the `.prev` PHI nodes are also cloned into
+/// the header of that uniform loop.
+///
+/// @param LTag The tag of the divergent loop to process.
+/// @return true on success. Failures abort through VECZ_FAIL_IF.
+bool ControlFlowConversionState::Impl::generateDivergentLoopResults(
+    LoopTag &LTag) {
+  LLVM_DEBUG(dbgs() << "CFC: GENERATE DIVERGENT LOOP RESULTS FOR "
+                    << LTag.loop->getName() << "\n");
+
+  // Step 1: a PHI node per loop live value, holding the value of the last
+  // iteration.
+  IRBuilder<> B(getInsertionPt(*LTag.header));
+  for (Value *LLV : LTag.loopLiveValues) {
+    PHINode *const prevPHI =
+        B.CreatePHI(LLV->getType(), 2, LLV->getName() + ".prev");
+    LTag.loopResultPrevs[LLV] = prevPHI;
+    LLVM_DEBUG(dbgs() << "Create result phi: " << prevPHI->getName() << "\n");
+  }
+
+  // Step 2: the instructions that retrieve the updated value in the current
+  // iteration. All the PHI nodes must exist before this step starts.
+  for (Value *LLV : LTag.loopLiveValues) {
+    VECZ_FAIL_IF(!generateDivergentLoopResultUpdates(LLV, LTag));
+  }
+
+  // Step 3 only applies when BOSCC has a uniform version of this loop.
+  if (!BOSCC) {
+    return true;
+  }
+  Loop *const uniformL = BOSCC->getLoop(LTag.loop);
+  if (!uniformL) {
+    return true;
+  }
+
+  // Clone the loop live values update instructions in the uniform version.
+  auto *const uniformHeader = DR->getTag(uniformL).header;
+  for (Value *LLV : LTag.loopLiveValues) {
+    BOSCC->addReference(LTag.loopResultUpdates[LLV], LLV);
+
+    // We only need to clone the value of the previous iteration; in the
+    // clone, the second incoming value is the loop live value itself.
+    PHINode *const divergentPrev = LTag.loopResultPrevs[LLV];
+    PHINode *const uniformPrev = cast<PHINode>(divergentPrev->clone());
+    uniformPrev->setIncomingValue(1, LLV);
+    uniformPrev->insertBefore(getInsertionPt(*uniformHeader));
+
+    BOSCC->createReference(divergentPrev, uniformPrev, true);
+  }
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::generateDivergentLoopResultUpdates(
+    Value *LLV, LoopTag &LTag) {
+  Value *const mask = LoopMasks[LTag.loop].combinedDivergentExitMask;
+  VECZ_ERROR_IF(!mask, "Divergent loop does not have an exit mask");
+  // In the latch, take LLV where the mask is set and the old value elsewhere.
+  PHINode *const prevPHI = LTag.loopResultPrevs[LLV];
+  SelectInst *const update =
+      SelectInst::Create(mask, LLV, prevPHI, LLV->getName() + ".update",
+                         LTag.latch->getTerminator());
+  LTag.loopResultUpdates[LLV] = update;
+
+  // Preheader operand: default value if no parent loop persists LLV, LLV if
+  // its (non-header) defining block reaches the header, else the parent's phi.
+  Value *initial = nullptr;
+  auto *const ParentL = LTag.loop->getParentLoop();
+  auto *const ParentLT = ParentL ? &DR->getTag(ParentL) : nullptr;
+  if (!ParentLT || !ParentLT->loopResultPrevs.count(LLV)) {
+    initial = getDefaultValue(prevPHI->getType());
+  } else {
+    BasicBlock *const defBlock = cast<Instruction>(LLV)->getParent();
+    if (defBlock != LTag.header && DR->isReachable(defBlock, LTag.header)) {
+      initial = LLV;
+    } else {
+      initial = ParentLT->loopResultPrevs[LLV];
+    }
+  }
+  prevPHI->addIncoming(initial, LTag.preheader);
+
+  LLVM_DEBUG(dbgs() << "Create result update: " << *update << "\n");
+  // The latch then feeds the updated value back as the second operand.
+  prevPHI->addIncoming(update, LTag.latch);
+  LLVM_DEBUG(dbgs() << "Update result phi: " << *prevPHI << "\n");
+
+  return true;
+}
+
+// Creates, in the pure exit block of the divergent loop, one blend phi per
+// loop live value and redirects the uses outside the loop to that phi.
+bool ControlFlowConversionState::Impl::blendDivergentLoopLiveValues(
+    LoopTag &LTag, const SmallVectorImpl<BasicBlock *> &exitBlocks) {
+  LLVM_DEBUG(dbgs() << "CFC: BLEND DIVERGENT LOOP LIVE VALUES FOR "
+                    << LTag.loop->getName() << "\n");
+
+  // Collect the exit blocks the loop still has and drop the pure exit from
+  // that list.
+  SmallVector<BasicBlock *, 1> optionalExitBlocks;
+  LTag.loop->getExitBlocks(optionalExitBlocks);
+  const auto pureExitIt = std::find(optionalExitBlocks.begin(),
+                                    optionalExitBlocks.end(), LTag.pureExit);
+  if (pureExitIt != optionalExitBlocks.end()) {
+    (void)optionalExitBlocks.erase(pureExitIt);
+  }
+
+  for (Value *LLV : LTag.loopLiveValues) {
+    BasicBlock *const defBlock = cast<Instruction>(LLV)->getParent();
+    PHINode *const prev = LTag.loopResultPrevs[LLV];
+    SelectInst *const update = LTag.loopResultUpdates[LLV];
+
+    VECZ_ERROR_IF(
+        !update,
+        "Divergent loop live value does not have an update instruction");
+    VECZ_ERROR_IF(
+        !prev, "Divergent loop live value does not have a persist instruction");
+
+    // The blend phi is placed at the very top of the pure exit block.
+    PHINode *const blend = PHINode::Create(
+        LLV->getType(), 2, LLV->getName() + ".blend", &LTag.pureExit->front());
+
+    // Redirect the uses of LLV outside the loop to the blend.
+    VECZ_FAIL_IF(
+        !replaceUsesOutsideDivergentLoop(LTag, LLV, blend, optionalExitBlocks));
+
+    // Each optional exit contributes LLV when LLV's defining block reaches it
+    // and the persisted value otherwise; the latch contributes the update.
+    for (BasicBlock *EB : exitBlocks) {
+      if (!DR->isOptional(*EB)) {
+        continue;
+      }
+      blend->addIncoming(DR->isReachable(defBlock, EB) ? LLV : prev, EB);
+    }
+    blend->addIncoming(update, LTag.latch);
+
+    if (BOSCC) {
+      BOSCC->addReference(blend, update);
+    }
+
+    LLVM_DEBUG(dbgs() << "Create blend " << *blend << " for LLV " << *LLV
+                      << "\n");
+  }
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::blendDivergentLoopExitMasks(
+    LoopTag &LTag, const SmallVectorImpl<Loop::Edge> &exitEdges,
+    const SmallVectorImpl<BasicBlock *> &exitBlocks) {
+  LLVM_DEBUG(dbgs() << "CFC: BLEND DIVERGENT LOOP EXIT MASKS FOR "
+                    << LTag.loop->getName() << "\n");
+
+  // Collect the exit blocks the loop still has and drop the pure exit from
+  // that list.
+  SmallVector<BasicBlock *, 1> optionalExitBlocks;
+  LTag.loop->getExitBlocks(optionalExitBlocks);
+  const auto pureExitIt = std::find(optionalExitBlocks.begin(),
+                                    optionalExitBlocks.end(), LTag.pureExit);
+  if (pureExitIt != optionalExitBlocks.end()) {
+    (void)optionalExitBlocks.erase(pureExitIt);
+  }
+
+  auto &LMask = LoopMasks[LTag.loop];
+  for (const Loop::Edge &EE : exitEdges) {
+    BasicBlock *const exitingBlock = const_cast<BasicBlock *>(EE.first);
+    BasicBlock *const exitBlock = const_cast<BasicBlock *>(EE.second);
+    if (!DR->isDivergent(*exitBlock)) {
+      continue;
+    }
+
+    PHINode *const prev = LMask.persistedDivergentExitMasks[exitingBlock];
+    BinaryOperator *const update =
+        LMask.updatedPersistedDivergentExitMasks[exitingBlock];
+
+    VECZ_ERROR_IF(
+        !update,
+        "Divergent loop exit mask does not have an update instruction");
+    VECZ_ERROR_IF(
+        !prev,
+        "Divergent loop exit mask does not have a persist instruction");
+
+    PHINode *const blend =
+        PHINode::Create(prev->getType(), 2, prev->getName() + ".blend",
+                        &LTag.pureExit->front());
+
+    // Redirect the uses of the updated mask outside the loop to the blend.
+    VECZ_FAIL_IF(!replaceUsesOutsideDivergentLoop(LTag, update, blend,
+                                                  optionalExitBlocks));
+
+    // Optional exits forward the persisted mask; the latch, the updated one.
+    for (BasicBlock *EB : exitBlocks) {
+      if (DR->isOptional(*EB)) {
+        blend->addIncoming(prev, EB);
+      }
+    }
+    blend->addIncoming(update, LTag.latch);
+
+    if (BOSCC) {
+      BOSCC->addReference(blend, update);
+    }
+
+    LLVM_DEBUG(dbgs() << "Create blend " << *blend << " for loop exit mask "
+                      << *update << "\n");
+  }
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::replaceUsesOutsideDivergentLoop(
+    LoopTag &LTag, Value *from, Value *to,
+    const SmallVectorImpl<BasicBlock *> &exitBlocks) {
+  auto useIt = from->use_begin();
+  while (useIt != from->use_end()) {
+    // Advance before handling the use, since it may be rewritten below.
+    Use &U = *useIt++;
+    Instruction *const user = cast<Instruction>(U.getUser());
+    BasicBlock *const blockUse = user->getParent();
+    // Keep uses inside the loop or in an exit block ('to' can't reach those).
+    if (LTag.loop->contains(blockUse) ||
+        std::find(exitBlocks.begin(), exitBlocks.end(), blockUse) !=
+            exitBlocks.end()) {
+      continue;
+    }
+    // A phi in the pure exit block of a divergent loop keeps any operand that
+    // comes in from outside that loop, i.e. from an optional exit block.
+    if (PHINode *PHI = dyn_cast<PHINode>(user)) {
+      auto const *const exitedLoop = DR->getTag(blockUse).outermostExitedLoop;
+      if (exitedLoop && exitedLoop->pureExit == blockUse &&
+          !exitedLoop->loop->contains(PHI->getIncomingBlock(U))) {
+        continue;
+      }
+    }
+    U.set(to);
+    LLVM_DEBUG(dbgs() << "Replace loop value " << *from << " with blend "
+                      << to->getName() << "\n");
+  }
+
+  return true;
+}
+
+namespace {
+using DenseDeferralMap =
+    SmallDenseMap<BasicBlock *, SmallPtrSet<BasicBlock *, 2>, 32>;
+
+// Records the deferral edge (newSrc, deferred) unless that edge, or the
+// reverse edge (deferred, newSrc), is already present in 'deferrals'.
+void addDeferral(BasicBlock *newSrc, BasicBlock *deferred,
+                 DenseDeferralMap &deferrals) {
+  // Nothing to do when the very same edge has been recorded before.
+  auto const fwdIt = deferrals.find(newSrc);
+  if (fwdIt != deferrals.end() && fwdIt->second.count(deferred)) {
+    LLVM_DEBUG(dbgs() << "\t\tDeferral (" << newSrc->getName() << ", "
+                      << deferred->getName() << ") already exists\n");
+    return;
+  }
+
+  // Refuse the edge when its opposite exists: having both directions risks
+  // creating an infinite loop in the CFG.
+  auto const revIt = deferrals.find(deferred);
+  if (revIt != deferrals.end() && revIt->second.count(newSrc)) {
+    LLVM_DEBUG(dbgs() << "\t\tOpposite deferral (" << deferred->getName()
+                      << ", " << newSrc->getName() << ") already exists\n");
+    return;
+  }
+
+  deferrals[newSrc].insert(deferred);
+
+  LLVM_DEBUG(dbgs() << "\t\tAdd deferral (" << newSrc->getName() << ", "
+                    << deferred->getName() << ")\n");
+}
+
+// Drops every deferral edge whose source is 'src'.
+void removeDeferrals(BasicBlock *src, DenseDeferralMap &deferrals) {
+  auto const srcIt = deferrals.find(src);
+  if (srcIt == deferrals.end()) {
+    return;
+  }
+#ifndef NDEBUG
+  for (BasicBlock *deferred : srcIt->second) {
+    LLVM_DEBUG(dbgs() << "\tRemove deferral (" << src->getName() << ", "
+                      << deferred->getName() << ")\n");
+  }
+#endif
+  deferrals.erase(srcIt);
+}
+}  // namespace
+
+bool ControlFlowConversionState::Impl::computeNewTargets(Linearization &lin) {
+  // Fills 'lin' with each block's new targets. The entry is never a target.
+  auto const &DCBI = DR->getBlockOrdering();
+  size_t const numBlocks = DCBI.size();
+  DenseSet<BasicBlock *> targets(numBlocks - 1);  // blocks not yet targeted
+  for (auto const &tag : make_range(std::next(DCBI.begin()), DCBI.end())) {
+    targets.insert(tag.BB);
+  }
+  // Pending deferral edges: source block -> set of deferred blocks.
+  DenseDeferralMap deferrals;
+
+  LLVM_DEBUG(dbgs() << "CFC: COMPUTE NEW TARGETS\n");
+
+  // For each basic block, select its new targets based on previous blocks that
+  // have been deferred because of divergence, and their current successors.
+  // Select the target that has the lowest DCBI, i.e. the block whose dominance
+  // encompasses or is equal to the other available targets.
+  //
+  // If we assign a target different from the current successor of the block,
+  // we must add a deferral edge from the selected target to the current
+  // successor (that got replaced by the selected target) such that an edge
+  // from the current block to the replaced successor exists in the modified
+  // graph.
+  lin.infos.reserve(numBlocks);
+  lin.data.reserve(numBlocks);
+  for (size_t BBIndex = 0; BBIndex != numBlocks; ++BBIndex) {
+    auto const &BBTag = DR->getBlockTag(BBIndex);
+    BasicBlock *BB = BBTag.BB;
+    lin.beginBlock(BB);
+
+    LLVM_DEBUG(dbgs() << "BB " << BB->getName() << ":\n");
+
+    // Retrieve the rewire list for 'BB': the blocks deferred onto it so far.
+    SmallPtrSet<BasicBlock *, 8> availableTargets;
+    {
+      auto deferredIt = deferrals.find(BB);
+      if (deferredIt != deferrals.end()) {
+        for (BasicBlock *deferred : deferredIt->second) {
+          availableTargets.insert(deferred);
+        }
+      }
+    }
+
+    if (!DR->isDivCausing(*BB) ||
+        // Loop latches must have their branch retained.
+        (BBTag.loop && BBTag.loop->latch == BB)) {
+      // If 'BB' ends in a uniform branch.
+      LLVM_DEBUG(dbgs() << "  uniform branch\n");
+
+      // Keep track of what blocks we have targeted in case we have a deferred
+      // block that is a current successor (which could lead to choosing the
+      // same block twice!).
+      SmallPtrSet<BasicBlock *, 8> targeted;
+      // One target is picked per successor, lowest block index first.
+      for (BasicBlock *succ : successors(BB)) {
+        size_t nextIndex = ~size_t(0);  // ~size_t(0) means "none found yet"
+        for (BasicBlock *deferred : availableTargets) {
+          if (targeted.count(deferred)) {
+            continue;
+          }
+
+          size_t const deferredIndex = DR->getTagIndex(deferred);
+          if (nextIndex == ~size_t(0) || nextIndex > deferredIndex) {
+            nextIndex = deferredIndex;
+          }
+        }
+        // The successor itself is a candidate too, unless already targeted.
+        size_t const succIndex = DR->getTagIndex(succ);
+        if (!targeted.count(succ)) {
+          // If we have not found a target or there is a better one.
+          if (nextIndex == ~size_t(0) || nextIndex > succIndex) {
+            nextIndex = succIndex;
+          }
+        }
+
+        VECZ_ERROR_IF(nextIndex == ~size_t(0), "No target was found");
+
+        auto *const next = DR->getBlockTag(nextIndex).BB;
+        lin.push(next);
+        targeted.insert(next);
+
+        LLVM_DEBUG(dbgs() << "\tsuccessor " << lin.currentSize() - 1 << ": "
+                          << next->getName() << "\n");
+
+        // Virtually remove backedges.
+        if (!BBTag.isLoopBackEdge(next)) {
+          targets.erase(next);
+          // Don't add deferred edges to blocks already processed.
+          if (BBIndex < nextIndex) {
+            auto S = availableTargets;
+            S.insert(succ);
+
+            for (BasicBlock *deferred : S) {
+              if (deferred != next) {
+                addDeferral(next, deferred, deferrals);
+              }
+            }
+          }
+        }
+      }
+    } else {
+      LLVM_DEBUG(dbgs() << "  divergent branch\n");
+
+      for (BasicBlock *succ : successors(BB)) {
+        availableTargets.insert(succ);
+      }
+      // Of all candidates, keep only the one with the lowest block index.
+      size_t nextIndex = ~size_t(0);  // ~size_t(0) means "none found yet"
+      for (BasicBlock *deferred : availableTargets) {
+        size_t const deferredIndex = DR->getTagIndex(deferred);
+        if (nextIndex == ~size_t(0) || nextIndex > deferredIndex) {
+          LLVM_DEBUG(dbgs()
+                     << (nextIndex == ~size_t(0)
+                             ? "\tchoosing successor: "
+                             : "\tpreferring instead successor: ")
+                     << DR->getBlockTag(deferredIndex).BB->getName() << "\n");
+          nextIndex = deferredIndex;
+        }
+      }
+
+      VECZ_ERROR_IF(nextIndex == ~size_t(0), "No target was found");
+
+      BasicBlock *const next = DR->getBlockTag(nextIndex).BB;
+      lin.push(next);
+
+      // The last eBlockIsVirtualDivergentLoopExit introduced from an optional
+      // loop exit wasn't given a block to branch to, so it has no terminator.
+      if (DR->hasFlag(*BB,
+                      BlockDivergenceFlag::eBlockIsVirtualDivergentLoopExit) &&
+          !BB->getTerminator()) {
+        BranchInst::Create(next, BB);
+      }
+
+      LLVM_DEBUG(dbgs() << "\tsuccessor 0: " << next->getName() << "\n");
+
+      // Virtually remove backedges.
+      if (!BBTag.isLoopBackEdge(next)) {
+        targets.erase(next);
+        // Don't add deferred edges to blocks already processed.
+        if (BBIndex < nextIndex) {
+          for (BasicBlock *deferred : availableTargets) {
+            if (deferred != next) {
+              addDeferral(next, deferred, deferrals);
+            }
+          }
+        }
+      }
+    }
+
+    // Remove the deferrals that involved 'BB'.
+    removeDeferrals(BB, deferrals);
+
+    // clang-format off
+    LLVM_DEBUG(
+        dbgs() << "  deferral list:";
+        if (deferrals.empty()) {
+          dbgs() << " (empty)\n";
+        } else {
+          dbgs() << "\n";
+          for (const auto &pair : deferrals) {
+            for (BasicBlock *deferred : pair.second) {
+              LLVM_DEBUG(dbgs() << "\t(" << pair.first->getName() << ", "
+                                << deferred->getName() << ")\n");
+            }
+          }
+        }
+    );
+    // clang-format on
+  }
+
+  // There shouldn't remain any deferral edges.
+  VECZ_ERROR_IF(!deferrals.empty(), "Deferrals remain");
+  // All blocks should have been targeted at least once.
+  VECZ_ERROR_IF(!targets.empty(), "Not all blocks have been rewired");
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::linearizeCFG() {
+  LLVM_DEBUG(dbgs() << "CFC: LINEARIZE\n");
+
+  // Compute the new targets ...
+  Linearization lin;
+  VECZ_FAIL_IF(!computeNewTargets(lin));
+  // ... then apply them; 'lin.data' lists every block's targets in turn.
+  auto dataIt = lin.data.begin();
+  for (auto const &newTargetInfo : lin.infos) {
+    BasicBlock *BB = newTargetInfo.BB;
+
+    // Get the new target info for this block
+    auto const numTargets = newTargetInfo.numTargets;
+    auto const newTargets = dataIt;
+    dataIt += numTargets;
+
+    LLVM_DEBUG(dbgs() << BB->getName() << ":\n");
+
+    auto *T = BB->getTerminator();
+
+    // If we have set a new target that is already a successor of BB, but we
+    // have not set it at the same successor's position, then do it!
+    // This avoids having to update the phi nodes.
+    SmallDenseMap<BasicBlock *, unsigned, 2> successors;
+    for (unsigned idx = 0; idx < T->getNumSuccessors(); ++idx) {
+      BasicBlock *succ = T->getSuccessor(idx);
+      successors.try_emplace(succ, idx);
+    }
+    for (unsigned idx = 0; idx < numTargets; ++idx) {
+      auto succIt = successors.find(newTargets[idx]);
+      // If we have a successor set as a new target ...
+      if (succIt != successors.end()) {
+        // ... but we have not set it at the same position ...
+        if (succIt->second != idx && succIt->second < numTargets) {
+          // .. then swap both blocks.
+          std::swap(newTargets[idx], newTargets[succIt->second]);
+        }
+      }
+    }
+
+    // Now iterate over the new targets to set them as successors of BB if
+    // they were not already.
+    unsigned idx = 0;
+    for (; idx < numTargets; ++idx) {
+      BasicBlock *const newTarget = newTargets[idx];
+
+      VECZ_ERROR_IF(
+          idx >= T->getNumSuccessors(),
+          "BasicBlock should not have more successors after linearization");
+
+      BasicBlock *oldSucc = T->getSuccessor(idx);
+
+      LLVM_DEBUG(dbgs() << "\tOld successor: " << oldSucc->getName() << "\n");
+
+      // If we have set the current successor to be the new target, there is
+      // nothing to do.
+      if (oldSucc == newTarget) {
+        LLVM_DEBUG(dbgs() << "\tUntouched successor: " << oldSucc->getName()
+                          << "\n");
+        continue;
+      }
+
+      // Uniform blocks should not be rewired.
+      VECZ_ERROR_IF(DR->isUniform(*oldSucc),
+                    "Uniform BasicBlock should not have its edge modified");
+
+      // Otherwise update the successor.
+      T->setSuccessor(idx, newTarget);
+      LLVM_DEBUG(dbgs() << "\tAdd successor: " << newTarget->getName() << "\n");
+    }
+
+    // We have either processed a divergent branch (with only one successor), or
+    // we have processed a uniform branch (with all its successors untouched).
+    VECZ_ERROR_IF(idx != 1 && idx != T->getNumSuccessors(),
+                  "Number of processed new targets is undefined");
+
+    // Finally, clear the remaining successors that have not been set as new
+    // targets.
+    if (idx != T->getNumSuccessors()) {
+      for (; idx < T->getNumSuccessors(); ++idx) {
+        BasicBlock *succ = T->getSuccessor(idx);
+
+        // Uniform blocks should not be rewired.
+        VECZ_ERROR_IF(DR->isUniform(*succ),
+                      "Uniform BasicBlock should not have its edge modified");
+
+        LLVM_DEBUG(dbgs() << "\tRemove successor: " << succ->getName() << "\n");
+      }
+      // Replace the terminator with an unconditional branch to successor 0.
+      auto *newT = BranchInst::Create(T->getSuccessor(0), T);
+
+      updateMaps(T, newT);
+
+      IRCleanup::deleteInstructionNow(T);
+    }
+  }
+  assert(dataIt == lin.data.end() &&
+         "Failed to reach end of Linearization data vector!");
+
+  // Updating on-the-fly the DomTree and PostDomTree whilst rewiring the CFG
+  // is extremely tedious, and may not even be possible due to all the invalid
+  // states that happen during it ... Therefore, we have no choice but to
+  // recalculate the DomTree and PostDomTree from scratch.
+  DT->recalculate(F);
+  PDT->recalculate(F);
+  VECZ_ERROR_IF(!DT->verify(), "DominatorTree incorrectly updated");
+  VECZ_ERROR_IF(!PDT->verify(), "PostDominatorTree incorrectly updated");
+  VECZ_FAIL_IF(!computeBlockOrdering());
+  RC->clear();
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::generateSelects() {
+  LLVM_DEBUG(dbgs() << "CFC: GENERATE SELECTS FROM PHI NODES\n");
+  // For each basic block that starts with phi nodes and either has only one
+  // predecessor or is a blend block, we need to either blend those phi nodes
+  // into select instructions or move them up the chain of linearized path.
+  for (auto const &BTag : DR->getBlockOrdering()) {
+    BasicBlock *B = BTag.BB;
+    if (B->hasNPredecessors(1) || DR->isBlend(*B)) {
+      if (PHINode *PHI = dyn_cast<PHINode>(&B->front())) {
+        LLVM_DEBUG(dbgs() << B->getName() << ":\n");
+        SmallPtrSet<BasicBlock *, 2> incomings(PHI->block_begin(),
+                                               PHI->block_end());
+        BasicBlock *cur = B;  // walks up the single-predecessor chain
+        while (cur->hasNPredecessors(1) && !incomings.empty()) {
+          cur = cur->getSinglePredecessor();
+          if (incomings.count(cur)) {
+            break;
+          }
+        }
+        // Only move the phis up the chain of linearized path:
+        // - if the block whose phis we are processing is not a blend block
+        //   (because the latter do need to have its phis transformed into
+        //   selects),
+        // - if the last block of the chain is not an incoming block, and
+        // - if the last block of the chain is a convergence block.
+        if (!DR->isBlend(*B) && !incomings.count(cur) &&
+            cur->hasNPredecessorsOrMore(2) && PHI->getNumIncomingValues() > 1) {
+          // All PHI nodes have the same incoming blocks so we update the exit
+          // masks of the incoming blocks of the first PHI node here.
+          for (unsigned i = 0; i < PHI->getNumIncomingValues(); ++i) {
+            auto &maskInfo = MaskInfos[PHI->getIncomingBlock(i)];
+            Value *&exitMask = maskInfo.exitMasks[cur];
+
+            if (!exitMask) {
+              exitMask = maskInfo.exitMasks[B];
+            }
+          }
+          // Hoist every leading phi of 'B' to the start of 'cur'.
+          while ((PHI = dyn_cast<PHINode>(&B->front()))) {
+            LLVM_DEBUG(dbgs() << "\tMove " << *PHI << " in " << cur->getName()
+                              << "\n");
+            PHI->moveBefore(*cur, cur->begin());
+          }
+        } else {
+          while ((PHI = dyn_cast<PHINode>(&B->front()))) {
+            VECZ_FAIL_IF(!generateSelectFromPHI(PHI, B));
+          }
+        }
+      }
+    }
+  }
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::generateSelectFromPHI(PHINode *PHI,
+                                                             BasicBlock *B) {
+  const unsigned phiNumIncVals = PHI->getNumIncomingValues();
+  VECZ_ERROR_IF(phiNumIncVals == 0, "PHINode does not have any incoming value");
+  // 'newVal' is what replaces every use of 'PHI' before the phi is deleted.
+  Value *newVal = nullptr;
+  auto &maskInfo = MaskInfos[B];
+  if (PHI == maskInfo.entryMask) {
+    // The entry mask of a blend value should be the disjunction (OR) of the
+    // incoming masks, so change it.
+    maskInfo.entryMask = copyEntryMask(PHI->getIncomingValue(0), *B);
+    for (unsigned i = 1; i < phiNumIncVals; i++) {
+      Value *V = PHI->getIncomingValue(i);
+      Instruction *insertBefore =
+          cast<Instruction>(maskInfo.entryMask)->getNextNode();
+      maskInfo.entryMask = BinaryOperator::CreateOr(
+          maskInfo.entryMask, V, B->getName() + ".entry_mask", insertBefore);
+    }
+    newVal = maskInfo.entryMask;
+  } else {
+    Value *select = PHI->getIncomingValue(0);  // chain starts from operand 0
+    for (unsigned i = 1; i < phiNumIncVals; i++) {
+      Value *V = PHI->getIncomingValue(i);
+      BasicBlock *PHIB = PHI->getIncomingBlock(i);
+      Value *cond = MaskInfos[PHIB].exitMasks[B];
+      VECZ_ERROR_IF(!cond, "Exit mask does not exist");
+      // First select: after its mask if defined in B. Others: after the last.
+      Instruction *insertBefore = &*B->getFirstInsertionPt();
+      if (i == 1) {
+        if (Instruction *condI = dyn_cast<Instruction>(cond)) {
+          BasicBlock *maskParent = condI->getParent();
+          if (maskParent == B) {
+            insertBefore = condI->getNextNode();
+          }
+        }
+      } else {
+        insertBefore = cast<Instruction>(select)->getNextNode();
+      }
+      select = SelectInst::Create(cond, V, select, PHI->getName() + ".blend",
+                                  insertBefore);
+    }
+    newVal = select;
+  }
+
+  LLVM_DEBUG(dbgs() << "\tReplace " << *PHI << " with " << *newVal << "\n");
+
+  updateMaps(PHI, newVal);
+
+  PHI->replaceAllUsesWith(newVal);
+
+  IRCleanup::deleteInstructionNow(PHI);
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::repairSSA() {
+  // Check that all the blocks have a unique position, then update 'RC'.
+  VECZ_FAIL_IF(!checkBlocksOrder());
+  RC->update(F);
+  // Fix up the phi incoming blocks first, then blend the instructions.
+  VECZ_FAIL_IF(!updatePHIsIncomings());
+  VECZ_FAIL_IF(!blendInstructions());
+  // Both trees must still verify once the repair is done.
+  VECZ_ERROR_IF(!DT->verify(), "DominatorTree incorrectly updated");
+  VECZ_ERROR_IF(!PDT->verify(), "PostDominatorTree incorrectly updated");
+
+  return true;
+}
+
+// Brings the phi nodes of every block back in sync with the block's current
+// predecessors. Incoming values from blocks that are no longer predecessors
+// are folded into selects (ORs when the phi is the block's entry mask), and
+// predecessors the phi does not know yet get a default incoming value.
+// Returns true on success.
+bool ControlFlowConversionState::Impl::updatePHIsIncomings() {
+  // We need to update the incoming blocks of phi nodes whose predecessors may
+  // have changed since we have not changed the phi nodes during the rewiring.
+  for (auto const &BBTag : DR->getBlockOrdering()) {
+    BasicBlock *BB = BBTag.BB;
+    // Snapshot of the current predecessors of 'BB'.
+    SmallPtrSet<BasicBlock *, 4> preds(pred_begin(BB), pred_end(BB));
+    for (auto it = BB->begin(); it != BB->end();) {
+      // Advance first: the phi may be deleted by generateSelectFromPHI().
+      Instruction &I = *it++;
+      PHINode *PHI = dyn_cast<PHINode>(&I);
+      if (!PHI) {
+        break;
+      }
+
+      // Snapshot of the phi's incoming blocks, taken before any is removed.
+      SmallPtrSet<BasicBlock *, 4> incomings(PHI->block_begin(),
+                                             PHI->block_end());
+
+      // If no predecessors of `BB` is an incoming block of its PHI Node, then
+      // completely transform the PHI Node into multiple select instructions.
+      bool intersect = false;
+      for (BasicBlock *inc : incomings) {
+        if (preds.count(inc)) {
+          intersect = true;
+          break;
+        }
+      }
+      if (!intersect) {
+        VECZ_FAIL_IF(!generateSelectFromPHI(PHI, BB));
+        continue;
+      }
+      // Otherwise, only transform the incoming blocks of predecessors that got
+      // linearized into selects.
+      //
+      // Instruction that will combine the phi node and the select instructions
+      // created from it if some incoming blocks are no longer predecessors.
+      Instruction *newBlend = nullptr;
+      Instruction *insertBefore = getInsertionPt(*BB);
+
+      // An entry-mask phi is combined with ORs rather than with selects.
+      auto &maskInfo = MaskInfos[BB];
+      const bool isEntryMask = PHI == maskInfo.entryMask;
+      for (unsigned idx = 0; idx < PHI->getNumIncomingValues(); ++idx) {
+        BasicBlock *incoming = PHI->getIncomingBlock(idx);
+        if (preds.count(incoming)) {
+          continue;
+        }
+        // If the incoming block is no longer a predecessor, transform it into
+        // a select instruction, or a binary OR if it is an entry mask.
+        Value *V = PHI->getIncomingValue(idx);
+
+        // Chain onto the blend built so far; the very first one starts from
+        // the phi itself.
+        Instruction *const chained = newBlend ? newBlend : PHI;
+        if (isEntryMask) {
+          // The entry mask of a blend value should be the disjunction (OR) of
+          // the incoming masks, so change it.
+          newBlend = BinaryOperator::CreateOr(
+              chained, V, BB->getName() + ".entry_mask", insertBefore);
+          maskInfo.entryMask = newBlend;
+        } else {
+          Value *cond = MaskInfos[incoming].exitMasks[BB];
+          VECZ_ERROR_IF(!cond, "Exit mask does not exist");
+          newBlend = SelectInst::Create(
+              cond, V, chained, PHI->getName() + ".blend", insertBefore);
+        }
+        // Step 'idx' back so that the operand which now occupies this slot is
+        // examined by the next iteration (wrapping at 0 is undone by ++idx).
+        PHI->removeIncomingValue(idx--);
+      }
+
+      // If we have created select instructions from `PHI`, update the users
+      // of the latter.
+      if (newBlend) {
+        VECZ_FAIL_IF(!replaceReachableUses(*RC, PHI, newBlend, BB));
+        updateMaps(PHI, newBlend);
+      }
+
+      // And add any new incoming blocks that do not replace any previous.
+      // 'incomings' still holds the blocks the phi had before the removals.
+      for (BasicBlock *pred : preds) {
+        if (!incomings.count(pred)) {
+          PHI->addIncoming(getDefaultValue(PHI->getType()), pred);
+        }
+      }
+    }
+  }
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::blendInstructions() {
+  LLVM_DEBUG(dbgs() << "CFC: BLEND INSTRUCTIONS\n");
+
+  auto addSuccessors = [this](BasicBlockTag const &BTag, BlockQueue &queue,
+                              DenseSet<BasicBlock *> &visited,
+                              BasicBlockTag const &dstTag) {
+    for (BasicBlock *succ : successors(BTag.BB)) {
+      // Allow latch if 'succ' belongs in 'dst's loop and 'dst' is the header
+      // of that loop.
+      const bool allowLatch =
+          dstTag.isLoopHeader() && dstTag.loop->loop->contains(succ);
+
+      if (!allowLatch && BTag.isLoopBackEdge(succ)) {
+        continue;
+      }
+
+      if (allowLatch) {
+        // the fast Reachability calculation can't follow back edges yet
+        if (!DR->isReachable(succ, dstTag.BB, allowLatch)) {
+          continue;
+        }
+      } else if (!RC->isReachable(succ, dstTag.BB)) {
+        continue;
+      }
+
+      if (visited.insert(succ).second) {
+        LLVM_DEBUG(dbgs() << "\t\t\tInsert " << succ->getName()
+                          << " in the queue\n");
+        queue.push(DR->getTagIndex(succ));
+      }
+    }
+
+    // clang-format off
+    LLVM_DEBUG(
+        dbgs() << "\t\t\tWorklist: [";
+        if (!queue.empty()) {
+          dbgs() << DR->getBlockTag(*queue.begin()).BB->getName();
+          for (auto it = std::next(queue.begin()); it != queue.end(); ++it) {
+            dbgs() << ", " << DR->getBlockTag(*it).BB->getName();
+          }
+          dbgs() << "]\n";
+        }
+    );
+    // clang-format on
+  };
+
+  DenseMap<Instruction *, SmallDenseMap<BasicBlock *, Value *, 2>> blendMap;
+
+  auto getValueOfAt = [&blendMap](Instruction *opDef,
+                                  BasicBlock *B) -> Value * {
+    auto it = blendMap.find(opDef);
+    if (it != blendMap.end()) {
+      auto it2 = it->second.find(B);
+      if (it2 != it->second.end()) {
+        return it2->second;
+      }
+    }
+    return nullptr;
+  };
+
+  auto createBlend = [this, &blendMap, &getValueOfAt](
+                         BasicBlock *B, Instruction *opDef) -> Value * {
+    if (Value *V = getValueOfAt(opDef, B)) {
+      return V;
+    }
+
+    Type *T = opDef->getType();
+    const unsigned numPreds = std::distance(pred_begin(B), pred_end(B));
+    Value *blend = nullptr;
+    PHINode *PHI =
+        PHINode::Create(T, numPreds, opDef->getName() + ".merge", &B->front());
+
+    auto const *const LTag = DR->getTag(B).loop;
+    bool hasVisitedPred = false;
+    for (BasicBlock *pred : predecessors(B)) {
+      Value *incomingV = nullptr;
+      if (Value *predV = getValueOfAt(opDef, pred)) {
+        incomingV = predV;
+        hasVisitedPred = true;
+      } else {
+        // When blending a loop header, the value coming from the latch should
+        // be the one coming from the preheader if that value dominates the
+        // latch and the latch has no definition of the value we are trying to
+        // blend.
+        if (DR->getTag(pred).isLoopBackEdge(B)) {
+          if (Value *preheaderV = getValueOfAt(opDef, LTag->preheader)) {
+            if (auto *instV = dyn_cast<Instruction>(preheaderV)) {
+              if (DT->dominates(instV->getParent(), pred)) {
+                incomingV = preheaderV;
+              }
+            } else {
+              incomingV = preheaderV;
+            }
+          }
+        }
+      }
+
+      if (!incomingV) {
+        incomingV = getDefaultValue(T);
+      }
+      PHI->addIncoming(incomingV, pred);
+    }
+    if (!hasVisitedPred) {
+      IRCleanup::deleteInstructionNow(PHI);
+      return nullptr;
+    }
+
+    if (PHI->hasConstantValue()) {
+      blend = PHI->getIncomingValue(0);
+      IRCleanup::deleteInstructionNow(PHI);
+    } else {
+      blend = PHI;
+      blends.insert(PHI);
+    }
+
+    blendMap[opDef][B] = blend;
+
+    return blend;
+  };
+
+  // Manually set the entry point of persisted loop live values and persisted
+  // loop exit masks.
+  for (auto *const LTag : DR->getLoopOrdering()) {
+    auto *const header = LTag->header;
+    for (Value *LLV : LTag->loopLiveValues) {
+      Instruction *LLVI = cast<Instruction>(LLV);
+      if (LLVI->getParent() != header) {
+        blendMap[LLVI][header] = LTag->loopResultPrevs[LLV];
+      }
+    }
+
+    auto &LMask = LoopMasks[LTag->loop];
+    for (auto &UPREM : LMask.updatedPersistedDivergentExitMasks) {
+      if (UPREM.first != header) {
+        blendMap[UPREM.second][header] =
+            LMask.persistedDivergentExitMasks[UPREM.first];
+      }
+    }
+  }
+
+  SmallPtrSet<Value *, 16> spareBlends;
+
+  for (auto const &dstTag : DR->getBlockOrdering()) {
+    BasicBlock *dst = dstTag.BB;
+    LLVM_DEBUG(dbgs() << "Blending instructions used in " << dst->getName()
+                      << ":\n");
+    for (Instruction &I : *dst) {
+      // Don't try to blend a blend value.
+      if (blends.count(&I)) {
+        continue;
+      }
+
+      LLVM_DEBUG(dbgs() << "\tInstruction " << I << ":\n");
+
+      for (unsigned idx = 0; idx < I.getNumOperands(); ++idx) {
+        Instruction *opDef = dyn_cast<Instruction>(I.getOperand(idx));
+        if (!opDef) {
+          continue;
+        }
+
+        BasicBlock *src = opDef->getParent();
+
+        LLVM_DEBUG(dbgs() << "\t\tOperand " << *opDef << "\n\t\tdefined in "
+                          << src->getName() << ":\n");
+
+        blendMap[opDef][src] = opDef;
+
+        // There exists two possible ways to early exit the blend instruction:
+        // - if the current block dominates the 'dst'.
+        // - if the current block dominates the incoming block of the phi node
+        //   'I' we are blending in 'dst'.
+        //
+        // 'dst' can freely access the values of 'src'.
+        if (DT->dominates(src, dst)) {
+          LLVM_DEBUG(dbgs() << "\t\t\tDefinition dominates use\n");
+          continue;
+        }
+        // The incoming block of this phi node is dominated by the definition
+        // block of the incoming value.
+        BasicBlock *incoming = nullptr;
+        if (PHINode *PHI = dyn_cast<PHINode>(&I)) {
+          incoming = PHI->getIncomingBlock(idx);
+          if (DT->dominates(src, incoming)) {
+            LLVM_DEBUG(dbgs() << "\t\t\tDefinition dominates use\n");
+            continue;
+          }
+        }
+
+        DenseSet<BasicBlock *> visited;
+        BlockQueue queue(*DR);
+
+        auto const &srcTag = DR->getTag(src);
+
+        addSuccessors(srcTag, queue, visited, dstTag);
+
+        auto *const srcLoop = srcTag.loop;
+        if (srcLoop && srcLoop->isLoopDivergent()) {
+          if (dst != srcLoop->header) {
+            auto &srcMasks = LoopMasks[srcLoop->loop];
+            auto const &headerTag = DR->getTag(srcLoop->header);
+
+            // If 'opDef' is an update loop exit mask, set an entry point in
+            // the loop header.
+            auto UPREMIt =
+                srcMasks.updatedPersistedDivergentExitMasks.find(src);
+            if (UPREMIt != srcMasks.updatedPersistedDivergentExitMasks.end()) {
+              if (UPREMIt->second == opDef) {
+                LLVM_DEBUG(dbgs()
+                           << "\t\t\tFound persisted value of the operand: "
+                           << srcMasks.persistedDivergentExitMasks[src]
+                           << "\n");
+                addSuccessors(headerTag, queue, visited, dstTag);
+              }
+            }
+            // If 'opDef' is a loop live value, set an entry point in the loop
+            // header.
+            if (srcLoop->loopLiveValues.count(opDef)) {
+              LLVM_DEBUG(dbgs()
+                         << "\t\t\tFound persisted value of the operand: "
+                         << srcLoop->loopResultPrevs[opDef] << "\n");
+              addSuccessors(headerTag, queue, visited, dstTag);
+            }
+          }
+        }
+
+        while (!queue.empty()) {
+          BasicBlockTag const &curTag = queue.pop();
+          BasicBlock *const cur = curTag.BB;
+
+          LLVM_DEBUG(dbgs() << "\t\t\tPopping " << cur->getName() << "\n");
+
+          // We have reached 'dst' without finding a block that dominates it,
+          // we need to create a phi node if the user is not one, and replace
+          // the operand with the last blended value.
+          if (cur == dst) {
+            LLVM_DEBUG(dbgs() << "\t\t\tReached destination: ");
+            VECZ_ERROR_IF(!queue.empty(), "Blocks remain in the queue");
+            if (PHINode *PHI = dyn_cast<PHINode>(&I)) {
+              BasicBlock *incoming = PHI->getIncomingBlock(idx);
+              Value *V = getValueOfAt(opDef, incoming);
+              VECZ_ERROR_IF(!V, "No blend value was found");
+              I.setOperand(idx, V);
+            } else {
+              Value *blend = createBlend(cur, opDef);
+              VECZ_ERROR_IF(!blend, "No blend value was found");
+              spareBlends.erase(blend);
+              I.setOperand(idx, blend);
+            }
+            LLVM_DEBUG(dbgs() << "new operand: " << *I.getOperand(idx) << "\n");
+            break;
+          }
+
+          const bool curDomDst = DT->dominates(cur, dst);
+          const bool curDomInc = incoming && DT->dominates(cur, incoming);
+          const bool srcDomCur = DT->dominates(src, cur);
+
+          auto &opDefBlend = blendMap[opDef];
+          // If either condition is true, we can early exit:
+          // - 'dst' can freely access the values of 'cur',
+          // - 'incoming' can freely access the values of 'cur'.
+          if ((curDomDst || curDomInc) && queue.empty()) {
+            LLVM_DEBUG(dbgs() << "\t\t\tBlock " << cur->getName()
+                              << " dominates destination: ");
+            if (srcDomCur) {
+              auto *const blend = opDefBlend[src];
+              opDefBlend[cur] = blend;
+              I.setOperand(idx, blend);
+            } else {
+              auto *const blend = createBlend(cur, opDef);
+              VECZ_ERROR_IF(!blend, "No blend value was found");
+              spareBlends.erase(blend);
+              I.setOperand(idx, blend);
+            }
+            LLVM_DEBUG(dbgs() << "new operand: " << *I.getOperand(idx) << "\n");
+            break;
+          }
+
+          addSuccessors(curTag, queue, visited, dstTag);
+
+          // 'cur' can freely access 'opDef'.
+          if (srcDomCur) {
+            // DANGER! operator[] returns a reference, which may be invalidated
+            // by a second call to it. Therefore we have to copy the value via
+            // a temporary variable.
+            auto *const blendSrc = opDefBlend[src];
+            opDefBlend[cur] = blendSrc;
+            continue;
+          }
+
+          // 'cur' does not have a blend value of 'opDef' so create one.
+          Value *blend = createBlend(cur, opDef);
+          VECZ_ERROR_IF(!blend, "No blend value was found");
+          if (isa<PHINode>(blend)) {
+            spareBlends.insert(blend);
+          }
+        }
+      }
+    }
+  }
+
+  for (auto *blend : spareBlends) {
+    auto *I = cast<Instruction>(blend);
+    if (I->use_empty()) {
+      IRCleanup::deleteInstructionNow(I);
+    }
+  }
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::simplifyMasks() {
+  const SimplifyQuery Q(F.getParent()->getDataLayout(), nullptr, DT);
+
+  // Tags may point at masks that linearization and/or BOSCC already deleted,
+  // so scan every select and boolean binop/phi/icmp and simplify those.
+  for (auto const &BBTag : DR->getBlockOrdering()) {
+    SmallVector<Instruction *, 16> deadInsts;
+    for (auto &I : *BBTag.BB) {
+      const bool isBoolOp = I.getType()->getScalarSizeInBits() == 1 &&
+                            (isa<BinaryOperator>(&I) || isa<PHINode>(&I) ||
+                             isa<ICmpInst>(&I));
+      if (!isa<SelectInst>(&I) && !isBoolOp) {
+        continue;
+      }
+      if (I.use_empty()) {
+        deadInsts.push_back(&I);
+        continue;
+      }
+#if LLVM_VERSION_GREATER_EQUAL(15, 0)
+      Value *const simplified = simplifyInstruction(&I, Q);
+#else
+      Value *const simplified = SimplifyInstruction(&I, Q);
+#endif
+      if (simplified && simplified != &I) {
+        I.replaceAllUsesWith(simplified);
+        deadInsts.push_back(&I);
+      }
+    }
+    for (auto *dead : deadInsts) {
+      IRCleanup::deleteInstructionNow(dead);
+    }
+  }
+
+  return true;
+}
+
+bool ControlFlowConversionState::computeBlockOrdering() {
+  LLVM_DEBUG(dbgs() << "CFC: COMPUTE BLOCK ORDERING\n");
+  RC->clear();  // Clear RC's state before DR recomputes the block ordering.
+  return DR->computeBlockOrdering(*DT);  // DR's result is returned as is.
+}
+
+bool ControlFlowConversionState::Impl::checkBlocksOrder() const {
+  auto const &ordering = DR->getBlockOrdering();
+  VECZ_ERROR_IF(F.size() != ordering.size(),
+                "Worklist does not contain all blocks");
+  // Walk the ordering and make sure each tag's 'pos' is 0, 1, 2, ... in turn.
+  uint32_t expectedPos = 0u;
+  for (auto const &tag : ordering) {
+    VECZ_ERROR_IF(tag.pos != expectedPos,
+                  "BasicBlock indices not in consecutive order");
+    ++expectedPos;
+  }
+
+  return true;
+}
+
+void ControlFlowConversionState::Impl::updateMaps(Value *from, Value *to) {
+  // 'from' is being replaced with 'to'. BOSCC keeps track of the mapping
+  // between the uniform and predicated versions of values, so when it is in
+  // use it has to be told about the replacement too.
+  if (BOSCC) {
+    BOSCC->updateValue(from, to);
+  }
+
+  // Loop tags key their live values, and the 'prev' phi and update select
+  // recorded for them, on the value itself: move those entries over to 'to'.
+  for (auto *const LTag : DR->getLoopOrdering()) {
+    if (LTag->loopLiveValues.erase(from)) {
+      LTag->loopLiveValues.insert(to);
+      auto LRPIt = LTag->loopResultPrevs.find(from);
+      if (LRPIt != LTag->loopResultPrevs.end()) {
+        PHINode *prev = LRPIt->second;
+        LTag->loopResultPrevs.erase(LRPIt);
+        LTag->loopResultPrevs[to] = prev;
+      }
+      auto LRUIt = LTag->loopResultUpdates.find(from);
+      if (LRUIt != LTag->loopResultUpdates.end()) {
+        SelectInst *update = LRUIt->second;
+        LTag->loopResultUpdates.erase(LRUIt);
+        LTag->loopResultUpdates[to] = update;
+      }
+    }
+  }
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/inline_post_vectorization_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/inline_post_vectorization_pass.cpp
new file mode 100644
index 0000000000000..98b03796c53b1
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/inline_post_vectorization_pass.cpp
@@ -0,0 +1,141 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "transform/inline_post_vectorization_pass.h"
+
+#include <compiler/utils/builtin_info.h>
+#include <compiler/utils/mangling.h>
+#include <llvm/IR/Attributes.h>
+#include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/LegacyPassManager.h>
+#include <llvm/IR/Module.h>
+#include <llvm/Transforms/IPO.h>
+#include <llvm/Transforms/IPO/AlwaysInliner.h>
+
+#include "analysis/vectorization_unit_analysis.h"
+#include "debugging.h"
+#include "vecz/vecz_choices.h"
+
+using namespace llvm;
+using namespace vecz;
+
+namespace {
+/// @brief Process a call site: tag it for LLVM inlining, inline a builtin via
+/// the builtin database, or fold a subgroup local id query to zero.
+///
+/// @param[in] CI Call site to inspect.
+/// @param[out] NeedLLVMInline Set to true iff CI was tagged AlwaysInline.
+/// @param[in] BI Builtin database.
+///
+/// @return Value to replace CI with, or CI itself when it is left in place.
+Value *processCallSite(CallInst *CI, bool &NeedLLVMInline,
+                       compiler::utils::BuiltinInfo &BI) {
+  NeedLLVMInline = false;
+  // Calls without a statically known callee are left untouched.
+  Function *Callee = CI->getCalledFunction();
+  if (!Callee) {
+    return CI;
+  }
+
+  // A callee with a body is left to LLVM's inliner: tag the call site as
+  // AlwaysInline, unless the callee carries the NoInline attribute.
+  if (!Callee->isDeclaration() &&
+      !Callee->hasFnAttribute(Attribute::NoInline)) {
+    CI->addFnAttr(Attribute::AlwaysInline);
+    NeedLLVMInline = true;
+    return CI;
+  }
+
+  // Emit builtins inline when they have no vector/scalar equivalent.
+  IRBuilder<> B(CI);
+  auto const Builtin = BI.analyzeBuiltin(*Callee);
+  if (Builtin.properties &
+      compiler::utils::eBuiltinPropertyInlinePostVectorization) {
+    SmallVector<Value *, 4> Args(CI->args());
+    if (Value *Impl = BI.emitBuiltinInline(Callee, B, Args)) {
+      VECZ_ERROR_IF(
+          Impl->getType() != CI->getType(),
+          "The inlined function type must match that of the original function");
+      return Impl;
+    }
+  }
+
+  // Vectorized uses of the subgroup local id will have been replaced with step
+  // vectors starting from zero. Uniform uses should be replaced with zero in
+  // order to maintain equivalence between the scalar/vector forms. Do this
+  // here due to a tight coupling between the vectorized version and these
+  // remaining scalar versions.
+  if (Builtin.isValid() && Builtin.ID == BI.getSubgroupLocalIdBuiltin()) {
+    return ConstantInt::getNullValue(CI->getType());
+  }
+
+  return CI;
+}
+
+}  // namespace
+
+PreservedAnalyses InlinePostVectorizationPass::run(
+    Function &F, FunctionAnalysisManager &AM) {
+  bool modified = false;
+  bool needToRunInliner = false;
+  auto &BI =
+      AM.getResult<VectorizationContextAnalysis>(F).getContext().builtins();
+
+  SmallVector<Instruction *, 4> ToDelete;
+  for (BasicBlock &BB : F) {
+    for (Instruction &I : BB) {
+      // Only call instructions are of interest; skip everything else.
+      CallInst *CI = dyn_cast<CallInst>(&I);
+      if (!CI) {
+        continue;
+      }
+      // Let processCallSite decide what, if anything, replaces this call.
+      bool NeedLLVMInline = false;
+      Value *NewCI = processCallSite(CI, NeedLLVMInline, BI);
+      needToRunInliner |= NeedLLVMInline;
+      if ((NewCI == CI) || !NewCI) {
+        continue;
+      }
+      // Void calls have no uses to rewire; the old call is erased afterwards.
+      if (!CI->getType()->isVoidTy()) {
+        CI->replaceAllUsesWith(NewCI);
+      }
+      ToDelete.push_back(CI);
+      modified = true;
+    }
+  }
+
+  // Erase the replaced calls now that iteration over the function is done.
+  while (!ToDelete.empty()) {
+    Instruction *I = ToDelete.pop_back_val();
+    I->eraseFromParent();
+  }
+
+  // Run the LLVM inliner if some calls were marked as needing inlining.
+  if (needToRunInliner) {
+    llvm::legacy::PassManager PM;
+    PM.add(llvm::createAlwaysInlinerLegacyPass());
+    PM.run(*F.getParent());
+    modified = true;
+  }
+  // NOTE(review): the recursion only stops once a run changes nothing.
+  // Recursively run the pass to inline any newly introduced functions.
+  if (modified) {
+    run(F, AM);
+  }
+  // No analyses are reported as preserved, whether or not F changed.
+  return PreservedAnalyses::none();
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/instantiation_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/instantiation_pass.cpp
new file mode 100644
index 0000000000000..5c67d36ad9698
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/instantiation_pass.cpp
@@ -0,0 +1,350 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "transform/instantiation_pass.h"
+
+#include <compiler/utils/builtin_info.h>
+#include <llvm/ADT/PostOrderIterator.h>
+#include <llvm/ADT/Statistic.h>
+#include <llvm/IR/CFG.h>
+#include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/LegacyPassManager.h>
+#include <llvm/IR/Module.h>
+#include <llvm/Support/Debug.h>
+#include <llvm/Support/raw_ostream.h>
+#include <llvm/Transforms/Scalar.h>
+#include <llvm/Transforms/Utils/BasicBlockUtils.h>
+
+#include <memory>
+
+#include "analysis/instantiation_analysis.h"
+#include "analysis/uniform_value_analysis.h"
+#include "debugging.h"
+#include "llvm_helpers.h"
+#include "memory_operations.h"
+#include "transform/packetization_helpers.h"
+#include "transform/packetizer.h"
+#include "vectorization_context.h"
+#include "vecz/vecz_choices.h"
+
+#define DEBUG_TYPE "vecz-instantiation"
+
+#undef VECZ_FAIL
+#define VECZ_FAIL() return packetizer.getEmptyRange();
+
+using namespace vecz;
+using namespace llvm;
+
+STATISTIC(VeczInstantiated, "Number of instructions instantiated [ID#I00]");
+STATISTIC(VeczPacketizeFailInstantiate,
+          "Packetize: instantiation failures [ID#P84]");
+
+InstantiationPass::InstantiationPass(Packetizer &pp)
+    : Ctx(pp.context()), packetizer(pp) {}  // Ctx is taken from pp.context().
+
+PacketRange InstantiationPass::instantiate(Value *V) {
+  VECZ_FAIL_IF(packetizer.width().isScalable());  // Fixed widths only.
+  if (auto info = packetizer.getPacketized(V)) {
+    unsigned SimdWidth = packetizer.width().getFixedValue();
+    return info.getAsPacket(SimdWidth);  // Reuse the packetized result.
+  }
+
+  // Mask-varying instructions are handled first, through simdBroadcast(); if
+  // that fails, a remark is emitted and the instantiation fails.
+  auto *Ins = dyn_cast<Instruction>(V);
+  if (Ins && packetizer.uniform().isMaskVarying(V)) {
+    PacketRange P = simdBroadcast(Ins);
+    if (!P) {
+      emitVeczRemark(&packetizer.function(), V,
+                     "Failed to broadcast Mask Varying instruction");
+      VECZ_FAIL();
+    }
+    return assignInstance(P, V);
+  }
+  // Values that are not varying instantiate to the same value for all items.
+  if (!packetizer.uniform().isVarying(V)) {
+    return assignInstance(broadcast(V), V);
+  }
+  // Varying instructions are dispatched on their opcode.
+  if (Ins) {
+    return instantiateInstruction(Ins);
+  }
+  // A varying value that is not an instruction cannot be instantiated.
+  VECZ_STAT_FAIL_IF(true, VeczPacketizeFailInstantiate);
+}
+
+PacketRange InstantiationPass::instantiateInternal(Value *V) {
+  // Values that are not varying are handed straight to instantiate().
+  if (!packetizer.uniform().isVarying(V)) {
+    return instantiate(V);
+  }
+  // Otherwise packetize; the packetizer calls back into us when it needs to.
+  VECZ_FAIL_IF(packetizer.width().isScalable());
+  const unsigned Width = packetizer.width().getFixedValue();
+  return packetizer.packetize(V).getAsPacket(Width);
+}
+
+PacketRange InstantiationPass::instantiateInstruction(Instruction *Ins) {
+  // Figure out what kind of instruction it is and try to instantiate it.
+  // Whichever strategy is picked, the packet it produces is passed through
+  // assignInstance() before being returned.
+  if (auto *Call = dyn_cast<CallInst>(Ins)) {
+    return assignInstance(instantiateCall(Call), Ins);
+  }
+
+  if (auto *Alloca = dyn_cast<AllocaInst>(Ins)) {
+    return assignInstance(instantiateAlloca(Alloca), Ins);
+  }
+
+  // No special handling is needed for any other instruction, so just clone
+  // it across lanes.
+  return assignInstance(instantiateByCloning(Ins), Ins);
+}
+
+PacketRange InstantiationPass::assignInstance(const PacketRange P, Value *V) {
+  if (P) {
+    ++VeczInstantiated;  // Count the successful instantiation.
+    return P;
+  }
+  emitVeczRemarkMissed(&packetizer.function(), V, "Could not instantiate");
+  VECZ_STAT_FAIL_IF(!P, VeczPacketizeFailInstantiate);
+  return P;
+}
+
+PacketRange InstantiationPass::broadcast(Value *V) {
+  VECZ_FAIL_IF(packetizer.width().isScalable());
+  const unsigned NumLanes = packetizer.width().getFixedValue();
+  PacketRange Lanes = packetizer.createPacket(V, NumLanes);
+  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+    Lanes[Lane] = V;  // Every lane refers to the very same value.
+  }
+  return Lanes;
+}
+
+PacketRange InstantiationPass::instantiateCall(CallInst *CI) {
+  VECZ_FAIL_IF(packetizer.width().isScalable());
+  unsigned SimdWidth = packetizer.width().getFixedValue();
+  // Work-item builtins whose result is an instance ID get special handling.
+  compiler::utils::BuiltinInfo &BI = Ctx.builtins();
+  auto const Builtin = BI.analyzeBuiltinCall(*CI, packetizer.dimension());
+  if (Builtin.properties & compiler::utils::eBuiltinPropertyWorkItem) {
+    auto const Uniformity = Builtin.uniformity;
+    if (Uniformity == compiler::utils::eBuiltinUniformityNever) {
+      // Can't handle these (global/local linear ID probably).
+      VECZ_FAIL();
+    } else if (Uniformity & compiler::utils::eBuiltinUniformityInstanceID) {
+      Type *RetTy = CI->getType();
+      PacketRange P = packetizer.createPacket(CI, SimdWidth);
+      VECZ_FAIL_IF(!P);
+      IRBuilder<> B(CI);
+      for (unsigned j = 0; j < SimdWidth; j++) {
+        P[j] = B.CreateAdd(CI, ConstantInt::get(RetTy, j));
+      }
+      packetizer.deleteInstructionLater(CI);
+      return P;
+    }
+  }
+
+  // Calls marked noduplicate must not be instantiated.
+  VECZ_FAIL_IF(CI->hasFnAttr(Attribute::NoDuplicate));
+
+  packetizer.deleteInstructionLater(CI);
+  // Check if the instruction has any uses or not, and also if we want to
+  // instantiate call instructions with loops or not.
+  if (CI->hasNUsesOrMore(1) ||
+      !packetizer.choices().instantiateCallsInLoops()) {
+    // Instantiate by emitting one call per lane, each with that lane's args.
+    SmallVector<PacketRange, 4> OpPackets;
+    for (unsigned i = 0; i < CI->arg_size(); i++) {
+      Value *Op = CI->getArgOperand(i);
+      const PacketRange OpPacket = instantiateInternal(Op);
+      VECZ_FAIL_IF(!OpPacket);
+      OpPackets.push_back(OpPacket);
+    }
+    PacketRange P = packetizer.createPacket(CI, SimdWidth);
+    VECZ_FAIL_IF(!P);
+    IRBuilder<> B(CI);
+    for (unsigned j = 0; j < SimdWidth; j++) {
+      SmallVector<Value *, 4> Ops;
+      for (unsigned i = 0; i < CI->arg_size(); i++) {
+        Ops.push_back(OpPackets[i][j]);
+      }
+      auto *NewCI = B.CreateCall(CI->getFunctionType(), CI->getCalledOperand(),
+                                 Ops, CI->getName());
+      NewCI->setCallingConv(CI->getCallingConv());
+      NewCI->setAttributes(CI->getAttributes());
+      P[j] = NewCI;
+    }
+    return P;
+  } else {
+    // Instantiate by calling once per iteration of a SimdWidth-trip loop.
+    BasicBlock *BeforeCI = CI->getParent();
+    BasicBlock *AfterCI = SplitBlock(BeforeCI, CI);
+    BasicBlock *LoopHeader = BasicBlock::Create(
+        CI->getContext(), "instloop.header", CI->getFunction(), AfterCI);
+    BasicBlock *LoopBody = BasicBlock::Create(CI->getContext(), "instloop.body",
+                                              CI->getFunction(), AfterCI);
+
+    // Change the branch instruction from BeforeCI -> AfterCI to BeforeCI ->
+    // LoopHeader
+    BeforeCI->getTerminator()->setSuccessor(0, LoopHeader);
+
+    IRBuilder<> B(LoopHeader);
+    // Create the induction variable
+    PHINode *Ind = B.CreatePHI(B.getInt32Ty(), 2, "instance");
+
+    // Create the conditional jump based on the current iteration number
+    Value *ICmp = B.CreateICmpULT(Ind, B.getInt32(SimdWidth));
+    B.CreateCondBr(ICmp, LoopBody, AfterCI);
+
+    B.SetInsertPoint(LoopBody);
+    SmallVector<Value *, 4> Operands;
+    for (auto &Arg : CI->args()) {
+      // We call the packetizer explicitly, instead of calling the
+      // instantiator, because we need a packetized value and not an
+      // instantiated one.
+      Value *Packetized = packetizer.packetize(Arg).getAsValue();
+      VECZ_FAIL_IF(!Packetized);
+      VECZ_ERROR_IF(!Packetized->getType()->isVectorTy(),
+                    "The packetized Value has to be of a vector type");
+      Operands.push_back(Packetized);
+    }
+    // Each Op is an element extracted from a packetized instruction.
+    SmallVector<Value *, 4> Ops;
+    for (unsigned i = 0; i < Operands.size(); ++i) {
+      Ops.push_back(B.CreateExtractElement(Operands[i], Ind));
+    }
+    // Create the function call
+    auto CO = CI->getCalledOperand();
+    FunctionType *FTy = CI->getFunctionType();
+    CallInst *NewCI = B.CreateCall(FTy, CO, Ops);
+    NewCI->setCallingConv(CI->getCallingConv());
+    NewCI->setAttributes(CI->getAttributes());
+    // Increment the induction variable and jump back to the loop header
+    Value *IndInc = B.CreateAdd(Ind, B.getInt32(1), "");
+    B.CreateBr(LoopHeader);
+
+    // Set the operands to the Phi node in the loop header
+    Ind->addIncoming(B.getInt32(0), BeforeCI);
+    Ind->addIncoming(IndInc, LoopBody);
+
+    // Set the Packet, even though we are not going to be using this value (we
+    // have checked if the call has 0 users). We don't need to populate it.
+    return packetizer.createPacket(CI, SimdWidth);
+  }
+}
+
+PacketRange InstantiationPass::instantiateAlloca(AllocaInst *Alloca) {
+  VECZ_FAIL_IF(packetizer.width().isScalable());
+  const unsigned NumLanes = packetizer.width().getFixedValue();
+  PacketRange Lanes = packetizer.createPacket(Alloca, NumLanes);
+  VECZ_FAIL_IF(!Lanes);
+  // NOTE(review): the array size operand is dropped (nullptr) - intended?
+  Type *const AllocTy = Alloca->getAllocatedType();
+  IRBuilder<> B(Alloca);
+  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+    AllocaInst *Copy = B.CreateAlloca(AllocTy, nullptr, Alloca->getName());
+    Copy->setAlignment(Alloca->getAlign());
+    Lanes[Lane] = Copy;
+  }
+  packetizer.deleteInstructionLater(Alloca);
+  return Lanes;
+}
+
+PacketRange InstantiationPass::instantiateByCloning(Instruction *I) {
+  VECZ_FAIL_IF(packetizer.width().isScalable());
+  auto SimdWidth = packetizer.width().getFixedValue();
+  PacketRange P = packetizer.createPacket(I, SimdWidth);
+  if (!P || P.at(SimdWidth - 1)) {
+    return P;  // Creation failed, or the last lane is already populated.
+  }
+
+  // Clone breadth first so that the packet is complete before fixing up the
+  // operands, that way we get less stack-thrashing, especially when there
+  // is a circular dependency.
+  SmallVector<Instruction *, 16> Clones;
+  for (decltype(SimdWidth) i = 0; i < SimdWidth; ++i) {
+    if (P.at(i)) {
+      Clones.push_back(nullptr);  // Lane already populated: no clone to fix.
+      continue;
+    }
+    Instruction *Clone = I->clone();
+    Clone->insertBefore(I);
+    P[i] = Clone;
+    Clones.push_back(Clone);
+  }
+  // Point each clone at its own lane's instance of every non-constant operand.
+  for (unsigned i = 0, n = I->getNumOperands(); i != n; ++i) {
+    Value *V = I->getOperand(i);
+    if (isa<BasicBlock>(V) || isa<Constant>(V)) {
+      continue;
+    }
+
+    if (const auto OpP = instantiateInternal(V)) {
+      for (decltype(SimdWidth) lane = 0; lane < SimdWidth; ++lane) {
+        if (auto *Clone = Clones[lane]) {
+          if (auto *At = OpP.at(lane)) {
+            Clone->setOperand(i, At);
+          }
+        }
+      }
+    } else {
+      VECZ_FAIL();
+    }
+  }
+
+  packetizer.deleteInstructionLater(I);
+  return P;
+}
+
+PacketRange InstantiationPass::simdBroadcast(Instruction *I) {
+  VECZ_FAIL_IF(packetizer.width().isScalable());
+  auto SimdWidth = packetizer.width().getFixedValue();
+  PacketRange P = packetizer.createPacket(I, SimdWidth);
+  if (!P || P.at(0)) {
+    return P;
+  }
+  // Every lane of the packet refers to the one original instruction.
+  for (auto &i : P) {
+    i = I;
+  }
+  // Only a MemOp that has a mask operand needs any further work.
+  auto Op = MemOp::get(I);
+  if (!Op || !Op->getMaskOperand()) {
+    return P;
+  }
+  // Replace the mask with the OR ("any_of_mask") of its per-lane instances.
+  if (auto *MaskInst = dyn_cast<Instruction>(Op->getMaskOperand())) {
+    const auto MP = instantiateInternal(MaskInst);
+    VECZ_FAIL_IF(!MP);
+
+    auto W = SimdWidth;
+    SmallVector<Value *, 16> Reduce;
+    for (decltype(SimdWidth) i = 0; i < SimdWidth; i++) {
+      Reduce.push_back(MP.at(i));
+    }
+    // Tree OR-reduction; NOTE(review): covers all lanes only if W is 2^k.
+    IRBuilder<> B(buildAfter(Reduce.back(), packetizer.function()));
+    while ((W >>= 1)) {
+      for (decltype(W) i = 0; i < W; ++i) {
+        Reduce[i] = B.CreateOr(Reduce[i], Reduce[i + W], "any_of_mask");
+      }
+    }
+    Op->setMaskOperand(Reduce.front());
+  }
+
+  return P;
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/interleaved_group_combine_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/interleaved_group_combine_pass.cpp
new file mode 100644
index 0000000000000..5d63a0a0dbecd
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/interleaved_group_combine_pass.cpp
@@ -0,0 +1,548 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "transform/interleaved_group_combine_pass.h"
+
+#include <llvm/Analysis/LoopInfo.h>
+#include <llvm/Analysis/ScalarEvolution.h>
+#include <llvm/Analysis/ScalarEvolutionExpressions.h>
+#include <llvm/IR/Instructions.h>
+#include <llvm/Support/Debug.h>
+#include <llvm/Transforms/Utils/Local.h>
+#include <multi_llvm/opaque_pointers.h>
+
+#include "analysis/uniform_value_analysis.h"
+#include "analysis/vectorization_unit_analysis.h"
+#include "debugging.h"
+#include "ir_cleanup.h"
+#include "memory_operations.h"
+
+#define DEBUG_TYPE "vecz"
+
+using namespace llvm;
+using namespace vecz;
+
+char InterleavedGroupCombinePass::PassID = 0;
+
+struct GroupMemberInfo {  // one memory operation considered for a group
+  int64_t Offset;   // offset relative to the group's seed member, in elements
+  int64_t Order;    // index of the operation in the interleaved-ops list
+  CallInst *MemOp;  // the call performing the interleaved access
+  Value *Ptr;       // pointer operand of that call
+  Type *DataTy;     // type of the data loaded or stored
+};
+
+/// @brief Bookkeeping for one interleaved memory operation found in a block.
+struct InterleavedGroupCombinePass::InterleavedOpInfo {
+  /// @brief The call instruction performing the interleaved access.
+  CallInst *Op;
+  /// @brief Kind of interleaved operation (load or store, masked or not).
+  InterleavedOperation Kind;
+  /// @brief Constant stride of the access (narrowed from int64_t by run()).
+  int Stride;
+  /// @brief Set once the operation has been handed out as part of a group.
+  bool Removed;
+};
+
+struct InterleavedGroupCombinePass::InterleavedGroupInfo {
+  BasicBlock *BB = nullptr;  // block of the members; note clear() resets it
+  SmallVector<Value *, 4> Data;  // the member operations
+  SmallVector<GroupMemberInfo, 4> Info;  // per-member offset/order details
+  Value *Base = nullptr;  // pointer the combined access starts from
+  unsigned Stride = 0;  // stride shared by all members
+  int64_t Offset = 0;  // lowest member offset; 64-bit like the members' own
+  InterleavedOperation Kind = eInterleavedInvalid;
+  // Back to the default state. BB is reset as well: set it again afterwards.
+  void clear() {
+    BB = nullptr;
+    Data.clear();
+    Info.clear();
+    Base = nullptr;
+    Stride = 0;
+    Offset = 0;
+    Kind = eInterleavedInvalid;
+  }
+  // True if each member's offset is exactly one more than its predecessor's.
+  bool isConsecutive() const {
+    auto InfoIt = Info.begin();
+    auto InfoE = Info.end();
+    assert(InfoIt != InfoE);
+    int64_t ExpectedOffset = Info.front().Offset;  // same width: no narrowing
+    for (++InfoIt; InfoIt != InfoE; ++InfoIt) {
+      if (InfoIt->Offset != ++ExpectedOffset) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  bool canDeinterleaveMask(const Instruction &Mask) const;
+};
+
+namespace {
+
+bool canSwap(Instruction *IA, Instruction *IB) {
+  // Decides whether IA and IB may exchange places. Use-def relations are
+  // checked first: if IB directly uses IA (e.g. a load depending on a mask
+  // calculation), swapping them would put the use before the definition.
+  for (auto *const Op : IB->operand_values()) {
+    if (isa<GetElementPtrInst>(Op)) {
+      // GEP operands are never compared with IA: GEPs get eliminated later.
+      continue;
+    }
+    if (Op == IA) {
+      return false;
+    }
+  }
+
+  if (IA->mayReadOrWriteMemory()) {
+    if (isa<FenceInst>(IB)) {
+      // A memory operation can never be swapped with a fence.
+      return false;
+    }
+  } else {
+    // IA is not a memory operation, so the two can be swapped.
+    return true;
+  }
+
+  if (IB->mayReadOrWriteMemory()) {
+    if (isa<FenceInst>(IA)) {  // same fence rule with the roles reversed
+      return false;
+    }
+  } else {
+    return true;  // IB is not a memory operation
+  }
+
+  // Both access memory: a write can't be swapped with a write or with a
+  // read, but it should be ok to swap two reads.
+  if (IA->mayWriteToMemory() || IB->mayWriteToMemory()) {
+    return false;
+  }
+
+  return true;
+}
+
+bool canMoveUp(const SmallVectorImpl<Value *> &Group, Instruction *IB) {
+  auto Ig = Group.rbegin();
+  auto Ie = Group.rend();
+  Instruction *IA = IB;
+
+  // Walks backwards from IB through the preceding instructions. Existing
+  // Group members are matched from the back of Group and stepped over; any
+  // other instruction must be swappable with IB, else the walk gives up.
+  // Once the first member of the Group has been passed, IB can move there.
+  while ((IA = IA->getPrevNode())) {
+    if (IA == *Ig) {
+      if (++Ig == Ie) {
+        // we met every group member so we're done
+        return true;
+      }
+    } else if (!canSwap(IA, IB)) {
+      return false;
+    }
+  }
+  // Running off the start of the block means some group member was never
+  // passed, which shouldn't be able to happen.
+  assert(false);
+  return false;  // builds without asserts: refuse the move
+}
+
+bool canMoveDown(const SmallVectorImpl<Value *> &Group, Instruction *IA) {
+  auto Ig = Group.rbegin();
+  auto Ie = Group.rend();
+  Instruction *IB = IA;
+
+  // Walks forwards from IA through the following instructions. Existing
+  // Group members are matched from the back of Group and stepped over; any
+  // other instruction must be swappable with IA, else the walk gives up.
+  // Once the first member of the Group has been passed, IA can move there.
+  while ((IB = IB->getNextNode())) {
+    if (IB == *Ig) {
+      if (++Ig == Ie) {
+        // we met every group member so we're done
+        return true;
+      }
+    } else if (!canSwap(IA, IB)) {
+      return false;
+    }
+  }
+  // Running off the end of the block means some group member was never
+  // passed, which shouldn't be able to happen.
+  assert(false);
+  return false;  // builds without asserts: refuse the move
+}
+
+}  // namespace
+
+bool InterleavedGroupCombinePass::InterleavedGroupInfo::canDeinterleaveMask(
+    const Instruction &Mask) const {
+  // If the mask definition is not in the same block as the group members, it
+  // is safe to de-interleave. (BB must hold the members' block for this.)
+  if (Mask.getParent() != BB) {
+    return true;
+  }
+
+  SmallPtrSet<Instruction *, 2> Ops;  // the mask's operands defined in BB
+  for (auto &Op : Mask.operands()) {
+    if (auto *OpI = dyn_cast<Instruction>(Op.get())) {
+      // We only care about operands in the same basic block, since otherwise
+      // they cannot be group members or in between group members.
+      if (OpI->getParent() == BB) {
+        Ops.insert(OpI);
+      }
+    }
+  }
+
+  // If the mask has no dependency on anything in the group basic block, it is
+  // safe to de-interleave.
+  if (Ops.empty()) {
+    return true;
+  }
+
+  // Note that the mask can hardly depend on the last group member, since it is
+  // itself an operand of this member.
+  Instruction *IA = cast<Instruction>(Data.back());
+
+  // Walk from the last member of the group back to the first, checking
+  // whether the mask depends on any instruction on the way.
+  // If it reaches the first member of the Group, it is safe to move the mask.
+  // If it finds any of the mask's own operands as group members or in
+  // between group members, the mask cannot be (trivially) moved.
+  while (IA) {
+    if (Ops.count(IA)) {
+      // We found something the mask depends on, so we can't de-interleave...
+      return false;
+    } else if (IA == Data.front()) {
+      // we met every group member so we're done
+      return true;
+    }
+    IA = IA->getPrevNode();
+  }
+
+  // Start of the block reached without meeting the first group member.
+  return true;
+}
+
+PreservedAnalyses InterleavedGroupCombinePass::run(
+    Function &F, FunctionAnalysisManager &AM) {
+  auto &Ctx = AM.getResult<VectorizationContextAnalysis>(F).getContext();
+  IRCleanup IC;
+  // Per block: collect interleaved memops, group them, let the target combine.
+  const bool IsLoad =
+      (Kind == eInterleavedLoad) || (Kind == eMaskedInterleavedLoad);
+
+  LLVM_DEBUG(dbgs() << "vecz: InterleavedGroupCombinePass on " << F.getName()
+                    << "\n");
+
+  scalarEvolution = &AM.getResult<ScalarEvolutionAnalysis>(F);
+
+  UniformValueResult &UVR = AM.getResult<UniformValueAnalysis>(F);
+  const auto &DL = F.getParent()->getDataLayout();
+  std::vector<InterleavedOpInfo> InterleavedOps;  // refilled for every block
+  for (BasicBlock &BB : F) {
+    // Look for interleaved operations.
+    for (Instruction &I : BB) {
+      CallInst *CI = dyn_cast<CallInst>(&I);
+      if (!CI) {
+        continue;
+      }
+
+      Optional<MemOp> Op = MemOp::get(CI);
+      // We can't optimize interleaved memops whose stride is not a constant
+      // integer, since we need to check if the stride and group size match.
+      if (!Op || !Op->isStrideConstantInt()) {
+        continue;
+      }
+      int64_t Stride = Op->getStrideAsConstantInt();
+      if ((Stride == 0) || (Stride == 1)) {  // strides 0 and 1 are skipped
+        continue;
+      }
+      Value *Mask = Op->getMaskOperand();
+      InterleavedOpInfo Info;
+
+      const bool OpIsLoad = Op->isLoad();
+      Info.Kind = OpIsLoad
+                      ? (Mask ? eMaskedInterleavedLoad : eInterleavedLoad)
+                      : (Mask ? eMaskedInterleavedStore : eInterleavedStore);
+      Info.Op = CI;
+      Info.Stride = Stride;  // narrows int64_t to int
+      Info.Removed = false;
+
+      // only add the interleaved operation kinds we actually care about
+      if (IsLoad == OpIsLoad) {
+        InterleavedOps.push_back(Info);
+      }
+    }
+    if (!InterleavedOps.empty()) {
+      if (Kind == eInterleavedStore) {
+        // stores are collated downwards, so reverse the list..
+        std::reverse(InterleavedOps.begin(), InterleavedOps.end());
+      }
+
+      InterleavedGroupInfo Group;
+      Group.BB = &BB;  // NOTE(review): findGroup()'s Group.clear() nulls it
+
+      while (findGroup(InterleavedOps, UVR, Group)) {
+        // Loads have their uses afterwards, while stores use preceding values.
+        // Group.Info is in forwards order for Loads, reverse order for Stores.
+        IRBuilder<> B(Group.Info.front().MemOp);
+
+        Value *Base = Group.Base;
+        if (Kind == eInterleavedLoad && Group.Offset != 0) {
+          auto *EltTy = Group.Info.front().DataTy->getScalarType();
+          assert(multi_llvm::isOpaqueOrPointeeTypeMatches(
+                     cast<PointerType>(Base->getType()), EltTy) &&
+                 "Unhandled interleaved access");
+          // if it's a Load group that was out of order, we have to use the
+          // sequentially first GEP in order to preserve use-def ordering,
+          // which means we have to offset it with an additional GEP and
+          // hope this optimizes out later.
+          // Note that this is not necessary for Stores, since instructions
+          // are inserted at the last Store.
+          Base = Group.Info.front().Ptr;
+          auto *Offset = ConstantInt::getSigned(
+              DL.getIntPtrType(Base->getType()), Group.Offset);
+
+          Base = B.CreateInBoundsGEP(EltTy, Base, Offset, "reorder_offset");
+        }
+
+        SmallVector<Value *, 4> Masks;  // stays empty for unmasked groups
+        if (Group.Kind == eMaskedInterleavedStore ||
+            Group.Kind == eMaskedInterleavedLoad) {
+          Masks.reserve(Group.Data.size());
+          for (auto *V : Group.Data) {
+            Optional<MemOp> Op = MemOp::get(cast<Instruction>(V));
+            assert(Op && "Unanalyzable interleaved access?");
+            Masks.push_back(Op->getMaskOperand());
+          }
+        }
+        if (Ctx.targetInfo().optimizeInterleavedGroup(
+                B, Group.Kind, Group.Data, Masks, Base, Group.Stride)) {
+          for (Value *V : Group.Data) {  // the originals are now redundant
+            if (Instruction *Ins = dyn_cast<Instruction>(V)) {
+              IC.deleteInstructionLater(Ins);
+            }
+          }
+        }
+
+        // Remove the group no matter whether we optimized it or not. Otherwise
+        // we will just iterate indefinitely.
+        for (const auto &Info : Group.Info) {
+          InterleavedOps[Info.Order].Removed = true;
+        }
+      }
+      InterleavedOps.clear();
+    }
+  }
+  IC.deleteInstructions();
+
+  LLVM_DEBUG(dbgs() << "vecz: InterleavedGroupCombinePass done!\n");
+
+  PreservedAnalyses Preserved;
+  Preserved.preserve<ScalarEvolutionAnalysis>();
+  Preserved.preserve<DominatorTreeAnalysis>();
+  Preserved.preserve<LoopAnalysis>();
+
+  return Preserved;
+}
+
+bool InterleavedGroupCombinePass::findGroup(
+    const std::vector<InterleavedOpInfo> &Ops, UniformValueResult &UVR,
+    InterleavedGroupInfo &Group) {
+  VECZ_FAIL_IF(Ops.empty());
+  // this check keeps clang-tidy happy
+  VECZ_FAIL_IF(Kind != eInterleavedStore && Kind != eInterleavedLoad);
+  // Scans Ops for the next combinable group; returns true once one is found.
+  auto &SE = *scalarEvolution;
+
+  for (unsigned i = 0; i < Ops.size(); i++) {
+    // Seed a candidate group with the next operation not yet handed out.
+    const InterleavedOpInfo &Info0 = Ops[i];
+    if (Info0.Removed) {
+      continue;
+    }
+
+    Type *DataType0 = nullptr;
+    Value *Ptr0 = nullptr;
+    if (Kind == eInterleavedStore) {
+      DataType0 = Info0.Op->getOperand(0)->getType();
+      Ptr0 = Info0.Op->getOperand(1);
+    } else if (Kind == eInterleavedLoad) {
+      DataType0 = Info0.Op->getType();
+      Ptr0 = Info0.Op->getOperand(0);
+    }
+
+    Value *Base0 = UVR.extractMemBase(Ptr0);
+    if (!Base0) {
+      continue;
+    }
+
+    PointerType *PtrTy = dyn_cast<PointerType>(Ptr0->getType());
+    if (!PtrTy) {
+      continue;
+    }
+
+    Type *EleTy = DataType0->getScalarType();
+    assert(multi_llvm::isOpaqueOrPointeeTypeMatches(PtrTy, EleTy) &&
+           "Unhandled interleaved accesses");
+    unsigned Align = EleTy->getScalarSizeInBits() / 8;
+    if (Align == 0) {  // sub-byte elements: would divide by zero below
+      continue;
+    }
+
+    Group.clear();
+    Group.BB = Info0.Op->getParent();  // clear() has just reset BB
+    Group.Data.push_back(Info0.Op);
+    Group.Info.emplace_back(GroupMemberInfo{0, i, Info0.Op, Ptr0, DataType0});
+    Group.Kind = Info0.Kind;
+
+    // Try to find others that have the same stride and base pointer.
+    for (unsigned j = i + 1; j < Ops.size(); j++) {
+      const InterleavedOpInfo &InfoN = Ops[j];
+      if (InfoN.Removed) {
+        continue;
+      }
+
+      if (Group.Kind != InfoN.Kind) {
+        continue;
+      }
+
+      Type *DataTypeN = nullptr;
+      Value *PtrN = nullptr;
+      if (Kind == eInterleavedStore) {
+        DataTypeN = InfoN.Op->getOperand(0)->getType();
+        PtrN = InfoN.Op->getOperand(1);
+      } else if (Kind == eInterleavedLoad) {
+        DataTypeN = InfoN.Op->getType();
+        PtrN = InfoN.Op->getOperand(0);
+      }
+
+      if ((InfoN.Stride != Info0.Stride) || (DataTypeN != DataType0)) {
+        continue;
+      }
+
+      Value *BaseN = UVR.extractMemBase(PtrN);
+      if (!BaseN || BaseN != Base0) {
+        continue;
+      }
+
+      const SCEV *PtrDiff = SE.getMinusSCEV(SE.getSCEV(PtrN), SE.getSCEV(Ptr0));
+      const auto *ConstDiff = dyn_cast<SCEVConstant>(PtrDiff);
+      if (!ConstDiff) {
+        continue;
+      }
+
+      // Note that the offset calculated here is a byte offset
+      int64_t Offset = ConstDiff->getAPInt().getSExtValue();
+      if (Offset % Align == 0) {
+        // only add them to the group if it is possible to collate them together
+        // at the same place in the function
+        bool CanMove = false;
+        if (Kind == eInterleavedLoad) {
+          CanMove = canMoveUp(Group.Data, cast<Instruction>(InfoN.Op));
+
+          if (InfoN.Kind == eMaskedInterleavedLoad) {
+            Optional<MemOp> Op = MemOp::get(InfoN.Op);
+            assert(Op && "Unanalyzable load?");
+            if (auto *MaskInst = dyn_cast<Instruction>(Op->getMaskOperand())) {
+              CanMove &= Group.canDeinterleaveMask(*MaskInst);
+            }
+          }
+        } else if (Kind == eInterleavedStore) {
+          CanMove = canMoveDown(Group.Data, cast<Instruction>(InfoN.Op));
+        }
+
+        if (CanMove) {
+          Offset /= Align;  // bytes -> elements
+          Group.Data.push_back(InfoN.Op);
+          Group.Info.emplace_back(
+              GroupMemberInfo{Offset, j, InfoN.Op, PtrN, DataTypeN});
+        }
+      }
+    }
+
+    if (Group.Data.size() > 1) {
+      auto InfoB = Group.Info.begin();
+      auto InfoE = Group.Info.end();
+
+      if (Kind == eInterleavedStore) {
+        // In the case of stores, the instructions are processed in reverse
+        // order, so this just puts them back in forwards order
+        std::reverse(InfoB, InfoE);
+      }
+
+      // Sort the group members in order of their offsets. Use a stable sort
+      // so that any duplicates don't get re-ordered (important for stores).
+      std::stable_sort(
+          InfoB, InfoE,
+          [](const GroupMemberInfo &a, const GroupMemberInfo &b) -> bool {
+            return a.Offset < b.Offset;
+          });
+
+      // If the same offset occurs several times, we can still de-interleave
+      // the unique ones, and maybe catch the rest the next time round.
+      InfoE = Group.Info.erase(
+          std::unique(InfoB, InfoE,
+                      [](const GroupMemberInfo &a, const GroupMemberInfo &b)
+                          -> bool { return a.Offset == b.Offset; }),
+          InfoE);
+
+      if (Group.Info.size() <= 1) {
+        // This could happen if our entire group has the same address, in
+        // which case "std::unique" removes all but the first element and we
+        // don't have a Group anymore.
+        continue;
+      }
+
+      unsigned Stride = Info0.Stride;
+      Group.Stride = Stride;
+      // If the group is bigger than the stride we can still de-interleave the
+      // first "Stride" members
+      if (Group.Info.size() > Stride) {
+        Group.Info.resize(Stride);
+        InfoB = Group.Info.begin();
+        InfoE = Group.Info.end();
+      }
+
+      if (!Group.isConsecutive()) {
+        // The group of memory instructions was not consecutive, try further.
+        continue;
+      }
+
+      // Everything is fine, return this group in offset-sorted order.
+      {
+        Group.Data.resize(Group.Info.size());
+        auto InfoIt = InfoB;
+        for (auto &Op : Group.Data) {
+          assert(InfoIt != InfoE);
+          Op = (InfoIt++)->MemOp;
+        }
+      }
+
+      Group.Base = Group.Info.front().Ptr;
+      Group.Offset = Group.Info.front().Offset;
+
+      // Put the Info list back into original Ops vector order
+      // (reverse order for Stores)
+      std::sort(InfoB, InfoE,
+                [](const GroupMemberInfo &a, const GroupMemberInfo &b) -> bool {
+                  return a.Order < b.Order;
+                });
+      return true;
+    }
+  }
+  return false;
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/loop_rotate_custom_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/loop_rotate_custom_pass.cpp
new file mode 100644
index 0000000000000..aee00d8c43233
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/loop_rotate_custom_pass.cpp
@@ -0,0 +1,39 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Scalar/LoopRotation.h"
+#include "transform/passes.h"
+
+using namespace llvm;
+
+llvm::PreservedAnalyses vecz::VeczLoopRotatePass::run(
+    llvm::Loop &L, llvm::LoopAnalysisManager &LAM,
+    llvm::LoopStandardAnalysisResults &AR, llvm::LPMUpdater &LU) {
+  // Only rotate loops where neither the latch nor any in-loop predecessor of
+  // the latch can exit. Without a unique latch (null) the loop is left alone.
+  BasicBlock *const latch = L.getLoopLatch();
+  if (!latch || L.isLoopExiting(latch)) {
+    return PreservedAnalyses::all();
+  }
+  for (BasicBlock *pred : predecessors(latch)) {
+    if (L.contains(pred) && L.isLoopExiting(pred)) {
+      return PreservedAnalyses::all();
+    }
+  }
+
+  return LoopRotatePass().run(L, LAM, AR, LU);
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
new file mode 100644
index 0000000000000..6213225d42170
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
@@ -0,0 +1,662 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// This file contains all the code to perform, on demand, the plumbing between
+// values that have been vectorized, vector-widened, instantiated, or
+// semi-widened/instantiated (otherwise known as Vector Sub-Widening),
+// including the broadcast of uniform values, scatters, gathers, vector splits
+// and concatenations.
+
+#include "transform/packetization_helpers.h"
+
+#include <compiler/utils/group_collective_helpers.h>
+#include <llvm/ADT/Twine.h>
+#include <llvm/Analysis/VectorUtils.h>
+#include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/Instructions.h>
+#include <multi_llvm/creation_apis_helper.h>
+#include <multi_llvm/multi_llvm.h>
+#include <multi_llvm/vector_type_helper.h>
+
+#include "debugging.h"
+#include "transform/packetizer.h"
+#include "vectorization_context.h"
+#include "vectorization_unit.h"
+#include "vecz/vecz_target_info.h"
+
+#define DEBUG_TYPE "vecz-packetization"
+
+using namespace llvm;
+using namespace vecz;
+
+namespace {
+inline Type *getWideType(Type *ty, ElementCount factor) {
+  // Scalars simply become a vector of `factor` elements.
+  auto *const vecTy = dyn_cast<llvm::VectorType>(ty);
+  if (!vecTy) {
+    return VectorType::get(ty, factor);
+  }
+  const bool srcIsScalable = isa<ScalableVectorType>(vecTy);
+  assert((!factor.isScalable() || !srcIsScalable) &&
+         "Can't widen a scalable vector by a scalable amount");
+  // Widening a scalable vector by a fixed factor must give a scalable
+  // result, so move the scalable flag onto the factor before scaling it.
+  if (srcIsScalable && !factor.isScalable()) {
+    factor = ElementCount::getScalable(factor.getKnownMinValue());
+  }
+  const unsigned minElts = vecTy->getElementCount().getKnownMinValue();
+  return VectorType::get(vecTy->getElementType(), factor * minElts);
+}
+
+Value *scalableBroadcastHelper(Value *subvec, ElementCount factor,
+                               const vecz::TargetInfo &TI, IRBuilder<> &B,
+                               bool URem);
+
+// Helper to broadcast a fixed vector by a scalable factor, thus:
+// <A,B> -> vscale x 1 -> <A,B,A,B,A,B,...>
+Value *createScalableBroadcastOfFixedVector(const vecz::TargetInfo &TI,
+                                            IRBuilder<> &B, Value *subvec,
+                                            ElementCount factor) {
+  assert(factor.isScalable());  // this helper takes scalable factors only
+  return scalableBroadcastHelper(subvec, factor, TI, B, /*URem*/ true);
+}
+
+// Helper to broadcast a scalable vector by a fixed factor, thus:
+// <A,B,C, ...> -> x 2 <A,A,B,B,C,C, ...>
+Value *createFixedBroadcastOfScalableVector(const vecz::TargetInfo &TI,
+                                            IRBuilder<> &B, Value *subvec,
+                                            ElementCount factor) {
+  assert(!factor.isScalable());  // this helper takes fixed factors only
+  return scalableBroadcastHelper(subvec, factor, TI, B, /*URem*/ false);
+}
+}  // namespace
+
+namespace vecz {
+Instruction *buildAfter(Value *V, Function &F, bool IsPhi) {
+  if (auto *const I = dyn_cast<Instruction>(V)) {  // point just after I
+    BasicBlock::iterator Next = I->getIterator();
+    const BasicBlock::iterator End = Next->getParent()->end();
+    do {  // unless IsPhi is set, also step over any PHIs/allocas after I
+      ++Next;
+    } while (!IsPhi && (Next != End) &&
+             (isa<PHINode>(Next) || isa<AllocaInst>(Next)));
+    return &*Next;  // NOTE(review): Next == End is not checked here
+  }
+  // Otherwise use the first entry-block instruction past the leading allocas.
+  auto it = F.getEntryBlock().begin();
+  while (isa<AllocaInst>(*it)) {
+    ++it;
+  }
+  return &*it;
+}
+
+Constant *getShuffleMask(ShuffleVectorInst *shuffle) {
+  // Returns the shuffle's mask as a Constant. The mask value seems not to be
+  // a proper operand for LLVM 11; NOTE this is "temporary" in the docs!
+  return shuffle->getShuffleMaskForBitcode();
+}
+
+Value *createOptimalShuffle(IRBuilder<> &B, Value *srcA, Value *srcB,
+                            const SmallVectorImpl<int> &mask,
+                            const Twine &name) {
+  const auto &maskC = mask;  // the outer mask, applied on top of any others
+  auto *shuffleA = dyn_cast<ShuffleVectorInst>(srcA);
+  // If we have a unary shuffle of a shuffle, we can just pre-shuffle the masks
+  if (shuffleA && isa<UndefValue>(srcB)) {
+    auto *const srcMask = getShuffleMask(shuffleA);
+    auto *const newMask = ConstantExpr::getShuffleVector(
+        srcMask, UndefValue::get(srcMask->getType()), maskC);
+    // The folded mask is applied directly to shuffleA's own two operands.
+    return B.CreateShuffleVector(shuffleA->getOperand(0),
+                                 shuffleA->getOperand(1), newMask, name);
+  }
+
+  auto *shuffleB = dyn_cast<ShuffleVectorInst>(srcB);
+
+  if (shuffleA && shuffleB) {
+    auto *const shuffleSrcA = shuffleA->getOperand(0);
+    auto *const shuffleSrcB = shuffleA->getOperand(1);
+
+    // If we have a shuffle of two shuffles with identical source operands,
+    // we can just pre-shuffle their masks together.
+    if (shuffleB->getOperand(0) == shuffleSrcA &&
+        shuffleB->getOperand(1) == shuffleSrcB) {
+      auto *const srcMaskA = getShuffleMask(shuffleA);
+      auto *const srcMaskB = getShuffleMask(shuffleB);
+      auto *const newMask =
+          ConstantExpr::getShuffleVector(srcMaskA, srcMaskB, maskC);
+
+      return B.CreateShuffleVector(shuffleSrcA, shuffleSrcB, newMask, name);
+    }
+  }
+
+  // If either operand is a unary shuffle, we can pull a few more tricks..
+  // For instance:
+  //
+  //    shuffle(shuffle(A, undef, maskA), shuffle(B, undef, maskB), maskC)
+  // => shuffle(A, B, shuffle(maskA, adjust(maskB), maskC))
+  // where "adjust" refers to adjusting the mask values to refer to the second
+  // source vector by adding the width of the first operand to the indices.
+  //
+  // If either source operand is something other than a unary shuffle, we can
+  // "pretend" it is a NOP shuffle of that operand (i.e. a mask of <0, 1, 2..>)
+  // and proceed as before, absorbing the unary shuffle from the other operand.
+  if (shuffleA && !isa<UndefValue>(shuffleA->getOperand(1))) {
+    shuffleA = nullptr;  // not unary: treat srcA as an opaque value
+  }
+  if (shuffleB && !isa<UndefValue>(shuffleB->getOperand(1))) {
+    shuffleB = nullptr;  // not unary: treat srcB as an opaque value
+  }
+
+  if (shuffleA || shuffleB) {
+    // We can absorb one or two unary shuffles into the new shuffle..
+    auto *const shuffleAsrc = shuffleA ? shuffleA->getOperand(0) : srcA;
+    auto *const shuffleBsrc = shuffleB ? shuffleB->getOperand(0) : srcB;
+    auto const srcASize =
+        cast<FixedVectorType>(shuffleAsrc->getType())->getNumElements();
+    auto const srcBSize =
+        cast<FixedVectorType>(shuffleBsrc->getType())->getNumElements();
+    if (srcASize == srcBSize) {  // otherwise fall through to the plain case
+      Constant *srcMaskA = nullptr;
+      Constant *srcMaskB = nullptr;
+
+      if (shuffleA) {
+        srcMaskA = getShuffleMask(shuffleA);
+      } else {
+        // if one operand is not a shuffle, we can make a pretend shuffle..
+        SmallVector<Constant *, 16> newMaskA;
+        for (unsigned i = 0; i < srcASize; ++i) {
+          newMaskA.push_back(B.getInt32(i));
+        }
+        srcMaskA = ConstantVector::get(newMaskA);
+      }
+
+      if (shuffleB) {
+        auto *const maskB = getShuffleMask(shuffleB);
+
+        // adjust the second mask to refer to the second vector..
+        srcMaskB = ConstantExpr::getAdd(
+            maskB, ConstantVector::getSplat(
+                       multi_llvm::getVectorElementCount(maskB->getType()),
+                       B.getInt32(srcASize)));
+      } else {
+        // if one operand is not a shuffle, we can make a pretend shuffle..
+        SmallVector<Constant *, 16> newMaskB;
+        for (unsigned i = 0; i < srcBSize; ++i) {
+          newMaskB.push_back(B.getInt32(i + srcASize));
+        }
+        srcMaskB = ConstantVector::get(newMaskB);
+      }
+
+      auto *const newMask =
+          ConstantExpr::getShuffleVector(srcMaskA, srcMaskB, maskC);
+
+      return B.CreateShuffleVector(shuffleAsrc, shuffleBsrc, newMask, name);
+    }
+  }
+
+  // No more optimal alternative, just build a new one
+  return B.CreateShuffleVector(srcA, srcB, maskC, name);
+}
+
+bool createSubSplats(const vecz::TargetInfo &TI, IRBuilder<> &B,
+                     SmallVectorImpl<Value *> &srcs, unsigned subWidth) {
+  // Scalable sub-splats must be handled specially.
+  if (isa<ScalableVectorType>(srcs.front()->getType())) {
+    if (srcs.size() != 1) {  // only a single scalable source is supported
+      return false;
+    }
+    Value *&val = srcs.front();
+    val = createFixedBroadcastOfScalableVector(
+        TI, B, val, ElementCount::getFixed(subWidth));
+    return val != nullptr;
+  }
+  // From here on, only fixed-length vector sources are accepted.
+  auto *const vecTy = dyn_cast<FixedVectorType>(srcs.front()->getType());
+
+  if (!vecTy) {
+    return false;
+  }
+
+  unsigned srcWidth = vecTy->getNumElements();
+
+  // Build the mask repeating each element subWidth times: <A,B> -> <A,A,B,B>.
+  SmallVector<int, 16> mask;
+  for (unsigned i = 0; i < srcWidth; ++i) {
+    for (unsigned j = 0; j < subWidth; ++j) {
+      mask.push_back(i);
+    }
+  }
+  // Mask and undef derive from srcs.front(); every src is replaced in place.
+  auto *undef = UndefValue::get(srcs.front()->getType());
+  for (auto &src : srcs) {
+    src = createOptimalShuffle(B, src, undef, mask);
+  }
+  return true;
+}
+
+Value *sanitizeVPReductionInput(IRBuilder<> &B, Value *Val, Value *VL,
+                                RecurKind Kind) {
+  // Lanes whose index is not below VL get getNeutralVal(Kind, ...) instead.
+  Type *const Ty = Val->getType();
+  ElementCount const NumElts = multi_llvm::getVectorElementCount(Ty);
+  Value *const Limit = B.CreateVectorSplat(NumElts, VL);
+  Value *const LaneIds = multi_llvm::createIndexSequence(
+      B, VectorType::get(VL->getType(), NumElts), NumElts);
+  Value *const Live = B.CreateICmpULT(LaneIds, Limit);
+  return B.CreateSelect(Live, Val, compiler::utils::getNeutralVal(Kind, Ty));
+}
+
+Value *getGatherIndicesVector(IRBuilder<> &B, Value *Indices, Type *Ty,
+                              unsigned FixedVecElts, const Twine &N) {
+  // Result lane i is Indices[i] + i * FixedVecElts.
+  auto *const LaneIds = B.CreateStepVector(Ty);
+  auto const NumLanes = multi_llvm::getVectorElementCount(Ty);
+  auto *const LaneTy = multi_llvm::getVectorElementType(Ty);
+  // Splat the per-lane scale, then scale the lane ids and add the offsets.
+  auto *const Scale =
+      B.CreateVectorSplat(NumLanes, ConstantInt::get(LaneTy, FixedVecElts));
+  auto *const Scaled = B.CreateMul(LaneIds, Scale);
+  return B.CreateAdd(Scaled, Indices, N);
+}
+}  // namespace vecz
+
+PacketRange PacketInfo::getRange(std::vector<llvm::Value *> &d,
+                                 unsigned width) const {
+  // Look up the packet recorded for this width; a width with no entry gives
+  // a PacketRange built from the storage alone.
+  const auto it = packets.find(width);
+  if (it == packets.end()) {
+    return PacketRange(d);
+  }
+  return PacketRange(d, it->second, width);
+}
+
+Value *Packetizer::Result::getAsValue() const {
+  if (!scalar || !info) {
+    return nullptr;
+  }
+  // Reuse the vector form if one has already been recorded.
+  if (info->vector) {
+    return info->vector;
+  }
+  // With no instances recorded, the vector form comes from broadcast(1).
+  const auto numInstances = info->numInstances;
+  if (numInstances == 0) {
+    return broadcast(1).info->vector;
+  }
+
+  const auto packet = getRange(numInstances);
+  assert(packet && "Packet doesn't exist when it should");
+
+  // If the instantiator broadcast the value, it will have set its own packet,
+  // so we fix that here: a packet whose lanes all equal `scalar` is a splat.
+  bool splat = true;
+  for (auto *v : packet) {
+    if (v != scalar) {
+      splat = false;
+      break;
+    }
+  }
+
+  if (splat) {
+    info->numInstances = 0;  // forget the packet, treat it as a broadcast
+    return broadcast(1).info->vector;
+  }
+
+  Type *const eleTy = packet.front()->getType();
+  assert(!eleTy->isVoidTy() && "Should not be getting a vector of voids");
+
+  auto name = scalar->getName();
+  // Three cases: valid vector element type, vector lanes, anything else.
+  if (FixedVectorType::isValidElementType(eleTy)) {
+    Value *gather = UndefValue::get(FixedVectorType::get(eleTy, packet.size()));
+    // Insert lane by lane, building after the packet's last value.
+    IRBuilder<> B(buildAfter(packet.back(), packetizer.F));
+    for (unsigned i = 0; i < packet.size(); i++) {
+      gather = B.CreateInsertElement(gather, packet.at(i), B.getInt32(i),
+                                     Twine(name, ".gather"));
+    }
+    info->vector = gather;
+  } else if (eleTy->isVectorTy()) {
+    // Gathering an instantiated vector by concatenating all the lanes
+    auto parts = narrow(2);
+    auto *vecTy = cast<FixedVectorType>(parts.front()->getType());
+    unsigned fullWidth = vecTy->getNumElements() * 2;
+    // Identity mask over both halves, i.e. a plain concatenation.
+    SmallVector<int, 16> mask;
+    for (size_t j = 0; j < fullWidth; ++j) {
+      mask.push_back(j);
+    }
+
+    IRBuilder<> B(buildAfter(parts[1], packetizer.F));
+    info->vector = B.CreateShuffleVector(parts[0], parts[1], mask,
+                                         Twine(name, ".concatenate"));
+  } else {
+    Value *gather = UndefValue::get(ArrayType::get(eleTy, packet.size()));
+    // Not a valid vector element type: collect the lanes into an array.
+    IRBuilder<> B(buildAfter(packet.back(), packetizer.F));
+    for (unsigned i = 0; i < packet.size(); i++) {
+      gather =
+          B.CreateInsertValue(gather, packet.at(i), i, Twine(name, ".gather"));
+    }
+    info->vector = gather;
+  }
+  return info->vector;
+}
+
+PacketRange Packetizer::Result::getAsPacket(unsigned width) const {
+  if (!scalar || !info) {
+    return PacketRange(packetizer.packetData);
+  }
+  // Reuse a packet of exactly this width if one has already been created.
+  if (const auto range = getRange(width)) {
+    return range;
+  }
+
+  auto numInstances = info->numInstances;
+  if (numInstances == 0) {
+    return broadcast(width).getRange(width);
+  }
+  // A packet of another width exists: derive the requested one from it.
+  if (numInstances != 1) {
+    if (numInstances < width) {
+      return widen(width);
+    } else if (numInstances > width) {
+      return narrow(width);
+    } else {
+      assert(false && "Supposedly unreachable condition in Packetizer::Result");
+    }
+  }
+
+  if (!info->vector) {
+    return PacketRange(packetizer.packetData);
+  }
+  // From here on the packet is carved out of the single cached vector.
+  auto packet = createPacket(width);
+
+  Value *vec = info->vector;
+  if (auto *const vecTy = dyn_cast<FixedVectorType>(vec->getType())) {
+    // NB: vecTy is null in the else branch; never pass it to isa<> there.
+    unsigned scalarWidth = vecTy->getNumElements() / width;
+    if (scalarWidth > 1) {
+      auto *const undef = UndefValue::get(vec->getType());
+
+      // Build shuffle mask to perform the subvector extracts.
+      IRBuilder<> B(buildAfter(vec, packetizer.F));
+      for (size_t i = 0, k = 0; i < width; ++i) {
+        SmallVector<int, 16> mask;
+        for (size_t j = 0; j < scalarWidth; ++j, ++k) {
+          mask.push_back(k);
+        }
+        packet[i] = createOptimalShuffle(B, vec, undef, mask,
+                                         Twine(scalar->getName(), ".split"));
+      }
+    } else {
+      IRBuilder<> B(buildAfter(vec, packetizer.F));
+      for (unsigned i = 0; i < width; i++) {
+        packet[i] = B.CreateExtractElement(vec, B.getInt32(i));
+      }
+    }
+  } else {
+    assert(isa<ArrayType>(vec->getType()) && "Must be an array here!");
+    IRBuilder<> B(buildAfter(vec, packetizer.F));
+    for (unsigned i = 0; i < width; i++) {
+      packet[i] = B.CreateExtractValue(vec, i);
+    }
+  }
+  return packet;
+}
+
+void Packetizer::Result::getPacketValues(SmallVectorImpl<Value *> &vals) const {
+  assert(info && "No packet info for this packetization result");
+  // A zero instance count leaves `vals` untouched.
+  if (const auto numInstances = info->numInstances) {
+    getPacketValues(numInstances, vals);
+  }
+}
+
+void Packetizer::Result::getPacketValues(unsigned width,
+                                         SmallVectorImpl<Value *> &vals) const {
+  assert(width != 0 && "Can't get a zero width packet");
+  if (width != 1) {
+    auto range = getAsPacket(width);
+    vals.assign(range.begin(), range.end());  // replaces existing contents
+    return;
+  }
+  if (auto *const single = getAsValue()) {
+    vals.push_back(single);  // appended only when non-null
+  }
+}
+
+PacketRange Packetizer::Result::createPacket(unsigned width) const {
+  assert(info && "Can't create a packet on a fail state");
+  assert(info->packets.count(width) == 0 &&
+         "Shouldn't create the same packet twice");
+  auto &storage = packetizer.packetData;  // packetizer-owned slot storage
+  const auto offset = storage.size();
+  storage.resize(offset + width, nullptr);  // new slots start out null
+  info->packets[width] = offset;
+  return PacketRange(storage, offset, width);
+}
+
+PacketRange Packetizer::Result::getRange(unsigned width) const {
+  return info->getRange(packetizer.packetData, width);  // info not null-checked
+}
+
+// Makes a wider packet (more entries) by splitting the existing sub-vectors.
+PacketRange Packetizer::Result::widen(unsigned width) const {
+  const auto numInstances = info->numInstances;
+  const auto parts = getRange(numInstances);
+  auto *const vecTy = dyn_cast<FixedVectorType>(parts.front()->getType());
+  assert(vecTy && "Expected a fixed vector type");
+
+  auto packet = createPacket(width);
+  const auto origWidth = vecTy->getNumElements();
+  const auto newWidth = (origWidth * numInstances) / width;
+  const auto name = scalar->getName();
+  // `it` walks the source sub-vectors; origIdx is the offset within `*it`.
+  auto *it = parts.begin();
+  IRBuilder<> B(buildAfter(parts.back(), packetizer.F));
+  if (newWidth > 1) {
+    auto *const undef = UndefValue::get(vecTy);
+
+    // Build shuffle mask to perform the subvector extracts.
+    for (size_t i = 0, origIdx = 0; i < width; ++i) {
+      if (origIdx == origWidth) {
+        origIdx = 0;
+        ++it;
+      }
+      SmallVector<int, 16> mask;
+      for (size_t j = 0; j < newWidth; ++j, ++origIdx) {
+        mask.push_back(origIdx);
+      }
+      packet[i] =
+          createOptimalShuffle(B, *it, undef, mask, Twine(name, ".split"));
+    }
+  } else {
+    for (size_t i = 0, origIdx = 0; i < width; ++i, ++origIdx) {
+      if (origIdx == origWidth) {
+        origIdx = 0;
+        ++it;
+      }
+      packet[i] = B.CreateExtractElement(*it, B.getInt32(origIdx),
+                                         Twine(name, ".split"));
+    }
+  }
+  return packet;
+}
+
+// Makes a narrower packet (fewer entries) by concatenating the sub-vectors.
+PacketRange Packetizer::Result::narrow(unsigned width) const {
+  if (const auto range = getRange(width)) {
+    return range;
+  }
+  // NOTE(review): terminates only if getRange finds some width * 2^k packet.
+  // Narrow recursively: fetch the packet of twice the width, then pair it up.
+  const auto parts = narrow(width * 2);
+  assert(parts && "Error during packet narrowing");
+
+  auto packet = createPacket(width);
+  auto *const ty = parts.front()->getType();
+  auto *const vecTy = dyn_cast<FixedVectorType>(ty);
+  if (!vecTy) {
+    // Build vectors out of pairs of scalar values
+    const auto name = scalar->getName();
+    IRBuilder<> B(buildAfter(parts.back(), packetizer.F));
+    Value *undef = UndefValue::get(FixedVectorType::get(ty, 2));
+    for (size_t i = 0, pairIdx = 0; i < width; ++i, pairIdx += 2) {
+      Value *in = B.CreateInsertElement(undef, parts[pairIdx], B.getInt32(0),
+                                        Twine(name, ".gather"));
+      packet[i] = B.CreateInsertElement(in, parts[pairIdx + 1], B.getInt32(1),
+                                        Twine(name, ".gather"));
+    }
+    return packet;
+  }
+
+  const unsigned fullWidth = vecTy->getNumElements() * 2;
+  // Identity mask over both inputs, i.e. a plain concatenation.
+  SmallVector<int, 16> mask;
+  for (size_t j = 0; j < fullWidth; ++j) {
+    mask.push_back(j);
+  }
+
+  // Build wider vectors by concatenating pairs of sub-vectors
+  const auto name = scalar->getName();
+  IRBuilder<> B(buildAfter(parts.back(), packetizer.F));
+  for (size_t i = 0, pairIdx = 0; i < width; ++i, pairIdx += 2) {
+    packet[i] = createOptimalShuffle(B, parts[pairIdx], parts[pairIdx + 1],
+                                     mask, Twine(name, ".concatenate"));
+  }
+  return packet;
+}
+
+namespace {
+// Broadcasts a fixed-length vector to a scalable one, or a scalable vector by
+// a fixed amount, barring any optimizations we can perform for broadcasting a
+// splat vector. URem picks the target's outer broadcast rather than its inner
+// one.
+// The general idea is first to store the subvector to a stack 'alloca', then
+// use a gather operation with a vector of pointers created using a step vector
+// modulo the fixed amount.
+// Note that other sequences are possible, such as a series of blend
+// operations. This could perhaps be a target choice.
+Value *scalableBroadcastHelper(Value *subvec, ElementCount factor,
+                               const vecz::TargetInfo &TI, IRBuilder<> &B,
+                               bool URem) {
+  auto *ty = subvec->getType();
+  auto const subVecEltCount = multi_llvm::getVectorElementCount(ty);
+  assert(subVecEltCount.isScalable() ^ factor.isScalable() &&
+         "Must either broadcast fixed vector by scalable factor or scalable "
+         "vector by fixed factor");
+  auto *const wideTy = getWideType(ty, factor);
+  auto wideEltCount = multi_llvm::getVectorElementCount(wideTy);
+
+  // If this vector is a constant splat, just splat it to the wider scalable
+  // type.
+  if (auto *const cvec = dyn_cast<Constant>(subvec)) {
+    if (auto *const splat = cvec->getSplatValue()) {
+      return ConstantVector::getSplat(wideEltCount, splat);
+    }
+  }
+  // Or if it's a splat value, re-splat it. Note we do Constants separately
+  // above as it generates more canonical code, e.g., a splat of 0 becomes
+  // zeroinitializer rather than an insertelement/shufflevector sequence.
+  if (const auto *const splat = getSplatValue(subvec)) {
+    return B.CreateVectorSplat(wideEltCount, const_cast<Value *>(splat));
+  }
+
+  // Compiler support for masked.gather on i1 vectors is lacking, so emit this
+  // operation as the equivalent i8 vector instead.
+  const bool upcast_i1_as_i8 = ty->getScalarType()->isIntegerTy(1);
+  if (upcast_i1_as_i8) {
+    auto *const int8Ty = Type::getInt8Ty(B.getContext());
+    ty = llvm::VectorType::get(int8Ty, subVecEltCount);
+    subvec = B.CreateSExt(subvec, ty);
+  }
+
+  Value *gather =
+      URem ? TI.createOuterScalableBroadcast(B, subvec, /*VL*/ nullptr, factor)
+           : TI.createInnerScalableBroadcast(B, subvec, /*VL*/ nullptr, factor);
+
+  // If we've been performing this broadcast as i8, now's the time to truncate
+  // back down to i1.
+  if (upcast_i1_as_i8) {
+    gather = B.CreateTrunc(gather, wideTy);
+  }
+
+  return gather;
+}
+}  // namespace
+
+const Packetizer::Result &Packetizer::Result::broadcast(unsigned width) const {
+  const auto factor = packetizer.width().divideCoefficientBy(width);
+  auto *const ty = scalar->getType();
+  assert(!ty->isVoidTy() && "Should not be broadcasting a void type");
+
+  if (width != 1 && !factor.isScalable() && factor.getFixedValue() == 1) {
+    // Pure instantiation broadcast: every packet entry is the scalar itself.
+    for (auto &v : createPacket(width)) {
+      v = scalar;
+    }
+    return *this;
+  }
+  // Otherwise build one wide value, chosen by the scalar's kind and type.
+  auto &F = packetizer.F;
+  Value *result = nullptr;
+  const auto &TI = packetizer.context().targetInfo();
+  if (isa<UndefValue>(scalar)) {
+    result = UndefValue::get(getWideType(ty, factor));
+  } else if (ty->isVectorTy() && factor.isScalable()) {
+    IRBuilder<> B(buildAfter(scalar, F));
+    result = createScalableBroadcastOfFixedVector(TI, B, scalar, factor);
+  } else if (ty->isVectorTy()) {
+    auto *const vecTy = cast<FixedVectorType>(ty);
+    unsigned scalarWidth = vecTy->getNumElements();
+
+    unsigned simdWidth = factor.getFixedValue();
+
+    // Build shuffle mask to perform the splat.
+    SmallVector<int, 16> mask;
+    for (size_t i = 0; i < simdWidth; ++i) {
+      for (size_t j = 0; j < scalarWidth; ++j) {
+        mask.push_back(j);
+      }
+    }
+
+    IRBuilder<> B(buildAfter(scalar, packetizer.F));
+    result = createOptimalShuffle(B, scalar, UndefValue::get(ty), mask,
+                                  Twine(scalar->getName(), ".broadcast"));
+  } else if (auto *const C = dyn_cast<Constant>(scalar)) {
+    result = ConstantVector::getSplat(factor, C);
+  } else {
+    IRBuilder<> B(buildAfter(scalar, packetizer.F));
+    result = B.CreateVectorSplat(factor, scalar);
+  }
+
+  if (!result) {
+    // Failed to broadcast this value, return the empty result
+    return *this;
+  }
+  // Width 1 caches the vector; wider requests fill every packet entry with it.
+  if (width == 1) {
+    info->vector = result;
+  } else {
+    for (auto &v : createPacket(width)) {
+      v = result;
+    }
+  }
+  return *this;
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_pass.cpp
new file mode 100644
index 0000000000000..1934a81938be7
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_pass.cpp
@@ -0,0 +1,81 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "transform/packetization_pass.h"
+
+#include <llvm/ADT/Statistic.h>
+#include <llvm/Analysis/LoopInfo.h>
+#include <llvm/IR/Dominators.h>
+
+#include "analysis/control_flow_analysis.h"
+#include "analysis/divergence_analysis.h"
+#include "analysis/simd_width_analysis.h"
+#include "analysis/stride_analysis.h"
+#include "analysis/uniform_value_analysis.h"
+#include "analysis/vectorization_unit_analysis.h"
+#include "debugging.h"
+#include "transform/packetizer.h"
+#include "vectorization_unit.h"
+#include "vecz/vecz_target_info.h"
+
+#define DEBUG_TYPE "vecz-packetization"
+
+using namespace vecz;
+using namespace llvm;
+
+STATISTIC(VeczPacketizeFail,
+          "Number of kernels that failed to packetize [ID#P80]");
+STATISTIC(VeczSimdAnalysisFail,
+          "Number of kernels that SIMD Width Analysis "
+          "suggested not to packetize [ID#P81]");
+
+char PacketizationPass::PassID = 0;
+
+PreservedAnalyses PacketizationPass::run(Function &F,
+                                         llvm::FunctionAnalysisManager &AM) {
+  auto &Unit = AM.getResult<VectorizationUnitAnalysis>(F).getVU();
+
+  // Only fixed widths are re-evaluated; scalable widths are used as given.
+  if (!Unit.width().isScalable()) {
+    const unsigned OldWidth = Unit.width().getFixedValue();
+    if (Unit.autoWidth() && Unit.context().targetInfo().getTargetMachine()) {
+      LLVM_DEBUG(dbgs() << "vecz: Original SIMD width: " << OldWidth << "\n");
+      const unsigned Suggested = AM.getResult<SimdWidthAnalysis>(F).value;
+      LLVM_DEBUG(dbgs() << "vecz: Re-determined SIMD width: " << Suggested
+                        << "\n");
+      if (Suggested <= 1u) {
+        ++VeczSimdAnalysisFail;
+        return Unit.setFailed("SIMD Width Analysis suggested not to packetize");
+      }
+      // The analysis may only shrink the requested width, never grow it.
+      if (Suggested < OldWidth) {
+        Unit.setWidth(ElementCount::getFixed(Suggested));
+      }
+    }
+  }
+
+  if (!Packetizer::packetize(F, AM, Unit.width(), Unit.dimension())) {
+    ++VeczPacketizeFail;
+    return Unit.setFailed("packetization failed");
+  }
+
+  PreservedAnalyses PA;
+  PA.preserve<DominatorTreeAnalysis>();
+  PA.preserve<LoopAnalysis>();
+  PA.preserve<CFGAnalysis>();
+  PA.preserve<DivergenceAnalysis>();
+  return PA;
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
new file mode 100644
index 0000000000000..b377881034085
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -0,0 +1,3283 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "transform/packetizer.h"
+
+#include <compiler/utils/builtin_info.h>
+#include <compiler/utils/mangling.h>
+#include <llvm/ADT/DepthFirstIterator.h>
+#include <llvm/ADT/SmallPtrSet.h>
+#include <llvm/ADT/Statistic.h>
+#include <llvm/ADT/Twine.h>
+#include <llvm/Analysis/LoopInfo.h>
+#include <llvm/Analysis/VectorUtils.h>
+#include <llvm/IR/DIBuilder.h>
+#include <llvm/IR/DebugInfoMetadata.h>
+#include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/Instructions.h>
+#include <llvm/IR/IntrinsicInst.h>
+#include <llvm/IR/Intrinsics.h>
+#include <llvm/IR/Module.h>
+#include <llvm/Support/Debug.h>
+#include <llvm/Support/raw_ostream.h>
+#include <llvm/Transforms/Utils/LoopUtils.h>
+#include <multi_llvm/creation_apis_helper.h>
+#include <multi_llvm/llvm_version.h>
+#include <multi_llvm/multi_llvm.h>
+#include <multi_llvm/opaque_pointers.h>
+#include <multi_llvm/optional_helper.h>
+#include <multi_llvm/vector_type_helper.h>
+
+#include <memory>
+
+#include "analysis/instantiation_analysis.h"
+#include "analysis/packetization_analysis.h"
+#include "analysis/stride_analysis.h"
+#include "analysis/uniform_value_analysis.h"
+#include "analysis/vectorization_unit_analysis.h"
+#include "debugging.h"
+#include "llvm_helpers.h"
+#include "memory_operations.h"
+#include "transform/instantiation_pass.h"
+#include "transform/packetization_helpers.h"
+#include "vectorization_unit.h"
+#include "vecz/vecz_choices.h"
+#include "vecz/vecz_target_info.h"
+
+#define DEBUG_TYPE "vecz-packetization"
+
+using namespace vecz;
+using namespace llvm;
+
+STATISTIC(VeczPacketized, "Number of instructions packetized [ID#P00]");
+STATISTIC(VeczPacketizeFailCall,
+          "Packetize: missing function declarations [ID#P81]");
+STATISTIC(VeczPacketizeFailType,
+          "Packetize: inconsistent vector parameters [ID#P87]");
+STATISTIC(VeczPacketizeFailPtr,
+          "Packetize: inconsistent pointer parameters [ID#P88]");
+STATISTIC(VeczPacketizeFailStride,
+          "Packetize: non-constant strides in pointer parameters [ID#P8A]");
+
+// Just a little macro that can return an empty SmallVector, as a drop-in
+// replacement for VECZ_FAIL_IF.
+#define PACK_FAIL_IF(cond) \
+  do {                     \
+    if (cond) {            \
+      return {};           \
+    }                      \
+  } while (false)
+
+namespace {
+// Pads three-element fixed vectors out to four elements.
+// Every other type, vector or not, is handed back untouched.
+Type *getPaddedType(Type *Ty) {
+  auto *const Fixed = dyn_cast<FixedVectorType>(Ty);
+  if (!Fixed || Fixed->getNumElements() != 3) {
+    return Ty;
+  }
+  // <3 x T> becomes <4 x T>.
+  Type *const EltTy = Fixed->getElementType();
+  return VectorType::get(EltTy, ElementCount::getFixed(4));
+}
+
+// Scalar T gives <Factor x T>; fixed <N x T> gives <(Factor * N) x T>.
+Type *getWideType(Type *Ty, ElementCount Factor) {
+  if (!Ty->isVectorTy()) {
+    return VectorType::get(Ty, Factor);
+  }
+  auto *const Fixed = cast<FixedVectorType>(Ty);
+  return VectorType::get(Fixed->getElementType(),
+                         Factor * Fixed->getNumElements());
+}
+}  // namespace
+
+using ValuePacket = SmallVector<Value *, 16>;
+
+/// @brief Private implementation of the Packetizer.
+/// It inherits its own outer class, which has only private constructors. This
+/// allows us to pass it by reference to functions that need to access the
+/// Packetizer, while also ensuring that a Packetizer cannot be created except
+/// as the base class of its own implementation.
+class Packetizer::Impl : public Packetizer {
+ public:
+  Impl(llvm::Function &F, llvm::FunctionAnalysisManager &AM, ElementCount Width,
+       unsigned Dim);
+  Impl() = delete;
+  Impl(const Packetizer &) = delete;  // NOTE(review): not Impl's copy ctor
+  Impl(Packetizer &&) = delete;       // NOTE(review): not Impl's move ctor
+  ~Impl();
+
+  bool packetize();
+
+  /// @brief Handle packetization failure. This method ensures that
+  /// packetization failure does not leave behind invalid IR.
+  void onFailure();
+
+  /// @brief Packetize the given value from the function.
+  ///
+  /// @param[in] V Value to packetize.
+  ///
+  /// @return Packetized value.
+  Result packetize(Value *V);
+
+  /// @brief Packetize the given value and return the packet by values
+  ///
+  /// @param[in] V Value to packetize.
+  ///
+  /// @return Packetized values.
+  ValuePacket packetizeAndGet(Value *V);
+
+  /// @brief Packetize the given value to a specified packet width, and return
+  /// the packet by values
+  ///
+  /// @param[in] V Value to packetize.
+  /// @param[in] Width the requested packet width
+  ///
+  /// @return Packetized values.
+  ValuePacket packetizeAndGet(Value *V, unsigned Width);
+
+  /// @brief Packetize the given value from the function, only if it is a
+  /// varying value. Ensures Mask Varying values are handled correctly.
+  ///
+  /// @param[in] V Value to packetize.
+  ///
+  /// @return Packetized value if varying, or the original value if Uniform.
+  Value *packetizeIfVarying(Value *V);
+
+  /// @brief Packetize a uniform value by broadcasting to all vector lanes.
+  ///
+  /// @param[in] V Value to broadcast
+  ///
+  /// @return Packetized instruction
+  Result broadcast(Value *V);
+  /// @brief Reduce a varying boolean condition to a scalar
+  ///
+  /// @param[in] cond Condition to packetize.
+  /// @param[in] terminator Terminator instruction.
+  /// @param[in] allOf Whether to create an all-of mask, or an any-of mask.
+  ///
+  /// @return reduced boolean value.
+  Value *reduceBranchCond(Value *cond, Instruction *terminator, bool allOf);
+  /// @brief Compute the ideal packet width for subwidening the given type
+  ///
+  /// @param[in] ty Type of the value to subwiden
+  /// @param[in] limit The maximum vector width we allow
+  ///
+  /// @return width of the packet to create
+  unsigned getPacketWidthForType(Type *ty, unsigned limit = ~0u) const;
+  /// @brief Packetize an instruction.
+  ///
+  /// @param[in] Ins Instruction to packetize.
+  ///
+  /// @return Packetized instructions.
+  Result packetizeInstruction(Instruction *Ins);
+  /// @brief Packetize a mask-varying instruction.
+  ///
+  /// @param[in] I Instruction to packetize.
+  ///
+  /// @return Packetized instruction.
+  Value *packetizeMaskVarying(Instruction *I);
+  /// @brief Packetize a mask-varying subgroup reduction.
+  ///
+  /// @param[in] I Instruction to packetize.
+  ///
+  /// @return Packetized instruction.
+  Value *packetizeSubgroupReduction(Instruction *I);
+  /// @brief Packetize a subgroup broadcast.
+  ///
+  /// @param[in] I Instruction to packetize.
+  ///
+  /// @return Packetized instruction.
+  Value *packetizeSubgroupBroadcast(Instruction *I);
+  /// @brief Packetize PHI node.
+  ///
+  /// @param[in] Phi PHI Node to packetize.
+  ///
+  /// @return Packetized values.
+  ValuePacket packetizePHI(PHINode *Phi);
+  /// @brief Packetize a call instruction.
+  ///
+  /// @param[in] CI Call Instruction to packetize.
+  ///
+  /// @return Packetized values.
+  ValuePacket packetizeCall(CallInst *CI);
+  /// @brief Packetize a subgroup scan.
+  ///
+  /// @param[in] CI CallInst to packetize.
+  /// @param[in] SubgroupScanKind Kind of subgroup scan to packetize.
+  ///
+  /// @return Packetized values.
+  ValuePacket packetizeSubgroupScan(
+      CallInst *CI, compiler::utils::BuiltinSubgroupScanKind SubgroupScanKind);
+  /// @brief Perform post-packetization tasks for the given scalar value.
+  ///
+  /// @param[in] Scalar Scalar value to assign a vectorized value.
+  /// @param[in] Vectorized Packetized value to assign.
+  ///
+  /// @return Packetized values.
+  Result assign(Value *Scalar, Value *Vectorized);
+  /// @brief Vectorize an instruction.
+  ///
+  /// @param[in] Ins Instruction to vectorize.
+  ///
+  /// @return Vectorized value.
+  Value *vectorizeInstruction(Instruction *Ins);
+  /// @brief Packetize a load instruction.
+  ///
+  /// @param[in] Load Instruction to packetize.
+  ///
+  /// @return Packetized instruction.
+  ValuePacket packetizeLoad(LoadInst *Load);
+  /// @brief Packetize a store instruction.
+  ///
+  /// @param[in] Store Instruction to packetize.
+  ///
+  /// @return Packetized instruction.
+  ValuePacket packetizeStore(StoreInst *Store);
+  /// @brief Packetize a memory operation.
+  ///
+  /// @param[in] Op Memory operation to packetize.
+  ///
+  /// @return Packetized instruction.
+  ValuePacket packetizeMemOp(MemOp &Op);
+  /// @brief Packetize a GEP instruction.
+  ///
+  /// @param[in] GEP Instruction to packetize.
+  ///
+  /// @return Packetized instruction.
+  ValuePacket packetizeGEP(GetElementPtrInst *GEP);
+  /// @brief Packetize a cast instruction.
+  ///
+  /// @param[in] CastI Instruction to packetize.
+  ///
+  /// @return Packetized instruction.
+  ValuePacket packetizeCast(CastInst *CastI);
+  /// @brief Packetize a binary operator instruction.
+  ///
+  /// @param[in] BinOp Instruction to packetize.
+  ///
+  /// @return Packetized instruction.
+  ValuePacket packetizeBinaryOp(BinaryOperator *BinOp);
+  /// @brief Packetize a freeze instruction.
+  ///
+  /// @param[in] FreezeI Instruction to packetize.
+  ///
+  /// @return Packetized instruction.
+  ValuePacket packetizeFreeze(FreezeInst *FreezeI);
+  /// @brief Packetize a unary operator instruction.
+  ///
+  /// @param[in] UnOp Instruction to packetize.
+  ///
+  /// @return Packetized instruction.
+  ValuePacket packetizeUnaryOp(UnaryOperator *UnOp);
+  /// @brief Packetize an integer compare instruction.
+  ///
+  /// @param[in] Cmp Instruction to packetize.
+  ///
+  /// @return Packetized instruction.
+  ValuePacket packetizeICmp(ICmpInst *Cmp);
+  /// @brief Packetize a floating-point compare instruction.
+  ///
+  /// @param[in] Cmp Instruction to packetize.
+  ///
+  /// @return Packetized instruction.
+  ValuePacket packetizeFCmp(FCmpInst *Cmp);
+  /// @brief Packetize a select instruction.
+  ///
+  /// @param[in] Select Instruction to packetize.
+  ///
+  /// @return Packetized instruction.
+  ValuePacket packetizeSelect(SelectInst *Select);
+  /// @brief Packetize a return instruction.
+  ///
+  /// @param[in] Return Instruction to packetize.
+  ///
+  /// @return Packetized instruction.
+  Value *vectorizeReturn(ReturnInst *Return);
+  /// @brief Packetize a call instruction.
+  ///
+  /// @param[in] CI Instruction to packetize.
+  ///
+  /// @return Packetized instruction.
+  Value *vectorizeCall(CallInst *CI);
+  /// @brief Packetize a call to a work-group builtin.
+  ///
+  /// @param[in] CI Instruction to packetize.
+  /// @param[in] Builtin Builtin identifier.
+  ///
+  /// @return Packetized instruction.
+  Value *vectorizeWorkGroupCall(CallInst *CI,
+                                compiler::utils::BuiltinCall const &Builtin);
+  /// @brief Packetize an alloca instruction.
+  ///
+  /// @param[in] Alloca Instruction to packetize.
+  ///
+  /// @return Packetized instruction.
+  Value *vectorizeAlloca(AllocaInst *Alloca);
+  /// @brief Packetize an extract value instruction.
+  ///
+  /// @param[in] ExtractElement Instruction to packetize.
+  ///
+  /// @return Packetized instruction.
+  Value *vectorizeExtractValue(ExtractValueInst *ExtractElement);
+  /// @brief Packetize an insert element instruction.
+  ///
+  /// @param[in] InsertElement Instruction to packetize.
+  ///
+  /// @return Packetized instruction.
+  ValuePacket packetizeInsertElement(InsertElementInst *InsertElement);
+  /// @brief Packetize an extract element instruction.
+  ///
+  /// @param[in] ExtractElement Instruction to packetize.
+  ///
+  /// @return Packetized instruction.
+  ValuePacket packetizeExtractElement(ExtractElementInst *ExtractElement);
+  /// @brief Packetize a shuffle vector instruction.
+  ///
+  /// @param[in] Shuffle Instruction to packetize.
+  ///
+  /// @return Packetized instruction.
+  ValuePacket packetizeShuffleVector(ShuffleVectorInst *Shuffle);
+  /// @brief Preserves debug information attached to old scalar instruction,
+  ///        updating the debug info type to match the vector width.
+  ///
+  /// @param[in] Scalar Scalar instruction before packetization.
+  /// @param[in] Packet Packetized instruction.
+  void vectorizeDI(Instruction *Scalar, Value *Packet);
+
+  /// @brief Helps handle instructions that cannot be packetized.
+  std::unique_ptr<InstantiationPass> Instantiator;
+
+  /// @brief List of phi nodes that can be used by passes to defer the
+  /// processing of these nodes.
+  std::vector<PHINode *> pendingPhis;
+
+  /// @brief The target transform info
+  const TargetTransformInfo TTI;
+};
+
+Packetizer::Packetizer(llvm::Function &F, llvm::FunctionAnalysisManager &AM,
+                       ElementCount Width, unsigned Dim)
+    : AM(AM),
+      VU(AM.getResult<VectorizationUnitAnalysis>(F).getVU()),
+      Ctx(AM.getResult<VectorizationContextAnalysis>(F).getContext()),
+      Choices(VU.choices()),  // reads VU: relies on VU being declared first
+      UVR(AM.getResult<UniformValueAnalysis>(F)),
+      SAR(AM.getResult<StrideAnalysis>(F)),
+      PAR(AM.getResult<PacketizationAnalysis>(F)),
+      F(F),
+      SimdWidth(Width),
+      Dimension(Dim) {}
+
+Packetizer::Impl::Impl(llvm::Function &F, llvm::FunctionAnalysisManager &AM,
+                       ElementCount Width, unsigned Dim)
+    : Packetizer(F, AM, Width, Dim), TTI(Ctx.getTargetTransformInfo(F)) {
+  Instantiator.reset(new InstantiationPass(*this));  // owned by this Impl
+}
+
+Packetizer::Impl::~Impl() = default;
+
+bool Packetizer::packetize(llvm::Function &F, llvm::FunctionAnalysisManager &AM,
+                           ElementCount Width, unsigned Dim) {
+  Impl Worker(F, AM, Width, Dim);
+  if (Worker.packetize()) {
+    return true;
+  }
+  // Packetization failed: run the implementation's failure handler.
+  Worker.onFailure();
+  return false;
+}
+
+// Packetizes the whole function `F` in place. The steps are, in order:
+//  1. optionally compute the vector length `VL` used for vector predication;
+//  2. manifest the stride values and set up packets for function arguments;
+//  3. packetize every instruction the packetization analysis asks for;
+//  4. fill in the incoming values of the PHI nodes left pending by step 3;
+//  5. replace or widen the allocas of the entry block;
+//  6. replace calls to get_sub_group_size;
+//  7. delete all the instructions queued for deletion.
+// Returns true on success. The VECZ_FAIL* macros bail out early on failure;
+// the static Packetizer::packetize() wrapper then calls onFailure().
+bool Packetizer::Impl::packetize() {
+  LLVM_DEBUG(if (PAR.isEmpty()) {
+    llvm::dbgs() << "No vector leaves in function "
+                 << VU.scalarFunction()->getName() << "\n";
+  });
+
+  // If requested, set up the base vector length for this kernel based on the
+  // number of remaining work items: the local size minus the local id. Since
+  // VP intrinsics are undefined for %evl values larger than the actual vector
+  // width, we also constrain it based on the vectorization width.
+  BasicBlock &EntryBB = F.getEntryBlock();
+  IRBuilder<> B(&*EntryBB.getFirstInsertionPt());
+
+  if (Choices.vectorPredication()) {
+    auto &M = *F.getParent();
+    auto *const I32Ty = Type::getInt32Ty(F.getContext());
+    auto *const LocalIdFn = Ctx.builtins().getOrDeclareMuxBuiltin(
+        compiler::utils::eMuxBuiltinGetLocalId, M);
+    auto *const LocalSizeFn = Ctx.builtins().getOrDeclareMuxBuiltin(
+        compiler::utils::eMuxBuiltinGetLocalSize, M);
+    assert(LocalIdFn && LocalSizeFn && "Unable to create mux builtins");
+    // Fail gracefully in builds without assertions. This has to happen before
+    // the declarations are dereferenced below; checking the created calls
+    // afterwards would be too late to be of any use.
+    VECZ_FAIL_IF(!LocalIdFn || !LocalSizeFn);
+    auto *const ID =
+        B.CreateCall(LocalIdFn, B.getInt32(VU.dimension()), "local.id");
+    ID->setAttributes(LocalIdFn->getAttributes());
+    ID->setCallingConv(LocalIdFn->getCallingConv());
+    auto *const Size =
+        B.CreateCall(LocalSizeFn, B.getInt32(VU.dimension()), "local.size");
+    Size->setAttributes(LocalSizeFn->getAttributes());
+    Size->setCallingConv(LocalSizeFn->getCallingConv());
+
+    VL = B.CreateSub(Size, ID, "work.remaining", /*HasNUW*/ true,
+                     /*HasNSW*/ true);
+
+    // Let the target compute the kernel width if it wants to; otherwise clamp
+    // the remaining work to the (possibly scalable) vectorization width and
+    // truncate the result to i32.
+    if (auto *RVVVL = Ctx.targetInfo().createVPKernelWidth(
+            B, VL, /*WidestType*/ 32, VU.width())) {
+      VL = RVVVL;
+    } else {
+      auto *const Scaling =
+          ConstantInt::get(VL->getType(), VU.width().getKnownMinValue());
+      auto *const VectorLength =
+          VU.width().isScalable() ? B.CreateVScale(Scaling) : Scaling;
+      VL = B.CreateIntrinsic(Intrinsic::umin, {VL->getType()},
+                             {VL, VectorLength});
+
+      VL = B.CreateTrunc(VL, I32Ty);
+    }
+  }
+
+  // Manifest the memory operation stride values as actual `llvm::Value`s
+  SAR.manifestAll(B);
+
+  // Pre-process the arguments first to replace any placeholders with their
+  // proper vector values, and convert pointer return arguments to vector of
+  // pointers where required.
+  {
+    // Created lazily, and shared by all the pointer return arguments.
+    Value *idxVector = nullptr;
+    for (const auto &TargetArg : VU.arguments()) {
+      if (auto *const Placeholder = TargetArg.Placeholder) {
+        auto &info = packets[Placeholder];
+        info.vector = TargetArg.NewArg;
+        info.numInstances = 1;
+      } else if (TargetArg.PointerRetPointeeTy &&
+                 PAR.needsPacketization(TargetArg.NewArg)) {
+        if (!idxVector) {
+          idxVector = multi_llvm::createIndexSequence(
+              B, VectorType::get(B.getInt32Ty(), SimdWidth), SimdWidth,
+              "index.vec");
+        }
+
+        // CA-3943 this implementation looks unlikely to be correct, but for
+        // now we just maintain the original behaviour, until we have a better
+        // idea of what is going on or whether any of this is still needed.
+        // This case will never be encountered during kernel vectorization.
+        auto *const Arg = TargetArg.NewArg;
+        auto *const EleTy = TargetArg.PointerRetPointeeTy;
+        auto &info = packets[Arg];
+        info.vector = B.CreateGEP(EleTy, Arg, idxVector);
+        info.numInstances = 1;
+      }
+    }
+  }
+
+  // Build an ordered list of the instructions to packetize, in depth first
+  // order so that we don't have to recurse too much. We build the list first
+  // because packetization of calls can produce loops, which messes up our
+  // iteration over the basic blocks of the function.
+  std::vector<Instruction *> ordered;
+  for (auto *BB : depth_first(&F)) {
+    for (auto &I : *BB) {
+      if (PAR.needsPacketization(&I)) {
+        ordered.push_back(&I);
+      }
+    }
+  }
+
+  for (auto *const I : ordered) {
+    if (!packetize(I)) {
+      emitVeczRemarkMissed(&F, I, "Could not packetize");
+      VECZ_FAIL();
+    }
+  }
+
+  // Packetize remaining phi nodes until they have all been packetized.
+  // Packetizing one phi node may involve the packetization of another node.
+  // Some nodes might need to be instantiated instead of being packetized, but
+  // we are handling this here because the instantiation pass is not run as a
+  // standalone pass.
+  // Note: pendingPhis *may* change as we progress through this loop, by
+  // calling packetize(Incoming). Therefore we can't cache the vector size when
+  // setting up the loop.
+  for (unsigned i = 0; i < pendingPhis.size(); i++) {
+    PHINode *Phi = pendingPhis[i];
+    auto &info = packets[Phi];
+    assert(info.numInstances > 0 && "A PHI pending packetization has no stub");
+    if (info.numInstances == 1) {
+      // A single vector PHI stands in for the scalar one.
+      auto *NewPhi = cast<PHINode>(info.vector);
+      for (unsigned In = 0; In < Phi->getNumIncomingValues(); ++In) {
+        Value *Incoming = Phi->getIncomingValue(In);
+        BasicBlock *BB = Phi->getIncomingBlock(In);
+        Value *VecIncoming = packetize(Incoming).getAsValue();
+        VECZ_FAIL_IF(!VecIncoming);
+        NewPhi->addIncoming(VecIncoming, BB);
+      }
+    } else {
+      // The PHI is represented by a packet of several PHIs; each of them takes
+      // the matching element of the incoming value's packet.
+      const auto PhiPacket = info.getRange(packetData);
+      for (unsigned In = 0; In < Phi->getNumIncomingValues(); ++In) {
+        Value *Incoming = Phi->getIncomingValue(In);
+        BasicBlock *BB = Phi->getIncomingBlock(In);
+        auto PackIncoming = packetize(Incoming).getAsPacket(PhiPacket.size());
+        for (unsigned j = 0; j < PhiPacket.size(); ++j) {
+          auto *NewPhi = cast<PHINode>(PhiPacket.at(j));
+          auto *VecIncoming = PackIncoming.at(j);
+          VECZ_FAIL_IF(!NewPhi);
+          VECZ_FAIL_IF(!VecIncoming);
+          NewPhi->addIncoming(VecIncoming, BB);
+        }
+      }
+    }
+    IC.deleteInstructionLater(Phi);
+  }
+
+  // Walk the entry block and fix up its allocas. `insertPt` tracks the first
+  // instruction following the current run of allocas, which is where any new
+  // non-alloca instruction gets inserted.
+  auto *insertPt = &*EntryBB.begin();
+  for (auto &I : EntryBB) {
+    auto *const alloca = dyn_cast<AllocaInst>(&I);
+    if (!alloca) {
+      insertPt = I.getNextNonDebugInstruction();
+      continue;
+    }
+
+    while (isa<AllocaInst>(insertPt)) {
+      insertPt = insertPt->getNextNonDebugInstruction();
+    }
+
+    // It's possible for some uses of the alloca to be packetized and others
+    // not. For instance, where we have a store to a constant address, since
+    // the execution order of work items is undefined, the data operand need
+    // not be packetized, and we can end up with uses of the scalar alloca
+    // still present in the vector function. In such a case we can replace it
+    // with the first element of the packetized alloca.
+    if (auto res = getPacketized(alloca)) {
+      SmallVector<Value *, 16> vals;
+      res.getPacketValues(vals);
+      if (vals.empty()) {
+        // It is a broadcast value, so we don't need to do anything.
+        continue;
+      }
+      auto *element0 = vals.front();
+
+      if (!isa<AllocaInst>(element0)) {
+        assert(isa<GetElementPtrInst>(element0) && "vecz: expected GEP");
+        auto *const GEP = cast<GetElementPtrInst>(element0);
+        // If the alloca was packetized, it will be indexed by a GEP.
+        // We only need the original, un-indexed pointer.
+        alloca->replaceAllUsesWith(GEP->getPointerOperand());
+        continue;
+      }
+
+      if (element0->getType()->isVectorTy()) {
+        B.SetInsertPoint(insertPt);
+        element0 = B.CreateExtractElement(element0, B.getInt32(0));
+      }
+      alloca->replaceAllUsesWith(element0);
+      continue;
+    }
+
+    // We have to widen allocas if they are varying, regardless of the result
+    // of the packetization analysis, because they need enough storage for all
+    // lanes, even though they are only accessed through a scalar pointer.
+    // We do this last, otherwise it messes with the stride analysis etc.
+    // Only non-instantiated allocas should be left by now.
+    if (!UVR.isVarying(alloca)) {
+      continue;
+    }
+    // Array allocas need to be instantiated.
+    assert(!alloca->isArrayAllocation() &&
+           "vecz: unexpected array alloca; should have been instantiated");
+
+    B.SetInsertPoint(alloca);
+    auto *const dataTy = alloca->getAllocatedType();
+    if (dataTy->isVectorTy() || VectorType::isValidElementType(dataTy)) {
+      // We can vectorize or vector widen this type.
+      auto *const newAlloca =
+          B.CreateAlloca(getWideType(getPaddedType(dataTy), SimdWidth));
+      newAlloca->setAlignment(alloca->getAlign());
+      newAlloca->takeName(alloca);
+
+      // Absorb other bitcasts (e.g. i8* for lifetime intrinsics, or bitcasts
+      // back to vector type for contiguous loads/stores)
+      bool needCast = false;
+      auto *const newTy = newAlloca->getType();
+      for (Use &U : alloca->uses()) {
+        auto *const user = dyn_cast<BitCastInst>(U.getUser());
+        if (!user) {
+          // Some user other than a bitcast still needs the scalar pointer.
+          needCast = true;
+          continue;
+        }
+
+        auto *const dstTy = user->getType();
+        if (dstTy == newTy) {
+          // Bitcasts totally redundant
+          user->replaceAllUsesWith(newAlloca);
+        } else {
+          // Bitcast into different bitcast
+          B.SetInsertPoint(user);
+          user->replaceAllUsesWith(B.CreateBitCast(newAlloca, user->getType()));
+        }
+        IC.deleteInstructionLater(cast<Instruction>(user));
+      }
+
+      if (needCast) {
+        // Insert the bitcast after all the allocas
+        B.SetInsertPoint(insertPt);
+        auto *const scalarPtr =
+            B.CreatePointerCast(newAlloca, alloca->getType());
+        alloca->replaceAllUsesWith(scalarPtr);
+      }
+    } else {
+      // We couldn't vectorize the type, so create an array instead.
+      VECZ_FAIL_IF(SimdWidth.isScalable());
+      unsigned const fixedWidth = SimdWidth.getFixedValue();
+
+      AllocaInst *const wideAlloca =
+          B.CreateAlloca(dataTy, getSizeInt(B, fixedWidth), alloca->getName());
+      auto align = alloca->getAlign();
+
+      // Make sure the alloca has an alignment at least as wide as any of the
+      // packetized loads or stores using it. The users are visited through a
+      // worklist which looks through bitcasts and GEPs.
+      SmallVector<Instruction *, 8> users;
+      for (Use &U : alloca->uses()) {
+        users.push_back(cast<Instruction>(U.getUser()));
+      }
+      while (!users.empty()) {
+        auto *const user = users.pop_back_val();
+        if (isa<BitCastInst>(user) || isa<GetElementPtrInst>(user)) {
+          for (Use &U : user->uses()) {
+            users.push_back(cast<Instruction>(U.getUser()));
+          }
+        } else if (auto memop = MemOp::get(user)) {
+          auto const memAlign = memop->getAlignment();
+          if (memAlign > align.value()) {
+            align = Align(memAlign);
+          }
+        }
+      }
+
+      wideAlloca->setAlignment(align);
+      wideAlloca->takeName(alloca);
+
+      // It's just a direct replacement.
+      alloca->replaceAllUsesWith(wideAlloca);
+    }
+
+    // Note that we don't assign the widened allocas a packet, because they
+    // are not really being packetized. The problem is, a packetized alloca
+    // would be expected to be a vector of pointers to scalars, not a scalar
+    // pointer to a vector. Only instantiation can create such a packet.
+    IC.deleteInstructionLater(alloca);
+  }
+
+  compiler::utils::NameMangler Mangler(&F.getContext());
+
+  // Handle get_sub_group_size specially (i.e., not in BuiltinInfo) since
+  // inlining it requires extra vectorization context, such as the
+  // vectorization width and choices; this inlining is too tightly coupled to
+  // the vectorizer context to exist in a generic sense.
+  for (auto &BB : F) {
+    for (auto &I : BB) {
+      CallInst *CI = dyn_cast<CallInst>(&I);
+      if (!CI) {
+        continue;
+      }
+
+      auto *const Callee = CI->getCalledFunction();
+      if (Callee &&
+          "get_sub_group_size" == Mangler.demangleName(Callee->getName())) {
+        // The replacement is the vector length when vector predicating, else
+        // the vectorization factor (multiplied by vscale if it is scalable).
+        auto *const replacement = [this](CallInst *CI) -> Value * {
+          if (VL) {
+            return VL;
+          }
+
+          auto *const I32Ty = Type::getInt32Ty(F.getContext());
+          auto *const VFVal =
+              ConstantInt::get(I32Ty, SimdWidth.getKnownMinValue());
+          if (!SimdWidth.isScalable()) {
+            return VFVal;
+          } else {
+            IRBuilder<> B(CI);
+            return B.CreateVScale(VFVal);
+          }
+        }(CI);
+        CI->replaceAllUsesWith(replacement);
+        IC.deleteInstructionLater(CI);
+      }
+    }
+  }
+
+  IC.deleteInstructions();
+  return true;
+}
+
+// Clean-up hook run when packetization failed.
+void Packetizer::Impl::onFailure() {
+  // The stub PHIs created for pending PHI nodes may still be invalid, in that
+  // they have no incoming operands. For simplicity every stub is removed with
+  // IRCleanup::deleteInstructionNow: the failed vectorized function will be
+  // removed anyway.
+  for (auto *Phi : pendingPhis) {
+    auto &info = packets[Phi];
+    assert(info.numInstances > 0 && "A PHI pending packetization has no stub");
+    if (info.numInstances != 1) {
+      // The stub is a packet of several PHIs: remove each of them.
+      const auto PhiPacket = info.getRange(packetData);
+      for (unsigned j = 0; j != PhiPacket.size(); ++j) {
+        IRCleanup::deleteInstructionNow(cast<PHINode>(PhiPacket.at(j)));
+      }
+      continue;
+    }
+    // The stub is a single vector PHI.
+    IRCleanup::deleteInstructionNow(cast<PHINode>(info.vector));
+  }
+}
+
+// Packetizes a single value by forwarding to the implementation class.
+Packetizer::Result Packetizer::packetize(Value *V) {
+  // The downcast is safe because only instances of Impl are ever created,
+  // never instances of the base class.
+  auto *const Self = static_cast<Impl *>(this);
+  return Self->packetize(V);
+}
+
+// Looks up the packet information recorded for `V` without packetizing
+// anything. The returned Result carries a null info pointer when `V` has no
+// entry in the packet map.
+Packetizer::Result Packetizer::getPacketized(Value *V) {
+  const auto It = packets.find(V);
+  if (It == packets.end()) {
+    return Packetizer::Result(*this, V, nullptr);
+  }
+  return Packetizer::Result(*this, V, &It->second);
+}
+
+// Records that `V` is represented by `width` instances, creating its entry in
+// the packet map if necessary, and returns the packet created for it.
+PacketRange Packetizer::createPacket(Value *V, unsigned width) {
+  auto &info = packets[V];
+  info.numInstances = width;
+  Result res(*this, V, &info);
+  return res.createPacket(width);
+}
+
+// Packetizes `cond` and reduces it to a single scalar value, built in front
+// of `terminator`. With `allOf` set the reduction is an AND over the lanes,
+// otherwise it is an OR. Returns nullptr when the condition could not be
+// packetized, or when vector predication is on and the condition is split
+// over more than one packet.
+Value *Packetizer::Impl::reduceBranchCond(Value *cond, Instruction *terminator,
+                                          bool allOf) {
+  // Get the branch condition at its natural packet width
+  auto conds = packetizeAndGet(cond);
+  VECZ_FAIL_IF(conds.empty());
+
+  // Branches can only take a scalar mask. The new branch condition is true
+  // only if the original condition is true for any lane (or for all lanes if
+  // the condition is used in a BOSCC block indirection.)
+  IRBuilder<> B(terminator);
+  auto const name = cond->getName();
+
+  const auto numPackets = conds.size();
+  if (VL && numPackets != 1) {
+    emitVeczRemarkMissed(&F, cond,
+                         "Can not vector-predicate packets larger than 1");
+    return nullptr;
+  }
+
+  // Reduce the packet to a single value by repeatedly folding its upper half
+  // onto its lower half.
+  for (auto half = numPackets >> 1; half != 0; half >>= 1) {
+    for (decltype(half) i = 0; i < half; ++i) {
+      Value *const lhs = conds[i];
+      Value *const rhs = conds[i + half];
+      if (allOf) {
+        conds[i] = B.CreateAnd(lhs, rhs, Twine(name, ".all_of"));
+      } else {
+        conds[i] = B.CreateOr(lhs, rhs, Twine(name, ".any_of"));
+      }
+    }
+  }
+
+  const RecurKind kind = allOf ? RecurKind::And : RecurKind::Or;
+  Value *reduced = conds.front();
+
+  // VP reduction intrinsics didn't make it into LLVM 13 so we have to make do
+  // by pre-sanitizing the input such that elements past VL get the identity
+  // value.
+  if (VL) {
+    reduced = sanitizeVPReductionInput(B, reduced, VL, kind);
+    VECZ_FAIL_IF(!reduced);
+  }
+
+  return createSimpleTargetReduction(B, &TTI, reduced, kind);
+}
+
+// Records `Vectorized` as the single-instance packetized form of `Scalar`.
+// A null `Vectorized` is reported as a missed-vectorization remark and yields
+// a Result that carries no value.
+Packetizer::Result Packetizer::Impl::assign(Value *Scalar, Value *Vectorized) {
+  if (!Vectorized) {
+    emitVeczRemarkMissed(&F, Scalar, "Failed to vectorize");
+    return Packetizer::Result(*this);
+  }
+
+  ++VeczPacketized;
+  auto &info = packets[Scalar];
+  info.vector = Vectorized;
+  info.numInstances = 1;
+  return Packetizer::Result(*this, Scalar, &info);
+}
+
+// Returns the packetized form of `V` if it is varying. A mask-varying value is
+// packetized as well, but `V` itself is returned, exactly as for any other
+// value.
+Value *Packetizer::Impl::packetizeIfVarying(Value *V) {
+  if (!UVR.isVarying(V)) {
+    if (UVR.isMaskVarying(V)) {
+      VECZ_FAIL_IF(!packetize(V));
+    }
+    return V;
+  }
+  return packetize(V).getAsValue();
+}
+
+// Packetizes a single value, dispatching on what kind of value it is:
+// already packetized values are reused, non-instructions are broadcast,
+// branches get their condition reduced to a scalar, and mask-varying
+// instructions and subgroup builtins have dedicated handling. Anything else
+// is either broadcast or handed to packetizeInstruction(), with instantiation
+// as the last resort. A Result carrying no value signals failure.
+Packetizer::Result Packetizer::Impl::packetize(Value *V) {
+  // Do not packetize the same value twice.
+  if (const auto cached = getPacketized(V)) {
+    return cached;
+  }
+  // Now check whether this value is actually packetizable.
+  if (!Ctx.targetInfo().canPacketize(V, SimdWidth)) {
+    return Packetizer::Result(*this);
+  }
+
+  auto *const Ins = dyn_cast<Instruction>(V);
+  if (!Ins) {
+    return broadcast(V);
+  }
+
+  if (auto *const Branch = dyn_cast<BranchInst>(Ins)) {
+    if (!Branch->isConditional()) {
+      return broadcast(Ins);
+    }
+
+    // varying reductions need to be packetized
+    auto *newCond = packetize(Branch->getCondition()).getAsValue();
+    if (!newCond) {
+      return Packetizer::Result(*this);
+    }
+
+    // Packetization should normally have produced a reduction to scalar.
+    // However, when Packetize Uniform is on, a uniform branch won't have
+    // a divergence reduction so it will need reducing manually here.
+    if (newCond->getType()->isVectorTy()) {
+      IRBuilder<> B(Branch);
+      const RecurKind kind = RecurKind::Or;
+      // Sanitize VP reduction inputs, if required.
+      if (VL) {
+        newCond = sanitizeVPReductionInput(B, newCond, VL, kind);
+        if (!newCond) {
+          return Packetizer::Result(*this);
+        }
+      }
+      newCond = createSimpleTargetReduction(B, &TTI, newCond, kind);
+    }
+
+    Branch->setCondition(newCond);
+    return broadcast(Ins);
+  }
+
+  if (isa<SwitchInst>(Ins)) {
+    // we can't handle varying switches
+    return Packetizer::Result(*this);
+  }
+
+  if (UVR.isMaskVarying(Ins)) {
+    // The packetization of a mask-varying value takes care of its own
+    // broadcast.
+    if (auto *const res = packetizeMaskVarying(Ins)) {
+      return broadcast(res);
+    }
+    // Fall back on instantiation if the instruction could not be packetized
+    Instantiator->instantiate(Ins);
+    return getPacketized(Ins);
+  }
+
+  if (auto *reduction = packetizeSubgroupReduction(Ins)) {
+    return broadcast(reduction);
+  }
+
+  if (auto *brdcast = packetizeSubgroupBroadcast(Ins)) {
+    return broadcast(brdcast);
+  }
+
+  // Check if we should broadcast the instruction.
+  // Broadcast uniform instructions, unless we want to packetize uniform
+  // instructions as well. We can assume that isMaskVarying is false at this
+  // point.
+  if (!UVR.isVarying(Ins) && !Choices.packetizeUniform()) {
+    // Or unless this instruction is in a loop and we want to packetize
+    // uniform instructions in loops
+    bool inLoop = false;
+    if (Choices.packetizeUniformInLoops()) {
+      const LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
+      inLoop = LI.getLoopFor(Ins->getParent()) != nullptr;
+    }
+    if (!inLoop) {
+      // Insert broadcast instructions after the instruction to broadcast
+      return broadcast(Ins);
+    }
+  }
+
+  if (const auto packetized = packetizeInstruction(Ins)) {
+    return packetized;
+  }
+  // Fall back on instantiation if the instruction could not be packetized,
+  // unless we're vector predicating.
+  if (VL) {
+    return Packetizer::Result(*this);
+  }
+  Instantiator->instantiate(Ins);
+  return getPacketized(Ins);
+}
+
+// Packetizes `v` and returns the values of the resulting packet at its
+// natural width. The returned packet is left empty if packetization failed.
+ValuePacket Packetizer::Impl::packetizeAndGet(Value *v) {
+  ValuePacket values;
+  auto packetized = packetize(v);
+  if (packetized) {
+    packetized.getPacketValues(values);
+  }
+  return values;
+}
+
+// Packetizes `v` and requests the values of the resulting packet at width
+// `w`. The returned packet is left empty if packetization failed.
+ValuePacket Packetizer::Impl::packetizeAndGet(Value *v, unsigned w) {
+  ValuePacket values;
+  auto packetized = packetize(v);
+  if (packetized) {
+    packetized.getPacketValues(w, values);
+  }
+  return values;
+}
+
+// Returns a Result for `V` bound to its entry in the packet map, creating
+// that entry if it does not exist yet.
+Packetizer::Result Packetizer::Impl::broadcast(Value *V) {
+  auto &info = packets[V];
+  return Result(*this, V, &info);
+}
+
+// Computes how many sub-vectors ("packet width") a value of type `ty` is
+// split into when it is widened by the SIMD width. `limit` caps the vector
+// width reported by the target. The result is 1 for scalable vectorization,
+// and is never larger than the SIMD width.
+unsigned Packetizer::Impl::getPacketWidthForType(Type *ty,
+                                                 unsigned limit) const {
+  if (SimdWidth.isScalable()) {
+    return 1;
+  }
+  const unsigned simdWidth = SimdWidth.getFixedValue();
+
+  // Widest vector, in elements, that we are willing to create. Zero means
+  // "not known yet".
+  unsigned maxWidth = 0;
+  if (!Choices.targetIndependentPacketization()) {
+    const unsigned targetWidth =
+        Ctx.targetInfo().getVectorWidthForType(TTI, *ty->getScalarType());
+    maxWidth = std::min(limit, targetWidth);
+
+    // We let the target return a value wider than the SIMD Width, but not
+    // narrower.
+    if (maxWidth != 0) {
+      maxWidth = std::max(simdWidth, maxWidth);
+    }
+  }
+  if (maxWidth == 0) {
+    maxWidth = std::max(simdWidth, 16u);
+  }
+
+  // Number of elements in the fully widened type.
+  unsigned elts = 1;
+  if (ty->isVectorTy()) {
+    auto *vecTy = cast<FixedVectorType>(ty);
+    elts = vecTy->getNumElements();
+  }
+  const unsigned fullWidth = elts * simdWidth;
+
+  if (fullWidth <= maxWidth) {
+    return 1;
+  }
+
+  // Round up to the next power of two..
+  // This should only be needed if the type was a 3-vector..
+  // Note that we don't really expect huge values here, over 16 is still
+  // currently not officially supported, over 256 would be astonishing,
+  // and over 65536 would be inconceivable, so we don't bother to >> 16.
+  unsigned width = fullWidth / maxWidth - 1;
+  for (unsigned shift = 1; shift <= 8; shift <<= 1) {
+    width |= width >> shift;
+  }
+
+  // Can't have a packet wider than the simdWidth..
+  return std::min(width + 1, simdWidth);
+}
+
+// Packetizes one instruction by dispatching on its opcode to the matching
+// packetize* helper. If that produces nothing, vectorizeInstruction() is
+// tried as a fallback. The returned Result has no packet information when
+// neither approach worked.
+Packetizer::Result Packetizer::Impl::packetizeInstruction(Instruction *Ins) {
+  ValuePacket results;
+
+  // Figure out what kind of instruction it is and try to vectorize it.
+  switch (Ins->getOpcode()) {
+    case Instruction::PHI:
+      results = packetizePHI(cast<PHINode>(Ins));
+      break;
+    case Instruction::GetElementPtr:
+      results = packetizeGEP(cast<GetElementPtrInst>(Ins));
+      break;
+    case Instruction::Load:
+      results = packetizeLoad(cast<LoadInst>(Ins));
+      break;
+    case Instruction::Store:
+      results = packetizeStore(cast<StoreInst>(Ins));
+      break;
+    case Instruction::Call:
+      results = packetizeCall(cast<CallInst>(Ins));
+      break;
+    case Instruction::ICmp:
+      results = packetizeICmp(cast<ICmpInst>(Ins));
+      break;
+    case Instruction::FCmp:
+      results = packetizeFCmp(cast<FCmpInst>(Ins));
+      break;
+    case Instruction::Select:
+      results = packetizeSelect(cast<SelectInst>(Ins));
+      break;
+    case Instruction::InsertElement:
+      results = packetizeInsertElement(cast<InsertElementInst>(Ins));
+      break;
+    case Instruction::ExtractElement:
+      results = packetizeExtractElement(cast<ExtractElementInst>(Ins));
+      break;
+    case Instruction::ShuffleVector:
+      results = packetizeShuffleVector(cast<ShuffleVectorInst>(Ins));
+      break;
+    case Instruction::Freeze:
+      results = packetizeFreeze(cast<FreezeInst>(Ins));
+      break;
+
+    default:
+      // Opcodes without a dedicated case are handled by instruction class.
+      if (Ins->isBinaryOp()) {
+        results = packetizeBinaryOp(cast<BinaryOperator>(Ins));
+      } else if (Ins->isCast()) {
+        results = packetizeCast(cast<CastInst>(Ins));
+      } else if (Ins->isUnaryOp()) {
+        results = packetizeUnaryOp(cast<UnaryOperator>(Ins));
+      }
+      break;
+  }
+
+  // No helper produced anything: try the generic fallback.
+  if (results.empty()) {
+    if (auto *vec = vectorizeInstruction(Ins)) {
+      return assign(Ins, vec);
+    }
+    return Packetizer::Result(*this, Ins, nullptr);
+  }
+
+  const auto packetWidth = results.size();
+  if (packetWidth != 1) {
+    // Several sub-vectors: copy them into a packet created for `Ins`.
+    IC.deleteInstructionLater(Ins);
+    auto &info = packets[Ins];
+    auto res = Result(*this, Ins, &info);
+    auto P = res.createPacket(packetWidth);
+    for (unsigned i = 0; i < packetWidth; ++i) {
+      P[i] = results[i];
+      // TODO CA-3376: vectorize the debug instructions
+    }
+    info.numInstances = packetWidth;
+    ++VeczPacketized;
+    return res;
+  }
+
+  Value *vec = results.front();
+  if (vec != Ins) {
+    // Only delete if the vectorized value is different from the scalar.
+    IC.deleteInstructionLater(Ins);
+  }
+  vectorizeDI(Ins, vec);
+  return assign(Ins, vec);
+}
+
+// Replaces a call to a subgroup reduction builtin (all/any/reduce_*) with a
+// reduction over the packetized operand. Returns the scalar result, which has
+// already replaced all uses of the call, or nullptr if `I` is not such a call
+// or could not be handled; the caller then tries its other strategies.
+Value *Packetizer::Impl::packetizeSubgroupReduction(Instruction *I) {
+  auto *const CI = dyn_cast<CallInst>(I);
+  if (!CI || !CI->getCalledFunction()) {
+    return nullptr;
+  }
+  compiler::utils::BuiltinInfo &BI = Ctx.builtins();
+  Function *callee = CI->getCalledFunction();
+
+  auto const Builtin = BI.analyzeBuiltin(*callee);
+  auto const subgroupReduceKind = BI.getBuiltinSubgroupReductionKind(Builtin);
+
+  if (subgroupReduceKind == compiler::utils::eBuiltinSubgroupReduceInvalid) {
+    return nullptr;
+  }
+
+  SmallVector<Value *, 16> opPackets;
+  IRBuilder<> B(buildAfter(CI, F));
+  auto *const argTy = CI->getArgOperand(0)->getType();
+  auto packetWidth = getPacketWidthForType(argTy);
+
+  // Don't vector predicate if we have to split into multiple packets. The
+  // introduction of instructions to manage the splitting up of our VL into N
+  // chunks is likely to kill performance anyway.
+  if (VL && packetWidth != 1) {
+    emitVeczRemarkMissed(&F, CI,
+                         "Can not vector-predicate packets larger than 1");
+    return nullptr;
+  }
+
+  auto op = packetize(CI->getArgOperand(0));
+  // Without a packetized operand there is nothing to reduce: bail out before
+  // any of its values are requested.
+  VECZ_FAIL_IF(!op);
+
+  bool isSignedInt = false;
+  bool const isFP = argTy->isFPOrFPVectorTy();
+  bool const isBool = argTy->isIntOrIntVectorTy(/*BitWidth*/ 1);
+  (void)isBool;
+
+  // Determine whether this is a signed or unsigned integer min/max reduction.
+  if (!isFP &&
+      (subgroupReduceKind == compiler::utils::eBuiltinSubgroupReduceMax ||
+       subgroupReduceKind == compiler::utils::eBuiltinSubgroupReduceMin)) {
+    // Demangle the function name to get the type qualifiers.
+    SmallVector<Type *, 2> Types;
+    SmallVector<compiler::utils::TypeQualifiers, 2> Quals;
+    compiler::utils::NameMangler Mangler(&F.getContext());
+    if (!Mangler.demangleName(callee->getName(), Types, Quals).empty()) {
+      assert(!Quals.empty());
+      auto &Qual = Quals[0];
+      while (!isSignedInt && Qual.getCount()) {
+        isSignedInt |= Qual.pop_front() == compiler::utils::eTypeQualSignedInt;
+      }
+    }
+  }
+
+  // Map the builtin onto the recurrence kind to reduce with.
+  RecurKind recurK;
+  switch (subgroupReduceKind) {
+    default:
+      emitVeczRemarkMissed(&F, nullptr, "Unimplemented subgroup reduction");
+      VECZ_FAIL();
+      break;
+    case compiler::utils::eBuiltinSubgroupAll:
+      recurK = RecurKind::And;
+      break;
+    case compiler::utils::eBuiltinSubgroupAny:
+      recurK = RecurKind::Or;
+      break;
+    case compiler::utils::eBuiltinSubgroupReduceAdd:
+      recurK = isFP ? RecurKind::FAdd : RecurKind::Add;
+      break;
+    case compiler::utils::eBuiltinSubgroupReduceMin:
+      recurK = isFP ? RecurKind::FMin
+                    : (isSignedInt ? RecurKind::SMin : RecurKind::UMin);
+      break;
+    case compiler::utils::eBuiltinSubgroupReduceMax:
+      recurK = isFP ? RecurKind::FMax
+                    : (isSignedInt ? RecurKind::SMax : RecurKind::UMax);
+      break;
+    // SPV_KHR_uniform_group_instructions
+    case compiler::utils::eBuiltinSubgroupReduceMul:
+      recurK = isFP ? RecurKind::FMul : RecurKind::Mul;
+      break;
+    case compiler::utils::eBuiltinSubgroupReduceAnd:
+      assert(!isFP && "Invalid subgroup reduction");
+      recurK = RecurKind::And;
+      break;
+    case compiler::utils::eBuiltinSubgroupReduceOr:
+      assert(!isFP && "Invalid subgroup reduction");
+      recurK = RecurKind::Or;
+      break;
+    case compiler::utils::eBuiltinSubgroupReduceXor:
+      assert(!isFP && "Invalid subgroup reduction");
+      recurK = RecurKind::Xor;
+      break;
+    case compiler::utils::eBuiltinSubgroupReduceLogicalAnd:
+      assert(isBool && "Invalid subgroup reduction");
+      recurK = RecurKind::And;
+      break;
+    case compiler::utils::eBuiltinSubgroupReduceLogicalOr:
+      assert(isBool && "Invalid subgroup reduction");
+      recurK = RecurKind::Or;
+      break;
+    case compiler::utils::eBuiltinSubgroupReduceLogicalXor:
+      assert(isBool && "Invalid subgroup reduction");
+      recurK = RecurKind::Xor;
+      break;
+  }
+
+  // Reduce the packet values in-place.
+  // TODO: can we add 'reassoc' to the floating-point reductions to absolve
+  // them of ordering? See CA-3969.
+  op.getPacketValues(packetWidth, opPackets);
+  // Everything below indexes the first `packetWidth` entries of the packet,
+  // so fail rather than read past the end if fewer values were provided.
+  VECZ_FAIL_IF(opPackets.size() < packetWidth);
+
+  // Any/All reductions are defined as reducing over the i32 value being
+  // "evaluated to non-zero", so emit the required comparisons.
+  if (subgroupReduceKind == compiler::utils::eBuiltinSubgroupAll ||
+      subgroupReduceKind == compiler::utils::eBuiltinSubgroupAny) {
+    for (unsigned i = 0, e = opPackets.size(); i != e; i++) {
+      opPackets[i] = B.CreateICmpNE(
+          opPackets[i], ConstantInt::get(opPackets[i]->getType(), 0));
+    }
+  }
+
+  // When in VP mode, pre-sanitize the reduction input (before VP reduction
+  // intrinsics, introduced in LLVM 14)
+  if (VL) {
+    assert(opPackets.size() == 1 &&
+           "Should have bailed if dealing with more than one packet");
+    Value *&val = opPackets.front();
+    val = sanitizeVPReductionInput(B, val, VL, recurK);
+    if (!val) {
+      emitVeczRemarkMissed(&F, CI,
+                           "Can not vector-predicate subgroup reduction");
+      return nullptr;
+    }
+  }
+
+  // According to the OpenCL Spec, we are allowed to rearrange the operation
+  // order of a subgroup reduction any way we like (even though floating point
+  // addition is not associative so might not produce exactly the same result),
+  // so we reduce to a single vector first, if necessary, and then do a single
+  // reduction to scalar. This is more efficient than doing multiple reductions
+  // to scalar and then BinOp'ing multiple scalars together.
+  //
+  // Reduce to a single vector.
+  while ((packetWidth >>= 1)) {
+    for (decltype(packetWidth) i = 0; i < packetWidth; ++i) {
+      Value *const lhs = opPackets[i];
+      Value *const rhs = opPackets[i + packetWidth];
+      opPackets[i] = multi_llvm::createBinOpForRecurKind(B, lhs, rhs, recurK);
+    }
+  }
+
+  // Reduce to a scalar.
+  Value *v = createSimpleTargetReduction(B, &TTI, opPackets.front(), recurK);
+
+  IC.deleteInstructionLater(CI);
+
+  // For any/all reductions we have to get back from an i1 to the original
+  // type.
+  if (subgroupReduceKind == compiler::utils::eBuiltinSubgroupAll ||
+      subgroupReduceKind == compiler::utils::eBuiltinSubgroupAny) {
+    v = B.CreateSExt(v, CI->getType());
+  }
+
+  CI->replaceAllUsesWith(v);
+
+  return v;
+}
+
+// Replaces a call to the subgroup broadcast builtin with an extraction of the
+// requested lane from the packetized operand. Returns the extracted value,
+// which has already replaced all uses of the call, or nullptr if `I` is not
+// such a call or could not be handled.
+Value *Packetizer::Impl::packetizeSubgroupBroadcast(Instruction *I) {
+  auto *const CI = dyn_cast<CallInst>(I);
+  if (!CI || !CI->getCalledFunction()) {
+    return nullptr;
+  }
+  compiler::utils::BuiltinInfo &BI = Ctx.builtins();
+  Function *callee = CI->getCalledFunction();
+  auto const Builtin = BI.analyzeBuiltin(*callee);
+
+  if (!Builtin.isValid() || Builtin.ID != BI.getSubgroupBroadcastBuiltin()) {
+    return nullptr;
+  }
+
+  IRBuilder<> B(buildAfter(CI, F));
+
+  auto *const idx = CI->getArgOperand(1);
+
+  auto op = packetize(CI->getArgOperand(0));
+  PACK_FAIL_IF(!op);
+  Value *val = nullptr;
+  // Optimize the constant fixed-vector case, where we can choose the exact
+  // subpacket to extract from directly.
+  auto *const constIdx = dyn_cast<ConstantInt>(idx);
+  if (constIdx && !SimdWidth.isScalable()) {
+    ValuePacket opPackets;
+    op.getPacketValues(opPackets);
+    const uint64_t numPackets = opPackets.size();
+    const uint64_t fixedWidth = SimdWidth.getFixedValue();
+    // Number of lanes held by each sub-packet. The packet list can be empty
+    // (the entry-block alloca handling treats that as a broadcast value), so
+    // guard the division.
+    const uint64_t subvecSize = numPackets != 0 ? fixedWidth / numPackets : 0;
+    const uint64_t idxVal = constIdx->getZExtValue();
+    // Only take the shortcut when the lane index selects an existing
+    // sub-packet; anything else is left to the generic path below rather
+    // than indexing past the end of the packet list.
+    if (subvecSize != 0 && idxVal / subvecSize < numPackets) {
+      // If individual elements are scalar (through instantiation, say) then
+      // just use the desired packet directly.
+      if (subvecSize == 1) {
+        val = opPackets[idxVal];
+      } else {
+        // Else extract from the correct packet, adjusting the index as we go.
+        val = B.CreateExtractElement(
+            opPackets[idxVal / subvecSize],
+            ConstantInt::get(idx->getType(), idxVal % subvecSize));
+      }
+    }
+  }
+
+  if (!val) {
+    // Generic path: extract the lane from the operand as a single vector.
+    Value *const vec = op.getAsValue();
+    VECZ_FAIL_IF(!vec);
+    val = B.CreateExtractElement(vec, idx);
+  }
+
+  IC.deleteInstructionLater(CI);
+
+  CI->replaceAllUsesWith(val);
+
+  return val;
+}
+
+// Handles instructions whose only varying input is a mask. Two cases are
+// recognised:
+//  - masked memory operations, whose mask is replaced by an "any lane active"
+//    reduction of the packetized mask (the operation itself stays scalar and
+//    `I` is returned);
+//  - the internal divergence_any/divergence_all builtins, which are replaced
+//    by a reduction of their operand (the reduction is returned).
+// Returns nullptr when `I` could not be handled here, in which case the caller
+// falls back on instantiation.
+Value *Packetizer::Impl::packetizeMaskVarying(Instruction *I) {
+  if (auto memop = MemOp::get(I)) {
+    auto *const mask = memop->getMaskOperand();
+    if (!mask) {
+      return nullptr;
+    }
+
+    Value *vecMask = nullptr;
+
+    MemOpDesc desc = memop->getDesc();
+    bool isVector = desc.getDataType()->isVectorTy();
+
+    // If only the mask operand is varying, we do not need to vectorize the
+    // MemOp itself, only reduce the mask with an OR.
+    if (!isVector) {
+      vecMask = packetize(mask).getAsValue();
+    } else {
+      // If it's a vector, and the mask is splatted, then packetize the
+      // splatted value, reduce it, then re-splat it as a vector. Otherwise, we
+      // send it to the instantiator.
+      auto *const splatVal = getSplatValue(mask);
+      if (!splatVal) {
+        return nullptr;
+      }
+      vecMask = packetize(splatVal).getAsValue();
+    }
+
+    VECZ_FAIL_IF(!vecMask);
+
+    // Build the reduction right after the vector to reduce register
+    // pressure, and to make it easier for CSE/GVN to combine them if there
+    // are multiple uses of the same value (we could cache these?)
+    auto *maskInst = dyn_cast<Instruction>(vecMask);
+    IRBuilder<> B(maskInst ? buildAfter(maskInst, F) : I);
+
+    // Sanitize any vector-predicated inputs.
+    if (VL) {
+      vecMask = sanitizeVPReductionInput(B, vecMask, VL, RecurKind::Or);
+      VECZ_FAIL_IF(!vecMask);
+    }
+
+    Value *anyOfMask =
+        createSimpleTargetReduction(B, &TTI, vecMask, RecurKind::Or);
+    anyOfMask->setName("any_of_mask");
+
+    if (isVector) {
+      anyOfMask = B.CreateVectorSplat(
+          multi_llvm::getVectorElementCount(desc.getDataType()), anyOfMask);
+    }
+
+    memop->setMaskOperand(anyOfMask);
+
+    return I;
+  }
+
+  auto *const CI = dyn_cast<CallInst>(I);
+  if (!CI) {
+    return nullptr;
+  }
+
+  Function *callee = CI->getCalledFunction();
+
+  // Handle internal builtins.
+  if (Ctx.isInternalBuiltin(callee)) {
+    // Handle lane mask reductions.
+    // We treat these as Mask Varying instructions since their single argument
+    // represents a lane mask and their result is a reduction over all lanes,
+    // which means it is effectively uniform. We don't actually have to check
+    // that they are mask varying, because that is the only possible uniformity
+    // value of these function calls.
+    compiler::utils::Lexer L(callee->getName());
+    VECZ_FAIL_IF(!L.Consume(VectorizationContext::InternalBuiltinPrefix));
+    // `allOf` selects the AND ("all lanes") reduction over the OR ("any
+    // lane") one; it is what reduceBranchCond() takes as its last argument.
+    bool allOf = false;
+    bool divergence = false;
+    if (L.Consume("divergence_any")) {
+      divergence = true;
+    } else if (L.Consume("divergence_all")) {
+      allOf = true;
+      divergence = true;
+    }
+
+    if (divergence) {
+      auto *const reduce = reduceBranchCond(CI->getOperand(0), CI, allOf);
+      // The reduction can fail (returning null). In that case the call must
+      // be left untouched: it is neither replaced by a null value nor queued
+      // for deletion, so that the caller's fallback still sees it intact.
+      if (!reduce) {
+        return nullptr;
+      }
+      IC.deleteInstructionLater(CI);
+      CI->replaceAllUsesWith(reduce);
+      return reduce;
+    }
+  }
+
+  return nullptr;
+}
+
+ValuePacket Packetizer::Impl::packetizePHI(PHINode *Phi) {
+  ValuePacket placeholders;
+  Type *const origTy = Phi->getType();
+
+  Type *phiTy = origTy;
+  unsigned numPhis = 0;
+  const bool widenable =
+      origTy->isVectorTy() || VectorType::isValidElementType(origTy);
+  if (!widenable) {
+    // Not widenable: one original-typed PHI per lane spares the instantiator
+    // the job, but that needs a fixed (non-scalable) lane count.
+    if (SimdWidth.isScalable()) {
+      return placeholders;
+    }
+    numPhis = SimdWidth.getFixedValue();
+  } else {
+    numPhis = getPacketWidthForType(origTy);
+    phiTy = getWideType(origTy, SimdWidth.divideCoefficientBy(numPhis));
+  }
+
+  // Create the placeholders now and defer the incoming values (see
+  // pendingPhis): this breaks use/def cycles so users can packetize first.
+  IRBuilder<> B(buildAfter(Phi, F, true));
+  const auto numIncoming = Phi->getNumIncomingValues();
+  const auto phiName = Phi->getName();
+  for (unsigned lane = 0; lane != numPhis; ++lane) {
+    placeholders.push_back(B.CreatePHI(phiTy, numIncoming, phiName));
+  }
+  pendingPhis.push_back(Phi);
+  return placeholders;
+}
+
+ValuePacket Packetizer::Impl::packetizeCall(CallInst *CI) {
+  ValuePacket results;
+
+  Function *Callee = CI->getCalledFunction();
+  if (!Callee) {
+    return results;
+  }
+
+  IRBuilder<> B(CI);
+  // Handle LLVM intrinsics.
+  if (Callee->isIntrinsic()) {
+    auto IntrID = Intrinsic::ID(Callee->getIntrinsicID());
+    if (IntrID == llvm::Intrinsic::lifetime_end ||
+        IntrID == llvm::Intrinsic::lifetime_start) {
+      auto *ptr = CI->getOperand(1);
+      if (auto *const bcast = dyn_cast<BitCastInst>(ptr)) {
+        ptr = bcast->getOperand(0);
+      }
+
+      if (auto *const alloca = dyn_cast<AllocaInst>(ptr)) {
+        if (!needsInstantiation(Ctx, *alloca)) {
+          // For an alloca we can widen, just scale the size (-1 if scalable).
+          llvm::TypeSize const allocSize =
+              Ctx.dataLayout()->getTypeAllocSize(alloca->getAllocatedType());
+          auto const lifeSize = allocSize.isScalable() || SimdWidth.isScalable()
+                                    ? -1
+                                    : multi_llvm::getKnownMinValue(allocSize) *
+                                          SimdWidth.getKnownMinValue();
+          CI->setOperand(
+              0, ConstantInt::get(CI->getOperand(0)->getType(), lifeSize));
+          results.push_back(CI);
+        }
+      }
+      return results;
+    }
+
+    auto const Props = Ctx.builtins().analyzeBuiltin(*Callee).properties;
+    if (!(Props & compiler::utils::eBuiltinPropertyVectorEquivalent)) {
+      return results;
+    }
+
+    // Only floating point intrinsics need this to be set to CI.
+    // The IR Builder helpfully crashes when we pass it unnecessarily.
+    Instruction *fastMathSrc = isa<FPMathOperator>(CI) ? CI : nullptr;
+
+    // Using a native array with hard coded size for simplicity; make sure
+    // to increase this if intrinsics with more operands are to be handled.
+    size_t constexpr maxOperands = 3;
+    // Some LLVM intrinsics, such as abs, take an argument that is a constant
+    // and is defined as llvm_i1_ty, which means that operand can't be
+    // packetized. As a temporary solution, this vector lets each case set
+    // independently which operands must be skipped.
+    // CA-3696
+    SmallVector<bool, maxOperands> operandsToSkip(maxOperands, false);
+    switch (IntrID) {
+      case Intrinsic::abs:
+      case Intrinsic::ctlz:
+      case Intrinsic::cttz:
+        // def abs [LLVMMatchType<0>, llvm_i1_ty]
+        operandsToSkip = {false, true};
+        break;
+      default:
+        break;
+    }
+
+    auto *const ty = CI->getType();
+    auto packetWidth = getPacketWidthForType(ty);
+    auto *const wideTy =
+        getWideType(ty, SimdWidth.divideCoefficientBy(packetWidth));
+
+    auto const n = CI->arg_size();
+    assert(n <= maxOperands && "Intrinsic has too many arguments");
+
+    SmallVector<Value *, 16> opPackets[maxOperands];
+    for (auto i = decltype(n){0}; i < n; ++i) {
+      auto *argOperand = CI->getArgOperand(i);
+
+      if (operandsToSkip[i]) {
+        assert(isa<Constant>(argOperand) && "Operand should be a Constant");
+        opPackets[i].resize(packetWidth);
+        std::fill(opPackets[i].begin(), opPackets[i].end(), argOperand);
+      } else {
+        auto op = packetize(CI->getArgOperand(i));
+        if (!op) {
+          return results;
+        }
+        op.getPacketValues(packetWidth, opPackets[i]);
+        PACK_FAIL_IF(opPackets[i].empty());
+      }
+    }
+
+    auto const name = CI->getName();
+    Type *const types[1] = {wideTy};  // because LLVM 13 is a numpty
+    Value *opVals[maxOperands];
+    for (unsigned i = 0; i < packetWidth; ++i) {
+      for (unsigned j = 0; j < n; ++j) {
+        opVals[j] = opPackets[j][i];
+      }
+
+      results.push_back(B.CreateIntrinsic(
+          IntrID, types, ArrayRef<Value *>(opVals, n), fastMathSrc, name));
+    }
+    return results;
+  }
+
+  // Handle internal builtins.
+  if (Ctx.isInternalBuiltin(Callee)) {
+    // Handle masked loads and stores.
+    if (auto MaskedOp = MemOp::get(CI, MemOpAccessKind::Masked)) {
+      if (MaskedOp->isMaskedMemOp()) {
+        return packetizeMemOp(*MaskedOp);
+      }
+    }
+  }
+
+  auto const Builtin = Ctx.builtins().analyzeBuiltin(*Callee);
+  auto const subgroupScanKind =
+      Ctx.builtins().getBuiltinSubgroupScanKind(Builtin);
+
+  // Handle subgroup scans, which defer to internal builtins.
+  if (Builtin.isValid() &&
+      subgroupScanKind != compiler::utils::eBuiltinSubgroupScanInvalid) {
+    return packetizeSubgroupScan(CI, subgroupScanKind);
+  }
+
+  // Handle external builtins.
+  auto const Props = Builtin.properties;
+  if (Props & compiler::utils::eBuiltinPropertyExecutionFlow ||
+      Props & compiler::utils::eBuiltinPropertyWorkItem) {
+    return results;
+  }
+
+  auto *const ty = CI->getType();
+
+  // Our builtins are only defined up to a width of 16 so will not vectorize
+  // above that. Inspect the operands as well in case they are wider, for
+  // instance a convert from float to i8, we would rather widen according to
+  // the float and not the i8 so we don't create too wide a vector of floats.
+  auto packetWidth = getPacketWidthForType(ty, 16u);
+  for (const auto &op : CI->data_ops()) {
+    auto *const vTy = op.get()->getType();
+    if (!vTy->isPointerTy()) {
+      packetWidth = std::max(packetWidth, getPacketWidthForType(vTy, 16u));
+    }
+  }
+
+  auto factor = SimdWidth.divideCoefficientBy(packetWidth);
+
+  // Try to find a unit for this builtin.
+  auto CalleeVec = Ctx.getVectorizedFunction(*Callee, factor);
+  if (!CalleeVec) {
+    // No vectorization strategy found. Fall back on Instantiation.
+    return results;
+  }
+
+  // Packetize call operands.
+  // But not if they have pointer return arguments (handled in vectorizeCall).
+  for (const auto &TargetArg : CalleeVec.args) {
+    PACK_FAIL_IF(TargetArg.kind == VectorizationResult::Arg::POINTER_RETURN);
+  }
+
+  auto *const vecTy = dyn_cast<FixedVectorType>(ty);
+  unsigned const scalarWidth = vecTy ? vecTy->getNumElements() : 1;
+  unsigned i = 0;
+  SmallVector<SmallVector<Value *, 16>, 4> opPackets;
+  for (const auto &TargetArg : CalleeVec.args) {
+    opPackets.emplace_back();
+
+    // Handle scalar arguments.
+    Value *scalarOp = CI->getArgOperand(i);
+    if (TargetArg.kind == VectorizationResult::Arg::SCALAR) {
+      for (unsigned j = 0; j < packetWidth; ++j) {
+        opPackets.back().push_back(scalarOp);
+      }
+      i++;
+      continue;
+    }
+
+    // Vectorize scalar operands.
+    auto op = packetize(CI->getOperand(i));
+    PACK_FAIL_IF(!op);
+
+    // The vector versions of some builtins can have a mix of vector and scalar
+    // arguments. We need to widen any scalar arguments by sub-splatting.
+    auto *const scalarTy = scalarOp->getType();
+    auto *const argTy = TargetArg.type;
+    if (vecTy && !scalarTy->isVectorTy()) {
+      PACK_FAIL_IF(argTy->getScalarType() != scalarTy);
+
+      op.getPacketValues(packetWidth, opPackets.back());
+      PACK_FAIL_IF(opPackets.back().empty());
+
+      // Widen the scalar operands.
+      PACK_FAIL_IF(
+          !createSubSplats(Ctx.targetInfo(), B, opPackets.back(), scalarWidth));
+    } else {
+      // Make sure the type is correct for vector arguments.
+      Type *wideTy = getWideType(scalarOp->getType(), factor);
+      PACK_FAIL_IF(argTy != wideTy);
+
+      op.getPacketValues(packetWidth, opPackets.back());
+      PACK_FAIL_IF(opPackets.back().empty());
+    }
+    i++;
+  }
+
+  auto numArgs = opPackets.size();
+  SmallVector<Value *, 4> opVals;
+  opVals.resize(numArgs);
+
+  auto *vecFn = CalleeVec.get();
+  for (unsigned i = 0; i < packetWidth; ++i) {
+    for (unsigned j = 0; j < numArgs; ++j) {
+      opVals[j] = opPackets[j][i];
+    }
+
+    CallInst *newCI = B.CreateCall(vecFn, opVals, CI->getName());
+    newCI->setCallingConv(CI->getCallingConv());
+    results.push_back(newCI);
+  }
+
+  return results;
+}
+
+ValuePacket Packetizer::Impl::packetizeSubgroupScan(
+    CallInst *CI, compiler::utils::BuiltinSubgroupScanKind subgroupScanKind) {
+  // Lower a sub-group scan builtin call to the matching vecz internal builtin.
+  ValuePacket results;
+
+  Function *callee = CI->getCalledFunction();
+  if (!callee) {
+    return results;
+  }
+
+  const StringRef fnName = callee->getName();
+  compiler::utils::NameMangler mangler(&CI->getContext());
+
+  // Operands and types for the internal builtin; fail if packetizing fails.
+  SmallVector<Value *, 2> Ops = {packetize(CI->getArgOperand(0)).getAsValue()};
+  PACK_FAIL_IF(!Ops[0]);
+  SmallVector<Type *, 2> Tys = {getWideType(CI->getType(), SimdWidth)};
+
+  bool isInclusive = true;
+  StringRef op = "add";
+  // min/max scans are prefixed with s/u if they are signed/unsigned integer
+  // operations. 'None' means the sign of the operands is unimportant, such as
+  // for floating-point operations or integer addition.
+  multi_llvm::Optional<bool> optIsSignedInt;
+  bool isInt = Tys[0]->isIntOrIntVectorTy();
+
+  // Determine whether this is a signed or unsigned integer min/max scan.
+  const auto isSignedArg0 = [isInt, fnName, &mangler]() -> Optional<bool> {
+    if (!isInt) {
+      return multi_llvm::None;
+    }
+    // Demangle the function name to get the type qualifiers.
+    SmallVector<Type *, 2> types;
+    SmallVector<compiler::utils::TypeQualifiers, 2> quals;
+    if (mangler.demangleName(fnName, types, quals).empty()) {
+      return false;
+    }
+    assert(!quals.empty());
+    auto &qual = quals[0];
+    bool isSignedInt = false;
+    while (!isSignedInt && qual.getCount()) {
+      isSignedInt |= qual.pop_front() == compiler::utils::eTypeQualSignedInt;
+    }
+    return isSignedInt;
+  };
+
+  switch (subgroupScanKind) {
+    default:
+      assert(false && "Impossible subgroup scan kind");
+      return results;
+    case compiler::utils::eBuiltinSubgroupScanAddExcl:
+      isInclusive = false;
+      LLVM_FALLTHROUGH;
+    case compiler::utils::eBuiltinSubgroupScanAddIncl:
+      op = "add";
+      break;
+    case compiler::utils::eBuiltinSubgroupScanMinExcl:
+      isInclusive = false;
+      LLVM_FALLTHROUGH;
+    case compiler::utils::eBuiltinSubgroupScanMinIncl:
+      op = "min";
+      optIsSignedInt = isSignedArg0();
+      break;
+    case compiler::utils::eBuiltinSubgroupScanMaxExcl:
+      isInclusive = false;
+      LLVM_FALLTHROUGH;
+    case compiler::utils::eBuiltinSubgroupScanMaxIncl:
+      op = "max";
+      optIsSignedInt = isSignedArg0();
+      break;
+      /// Scans provided by SPV_KHR_uniform_group_instructions.
+    case compiler::utils::eBuiltinSubgroupScanMulExcl:
+      isInclusive = false;
+      LLVM_FALLTHROUGH;
+    case compiler::utils::eBuiltinSubgroupScanMulIncl:
+      op = "mul";
+      break;
+    case compiler::utils::eBuiltinSubgroupScanAndExcl:
+    case compiler::utils::eBuiltinSubgroupScanLogicalAndExcl:
+      isInclusive = false;
+      LLVM_FALLTHROUGH;
+    case compiler::utils::eBuiltinSubgroupScanAndIncl:
+    case compiler::utils::eBuiltinSubgroupScanLogicalAndIncl:
+      // Since we only support logical and on boolean types, we can re-use the
+      // regular bitwise and builtin.
+      op = "and";
+      break;
+    case compiler::utils::eBuiltinSubgroupScanOrExcl:
+    case compiler::utils::eBuiltinSubgroupScanLogicalOrExcl:
+      isInclusive = false;
+      LLVM_FALLTHROUGH;
+    case compiler::utils::eBuiltinSubgroupScanOrIncl:
+    case compiler::utils::eBuiltinSubgroupScanLogicalOrIncl:
+      // Since we only support logical or on boolean types, we can re-use the
+      // regular bitwise or builtin.
+      op = "or";
+      break;
+    case compiler::utils::eBuiltinSubgroupScanXorExcl:
+    case compiler::utils::eBuiltinSubgroupScanLogicalXorExcl:
+      isInclusive = false;
+      LLVM_FALLTHROUGH;
+    case compiler::utils::eBuiltinSubgroupScanXorIncl:
+    case compiler::utils::eBuiltinSubgroupScanLogicalXorIncl:
+      // Since we only support logical xor on boolean types, we can re-use the
+      // regular bitwise xor builtin.
+      op = "xor";
+      break;
+  }
+
+  // Now create the mangled builtin function name.
+  SmallString<128> NameSV;
+  raw_svector_ostream O(NameSV);
+
+  // VP saves us nothing for fixed vectors, so only use it when scalable.
+  bool const VP = VL && SimdWidth.isScalable();
+
+  O << VectorizationContext::InternalBuiltinPrefix << "sub_group_scan_"
+    << (isInclusive ? "inclusive" : "exclusive") << "_"
+    << (optIsSignedInt.has_value() ? (*optIsSignedInt ? "s" : "u") : "") << op
+    << (VP ? "_vp" : "") << "_";
+
+  compiler::utils::TypeQualifiers VecQuals(
+      compiler::utils::eTypeQualNone, optIsSignedInt == true
+                                          ? compiler::utils::eTypeQualSignedInt
+                                          : compiler::utils::eTypeQualNone);
+  if (!mangler.mangleType(O, Tys[0], VecQuals)) {
+    return results;
+  }
+
+  // VP operations mangle the extra i32 VL operand.
+  if (VP) {
+    Ops.push_back(VL);
+    Tys.push_back(VL->getType());
+    compiler::utils::TypeQualifiers VLQuals(compiler::utils::eTypeQualNone);
+    if (!mangler.mangleType(O, Tys[1], VLQuals)) {
+      return results;
+    }
+  }
+
+  auto *SubgroupScanFnTy = FunctionType::get(Tys[0], Tys, /*isVarArg*/ false);
+  auto *const SubgroupFn =
+      Ctx.getOrCreateInternalBuiltin(NameSV, SubgroupScanFnTy);
+
+  IRBuilder<> B(CI);
+
+  auto *c = B.CreateCall(SubgroupFn, Ops);
+
+  results.push_back(c);
+  return results;
+}
+
+Value *Packetizer::Impl::vectorizeInstruction(Instruction *Ins) {
+  // Anything flagged for instantiation is not vectorized here.
+  if (needsInstantiation(Ctx, *Ins)) {
+    return nullptr;
+  }
+
+  // Dispatch on the concrete instruction class. Only calls, returns, allocas
+  // and extractvalues have a dedicated handler; every other instruction
+  // leaves `Vectorized` null, which is what the caller then receives.
+  Value *Vectorized = nullptr;
+  if (auto *const Call = dyn_cast<CallInst>(Ins)) {
+    Vectorized = vectorizeCall(Call);
+  } else if (auto *const Ret = dyn_cast<ReturnInst>(Ins)) {
+    Vectorized = vectorizeReturn(Ret);
+  } else if (auto *const Alloca = dyn_cast<AllocaInst>(Ins)) {
+    Vectorized = vectorizeAlloca(Alloca);
+  } else if (auto *const Extract = dyn_cast<ExtractValueInst>(Ins)) {
+    Vectorized = vectorizeExtractValue(Extract);
+  }
+
+  // A handler may itself decline by returning null; only update the debug
+  // info when a replacement value was actually produced.
+  if (!Vectorized) {
+    return nullptr;
+  }
+
+  vectorizeDI(Ins, Vectorized);
+  return Vectorized;
+}
+
+ValuePacket Packetizer::Impl::packetizeLoad(LoadInst *Load) {
+  MemOp LoadOp = *MemOp::get(Load);
+  return packetizeMemOp(LoadOp);
+}
+
+ValuePacket Packetizer::Impl::packetizeStore(StoreInst *Store) {
+  MemOp StoreOp = *MemOp::get(Store);
+  return packetizeMemOp(StoreOp);
+}
+
+ValuePacket Packetizer::Impl::packetizeMemOp(MemOp &op) {
+  ValuePacket results;
+
+  // Determine the stride of the memory operation.
+  // Vectorize the pointer if there is no valid stride.
+  Value *ptr = op.getPointerOperand();
+  assert(ptr && "Could not get pointer operand of Op");
+
+  auto *const dataTy = op.getDataType();
+  if (!dataTy->isVectorTy() && !VectorType::isValidElementType(dataTy)) {
+    return results;
+  }
+
+  if (auto *const vecTy = dyn_cast<FixedVectorType>(dataTy)) {
+    auto const elts = vecTy->getNumElements();
+    if (elts & (elts - 1)) {
+      // If the data type is a vector with number of elements not a power of 2,
+      // it is not safe to widen, because of alignment padding. Reject it and
+      // let instantiation deal with it..
+      return results;
+    }
+  }
+
+  auto const packetWidth = getPacketWidthForType(dataTy);
+  // Note: NOT const because LLVM 11 can't multiply a const ElementCount.
+  auto factor = SimdWidth.divideCoefficientBy(packetWidth);
+
+  if (factor.isScalar()) {
+    // not actually widening anything here, so just instantiate it
+    return results;
+  }
+
+  if (VL && packetWidth != 1) {
+    emitVeczRemarkMissed(&F, op.getInstr(),
+                         "Can not vector-predicate packets larger than 1");
+    return {};
+  }
+
+  IRBuilder<> B(op.getInstr());
+  IC.deleteInstructionLater(op.getInstr());
+
+  auto const name = op.getInstr()->getName();
+  auto *const mask = op.getMaskOperand();
+  auto *const data = op.getDataOperand();
+  auto *const stride = SAR.buildMemoryStride(B, ptr, dataTy);
+
+  auto *const vecPtrTy = dyn_cast<FixedVectorType>(dataTy);
+
+  // If we're vector-predicating a vector access, scale the vector length up by
+  // the original number of vector elements.
+  // Adjust the MemOp so that it is VL-predicated, if we must.
+  Value *EVL = VL;
+  if (vecPtrTy && VL) {
+    EVL = B.CreateMul(VL, B.getInt32(vecPtrTy->getNumElements()));
+  }
+
+  auto *const constantStrideVal = dyn_cast_or_null<ConstantInt>(stride);
+  int constantStride =
+      constantStrideVal ? constantStrideVal->getSExtValue() : 0;
+  bool validStride = stride && (!constantStrideVal || constantStride != 0);
+  if (!validStride) {
+    if (dataTy->isPointerTy()) {
+      // We do not have vector-of-pointers support in Vecz builtins, hence
+      // instantiate instead of packetize
+      return results;
+    }
+
+    bool const scalable = SimdWidth.isScalable();
+    if (!mask && dataTy->isVectorTy() && !scalable) {
+      // unmasked scatter/gathers are better off instantiated..
+      return results;
+    }
+
+    // Assume that individual masked loads/stores are more efficient when the
+    // type does not fit into a native integer. Since instantiation is never an
+    // option for scalable vectors, we do not consider this option.
+    if (vecPtrTy && !scalable &&
+        !Ctx.dataLayout()->fitsInLegalInteger(
+            dataTy->getPrimitiveSizeInBits())) {
+      return results;
+    }
+
+    auto ptrPacket = packetizeAndGet(ptr, packetWidth);
+    PACK_FAIL_IF(ptrPacket.empty());
+
+    auto *const scalarTy = dataTy->getScalarType();
+    auto *const scalarPtrTy =
+        cast<PointerType>(ptr->getType()->getScalarType());
+
+    // When scattering/gathering with a vector type, we can cast it to a
+    // vector of pointers to the scalar type and widen it into a vector
+    // of pointers to all the individual elements, and then gather/scatter
+    // using that.
+    if (vecPtrTy && scalable) {
+      // Scalable requires special codegen that avoids shuffles, but the idea
+      // is the same.
+      // We only handle the one packet right now.
+      PACK_FAIL_IF(ptrPacket.size() != 1);
+      auto const scalarWidth = vecPtrTy->getNumElements();
+      Value *&vecPtr = ptrPacket.front();
+      ElementCount const wideEC = factor * scalarWidth;
+      // Sub-splat the pointers such that we get, e.g.:
+      // <A, B> -> x4 -> <A, A, A, A, B, B, B, B>
+      bool const success =
+          createSubSplats(Ctx.targetInfo(), B, ptrPacket, scalarWidth);
+      PACK_FAIL_IF(!success);
+      auto *const newPtrTy = llvm::VectorType::get(
+          PointerType::get(scalarTy, scalarPtrTy->getPointerAddressSpace()),
+          wideEC);
+      // Bitcast the above sub-splat to purely scalar pointers
+      vecPtr = B.CreateBitCast(vecPtr, newPtrTy);
+      // Create an index sequence to start the offseting process
+      Value *idxVector = multi_llvm::createIndexSequence(
+          B, VectorType::get(B.getInt32Ty(), wideEC), wideEC, "index.vec");
+      PACK_FAIL_IF(!idxVector);
+      // Modulo the indices 0,1,2,.. with the original vector type, producing,
+      // e.g., for the above: <0,1,2,3,0,1,2,3>
+      auto *const subVecEltsSplat =
+          B.CreateVectorSplat(wideEC, B.getInt32(scalarWidth));
+      idxVector = B.CreateURem(idxVector, subVecEltsSplat);
+      // Index into the pointer vector with the offsets, e.g.,:
+      // <A, A+1, A+2, A+3, B, B+1, B+2, B+3>
+      vecPtr = B.CreateInBoundsGEP(scalarTy, vecPtr, idxVector);
+    } else if (vecPtrTy && !scalable) {
+      auto const simdWidth = factor.getFixedValue();
+      auto const scalarWidth = vecPtrTy->getNumElements();
+
+      // Build shuffle mask to widen the pointer
+      SmallVector<Constant *, 16> indices;
+      SmallVector<int, 16> widenMask;
+      for (size_t i = 0; i < simdWidth; ++i) {
+        for (size_t j = 0; j < scalarWidth; ++j) {
+          widenMask.push_back(i);
+          indices.push_back(B.getInt32(j));
+        }
+      }
+
+      auto *const newPtrTy = FixedVectorType::get(
+          PointerType::get(scalarTy, scalarPtrTy->getPointerAddressSpace()),
+          simdWidth);
+
+      auto *const idxVector = ConstantVector::get(indices);
+      auto *const undef = UndefValue::get(newPtrTy);
+      for (auto &vecPtr : ptrPacket) {
+        vecPtr = B.CreateBitCast(vecPtr, newPtrTy);
+        vecPtr = B.CreateShuffleVector(vecPtr, undef, widenMask);
+        vecPtr = B.CreateInBoundsGEP(scalarTy, vecPtr, idxVector);
+      }
+    }
+
+    ValuePacket dataPacket;
+    if (data) {
+      auto src = packetize(data);
+      PACK_FAIL_IF(!src);
+      src.getPacketValues(packetWidth, dataPacket);
+      PACK_FAIL_IF(dataPacket.empty());
+    } else {
+      dataPacket.resize(packetWidth, nullptr);
+    }
+
+    // Vector-predicated scatters/gathers are always masked.
+    ValuePacket maskPacket(packetWidth, nullptr);
+    auto *const packetVecTy = getWideType(dataTy, factor);
+    if (mask || EVL) {
+      if (!mask) {
+        // If there's no mask then just splat a trivial one.
+        auto *const trueMask = multi_llvm::createAllTrueMask(
+            B, multi_llvm::getVectorElementCount(packetVecTy));
+        std::fill(maskPacket.begin(), maskPacket.end(), trueMask);
+      } else {
+        maskPacket = packetizeAndGet(mask, packetWidth);
+        PACK_FAIL_IF(maskPacket.empty());
+      }
+    }
+
+    // Gather load or scatter store.
+    for (unsigned i = 0; i != packetWidth; ++i) {
+      if (op.isLoad()) {
+        results.push_back(createGather(Ctx, packetVecTy, ptrPacket[i],
+                                       maskPacket[i], EVL, op.getAlignment(),
+                                       name, op.getInstr()));
+      } else {
+        results.push_back(createScatter(Ctx, dataPacket[i], ptrPacket[i],
+                                        maskPacket[i], EVL, op.getAlignment(),
+                                        name, op.getInstr()));
+      }
+    }
+  } else if (!constantStrideVal || constantStride != 1) {
+    if (dataTy->isPointerTy() || dataTy->isVectorTy()) {
+      // No builtins for memops on pointer types, and we can't do interleaved
+      // memops over vector types.
+      return results;
+    }
+
+    ValuePacket dataPacket;
+    if (data) {
+      auto src = packetize(data);
+      PACK_FAIL_IF(!src);
+      src.getPacketValues(packetWidth, dataPacket);
+      PACK_FAIL_IF(dataPacket.empty());
+    } else {
+      dataPacket.resize(packetWidth, nullptr);
+    }
+
+    Value *packetStride = nullptr;
+    if (packetWidth != 1) {
+      // Make sure the stride is at least as wide as a GEP index needs to be
+      unsigned const indexBits = Ctx.dataLayout()->getIndexSizeInBits(
+          ptr->getType()->getPointerAddressSpace());
+      unsigned strideBits = stride->getType()->getPrimitiveSizeInBits();
+      auto *const elementStride =
+          (indexBits > strideBits)
+              ? B.CreateSExt(stride, B.getIntNTy((strideBits = indexBits)))
+              : stride;
+
+      auto const simdWidth = factor.getFixedValue();
+      packetStride =
+          B.CreateMul(elementStride, B.getIntN(strideBits, simdWidth),
+                      Twine(name, ".packet_stride"));
+    }
+
+    // Vector-predicated interleaved operations are always masked.
+    ValuePacket maskPacket(packetWidth, nullptr);
+    auto *const packetVecTy = getWideType(dataTy, factor);
+    if (mask || EVL) {
+      if (!mask) {
+        // If there's no mask then just splat a trivial one.
+        auto *const trueMask = multi_llvm::createAllTrueMask(
+            B, multi_llvm::getVectorElementCount(packetVecTy));
+        std::fill(maskPacket.begin(), maskPacket.end(), trueMask);
+      } else {
+        maskPacket = packetizeAndGet(mask, packetWidth);
+        PACK_FAIL_IF(maskPacket.empty());
+      }
+    }
+
+    // Interleaved (strided) load or store.
+    for (unsigned i = 0; i != packetWidth; ++i) {
+      if (i != 0) {
+        ptr = B.CreateInBoundsGEP(dataTy, ptr, packetStride,
+                                  Twine(name, ".incr"));
+      }
+      if (op.isLoad()) {
+        results.push_back(
+            createInterleavedLoad(Ctx, packetVecTy, ptr, stride, maskPacket[i],
+                                  EVL, op.getAlignment(), name, op.getInstr()));
+      } else {
+        results.push_back(createInterleavedStore(
+            Ctx, dataPacket[i], ptr, stride, maskPacket[i], EVL,
+            op.getAlignment(), name, op.getInstr()));
+      }
+    }
+  } else {
+    ValuePacket dataPacket;
+    if (data) {
+      auto src = packetize(data);
+      PACK_FAIL_IF(!src);
+      src.getPacketValues(packetWidth, dataPacket);
+      PACK_FAIL_IF(dataPacket.empty());
+    } else if (mask) {
+      // don't need the data packet for unmasked loads
+      dataPacket.resize(packetWidth, nullptr);
+    }
+
+    Value *packetStride = nullptr;
+    if (packetWidth != 1) {
+      auto const simdWidth = factor.getFixedValue();
+      packetStride = B.getInt64(simdWidth);
+    }
+
+    // The MemOp's original alignment may be overaligned; after vectorization
+    // it can't exceed the size of the data type, so clamp it. NOTE(review):
+    // only the unmasked store below uses the clamped value - confirm intended.
+    unsigned alignment = op.getAlignment();
+    unsigned sizeInBits =
+        multi_llvm::getKnownMinValue(dataTy->getPrimitiveSizeInBits());
+    alignment = std::min(alignment, std::max(sizeInBits, 8u) / 8u);
+
+    // Regular load or store.
+    if (mask) {
+      bool isVectorMask = mask->getType()->isVectorTy();
+      auto maskPacket = packetizeAndGet(mask, packetWidth);
+      PACK_FAIL_IF(maskPacket.empty());
+
+      // If the original instruction was a vector but the mask was a scalar i1,
+      // we have to broadcast the mask elements across the data vector.
+      auto *const vecTy = dyn_cast<FixedVectorType>(dataTy);
+      if (vecTy && !isVectorMask) {
+        PACK_FAIL_IF(factor.isScalable());
+        unsigned simdWidth = factor.getFixedValue();
+        unsigned scalarWidth = vecTy->getNumElements();
+
+        // Build shuffle mask to widen the vector condition.
+        SmallVector<int, 16> widenMask;
+        for (size_t i = 0; i < simdWidth; ++i) {
+          for (size_t j = 0; j < scalarWidth; ++j) {
+            widenMask.push_back(i);
+          }
+        }
+
+        auto *const undef = UndefValue::get(maskPacket.front()->getType());
+        for (auto &vecMask : maskPacket) {
+          vecMask = createOptimalShuffle(B, vecMask, undef, widenMask);
+        }
+      }
+
+      for (unsigned i = 0; i != packetWidth; ++i) {
+        if (i != 0) {
+          ptr = B.CreateInBoundsGEP(dataTy, ptr, packetStride,
+                                    Twine(name, ".incr"));
+        }
+        if (op.isLoad()) {
+          results.push_back(createMaskedLoad(
+              Ctx, getWideType(dataTy, factor), ptr, maskPacket[i], EVL,
+              op.getAlignment(), name, op.getInstr()));
+        } else {
+          results.push_back(
+              createMaskedStore(Ctx, dataPacket[i], ptr, maskPacket[i], EVL,
+                                op.getAlignment(), name, op.getInstr()));
+        }
+      }
+    } else {
+      TargetInfo &VTI = Ctx.targetInfo();
+      if (op.isLoad()) {
+        auto *const one = B.getInt64(1);
+        for (unsigned i = 0; i != packetWidth; ++i) {
+          if (i != 0) {
+            ptr = B.CreateInBoundsGEP(dataTy, ptr, packetStride,
+                                      Twine(name, ".incr"));
+          }
+          results.push_back(
+              VTI.createLoad(B, getWideType(dataTy, factor), ptr, one, EVL));
+        }
+      } else {
+        auto *const one = B.getInt64(1);
+        for (unsigned i = 0; i != packetWidth; ++i) {
+          if (i != 0) {
+            ptr = B.CreateInBoundsGEP(dataTy, ptr, packetStride,
+                                      Twine(name, ".incr"));
+          }
+          results.push_back(
+              VTI.createStore(B, dataPacket[i], ptr, one, alignment, EVL));
+        }
+      }
+    }
+  }
+
+  // Transfer attributes from an old call instruction to a new one.
+  if (CallInst *oldCI = op.getCall()) {
+    for (auto *r : results) {
+      if (CallInst *newCI = dyn_cast_or_null<CallInst>(r)) {
+        newCI->setCallingConv(oldCI->getCallingConv());
+      }
+    }
+  }
+  return results;
+}
+
+void Packetizer::Impl::vectorizeDI(Instruction *Scalar, Value *Packet) {
+  // For every llvm.dbg.value describing `Scalar`, emits a matching one that
+  // describes `Packet`, after retyping the variable as a vector of SimdWidth
+  // elements. Nothing is done if no metadata refers to `Scalar`.
+  auto *const LAM = LocalAsMetadata::getIfExists(Scalar);
+  if (!LAM) {
+    return;
+  }
+
+  auto *const MDV = MetadataAsValue::getIfExists(Scalar->getContext(), LAM);
+  if (!MDV) {
+    return;
+  }
+
+  DIBuilder DIB(*Scalar->getModule(), false);
+
+  // Find all the debug value intrinsics attached to scalar instruction
+  for (User *U : MDV->users()) {
+    DbgValueInst *const DVI = dyn_cast<DbgValueInst>(U);
+    if (!DVI) {
+      continue;
+    }
+
+    DILocalVariable *const DILocal = DVI->getVariable();
+    DIType *LocalType = dyn_cast_or_null<DIType>(DILocal->getType());
+
+    // Vector types need to be of an integral base type, so peel derived and
+    // composite wrappers until a basic type, or nothing, is left.
+    while (LocalType && !isa<DIBasicType>(LocalType)) {
+      if (DIDerivedType *DerivedType = dyn_cast<DIDerivedType>(LocalType)) {
+        LocalType = dyn_cast_or_null<DIType>(DerivedType->getBaseType());
+      } else if (DICompositeType *CompositeType =
+                     dyn_cast<DICompositeType>(LocalType)) {
+        auto baseType = CompositeType->getBaseType();
+        LocalType = dyn_cast_or_null<DIType>(baseType);
+      } else {
+        // Error case: no other derived classes of DIType are handled here,
+        // though more might be added to LLVM in the future. Clear the type
+        // so the variable is skipped instead of being wrapped in a vector.
+        LocalType = nullptr;
+      }
+    }
+
+    // Type is something complex like a struct which we can't handle
+    if (!LocalType) {
+      continue;
+    }
+
+    // A scalable width has no fixed value to size the vector type with.
+    if (SimdWidth.isScalable()) {
+      continue;
+    }
+    // Create a new DI vector type with simd width
+    const unsigned int Width = SimdWidth.getFixedValue();
+    Metadata *const Subscript = DIB.getOrCreateSubrange(0, Width);
+    DINodeArray SubscriptArray = DIB.getOrCreateArray(Subscript);
+
+    const uint64_t Size = LocalType->getSizeInBits() * Width;
+    const uint64_t Align = LocalType->getAlignInBits() * Width;
+
+    DICompositeType *const VectorType =
+        DIB.createVectorType(Size, Align, LocalType, SubscriptArray);
+
+    // Replace DILocalVariable type with our new vectorized type
+    DILocal->replaceOperandWith(3, VectorType);
+
+    // New packetized instruction will point to the base of our vector type
+    auto DIExpr = DIB.createExpression();
+
+    // Create llvm.dbg.value() intrinsic for packetized instruction,
+    // but can't insert it before a phi node.
+    if (isa<PHINode>(Scalar)) {
+      DIB.insertDbgValueIntrinsic(Packet, DILocal, DIExpr, DVI->getDebugLoc(),
+                                  Scalar->getParent()->getFirstNonPHI());
+    } else {
+      DIB.insertDbgValueIntrinsic(Packet, DILocal, DIExpr, DVI->getDebugLoc(),
+                                  Scalar);
+    }
+    // Delete the old scalar debug intrinsic since the instruction
+    // it references will also be deleted.
+    IC.deleteInstructionLater(DVI);
+  }
+}
+
+ValuePacket Packetizer::Impl::packetizeGEP(GetElementPtrInst *GEP) {
+  // Builds one GEP per packet element from the packetized base pointer and
+  // indices. An empty packet is returned, without touching the GEP, when its
+  // base is an alloca or when it already yields a vector of pointers (such
+  // vector GEPs are instantiated, for safety).
+  ValuePacket results;
+  Value *const base = GEP->getPointerOperand();
+  if (isa<AllocaInst>(base) || isa<VectorType>(GEP->getType())) {
+    return results;
+  }
+
+  // The packet width comes from the source element type rather than the
+  // pointer type itself, because that is the width the memops will be using.
+  assert(multi_llvm::isOpaqueOrPointeeTypeMatches(
+      cast<PointerType>(base->getType()), GEP->getSourceElementType()));
+  auto *const srcEltTy = GEP->getSourceElementType();
+  auto const width = getPacketWidthForType(srcEltTy);
+
+  // It is legal to create a GEP with a mixture of scalar and vector operands.
+  // If any operand is a vector, the result will be a vector of pointers.
+  ValuePacket basePacket;
+  if (!UVR.isVarying(base)) {
+    // A uniform base pointer is shared by every packet element.
+    for (unsigned lane = 0; lane != width; ++lane) {
+      basePacket.push_back(base);
+    }
+  } else {
+    // A varying base pointer is packetized like any other value.
+    auto packetizedBase = packetize(base);
+    PACK_FAIL_IF(!packetizedBase);
+    packetizedBase.getPacketValues(width, basePacket);
+    PACK_FAIL_IF(basePacket.empty());
+  }
+
+  // Packetize the GEP indices, one packet per index operand.
+  SmallVector<SmallVector<Value *, 16>, 4> indexPackets;
+  for (unsigned opIdx = 1, end = GEP->getNumIndices() + 1; opIdx != end;
+       ++opIdx) {
+    Value *const index = GEP->getOperand(opIdx);
+    auto &indexPacket = indexPackets.emplace_back();
+
+    if (isa<ConstantInt>(index)) {
+      // Constant indices are simply repeated for every packet element.
+      indexPacket.assign(width, index);
+      continue;
+    }
+
+    auto packetizedIndex = packetize(index);
+    PACK_FAIL_IF(!packetizedIndex);
+    packetizedIndex.getPacketValues(width, indexPacket);
+    PACK_FAIL_IF(indexPacket.empty());
+  }
+
+  IRBuilder<> B(GEP);
+  // The scalar GEP is superseded by the per-element GEPs built below.
+  IC.deleteInstructionLater(GEP);
+
+  auto const name = GEP->getName();
+  bool const inBounds = GEP->isInBounds();
+
+  // Gather the i-th value of every index packet and build the i-th GEP.
+  auto const numIndices = indexPackets.size();
+  SmallVector<Value *, 4> laneIndices(numIndices);
+  for (unsigned lane = 0; lane < width; ++lane) {
+    for (size_t k = 0; k != numIndices; ++k) {
+      laneIndices[k] = indexPackets[k][lane];
+    }
+
+    Value *const laneBase = basePacket[lane];
+    results.push_back(
+        inBounds ? B.CreateInBoundsGEP(srcEltTy, laneBase, laneIndices, name)
+                 : B.CreateGEP(srcEltTy, laneBase, laneIndices, name));
+  }
+  return results;
+}
+
+ValuePacket Packetizer::Impl::packetizeBinaryOp(BinaryOperator *BinOp) {
+  ValuePacket results;
+  auto const width = getPacketWidthForType(BinOp->getType());
+
+  auto lhs = packetizeAndGet(BinOp->getOperand(0), width);
+  auto rhs = packetizeAndGet(BinOp->getOperand(1), width);
+  PACK_FAIL_IF(lhs.empty() || rhs.empty());
+
+  auto const opcode = BinOp->getOpcode();
+  auto const name = BinOp->getName();
+  IRBuilder<> B(BinOp);
+  if (VL) {
+    auto *const wideTy = lhs[0]->getType();
+    // Support for VP legalization is still lacking, so a VP intrinsic is only
+    // emitted when the target reports the vector type as legal for it. This
+    // support will improve over time.
+    if (Ctx.targetInfo().isVPVectorLegal(F, wideTy)) {
+      PACK_FAIL_IF(width != 1);
+      auto const vpID = VPIntrinsic::getForOpcode(opcode);
+      PACK_FAIL_IF(vpID == Intrinsic::not_intrinsic);
+      auto *const allTrue = multi_llvm::createAllTrueMask(
+          B, multi_llvm::getVectorElementCount(wideTy));
+      // Scale the base length by the number of vector elements, where
+      // appropriate.
+      Value *EVL = VL;
+      if (auto *const origVecTy = dyn_cast<VectorType>(BinOp->getType())) {
+        auto const scale =
+            multi_llvm::getVectorElementCount(origVecTy).getKnownMinValue();
+        EVL = B.CreateMul(EVL, B.getInt32(scale));
+      }
+      auto *const vpOp = B.CreateIntrinsic(
+          vpID, {wideTy}, {lhs[0], rhs[0], allTrue, EVL});
+      vpOp->copyIRFlags(BinOp, true);
+      vpOp->copyMetadata(*BinOp);
+      results.push_back(vpOp);
+      return results;
+    }
+    // Without a predicated form, division and remainder may execute
+    // out-of-bounds elements. Since this isn't safe, bail.
+    bool const isDivOrRem =
+        opcode == BinaryOperator::UDiv || opcode == BinaryOperator::SDiv ||
+        opcode == BinaryOperator::URem || opcode == BinaryOperator::SRem;
+    PACK_FAIL_IF(isDivOrRem);
+  }
+  for (unsigned i = 0; i < width; ++i) {
+    auto *const wideOp = B.CreateBinOp(opcode, lhs[i], rhs[i], name);
+    if (auto *const binOpInst = dyn_cast<BinaryOperator>(wideOp)) {
+      binOpInst->copyIRFlags(BinOp, true);
+      binOpInst->copyMetadata(*BinOp);
+    }
+    results.push_back(wideOp);
+  }
+  return results;
+}
+
+ValuePacket Packetizer::Impl::packetizeFreeze(FreezeInst *FreezeI) {
+  // Freezes each value of the operand's packet individually.
+  ValuePacket results;
+  auto packetizedOp = packetize(FreezeI->getOperand(0));
+  PACK_FAIL_IF(!packetizedOp);
+
+  SmallVector<Value *, 16> operands;
+  packetizedOp.getPacketValues(operands);
+  PACK_FAIL_IF(operands.empty());
+
+  IRBuilder<> B(FreezeI);
+  auto const name = FreezeI->getName();
+  // The operand's packet decides how many freezes are emitted.
+  for (Value *const operand : operands) {
+    results.push_back(B.CreateFreeze(operand, name));
+  }
+  return results;
+}
+
+ValuePacket Packetizer::Impl::packetizeUnaryOp(UnaryOperator *UnOp) {
+  ValuePacket results;
+  auto opcode = UnOp->getOpcode();
+  auto packetWidth = getPacketWidthForType(UnOp->getType());
+  auto src = packetizeAndGet(UnOp->getOperand(0), packetWidth);
+  PACK_FAIL_IF(src.empty());
+
+  auto name = UnOp->getName();
+  IRBuilder<> B(UnOp);
+  for (unsigned i = 0; i < packetWidth; ++i) {
+    // CreateUnOp may hand back a folded value, so only copy flags onto a
+    Value *New = B.CreateUnOp(opcode, src[i], name);  // real UnaryOperator.
+    if (auto *const NewUnOp = dyn_cast<UnaryOperator>(New)) {
+      NewUnOp->copyIRFlags(UnOp, true);
+    }
+    results.push_back(New);
+  }
+  return results;
+}
+
+ValuePacket Packetizer::Impl::packetizeCast(CastInst *CastI) {
+  ValuePacket results;
+  auto *const destTy = CastI->getType();
+  // Use the larger of the destination and source packet widths.
+  auto const destWidth = getPacketWidthForType(destTy);
+  auto const srcWidth = getPacketWidthForType(CastI->getSrcTy());
+  auto const width = std::max(destWidth, srcWidth);
+
+  auto in = packetizeAndGet(CastI->getOperand(0), width);
+  PACK_FAIL_IF(in.empty());
+
+  auto const name = CastI->getName();
+  auto *const outTy = getWideType(destTy, SimdWidth.divideCoefficientBy(width));
+  IRBuilder<> B(CastI);
+  for (unsigned i = 0; i < width; ++i) {
+    results.push_back(B.CreateCast(CastI->getOpcode(), in[i], outTy, name));
+  }
+  return results;
+}
+
+ValuePacket Packetizer::Impl::packetizeICmp(ICmpInst *Cmp) {
+  ValuePacket results;
+  // The packet width follows the type being compared, not the result type.
+  auto const width = getPacketWidthForType(Cmp->getOperand(0)->getType());
+  auto lhs = packetizeAndGet(Cmp->getOperand(0), width);
+  auto rhs = packetizeAndGet(Cmp->getOperand(1), width);
+  PACK_FAIL_IF(lhs.empty() || rhs.empty());
+
+  auto const name = Cmp->getName();
+  auto const predicate = Cmp->getPredicate();
+  IRBuilder<> B(Cmp);
+  for (unsigned i = 0; i < width; ++i) {
+    Value *const wideCmp = B.CreateICmp(predicate, lhs[i], rhs[i], name);
+    if (auto *const cmpInst = dyn_cast<ICmpInst>(wideCmp)) {
+      cmpInst->copyIRFlags(Cmp, true);
+    }
+    results.push_back(wideCmp);
+  }
+  return results;
+}
+
+ValuePacket Packetizer::Impl::packetizeFCmp(FCmpInst *Cmp) {
+  ValuePacket results;
+  auto packetWidth = getPacketWidthForType(Cmp->getOperand(0)->getType());
+  auto LHS = packetizeAndGet(Cmp->getOperand(0), packetWidth);
+  auto RHS = packetizeAndGet(Cmp->getOperand(1), packetWidth);
+  PACK_FAIL_IF(LHS.empty() || RHS.empty());
+  auto name = Cmp->getName();
+  IRBuilder<> B(Cmp);
+  for (unsigned i = 0; i < packetWidth; ++i) {
+    // CreateFCmp may return a folded value; only a real FCmpInst gets flags.
+    Value *const New = B.CreateFCmp(Cmp->getPredicate(), LHS[i], RHS[i], name);
+    if (auto *const NewFCmp = dyn_cast<FCmpInst>(New)) {
+      NewFCmp->copyIRFlags(Cmp, true);
+    }
+    results.push_back(New);
+  }
+  return results;
+}
+
+ValuePacket Packetizer::Impl::packetizeSelect(SelectInst *Select) {
+  ValuePacket results;
+  auto *const selectTy = Select->getType();
+  // Selects can work on struct/aggregate types, but those can't be widened.
+  if (!selectTy->isVectorTy() && !VectorType::isValidElementType(selectTy)) {
+    return results;
+  }
+
+  auto const width = getPacketWidthForType(selectTy);
+  auto trueVals = packetizeAndGet(Select->getOperand(1), width);
+  auto falseVals = packetizeAndGet(Select->getOperand(2), width);
+  PACK_FAIL_IF(trueVals.empty() || falseVals.empty());
+
+  auto *const cond = Select->getOperand(0);
+  auto packetizedCond = packetize(cond);
+  PACK_FAIL_IF(!packetizedCond);
+
+  IRBuilder<> B(Select);
+  bool const condIsVector = cond->getType()->isVectorTy();
+  bool const condIsVarying = UVR.isVarying(cond);
+  SmallVector<Value *, 16> condVals;
+  if (!condIsVarying && !condIsVector) {
+    // A uniform scalar condition can be used as is by every packet element.
+    condVals.assign(width, cond);
+  } else {
+    // Varying conditions and uniform vector conditions (which get broadcast)
+    // both take their values from the packetized condition.
+    packetizedCond.getPacketValues(width, condVals);
+    PACK_FAIL_IF(condVals.empty());
+    // If the original select returns a vector, but the condition was scalar,
+    // and its packet members are widened, it has to be sub-broadcast across
+    // the lanes of the original vector.
+    if (condIsVarying && !condIsVector &&
+        condVals.front()->getType()->isVectorTy()) {
+      if (auto *const fixedTy = dyn_cast<FixedVectorType>(selectTy)) {
+        PACK_FAIL_IF(!createSubSplats(Ctx.targetInfo(), B, condVals,
+                                      fixedTy->getNumElements()));
+      }
+    }
+  }
+
+  auto const name = Select->getName();
+  for (unsigned i = 0; i < width; ++i) {
+    results.push_back(
+        B.CreateSelect(condVals[i], trueVals[i], falseVals[i], name));
+  }
+  return results;
+}
+
+Value *Packetizer::Impl::vectorizeReturn(ReturnInst *Return) {
+  // Returns the packetized operand; assumes a value-returning `ret`.
+  Value *const wideRetVal = packetize(Return->getOperand(0)).getAsValue();
+  VECZ_FAIL_IF(!wideRetVal);
+  IC.deleteInstructionLater(Return);
+  return IRBuilder<>(Return).CreateRet(wideRetVal);
+}
+
+Value *Packetizer::Impl::vectorizeCall(CallInst *CI) {
+  // Widens a call to SimdWidth lanes: directly for fma/fmuladd and work-item
+  // builtins, otherwise via a vectorized callee. Returns nullptr otherwise.
+  Function *Callee = CI->getCalledFunction();
+  VECZ_STAT_FAIL_IF(!Callee, VeczPacketizeFailCall);
+
+  IRBuilder<> B(CI);
+  // Handle LLVM intrinsics.
+  if (Callee->isIntrinsic()) {
+    Value *Result = nullptr;
+    auto IntrID = Intrinsic::ID(Callee->getIntrinsicID());
+    if (IntrID == Intrinsic::fmuladd || IntrID == Intrinsic::fma) {
+      SmallVector<Value *, 3> Ops;
+      SmallVector<Type *, 1> Tys;
+      for (unsigned i = 0; i < 3; ++i) {
+        Value *P = packetize(CI->getOperand(i)).getAsValue();
+        VECZ_FAIL_IF(!P);
+        Ops.push_back(P);
+      }
+      Tys.push_back(getWideType(CI->getType(), SimdWidth));
+      Result = B.CreateIntrinsic(IntrID, Tys, Ops, CI, CI->getName());
+    }
+    if (Result) {
+      IC.deleteInstructionLater(CI);
+      return Result;
+    }
+  }
+
+  // Handle internal builtins.
+  if (Ctx.isInternalBuiltin(Callee)) {
+    // These should have been handled by packetizeCall, if not, off to the
+    // instantiator they go...
+    if (auto MaskedOp = MemOp::get(CI, MemOpAccessKind::Masked)) {
+      if (MaskedOp->isMaskedMemOp()) {
+        return nullptr;
+      }
+    }
+  }
+
+  if (VectorizationContext::isVector(*CI)) {
+    return nullptr;
+  }
+
+  // Handle external builtins.
+  compiler::utils::BuiltinInfo &BI = Ctx.builtins();
+  auto const Builtin = BI.analyzeBuiltinCall(*CI, Dimension);
+  if (Builtin.properties & compiler::utils::eBuiltinPropertyExecutionFlow) {
+    return nullptr;
+  }
+  if (Builtin.properties & compiler::utils::eBuiltinPropertyWorkItem) {
+    // The subgroup ID is just a simple index sequence. There is no dimension
+    // to it, and we only support 1D workgroups.
+    if (Builtin.isValid() &&
+        Builtin.ID == Ctx.builtins().getSubgroupLocalIdBuiltin()) {
+      IRBuilder<> B(buildAfter(CI, F));
+      return multi_llvm::createIndexSequence(
+          B, VectorType::get(CI->getType(), SimdWidth), SimdWidth,
+          "subgroup.local.id");
+    }
+    return vectorizeWorkGroupCall(CI, Builtin);
+  }
+
+  // Try to find a unit for this builtin.
+  auto CalleeVec = Ctx.getVectorizedFunction(*Callee, SimdWidth);
+  if (!CalleeVec) {
+    // No vectorization strategy found. Fall back on Instantiation.
+    return nullptr;
+  }
+  IC.deleteInstructionLater(CI);
+
+  // Vectorize call operands.
+  unsigned i = 0;
+  AllocaInst *PointerRetAlloca = nullptr;
+  Value *PointerRetAddr = nullptr;
+  int PointerRetStride = 0;
+  SmallVector<Value *, 4> Ops;
+  for (const auto &TargetArg : CalleeVec.args) {
+    // Handle scalar arguments.
+    Value *ScalarOp = CI->getArgOperand(i);
+    Type *ScalarTy = ScalarOp->getType();
+    if (TargetArg.kind == VectorizationResult::Arg::POINTER_RETURN) {
+      // 'Pointer return' arguments that are not sequential need to be handled
+      // specially.
+      auto *const PtrTy = dyn_cast<PointerType>(ScalarOp->getType());
+      auto *const PtrEleTy = TargetArg.pointerRetPointeeTy;
+      Value *Stride = SAR.buildMemoryStride(B, ScalarOp, PtrEleTy);
+      VECZ_STAT_FAIL_IF(!Stride, VeczPacketizeFailStride);
+      bool hasConstantStride = false;
+      int64_t ConstantStride = 0;
+      if (ConstantInt *CInt = dyn_cast<ConstantInt>(Stride)) {
+        ConstantStride = CInt->getSExtValue();
+        hasConstantStride = true;
+      }
+      VECZ_STAT_FAIL_IF(!hasConstantStride || ConstantStride < 1,
+                        VeczPacketizeFailStride);
+      if (ConstantStride == 1) {
+        Ops.push_back(B.CreateBitCast(ScalarOp, TargetArg.type));
+        i++;
+        continue;
+      }
+      // Create an alloca in the function's entry block, using a dedicated
+      // builder so that `B` keeps inserting at the call. The alloca is passed
+      // instead of the original pointer; after the function call, its value is
+      // loaded and stored to the original address using an interleaved store.
+      VECZ_STAT_FAIL_IF(!PtrTy || PointerRetAddr, VeczPacketizeFailPtr);
+      BasicBlock *BB = CI->getParent();
+      VECZ_FAIL_IF(!BB);
+      Function *F = BB->getParent();
+      VECZ_FAIL_IF(!F);
+      BasicBlock &EntryBB = F->getEntryBlock();
+      IRBuilder<> EntryB(&*EntryBB.getFirstInsertionPt());
+      Type *AllocaTy = getWideType(PtrEleTy, SimdWidth);
+      PointerRetAlloca = EntryB.CreateAlloca(AllocaTy, nullptr, "ptr_ret_temp");
+      Value *NewOp = PointerRetAlloca;
+      if (PtrTy->getAddressSpace() != 0) {
+        Type *NewOpTy = PointerType::get(AllocaTy, PtrTy->getAddressSpace());
+        NewOp = EntryB.CreateAddrSpaceCast(NewOp, NewOpTy);
+      }
+      PointerRetAddr = ScalarOp;
+      PointerRetStride = ConstantStride;
+      Ops.push_back(NewOp);
+      i++;
+      continue;
+    } else if (TargetArg.kind != VectorizationResult::Arg::VECTORIZED) {
+      Ops.push_back(ScalarOp);
+      i++;
+      continue;
+    }
+
+    // Make sure the type is correct for vector arguments.
+    auto VectorTy = dyn_cast<FixedVectorType>(TargetArg.type);
+    VECZ_STAT_FAIL_IF(!VectorTy || VectorTy->getElementType() != ScalarTy,
+                      VeczPacketizeFailType);
+
+    // Vectorize scalar operands.
+    Value *VecOp = packetize(ScalarOp).getAsValue();
+    VECZ_FAIL_IF(!VecOp);
+    Ops.push_back(VecOp);
+    i++;
+  }
+
+  CallInst *NewCI = B.CreateCall(CalleeVec.get(), Ops, CI->getName());
+  NewCI->setCallingConv(CI->getCallingConv());
+  if (PointerRetAddr) {
+    // Load the 'pointer return' value from the alloca and store it to the
+    // original address using an interleaved store.
+    LoadInst *PointerRetResult =
+        B.CreateLoad(PointerRetAlloca->getAllocatedType(), PointerRetAlloca);
+    Value *Stride = getSizeInt(B, PointerRetStride);
+    auto *Store = createInterleavedStore(
+        Ctx, PointerRetResult, PointerRetAddr, Stride,
+        /*Mask*/ nullptr, /*EVL*/ nullptr, PointerRetAlloca->getAlign().value(),
+        "", &*B.GetInsertPoint());
+    if (!Store) {
+      return nullptr;
+    }
+  }
+  return NewCI;
+}
+
+Value *Packetizer::Impl::vectorizeWorkGroupCall(
+    CallInst *CI, compiler::utils::BuiltinCall const &Builtin) {
+  // Insert instructions after the call to the builtin, since they reference
+  // the result of that call.
+  IRBuilder<> B(buildAfter(CI, F));
+
+  // Broadcast the builtin's return value.
+  Value *const Splat = B.CreateVectorSplat(SimdWidth, CI);
+  auto const Uniformity = Builtin.uniformity;
+  bool const IsMaybeInstanceID =
+      Uniformity == compiler::utils::eBuiltinUniformityMaybeInstanceID;
+  if (Uniformity != compiler::utils::eBuiltinUniformityInstanceID &&
+      !IsMaybeInstanceID) {
+    // Builtins that are never uniform fail; the rest are just the broadcast.
+    if (Uniformity == compiler::utils::eBuiltinUniformityNever) {
+      VECZ_FAIL();
+    }
+    return Splat;
+  }
+
+  // Add an index sequence [0, 1, 2, ...] to the broadcast value.
+  Value *const StepVector = multi_llvm::createIndexSequence(
+      B, Splat->getType(), SimdWidth, "index.vec");
+  VECZ_FAIL_IF(!StepVector);
+  Value *const Sum = B.CreateAdd(Splat, StepVector);
+  if (!IsMaybeInstanceID) {
+    return Sum;
+  }
+
+  // The value only varies when the rank equals the vectorization dimension;
+  // for any other rank the plain broadcast is selected instead.
+  Value *Rank = CI->getArgOperand(0);
+  // If the Rank is varying, it needs to be packetized as well.
+  if (UVR.isVarying(Rank)) {
+    Rank = packetize(Rank).getAsValue();
+    VECZ_FAIL_IF(!Rank);
+  }
+  Value *const Dim = ConstantInt::get(Rank->getType(), Dimension);
+  Value *const IsVecDim = B.CreateICmpEQ(Rank, Dim);
+  return B.CreateSelect(IsVecDim, Sum, Splat, "maybe_rank");
+}
+
+Value *Packetizer::Impl::vectorizeAlloca(AllocaInst *alloca) {
+  // We create an array allocation here, because the resulting value needs to
+  // represent a vector of pointers, not a pointer to vector. As such, it's a
+  // bit of a trick to handle scalable vectorization factors, since that would
+  // require creating instructions *before* the alloca, to get the array
+  // length, which could be a surprise to some of our later passes that expect
+  // allocas to be grouped at the top of the first Basic Block. This is not an
+  // LLVM requirement, however, so it should be investigated.
+  //
+  // Note that normally, an alloca would not be packetized anyway, since access
+  // is contiguous, Load and Store operations don't need to packetize their
+  // pointer operand and the alloca would be widened after packetization, which
+  // has no trouble with scalables. This function is required for the case that
+  // some pointer-dependent instruction unexpectedly fails to packetize, and
+  // falls back to instantiation, in which case we need a pointer per lane. In
+  // actual fact, "normal" alloca vectorization is not very common, since such
+  // allocas tend to be easy to remove by the Mem-to-Reg pass, so this "edge
+  // case" is actually the most likely.
+  //
+  // Array allocations are rejected: the per-lane GEP below would overlap them.
+  VECZ_FAIL_IF(SimdWidth.isScalable() || alloca->isArrayAllocation());
+  unsigned fixedWidth = SimdWidth.getFixedValue();
+  IRBuilder<> B(alloca);
+  auto *const ty = alloca->getAllocatedType();
+  AllocaInst *wideAlloca =
+      B.CreateAlloca(ty, getSizeInt(B, fixedWidth), alloca->getName());
+  wideAlloca->setAlignment(alloca->getAlign());
+  // Put the GEP after all allocas.
+  Instruction *insertPt = alloca;
+  while (isa<AllocaInst>(*insertPt)) {
+    insertPt = insertPt->getNextNonDebugInstruction();
+  }
+  B.SetInsertPoint(insertPt);
+  deleteInstructionLater(alloca);
+
+  auto *const idxTy = Ctx.dataLayout()->getIndexType(wideAlloca->getType());
+  Value *const indices = multi_llvm::createIndexSequence(
+      B, VectorType::get(idxTy, SimdWidth), SimdWidth);
+
+  return B.CreateInBoundsGEP(ty, wideAlloca, ArrayRef<Value *>{indices},
+                             Twine(alloca->getName(), ".lanes"));
+}
+
+Value *Packetizer::Impl::vectorizeExtractValue(ExtractValueInst *ExtractValue) {
+  IRBuilder<> B(buildAfter(ExtractValue, F));
+
+  Value *Aggregate =
+      packetize(ExtractValue->getAggregateOperand()).getAsValue();
+  // Bail out, as other vectorize* routines do, if packetization failed.
+  VECZ_FAIL_IF(!Aggregate);
+  // Index the packetized aggregate by lane first, then by the original path.
+  SmallVector<unsigned, 4> Indices;
+  Indices.push_back(0);
+  for (auto Index : ExtractValue->indices()) {
+    Indices.push_back(Index);
+  }
+
+  VECZ_FAIL_IF(SimdWidth.isScalable());
+  auto Width = SimdWidth.getFixedValue();
+  // Check that the width is non-zero so the zeroth element is initialized.
+  VECZ_FAIL_IF(Width < 1);
+
+  SmallVector<Value *, 16> Extracts;
+  for (decltype(Width) i = 0; i < Width; i++) {
+    Indices[0] = i;
+    Extracts.push_back(B.CreateExtractValue(Aggregate, Indices));
+  }
+
+  Type *CompositeTy = getWideType(Extracts[0]->getType(), SimdWidth);
+  Value *Result = UndefValue::get(CompositeTy);
+  for (decltype(Width) i = 0; i < Width; i++) {
+    Result = B.CreateInsertElement(Result, Extracts[i], B.getInt32(i));
+  }
+  return Result;
+}
+
+ValuePacket Packetizer::Impl::packetizeInsertElement(
+    InsertElementInst *InsertElement) {
+  ValuePacket results;
+  Value *Result = nullptr;
+
+  Value *Into = InsertElement->getOperand(0);
+  assert(Into && "Could not get operand 0 of InsertElement");
+  const auto ScalarWidth = multi_llvm::getVectorNumElements(Into->getType());
+
+  Value *Elt = InsertElement->getOperand(1);
+  Value *Index = InsertElement->getOperand(2);
+  assert(Elt && "Could not get operand 1 of InsertElement");
+  assert(Index && "Could not get operand 2 of InsertElement");
+
+  if (SimdWidth.isScalable()) {
+    auto packetWidth = getPacketWidthForType(Into->getType());
+    auto intoVals = packetizeAndGet(Into, packetWidth);
+    // Scalable vectorization (currently) only ever generates 1 packet
+    PACK_FAIL_IF(intoVals.size() != 1);
+    Value *packetizedInto = intoVals.front();
+
+    auto eltPacketWidth = getPacketWidthForType(Elt->getType());
+    auto eltVals = packetizeAndGet(Elt, eltPacketWidth);
+    // Scalable vectorization (currently) only ever generates 1 packet
+    PACK_FAIL_IF(eltVals.size() != 1);
+    Value *packetizedElt = eltVals.front();
+
+    Value *packetizedIndices = packetizeIfVarying(Index);
+    PACK_FAIL_IF(!packetizedIndices);
+    auto *packetizedEltTy = packetizedElt->getType();
+    auto *packetizedIntoTy = packetizedInto->getType();
+    auto *scalarTy = packetizedEltTy->getScalarType();
+
+    // Compiler support for masked.gather/riscv.vrgather* on i1 vectors is
+    // lacking, so emit this operation as the equivalent i8 vector instead.
+    auto *const origPacketizedIntoTy = packetizedIntoTy;
+    const bool upcastI1AsI8 = scalarTy->isIntegerTy(1);
+    IRBuilder<> B(buildAfter(InsertElement, F));
+    if (upcastI1AsI8) {
+      auto *const int8Ty = Type::getInt8Ty(F.getContext());
+      packetizedIntoTy = llvm::VectorType::get(
+          int8Ty, multi_llvm::getVectorElementCount(packetizedIntoTy));
+      packetizedEltTy = llvm::VectorType::get(
+          int8Ty, multi_llvm::getVectorElementCount(packetizedEltTy));
+      packetizedElt = B.CreateSExt(packetizedElt, packetizedEltTy);
+      packetizedInto = B.CreateSExt(packetizedInto, packetizedIntoTy);
+    }
+
+    // If we're vector predicating, scale the vector length up by the original
+    // number of vector elements.
+    auto *const EVL = VL ? B.CreateMul(VL, B.getInt32(ScalarWidth)) : nullptr;
+
+    auto *packetizedInsert = Ctx.targetInfo().createScalableInsertElement(
+        B, Ctx, InsertElement, packetizedElt, packetizedInto, packetizedIndices,
+        EVL);
+    PACK_FAIL_IF(!packetizedInsert);
+    // If we've been performing this broadcast as i8, now's the time to
+    // truncate back down to i1
+    if (upcastI1AsI8) {
+      packetizedInsert = B.CreateTrunc(packetizedInsert, origPacketizedIntoTy);
+    }
+
+    IC.deleteInstructionLater(InsertElement);
+    results.push_back(packetizedInsert);
+    return results;
+  }
+
+  auto Width = SimdWidth.getFixedValue();
+  IRBuilder<> B(buildAfter(InsertElement, F));
+
+  const auto Name = InsertElement->getName();
+  if (auto *CIndex = dyn_cast<ConstantInt>(Index)) {
+    auto IdxVal = CIndex->getZExtValue();
+
+    auto packetWidth = getPacketWidthForType(Into->getType());
+    PACK_FAIL_IF(packetWidth == Width);
+
+    auto Intos = packetizeAndGet(Into, packetWidth);
+    PACK_FAIL_IF(Intos.empty());
+
+    auto res = packetize(Elt);
+    PACK_FAIL_IF(!res);
+
+    if (res.info->numInstances == 0) {
+      // If the element was broadcast, it's better just to create more insert
+      // element instructions..
+      const auto instanceWidth =
+          multi_llvm::getVectorNumElements(Intos.front()->getType());
+      for (unsigned i = 0; i < packetWidth; ++i) {
+        results.push_back(Intos[i]);
+        for (unsigned j = IdxVal; j < instanceWidth; j += ScalarWidth) {
+          results.back() =
+              B.CreateInsertElement(results.back(), Elt, B.getInt32(j), Name);
+        }
+      }
+      return results;
+    }
+
+    SmallVector<Value *, 16> Elts;
+    res.getPacketValues(packetWidth, Elts);
+    PACK_FAIL_IF(Elts.empty());
+
+    const auto *VecTy = cast<FixedVectorType>(Intos.front()->getType());
+    const unsigned VecWidth = VecTy->getNumElements();
+    PACK_FAIL_IF(VecWidth == ScalarWidth);
+    {
+      // Can only shuffle two vectors of the same size, so redistribute
+      // the packetized elements vector
+      SmallVector<int, 16> Mask;
+      for (size_t i = 0; i < VecWidth; ++i) {
+        Mask.push_back(i / ScalarWidth);
+      }
+
+      auto *Undef = UndefValue::get(Elts.front()->getType());
+      for (unsigned i = 0; i < packetWidth; ++i) {
+        results.push_back(createOptimalShuffle(B, Elts[i], Undef, Mask, Name));
+      }
+    }
+    if (isa<UndefValue>(Into)) {
+      // Inserting into nothing so we can just use it as is..
+      return results;
+    } else {
+      SmallVector<int, 16> Mask;
+      for (size_t i = 0; i < VecWidth; ++i) {
+        int j = VecWidth + i;
+        if (i == IdxVal) {
+          j = i;
+          IdxVal += ScalarWidth;
+        }
+        Mask.push_back(j);
+      }
+
+      for (unsigned i = 0; i < packetWidth; ++i) {
+        results[i] = createOptimalShuffle(B, results[i], Intos[i], Mask, Name);
+      }
+      return results;
+    }
+  } else {
+    Into = packetize(Into).getAsValue();
+    PACK_FAIL_IF(!Into);
+    Value *Elts = packetizeIfVarying(Elt);
+    PACK_FAIL_IF(!Elts);
+    Value *Indices = packetizeIfVarying(Index);
+    PACK_FAIL_IF(!Indices);
+    Result = Into;
+    // Offsets are built in the index's own type, which need not be i32.
+    Type *IdxTy = Index->getType();
+    if (Indices != Index) {
+      SmallVector<Constant *, 16> Offsets;
+      for (unsigned i = 0; i < Width; ++i) {
+        Offsets.push_back(ConstantInt::get(IdxTy, i * ScalarWidth));
+      }
+      Value *Add = B.CreateAdd(Indices, ConstantVector::get(Offsets));
+
+      for (unsigned i = 0; i < Width; ++i) {
+        Value *ExtractElt =
+            (Elts != Elt) ? B.CreateExtractElement(Elts, B.getInt32(i)) : Elt;
+        Value *ExtractIdx = B.CreateExtractElement(Add, B.getInt32(i));
+        Result = B.CreateInsertElement(Result, ExtractElt, ExtractIdx, Name);
+      }
+    } else {
+      for (unsigned i = 0; i < Width; ++i) {
+        Value *ExtractElt =
+            (Elts != Elt) ? B.CreateExtractElement(Elts, B.getInt32(i)) : Elt;
+        Value *InsertIdx =
+            B.CreateAdd(Index, ConstantInt::get(IdxTy, i * ScalarWidth));
+        Result = B.CreateInsertElement(Result, ExtractElt, InsertIdx, Name);
+      }
+    }
+  }
+  IC.deleteInstructionLater(InsertElement);
+  results.push_back(Result);
+  return results;
+}
+
+ValuePacket Packetizer::Impl::packetizeExtractElement(
+    ExtractElementInst *ExtractElement) {
+  ValuePacket results;
+  Value *Result = nullptr;
+
+  Value *Src = ExtractElement->getOperand(0);
+  Value *Index = ExtractElement->getOperand(1);
+  assert(Src && "Could not get operand 0 of ExtractElement");
+  assert(Index && "Could not get operand 1 of ExtractElement");
+
+  if (SimdWidth.isScalable()) {
+    auto packetWidth = getPacketWidthForType(Src->getType());
+    auto srcVals = packetizeAndGet(Src, packetWidth);
+    // Scalable vectorization (currently) only ever generates 1 packet
+    PACK_FAIL_IF(srcVals.size() != 1);
+    Value *packetizedSrc = srcVals.front();
+
+    Value *packetizedIndices = packetizeIfVarying(Index);
+
+    Value *packetizedExtract = [&]() {
+      IRBuilder<> B(buildAfter(ExtractElement, F));
+
+      auto *narrowTy = getWideType(ExtractElement->getType(), SimdWidth);
+      auto *const origNarrowTy = narrowTy;
+      auto *origSrc = ExtractElement->getOperand(0);
+      auto *origTy = origSrc->getType();
+      auto *eltTy = origTy->getScalarType()->getScalarType();
+
+      // Compiler support for masked.gather/riscv.vrgather* on i1
+      // vectors is lacking, so emit this operation as the equivalent
+      // i8 vector instead.
+      const bool upcastI1AsI8 = eltTy->isIntegerTy(/*BitWidth*/ 1);
+      if (upcastI1AsI8) {
+        auto *const int8Ty = B.getInt8Ty();
+        auto *wideTy = llvm::VectorType::get(
+            int8Ty,
+            multi_llvm::getVectorElementCount(packetizedSrc->getType()));
+        narrowTy = llvm::VectorType::get(
+            int8Ty, multi_llvm::getVectorElementCount(narrowTy));
+        packetizedSrc = B.CreateSExt(packetizedSrc, wideTy);
+      }
+
+      Value *extract = Ctx.targetInfo().createScalableExtractElement(
+          B, Ctx, ExtractElement, narrowTy, packetizedSrc, packetizedIndices,
+          VL);
+
+      // If we've been performing this broadcast as i8, now's the time to
+      // truncate back down to i1
+      if (extract && upcastI1AsI8) {
+        extract = B.CreateTrunc(extract, origNarrowTy);
+      }
+
+      return extract;
+    }();
+    PACK_FAIL_IF(!packetizedExtract);
+
+    IC.deleteInstructionLater(ExtractElement);
+    results.push_back(packetizedExtract);
+    return results;
+  }
+
+  auto Width = SimdWidth.getFixedValue();
+
+  const auto ScalarWidth = multi_llvm::getVectorNumElements(Src->getType());
+
+  IRBuilder<> B(buildAfter(ExtractElement, F));
+  const auto Name = ExtractElement->getName();
+  if (auto *CIndex = dyn_cast<ConstantInt>(Index)) {
+    auto IdxVal = CIndex->getZExtValue();
+
+    auto packetWidth = getPacketWidthForType(ExtractElement->getType());
+    auto srcVals = packetizeAndGet(Src, packetWidth);
+    PACK_FAIL_IF(srcVals.empty());
+
+    auto resultWidth = Width / packetWidth;
+    if (packetWidth == 1) {
+      srcVals.push_back(UndefValue::get(srcVals.front()->getType()));
+    } else {
+      resultWidth *= 2;
+    }
+
+    SmallVector<int, 16> Mask;
+    for (size_t i = 0, j = IdxVal; i < resultWidth; ++i, j += ScalarWidth) {
+      Mask.push_back(j);
+    }
+
+    for (unsigned i = 0; i < packetWidth; i += 2) {
+      results.push_back(
+          createOptimalShuffle(B, srcVals[i], srcVals[i + 1], Mask, Name));
+    }
+    return results;
+  } else {
+    Value *Sources = packetizeIfVarying(Src);
+    PACK_FAIL_IF(!Sources);
+    Value *Indices = packetizeIfVarying(Index);
+    PACK_FAIL_IF(!Indices);
+
+    Result = UndefValue::get(getWideType(ExtractElement->getType(), SimdWidth));
+    if (Indices != Index) {
+      Type *IdxTy = Index->getType();
+      SmallVector<Constant *, 16> Offsets;
+      for (unsigned i = 0; i < Width; ++i) {
+        Offsets.push_back(ConstantInt::get(IdxTy, i * ScalarWidth));
+      }
+
+      if (Sources != Src) {
+        Indices = B.CreateAdd(Indices, ConstantVector::get(Offsets));
+      }
+
+      for (unsigned i = 0; i < Width; ++i) {
+        Value *ExtractIdx = B.CreateExtractElement(Indices, B.getInt32(i));
+        Value *ExtractElt = B.CreateExtractElement(Sources, ExtractIdx);
+        Result = B.CreateInsertElement(Result, ExtractElt, B.getInt32(i), Name);
+      }
+    } else {
+      for (unsigned i = 0, j = 0; i < Width; ++i, j += ScalarWidth) {
+        Value *ExtractIdx = (Sources != Src && i != 0)
+                                ? B.CreateAdd(Index, B.getInt32(j))
+                                : Index;
+        Value *ExtractElt = B.CreateExtractElement(Sources, ExtractIdx);
+        Result = B.CreateInsertElement(Result, ExtractElt, B.getInt32(i), Name);
+      }
+    }
+  }
+  IC.deleteInstructionLater(ExtractElement);
+  results.push_back(Result);
+  return results;
+}
+
+/// Packetizes a ShuffleVector instruction across the SIMD width.
+///
+/// Scalable widths are lowered to target vector shuffles driven by a
+/// broadcast mask; fixed widths widen the constant shuffle mask instead.
+/// Bails out through PACK_FAIL_IF when an operand cannot be packetized.
+ValuePacket Packetizer::Impl::packetizeShuffleVector(
+    ShuffleVectorInst *Shuffle) {
+  Value *const srcA = Shuffle->getOperand(0);
+  Value *const srcB = Shuffle->getOperand(1);
+  assert(srcA && "Could not get operand 0 from Shuffle");
+  assert(srcB && "Could not get operand 1 from Shuffle");
+  auto *const ty = Shuffle->getType();
+  auto *const tyA = srcA->getType();
+  auto packetWidth =
+      std::max(getPacketWidthForType(ty), getPacketWidthForType(tyA));
+
+  ValuePacket results;
+  IRBuilder<> B(buildAfter(Shuffle, F));
+  auto const scalarWidth = multi_llvm::getVectorNumElements(tyA);
+
+  if (SimdWidth.isScalable()) {
+    PACK_FAIL_IF(packetWidth != 1);
+    if (auto *const SplatVal = getSplatValue(Shuffle)) {
+      // Handle splats as a special case.
+      auto Splats = packetizeAndGet(SplatVal);
+      PACK_FAIL_IF(!createSubSplats(Ctx.targetInfo(), B, Splats, scalarWidth));
+      return Splats;
+    } else {
+      // The index masking below (`& -n`, `& (n - 1)`) needs a power of 2.
+      PACK_FAIL_IF(!isPowerOf2_32(scalarWidth));
+      TargetInfo &VTI = Ctx.targetInfo();
+      auto const dstScalarWidth = multi_llvm::getVectorNumElements(ty);
+      auto const fullWidth = SimdWidth * dstScalarWidth;
+
+      // If we're vector-predicating a vector access, scale the vector length
+      // up by the original number of vector elements.
+      auto *const EVL =
+          VL ? B.CreateMul(VL, B.getInt32(dstScalarWidth)) : nullptr;
+      auto *const mask = Shuffle->getShuffleMaskForBitcode();
+      auto *const vecMask =
+          VTI.createOuterScalableBroadcast(B, mask, EVL, SimdWidth);
+      auto *const idxVector = multi_llvm::createIndexSequence(
+          B, VectorType::get(B.getInt32Ty(), fullWidth), fullWidth);
+
+      // Each destination subvector must index into the matching subvector of
+      // the packetized source, so we add per-subvector offsets onto the
+      // broadcast shuffle mask. For a source vector width of `n` the
+      // subvectors start at indices `[0, n, 2*n, 3*n ...]`, and for a
+      // destination vector of width `m` we need `m` instances of each index,
+      // i.e. `offset[i] = floor(i / m) * n`.
+      Value *offset = nullptr;
+      if (dstScalarWidth == scalarWidth) {
+        // When source and destination are the same size we can mask off the
+        // LSBs of the index vector instead: `offset[i] = i & -n`. For
+        // instance, `n == 4` gives [0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, ... ].
+        offset = B.CreateAnd(
+            idxVector,
+            ConstantVector::getSplat(fullWidth, B.getInt32(-scalarWidth)));
+      } else {
+        auto *const subVecID = B.CreateUDiv(
+            idxVector,
+            ConstantVector::getSplat(fullWidth, B.getInt32(dstScalarWidth)));
+        offset = B.CreateMul(subVecID, ConstantVector::getSplat(
+                                           fullWidth, B.getInt32(scalarWidth)));
+      }
+
+      // Packetization may fail: never take front() of an empty packet.
+      auto const packetA = packetizeAndGet(srcA, 1);
+      PACK_FAIL_IF(packetA.empty());
+      auto *const vecA = packetA.front();
+      if (isa<UndefValue>(srcB)) {
+        auto *const adjust = B.CreateAdd(vecMask, offset, "shuffleMask");
+        auto *const shuffleA = VTI.createVectorShuffle(B, vecA, adjust, EVL);
+        results.push_back(shuffleA);
+      } else {
+        // For a two-source shuffle, we shuffle each source separately and then
+        // select between the results. Concatenating the sources and using a
+        // single shuffle is tempting, but the results need to be interleaved,
+        // which would make the mask indices depend on the vector scale factor.
+        auto const packetB = packetizeAndGet(srcB, 1);
+        PACK_FAIL_IF(packetB.empty());
+        auto *const vecB = packetB.front();
+
+        auto *const whichCmp = B.CreateICmpUGE(
+            vecMask,
+            ConstantVector::getSplat(fullWidth, B.getInt32(scalarWidth)));
+        auto *const safeMask = B.CreateAnd(
+            vecMask,
+            ConstantVector::getSplat(fullWidth, B.getInt32(scalarWidth - 1)));
+
+        auto *const adjust = B.CreateAdd(safeMask, offset, "shuffleMask");
+        auto *const shuffleA = VTI.createVectorShuffle(B, vecA, adjust, EVL);
+        auto *const shuffleB = VTI.createVectorShuffle(B, vecB, adjust, EVL);
+        results.push_back(B.CreateSelect(whichCmp, shuffleB, shuffleA));
+      }
+
+      return results;
+    }
+  }
+
+  auto srcsA = packetizeAndGet(srcA, packetWidth);
+  auto srcsB = packetizeAndGet(srcB, packetWidth);
+  PACK_FAIL_IF(srcsA.empty() || srcsB.empty());
+
+  auto width = SimdWidth.getFixedValue() / packetWidth;
+
+  // Copy the original mask into a mutable vector of ints so that it can be
+  // adjusted and widened in place.
+  const auto &origMask = Shuffle->getShuffleMask();
+  SmallVector<int, 16> mask(origMask.begin(), origMask.end());
+
+  // Adjust any indices that select from the second source vector
+  const auto adjust =
+      isa<UndefValue>(srcB) ? -scalarWidth : (width - 1) * scalarWidth;
+  for (auto &idx : mask) {
+    if (idx != int(-1) && idx >= int(scalarWidth)) {
+      idx += adjust;
+    }
+  }
+
+  // Duplicate the mask over the vectorized width
+  const auto size = mask.size();
+  mask.reserve(size * width);
+  for (unsigned i = 1, k = 0; i < width; ++i, k += size) {
+    for (unsigned j = 0; j < size; ++j) {
+      auto maskElem = mask[k + j];
+      if (maskElem != int(-1)) {
+        maskElem += scalarWidth;
+      }
+      mask.push_back(maskElem);
+    }
+  }
+
+  const auto name = Shuffle->getName();
+  for (unsigned i = 0; i < packetWidth; ++i) {
+    results.push_back(createOptimalShuffle(B, srcsA[i], srcsB[i], mask, name));
+  }
+  return results;
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/passes.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/passes.cpp
new file mode 100644
index 0000000000000..ec52a14847195
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/passes.cpp
@@ -0,0 +1,176 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "transform/passes.h"
+
+#include <compiler/utils/mangling.h>
+#include <llvm/Analysis/AliasAnalysis.h>
+#include <llvm/Analysis/MemorySSA.h>
+#include <llvm/IR/Dominators.h>
+#include <llvm/IR/LegacyPassManager.h>
+#include <llvm/IR/PassManager.h>
+#include <llvm/Support/CommandLine.h>
+#include <llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h>
+#include <llvm/Transforms/Scalar.h>
+#include <llvm/Transforms/Utils.h>
+#include <multi_llvm/vector_type_helper.h>
+
+#include "analysis/control_flow_analysis.h"
+#include "analysis/divergence_analysis.h"
+#include "analysis/uniform_value_analysis.h"
+#include "analysis/vectorization_unit_analysis.h"
+#include "debugging.h"
+#include "ir_cleanup.h"
+#include "memory_operations.h"
+#include "vectorization_unit.h"
+#include "vecz/vecz_target_info.h"
+
+#define DEBUG_TYPE "vecz"
+
+using namespace llvm;
+
+namespace vecz {
+PreservedAnalyses DivergenceCleanupPass::run(Function &F,
+                                             FunctionAnalysisManager &AM) {
+  UniformValueResult &UVR = AM.getResult<UniformValueAnalysis>(F);
+  // Branch on the operand of "divergence_" builtin calls when it isn't varying.
+  for (BasicBlock &BB : F) {
+    auto *const Branch = dyn_cast<BranchInst>(BB.getTerminator());
+    if (!Branch || !Branch->isConditional()) {
+      continue;
+    }
+    auto *const call = dyn_cast<CallInst>(Branch->getCondition());
+    // An indirect call has no known callee, so there is no name to inspect.
+    Function *const callee = call ? call->getCalledFunction() : nullptr;
+    if (!callee) {
+      continue;
+    }
+    compiler::utils::Lexer L(callee->getName());
+    if (L.Consume(VectorizationContext::InternalBuiltinPrefix) &&
+        L.Consume("divergence_")) {
+      // uniform reductions can just disappear
+      auto *const newCond = call->getOperand(0);
+      if (!UVR.isVarying(newCond)) {
+        Branch->setCondition(newCond);
+        if (call->use_empty()) {
+          UVR.remove(call);
+          call->eraseFromParent();
+        }
+      }
+    }
+  }
+  return PreservedAnalyses::all();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// @brief Removes masked memory operations whose constant mask is all-zero
+/// and turns those whose constant mask is all-ones into unmasked operations.
+PreservedAnalyses SimplifyMaskedMemOpsPass::run(Function &F,
+                                                FunctionAnalysisManager &AM) {
+  auto &Ctx = AM.getResult<VectorizationContextAnalysis>(F).getContext();
+  TargetInfo &VTI = Ctx.targetInfo();
+
+  // Calls are only collected here; they are erased once the walk is over.
+  std::vector<Instruction *> ToDelete;
+  for (Function &Builtin : F.getParent()->functions()) {
+    Optional<MemOpDesc> BuiltinDesc = MemOpDesc::analyzeMaskedMemOp(Builtin);
+    if (!BuiltinDesc) {
+      continue;
+    }
+
+    for (User *U : Builtin.users()) {
+      // Only call instructions located inside F are of interest.
+      auto *const CI = dyn_cast<CallInst>(U);
+      if (!CI || CI->getParent()->getParent() != &F) {
+        continue;
+      }
+
+      auto MaskedOp = MemOp::get(CI, MemOpAccessKind::Masked);
+      if (!MaskedOp || !MaskedOp->isMaskedMemOp()) {
+        continue;
+      }
+
+      // Nothing can be decided statically unless the mask is a constant.
+      auto *const CMask = dyn_cast<Constant>(MaskedOp->getMaskOperand());
+      if (!CMask) {
+        continue;
+      }
+
+      if (CMask->isZeroValue()) {
+        // A null mask means no lane executes the memory operation: users of
+        // a load get an undefined value and the call itself goes away.
+        if (BuiltinDesc->isLoad()) {
+          CI->replaceAllUsesWith(UndefValue::get(BuiltinDesc->getDataType()));
+        }
+        ToDelete.push_back(CI);
+        continue;
+      }
+
+      // Any other constant than 'all ones' is left untouched.
+      if (!CMask->isAllOnesValue()) {
+        continue;
+      }
+
+      // An 'all ones' mask means all lanes execute the memory operation.
+      Type *const DataTy = MaskedOp->getDataType();
+      const bool IsVector = DataTy->isVectorTy();
+
+      // Skip this optimization for scalable vectors for now. It's
+      // theoretically possible to perform but without scalable-vector
+      // builtins we can't test it; leave any theoretical scalable-vector
+      // masked mem operation unoptimized.
+      if (IsVector && isa<ScalableVectorType>(DataTy)) {
+        continue;
+      }
+
+      IRBuilder<> B(CI);
+      Value *const Data = MaskedOp->getDataOperand();
+      Value *const Ptr = MaskedOp->getPointerOperand();
+      if (MaskedOp->isLoad()) {
+        Value *Load = nullptr;
+        if (IsVector) {
+          Load = VTI.createLoad(B, CI->getType(), Ptr, B.getInt64(1));
+        } else {
+          Load = B.CreateLoad(CI->getType(), Ptr, /*isVolatile*/ false,
+                              CI->getName());
+        }
+        CI->replaceAllUsesWith(Load);
+      } else if (IsVector) {
+        VTI.createStore(B, Data, Ptr, B.getInt64(1),
+                        BuiltinDesc->getAlignment());
+      } else {
+        B.CreateStore(Data, Ptr);
+      }
+      ToDelete.push_back(CI);
+    }
+  }
+
+  // Clean up, erasing the most recently collected call first.
+  for (auto It = ToDelete.rbegin(), End = ToDelete.rend(); It != End; ++It) {
+    IRCleanup::deleteInstructionNow(*It);
+  }
+
+  // Only instructions were replaced or erased; the CFG was left untouched.
+  PreservedAnalyses Preserved;
+  Preserved.preserve<DominatorTreeAnalysis>();
+  Preserved.preserve<LoopAnalysis>();
+  Preserved.preserve<CFGAnalysis>();
+  Preserved.preserve<DivergenceAnalysis>();
+  return Preserved;
+}
+
+}  // namespace vecz
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/pre_linearize_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/pre_linearize_pass.cpp
new file mode 100644
index 0000000000000..28108ced6eff2
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/pre_linearize_pass.cpp
@@ -0,0 +1,351 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// This pass aims to optimize the CFG by hoisting instructions out of triangle
+// or diamond patterns (i.e. "if" or "if..else" constructs) where it determines
+// that executing all the instructions in all branch targets is cheaper than
+// actually branching. This is especially the case when BOSCC is active as the
+// BOSCC gadget introduces potentially-expensive AND/OR reduction operations
+// in order to branch to the uniform version of each Basic Block. To such end,
+// the pass needs to use the Uniform Value Analysis result, since only varying
+// branch conditions will be affected by BOSCC in such a way. We also need
+// access to the Target Transform Info result from the Vectorization Unit in
+// order to make target-dependent cost-based decisions.
+//
+// This pass only hoists instructions out of conditional blocks, and does not
+// directly modify the CFG, so it is intended that the CFG Simplification Pass
+// be run afterwards, in order to eliminate the now-redundant Basic Blocks and
+// transform PHI nodes into select instructions. Therefore, the
+// pre-linearization pass is implemented as a function pass so it can
+// be run in the middle of the Vecz Preparation Pass.
+//
+// Pre-Linearization is currently unable to hoist memory operations, since
+// doing so will require the correct masked versions to be generated, which
+// would require a lot of special extra handling.
+
+#include <llvm/ADT/DepthFirstIterator.h>
+#include <llvm/ADT/GraphTraits.h>
+#include <llvm/ADT/SmallPtrSet.h>
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/Analysis/LoopInfo.h>
+#include <llvm/Analysis/TargetTransformInfo.h>
+#include <llvm/Analysis/ValueTracking.h>
+#include <llvm/IR/BasicBlock.h>
+#include <llvm/IR/CFG.h>
+#include <llvm/IR/DerivedTypes.h>
+#include <llvm/IR/Function.h>
+#include <llvm/IR/Instructions.h>
+#include <llvm/Pass.h>
+#include <llvm/Transforms/Utils/LoopUtils.h>
+#include <multi_llvm/multi_llvm.h>
+#include <multi_llvm/vector_type_helper.h>
+
+#include "analysis/uniform_value_analysis.h"
+#include "analysis/vectorization_unit_analysis.h"
+#include "debugging.h"
+#include "transform/passes.h"
+#include "vectorization_unit.h"
+#include "vecz/vecz_choices.h"
+
+using namespace llvm;
+using namespace vecz;
+
+namespace {
+// Tells whether BB is free of PHI nodes, memory accesses and side effects.
+bool isTrivialBlock(const llvm::BasicBlock &BB) {
+  for (const llvm::Instruction &Inst : BB) {
+    if (llvm::isa<llvm::PHINode>(&Inst)) return false;
+    if (Inst.mayReadOrWriteMemory()) return false;
+    if (Inst.mayHaveSideEffects()) return false;
+  }
+  return true;
+}
+
+// This is an estimate of the cycle count for executing the entire block,
+// not including the terminating branch instruction, obtained by summing
+// the cost (Reciprocal Throughput) of each individual instruction. It
+// assumes sequential execution (no Instruction Level Parallelism), ignores
+// Data Hazards &c, so is not guaranteed to be entirely accurate. Returns the
+// maximum value if the target reports an invalid cost for an instruction.
+unsigned calculateBlockCost(const BasicBlock &BB,
+                            const TargetTransformInfo &TTI) {
+  unsigned cost = 0;
+  for (const auto &I : BB) {
+    if (I.isTerminator()) {
+      break;
+    }
+    InstructionCost inst_cost =
+        TTI.getInstructionCost(&I, TargetTransformInfo::TCK_RecipThroughput);
+    // A vector instruction is multiplied by the vector width, because it
+    // will either be scalarized into that many individual instructions, or
+    // packetized by duplication. This assumes that throughput does not
+    // depend on the vector width, and may need refining in future.
+    if (I.getType()->isVectorTy()) {
+      inst_cost *= multi_llvm::getVectorNumElements(I.getType());
+    }
+    // An invalid cost holds no value; treat the block as maximally expensive.
+    if (!inst_cost.isValid()) {
+      return ~0u;
+    }
+    cost += *inst_cost.getValue();
+  }
+  return cost;
+}
+
+// Estimates the cost of a vector AND reduction over `width` i1 lanes by
+// emitting one into a throw-away function and costing the resulting block.
+unsigned calculateBoolReductionCost(LLVMContext &context, Module *module,
+                                    const TargetTransformInfo &TTI,
+                                    llvm::ElementCount width) {
+  auto *const boolTy = Type::getInt1Ty(context);
+  Type *const maskTy = VectorType::get(boolTy, width);
+  auto *const voidTy = Type::getVoidTy(context);
+  FunctionType *const scratchTy = FunctionType::get(voidTy, {maskTy}, false);
+
+  // The scratch function has to live in a real module for the reduction to
+  // be created; it takes the vector of booleans as its only argument.
+  auto *const scratch =
+      Function::Create(scratchTy, Function::InternalLinkage, "tmp", module);
+  auto *const block = BasicBlock::Create(context, "reduce", scratch);
+  IRBuilder<> builder(block);
+  Value *const lanes = &*scratch->arg_begin();
+  createSimpleTargetReduction(builder, &TTI, lanes, RecurKind::And);
+  const unsigned cost = calculateBlockCost(*block, TTI);
+
+  // The function only existed for this analysis: unlink it and destroy it.
+  scratch->eraseFromParent();
+  return cost;
+}
+
+// Moves every non-terminator instruction of BB in front of Branch. When
+// `exceptions` is set, integer divisions and remainders moved this way get
+// their divisor guarded, since they now execute whichever way Branch goes.
+// Returns true if at least one instruction was moved.
+bool hoistInstructions(BasicBlock &BB, BranchInst &Branch, bool exceptions) {
+  const auto &DL = BB.getModule()->getDataLayout();
+  // True when BB is the successor taken if the branch condition holds.
+  const bool TrueBranch = (Branch.getSuccessor(0) == &BB);
+  // Guarded replacement (or the divisor itself) for each divisor seen so far.
+  DenseMap<Value *, Value *> safeDivisors;
+
+  bool modified = false;
+  // Instructions are moved one at a time from the front of the block, which
+  // keeps them in their original relative order.
+  while (!BB.front().isTerminator()) {
+    auto &I = BB.front();
+    I.moveBefore(&Branch);
+    modified = true;
+
+    // Division operations need no masking if they don't trap, and anything
+    // other than a binary operator is hoisted as it is.
+    auto *const binOp = exceptions ? dyn_cast<BinaryOperator>(&I) : nullptr;
+    if (!binOp) {
+      continue;
+    }
+
+    // It is potentially dangerous to hoist division operations, since the
+    // divisor could be illegal whenever BB would not have been executed.
+    const auto opcode = binOp->getOpcode();
+    const bool isUnsigned =
+        opcode == Instruction::UDiv || opcode == Instruction::URem;
+    const bool isSigned =
+        opcode == Instruction::SDiv || opcode == Instruction::SRem;
+    if (!isUnsigned && !isSigned) {
+      continue;
+    }
+
+    auto *const divisor = binOp->getOperand(1);
+    if (auto *const C = dyn_cast<Constant>(divisor)) {
+      // Divides by constant zero can be a NOP since there is no division by
+      // zero exception in OpenCL. Other constants are left as they are.
+      // NOTE(review): a signed division by constant -1 is not guarded here
+      // either - confirm whether that overflow case needs handling.
+      if (C->isZeroValue()) {
+        I.replaceAllUsesWith(binOp->getOperand(0));
+        I.eraseFromParent();
+      }
+      continue;
+    }
+
+    // If the divisor could be illegal, we need to guard it with a select
+    // instruction generated from the branch condition. The guard is created
+    // once per divisor and shared by later divisions.
+    auto &masked = safeDivisors[divisor];
+    if (!masked) {
+      // NOTE this function does not check for the pattern
+      // "select (x eq 0) 1, x" or equivalent, so we might want to write it
+      // ourselves, but Instruction Combining cleans it up. NOTE that for a
+      // signed division, we also have to consider the potential overflow
+      // situation, which is not so simple.
+      if (isUnsigned && isKnownNonZero(divisor, DL)) {
+        // Static analysis concluded it can't be zero, so we don't need to do
+        // anything.
+        masked = divisor;
+      } else {
+        // Keep the real divisor only on the path that originally reached BB;
+        // the other path divides by one instead.
+        Value *const one = ConstantInt::get(divisor->getType(), 1);
+        Value *const cond = Branch.getCondition();
+        masked = SelectInst::Create(cond, TrueBranch ? divisor : one,
+                                    TrueBranch ? one : divisor,
+                                    divisor->getName() + ".hoist_guard", &I);
+      }
+    }
+
+    // Only rewrite the operand when a guard was actually needed.
+    if (masked != divisor) {
+      binOp->setOperand(1, masked);
+    }
+  }
+  return modified;
+}
+}  // namespace
+
+// Hoists the contents of trivial conditional blocks in front of the branch
+// guarding them when the cost model favours that; the CFG is left untouched.
+PreservedAnalyses PreLinearizePass::run(Function &F,
+                                        FunctionAnalysisManager &AM) {
+  VectorizationUnitAnalysis::Result R =
+      AM.getResult<VectorizationUnitAnalysis>(F);
+  TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
+  VectorizationUnit &VU = R.getVU();
+
+  auto &LI = AM.getResult<LoopAnalysis>(F);
+  const bool guardDivisions =
+      VU.choices().isEnabled(VectorizationChoices::eDivisionExceptions);
+
+  unsigned reductionCost = 0;
+  UniformValueResult *UVR = nullptr;
+  if (VU.choices().linearizeBOSCC()) {
+    reductionCost = calculateBoolReductionCost(F.getContext(), F.getParent(),
+                                               TTI, VU.width());
+    UVR = &AM.getResult<UniformValueAnalysis>(F);
+  }
+
+  auto dfo = depth_first(&F.getEntryBlock());
+  SmallVector<BasicBlock *, 16> order(dfo.begin(), dfo.end());
+
+  // Single successor (if any) of every block; updated as we go, see below.
+  DenseMap<BasicBlock *, BasicBlock *> soleSucc;
+  for (auto *BB : order) {
+    soleSucc[BB] = BB->getSingleSuccessor();
+  }
+
+  bool changed = false;
+  for (auto it = order.rbegin(), end = order.rend(); it != end; ++it) {
+    BasicBlock *const BB = *it;
+    if (succ_size(BB) < 2) {
+      continue;
+    }
+
+    // Check that all hoistable successor blocks are in the same loop
+    Loop *const homeLoop = LI.getLoopFor(BB);
+    // Blocks that some successor of BB leads to unconditionally.
+    SmallPtrSet<BasicBlock *, 2> joins;
+    for (auto *succ : successors(BB)) {
+      if (BasicBlock *const next = soleSucc[succ]) {
+        joins.insert(next);
+      }
+    }
+
+    bool allSimple = true;
+    SmallVector<BasicBlock *, 2> hoistable;
+    SmallPtrSet<BasicBlock *, 2> remaining;
+    for (auto *succ : successors(BB)) {
+      if (joins.count(succ) != 0) {
+        // these "bypass" successors are going to stay where they are
+        remaining.insert(succ);
+        continue;
+      }
+      if (soleSucc[succ] == nullptr || pred_size(succ) != 1 ||
+          LI.getLoopFor(succ) != homeLoop || !isTrivialBlock(*succ)) {
+        allSimple = false;
+        break;
+      }
+      hoistable.push_back(succ);
+    }
+    if (!allSimple || hoistable.empty()) {
+      continue;
+    }
+
+    // The cost of a "bypass" branch is essentially zero. This occurs in a
+    // "triangle" type control structure (i.e. if with no else).
+    unsigned cheapest = remaining.empty() ? ~0 : 0;
+    // The total cost of executing every successor sequentially
+    InstructionCost hoistCost = 0;
+    for (auto *succ : hoistable) {
+      const unsigned blockCost = calculateBlockCost(*succ, TTI);
+      if (blockCost < cheapest) {
+        cheapest = blockCost;
+      }
+      hoistCost += blockCost;
+      remaining.insert(soleSucc[succ]);
+    }
+
+    // One of the successors was going to get executed anyway, so we can
+    // discount the cost of the cheapest one from the total cost.
+    hoistCost -= cheapest;
+
+    // The unconditional branches of the successors are going to get
+    // removed if we hoist the contents. We will only execute one successor
+    // so assume the first successor's branch is representative.
+    auto *const succTerm = hoistable.front()->getTerminator();
+    InstructionCost savedCost =
+        TTI.getInstructionCost(succTerm,
+                               TargetTransformInfo::TCK_RecipThroughput) +
+        TTI.getInstructionCost(succTerm, TargetTransformInfo::TCK_Latency);
+
+    // If all our successors branch to the same target, the conditional
+    // branch is going to disappear as well, so we can add that to the cost
+    // of the successor's branches in our analysis.
+    auto *const T = BB->getTerminator();
+    const bool branchVanishes = (remaining.size() == 1);
+    if (branchVanishes) {
+      savedCost +=
+          TTI.getInstructionCost(T, TargetTransformInfo::TCK_RecipThroughput);
+      savedCost += TTI.getInstructionCost(T, TargetTransformInfo::TCK_Latency);
+      // BOSCC will incur an additional cost on varying branches.
+      if (UVR && UVR->isVarying(T)) {
+        savedCost += reductionCost;
+      }
+    }
+
+    // Hoisting only pays off if executing everything costs no more than the
+    // branches that get removed; on a tie we might as well simplify the CFG!
+    if (!(hoistCost <= savedCost)) {
+      continue;
+    }
+
+    // The Lower Switch Pass ought to guarantee we can only get branch
+    // instructions here, but in case it didn't, we don't want to crash.
+    auto *const Branch = dyn_cast<BranchInst>(T);
+    if (!Branch) {
+      continue;
+    }
+    for (auto *succ : hoistable) {
+      changed |= hoistInstructions(*succ, *Branch, guardDivisions);
+    }
+
+    // We do not modify the CFG while we are working on it, because that is
+    // very complex; the Simplify CFG Pass which is to come after us will do
+    // a better job. So here we can just pretend we modified it.
+    if (branchVanishes) {
+      soleSucc[BB] = *remaining.begin();
+    }
+  }
+
+  return changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/printf_scalarizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/printf_scalarizer.cpp
new file mode 100644
index 0000000000000..28d649814c4ec
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/printf_scalarizer.cpp
@@ -0,0 +1,392 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "transform/printf_scalarizer.h"
+
+#include <llvm/IR/Constants.h>
+#include <llvm/IR/Instructions.h>
+#include <llvm/IR/Module.h>
+#include <llvm/Support/Debug.h>
+#include <llvm/Support/raw_ostream.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <string>
+
+#define DEBUG_TYPE "VECZ-PRINTF-SCALARIZER"
+
+using namespace llvm;
+
+namespace vecz {
+
+GlobalVariable *GetFormatStringAsValue(Value *op) {
+  // A constant GEP expression refers to the global through its first operand;
+  // any other kind of constant expression is rejected outright.
+  if (auto *const expr = dyn_cast<ConstantExpr>(op)) {
+    const bool is_gep = expr->getOpcode() == Instruction::GetElementPtr;
+    return is_gep ? dyn_cast<GlobalVariable>(expr->getOperand(0)) : nullptr;
+  }
+
+  // A GEP instruction refers to the global through its pointer operand.
+  if (auto *const gep = dyn_cast<GetElementPtrInst>(op)) {
+    return dyn_cast<GlobalVariable>(gep->getPointerOperand());
+  }
+
+  // Otherwise the operand is either the global itself or nothing we recognise.
+  return dyn_cast<GlobalVariable>(op);
+}
+
+std::string GetFormatStringAsString(Value *op) {
+  // Anything other than a (non-null) global variable has no string to read.
+  auto *const string_global = dyn_cast_or_null<GlobalVariable>(op);
+  if (!string_global) {
+    return "";
+  }
+
+  // A global without an initializer carries no data either.
+  if (!string_global->hasInitializer()) {
+    return "";
+  }
+
+  // The initializer has to be sequential constant data...
+  auto *const array_string =
+      dyn_cast<ConstantDataSequential>(string_global->getInitializer());
+  if (!array_string) {
+    return "";
+  }
+
+  // ...and it has to report itself as a string before we convert it.
+  if (!array_string->isString()) {
+    return "";
+  }
+  return array_string->getAsString().str();
+}
+
+/// @brief Advances `*fmt` to the next character of the string.
+/// @return true if the character now pointed to is the null terminator.
+bool IncrementPtr(const char **fmt) {
+  ++(*fmt);
+  return **fmt == '\0';
+}
+
+GlobalVariable *GetNewFormatStringAsGlobalVar(
+    Module &module, GlobalVariable *const string_value,
+    const std::string &new_format_string) {
+  // View the characters of the new format string as raw bytes and build a
+  // constant data array out of them.
+  const auto *const bytes =
+      reinterpret_cast<const uint8_t *>(new_format_string.data());
+  ArrayRef<uint8_t> Elts(bytes, new_format_string.size());
+  Constant *const init = ConstantDataArray::get(module.getContext(), Elts);
+
+  // The replacement global copies the original's constness, linkage,
+  // thread-local mode and address space, takes the original's name with an
+  // underscore appended, and is never externally initialized.
+  auto *const new_var = new GlobalVariable(
+      module, init->getType(), string_value->isConstant(),
+      string_value->getLinkage(), init, Twine(string_value->getName() + "_"),
+      string_value, string_value->getThreadLocalMode(),
+      string_value->getType()->getPointerAddressSpace(),
+      /*isExternallyInitialized=*/false);
+
+  new_var->setAlignment(MaybeAlign(string_value->getAlignment()));
+  new_var->setUnnamedAddr(string_value->getUnnamedAddr());
+
+  return new_var;
+}
+
+/// @brief Validates an OpenCL printf format string and rewrites it so that
+/// every vector specifier is expanded into comma-separated scalar specifiers.
+/// @return kPrintfError_success, or kPrintfError_invalidFormatString on error.
+EnumPrintfError ScalarizeAndCheckFormatString(const std::string &str,
+                                              std::string &new_str) {
+  // Set some sensible defaults in case we return error
+  new_str = "";
+  const char *fmt = str.c_str();
+
+  while (*fmt != '\0') {
+    if (*fmt != '%') {
+      new_str += *fmt;
+    } else {
+      std::string specifier_string(1, *fmt);
+
+      if (IncrementPtr(&fmt)) {
+        LLVM_DEBUG(dbgs() << "Unexpected \\0 character in format string \""
+                          << str.c_str() << "\"");
+        return kPrintfError_invalidFormatString;
+      }
+
+      // Parse (zero or more) Flags
+      const char *flag_chars = "-+ #0";
+      while (strchr(flag_chars, *fmt)) {
+        specifier_string += *fmt;
+        if (IncrementPtr(&fmt)) {
+          LLVM_DEBUG(dbgs() << "Unexpected \\0 character in format string \""
+                            << str.c_str() << "\"");
+          return kPrintfError_invalidFormatString;
+        }
+      }
+
+      // Parse (optional) Width
+      if (*fmt == '*') {
+        specifier_string += *fmt;
+        if (IncrementPtr(&fmt)) {
+          LLVM_DEBUG(dbgs() << "Unexpected \\0 character in format string \""
+                            << str.c_str() << "\"");
+          return kPrintfError_invalidFormatString;
+        }
+      } else if (isdigit(static_cast<unsigned char>(*fmt))) {
+        while (isdigit(static_cast<unsigned char>(*fmt))) {
+          specifier_string += *fmt;
+          if (IncrementPtr(&fmt)) {
+            LLVM_DEBUG(dbgs() << "Unexpected \\0 character in format string \""
+                              << str.c_str() << "\"");
+            return kPrintfError_invalidFormatString;
+          }
+        }
+      }
+
+      // Parse (optional) Precision
+      if (*fmt == '.') {
+        specifier_string += *fmt;
+        if (IncrementPtr(&fmt)) {
+          LLVM_DEBUG(dbgs() << "Unexpected \\0 character in format string \""
+                            << str.c_str() << "\"");
+          return kPrintfError_invalidFormatString;
+        }
+
+        while (isdigit(static_cast<unsigned char>(*fmt))) {
+          specifier_string += *fmt;
+          if (IncrementPtr(&fmt)) {
+            LLVM_DEBUG(dbgs() << "Unexpected \\0 character in format string \""
+                              << str.c_str() << "\"");
+            return kPrintfError_invalidFormatString;
+          }
+        }
+      }
+
+      uint32_t vector_length = 1u;
+      const bool is_vector = *fmt == 'v';
+      // Parse (optional) Vector Specifier
+      if (is_vector) {
+        if (IncrementPtr(&fmt)) {
+          LLVM_DEBUG(dbgs() << "Unexpected \\0 character in format string \""
+                            << str.c_str() << "\"");
+          return kPrintfError_invalidFormatString;
+        }
+        switch (*fmt) {
+          default:
+            LLVM_DEBUG(dbgs() << "Unexpected character in format string \""
+                              << str.c_str() << "\"");
+            return kPrintfError_invalidFormatString;
+          case '1':
+            // Must be 16, else error
+            if (IncrementPtr(&fmt)) {
+              LLVM_DEBUG(dbgs()
+                         << "Expected vector width of 16 in format string \""
+                         << str.c_str() << "\"");
+              return kPrintfError_invalidFormatString;
+            }
+            if (*fmt != '6') {
+              LLVM_DEBUG(dbgs()
+                         << "Expected vector width of 16 in format string \""
+                         << str.c_str() << "\"");
+              return kPrintfError_invalidFormatString;
+            }
+            vector_length = 16u;
+            break;
+          case '2':
+            vector_length = 2u;
+            break;
+          case '3':
+            vector_length = 3u;
+            // Lookahead for vectors of width 32. We know that we won't go out
+            // of bounds because worst case scenario there should be a null byte
+            // after the '3'.
+            if (*(fmt + 1) == '2') {
+              IncrementPtr(&fmt);
+              vector_length = 32u;
+            }
+            break;
+          case '4':
+            vector_length = 4u;
+            break;
+          case '6':
+            // Must be 64, else error
+            if (IncrementPtr(&fmt)) {
+              LLVM_DEBUG(dbgs()
+                         << "Expected vector width of 64 in format string \""
+                         << str.c_str() << "\"");
+              return kPrintfError_invalidFormatString;
+            }
+            if (*fmt != '4') {
+              LLVM_DEBUG(dbgs()
+                         << "Expected vector width of 64 in format string \""
+                         << str.c_str() << "\"");
+              return kPrintfError_invalidFormatString;
+            }
+            vector_length = 64u;
+            break;
+          case '8':
+            vector_length = 8u;
+            break;
+        }
+        if (IncrementPtr(&fmt)) {
+          LLVM_DEBUG(dbgs() << "Unexpected \\0 character in format string \""
+                            << str.c_str() << "\"");
+          return kPrintfError_invalidFormatString;
+        }
+      }
+
+      // Parse Length Modifier
+      const char *length_modifier_chars = "hljztL";
+      // Length Modifier is required with Vector Specifier
+      bool has_used_l_length_modifier = false;
+      const bool has_supplied_length_modifier =
+          strchr(length_modifier_chars, *fmt);
+      if (is_vector && !has_supplied_length_modifier) {
+        LLVM_DEBUG(
+            dbgs() << "Expected length modifier after vector specifier in \""
+                   << str.c_str() << "\"");
+        return kPrintfError_invalidFormatString;
+      }
+
+      if (has_supplied_length_modifier) {
+        bool consume_next_char = true;
+        switch (*fmt) {
+          default:
+            // The 'j', 'z', 't', and 'L' length modifiers are not supported by
+            // OpenCL C.
+            LLVM_DEBUG(dbgs() << "Unsupported length modifier '" << *fmt
+                              << "' in format string \"" << str.c_str()
+                              << "\"");
+            return kPrintfError_invalidFormatString;
+          case 'h':
+            if (IncrementPtr(&fmt)) {
+              LLVM_DEBUG(dbgs()
+                         << "Unexpected \\0 character in format string \""
+                         << str.c_str() << "\"");
+              return kPrintfError_invalidFormatString;
+            }
+            if (*fmt == 'h') {
+              specifier_string += "hh";
+            } else if (*fmt == 'l') {
+              // Native printf doesn't recognize 'hl' so we don't
+              // add it to the new format string.  Luckily, 'hl'
+              // is sizeof(int) - the same as the default on
+              // native printf!
+              // Additionally, 'hl' modifier may only be used in
+              // conjunction with the vector specifier
+              if (!is_vector) {
+                LLVM_DEBUG(dbgs()
+                           << "'hl' modifier without vector specifier in \""
+                           << str.c_str() << "\"");
+                return kPrintfError_invalidFormatString;
+              }
+            } else {
+              specifier_string += 'h';
+              // We've already incremented the ptr and we found nothing; don't
+              // do it again
+              consume_next_char = false;
+            }
+            break;
+          case 'l':
+            specifier_string += *fmt;
+            // Check ahead to see if the user is using the invalid 'll' length
+            // modifier
+            if (IncrementPtr(&fmt)) {
+              LLVM_DEBUG(dbgs()
+                         << "Unexpected \\0 character in format string \""
+                         << str.c_str() << "\"");
+              return kPrintfError_invalidFormatString;
+            }
+            if (*fmt == 'l') {
+              LLVM_DEBUG(dbgs()
+                         << "The 'll' length specifier is invalid in OpenCL "
+                            "printf\n  > "
+                         << str.c_str() << "\"");
+              return kPrintfError_invalidFormatString;
+            }
+
+            // The 'l' specifier for the OpenCL printf expects 64 bits
+            // integers, check if the system's long are actually 64 bits wide
+            // and if not upgrade the format specifier to 'll'.
+            //
+            // FIXME: This only works for host based devices, which is fine for
+            // our current printf implementation, but it should really be
+            // removed once we have a proper printf implementation.
+            if (sizeof(long) != 8) {
+              specifier_string += 'l';
+            }
+            // We've already incremented the ptr; don't do it again
+            consume_next_char = false;
+            has_used_l_length_modifier = true;
+            break;
+        }
+        if (consume_next_char) {
+          if (IncrementPtr(&fmt)) {
+            LLVM_DEBUG(dbgs() << "Unexpected \\0 character in format string \""
+                              << str.c_str() << "\"");
+            return kPrintfError_invalidFormatString;
+          }
+        }
+      }
+
+      // Parse Specifier
+      specifier_string += *fmt;
+
+      switch (*fmt) {
+        default:
+          break;
+        case 'n':
+          // The 'n' conversion specifier is not supported by OpenCL C.
+          LLVM_DEBUG(dbgs()
+                     << "The 'n' conversion specifier is invalid in OpenCL "
+                        "printf\n  > "
+                     << str.c_str() << "\"");
+          return kPrintfError_invalidFormatString;
+        case 's':  // Intentional fall-through
+        case 'c':
+          // The 'l' length modifier followed by the 'c' or 's' conversion
+          // specifiers is not supported by OpenCL C.
+          if (has_used_l_length_modifier) {
+            LLVM_DEBUG(dbgs()
+                       << "The 'l' length modifier followed by the 'c' or "
+                          "'s' conversion is invalid in OpenCL printf\n  > "
+                       << str.c_str() << "\"");
+            return kPrintfError_invalidFormatString;
+          }
+          break;
+      }
+
+      // Output the %specifier for each element of the vector,
+      // and for every element but the last, follow it by a "," string.
+      for (uint32_t i = 0; i < vector_length; ++i) {
+        new_str += specifier_string;
+
+        if (i < (vector_length - 1)) {
+          new_str += ",";
+        }
+      }
+    }
+    ++fmt;
+  }
+
+  new_str += '\0';
+
+  return kPrintfError_success;
+}
+}  // namespace vecz
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/remove_intptr_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/remove_intptr_pass.cpp
new file mode 100644
index 0000000000000..bc563fbba7150
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/remove_intptr_pass.cpp
@@ -0,0 +1,147 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <llvm/ADT/SetVector.h>
+
+#include "analysis/uniform_value_analysis.h"
+#include "debugging.h"
+#include "transform/passes.h"
+
+#define DEBUG_TYPE "vecz"
+
+using namespace llvm;
+using namespace vecz;
+
+/// @brief remove IntPtrs where possible.
+///
+/// Each PtrToInt in the function is visited and those of its users that are
+/// IntToPtr casts, PHI nodes, additions, or subtractions with the cast on the
+/// left-hand side are rewritten to work on the original pointer instead. Any
+/// PtrToInt created by a rewrite is queued and visited in turn.
+PreservedAnalyses RemoveIntPtrPass::run(Function &F,
+                                        FunctionAnalysisManager &) {
+  static const StringRef name = "remove_intptr";
+
+  SmallVector<PtrToIntInst *, 16> casts;
+  for (auto &BB : F) {
+    for (auto &I : BB) {
+      if (auto *int_ptr = dyn_cast<PtrToIntInst>(&I)) {
+        casts.push_back(int_ptr);
+      }
+    }
+  }
+
+  if (casts.empty()) {
+    return PreservedAnalyses::all();
+  }
+
+  while (!casts.empty()) {
+    PtrToIntInst *int_ptr = casts.back();
+    casts.pop_back();
+
+    // Snapshot the distinct users before rewriting anything. Every rewrite
+    // below erases the user it handles, which removes all of that user's
+    // operand uses from the cast's use list at once; walking the use list
+    // directly could leave the iterator dangling when a single user refers
+    // to the cast more than once (e.g. a PHI node receiving it from two
+    // predecessors).
+    SmallSetVector<User *, 8> users(int_ptr->user_begin(),
+                                    int_ptr->user_end());
+
+    for (User *user : users) {
+      if (auto *ptr = dyn_cast<IntToPtrInst>(user)) {
+        IRBuilder<> B(ptr);
+        Value *new_cast = B.CreatePointerBitCastOrAddrSpaceCast(
+            int_ptr->getOperand(0), ptr->getDestTy(), name);
+        ptr->replaceAllUsesWith(new_cast);
+        ptr->eraseFromParent();
+      } else if (auto *phi = dyn_cast<PHINode>(user)) {
+        // How we deal with PHI nodes is we create another PHI node with the
+        // pointer type, moving the PtrToInt to the other side of it. We also
+        // create IntToPtrs on the incoming side, where it does not consume
+        // the PtrToInt that we are currently looking at. Any new casts will
+        // hopefully be removed later.
+        auto num_values = phi->getNumIncomingValues();
+        PHINode *new_phi = PHINode::Create(int_ptr->getSrcTy(), num_values,
+                                           phi->getName() + ".intptr", phi);
+
+        Instruction *insert = phi;
+        while (isa<PHINode>(insert)) {
+          insert = insert->getNextNonDebugInstruction();
+        }
+
+        // Populate the replacement PHI node
+        for (decltype(num_values) i = 0; i != num_values; ++i) {
+          Value *incoming = phi->getIncomingValue(i);
+          BasicBlock *inb = phi->getIncomingBlock(i);
+          if (incoming == int_ptr) {
+            incoming = int_ptr->getOperand(0);
+          } else {
+            IRBuilder<> B(inb->getTerminator());
+            incoming = B.CreateIntToPtr(incoming, int_ptr->getSrcTy(), name);
+          }
+          new_phi->addIncoming(incoming, inb);
+        }
+
+        // Add the cast back to Int at the other side
+        IRBuilder<> B(insert);
+        Value *new_cast = B.CreatePtrToInt(new_phi, phi->getType(), name);
+        phi->replaceAllUsesWith(new_cast);
+        phi->eraseFromParent();
+        casts.push_back(cast<PtrToIntInst>(new_cast));
+      } else if (auto *bin_op = dyn_cast<BinaryOperator>(user)) {
+        auto *i8_ty = IntegerType::getInt8Ty(F.getContext());
+
+        IRBuilder<> B(bin_op);
+        Value *index = nullptr;
+
+        // Pointer arithmetic can be expressed as a byte-wise GEP when the
+        // cast is either operand of an Add, or the left operand of a Sub.
+        const bool cast_is_lhs = bin_op->getOperand(0) == int_ptr;
+        auto opcode = bin_op->getOpcode();
+        if (opcode == Instruction::Add) {
+          index = bin_op->getOperand(cast_is_lhs ? 1 : 0);
+        } else if (opcode == Instruction::Sub && cast_is_lhs) {
+          index = B.CreateNeg(bin_op->getOperand(1), name);
+        }
+
+        if (index) {
+          Value *operand = int_ptr->getOperand(0);
+          Value *cast_operand = B.CreateBitCast(
+              operand, i8_ty->getPointerTo(
+                           operand->getType()->getPointerAddressSpace()));
+          Value *new_gep = B.CreateGEP(i8_ty, cast_operand, index, name);
+          Value *new_cast = B.CreatePtrToInt(new_gep, bin_op->getType(), name);
+          bin_op->replaceAllUsesWith(new_cast);
+          bin_op->eraseFromParent();
+          // The builder may fold the cast to something other than an
+          // instruction, in which case there is nothing further to visit.
+          if (auto *new_int_ptr = dyn_cast<PtrToIntInst>(new_cast)) {
+            casts.push_back(new_int_ptr);
+          }
+        }
+      }
+    }
+
+    if (int_ptr->use_empty()) {
+      int_ptr->eraseFromParent();
+    }
+  }
+
+  auto Preserved = PreservedAnalyses::all();
+  Preserved.abandon<UniformValueAnalysis>();
+  return Preserved;
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarization_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarization_pass.cpp
new file mode 100644
index 0000000000000..c87695a9d29eb
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarization_pass.cpp
@@ -0,0 +1,284 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "transform/scalarization_pass.h"
+
+#include <compiler/utils/device_info.h>
+#include <llvm/ADT/DepthFirstIterator.h>
+#include <llvm/ADT/Statistic.h>
+#include <llvm/Analysis/LoopInfo.h>
+#include <llvm/IR/LegacyPassManager.h>
+#include <llvm/Support/Debug.h>
+#include <multi_llvm/vector_type_helper.h>
+
+#include "analysis/control_flow_analysis.h"
+#include "analysis/divergence_analysis.h"
+#include "analysis/uniform_value_analysis.h"
+#include "analysis/vectorization_unit_analysis.h"
+#include "debugging.h"
+#include "llvm_helpers.h"
+#include "memory_operations.h"
+#include "transform/scalarizer.h"
+#include "vectorization_unit.h"
+#include "vecz/vecz_choices.h"
+#include "vecz/vecz_target_info.h"
+
+#define DEBUG_TYPE "vecz-scalarization"
+
+using namespace vecz;
+using namespace llvm;
+
+STATISTIC(VeczScalarizeFail,
+          "Number of kernels that failed to scalarize [ID#S80]");
+
+ScalarizationPass::ScalarizationPass() = default;
+
+namespace {
+/// @brief Returns true if @p T is a vector type.
+bool needsScalarization(const Type &T) { return T.isVectorTy(); }
+
+/// @brief Returns true if @p I produces a vector or has a vector operand.
+bool needsScalarization(const Instruction &I) {
+  // Start with the result type, then scan the operands until a vector turns
+  // up (or we run out of operands).
+  bool found = needsScalarization(*I.getType());
+  for (auto it = I.op_begin(), end = I.op_end(); !found && it != end; ++it) {
+    found = needsScalarization(*(*it)->getType());
+  }
+  return found;
+}
+
+bool isValidScalableShuffle(const ShuffleVectorInst &shuffle) {
+  // 3-element vectors are trouble, so scalarize them: a shuffle is only
+  // accepted when both its result and its first source operand have a
+  // power-of-two number of elements.
+  const auto hasPow2Elements = [](const Type *Ty) {
+    const auto NumElts =
+        cast<VectorType>(Ty)->getElementCount().getFixedValue();
+    return isPowerOf2_32(NumElts);
+  };
+
+  const Type *const ResultTy = shuffle.getType();
+  const Type *const SourceTy = shuffle.getOperand(0)->getType();
+  return hasPow2Elements(ResultTy) && hasPow2Elements(SourceTy);
+}
+
+bool shouldScalarize(Instruction *I, bool scalable) {
+  // Don't scalarize loads or stores
+  if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
+    return false;
+  }
+
+  // True for a source that is an instruction we would leave as a vector.
+  const auto staysVector = [scalable](Value *Src) {
+    auto *const SrcI = dyn_cast<Instruction>(Src);
+    return SrcI && !shouldScalarize(SrcI, scalable);
+  };
+
+  // We also don't scalarize element manipulations of load instructions
+  if (auto *Shuffle = dyn_cast<ShuffleVectorInst>(I)) {
+    if (scalable && !isValidScalableShuffle(*Shuffle)) {
+      return true;
+    }
+    if (staysVector(Shuffle->getOperand(0)) ||
+        staysVector(Shuffle->getOperand(1))) {
+      return false;
+    }
+  } else if (auto *Extract = dyn_cast<ExtractElementInst>(I)) {
+    if (staysVector(Extract->getOperand(0))) {
+      return false;
+    }
+  }
+
+  // We also don't scalarize masked memory operations
+  if (auto *CI = dyn_cast<CallInst>(I)) {
+    if (auto MaskedOp = MemOp::get(CI, MemOpAccessKind::Masked)) {
+      if (MaskedOp->isMaskedMemOp()) {
+        return false;
+      }
+    }
+  }
+
+  // Scalarize anything else
+  return true;
+}
+
+/// @brief Operand Tracer struct
+/// The purpose of this helper struct is to trace through the operands of any
+/// given instruction, recording in `visited` each varying vector instruction
+/// it reaches and then tracing through that instruction's operands in turn.
+/// When `scalable` is set, shuffles failing isValidScalableShuffle() are not
+/// recorded; instructions that are not recorded are not traced through.
+struct OperandTracer {
+  using VisitSet = DenseSet<Instruction *>;
+
+  UniformValueResult &UVR;
+  bool scalable;
+  VisitSet visited;
+  SmallVector<Instruction *, 16> stack;
+
+  OperandTracer(UniformValueResult &uvr, bool sc) : UVR(uvr), scalable(sc) {}
+
+  void count(Instruction *I) {
+    if (visited.insert(I).second) {
+      stack.push_back(I);
+    }
+  }
+
+  void countOperand(Value *V) {
+    if (auto *I = dyn_cast<Instruction>(V)) {
+      countInstruction(I);
+    }
+  }
+
+  void countInstruction(Instruction *I) {
+    if (scalable) {
+      if (auto *const shuffle = dyn_cast<ShuffleVectorInst>(I)) {
+        if (!isValidScalableShuffle(*shuffle)) {
+          return;
+        }
+      }
+    }
+
+    if (I->getType()->isVectorTy() && UVR.isVarying(I)) {
+      count(I);
+    }
+  }
+
+  void countOperands(Instruction *I) {
+    if (auto *Phi = dyn_cast<PHINode>(I)) {
+      for (auto &use : Phi->incoming_values()) {
+        countOperand(use.get());
+      }
+      return;
+    }
+
+    for (auto *V : I->operand_values()) {
+      countOperand(V);
+    }
+  }
+
+  void run() {
+    while (!stack.empty()) {
+      Instruction *I = stack.back();
+      stack.pop_back();
+      countOperands(I);
+    }
+  }
+};
+
+}  // namespace
+
+PreservedAnalyses ScalarizationPass::run(llvm::Function &F,
+                                         llvm::FunctionAnalysisManager &AM) {
+  VectorizationUnit &VU = AM.getResult<VectorizationUnitAnalysis>(F).getVU();
+  auto &Ctx = AM.getResult<VectorizationContextAnalysis>(F).getContext();
+  const auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
+  const auto *DI =
+      MAMProxy.getCachedResult<compiler::utils::DeviceInfoAnalysis>(
+          *F.getParent());
+  bool DoubleSupport = DI && DI->double_capabilities != 0;
+
+  bool FullScalarization =
+      VU.choices().isEnabled(VectorizationChoices::eFullScalarization);
+  bool NeedsScalarization = false;
+  Scalarizer SR(F, Ctx, DoubleSupport);
+
+  UniformValueResult &UVR = AM.getResult<UniformValueAnalysis>(F);
+
+  // Find vector leaves that need to be scalarized.
+  std::vector<Instruction *> Leaves;
+  UVR.findVectorLeaves(Leaves);
+
+  if (FullScalarization) {
+    // Find varying vector values that need to be scalarized.
+    for (BasicBlock *BB : depth_first(&F)) {
+      for (Instruction &I : *BB) {
+        if (needsScalarization(*I.getType()) && UVR.isVarying(&I)) {
+          SR.setNeedsScalarization(&I);
+          NeedsScalarization = true;
+        }
+      }
+    }
+
+    for (Instruction *Leaf : Leaves) {
+      if (needsScalarization(*Leaf) && getVectorType(Leaf)) {
+        SR.setNeedsScalarization(Leaf);
+        NeedsScalarization = true;
+      }
+    }
+  } else {
+    // We use the tracer to identify instructions that are only used by
+    // scalar instructions (i.e. ExtractElement instructions and reductions).
+    //
+    // Since these instructions don't necessarily use all lanes of their
+    // operands, scalarization can produce dead code, which will get removed
+    // by later cleanup optimizations. Reductions are generally much better
+    // off scalarized.
+    bool const scalable = VU.width().isScalable();
+
+    OperandTracer tracer(UVR, scalable);
+    for (Instruction *Leaf : Leaves) {
+      if (needsScalarization(*Leaf) && getVectorType(Leaf)) {
+        tracer.countOperands(Leaf);
+      }
+    }
+    // Vector-to-scalar bitcasts aren't normally counted as vector leaves, but
+    // in this case we avoid unnecessary scalarization if we do.
+    for (auto &BB : F) {
+      for (auto &I : BB) {
+        if (auto *B = dyn_cast<BitCastInst>(&I)) {
+          if (B->getSrcTy()->isVectorTy() && !B->getDestTy()->isVectorTy() &&
+              UVR.isVarying(B)) {
+            tracer.countOperands(B);
+          }
+        }
+      }
+    }
+
+    tracer.run();
+
+    for (auto &BB : F) {
+      for (auto &I : BB) {
+        if (!shouldScalarize(&I, scalable)) {
+          continue;
+        }
+
+        if (I.getType()->isVectorTy() && UVR.isVarying(&I) &&
+            tracer.visited.count(&I) == 0) {
+          SR.setNeedsScalarization(&I);
+          NeedsScalarization = true;
+        }
+      }
+    }
+  }
+
+  if (!NeedsScalarization) {
+    return PreservedAnalyses::all();
+  }
+
+  if (!SR.scalarizeAll()) {
+    ++VeczScalarizeFail;
+    return VU.setFailed("Failed to scalarize");
+  }
+
+  PreservedAnalyses Preserved;
+  Preserved.preserve<DominatorTreeAnalysis>();
+  Preserved.preserve<LoopAnalysis>();
+  Preserved.preserve<CFGAnalysis>();
+  Preserved.preserve<DivergenceAnalysis>();
+  return Preserved;
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
new file mode 100644
index 0000000000000..25ae2d4433073
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
@@ -0,0 +1,1616 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "transform/scalarizer.h"
+
+#include <compiler/utils/builtin_info.h>
+#include <llvm/ADT/SmallPtrSet.h>
+#include <llvm/ADT/Statistic.h>
+#include <llvm/Analysis/InstructionSimplify.h>
+#include <llvm/IR/DIBuilder.h>
+#include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/Instructions.h>
+#include <llvm/IR/IntrinsicInst.h>
+#include <llvm/IR/Intrinsics.h>
+#include <llvm/Support/Debug.h>
+#include <llvm/Support/raw_ostream.h>
+#include <multi_llvm/llvm_version.h>
+#include <multi_llvm/multi_llvm.h>
+#include <multi_llvm/vector_type_helper.h>
+
+#include "debugging.h"
+#include "llvm_helpers.h"
+#include "memory_operations.h"
+#include "simd_packet.h"
+#include "transform/printf_scalarizer.h"
+#include "vectorization_context.h"
+#include "vecz/vecz_target_info.h"
+
+#define DEBUG_TYPE "vecz-scalarization"
+
+namespace {
+/// @brief Upper bound on the vector width that Vecz is able to scalarize.
+///
+/// The bound exists because the masks used by SimdPackets are stored as
+/// uint64_t values.
+constexpr unsigned MAX_SIMD_WIDTH = 64;
+}  // namespace
+
+using namespace vecz;
+using namespace llvm;
+
+STATISTIC(VeczScalarized, "Number of instructions scalarized [ID#S00]");
+STATISTIC(VeczScalarizeFailCall,
+          "Scalarize: missing function declarations [ID#S81]");
+STATISTIC(VeczScalarizeFailBuiltin,
+          "Scalarize: non-scalarizable builtins [ID#S82]");
+STATISTIC(VeczScalarizeFailPrintf,
+          "Scalarize: failures to scalarize printf [ID#S83]");
+STATISTIC(VeczScalarizeFailCast,
+          "Scalarize: failures to scalarize cast [ID#S84]");
+STATISTIC(VeczScalarizeFailBitcast,
+          "Scalarize: failures to scalarize bitcast [ID#S85]");
+STATISTIC(VeczScalarizeFailReduceIntrinsic,
+          "Scalarize: failures to scalarize vector.reduce intrinsic [ID#S86]");
+
+Scalarizer::Scalarizer(llvm::Function &F, VectorizationContext &ctx,
+                       bool DoubleSupport)
+    : Ctx(ctx), F(F), DoubleSupport(DoubleSupport) {}
+
+SimdPacket *Scalarizer::getPacket(const Value *V, unsigned Width, bool Create) {
+  auto const Found = packets.find(V);
+  if (Found != packets.end()) {
+    return Found->second.get();
+  }
+  if (!Create) {
+    return nullptr;
+  }
+  // Nothing is cached for V yet: allocate a packet and resize it to Width.
+  auto &Slot = packets[V];
+  Slot = std::make_unique<SimdPacket>();
+  Slot->resize(Width);
+  return Slot.get();
+}
+
+Value *Scalarizer::getGather(Value *V) {
+  auto &Cache = Gathers[V];
+  if (Cache) {
+    return Cache;
+  }
+  // Not cached yet: rebuild a vector from V's scalar lanes and cache it.
+  // The gather is built directly before the original instruction; a value
+  // that is not an instruction is returned (and cached) unchanged.
+  auto *insert = dyn_cast<Instruction>(V);
+  if (!insert) {
+    Cache = V;
+    return V;
+  }
+
+  auto *VecTy = cast<FixedVectorType>(V->getType());
+  unsigned SimdWidth = VecTy->getNumElements();
+  // Look up (never create) the packet holding V's scalarized lanes.
+  SimdPacket *P = getPacket(V, SimdWidth, false);
+  assert(P);
+
+  // The gather has to be built after any PHI nodes, so skip past them.
+  while (isa<PHINode>(insert)) {
+    insert = insert->getNextNonDebugInstruction();
+  }
+  IRBuilder<> B(insert);
+
+  // If every element in the packet is the same, create a vector splat instead
+  // of individually inserting every element.
+  Value *const splat = [](SimdPacket &P) -> Value * {
+    Value *const first = P.at(0);
+    for (unsigned i = 1; i < P.size(); i++) {
+      if (P.at(i) != first) {
+        return nullptr;
+      }
+    }
+    return first;
+  }(*P);
+  if (splat) {
+    return Cache =
+               B.CreateVectorSplat(ElementCount::getFixed(P->size()), splat);
+  }
+  // Otherwise insert the lanes one by one, skipping missing or undef lanes.
+  Value *Result = UndefValue::get(V->getType());
+  for (unsigned i = 0; i < P->size(); i++) {
+    if (auto *At = P->at(i)) {
+      if (!isa<UndefValue>(At)) {
+        Result = B.CreateInsertElement(Result, At, B.getInt32(i));
+      }
+    }
+  }
+
+  Cache = Result;
+  return Result;
+}
+
+void Scalarizer::setNeedsScalarization(Value *V) {
+  // ScalarizeSet filters out repeats; ToScalarize keeps first-marked order.
+  if (!ScalarizeSet.insert(V).second) return;
+  // V was not marked before, so queue it.
+  ToScalarize.push_back(V);
+}
+
+bool Scalarizer::scalarizeAll() {
+  // Scalarize every value that was marked, in the order it was marked.
+  for (Value *V : ToScalarize) {
+    auto *VecTy = getVectorType(V);
+    assert(VecTy && "Trying to scalarize a non-vector");
+    unsigned SimdWidth = VecTy->getNumElements();
+    // In the SimdPacket we use a mask that is stored as a uint64_t. Due
+    // to that, there is a limit on the vector size that Vecz can
+    // handle.
+    VECZ_ERROR_IF(SimdWidth > MAX_SIMD_WIDTH, "The SIMD width is too large");
+
+    PacketMask PM;
+    PM.enableAll(SimdWidth);
+    if (!scalarize(V, PM)) {
+      return false;
+    }
+  }
+  // Collect the users that were not scalarized themselves ("scalar leaves").
+  // Beware of instructions not being processed strictly in dominance order.
+  DenseSet<Instruction *> ScalarLeaves;
+  for (Value *V : ToScalarize) {
+    if (Failures.count(V)) {
+      continue;
+    }
+
+    // Any user of a scalarized instruction that is not itself scalarized needs
+    // its operands fixing up to use the scalarized versions.
+    for (auto *U : V->users()) {
+      if (auto *I = dyn_cast<Instruction>(U)) {
+        if (ScalarizeSet.count(I) == 0) {
+          ScalarLeaves.insert(I);
+        }
+      }
+    }
+  }
+  // Fix up the operands of each leaf; give up on the first one that fails.
+  for (Instruction *I : ScalarLeaves) {
+    if (!scalarizeOperands(I)) {
+      emitVeczRemarkMissed(&F, I, "Could not scalarize");
+      return false;
+    }
+  }
+
+  IC.deleteInstructions();
+  return true;
+}
+
+Value *Scalarizer::scalarizeOperands(Instruction *I) {
+  // Vector extractions.
+  if (ExtractElementInst *Extract = dyn_cast<ExtractElementInst>(I)) {
+    // In the SimdPacket we use a mask that is stored as a uint64_t. Due to
+    // that, there is a limit on the vector size that Vecz can handle.
+    VECZ_ERROR_IF(multi_llvm::getVectorNumElements(
+                      Extract->getVectorOperandType()) > MAX_SIMD_WIDTH,
+                  "The SIMD width is too large");
+    return scalarizeOperandsExtractElement(Extract);
+  }
+
+  // Vector -> non-vector bitcasts.
+  if (BitCastInst *BC = dyn_cast<BitCastInst>(I)) {
+    if (BC->getSrcTy()->isVectorTy() && !BC->getDestTy()->isVectorTy()) {
+      // In the SimdPacket we use a mask that is stored as a uint64_t. Due to
+      // that, there is a limit on the vector size that Vecz can handle.
+      VECZ_ERROR_IF(
+          multi_llvm::getVectorNumElements(BC->getSrcTy()) > MAX_SIMD_WIDTH,
+          "The SIMD width is too large");
+      return scalarizeOperandsBitCast(BC);
+    }
+  }
+
+  // printf or reduction intrinsic calls
+  if (CallInst *CI = dyn_cast<CallInst>(I)) {
+    Function *Callee = CI->getCalledFunction();
+    VECZ_STAT_FAIL_IF(!Callee, VeczScalarizeFailCall);
+
+    // printf calls:
+    if (!Callee->isIntrinsic()) {
+      // Check if this is indeed a printf call
+      compiler::utils::BuiltinInfo &BI = Ctx.builtins();
+      auto const ID = BI.analyzeBuiltin(*Callee).ID;
+      if (ID == BI.getPrintfBuiltin()) {
+        return scalarizeOperandsPrintf(CI);
+      }
+    }
+
+    // reduction intrinsics:
+    if (auto *Intrin = dyn_cast<IntrinsicInst>(CI)) {
+      if (auto *reduce = scalarizeReduceIntrinsic(Intrin)) {
+        return reduce;
+      }
+    }
+  }
+
+  // No special case applied, so just gather any scalarized operands
+  for (unsigned i = 0, n = I->getNumOperands(); i != n; ++i) {
+    auto *Op = I->getOperand(i);
+    if (ScalarizeSet.count(Op)) {
+      I->setOperand(i, getGather(Op));
+    }
+  }
+  // The instruction itself is kept; only its operands were rewritten.
+  return I;
+}
+
+Value *Scalarizer::scalarizeOperandsPrintf(CallInst *CI) {
+  VECZ_STAT_FAIL_IF(CI->arg_empty(), VeczScalarizeFailPrintf);
+  // Rewrite the call so that every vector argument is passed lane by lane.
+  // Get the format string as a string
+  GlobalVariable *FmtStringGV = GetFormatStringAsValue(CI->getArgOperand(0));
+  VECZ_STAT_FAIL_IF(!FmtStringGV, VeczScalarizeFailPrintf);
+  std::string FmtString = GetFormatStringAsString(FmtStringGV);
+  VECZ_STAT_FAIL_IF(FmtString.empty(), VeczScalarizeFailPrintf);
+  std::string NewFmtString;
+  const EnumPrintfError err =
+      ScalarizeAndCheckFormatString(FmtString, NewFmtString);
+  // Check if the format string was scalarized successfully
+  VECZ_STAT_FAIL_IF(err != kPrintfError_success, VeczScalarizeFailPrintf);
+
+  // Create a new global variable out of the new format string
+  GlobalVariable *NewFmtStringGV = GetNewFormatStringAsGlobalVar(
+      *CI->getModule(), FmtStringGV, NewFmtString);
+
+  IRBuilder<> B(CI);
+  // Gather the operands for the new printf call, taking care to scalarize
+  // any vector operands.
+  llvm::SmallVector<Value *, 16> NewOps;
+  for (Use const &Op : CI->args()) {
+    // The first operand, by position not by value, is the new format string
+    if (&Op == CI->arg_begin()) {
+      Constant *Zero = B.getInt32(0);
+      NewOps.push_back(B.CreateGEP(NewFmtStringGV->getValueType(),
+                                   NewFmtStringGV, {Zero, Zero}));
+      continue;
+    }
+    // The rest of the operands can either be copied or scalarized
+    if (!Op->getType()->isVectorTy()) {
+      // Non-vector operand, just copy
+      NewOps.push_back(Op.get());
+    } else {
+      // Vector operand, scalarize
+      // In the SimdPacket we use a mask that is stored as a uint64_t. Due
+      // to that, there is a limit on the vector size that Vecz can handle.
+      const uint32_t SimdWidth =
+          multi_llvm::getVectorNumElements(Op->getType());
+      VECZ_ERROR_IF(SimdWidth > MAX_SIMD_WIDTH, "The SIMD width is too large");
+      PacketMask PM;
+      PM.enableAll(SimdWidth);
+      SimdPacket *OpPacket = scalarize(Op.get(), PM);
+      VECZ_STAT_FAIL_IF(!OpPacket, VeczScalarizeFailPrintf);
+      for (unsigned i = 0; i < OpPacket->size(); ++i) {
+        Value *Lane = OpPacket->at(i);
+        VECZ_STAT_FAIL_IF(!Lane, VeczScalarizeFailPrintf);
+        // We need to promote half and floats to doubles, as per 6.5.2.2/6
+        // in the C99 standard, but not if the device does not have double
+        // support, in which case we need to promote them to floats, as per
+        // 6.12.13.2 in the OpenCL 1.2 standard.
+        Type *LaneTy = Lane->getType();
+        Type *PromotionType = DoubleSupport ? B.getDoubleTy() : B.getFloatTy();
+        if (LaneTy->isFloatingPointTy() &&
+            LaneTy->getPrimitiveSizeInBits() <
+                PromotionType->getPrimitiveSizeInBits()) {
+          VECZ_ERROR_IF(!LaneTy->isHalfTy() && !LaneTy->isFloatTy(),
+                        "Unexpected floating point type");
+          Lane = B.CreateFPExt(Lane, PromotionType);
+        }
+        NewOps.push_back(Lane);
+      }
+    }
+  }
+  // Create the new printf call
+  Function *Callee = CI->getCalledFunction();
+  CallInst *NewCI = B.CreateCall(Callee, NewOps, CI->getName());
+  NewCI->setCallingConv(CI->getCallingConv());
+  NewCI->setAttributes(CI->getAttributes());
+
+  // Replace all uses of the old one with the new one
+  CI->replaceAllUsesWith(NewCI);
+  IC.deleteInstructionLater(CI);
+
+  return NewCI;
+}
+
+Value *Scalarizer::scalarizeReduceIntrinsic(IntrinsicInst *Intrin) {
+  // Pick the scalar op for this reduction; unhandled ones are flagged (for now)
+  bool isHandled = true;
+  Instruction::BinaryOps BinOpcode;
+  switch (Intrin->getIntrinsicID()) {
+    default:
+      isHandled = false;
+      break;
+    case Intrinsic::vector_reduce_and:
+      BinOpcode = Instruction::And;
+      break;
+    case Intrinsic::vector_reduce_or:
+      BinOpcode = Instruction::Or;
+      break;
+    case Intrinsic::vector_reduce_xor:
+      BinOpcode = Instruction::Xor;
+      break;
+    case Intrinsic::vector_reduce_add:
+      // TODO: Need to handle FP reduce_add (Instruction::FAdd)
+      if (!Intrin->getType()->isFloatTy()) {
+        BinOpcode = Instruction::Add;
+      } else {
+        isHandled = false;
+      }
+      break;
+    case Intrinsic::vector_reduce_mul:
+      // TODO: Need to handle FP reduce_mul (Instruction::FMul)
+      if (!Intrin->getType()->isFloatTy()) {
+        BinOpcode = Instruction::Mul;
+      } else {
+        isHandled = false;
+      }
+      break;
+    case Intrinsic::vector_reduce_fadd:
+      // TODO: Need to handle FP reduce_add
+      isHandled = false;
+      break;
+    case Intrinsic::vector_reduce_fmul:
+      // TODO: Need to handle FP reduce_mul
+      isHandled = false;
+      break;
+    case Intrinsic::vector_reduce_fmax:
+    case Intrinsic::vector_reduce_smax:
+    case Intrinsic::vector_reduce_umax:
+      // TODO: Need to handle Int (signed/unsigned) Max and FP Max
+      isHandled = false;
+      break;
+    case Intrinsic::vector_reduce_fmin:
+    case Intrinsic::vector_reduce_smin:
+    case Intrinsic::vector_reduce_umin:
+      // TODO: Need to handle Int (signed/unsigned) Min and FP Min
+      isHandled = false;
+      break;
+  }
+  // If it's an intrinsic we don't handle here, return nullptr and fallback
+  // to simple gathering of any scalarized operands.
+  if (!isHandled) {
+    return nullptr;
+  }
+
+  // Reduce intrinsics with more than one operand, such as 'fadd' and 'fmul'
+  // (where the first operand is a scalar and the second is the vector), also
+  // need handling. However, the current scalarization analysis won't let
+  // those through and will fail, so this scalarization only takes into
+  // account the first (vector) operand, which is the only operand of the
+  // integer reductions handled here.
+  Value *Vec = Intrin->getOperand(0);
+  assert(Vec && "Could not get operand 0 of Intrin");
+
+  // In the SimdPacket we use a mask that is stored as a uint64_t. Due to
+  // that, there is a limit on the vector size that Vecz can handle.
+  auto *VecTy = dyn_cast<FixedVectorType>(Vec->getType());
+  VECZ_FAIL_IF(!VecTy);
+  const uint32_t SimdWidth = VecTy->getNumElements();
+  VECZ_ERROR_IF(SimdWidth > MAX_SIMD_WIDTH, "The SIMD width is too large");
+
+  PacketMask PM;
+  IRBuilder<> B(Intrin);
+  PM.enableAll(SimdWidth);
+
+  SimdPacket *Packet = scalarize(Vec, PM);
+  VECZ_STAT_FAIL_IF(!Packet, VeczScalarizeFailReduceIntrinsic);
+  // Seed the reduction with the first lane rather than with zero: zero is not
+  // an identity for 'and' or 'mul', so a zero seed would force those to zero.
+  Value *Result = nullptr;
+  for (unsigned i = 0; i < Packet->size(); ++i) {
+    Value *const Lane = Packet->at(i);
+    VECZ_STAT_FAIL_IF(!Lane, VeczScalarizeFailReduceIntrinsic);
+    Type *const LaneTy = Lane->getType();
+    VECZ_ERROR_IF(LaneTy->isFloatTy(), "Unexpected floating point type");
+    Result = Result ? B.CreateBinOp(BinOpcode, Result, Lane) : Lane;
+  }
+
+  Intrin->replaceAllUsesWith(Result);
+  IC.deleteInstructionLater(Intrin);
+
+  return Result;
+}
+
+Value *Scalarizer::scalarizeOperandsExtractElement(ExtractElementInst *Extr) {
+  // Determine the vector operand, the extraction index and the vector width.
+  Value *OrigVec = Extr->getOperand(0);
+  Value *ExtractIndex = Extr->getOperand(1);
+  assert(OrigVec && "Could not get operand 0 of Extr");
+  assert(ExtractIndex && "Could not get operand 1 of Extr");
+  ConstantInt *ConstantExtractIndex = dyn_cast<ConstantInt>(ExtractIndex);
+  auto *Vec = dyn_cast<FixedVectorType>(OrigVec->getType());
+  const unsigned VecWidth = Vec ? Vec->getNumElements() : 0;
+  PacketMask PM;
+  SimdPacket *OrigVecPacket;
+  Value *ReturnVal;
+
+  if (!ConstantExtractIndex) {
+    // The index is not a constant, so scalarize the vector for all lanes.
+    PM.enableAll(VecWidth);
+    OrigVecPacket = scalarize(OrigVec, PM);
+    VECZ_FAIL_IF(!OrigVecPacket);
+
+    IRBuilder<> B(Extr);
+    Value *Select = UndefValue::get(Extr->getType());
+    for (unsigned lane = 0; lane < VecWidth; lane++) {
+      // Select this lane's value if the lane matches the extract index.
+      Value *Cmp = B.CreateICmpEQ(
+          ConstantInt::get(ExtractIndex->getType(), lane), ExtractIndex);
+      Select = B.CreateSelect(Cmp, OrigVecPacket->at(lane), Select);
+    }
+    ReturnVal = Select;
+  } else {
+    // An out-of-range constant index has no lane to read, so fail instead.
+    VECZ_FAIL_IF(ConstantExtractIndex->getValue().uge(VecWidth));
+    // Scalarize the original vector, but only for the lane to extract.
+    unsigned Lane = ConstantExtractIndex->getZExtValue();
+    PM.enable(Lane);
+    OrigVecPacket = scalarize(OrigVec, PM);
+    VECZ_FAIL_IF(!OrigVecPacket);
+    ReturnVal = OrigVecPacket->at(Lane);
+  }
+
+  // Replace the extraction by the extracted lane value.
+  Extr->replaceAllUsesWith(ReturnVal);
+  IC.deleteInstructionLater(Extr);
+  return ReturnVal;
+}
+
+Value *Scalarizer::scalarizeOperandsBitCast(BitCastInst *BC) {
+  auto *VecSrcTy = dyn_cast<FixedVectorType>(BC->getSrcTy());
+  VECZ_FAIL_IF(!VecSrcTy);
+  unsigned SimdWidth = VecSrcTy->getNumElements();
+  PacketMask PM;
+  PM.enableAll(SimdWidth);
+  SimdPacket *SrcPacket = scalarize(BC->getOperand(0), PM);
+  VECZ_FAIL_IF(!SrcPacket);
+  // Non-integer element/destination types are handled via same-sized integers.
+  Type *DstTy = BC->getDestTy();
+  Type *DstAsIntTy = DstTy;
+  Type *SrcEleTy = VecSrcTy->getElementType();
+  Type *SrcEleAsIntTy = SrcEleTy;
+  unsigned SrcEleBits = SrcEleTy->getScalarSizeInBits();
+  unsigned DstBits = DstTy->getPrimitiveSizeInBits();
+  if (!DstTy->isIntegerTy()) {
+    DstAsIntTy = IntegerType::get(BC->getContext(), DstBits);
+  }
+  if (!SrcEleTy->isIntegerTy()) {
+    SrcEleAsIntTy = IntegerType::get(BC->getContext(), SrcEleBits);
+  }
+  // NOTE(review): lane 0 lands in the low bits; confirm for big-endian targets.
+  // Zero-extend each lane, shift it to bit i * SrcEleBits and OR it in.
+  IRBuilder<> B(BC);
+  Value *Result = ConstantInt::getNullValue(DstAsIntTy);
+  for (unsigned i = 0; i < SimdWidth; i++) {
+    Value *Lane = SrcPacket->at(i);
+    if (!SrcEleTy->isIntegerTy()) {
+      Lane = B.CreateBitCast(Lane, SrcEleAsIntTy);
+    }
+    Lane = B.CreateZExt(Lane, DstAsIntTy);
+    Lane = B.CreateShl(Lane, i * SrcEleBits);
+    Result = B.CreateOr(Result, Lane);
+  }
+  if (!DstTy->isIntegerTy()) {
+    Result = B.CreateBitCast(Result, DstTy);
+  }
+  BC->replaceAllUsesWith(Result);
+  IC.deleteInstructionLater(BC);
+  return Result;
+}
+
+SimdPacket *Scalarizer::scalarize(Value *V, PacketMask PM) {
+  auto *VecTy = getVectorType(V);
+  VECZ_ERROR_IF(!VecTy,
+                "We shouldn't be trying to scalarize a non-vector instruction");
+  unsigned SimdWidth = VecTy->getNumElements();
+
+  // Re-use cached packets, but make sure it contains all the lanes we want.
+  // If we have a cached packet with missing lanes, it will be fetched by
+  // getPacket and filled with the new lanes.
+  SimdPacket *CachedPacket = getPacket(V, SimdWidth, false);
+  if (CachedPacket && ((CachedPacket->Mask.Value & PM.Value) == PM.Value)) {
+    return CachedPacket;
+  }
+
+  // This value hasn't been scheduled for scalarization, so extract instead
+  if (!V->getType()->isVoidTy() && ScalarizeSet.count(V) == 0) {
+    return extractLanes(V, PM);
+  }
+
+  // Only instructions can be scalarized at this point.
+  Instruction *Ins = dyn_cast<Instruction>(V);
+  if (!Ins) {
+    if (!V->getType()->isVoidTy()) {
+      return extractLanes(V, PM);
+    } else {
+      return assignScalar(nullptr, V);
+    }
+  }
+
+  // Figure out what kind of instruction it is and try to scalarize it.
+  SimdPacket *Result = nullptr;
+  switch (Ins->getOpcode()) {
+    default:
+      if (Ins->isBinaryOp()) {
+        Result = scalarizeBinaryOp(cast<BinaryOperator>(V), PM);
+      } else if (Ins->isCast()) {
+        Result = scalarizeCast(cast<CastInst>(V), PM);
+      } else if (Ins->isUnaryOp()) {
+        Result = scalarizeUnaryOp(cast<UnaryOperator>(V), PM);
+      }
+      break;
+    case Instruction::GetElementPtr:
+      Result = scalarizeGEP(cast<GetElementPtrInst>(V), PM);
+      break;
+    case Instruction::Store:
+      Result = scalarizeStore(cast<StoreInst>(V), PM);
+      break;
+    case Instruction::Load:
+      Result = scalarizeLoad(cast<LoadInst>(V), PM);
+      break;
+    case Instruction::Call:
+      Result = scalarizeCall(cast<CallInst>(V), PM);
+      break;
+    case Instruction::ICmp:
+      Result = scalarizeICmp(cast<ICmpInst>(V), PM);
+      break;
+    case Instruction::FCmp:
+      Result = scalarizeFCmp(cast<FCmpInst>(V), PM);
+      break;
+    case Instruction::Select:
+      Result = scalarizeSelect(cast<SelectInst>(V), PM);
+      break;
+    case Instruction::ShuffleVector:
+      Result = scalarizeShuffleVector(cast<ShuffleVectorInst>(V), PM);
+      break;
+    case Instruction::InsertElement:
+      Result = scalarizeInsertElement(cast<InsertElementInst>(V), PM);
+      break;
+    case Instruction::PHI:
+      Result = scalarizePHI(cast<PHINode>(V), PM);
+      break;
+      // Freeze instruction is not available in LLVM versions prior 10.0
+      // and not used in LLVM versions prior to 11.0
+    case Instruction::Freeze:
+      Result = scalarizeFreeze(cast<FreezeInst>(V), PM);
+      break;
+  }
+  // A null Result means no handler above produced a packet for Ins.
+  if (Result) {
+    scalarizeDI(Ins, Result, SimdWidth);
+    return assignScalar(Result, V);
+  } else {
+    // Scalarization failed: drop Ins from the scalarization set and record it
+    // as a failure so scalar leaves don't retry it, then fall back to
+    // extracting its elements. NOTE(review): a failed store has no value to
+    // extract from; confirm extractLanes is safe on void-typed instructions.
+    ScalarizeSet.erase(Ins);
+    Failures.insert(Ins);
+    return extractLanes(V, PM);
+  }
+}
+
+SimdPacket *Scalarizer::extractLanes(llvm::Value *V, PacketMask PM) {
+  auto *VecTy = getVectorType(V);
+  VECZ_FAIL_IF(!VecTy);
+  unsigned SimdWidth = VecTy->getNumElements();
+  SimdPacket *P = getPacket(V, SimdWidth);
+  // Constant vectors need no new instructions: reuse their element constants.
+  if (Constant *CVec = dyn_cast<Constant>(V)) {
+    assert(isa<FixedVectorType>(CVec->getType()) && "Invalid constant type!");
+    SimdPacket *P = getPacket(CVec, SimdWidth);
+    for (unsigned i = 0; i < SimdWidth; i++) {
+      if (!PM.isEnabled(i) || P->at(i)) {
+        continue;
+      }
+      P->set(i, CVec->getAggregateElement(i));
+    }
+    return P;
+  }
+  // NOTE(review): UndefValue is a Constant, so this looks unreachable; confirm.
+  if (isa<UndefValue>(V)) {
+    Value *ScalarUndef = UndefValue::get(VecTy->getElementType());
+    SimdPacket *P = getPacket(V, SimdWidth);
+    for (unsigned i = 0; i < SimdWidth; i++) {
+      if (!PM.isEnabled(i) || P->at(i)) {
+        continue;
+      }
+      P->set(i, ScalarUndef);
+    }
+    return P;
+  }
+
+  Instruction *insert = nullptr;
+  // Arguments are extracted in the entry block, instructions right after V.
+  if (auto *Arg = dyn_cast<Argument>(V)) {
+    BasicBlock &Entry = Arg->getParent()->getEntryBlock();
+
+    // Make sure we start inserting new instructions after any allocas
+    auto insertAfter = Entry.begin();
+
+    while (isa<AllocaInst>(*insertAfter)) {
+      insertAfter++;
+    }
+    insert = &*insertAfter;
+  } else if (auto *Inst = dyn_cast<Instruction>(V)) {
+    insert = Inst->getNextNonDebugInstruction();
+    while (isa<PHINode>(insert)) {
+      insert = insert->getNextNonDebugInstruction();
+    }
+  } else {
+    return nullptr;
+  }
+
+  const SimplifyQuery Q(F.getParent()->getDataLayout());
+
+  IRBuilder<> B(insert);
+  for (unsigned i = 0; i < SimdWidth; i++) {
+    if (!PM.isEnabled(i) || P->at(i)) {
+      continue;
+    }
+    // Prefer a simplified value; only emit an extractelement if there is none.
+    Value *Idx = B.getInt32(i);
+#if LLVM_VERSION_GREATER_EQUAL(15, 0)
+    Value *Extract = simplifyExtractElementInst(V, Idx, Q);
+#else
+    Value *Extract = SimplifyExtractElementInst(V, Idx, Q);
+#endif
+    if (!Extract) {
+      Extract = B.CreateExtractElement(V, Idx);
+    }
+    P->set(i, Extract);
+  }
+  return P;
+}
+
+void Scalarizer::scalarizeDI(Instruction *Original, const SimdPacket *Packet,
+                             unsigned Width) {
+  // Don't support scalarizing PHI nodes
+  if (!Packet || !Original || isa<PHINode>(Original)) {
+    return;
+  }
+  // Find the metadata wrappers for Original; without them nothing refers to it.
+  auto *const LAM = LocalAsMetadata::getIfExists(Original);
+  if (!LAM) {
+    return;
+  }
+
+  auto *const MDV = MetadataAsValue::getIfExists(Original->getContext(), LAM);
+  if (!MDV) {
+    return;
+  }
+
+  // Contains processed SIMD values for which we create scalar debug
+  // instructions and is used to avoid duplicate LLVM dbg.value's.
+  SmallPtrSet<Value *, 4> VectorElements;
+  // NOTE(review): shared by every user below, so later users skip seen lanes.
+  DIBuilder DIB(*Original->getModule(), false);
+  for (User *U : MDV->users()) {
+    DILocalVariable *DILocal = nullptr;
+    DebugLoc DILoc;
+
+    // These methods aren't virtual in DbgInfoIntrinsic for some reason
+    // TODO CA-1115 - Support llvm.dbg.addr() intrinsic
+    if (DbgValueInst *const DVI = dyn_cast<DbgValueInst>(U)) {
+      DILocal = DVI->getVariable();
+      DILoc = DVI->getDebugLoc();
+    } else if (DbgDeclareInst *const DDI = dyn_cast<DbgDeclareInst>(U)) {
+      DILocal = DDI->getVariable();
+      DILoc = DDI->getDebugLoc();
+    } else {
+      continue;
+    }
+
+    // Create new llvm.dbg.value() intrinsic across enabled SIMD lanes
+    const auto bitSize = Original->getType()->getScalarSizeInBits();
+    for (unsigned lane = 0; lane < Width; ++lane) {
+      Value *LaneVal = Packet->at(lane);
+      if (LaneVal && !isa<UndefValue>(LaneVal)) {
+        // Check if the LaneVal SIMD Value is already processed
+        // and a Debug Value Intrinsic has been created for it.
+        if (VectorElements.find(LaneVal) != VectorElements.end()) {
+          continue;
+        }
+        // DWARF bit piece expressions are used to describe part of an
+        // aggregate variable, our vector, which is fragmented across multiple
+        // values. First argument takes the offset of the piece, and the second
+        // takes the piece size.
+        auto DIExpr = *DIExpression::createFragmentExpression(
+            DIB.createExpression(), lane * bitSize, bitSize);
+        DIB.insertDbgValueIntrinsic(LaneVal, DILocal, DIExpr, DILoc, Original);
+        VectorElements.insert(LaneVal);
+      }
+    }
+  }
+}
+
+SimdPacket *Scalarizer::assignScalar(SimdPacket *P, Value *V) {
+  if (!P) {
+    emitVeczRemarkMissed(&F, V, "Could not scalarize");
+    return nullptr;
+  }
+  // Scalarized: count it and queue V, if it is an instruction, for deletion.
+  ++VeczScalarized;
+  auto *const Original = dyn_cast<Instruction>(V);
+  if (Original) IC.deleteInstructionLater(Original);
+  return P;
+}
+
+SimdPacket *Scalarizer::scalarizeLoad(LoadInst *Load, PacketMask PM) {
+  Value *VecPtr = Load->getPointerOperand();
+  PointerType *VecPtrTy = cast<PointerType>(VecPtr->getType());
+  auto *VecDataTy = dyn_cast<FixedVectorType>(Load->getType());
+  VECZ_FAIL_IF(!VecDataTy);
+  unsigned SimdWidth = VecDataTy->getNumElements();
+  // Scalar loads use an element pointer in the vector pointer's address space.
+  Type *ScalarEleTy = VecDataTy->getElementType();
+  PointerType *ScalarPtrTy =
+      PointerType::get(ScalarEleTy, VecPtrTy->getAddressSpace());
+
+  // Absorb redundant bitcasts
+  Value *ScalarPtrBase = nullptr;
+  if (auto *BitCast = dyn_cast<BitCastInst>(VecPtr)) {
+    // Note that we assume the bitcast isn't used by anything else other than
+    // loads or stores. Other uses of the bitcast are possible in principle,
+    // which cases could be purposely constructed but it is considered unlikely
+    // to occur naturally. If it happens, the DeleteInstructions pass will not
+    // actually delete it so no harm is done in any case.
+    IC.deleteInstructionLater(BitCast);
+    VecPtr = BitCast->getOperand(0);
+    if (BitCast->getSrcTy() == ScalarPtrTy) {
+      ScalarPtrBase = VecPtr;
+    }
+  }
+  GetElementPtrInst *VecPtrGEP = dyn_cast<GetElementPtrInst>(VecPtr);
+  const bool InBounds = (VecPtrGEP && VecPtrGEP->isInBounds());
+
+  IRBuilder<> B(Load);
+  if (!ScalarPtrBase) {
+    ScalarPtrBase = B.CreateBitCast(VecPtr, ScalarPtrTy);
+  }
+  // PtrPacket is a local scratch packet holding one scalar pointer per lane.
+  SimdPacket PtrPacket;
+  SimdPacket *P = getPacket(Load, SimdWidth);
+  PtrPacket.resize(SimdWidth);
+
+  // Emit scalarized pointers.
+  for (unsigned i = 0; i < SimdWidth; i++) {
+    if (!PM.isEnabled(i) || PtrPacket.at(i)) {
+      continue;
+    }
+
+    // Re-use GEPs if available
+    if (P->at(i)) {
+      LoadInst *LoadI = cast<LoadInst>(P->at(i));
+      Value *PtrI = LoadI->getPointerOperand();
+      if (isa<GetElementPtrInst>(PtrI)) {
+        PtrPacket.set(i, PtrI);
+        continue;
+      }
+    }
+
+    Value *ScalarPtr =
+        InBounds
+            ? B.CreateInBoundsGEP(ScalarEleTy, ScalarPtrBase, B.getInt32(i))
+            : B.CreateGEP(ScalarEleTy, ScalarPtrBase, B.getInt32(i));
+    PtrPacket.set(i, ScalarPtr);
+  }
+
+  // Each element is aligned to its size in bytes, capped at the alignment of
+  // the whole vector; a computed alignment of zero becomes one further down.
+  unsigned Alignment = Load->getAlign().value();
+  unsigned EleAlign = ScalarEleTy->getPrimitiveSizeInBits() / 8;
+  if (Alignment < EleAlign) {
+    EleAlign = Alignment;
+  }
+
+  // Emit scalarized loads.
+  for (unsigned i = 0; i < SimdWidth; i++) {
+    if (!PM.isEnabled(i) || P->at(i)) {
+      continue;
+    }
+    LoadInst *NewLoad = B.CreateLoad(ScalarEleTy, PtrPacket.at(i),
+                                     Load->isVolatile(), Load->getName());
+    // Each scalar load inherits the vector load's metadata.
+    NewLoad->copyMetadata(*Load);
+    NewLoad->setAlignment(MaybeAlign(EleAlign).valueOrOne());
+
+    P->set(i, NewLoad);
+  }
+  return P;
+}
+
+SimdPacket *Scalarizer::scalarizeStore(StoreInst *Store, PacketMask PM) {
+  Value *VecPtr = Store->getPointerOperand();
+  assert(VecPtr && "Could not get pointer operand from Store");
+  PointerType *VecPtrTy = cast<PointerType>(VecPtr->getType());
+  auto *VecDataTy =
+      dyn_cast<FixedVectorType>(Store->getValueOperand()->getType());
+  VECZ_FAIL_IF(!VecDataTy);
+  unsigned SimdWidth = VecDataTy->getNumElements();
+  Type *ScalarEleTy = VecDataTy->getElementType();
+  PointerType *ScalarPtrTy =
+      PointerType::get(ScalarEleTy, VecPtrTy->getAddressSpace());
+  Value *VectorData = Store->getValueOperand();
+
+  // Emit scalarized data values.
+  SimdPacket *DataPacket = scalarize(VectorData, PM);
+  VECZ_FAIL_IF(!DataPacket);
+
+  // Absorb redundant bitcasts
+  Value *ScalarPtrBase = nullptr;
+  if (auto *BitCast = dyn_cast<BitCastInst>(VecPtr)) {
+    // See comment at equivalent part of Scalarizer::scalarizeLoad()
+    IC.deleteInstructionLater(BitCast);
+    VecPtr = BitCast->getOperand(0);
+    if (BitCast->getSrcTy() == ScalarPtrTy) {
+      ScalarPtrBase = VecPtr;
+    }
+  }
+  GetElementPtrInst *VecPtrGEP = dyn_cast<GetElementPtrInst>(VecPtr);
+  const bool InBounds = (VecPtrGEP && VecPtrGEP->isInBounds());
+
+  IRBuilder<> B(Store);
+  if (!ScalarPtrBase) {
+    ScalarPtrBase = B.CreateBitCast(VecPtr, ScalarPtrTy);
+  }
+  // PtrPacket is a local scratch packet holding one scalar pointer per lane.
+  SimdPacket PtrPacket;
+  SimdPacket *P = getPacket(Store, SimdWidth);
+  PtrPacket.resize(SimdWidth);
+
+  // Emit scalarized pointers.
+  for (unsigned i = 0; i < SimdWidth; i++) {
+    if (!PM.isEnabled(i) || PtrPacket.at(i)) {
+      continue;
+    }
+
+    // Re-use GEPs if available
+    if (P->at(i)) {
+      StoreInst *StoreI = cast<StoreInst>(P->at(i));
+      Value *PtrI = StoreI->getPointerOperand();
+      if (isa<GetElementPtrInst>(PtrI)) {
+        PtrPacket.set(i, PtrI);
+        continue;
+      }
+    }
+
+    Value *ScalarPtr =
+        InBounds
+            ? B.CreateInBoundsGEP(ScalarEleTy, ScalarPtrBase, B.getInt32(i))
+            : B.CreateGEP(ScalarEleTy, ScalarPtrBase, B.getInt32(i));
+    PtrPacket.set(i, ScalarPtr);
+  }
+
+  // See comment at equivalent part of scalarizeLoad()
+  unsigned Alignment = Store->getAlign().value();
+  unsigned EleAlign = ScalarEleTy->getPrimitiveSizeInBits() / 8;
+  if (Alignment < EleAlign) {
+    EleAlign = Alignment;
+  }
+
+  // Emit scalarized stores.
+  for (unsigned i = 0; i < SimdWidth; i++) {
+    if (!PM.isEnabled(i) || P->at(i)) {
+      continue;
+    }
+    Value *Data = DataPacket->at(i);
+    if (isa<UndefValue>(Data)) {
+      P->set(i, Data);  // Undef data: emit no store for this lane.
+    } else {
+      StoreInst *NewStore =
+          B.CreateStore(Data, PtrPacket.at(i), Store->isVolatile());
+      // Each scalar store inherits the vector store's metadata.
+      NewStore->copyMetadata(*Store);
+      NewStore->setAlignment(MaybeAlign(EleAlign).valueOrOne());
+
+      P->set(i, NewStore);
+    }
+  }
+  return P;
+}
+
+/// @brief Scalarize a vector binary operator lane by lane.
+///
+/// Both operands are scalarized first. A scalar binary operator with the
+/// original opcode, name and IR flags is then emitted for every lane that is
+/// enabled in the mask and not yet present in the result packet.
+///
+/// @param[in] BinOp Vector binary operator to scalarize.
+/// @param[in] PM Mask of the lanes to scalarize.
+///
+/// @return Packet holding the scalar lanes of BinOp. The VECZ_FAIL_IF checks
+/// bail out early if the operand is not a fixed vector or an operand cannot be
+/// scalarized.
+SimdPacket *Scalarizer::scalarizeBinaryOp(BinaryOperator *BinOp,
+                                          PacketMask PM) {
+  IRBuilder<> Builder(BinOp);
+  Value *const Op0 = BinOp->getOperand(0);
+  auto *const OpVecTy = dyn_cast<FixedVectorType>(Op0->getType());
+  VECZ_FAIL_IF(!OpVecTy);
+  const unsigned NumLanes = OpVecTy->getNumElements();
+
+  SimdPacket *const Op0Lanes = scalarize(Op0, PM);
+  VECZ_FAIL_IF(!Op0Lanes);
+  Value *const Op1 = BinOp->getOperand(1);
+  SimdPacket *const Op1Lanes = scalarize(Op1, PM);
+  VECZ_FAIL_IF(!Op1Lanes);
+
+  SimdPacket *const Result = getPacket(BinOp, NumLanes);
+  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+    // Only fill lanes that were requested and are still missing.
+    if (PM.isEnabled(Lane) && !Result->at(Lane)) {
+      Value *const Scalar =
+          Builder.CreateBinOp(BinOp->getOpcode(), Op0Lanes->at(Lane),
+                              Op1Lanes->at(Lane), BinOp->getName());
+      // CreateBinOp returns a plain Value, so the flags are only copied when
+      // a BinaryOperator was actually produced.
+      if (auto *const ScalarBinOp = dyn_cast<BinaryOperator>(Scalar)) {
+        ScalarBinOp->copyIRFlags(BinOp);
+      }
+      Result->set(Lane, Scalar);
+    }
+  }
+  return Result;
+}
+
+/// @brief Scalarize a vector freeze instruction lane by lane.
+///
+/// The freeze instruction is not available in LLVM versions prior to 10.0 and
+/// is not used in LLVM versions prior to 11.0.
+///
+/// @param[in] FreezeI Vector freeze instruction to scalarize.
+/// @param[in] PM Mask of the lanes to scalarize.
+///
+/// @return Packet holding one scalar freeze per requested lane. The
+/// VECZ_FAIL_IF checks bail out early if the operand is not a fixed vector or
+/// cannot be scalarized.
+SimdPacket *Scalarizer::scalarizeFreeze(FreezeInst *FreezeI, PacketMask PM) {
+  IRBuilder<> Builder(FreezeI);
+  Value *const Operand = FreezeI->getOperand(0);
+  auto *const OperandVecTy = dyn_cast<FixedVectorType>(Operand->getType());
+  VECZ_FAIL_IF(!OperandVecTy);
+  const unsigned NumLanes = OperandVecTy->getNumElements();
+  SimdPacket *const OperandLanes = scalarize(Operand, PM);
+  VECZ_FAIL_IF(!OperandLanes);
+
+  SimdPacket *const Result = getPacket(FreezeI, NumLanes);
+  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+    // Only fill lanes that were requested and are still missing.
+    if (PM.isEnabled(Lane) && !Result->at(Lane)) {
+      Result->set(Lane, Builder.CreateFreeze(OperandLanes->at(Lane),
+                                             FreezeI->getName()));
+    }
+  }
+  return Result;
+}
+
+/// @brief Scalarize a vector unary operator lane by lane.
+///
+/// A scalar unary operator with the original opcode, name and IR flags is
+/// emitted for every lane that is enabled in the mask and not yet present in
+/// the result packet.
+///
+/// @param[in] UnOp Vector unary operator to scalarize.
+/// @param[in] PM Mask of the lanes to scalarize.
+///
+/// @return Packet holding the scalar lanes of UnOp. The VECZ_FAIL_IF checks
+/// bail out early if the operand is not a fixed vector or cannot be
+/// scalarized.
+SimdPacket *Scalarizer::scalarizeUnaryOp(UnaryOperator *UnOp, PacketMask PM) {
+  IRBuilder<> Builder(UnOp);
+  Value *const Operand = UnOp->getOperand(0);
+  auto *const OperandVecTy = dyn_cast<FixedVectorType>(Operand->getType());
+  VECZ_FAIL_IF(!OperandVecTy);
+  const unsigned NumLanes = OperandVecTy->getNumElements();
+  SimdPacket *const OperandLanes = scalarize(Operand, PM);
+  VECZ_FAIL_IF(!OperandLanes);
+
+  SimdPacket *const Result = getPacket(UnOp, NumLanes);
+  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+    // Only fill lanes that were requested and are still missing.
+    if (PM.isEnabled(Lane) && !Result->at(Lane)) {
+      Value *const Scalar = Builder.CreateUnOp(
+          UnOp->getOpcode(), OperandLanes->at(Lane), UnOp->getName());
+      // CreateUnOp returns a plain Value, so the flags are only copied when a
+      // UnaryOperator was actually produced.
+      if (auto *const ScalarUnOp = dyn_cast<UnaryOperator>(Scalar)) {
+        ScalarUnOp->copyIRFlags(UnOp);
+      }
+      Result->set(Lane, Scalar);
+    }
+  }
+  return Result;
+}
+
+/// @brief Scalarize a vector cast instruction.
+///
+/// Bitcasts are forwarded to scalarizeBitCast(). The integer, floating-point
+/// and address-space casts listed below are scalarized lane by lane; any other
+/// cast opcode is rejected.
+///
+/// @param[in] CastI Vector cast to scalarize.
+/// @param[in] PM Mask of the lanes to scalarize.
+///
+/// @return Packet holding the scalar casts, or null if the cast opcode is not
+/// supported. The VECZ_*FAIL_IF checks bail out early when the source and
+/// destination are not fixed vectors of equal width or the source cannot be
+/// scalarized.
+SimdPacket *Scalarizer::scalarizeCast(CastInst *CastI, PacketMask PM) {
+  const CastInst::CastOps Opcode = CastI->getOpcode();
+
+  // Bitcasts may change the number of lanes and have their own handler.
+  if (Opcode == CastInst::BitCast) {
+    return scalarizeBitCast(cast<BitCastInst>(CastI), PM);
+  }
+
+  // Make sure we support the cast operation.
+  switch (Opcode) {
+    case CastInst::Trunc:
+    case CastInst::ZExt:
+    case CastInst::SExt:
+    case CastInst::FPToUI:
+    case CastInst::FPToSI:
+    case CastInst::UIToFP:
+    case CastInst::SIToFP:
+    case CastInst::FPTrunc:
+    case CastInst::FPExt:
+    case CastInst::AddrSpaceCast:
+      break;
+    default:
+      return nullptr;
+  }
+
+  // Scalarize the source value.
+  IRBuilder<> Builder(CastI);
+  Value *const Operand = CastI->getOperand(0);
+  auto *const SrcVecTy = dyn_cast<FixedVectorType>(Operand->getType());
+  VECZ_FAIL_IF(!SrcVecTy);
+  const unsigned NumLanes = SrcVecTy->getNumElements();
+  auto *const DstVecTy = dyn_cast<FixedVectorType>(CastI->getType());
+  VECZ_STAT_FAIL_IF(!DstVecTy || (DstVecTy->getNumElements() != NumLanes),
+                    VeczScalarizeFailCast);
+  SimdPacket *const OperandLanes = scalarize(Operand, PM);
+  VECZ_FAIL_IF(!OperandLanes);
+
+  // Create scalarized casts.
+  SimdPacket *const Result = getPacket(CastI, NumLanes);
+  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+    if (PM.isEnabled(Lane) && !Result->at(Lane)) {
+      Value *const OperandLane = OperandLanes->at(Lane);
+      VECZ_FAIL_IF(!OperandLane);
+      Result->set(Lane,
+                  Builder.CreateCast(Opcode, OperandLane,
+                                     DstVecTy->getElementType(),
+                                     CastI->getName()));
+    }
+  }
+  return Result;
+}
+
+/// @brief Scalarize a bitcast that produces a fixed vector.
+///
+/// Two cases are handled:
+///  - The source is not a vector, or has a different number of elements than
+///    the destination: the source is viewed as one wide integer and each
+///    destination lane is extracted with a mask, a shift and a truncation.
+///  - Source and destination have the same number of elements: each source
+///    lane is bitcast to the destination element type.
+///
+/// Casts where exactly one side is a 3-element vector are rejected.
+///
+/// @param[in] BC Bitcast to scalarize.
+/// @param[in] PM Mask of the lanes to scalarize.
+///
+/// @return Packet holding the per-lane values; the VECZ_*FAIL_IF checks bail
+/// out early on failure.
+SimdPacket *Scalarizer::scalarizeBitCast(BitCastInst *BC, PacketMask PM) {
+  IRBuilder<> B(BC);
+  Type *SrcTy = BC->getSrcTy();
+  Value *Src = BC->getOperand(0);
+  auto *VecSrcTy = dyn_cast<FixedVectorType>(SrcTy);
+  auto *VecDstTy = dyn_cast<FixedVectorType>(BC->getDestTy());
+  VECZ_FAIL_IF(!VecDstTy);
+  unsigned SimdWidth = VecDstTy->getNumElements();
+  bool Vec3Src = VecSrcTy && (VecSrcTy->getNumElements() == 3);
+  bool Vec3Dst = (SimdWidth == 3);
+  // Fail when only one of the two types is a 3-element vector.
+  VECZ_STAT_FAIL_IF(Vec3Src ^ Vec3Dst, VeczScalarizeFailBitcast);
+
+  // Handle non-vector -> vector casts and vector casts with different widths.
+  // This is done by casting the source to an integer and doing bitwise
+  // extractions with ANDs and shifts.
+  if (!VecSrcTy || (VecSrcTy->getNumElements() != SimdWidth)) {
+    Type *SrcAsIntTy = SrcTy;
+    Value *SrcAsInt = Src;
+    Type *DstEleTy = VecDstTy->getElementType();
+    Type *DstEleAsIntTy = DstEleTy;
+    unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();
+    unsigned LaneBits = DstEleTy->getPrimitiveSizeInBits();
+    if (!SrcTy->isIntegerTy()) {
+      // Reinterpret the whole source as a single integer of the same size.
+      SrcAsIntTy = SrcTy->getIntNTy(BC->getContext(), SrcBits);
+      SrcAsInt = B.CreateBitCast(SrcAsInt, SrcAsIntTy);
+      SrcAsInt = scalarizeOperands(cast<Instruction>(SrcAsInt));
+    }
+    if (!DstEleTy->isIntegerTy()) {
+      // Lanes are extracted as integers and bitcast to DstEleTy afterwards.
+      DstEleAsIntTy = IntegerType::get(BC->getContext(), LaneBits);
+    }
+
+    SimdPacket *P = getPacket(BC, SimdWidth);
+    for (unsigned i = 0; i < SimdWidth; i++) {
+      if (!PM.isEnabled(i) || P->at(i)) {
+        continue;
+      }
+      // Build ((1 << LaneBits) - 1) << (i * LaneBits): LaneBits ones placed
+      // at the bit position of lane i.
+      // NOTE(review): lane i is taken from bits [i * LaneBits, (i + 1) *
+      // LaneBits) of the integer, which looks like a little-endian layout
+      // assumption - confirm for big-endian targets.
+      APInt LaneMask(SrcBits, 1);
+      LaneMask = LaneMask.shl(LaneBits);
+      LaneMask -= APInt(SrcBits, 1);
+      LaneMask = LaneMask.shl(i * LaneBits);
+      Value *LaneMaskVal = ConstantInt::get(SrcAsIntTy, LaneMask);
+      Value *Lane = B.CreateAnd(SrcAsInt, LaneMaskVal);
+      Lane = B.CreateLShr(Lane, LaneBits * i);
+      Lane = B.CreateTrunc(Lane, DstEleAsIntTy);
+      if (!DstEleTy->isIntegerTy()) {
+        Lane = B.CreateBitCast(Lane, DstEleTy);
+      }
+      P->set(i, Lane);
+    }
+    return P;
+  }
+
+  // Handle vector -> vector casts, quite a more straightforward affair.
+  SimdPacket *SrcPacket = scalarize(Src, PM);
+  VECZ_FAIL_IF(!SrcPacket);
+  Type *DstEleTy = VecDstTy->getElementType();
+  SimdPacket *P = getPacket(BC, SimdWidth);
+  for (unsigned i = 0; i < SimdWidth; i++) {
+    if (!PM.isEnabled(i) || P->at(i)) {
+      continue;
+    }
+    Value *NewVal = B.CreateBitCast(SrcPacket->at(i), DstEleTy);
+    P->set(i, NewVal);
+  }
+  return P;
+}
+
+/// @brief Scalarize a vector integer comparison lane by lane.
+///
+/// @param[in] ICmp Vector integer comparison to scalarize.
+/// @param[in] PM Mask of the lanes to scalarize.
+///
+/// @return Packet holding one scalar comparison, with the original predicate
+/// and name, per requested lane. The VECZ_FAIL_IF checks bail out early if the
+/// result is not a fixed vector or an operand cannot be scalarized.
+SimdPacket *Scalarizer::scalarizeICmp(ICmpInst *ICmp, PacketMask PM) {
+  IRBuilder<> Builder(ICmp);
+  Value *const Op0 = ICmp->getOperand(0);
+  // The lane count comes from the comparison's result type.
+  auto *const ResultVecTy = dyn_cast<FixedVectorType>(ICmp->getType());
+  VECZ_FAIL_IF(!ResultVecTy);
+  const unsigned NumLanes = ResultVecTy->getNumElements();
+
+  SimdPacket *const Op0Lanes = scalarize(Op0, PM);
+  VECZ_FAIL_IF(!Op0Lanes);
+  Value *const Op1 = ICmp->getOperand(1);
+  SimdPacket *const Op1Lanes = scalarize(Op1, PM);
+  VECZ_FAIL_IF(!Op1Lanes);
+
+  SimdPacket *const Result = getPacket(ICmp, NumLanes);
+  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+    // Only fill lanes that were requested and are still missing.
+    if (PM.isEnabled(Lane) && !Result->at(Lane)) {
+      Result->set(Lane,
+                  Builder.CreateICmp(ICmp->getPredicate(), Op0Lanes->at(Lane),
+                                     Op1Lanes->at(Lane), ICmp->getName()));
+    }
+  }
+  return Result;
+}
+
+/// @brief Scalarize a vector floating-point comparison lane by lane.
+///
+/// @param[in] FCmp Vector floating-point comparison to scalarize.
+/// @param[in] PM Mask of the lanes to scalarize.
+///
+/// @return Packet holding one scalar comparison, with the original predicate
+/// and name, per requested lane. The VECZ_FAIL_IF checks bail out early if the
+/// result is not a fixed vector or an operand cannot be scalarized.
+SimdPacket *Scalarizer::scalarizeFCmp(FCmpInst *FCmp, PacketMask PM) {
+  IRBuilder<> Builder(FCmp);
+  Value *const Op0 = FCmp->getOperand(0);
+  // The lane count comes from the comparison's result type.
+  auto *const ResultVecTy = dyn_cast<FixedVectorType>(FCmp->getType());
+  VECZ_FAIL_IF(!ResultVecTy);
+  const unsigned NumLanes = ResultVecTy->getNumElements();
+
+  SimdPacket *const Op0Lanes = scalarize(Op0, PM);
+  VECZ_FAIL_IF(!Op0Lanes);
+  Value *const Op1 = FCmp->getOperand(1);
+  SimdPacket *const Op1Lanes = scalarize(Op1, PM);
+  VECZ_FAIL_IF(!Op1Lanes);
+
+  SimdPacket *const Result = getPacket(FCmp, NumLanes);
+  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+    // Only fill lanes that were requested and are still missing.
+    if (PM.isEnabled(Lane) && !Result->at(Lane)) {
+      Result->set(Lane,
+                  Builder.CreateFCmp(FCmp->getPredicate(), Op0Lanes->at(Lane),
+                                     Op1Lanes->at(Lane), FCmp->getName()));
+    }
+  }
+  return Result;
+}
+
+/// @brief Scalarize a vector select lane by lane.
+///
+/// A vector condition is scalarized and applied per lane; a scalar condition
+/// is shared by all lanes.
+///
+/// @param[in] Select Vector select to scalarize.
+/// @param[in] PM Mask of the lanes to scalarize.
+///
+/// @return Packet holding one scalar select per requested lane. The
+/// VECZ_FAIL_IF checks bail out early if the result is not a fixed vector or
+/// an operand cannot be scalarized.
+SimdPacket *Scalarizer::scalarizeSelect(SelectInst *Select, PacketMask PM) {
+  IRBuilder<> Builder(Select);
+
+  // Only a vector condition needs a packet of its own.
+  Value *const Condition = Select->getCondition();
+  SimdPacket *ConditionLanes = nullptr;
+  if (Condition->getType()->isVectorTy()) {
+    ConditionLanes = scalarize(Condition, PM);
+    VECZ_FAIL_IF(!ConditionLanes);
+  }
+
+  Value *const OnTrue = Select->getTrueValue();
+  auto *const ResultVecTy = dyn_cast<FixedVectorType>(Select->getType());
+  VECZ_FAIL_IF(!ResultVecTy);
+  const unsigned NumLanes = ResultVecTy->getNumElements();
+  SimdPacket *const OnTrueLanes = scalarize(OnTrue, PM);
+  VECZ_FAIL_IF(!OnTrueLanes);
+  Value *const OnFalse = Select->getFalseValue();
+  SimdPacket *const OnFalseLanes = scalarize(OnFalse, PM);
+  VECZ_FAIL_IF(!OnFalseLanes);
+
+  SimdPacket *const Result = getPacket(Select, NumLanes);
+  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+    // Only fill lanes that were requested and are still missing.
+    if (PM.isEnabled(Lane) && !Result->at(Lane)) {
+      Value *LaneCondition = Condition;
+      if (ConditionLanes) {
+        LaneCondition = ConditionLanes->at(Lane);
+      }
+      Result->set(Lane, Builder.CreateSelect(LaneCondition,
+                                             OnTrueLanes->at(Lane),
+                                             OnFalseLanes->at(Lane),
+                                             Select->getName()));
+    }
+  }
+  return Result;
+}
+
+/// @brief Scalarize a call to a masked vector load or store.
+///
+/// The mask (and, for stores, the data) is scalarized, a scalar element
+/// pointer is computed for each requested lane, and one masked scalar memory
+/// operation is created per lane with createMaskedLoad()/createMaskedStore().
+///
+/// @param[in] CI Call to the masked memory operation.
+/// @param[in] PM Mask of the lanes to scalarize.
+/// @param[in] MaskedOp Memory-operation view of CI; must be a load or a store.
+///
+/// @return Packet holding the per-lane memory operations; the VECZ_*FAIL_IF
+/// checks bail out early on failure.
+SimdPacket *Scalarizer::scalarizeMaskedMemOp(CallInst *CI, PacketMask PM,
+                                             MemOp &MaskedOp) {
+  Function *Callee = CI->getCalledFunction();
+  VECZ_STAT_FAIL_IF(!Callee, VeczScalarizeFailCall);
+  auto *VecDataTy = getVectorType(CI);
+  VECZ_FAIL_IF(!VecDataTy);
+  unsigned SimdWidth = VecDataTy->getNumElements();
+  assert((MaskedOp.isLoad() || MaskedOp.isStore()) &&
+         "Masked op is not a store or load!");
+
+  // Scalarize mask
+  Value *MaskOperand = MaskedOp.getMaskOperand();
+  VECZ_FAIL_IF(!MaskOperand);
+  SimdPacket *MaskPacket = scalarize(MaskedOp.getMaskOperand(), PM);
+  VECZ_FAIL_IF(!MaskPacket);
+
+  Value *VecPtr = MaskedOp.getPointerOperand();
+  VECZ_FAIL_IF(!VecPtr);
+
+  // Scalarize data packet if this is a store
+  SimdPacket *DataPacket = nullptr;
+  if (MaskedOp.isStore()) {
+    DataPacket = scalarize(MaskedOp.getDataOperand(), PM);
+    VECZ_FAIL_IF(!DataPacket);
+  }
+
+  // Pointer to a single element, in the address space of the vector pointer.
+  PointerType *VecPtrTy = cast<PointerType>(VecPtr->getType());
+  Type *ScalarEleTy = VecDataTy->getElementType();
+  PointerType *ScalarPtrTy =
+      PointerType::get(ScalarEleTy, VecPtrTy->getAddressSpace());
+
+  // Absorb redundant bitcasts: look through a bitcast on the pointer, queue
+  // the bitcast for later deletion, and reuse its source directly as the base
+  // when that source already has the scalar pointer type.
+  Value *ScalarPtrBase = nullptr;
+  if (auto *BitCast = dyn_cast<BitCastInst>(VecPtr)) {
+    IC.deleteInstructionLater(BitCast);
+    VecPtr = BitCast->getOperand(0);
+    if (BitCast->getSrcTy() == ScalarPtrTy) {
+      ScalarPtrBase = VecPtr;
+    }
+  }
+  // The per-lane GEPs are in-bounds only if the vector pointer is itself an
+  // in-bounds GEP.
+  GetElementPtrInst *VecPtrGEP = dyn_cast<GetElementPtrInst>(VecPtr);
+  const bool InBounds = (VecPtrGEP && VecPtrGEP->isInBounds());
+
+  IRBuilder<> B(CI);
+  if (!ScalarPtrBase) {
+    ScalarPtrBase = B.CreateBitCast(VecPtr, ScalarPtrTy);
+  }
+
+  // PtrPacket is local to this call; P is the result packet for CI.
+  SimdPacket PtrPacket;
+  SimdPacket *P = getPacket(CI, SimdWidth);
+  PtrPacket.resize(SimdWidth);
+
+  // Create scalar pointers
+  for (unsigned i = 0; i < SimdWidth; i++) {
+    if (!PM.isEnabled(i) || PtrPacket.at(i)) {
+      continue;
+    }
+
+    Value *ScalarPtr =
+        InBounds
+            ? B.CreateInBoundsGEP(ScalarEleTy, ScalarPtrBase, B.getInt32(i))
+            : B.CreateGEP(ScalarEleTy, ScalarPtrBase, B.getInt32(i));
+    PtrPacket.set(i, ScalarPtr);
+  }
+
+  // Per-element alignment: the smaller of the operation's alignment and the
+  // element size in bytes.
+  // NOTE(review): for element types narrower than 8 bits the division below
+  // yields 0 - confirm createMaskedLoad/createMaskedStore handle that.
+  unsigned Alignment = MaskedOp.getAlignment();
+  unsigned EleAlign = ScalarEleTy->getPrimitiveSizeInBits() / 8;
+  if (Alignment < EleAlign) {
+    EleAlign = Alignment;
+  }
+
+  // Create one masked scalar load or store per requested lane.
+  for (unsigned i = 0; i < SimdWidth; i++) {
+    if (!PM.isEnabled(i) || P->at(i)) {
+      continue;
+    }
+    Instruction *ScalarMemOp = nullptr;
+    if (MaskedOp.isLoad()) {
+      ScalarMemOp =
+          createMaskedLoad(Ctx, ScalarEleTy, PtrPacket.at(i), MaskPacket->at(i),
+                           /*EVL*/ nullptr, EleAlign);
+    } else {
+      ScalarMemOp = createMaskedStore(Ctx, DataPacket->at(i), PtrPacket.at(i),
+                                      MaskPacket->at(i),
+                                      /*EVL*/ nullptr, EleAlign);
+    }
+    VECZ_FAIL_IF(!ScalarMemOp);
+    // The helpers return an instruction that is inserted here, before CI.
+    B.Insert(ScalarMemOp);
+    P->set(i, ScalarMemOp);
+  }
+
+  return P;
+}
+
+/// @brief Scalarize a call to a vector builtin.
+///
+/// Masked memory operations are forwarded to scalarizeMaskedMemOp(). For any
+/// other call, the builtin's scalar equivalent is looked up and called once
+/// per requested lane, with vector arguments replaced by their scalarized
+/// lanes. Calls to masked wrapper functions are scalarized through the
+/// original function and then wrapped in a masked call again.
+///
+/// @param[in] CI Call to scalarize.
+/// @param[in] PM Mask of the lanes to scalarize.
+///
+/// @return Packet holding the per-lane calls; the VECZ_*FAIL_IF checks bail
+/// out early on failure (e.g. indirect call or no scalar equivalent).
+SimdPacket *Scalarizer::scalarizeCall(CallInst *CI, PacketMask PM) {
+  compiler::utils::BuiltinInfo &BI = Ctx.builtins();
+  Function *Callee = CI->getCalledFunction();
+  VECZ_STAT_FAIL_IF(!Callee, VeczScalarizeFailCall);
+  auto *VecDataTy = getVectorType(CI);
+  VECZ_FAIL_IF(!VecDataTy);
+  unsigned SimdWidth = VecDataTy->getNumElements();
+
+  if (auto MaskedOp = MemOp::get(CI, MemOpAccessKind::Masked)) {
+    if (MaskedOp->isMaskedMemOp()) {
+      return scalarizeMaskedMemOp(CI, PM, *MaskedOp);
+    }
+  }
+
+  Value *VectorCallMask = nullptr;
+  if (Ctx.isMaskedFunction(Callee)) {
+    // We have a masked call to a function.
+    // Extract the mask from the call, we need to re-apply it later
+    VectorCallMask = CI->getArgOperand(CI->arg_size() - 1);
+
+    // Get the original function call from the masked wrapper function
+    Function *originalFunc = Ctx.getOriginalMaskedFunction(Callee);
+    Callee = originalFunc;
+  }
+
+  auto const Builtin = BI.analyzeBuiltin(*Callee);
+  Function *ScalarEquiv = BI.getScalarEquivalent(Builtin, F.getParent());
+  VECZ_STAT_FAIL_IF(!ScalarEquiv, VeczScalarizeFailBuiltin);
+
+  IRBuilder<> B(CI);
+  auto const Props = Builtin.properties;
+  // Ignore the mask if present
+  unsigned NumArgs = VectorCallMask ? CI->arg_size() - 1 : CI->arg_size();
+  // For each argument exactly one of these is expected to be set:
+  // OpPackets[i] for per-lane values, OpScalars[i] for values passed through
+  // unchanged to every lane.
+  SmallVector<SimdPacket *, 4> OpPackets(NumArgs);
+  SmallVector<Value *, 4> OpScalars(NumArgs);
+  for (unsigned i = 0; i < NumArgs; i++) {
+    Value *OrigOp = CI->getArgOperand(i);
+    Type *OldTy = OrigOp->getType();
+    if (OldTy->isVectorTy()) {
+      SimdPacket *OpPacket = scalarize(OrigOp, PM);
+      VECZ_FAIL_IF(!OpPacket);
+      OpPackets[i] = OpPacket;
+    } else if (PointerType *OldPtrTy = dyn_cast<PointerType>(OldTy)) {
+      auto *const PtrRetPointeeTy =
+          compiler::utils::getPointerReturnPointeeTy(*Callee, Props);
+      if (PtrRetPointeeTy && PtrRetPointeeTy->isVectorTy()) {
+        // Handle 'pointer return' arguments. The old type was Vector*, the new
+        // type is Scalar*. To accommodate the difference we need to have
+        // individual offsets, one for each 'element pointer'.
+        auto *OldVecTy = cast<FixedVectorType>(PtrRetPointeeTy);
+        VECZ_STAT_FAIL_IF(OldVecTy->getNumElements() != SimdWidth,
+                          VeczScalarizeFailBuiltin);
+        Type *NewTy = PointerType::get(OldVecTy->getElementType(),
+                                       OldPtrTy->getAddressSpace());
+        Value *ScalarAddrBase = B.CreateBitCast(OrigOp, NewTy);
+        SimdPacket *OpPacket = getPacket(ScalarAddrBase, SimdWidth);
+        for (unsigned j = 0; j < SimdWidth; j++) {
+          if (!PM.isEnabled(j) || OpPacket->at(j)) {
+            continue;
+          }
+          Value *ScalarAddr = B.CreateGEP(OldVecTy->getElementType(),
+                                          ScalarAddrBase, B.getInt32(j));
+          OpPacket->set(j, ScalarAddr);
+          // NOTE(review): this assignment is only reached when at least one
+          // lane is newly created. If every requested lane of OpPacket was
+          // already populated, OpPackets[i] stays null and the call fails in
+          // the loop below - confirm whether it should be hoisted out of the
+          // lane loop.
+          OpPackets[i] = OpPacket;
+        }
+      } else {
+        OpScalars[i] = OrigOp;
+      }
+    } else {
+      OpScalars[i] = OrigOp;
+    }
+  }
+
+  // Emit one call to the scalar equivalent per requested lane.
+  SimdPacket *P = getPacket(CI, SimdWidth);
+  for (unsigned j = 0; j < SimdWidth; j++) {
+    if (!PM.isEnabled(j) || P->at(j)) {
+      continue;
+    }
+    // Gather this lane's arguments.
+    SmallVector<Value *, 4> Ops;
+    for (unsigned i = 0; i < NumArgs; i++) {
+      SimdPacket *OpPacket = OpPackets[i];
+      if (OpPacket) {
+        Ops.push_back(OpPacket->at(j));
+      } else {
+        Value *OrigOp = OpScalars[i];
+        VECZ_FAIL_IF(!OrigOp);
+        Ops.push_back(OrigOp);
+      }
+    }
+
+    CallInst *NewCI = B.CreateCall(ScalarEquiv, Ops, CI->getName());
+    NewCI->setCallingConv(CI->getCallingConv());
+    NewCI->setAttributes(CI->getAttributes());
+    // Re-apply mask. The new CI already has to exist to create the masked
+    // function which is why it gets updated here. We then need to add the
+    // mask argument back to the call, but LLVM won't let us update the existing
+    // one, so recreate the CallInst one last time
+    if (VectorCallMask) {
+      Function *MaskedScalarEquiv = Ctx.getOrCreateMaskedFunction(NewCI);
+      VECZ_FAIL_IF(!MaskedScalarEquiv);
+      Ops.push_back(VectorCallMask);
+      CallInst *NewCIMasked =
+          B.CreateCall(MaskedScalarEquiv, Ops, CI->getName());
+      NewCIMasked->setCallingConv(CI->getCallingConv());
+      NewCIMasked->setAttributes(CI->getAttributes());
+      P->set(j, NewCIMasked);
+      // The unmasked call was only needed to derive the masked wrapper.
+      NewCI->eraseFromParent();
+    } else {
+      P->set(j, NewCI);
+    }
+  }
+  return P;
+}
+
+/// @brief Scalarize a shufflevector instruction.
+///
+/// No new instructions are created: each requested result lane is taken
+/// directly from the scalarized lane of the operand that the shuffle mask
+/// selects, or is undef when the mask element is negative. Only the operand
+/// lanes that are actually referenced get scalarized.
+///
+/// @param[in] Shuffle Shuffle to scalarize.
+/// @param[in] PM Mask of the result lanes to scalarize.
+///
+/// @return Packet holding the per-lane values; the VECZ_FAIL_IF checks bail
+/// out early on failure.
+SimdPacket *Scalarizer::scalarizeShuffleVector(ShuffleVectorInst *Shuffle,
+                                               PacketMask PM) {
+  auto *VecTy = dyn_cast<FixedVectorType>(Shuffle->getType());
+  VECZ_FAIL_IF(!VecTy);
+  Value *LHS = Shuffle->getOperand(0);
+  Value *RHS = Shuffle->getOperand(1);
+  assert(LHS && "Could not get operand 0");
+  assert(RHS && "Could not get operand 1");
+  auto *LHSVecTy = dyn_cast<FixedVectorType>(LHS->getType());
+  VECZ_FAIL_IF(!LHSVecTy);
+  // The operands and the result may have different widths.
+  unsigned SrcWidth = LHSVecTy->getNumElements();
+  unsigned DstWidth = VecTy->getNumElements();
+
+  // Determine which lanes we need from both vector operands.
+  // Mask values in [0, SrcWidth) select from LHS, values >= SrcWidth select
+  // lane (value - SrcWidth) of RHS, and negative values select nothing.
+  PacketMask LHSMask;
+  PacketMask RHSMask;
+  for (unsigned i = 0; i < DstWidth; i++) {
+    if (!PM.isEnabled(i)) {
+      continue;
+    }
+    int MaskLane = Shuffle->getMaskValue(i);
+    if (MaskLane >= static_cast<int>(SrcWidth)) {
+      MaskLane -= static_cast<int>(SrcWidth);
+      RHSMask.enable(static_cast<unsigned>(MaskLane));
+    } else if (MaskLane >= 0) {
+      LHSMask.enable(static_cast<unsigned>(MaskLane));
+    }
+  }
+
+  // Scalarize each vector operand as needed.
+  SimdPacket *LHSPacket = nullptr;
+  if (LHSMask.Value != 0) {
+    LHSPacket = scalarize(LHS, LHSMask);
+    VECZ_FAIL_IF(!LHSPacket);
+  }
+  SimdPacket *RHSPacket = nullptr;
+  if (RHSMask.Value != 0) {
+    RHSPacket = scalarize(RHS, RHSMask);
+    VECZ_FAIL_IF(!RHSPacket);
+  }
+
+  // Copy the scalarized values to the result packet.
+  SimdPacket *P = getPacket(Shuffle, DstWidth);
+  for (unsigned i = 0; i < DstWidth; i++) {
+    if (!PM.isEnabled(i) || P->at(i)) {
+      continue;
+    }
+    Value *Extracted = nullptr;
+    int MaskLane = Shuffle->getMaskValue(i);
+    if (MaskLane < 0) {
+      // A negative mask element yields an undef result lane.
+      Extracted = UndefValue::get(VecTy->getElementType());
+    } else if (MaskLane >= (int)SrcWidth) {
+      MaskLane -= (int)SrcWidth;
+      if (RHSPacket) {
+        Extracted = RHSPacket->at(MaskLane);
+      }
+    } else if (MaskLane >= 0) {
+      if (LHSPacket) {
+        Extracted = LHSPacket->at(MaskLane);
+      }
+    }
+    P->set(i, Extracted);
+  }
+  return P;
+}
+
+/// @brief Scalarize an insertelement instruction.
+///
+/// With a constant index the result lanes are chosen at compile time: the
+/// inserted lane is the new element and every other lane is the scalarized
+/// lane of the vector operand. With a runtime index, each lane becomes a
+/// select between the new element and the original lane.
+///
+/// @param[in] Insert Insert to scalarize.
+/// @param[in] PM Mask of the result lanes to scalarize.
+///
+/// @return Packet holding the per-lane values; the VECZ_FAIL_IF checks bail
+/// out early on failure.
+SimdPacket *Scalarizer::scalarizeInsertElement(InsertElementInst *Insert,
+                                               PacketMask PM) {
+  Value *Vec = Insert->getOperand(0);
+  VECZ_FAIL_IF(!Vec);
+  Value *Ele = Insert->getOperand(1);
+  assert(Ele && "Could not get operand 1 of Insert");
+  Value *Index = Insert->getOperand(2);
+  assert(Index && "Could not get operand 2 of Insert");
+  const ConstantInt *CIndex = dyn_cast<ConstantInt>(Index);
+  const auto *VecTy = cast<FixedVectorType>(Vec->getType());
+  // IndexInt is only meaningful when CIndex is non-null.
+  const unsigned IndexInt = CIndex ? CIndex->getZExtValue() : 0;
+  const unsigned SimdWidth = VecTy->getNumElements();
+
+  SimdPacket *P = getPacket(Insert, SimdWidth);
+
+  // Scalarize the vector operand
+  PacketMask OpMask;
+  OpMask.enableAll(SimdWidth);
+  // If we have a constant index, we can skip the lane we are not going to use
+  if (CIndex) {
+    OpMask.disable(IndexInt);
+  }
+  SimdPacket *VecP = scalarize(Vec, OpMask);
+  VECZ_FAIL_IF(!VecP);
+
+  // For each lane, we need to select either the original vector element (from
+  // VecP) or the new value Ele. The selection is done based on the Index.
+  IRBuilder<> B(Insert);
+  for (unsigned lane = 0; lane < SimdWidth; ++lane) {
+    if (!PM.isEnabled(lane) || P->at(lane)) {
+      continue;
+    }
+    Value *LaneValue = nullptr;
+    if (CIndex) {
+      // If the Index is a Constant, then we can do the selection at compile
+      // time
+      LaneValue = (IndexInt == lane) ? Ele : VecP->at(lane);
+    } else {
+      // If the Index is a runtime value, then we have to emit select
+      // instructions to do selection at runtime
+      Constant *LaneC = ConstantInt::get(Index->getType(), lane);
+      LaneValue =
+          B.CreateSelect(B.CreateICmpEQ(Index, LaneC), Ele, VecP->at(lane));
+    }
+    P->set(lane, LaneValue);
+  }
+
+  return P;
+}
+
+/// @brief Scalarize a GEP that produces a vector of pointers.
+///
+/// One scalar GEP is emitted per requested lane. A vector pointer operand and
+/// any vector indices are scalarized and used lane-wise; scalar operands are
+/// shared unchanged by every lane.
+///
+/// @param[in] GEP Vector GEP to scalarize.
+/// @param[in] PM Mask of the lanes to scalarize.
+///
+/// @return Packet holding the per-lane scalar GEPs; the VECZ_FAIL_IF checks
+/// bail out early if the result is not a fixed vector or an operand cannot be
+/// scalarized.
+SimdPacket *Scalarizer::scalarizeGEP(GetElementPtrInst *GEP, PacketMask PM) {
+  auto *const vecDataTy = dyn_cast<FixedVectorType>(GEP->getType());
+  VECZ_FAIL_IF(!vecDataTy);
+  unsigned simdWidth = vecDataTy->getNumElements();
+
+  Value *const ptr = GEP->getPointerOperand();
+  SimdPacket *ptrPacket = nullptr;
+  if (ptr->getType()->isVectorTy()) {
+    ptrPacket = scalarize(ptr, PM);
+    VECZ_FAIL_IF(!ptrPacket);
+  }
+
+  // Scalarize any vector GEP indices. Scalar indices get a null entry so that
+  // indexPackets stays parallel to the GEP's index operands.
+  SmallVector<SimdPacket *, 4> indexPackets;
+  for (unsigned i = 0, n = GEP->getNumIndices(); i < n; ++i) {
+    Value *const idx = GEP->getOperand(1 + i);
+    if (idx->getType()->isVectorTy()) {
+      SimdPacket *idxP = scalarize(idx, PM);
+      VECZ_FAIL_IF(!idxP);
+      indexPackets.push_back(idxP);
+    } else {
+      indexPackets.push_back(nullptr);
+    }
+  }
+
+  IRBuilder<> B(GEP);
+  bool const inBounds = GEP->isInBounds();
+  auto const name = GEP->getName();
+  SimdPacket *const P = getPacket(GEP, simdWidth);
+  for (unsigned i = 0; i < simdWidth; i++) {
+    if (!PM.isEnabled(i) || P->at(i)) {
+      continue;
+    }
+
+    // Get the GEP indices per lane, scalarized or otherwise
+    SmallVector<Value *, 4> scalarIndices;
+    unsigned indexN = 1U;
+    for (auto *idx : indexPackets) {
+      // A null packet marks a scalar index: it must not be dereferenced, and
+      // the original operand is reused for every lane.
+      Value *const laneIdx = idx ? idx->at(i) : nullptr;
+      if (laneIdx) {
+        scalarIndices.push_back(laneIdx);
+      } else {
+        scalarIndices.push_back(GEP->getOperand(indexN));
+      }
+      ++indexN;
+    }
+
+    auto *const scalarPointer = ptrPacket ? ptrPacket->at(i) : ptr;
+    Value *const newGEP =
+        inBounds ? B.CreateInBoundsGEP(GEP->getSourceElementType(),
+                                       scalarPointer, scalarIndices, name)
+                 : B.CreateGEP(GEP->getSourceElementType(), scalarPointer,
+                               scalarIndices, name);
+
+    P->set(i, newGEP);
+  }
+  return P;
+}
+
+/// @brief Scalarize a vector PHI node.
+///
+/// Empty scalar PHI nodes are created for the requested lanes first and
+/// registered in the packet, so that scalarizing the incoming values can
+/// safely reach this PHI again through a cycle. The incoming values are then
+/// scalarized and attached to the new PHI nodes.
+///
+/// @param[in] Phi Vector PHI node to scalarize.
+/// @param[in] PM Mask of the lanes to scalarize.
+///
+/// @return Packet holding the per-lane PHI nodes; the VECZ_FAIL_IF check
+/// bails out early if an incoming value cannot be scalarized.
+SimdPacket *Scalarizer::scalarizePHI(PHINode *Phi, PacketMask PM) {
+  auto *PhiTy = cast<FixedVectorType>(Phi->getType());
+  const unsigned Width = PhiTy->getNumElements();
+  const unsigned NumIncoming = Phi->getNumIncomingValues();
+  SmallVector<SimdPacket *, 2> Incoming;
+
+  SimdPacket *P = getPacket(Phi, Width);
+  IRBuilder<> B(Phi);
+
+  // Lanes for which a new scalar PHI node is created by this call.
+  SmallVector<unsigned, 4> ActiveLanes;
+
+  // Start by creating the Phi nodes. This is done before everything else
+  // because the IR might contain cycles which will cause the scalarization to
+  // loop back to this Phi node when scalarizing the incoming values.
+  for (unsigned lane = 0; lane < Width; ++lane) {
+    if (!PM.isEnabled(lane) || P->at(lane)) {
+      continue;
+    }
+    PHINode *SPhi =
+        B.CreatePHI(PhiTy->getElementType(), NumIncoming, Phi->getName());
+    P->set(lane, SPhi);
+    ActiveLanes.push_back(lane);
+  }
+
+  // Scalarize the incoming values
+  for (auto &In : Phi->incoming_values()) {
+    SimdPacket *SIn = scalarize(In, PM);
+    VECZ_FAIL_IF(!SIn);
+    Incoming.push_back(SIn);
+  }
+
+  // Assign the scalarized incoming values to the scalarized Phi nodes
+  // Only the PHI nodes created above are filled in; lanes that already had a
+  // value before this call are left untouched.
+  for (unsigned lane : ActiveLanes) {
+    VECZ_ERROR_IF(!PM.isEnabled(lane), "Active lane should be enabled.");
+    PHINode *SPhi = cast<PHINode>(P->at(lane));
+    for (unsigned i = 0; i < NumIncoming; ++i) {
+      SPhi->addIncoming(Incoming[i]->at(lane), Phi->getIncomingBlock(i));
+    }
+  }
+
+  return P;
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/simplify_infinite_loop_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/simplify_infinite_loop_pass.cpp
new file mode 100644
index 0000000000000..e3feb0573839e
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/simplify_infinite_loop_pass.cpp
@@ -0,0 +1,142 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <llvm/IR/Dominators.h>
+#include <llvm/Transforms/Scalar/LoopPassManager.h>
+
+#include <unordered_set>
+
+#include "debugging.h"
+#include "multi_llvm/multi_llvm.h"
+#include "transform/passes.h"
+
+using namespace llvm;
+
+/// @brief Give infinite loops a virtual exit and canonicalize constant latch
+/// branches.
+///
+/// A loop without any exit block gets a new `virtual_exit` block that branches
+/// to the function's single return block. The latch is rewritten to a
+/// conditional branch on `true` whose taken successor is still the loop
+/// header, so the virtual exit is never executed. PHI nodes in the return
+/// block receive an undef incoming value for the new edge, and values defined
+/// outside the return block but used by it are routed through new `.blend`
+/// PHI nodes.
+///
+/// For a loop with exactly one exit block, a latch branching on a constant
+/// condition with the header as its false successor is rewritten so that the
+/// header becomes the true successor.
+///
+/// @param[in] L Loop to process.
+/// @param[in,out] AR Standard loop analyses; the dominator tree is updated.
+///
+/// @return All analyses preserved if nothing changed, otherwise the set
+/// returned by getLoopPassPreservedAnalyses().
+PreservedAnalyses vecz::SimplifyInfiniteLoopPass::run(
+    Loop &L, LoopAnalysisManager &, LoopStandardAnalysisResults &AR,
+    LPMUpdater &) {
+  bool modified = false;
+
+  SmallVector<BasicBlock *, 1> loopExitBlocks;
+  L.getExitBlocks(loopExitBlocks);
+
+  // If we have an infinite loop, create a virtual exit block that will target
+  // the unique exit block of the function.
+  if (loopExitBlocks.empty()) {
+    BasicBlock *latch = L.getLoopLatch();
+    assert(latch && "Loop should have a unique latch.");
+    if (!latch) {
+      // Without a unique latch there is no single back edge to rewrite; leave
+      // the loop alone rather than dereferencing null in release builds.
+      return PreservedAnalyses::all();
+    }
+
+    Function *F = L.getHeader()->getParent();
+
+    // Get the return block of the function.
+    std::vector<BasicBlock *> returnBlocks;
+    for (BasicBlock &BB : *F) {
+      if (isa<ReturnInst>(BB.getTerminator())) {
+        returnBlocks.push_back(&BB);
+      }
+    }
+
+    if (returnBlocks.size() != 1) {
+      assert(false && "Function should have only one exit.");
+      return PreservedAnalyses::all();
+    }
+
+    // The target of the virtual exit block of the infinite loop.
+    BasicBlock *target = returnBlocks[0];
+
+    // Replace the terminator of the latch with a fake conditional branch that
+    // will actually always target the header to maintain the semantic of the
+    // program.
+    latch->getTerminator()->eraseFromParent();
+    AR.DT.deleteEdge(latch, L.getHeader());
+    BasicBlock *virtualExit =
+        BasicBlock::Create(F->getContext(), L.getName() + ".virtual_exit", F);
+    AR.DT.addNewBlock(virtualExit, latch);
+    BranchInst::Create(L.getHeader(), virtualExit,
+                       ConstantInt::getTrue(F->getContext()), latch);
+    AR.DT.insertEdge(latch, L.getHeader());
+    AR.DT.insertEdge(latch, virtualExit);
+    BranchInst::Create(target, virtualExit);
+    AR.DT.insertEdge(virtualExit, target);
+
+    assert(AR.DT.verify() &&
+           "SimplifyInfiniteLoopPass: Dominator Tree failed verification");
+
+    std::unordered_set<Instruction *> toBlend;
+    // Find all instructions used in `target` that may be defined after the
+    // infinite loop, for which adding the edge from the infinite loop to the
+    // return block may break the SSA form.
+    for (Instruction &I : *target) {
+      if (!isa<PHINode>(&I)) {
+        for (Value *op : I.operands()) {
+          if (Instruction *opI = dyn_cast<Instruction>(op)) {
+            if (opI->getParent() != target) {
+              toBlend.insert(opI);
+            }
+          }
+        }
+      }
+    }
+
+    // Update the phi nodes in the return block because we added a new
+    // predecessor to it.
+    for (Instruction &I : *target) {
+      if (auto *PHI = dyn_cast<PHINode>(&I)) {
+        PHI->addIncoming(UndefValue::get(PHI->getType()), virtualExit);
+      }
+    }
+    // Add new phi nodes for instructions computed in `toBlend`.
+    for (Instruction *I : toBlend) {
+      PHINode *PHI = PHINode::Create(I->getType(), 2, I->getName() + ".blend",
+                                     &target->front());
+      for (BasicBlock *pred : predecessors(target)) {
+        if (pred != virtualExit) {
+          PHI->addIncoming(I, pred);
+        } else {
+          PHI->addIncoming(UndefValue::get(PHI->getType()), pred);
+        }
+      }
+      // Route the non-phi uses in `target` through the blend. Without this
+      // the new phi is dead and `I` may no longer dominate its uses now that
+      // `virtualExit` is a predecessor of `target`. Phi uses (including the
+      // blend itself) are left alone: they are per-edge and still valid.
+      I->replaceUsesWithIf(PHI, [target](Use &U) {
+        auto *const user = dyn_cast<Instruction>(U.getUser());
+        return user && user->getParent() == target && !isa<PHINode>(user);
+      });
+    }
+
+    modified = true;
+  } else if (loopExitBlocks.size() == 1) {
+    // Canonicalize any other infinite loops so that the loop header is the
+    // true condition successor.
+    auto *const latch = L.getLoopLatch();
+    auto *const header = L.getHeader();
+    // getLoopLatch() returns null when the loop has several latches; there is
+    // no single back edge to canonicalize in that case.
+    auto *const branch =
+        latch ? dyn_cast<BranchInst>(latch->getTerminator()) : nullptr;
+    if (branch && branch->isConditional()) {
+      if (auto *const cond = dyn_cast<Constant>(branch->getCondition())) {
+        if (branch->getSuccessor(1) == header) {
+          modified = true;
+          auto &ctx = latch->getParent()->getContext();
+          // Inverting the constant and swapping the successors keeps the
+          // taken edge the same.
+          branch->setCondition(cond->isOneValue()
+                                   ? ConstantInt::getFalse(ctx)
+                                   : ConstantInt::getTrue(ctx));
+          branch->swapSuccessors();
+        }
+      }
+    }
+  }
+
+  if (!modified) {
+    return PreservedAnalyses::all();
+  }
+
+  return getLoopPassPreservedAnalyses();
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/squash_small_vectors_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/squash_small_vectors_pass.cpp
new file mode 100644
index 0000000000000..7b8d991c681f0
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/squash_small_vectors_pass.cpp
@@ -0,0 +1,277 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <llvm/IR/PassManager.h>
+#include <multi_llvm/vector_type_helper.h>
+
+#include "analysis/stride_analysis.h"
+#include "analysis/uniform_value_analysis.h"
+#include "debugging.h"
+#include "transform/packetization_helpers.h"
+#include "transform/passes.h"
+
+#define DEBUG_TYPE "vecz"
+
+using namespace llvm;
+using namespace vecz;
+
+/// @brief replace loads and stores of small vectors with scalar integer
+/// loads and stores, where the entire vector fits into a legal integer.
+///
+/// The rationale here is that if we end up generating a scatter/gather, or
+/// interleaved memop, it would be more efficient with the wider type than with
+/// the vector of the narrower type. Although it's not trivial to know in
+/// advance if we will get a scatter/gather or interleaved or contiguous load,
+/// so we just do all of them and not worry too much about doing it when we
+/// didn't really need to.
+///
+/// Be careful not to run Instruction Combine Pass between this pass and
+/// packetization, because it is likely to undo it.
+PreservedAnalyses SquashSmallVectorsPass::run(Function &F,
+                                              FunctionAnalysisManager &AM) {
+  bool changed = false;
+
+  auto const &UVR = AM.getResult<UniformValueAnalysis>(F);
+  auto const &SAR = AM.getResult<StrideAnalysis>(F);
+  auto &DL = F.getParent()->getDataLayout();
+  auto &context = F.getContext();
+
+  // Keep a cache of the bitcasts so we don't create multiple bitcasts for the
+  // same value in each BasicBlock.
+  DenseMap<const Value *, BitCastInst *> squashCasts;
+  auto getSquashed = [&](Value *vector, Type *intTy,
+                         IRBuilder<> &B) -> Value * {
+    auto *&bitCast = squashCasts[vector];
+    Value *element = bitCast;
+    if (!element) {
+      if (auto *const bcast = dyn_cast<BitCastInst>(vector)) {
+        // "See through" existing bitcasts.
+        element = bcast->getOperand(0);
+      } else {
+        element = vector;
+      }
+
+      if (element->getType() != intTy) {
+        // Note we have to freeze the vector value first, because individual
+        // elements can be `poison`, which would result in the entire value
+        // becoming `poison`, which is not a valid transform (it is not valid to
+        // increase the amount of `poison` in the IR).
+        element = B.CreateBitCast(B.CreateFreeze(element), intTy,
+                                  Twine(vector->getName(), ".squash"));
+        bitCast = dyn_cast<BitCastInst>(element);
+      }
+    }
+    return element;
+  };
+
+  SmallVector<Instruction *, 16> toErase;
+  for (auto &BB : F) {
+    for (auto &I : BB) {
+      if (auto *load = dyn_cast<LoadInst>(&I)) {
+        if (!UVR.isVarying(load)) {
+          continue;
+        }
+
+        auto *const ty = load->getType();
+        auto *const scalarTy = ty->getScalarType();
+        unsigned const numBits = ty->getPrimitiveSizeInBits();
+        if (numBits != 0 && (numBits & (numBits - 1)) == 0 &&
+            scalarTy != ty && DL.fitsInLegalInteger(numBits)) {
+          auto const align = load->getAlign();
+          auto *const intTy = IntegerType::get(context, numBits);
+          if (DL.getABITypeAlign(intTy) > align) {
+            // The alignment of this type is too strict to convert
+            continue;
+          }
+
+          auto *const ptr = load->getPointerOperand();
+          auto const *const info = SAR.getInfo(ptr);
+          if (info && info->hasStride() &&
+              info->getConstantMemoryStride(ty, &DL) == 1) {
+            // No need to perform this transform on contiguous loads
+            continue;
+          }
+
+          IRBuilder<> B(load);
+          auto const name = load->getName();
+          auto *const newPtrTy =
+              PointerType::get(intTy, ptr->getType()->getPointerAddressSpace());
+          auto *const ptrCast = B.CreatePointerCast(
+              ptr, newPtrTy, Twine(ptr->getName(), ".squashptr"));
+          auto *newLoad = cast<LoadInst>(
+              B.CreateLoad(intTy, ptrCast, Twine(name, ".squashed")));
+          newLoad->setAlignment(align);
+          newLoad->copyMetadata(*load);
+
+          auto *const newVec =
+              B.CreateBitCast(newLoad, ty, Twine(name, ".unsquash"));
+
+          load->replaceAllUsesWith(newVec);
+          toErase.push_back(load);
+          changed = true;
+        }
+      } else if (auto *store = dyn_cast<StoreInst>(&I)) {
+        if (!UVR.isVarying(store)) {
+          continue;
+        }
+
+        auto *const data = store->getValueOperand();
+        auto *const ty = data->getType();
+        auto *const scalarTy = ty->getScalarType();
+        unsigned const numBits = ty->getPrimitiveSizeInBits();
+        if (numBits != 0 && (numBits & (numBits - 1)) == 0 &&
+            scalarTy != ty && DL.fitsInLegalInteger(numBits)) {
+          auto const align = store->getAlign();
+          auto *const intTy = IntegerType::get(context, numBits);
+          if (DL.getABITypeAlign(intTy) > align) {
+            // The alignment of this type is too strict to convert
+            continue;
+          }
+
+          auto *const ptr = store->getPointerOperand();
+          auto const *const info = SAR.getInfo(ptr);
+          if (info && info->hasStride() &&
+              info->getConstantMemoryStride(ty, &DL) == 1) {
+            // No need to perform this transform on contiguous stores
+            continue;
+          }
+
+          IRBuilder<> B(store);
+          auto *const newPtrTy =
+              PointerType::get(intTy, ptr->getType()->getPointerAddressSpace());
+          auto *const newPtr = B.CreatePointerCast(
+              ptr, newPtrTy, Twine(ptr->getName(), ".squashptr"));
+          auto *const newData = getSquashed(data, intTy, B);
+          auto *newStore = cast<StoreInst>(B.CreateStore(newData, newPtr));
+          newStore->setAlignment(align);
+          newStore->copyMetadata(*store);
+
+          toErase.push_back(store);
+          changed = true;
+        }
+      } else if (auto *zext = dyn_cast<ZExtInst>(&I)) {
+        if (!UVR.isVarying(zext)) {
+          continue;
+        }
+        // A zero-extend of an extract element can be squashed, if the source
+        // vector size is the same as the extended integer size. That is (for
+        // little-endian systems):
+        //
+        // zext i32(extract <4 x i8> data, i32 3)
+        //
+        // becomes
+        //
+        // and(lshr(bitcast i32 data), i32 24), 0xFF)
+        //
+        // this avoids creating shufflevectors during packetization
+        //
+        auto *const srcOp = zext->getOperand(0);
+        if (auto *const extract = dyn_cast<ExtractElementInst>(srcOp)) {
+          auto *const vector = extract->getVectorOperand();
+          auto *const indexOp = extract->getIndexOperand();
+          auto *const intTy = zext->getType();
+          auto *const vecTy = vector->getType();
+          if (vecTy->getPrimitiveSizeInBits() ==
+                  intTy->getPrimitiveSizeInBits() &&
+              isa<ConstantInt>(indexOp)) {
+            IRBuilder<> B(zext);
+            Value *element = getSquashed(vector, intTy, B);
+
+            auto const bits = zext->getSrcTy()->getScalarSizeInBits();
+            auto const scaled =
+                cast<ConstantInt>(indexOp)->getZExtValue() * bits;
+
+            // Note on Little Endian systems, element 0 occupies the least
+            // significant bits of the vector. On Big Endian systems it occupies
+            // the most significant bits. Thus, we shift by "maximum element
+            // number minus current element number" times by "number of bits
+            // per element".
+            unsigned const width = intTy->getScalarSizeInBits();
+            auto const shift =
+                DL.isBigEndian() ? width - bits - scaled : scaled;
+
+            if (shift != 0) {
+              element =
+                  B.CreateLShr(element, ConstantInt::get(intTy, shift),
+                               Twine(extract->getName(), ".squashExtract"));
+            }
+            // Build the mask as an APInt: `(1 << bits) - 1` overflows `int`
+            // once the element is 31 bits or wider.
+            element = B.CreateAnd(element, APInt::getLowBitsSet(width, bits),
+                                  Twine(zext->getName(), ".squashZExt"));
+
+            zext->replaceAllUsesWith(element);
+            toErase.push_back(zext);
+            changed = true;
+          }
+        }
+      } else if (auto *sext = dyn_cast<SExtInst>(&I)) {
+        if (!UVR.isVarying(sext)) {
+          continue;
+        }
+        // We can squash sign extends in-place as well.
+        // We do this by shifting the required element into most-significant
+        // position, and then arithmetic-shifting it back down to the least-
+        // significant position.
+        auto *const srcOp = sext->getOperand(0);
+        if (auto *const extract = dyn_cast<ExtractElementInst>(srcOp)) {
+          auto *const vector = extract->getVectorOperand();
+          auto *const indexOp = extract->getIndexOperand();
+          auto *const intTy = sext->getType();
+          auto *const vecTy = vector->getType();
+          if (vecTy->getPrimitiveSizeInBits() ==
+                  intTy->getPrimitiveSizeInBits() &&
+              isa<ConstantInt>(indexOp)) {
+            IRBuilder<> B(sext);
+            Value *element = getSquashed(vector, intTy, B);
+
+            auto const bits = sext->getSrcTy()->getScalarSizeInBits();
+            auto const shiftr = intTy->getPrimitiveSizeInBits() - bits;
+            auto const scaled =
+                cast<ConstantInt>(indexOp)->getZExtValue() * bits;
+            auto const shiftl = DL.isBigEndian() ? scaled : shiftr - scaled;
+
+            if (shiftl != 0) {
+              element =
+                  B.CreateShl(element, ConstantInt::get(intTy, shiftl),
+                              Twine(extract->getName(), ".squashExtract"));
+            }
+            element = B.CreateAShr(element, ConstantInt::get(intTy, shiftr),
+                                   Twine(extract->getName(), ".squashSExt"));
+
+            sext->replaceAllUsesWith(element);
+            toErase.push_back(sext);
+            changed = true;
+          }
+        }
+      }
+    }
+
+    // only re-use casts within a basic block
+    squashCasts.clear();
+  }
+
+  for (auto *I : toErase) {
+    I->eraseFromParent();
+  }
+
+  auto preserved = PreservedAnalyses::all();
+  if (changed) {
+    preserved.abandon<UniformValueAnalysis>();
+    preserved.abandon<StrideAnalysis>();
+  }
+  return preserved;
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/ternary_transform_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/ternary_transform_pass.cpp
new file mode 100644
index 0000000000000..a70163986fcd2
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/ternary_transform_pass.cpp
@@ -0,0 +1,234 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "transform/ternary_transform_pass.h"
+
+#include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/Module.h>
+#include <llvm/IR/PassManager.h>
+
+#include "analysis/stride_analysis.h"
+#include "analysis/uniform_value_analysis.h"
+#include "analysis/vectorization_unit_analysis.h"
+#include "debugging.h"
+#include "ir_cleanup.h"
+#include "memory_operations.h"
+
+using namespace llvm;
+using namespace vecz;
+
+namespace {
+/// @brief Determine whether the select can and should be transformed. This is
+/// the case when it is used by at most one GEP, and that GEP is used only as
+/// the address of scalar Load/Store memory ops.
+/// Additionally, we reject various cases where the transform would not result
+/// in better code.
+bool shouldTransform(SelectInst *Select, StrideAnalysisResult const &SAR) {
+  // The transform only applies to pointer selects.
+  if (!Select->getType()->isPointerTy()) {
+    return false;
+  }
+
+  // There is absolutely no need to transform a uniform select.
+  if (!SAR.UVR.isVarying(Select)) {
+    return false;
+  }
+
+  // If the select itself is a strided pointer, we don't gain anything by
+  // transforming it into a pair of masked memops.
+  auto const *const selectInfo = SAR.getInfo(Select);
+  if (selectInfo && selectInfo->hasStride()) {
+    return false;
+  }
+
+  // Validate Select operands
+  Value *VecTrue = Select->getOperand(1);
+  Value *VecFalse = Select->getOperand(2);
+
+  assert(VecTrue && VecFalse);
+
+  // If both pointers are uniform, it's worth doing the transform, since we get
+  // only scalar Mask Varying memops, instead of vector memops.
+  if (SAR.UVR.isVarying(VecTrue) || SAR.UVR.isVarying(VecFalse)) {
+    // Both pointers must be either strided or uniform (i.e. not divergent).
+    auto const *infoT = SAR.getInfo(VecTrue);
+    auto const *infoF = SAR.getInfo(VecFalse);
+    if (!infoT || !infoF || infoT->mayDiverge() || infoF->mayDiverge()) {
+      return false;
+    }
+  }
+
+  // Validate Select users
+  GetElementPtrInst *TheGEP = nullptr;
+  SmallVector<Instruction *, 8> SelectsUsers;
+  for (User *U : Select->users()) {
+    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(U)) {
+      // There can be at most one GEP
+      if (TheGEP) {
+        return false;
+      }
+      TheGEP = GEP;
+      SelectsUsers.push_back(GEP);
+    } else {
+      return false;
+    }
+  }
+
+  // Validate GEP users
+  while (!SelectsUsers.empty()) {
+    VECZ_FAIL_IF(!isa<GetElementPtrInst>(SelectsUsers.back()));
+    GetElementPtrInst *GEP =
+        cast<GetElementPtrInst>(SelectsUsers.pop_back_val());
+
+    // Validate the GEP indices
+    for (Value *idx : GEP->indices()) {
+      auto const *info = SAR.getInfo(idx);
+      if (!info || info->mayDiverge()) {
+        return false;
+      }
+    }
+    // We only transform selects used by GEPs who are exclusively used by
+    // scalar loads and stores. Performing this transform on vectors was
+    // historically banned due to internal limitations, but these days we
+    // *should* be able to. It's just that we don't know whether it's
+    // beneficial: see CA-4337.
+    for (User *U : GEP->users()) {
+      if (auto *const LI = dyn_cast<LoadInst>(U)) {
+        if (LI->getType()->isVectorTy()) {
+          return false;
+        }
+      } else if (auto *const SI = dyn_cast<StoreInst>(U)) {
+        // The GEP must be the address stored to, never the value stored.
+        if (SI->getValueOperand() == GEP ||
+            SI->getValueOperand()->getType()->isVectorTy()) {
+          return false;
+        }
+      } else {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+/// @brief Rewrite the GEPs and loads/stores that hang off the select as
+/// masked memory ops on each select operand, then delete the select.
+void Transform(SelectInst *Select, VectorizationContext &Ctx) {
+  SmallVector<Instruction *, 8> ToDelete;
+
+  auto transformSelect = [&](GetElementPtrInst *GEP, Instruction *Memop,
+                             Value *StoredValue, ArrayRef<Value *> Indices) {
+    // Non-obviously, we need to insert the new instructions at the GEP. The GEP
+    // is a user of the select, so we can guarantee that the select (and hence
+    // its operands) dominates the GEP. To ensure that the indices of the GEP
+    // also dominate the new instructions, we need to insert at the GEP.
+    IRBuilder<> B(GEP);
+
+    Value *Condition = Select->getCondition();
+    Value *InvCondition = B.CreateXor(Condition, 1);
+    Value *True = Select->getTrueValue();
+    Value *False = Select->getFalseValue();
+    Value *GepTrue = B.CreateGEP(GEP->getSourceElementType(), True, Indices);
+    Value *GepFalse = B.CreateGEP(GEP->getSourceElementType(), False, Indices);
+    auto MaskedOp = MemOp::get(Memop);
+    assert(MaskedOp);
+    MemOpDesc Mem = MaskedOp->getDesc();
+
+    // We should have filtered out all vector memory operations earlier.
+    assert(!Mem.getDataType()->isVectorTy());
+
+    auto Alignment = Mem.getAlignment();
+    if (isa<LoadInst>(Memop)) {
+      // Transform load
+      Value *LoadTrue =
+          createMaskedLoad(Ctx, Mem.getDataType(), GepTrue, Condition,
+                           /*VL*/ nullptr, Alignment, "", Memop);
+      Value *LoadFalse =
+          createMaskedLoad(Ctx, Mem.getDataType(), GepFalse, InvCondition,
+                           /*VL*/ nullptr, Alignment, "", Memop);
+      B.SetInsertPoint(Memop);
+      Value *LoadResult = B.CreateSelect(Condition, LoadTrue, LoadFalse);
+
+      // Replace all uses with new value
+      Memop->replaceAllUsesWith(LoadResult);
+    } else if (isa<StoreInst>(Memop)) {
+      // Transform store
+      createMaskedStore(Ctx, StoredValue, GepTrue, Condition, /*VL*/ nullptr,
+                        Alignment, "", Memop);
+      createMaskedStore(Ctx, StoredValue, GepFalse, InvCondition,
+                        /*VL*/ nullptr, Alignment, "", Memop);
+    }
+  };
+
+  for (User *U : Select->users()) {
+    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(U)) {
+      ToDelete.push_back(GEP);
+
+      SmallVector<Value *, 2> Indices(GEP->idx_begin(), GEP->idx_end());
+
+      for (User *G : GEP->users()) {
+        if (LoadInst *Load = dyn_cast<LoadInst>(G)) {
+          ToDelete.push_back(Load);
+          transformSelect(GEP, Load, nullptr, Indices);
+        } else if (StoreInst *Store = dyn_cast<StoreInst>(G)) {
+          ToDelete.push_back(Store);
+          transformSelect(GEP, Store, Store->getValueOperand(), Indices);
+        }
+      }
+    }
+  }
+
+  // Clean up instructions bottom-up (users first).
+  while (!ToDelete.empty()) {
+    Instruction *I = ToDelete.pop_back_val();
+    if (I->use_empty()) {
+      IRCleanup::deleteInstructionNow(I);
+    }
+  }
+
+  IRCleanup::deleteInstructionNow(Select);
+}
+}  // namespace
+
+/// @brief Collect every select accepted by shouldTransform, then rewrite them.
+PreservedAnalyses TernaryTransformPass::run(llvm::Function &F,
+                                            llvm::FunctionAnalysisManager &AM) {
+  auto const &SAR = AM.getResult<StrideAnalysis>(F);
+
+  // Gather candidates first: Transform() erases instructions, so the rewrite
+  // must not happen while iterating over the function body.
+  SmallVector<SelectInst *, 4> Selects;
+  for (BasicBlock &BB : F) {
+    for (Instruction &I : BB) {
+      auto *const Select = dyn_cast<SelectInst>(&I);
+      if (Select && shouldTransform(Select, SAR)) {
+        Selects.push_back(Select);
+      }
+    }
+  }
+
+  // Nothing to rewrite, so the IR is untouched.
+  if (Selects.empty()) {
+    return PreservedAnalyses::all();
+  }
+
+  auto &Ctx = AM.getResult<VectorizationContextAnalysis>(F).getContext();
+  for (SelectInst *Select : Selects) {
+    Transform(Select, Ctx);
+  }
+
+  return PreservedAnalyses::none();
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/uniform_reassociation_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/uniform_reassociation_pass.cpp
new file mode 100644
index 0000000000000..99cdd0a35a22e
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/uniform_reassociation_pass.cpp
@@ -0,0 +1,355 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <llvm/ADT/DenseSet.h>
+#include <llvm/Analysis/PostDominators.h>
+#include <llvm/IR/PassManager.h>
+#include <llvm/Support/Debug.h>
+#include <llvm/Transforms/Utils/BasicBlockUtils.h>
+
+#include "analysis/uniform_value_analysis.h"
+#include "analysis/vectorization_unit_analysis.h"
+#include "debugging.h"
+#include "transform/passes.h"
+
+#define DEBUG_TYPE "vecz"
+
+// WHAT THIS DOES
+//
+// Where we have some expression involving binary operators over uniform and
+// varying values, it can sometimes be advantageous to re-arrange the terms
+// to reduce the vectorization overhead. For example, we might have:
+//
+//   (Varying + Uniform) + Uniform
+//
+// The above expression requires TWO vector broadcasts of the uniform values,
+// and TWO vector additions. However, if we re-associate the operators to get:
+//
+//  Varying + (Uniform + Uniform)
+//
+// In this new form, we only need a scalar addition and a single broadcast,
+// followed by a single vector addition.
+//
+// We also make the following transformations:
+//
+//   (Varying + Uniform) + Varying -> (Varying + Varying) + Uniform
+//   Varying + (Varying + Uniform) -> (Varying + Varying) + Uniform
+//
+// Although these transformations don't reduce the number of vector
+// instructions, they may reduce the vector register pressure somewhat. But
+// more importantly they may enable further transforms on the CFG.
+//
+// A common pattern is a conditional statement like this:
+//
+//    if (uniform_condition && varying_condition) { ... }
+//
+// Control flow conversion quite often replaces the && with an & in order to
+// reduce the number of branches/basic blocks. In this case, however, that is
+// counter-productive for us, since we wish to retain the uniform branch and
+// linearize the varying one. This pass also splits up such branch conditions.
+//
+// POTENTIAL FURTHER WORK
+//
+// Currently, this pass only works on expressions involving a single kind of
+// associative and commutative operators. However, similar transformations
+// are possible with subtracts and mixtures of subtracts and additions.
+
+using namespace llvm;
+
+namespace {
+
+/// @brief for every PHI node in BB, adds an incoming edge from "extra" that
+/// carries the same value the PHI already receives from "original"
+///
+/// "original" must already be an incoming block of each of those PHI nodes.
+void updatePHIs(BasicBlock &BB, BasicBlock *original, BasicBlock *extra) {
+  // phis() visits exactly the PHI nodes grouped at the start of the block.
+  for (PHINode &PHI : BB.phis()) {
+    Value *const duplicated = PHI.getIncomingValueForBlock(original);
+    PHI.addIncoming(duplicated, extra);
+  }
+}
+
+}  // namespace
+
+namespace vecz {
+class Reassociator {
+ public:
+  Reassociator() {}
+
+  /// @brief reassociate binary operators, then split mixed-condition branches
+  ///
+  /// @param[in] F Function to transform.
+  /// @param[in] AM FunctionAnalysisManager providing analyses.
+  /// @returns true iff any branches were split
+  bool run(llvm::Function &F, llvm::FunctionAnalysisManager &AM);
+
+ private:
+  /// @brief classification of a binary operator according to whether its
+  ///        operands are Uniform, Varying, both (Varying Op Uniform), or non-
+  ///        canonically both (i.e. Uniform Op Varying).
+  enum class OpForm { Uniform, Varying, Mixed, NonCanonical };
+
+  /// @brief tries to transform a Binary Operator into a canonical form, such
+  ///        that if only one operand is Uniform, it is the second operand.
+  ///
+  /// @param[in] Op the Binary Operator to transform
+  /// @returns the form of the canonicalized operator
+  OpForm canonicalizeBinOp(llvm::BinaryOperator &Op);
+
+  /// @brief tries to rearrange a binary operator expression to reduce vector
+  ///        broadcasts, or to facilitate branch splitting.
+  ///
+  /// @param[in] Op the Binary Operator to transform
+  /// @returns true iff the expression was transformed
+  bool reassociate(llvm::BinaryOperator &Op);
+
+  /// @brief canonicalizes a branch into a form that can be split
+  ///
+  /// @param[in] Branch the branch instruction to canonicalize
+  /// @returns true iff the branch condition is mixed (Varying Op Uniform)
+  ///          and can be split into two separate branches.
+  bool canSplitBranch(llvm::BranchInst &Branch);
+
+  UniformValueResult *UVR = nullptr;
+};
+
+Reassociator::OpForm Reassociator::canonicalizeBinOp(llvm::BinaryOperator &Op) {
+  // A result that is not varying is classified as Uniform outright.
+  if (!UVR->isVarying(&Op)) {
+    return OpForm::Uniform;
+  }
+
+  if (UVR->isVarying(Op.getOperand(0))) {
+    // With a varying first operand, the second one decides between the
+    // already-canonical Mixed form and the fully Varying form.
+    return UVR->isVarying(Op.getOperand(1)) ? OpForm::Varying
+                                            : OpForm::Mixed;
+  }
+
+  // The first operand is uniform: only a commutative operator can be put into
+  // canonical form, by moving that operand into second position.
+  if (!Op.isCommutative()) {
+    return OpForm::NonCanonical;
+  }
+
+  Op.swapOperands();
+  return OpForm::Mixed;
+}
+
+bool Reassociator::reassociate(llvm::BinaryOperator &Op) {
+  if (!Op.isAssociative() || !Op.isCommutative()) {
+    return false;
+  }
+
+  auto const Opcode = Op.getOpcode();
+  auto *const LHS = Op.getOperand(0);
+  auto *const RHS = Op.getOperand(1);
+
+  // Splices the single-use inner operator out of the expression. Poison-
+  // generating flags (nsw/nuw etc.) on Op were only proven for the original
+  // grouping of the operands, so they must be dropped after regrouping.
+  auto const retire = [&](BinaryOperator *Inner) {
+    Op.dropPoisonGeneratingFlags();
+    UVR->remove(Inner);
+    Inner->eraseFromParent();
+    return true;
+  };
+
+  auto *const A = dyn_cast<BinaryOperator>(LHS);
+  if (A && A->getOpcode() == Opcode && A->hasNUses(1) &&
+      canonicalizeBinOp(*A) == OpForm::Mixed) {
+    if (UVR->isVarying(RHS)) {
+      // (Varying Op Uniform) Op Varying -> (Varying Op Varying) Op Uniform
+      auto *const P = BinaryOperator::Create(Opcode, A->getOperand(0), RHS,
+                                             "varying.reassoc", &Op);
+      UVR->setVarying(P);
+      Op.setOperand(0, P);
+      Op.setOperand(1, A->getOperand(1));
+      return retire(A);
+    }
+    // (Varying Op Uniform) Op Uniform -> Varying Op (Uniform Op Uniform)
+    auto *const P = BinaryOperator::Create(Opcode, A->getOperand(1), RHS,
+                                           "uniform.reassoc", &Op);
+    Op.setOperand(0, A->getOperand(0));
+    Op.setOperand(1, P);
+    return retire(A);
+  }
+
+  auto *const B = dyn_cast<BinaryOperator>(RHS);
+  if (B && B->getOpcode() == Opcode && B->hasNUses(1) &&
+      canonicalizeBinOp(*B) == OpForm::Mixed) {
+    // Varying Op (Varying Op Uniform) -> (Varying Op Varying) Op Uniform
+    auto *const P = BinaryOperator::Create(Opcode, B->getOperand(0), LHS,
+                                           "varying.reassoc", &Op);
+    Op.setOperand(0, P);
+    Op.setOperand(1, B->getOperand(1));
+    UVR->setVarying(P);
+    return retire(B);
+  }
+
+  return false;
+}
+
+bool Reassociator::canSplitBranch(BranchInst &Branch) {
+  // Only a condition that is itself an And/Or operator is a candidate.
+  auto *const Cond = dyn_cast<BinaryOperator>(Branch.getCondition());
+  if (!Cond) {
+    return false;
+  }
+  auto const Opcode = Cond->getOpcode();
+  if (Opcode != Instruction::Or && Opcode != Instruction::And) {
+    return false;
+  }
+  return canonicalizeBinOp(*Cond) == OpForm::Mixed;
+}
+
+bool Reassociator::run(llvm::Function &F, llvm::FunctionAnalysisManager &AM) {
+  auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
+  LoopInfo *LI = nullptr;
+  UVR = &AM.getResult<UniformValueAnalysis>(F);
+
+  // Iterate over all instructions in dominance order, so that we always
+  // transform an expression before any of its uses.
+  SmallVector<BasicBlock *, 16> Blocks;
+  DT->getDescendants(&F.getEntryBlock(), Blocks);
+
+  SmallVector<BranchInst *, 4> SplitBranches;
+  for (auto *const BB : Blocks) {
+    for (auto Iit = BB->begin(); Iit != BB->end();) {
+      auto &I = *(Iit++);
+      if (auto *BinOp = dyn_cast<BinaryOperator>(&I)) {
+        auto const form = canonicalizeBinOp(*BinOp);
+        if (form == OpForm::Varying || form == OpForm::Mixed) {
+          reassociate(*BinOp);
+        }
+      } else if (auto *Branch = dyn_cast<BranchInst>(&I)) {
+        if (Branch->isConditional() && Branch->getNumSuccessors() == 2 &&
+            canSplitBranch(*Branch)) {
+          // Lazily obtain the Loop Info
+          if (!LI) {
+            LI = &AM.getResult<LoopAnalysis>(F);
+          }
+
+          if (auto *const L = LI->getLoopFor(BB)) {
+            if (L->isLoopExiting(BB)) {
+              // No need to do this transform on loop exits (?)
+              continue;
+            }
+          }
+
+          SplitBranches.push_back(Branch);
+        }
+      }
+    }
+  }
+
+  if (SplitBranches.empty()) {
+    return false;
+  }
+
+  auto *PDT = &AM.getResult<PostDominatorTreeAnalysis>(F);
+
+  do {
+    auto *Branch = SplitBranches.back();
+    SplitBranches.pop_back();
+    BasicBlock *BB = Branch->getParent();
+
+    BasicBlock *newBB = SplitBlock(BB, Branch, DT, LI);
+    newBB->setName(Twine(BB->getName(), ".cond_split"));
+
+    // update the PostDominatorTree manually..
+    PDT->addNewBlock(newBB, PDT->getNode(BB)->getIDom()->getBlock());
+
+    // Remove the unconditional branch created by splitting..
+    BB->getTerminator()->eraseFromParent();
+
+    auto *Cond = cast<BinaryOperator>(Branch->getCondition());
+    auto *varyingCond = Cond->getOperand(0);
+    auto *uniformCond = Cond->getOperand(1);
+
+    // Create a new branch on the uniform condition at the end of BB.
+    // getSuccessor(0) is the "true" target and getSuccessor(1) the "false"
+    // target: an Or short-circuits to the true target, an And to the false
+    // target; otherwise control falls through to the varying test in newBB.
+    auto Opcode = Cond->getOpcode();
+
+    if (Opcode == Instruction::Or) {
+      BasicBlock *SuccT = Branch->getSuccessor(0);
+
+      BranchInst::Create(SuccT, newBB, uniformCond, BB);
+      Branch->setCondition(varyingCond);
+
+      // If the branch target has PHI nodes, they need to get an extra target
+      updatePHIs(*SuccT, newBB, BB);
+
+      // Update Dominator and PostDominator trees..
+      DT->insertEdge(BB, SuccT);
+      PDT->insertEdge(BB, SuccT);
+    } else {
+      BasicBlock *SuccF = Branch->getSuccessor(1);
+
+      BranchInst::Create(newBB, SuccF, uniformCond, BB);
+      Branch->setCondition(varyingCond);
+
+      // If the branch target has PHI nodes, they need to get an extra target
+      updatePHIs(*SuccF, newBB, BB);
+
+      // Update Dominator and PostDominator trees..
+      DT->insertEdge(BB, SuccF);
+      PDT->insertEdge(BB, SuccF);
+    }
+
+    // If we made the condition dead, we can delete it
+    if (Cond->use_empty()) {
+      Cond->eraseFromParent();
+    }
+
+    // The branch may still have a mixed condition after splitting..
+    if (canSplitBranch(*Branch)) {
+      SplitBranches.push_back(Branch);
+    }
+  } while (!SplitBranches.empty());
+
+  assert(DT->verify() && "Reassociator: Dominator Tree failed verification");
+
+  assert(PDT->verify() &&
+         "Reassociator: Post-Dominator Tree failed verification");
+
+  if (LI) {
+    // Unlike the dominator trees, LoopInfo::verify() returns void and asserts
+    // internally on failure, for some reason
+    LI->verify(*DT);
+  }
+
+  return true;
+}
+
+/// @brief reassociate uniform binary operators and split branches
+PreservedAnalyses UniformReassociationPass::run(Function &F,
+                                                FunctionAnalysisManager &AM) {
+  // Result ignored: the preserved set below is the same either way.
+  Reassociator worker;
+  (void)worker.run(F, AM);
+
+  PreservedAnalyses PA;
+  PA.preserve<UniformValueAnalysis>();
+  PA.preserve<DominatorTreeAnalysis>();
+  PA.preserve<PostDominatorTreeAnalysis>();
+  PA.preserve<LoopAnalysis>();
+  return PA;
+}
+}  // namespace vecz
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
new file mode 100644
index 0000000000000..8f3094e93434a
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
@@ -0,0 +1,1379 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <llvm/ADT/Triple.h>
+#include <llvm/Analysis/TargetTransformInfo.h>
+#include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/Instructions.h>
+#include <llvm/MC/TargetRegistry.h>
+#include <llvm/Target/TargetMachine.h>
+#include <multi_llvm/creation_apis_helper.h>
+#include <multi_llvm/opaque_pointers.h>
+#include <multi_llvm/vector_type_helper.h>
+
+#include "debugging.h"
+#include "memory_operations.h"
+#include "transform/packetization_helpers.h"
+#include "vecz/vecz_target_info.h"
+
+using namespace vecz;
+using namespace llvm;
+
+namespace {
+/// @brief Applies @a EVL to @a Mask, clearing every lane whose index is not
+/// less than @a EVL. A null @a EVL leaves @a Mask untouched.
+Value *applyEVLToMask(IRBuilder<> &B, Value *EVL, Value *Mask) {
+  if (!EVL) {
+    return Mask;
+  }
+  auto const EC = multi_llvm::getVectorElementCount(Mask->getType());
+  auto *const Lanes = B.CreateStepVector(VectorType::get(EVL->getType(), EC));
+  auto *const InRange = B.CreateICmpULT(Lanes, B.CreateVectorSplat(EC, EVL));
+  return B.CreateLogicalAnd(Mask, InRange);
+}
+
+// The helpers below forward a legality query to TTI, wrapping the raw
+// alignment value in an llvm::Align for the call.
+bool isLegalMaskedLoad(const TargetTransformInfo &TTI, Type *Ty,
+                       unsigned Alignment) {
+  return TTI.isLegalMaskedLoad(Ty, Align(Alignment));
+}
+
+bool isLegalMaskedStore(const TargetTransformInfo &TTI, Type *Ty,
+                        unsigned Alignment) {
+  return TTI.isLegalMaskedStore(Ty, Align(Alignment));
+}
+
+bool isLegalMaskedGather(const TargetTransformInfo &TTI, Type *Ty,
+                         unsigned Alignment) {
+  return TTI.isLegalMaskedGather(Ty, Align(Alignment));
+}
+
+bool isLegalMaskedScatter(const TargetTransformInfo &TTI, Type *Ty,
+                          unsigned Alignment) {
+  return TTI.isLegalMaskedScatter(Ty, Align(Alignment));
+}
+}  // namespace
+
+// NOTE: the TargetMachine is allowed to be null here. It is not used by the
+// implementation at present, but any future use must be guarded against a
+// null pointer.
+TargetInfo::TargetInfo(TargetMachine *tm) : TM_(tm) {}
+
+// Loads a vector of type Ty from Ptr, stepping Stride elements between lanes.
+Value *TargetInfo::createLoad(IRBuilder<> &B, Type *Ty, Value *Ptr,
+                              Value *Stride, Value *EVL) const {
+  if (!Ptr || !Stride || !Ty->isVectorTy()) {
+    return nullptr;
+  }
+  // Validate the pointer type.
+  PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
+  if (!PtrTy) {
+    return nullptr;
+  }
+  Type *EleTy = Ty->getScalarType();
+
+  // Trivial case: contiguous load.
+  ConstantInt *CIntStride = dyn_cast<ConstantInt>(Stride);
+  if (CIntStride && CIntStride->getSExtValue() == 1) {
+    // Cast only on this path so the other paths leave no unused cast behind.
+    PointerType *VecPtrTy = Ty->getPointerTo(PtrTy->getAddressSpace());
+    Value *VecPtr = B.CreateBitCast(Ptr, VecPtrTy);
+    unsigned EleAlign = EleTy->getScalarSizeInBits() / 8;
+    if (EVL) {
+      const Function *F = B.GetInsertBlock()->getParent();
+      auto const Legality = isVPLoadLegal(F, Ty, EleAlign);
+      if (!Legality.isVPLegal()) {
+        emitVeczRemarkMissed(F,
+                             "Could not create a VP load as the target "
+                             "reported it would be illegal");
+        VECZ_FAIL();
+      }
+      auto *Mask = multi_llvm::createAllTrueMask(
+          B, multi_llvm::getVectorElementCount(Ty));
+      SmallVector<llvm::Value *, 3> Args = {VecPtr, Mask, EVL};
+      SmallVector<llvm::Type *, 2> Tys = {Ty, VecPtr->getType()};
+      return B.CreateIntrinsic(llvm::Intrinsic::vp_load, Tys, Args);
+    }
+    return B.CreateAlignedLoad(Ty, VecPtr, MaybeAlign(EleAlign));
+  }
+  // Interleaved case: neither an EVL nor scalable vectors are supported.
+  if (EVL) {
+    emitVeczRemarkMissed(
+        B.GetInsertBlock()->getParent(), Ptr,
+        "Could not create vector-length-predicated interleaved load");
+    return nullptr;
+  }
+  auto Elts = multi_llvm::getVectorElementCount(Ty);
+  if (Elts.isScalable()) {
+    emitVeczRemarkMissed(B.GetInsertBlock()->getParent(), Ptr,
+                         "Could not create a scalable-vector interleaved load");
+    VECZ_FAIL();
+  }
+  unsigned SimdWidth = Elts.getFixedValue();
+  // Load individual values, each Stride elements after the previous one.
+  SmallVector<Value *, 8> Values;
+  Value *Index = B.getInt64(0);
+  for (unsigned i = 0; i < SimdWidth; i++) {
+    Value *GEP = B.CreateGEP(EleTy, Ptr, Index);
+    Values.push_back(B.CreateLoad(EleTy, GEP, false, "interleaved.load"));
+    Index = B.CreateAdd(Index, Stride);
+  }
+
+  // Create a vector out of these values.
+  Value *Result = UndefValue::get(Ty);
+  for (unsigned i = 0; i < SimdWidth; i++) {
+    Result = B.CreateInsertElement(Result, Values[i], B.getInt32(i));
+  }
+  return Result;
+}
+
+// Stores Data through Ptr, stepping Stride elements between lanes.
+Value *TargetInfo::createStore(IRBuilder<> &B, Value *Data, Value *Ptr,
+                               Value *Stride, unsigned Alignment,
+                               Value *EVL) const {
+  if (!Ptr || !Data || !Stride) {
+    return nullptr;
+  }
+  // The destination must be a (scalar) pointer.
+  auto *const DstPtrTy = dyn_cast<PointerType>(Ptr->getType());
+  if (!DstPtrTy) {
+    return nullptr;
+  }
+  Type *const DataTy = Data->getType();
+  Type *const ScalarTy = DataTy->getScalarType();
+
+  // A constant stride of one is a plain contiguous vector store.
+  auto *const ConstStride = dyn_cast<ConstantInt>(Stride);
+  if (ConstStride && ConstStride->getSExtValue() == 1) {
+    Value *const VecPtr = B.CreateBitCast(
+        Ptr, DataTy->getPointerTo(DstPtrTy->getAddressSpace()));
+    if (!EVL) {
+      return B.CreateAlignedStore(Data, VecPtr, MaybeAlign(Alignment));
+    }
+    // With an explicit vector length, emit a VP store if the target allows.
+    const Function *F = B.GetInsertBlock()->getParent();
+    if (!isVPStoreLegal(F, DataTy, Alignment).isVPLegal()) {
+      emitVeczRemarkMissed(F,
+                           "Could not create a VP store as the target "
+                           "reported it would be illegal");
+      VECZ_FAIL();
+    }
+    auto *const AllTrue = multi_llvm::createAllTrueMask(
+        B, multi_llvm::getVectorElementCount(DataTy));
+    SmallVector<llvm::Value *, 3> Args = {Data, VecPtr, AllTrue, EVL};
+    SmallVector<llvm::Type *, 2> Tys = {DataTy, VecPtr->getType()};
+    return B.CreateIntrinsic(llvm::Intrinsic::vp_store, Tys, Args);
+  }
+
+  // Interleaved stores support neither an EVL nor scalable vectors.
+  if (EVL) {
+    emitVeczRemarkMissed(
+        B.GetInsertBlock()->getParent(), Ptr,
+        "Could not create vector-length-predicated interleaved store");
+    return nullptr;
+  }
+  auto const EC = multi_llvm::getVectorElementCount(DataTy);
+  if (EC.isScalable()) {
+    emitVeczRemarkMissed(
+        B.GetInsertBlock()->getParent(), Ptr,
+        "Could not create a scalable-vector interleaved store");
+    VECZ_FAIL();
+  }
+  unsigned const NumLanes = EC.getFixedValue();
+  // Pull every lane out of the vector first...
+  SmallVector<Value *, 8> Lanes;
+  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+    Lanes.push_back(B.CreateExtractElement(Data, B.getInt32(Lane)));
+  }
+
+  // ...then store them one by one, advancing the element index by Stride.
+  Value *LastStore = nullptr;
+  Value *Offset = B.getInt64(0);
+  for (Value *LaneData : Lanes) {
+    Value *const LaneAddr = B.CreateGEP(ScalarTy, Ptr, Offset);
+    StoreInst *const Store = B.CreateStore(LaneData, LaneAddr);
+    Store->setAlignment(MaybeAlign(Alignment).valueOrOne());
+    LastStore = Store;
+    Offset = B.CreateAdd(Offset, Stride);
+  }
+  return LastStore;
+}
+
+Value *TargetInfo::createMaskedLoad(IRBuilder<> &B, Type *Ty, Value *Ptr,
+                                    Value *Mask, Value *EVL,
+                                    unsigned Alignment) const {
+  VECZ_FAIL_IF(!Ptr || !Mask);
+  PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
+  VECZ_FAIL_IF(!PtrTy);
+  Type *EleTy = Ty->getScalarType();
+
+  // When both the data and the mask are vectors, their widths must agree.
+  auto *DataVecTy = dyn_cast<VectorType>(Ty);
+  auto *MaskVecTy = dyn_cast<VectorType>(Mask->getType());
+  if (DataVecTy && MaskVecTy) {
+    VECZ_ERROR_IF(multi_llvm::getVectorElementCount(DataVecTy) !=
+                      multi_llvm::getVectorElementCount(MaskVecTy),
+                  "The mask and the data need to have the same width");
+  }
+
+  // Vector data: use a VP load if possible, else an LLVM masked load.
+  if (Ty->isVectorTy()) {
+    PtrTy = Ty->getPointerTo(PtrTy->getAddressSpace());
+    Ptr = B.CreateBitCast(Ptr, PtrTy);
+    const Function *F = B.GetInsertBlock()->getParent();
+    auto const Legality = isVPLoadLegal(F, Ty, Alignment);
+    if (EVL && Legality.isVPLegal()) {
+      SmallVector<llvm::Value *, 2> Args = {Ptr, Mask, EVL};
+      SmallVector<llvm::Type *, 2> Tys = {Ty, PtrTy};
+      return B.CreateIntrinsic(llvm::Intrinsic::vp_load, Tys, Args);
+    } else if (Legality.isMaskLegal()) {
+      Mask = applyEVLToMask(B, EVL, Mask);  // fold the EVL into the mask
+      VECZ_FAIL_IF(!Mask);
+      return B.CreateMaskedLoad(Ty, Ptr, Align(Alignment), Mask);
+    } else {
+      emitVeczRemarkMissed(F,
+                           "Could not create a masked load as the target "
+                           "reported it would be illegal");
+      VECZ_FAIL();
+    }
+  }
+  // Otherwise (Width is 1): branch on the mask around one scalar load.
+  unsigned const Width = 1;
+
+  LLVMContext &Ctx = B.getContext();
+  BasicBlock *Entry = B.GetInsertBlock();
+  BasicBlock *Exit = nullptr;
+  Function *F = Entry->getParent();
+  VECZ_FAIL_IF(!F || !Ptr || !Mask || EVL);  // no EVL support on this path
+
+  // Create all the required blocks.
+  SmallVector<BasicBlock *, 4> TestBlocks;
+  SmallVector<BasicBlock *, 4> LoadBlocks;
+  TestBlocks.push_back(Entry);
+  LoadBlocks.push_back(BasicBlock::Create(Ctx, "masked_load", F));
+  for (unsigned i = 1; i < Width; i++) {
+    TestBlocks.push_back(BasicBlock::Create(Ctx, "test_mask", F));
+    LoadBlocks.push_back(BasicBlock::Create(Ctx, "masked_load", F));
+  }
+  Exit = BasicBlock::Create(Ctx, "masked_load_exit", F);
+  // Lanes whose mask bit is clear produce undef.
+  Constant *const DefaultEleData = UndefValue::get(EleTy);
+  SmallVector<Value *, 4> LoadedLanes;
+  SmallVector<Value *, 4> LanePhis;
+  for (unsigned i = 0; i < Width; i++) {
+    BasicBlock *Next = ((i + 1) < Width) ? TestBlocks[i + 1] : Exit;
+
+    // Extract the mask elements and branch.
+    B.SetInsertPoint(TestBlocks[i]);
+    if (i > 0) {
+      PHINode *LanePhi = B.CreatePHI(EleTy, 2, "result_lane");
+      LanePhi->addIncoming(LoadedLanes[i - 1], LoadBlocks[i - 1]);
+      LanePhi->addIncoming(DefaultEleData, TestBlocks[i - 1]);
+      LanePhis.push_back(LanePhi);
+    }
+
+    Value *MaskLane =
+        (Width == 1) ? Mask
+                     : B.CreateExtractElement(Mask, B.getInt32(i), "mask_lane");
+    B.CreateCondBr(MaskLane, LoadBlocks[i], Next);
+
+    // Load the element and branch.
+    B.SetInsertPoint(LoadBlocks[i]);
+    Value *LanePtr =
+        i > 0 ? B.CreateGEP(EleTy, Ptr, B.getInt32(i), "lane_ptr") : Ptr;
+    LoadInst *Load = B.CreateLoad(EleTy, LanePtr, false, "masked_load");
+    Load->setAlignment(MaybeAlign(Alignment).valueOrOne());
+    LoadedLanes.push_back(Load);
+    B.CreateBr(Next);
+  }
+
+  // Aggregate the loaded lanes; the builder is left positioned in Exit.
+  B.SetInsertPoint(Exit);
+  PHINode *LastLanePhi = B.CreatePHI(EleTy, 2, "result_lane");
+  LastLanePhi->addIncoming(LoadedLanes[Width - 1], LoadBlocks[Width - 1]);
+  LastLanePhi->addIncoming(DefaultEleData, TestBlocks[Width - 1]);
+  LanePhis.push_back(LastLanePhi);
+
+  Value *Result = nullptr;
+  if (Width > 1) {
+    Result = UndefValue::get(Ty);
+    for (unsigned i = 0; i < Width; i++) {
+      Result = B.CreateInsertElement(Result, LanePhis[i], B.getInt32(i));
+    }
+  } else {
+    Result = LanePhis[Width - 1];
+  }
+
+  return Result;
+}
+
+// Stores Data through Ptr under Mask, and under EVL when one is given.
+Value *TargetInfo::createMaskedStore(IRBuilder<> &B, Value *Data, Value *Ptr,
+                                     Value *Mask, Value *EVL,
+                                     unsigned Alignment) const {
+  // Reject null operands before dereferencing them, as createMaskedLoad does.
+  VECZ_FAIL_IF(!Data || !Ptr || !Mask);
+  PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
+  VECZ_FAIL_IF(!PtrTy);
+  Type *DataTy = Data->getType();
+  Type *EleTy = DataTy->getScalarType();
+
+  auto *DataVecTy = dyn_cast<VectorType>(DataTy);
+  auto *MaskVecTy = dyn_cast<VectorType>(Mask->getType());
+  if (DataVecTy && MaskVecTy) {
+    VECZ_ERROR_IF(multi_llvm::getVectorElementCount(DataVecTy) !=
+                      multi_llvm::getVectorElementCount(MaskVecTy),
+                  "The mask and the data need to have the same width");
+  }
+
+  // Vector data: use a VP store if possible, else an LLVM masked store.
+  if (DataTy->isVectorTy()) {
+    PtrTy = DataTy->getPointerTo(PtrTy->getAddressSpace());
+    Ptr = B.CreateBitCast(Ptr, PtrTy);
+    const Function *F = B.GetInsertBlock()->getParent();
+    auto const Legality = isVPStoreLegal(F, DataTy, Alignment);
+    if (EVL && Legality.isVPLegal()) {
+      SmallVector<llvm::Value *, 3> Args = {Data, Ptr, Mask, EVL};
+      SmallVector<llvm::Type *, 2> Tys = {Data->getType(), PtrTy};
+      return B.CreateIntrinsic(llvm::Intrinsic::vp_store, Tys, Args);
+    } else if (Legality.isMaskLegal()) {
+      Mask = applyEVLToMask(B, EVL, Mask);  // fold the EVL into the mask
+      VECZ_FAIL_IF(!Mask);
+      return B.CreateMaskedStore(Data, Ptr, Align(Alignment), Mask);
+    } else {
+      emitVeczRemarkMissed(F,
+                           "Could not create a masked store as the target "
+                           "reported it would be illegal");
+      VECZ_FAIL();
+    }
+  }
+  // Otherwise (Width is 1): branch on the mask around one scalar store.
+  unsigned const Width = 1;
+
+  LLVMContext &Ctx = B.getContext();
+  BasicBlock *Entry = B.GetInsertBlock();
+  StoreInst *FirstStore = nullptr;
+  Function *F = Entry->getParent();
+  VECZ_FAIL_IF(!F || EVL);  // no EVL support on this path
+
+  // Create all the required blocks.
+  SmallVector<BasicBlock *, 4> TestBlocks;
+  SmallVector<BasicBlock *, 4> StoreBlocks;
+  TestBlocks.push_back(Entry);
+  StoreBlocks.push_back(BasicBlock::Create(Ctx, "masked_store", F));
+  for (unsigned i = 1; i < Width; i++) {
+    TestBlocks.push_back(BasicBlock::Create(Ctx, "test_mask", F));
+    StoreBlocks.push_back(BasicBlock::Create(Ctx, "masked_store", F));
+  }
+  BasicBlock *Exit = BasicBlock::Create(Ctx, "masked_store_exit", F);
+
+  for (unsigned i = 0; i < Width; i++) {
+    BasicBlock *Next = ((i + 1) < Width) ? TestBlocks[i + 1] : Exit;
+
+    // Extract the mask elements and branch.
+    B.SetInsertPoint(TestBlocks[i]);
+    Value *MaskLane =
+        (Width == 1) ? Mask
+                     : B.CreateExtractElement(Mask, B.getInt32(i), "mask_lane");
+    B.CreateCondBr(MaskLane, StoreBlocks[i], Next);
+
+    // Extract the data elements and store.
+    B.SetInsertPoint(StoreBlocks[i]);
+    Value *DataLane =
+        (Width == 1) ? Data
+                     : B.CreateExtractElement(Data, B.getInt32(i), "data_lane");
+    Value *LanePtr =
+        i > 0 ? B.CreateGEP(EleTy, Ptr, B.getInt32(i), "lane_ptr") : Ptr;
+    StoreInst *Store = B.CreateStore(DataLane, LanePtr);
+    if (i == 0) {
+      FirstStore = Store;
+    }
+    Store->setAlignment(MaybeAlign(Alignment).valueOrOne());
+    B.CreateBr(Next);
+  }
+
+  B.SetInsertPoint(Exit);  // leave the builder positioned in the exit block
+  return FirstStore;
+}
+
+Value *TargetInfo::createInterleavedLoad(IRBuilder<> &B, Type *Ty, Value *Ptr,
+                                         Value *Stride, Value *EVL,
+                                         unsigned Alignment) const {
+  auto const Lanes = multi_llvm::getVectorElementCount(Ty);
+  auto *const Ones = B.CreateVectorSplat(Lanes, B.getTrue());  // all-true mask
+  return createMaskedInterleavedLoad(B, Ty, Ptr, Ones, Stride, EVL, Alignment);
+}
+
+Value *TargetInfo::createInterleavedStore(IRBuilder<> &B, Value *Data,
+                                          Value *Ptr, Value *Stride, Value *EVL,
+                                          unsigned Alignment) const {
+  auto const Lanes = multi_llvm::getVectorElementCount(Data->getType());
+  auto *const Ones = B.CreateVectorSplat(Lanes, B.getTrue());  // all-true mask
+  return createMaskedInterleavedStore(B, Data, Ptr, Ones, Stride, EVL,
+                                      Alignment);
+}
+
+Value *TargetInfo::createMaskedInterleavedLoad(IRBuilder<> &B, Type *Ty,
+                                               Value *Ptr, Value *Mask,
+                                               Value *Stride, Value *EVL,
+                                               unsigned Alignment) const {
+  Type *const EleTy = Ty->getScalarType();
+  // Only a single (scalar) base pointer is supported.
+  assert(!Ptr->getType()->isVectorTy() && "Unsupported interleaved load");
+  assert(multi_llvm::isOpaqueOrPointeeTypeMatches(
+      cast<PointerType>(Ptr->getType()), EleTy));
+
+  // Form the per-lane addresses: Ptr + Stride * <index sequence>.
+  auto const EC = multi_llvm::getVectorElementCount(Ty);
+  Value *const Base = B.CreateVectorSplat(EC, Ptr, "BroadcastAddr");
+  Value *const Strides = B.CreateVectorSplat(EC, Stride);
+  Value *const Steps =
+      multi_llvm::createIndexSequence(B, Strides->getType(), EC);
+  VECZ_FAIL_IF(!Steps);
+  Value *const Offsets = B.CreateMul(Strides, Steps);
+  Value *const Addrs = B.CreateGEP(EleTy, Base, Offsets);
+
+  // The strided access is then performed as a masked gather.
+  return createMaskedGatherLoad(B, Ty, Addrs, Mask, EVL, Alignment);
+}
+
+Value *TargetInfo::createMaskedInterleavedStore(IRBuilder<> &B, Value *Data,
+                                                Value *Ptr, Value *Mask,
+                                                Value *Stride, Value *EVL,
+                                                unsigned Alignment) const {
+  Type *const EleTy = Data->getType()->getScalarType();
+  // Only a single (scalar) base pointer is supported.
+  assert(!Ptr->getType()->isVectorTy() && "Unsupported interleaved store");
+  assert(multi_llvm::isOpaqueOrPointeeTypeMatches(
+      cast<PointerType>(Ptr->getType()), EleTy));
+
+  // Form the per-lane addresses Ptr + Stride * <index sequence>, then scatter.
+  auto const EC = multi_llvm::getVectorElementCount(Data->getType());
+  Value *const Base = B.CreateVectorSplat(EC, Ptr, "BroadcastAddr");
+  Value *const Strides = B.CreateVectorSplat(EC, Stride);
+  Value *const Steps =
+      multi_llvm::createIndexSequence(B, Strides->getType(), EC);
+  VECZ_FAIL_IF(!Steps);
+  Value *const Offsets = B.CreateMul(Strides, Steps);
+  Value *const Addrs = B.CreateGEP(EleTy, Base, Offsets);
+
+  return createMaskedScatterStore(B, Data, Addrs, Mask, EVL, Alignment);
+}
+
+Value *TargetInfo::createGatherLoad(IRBuilder<> &B, Type *Ty, Value *Ptr,
+                                    Value *EVL, unsigned Alignment) const {
+  auto const Lanes = multi_llvm::getVectorElementCount(Ty);
+  auto *const Ones = B.CreateVectorSplat(Lanes, B.getTrue());  // all-true mask
+  return createMaskedGatherLoad(B, Ty, Ptr, Ones, EVL, Alignment);
+}
+
+Value *TargetInfo::createScatterStore(IRBuilder<> &B, Value *Data, Value *Ptr,
+                                      Value *EVL, unsigned Alignment) const {
+  auto const Lanes = multi_llvm::getVectorElementCount(Data->getType());
+  auto *const Ones = B.CreateVectorSplat(Lanes, B.getTrue());  // all-true mask
+  return createMaskedScatterStore(B, Data, Ptr, Ones, EVL, Alignment);
+}
+
+Value *TargetInfo::createMaskedGatherLoad(IRBuilder<> &B, Type *Ty, Value *Ptr,
+                                          Value *Mask, Value *EVL,
+                                          unsigned Alignment) const {
+  LLVMContext &Ctx = B.getContext();
+  BasicBlock *Entry = B.GetInsertBlock();
+  BasicBlock *Exit = nullptr;
+  Function *F = Entry->getParent();
+  VECZ_FAIL_IF(!F || !Ptr || !Mask);
+  // Ptr must be a vector of pointers.
+  auto *VecPtrTy = dyn_cast<VectorType>(Ptr->getType());
+  VECZ_FAIL_IF(!VecPtrTy);
+  PointerType *PtrTy = dyn_cast<PointerType>(VecPtrTy->getElementType());
+  VECZ_FAIL_IF(!PtrTy);
+  Type *EleTy = Ty->getScalarType();
+  Constant *DefaultEleData = UndefValue::get(EleTy);
+  // Vector data: use a VP gather if possible, else an LLVM masked gather.
+  if (Ty->isVectorTy()) {
+    auto const Legality = isVPGatherLegal(F, Ty, Alignment);
+    if (EVL && Legality.isVPLegal()) {
+      SmallVector<llvm::Value *, 2> Args = {Ptr, Mask, EVL};
+      SmallVector<llvm::Type *, 2> Tys = {Ty, VecPtrTy};
+      return B.CreateIntrinsic(llvm::Intrinsic::vp_gather, Tys, Args);
+    } else if (Legality.isMaskLegal()) {
+      Function *MaskedGather = Intrinsic::getDeclaration(
+          F->getParent(), Intrinsic::masked_gather, {Ty, VecPtrTy});
+
+      if (MaskedGather) {
+        Mask = applyEVLToMask(B, EVL, Mask);  // fold the EVL into the mask
+        VECZ_FAIL_IF(!Mask);
+        // Call the intrinsic with (pointers, alignment, mask, undef).
+        Value *Args[] = {Ptr, B.getInt32(Alignment), Mask, UndefValue::get(Ty)};
+        CallInst *CI = B.CreateCall(MaskedGather, Args);
+        if (CI) {
+          CI->setCallingConv(MaskedGather->getCallingConv());
+          CI->setAttributes(MaskedGather->getAttributes());
+          return CI;
+        }
+      }
+    } else {
+      emitVeczRemarkMissed(F,
+                           "Could not create a masked gather as the target "
+                           "reported it would be illegal");
+      VECZ_FAIL();
+    }
+  }
+
+  VECZ_FAIL_IF(EVL);  // the scalarized fallback has no EVL support
+  auto VecWidth = multi_llvm::getVectorElementCount(Ty);
+  unsigned Width = VecWidth.getFixedValue();  // assumes a fixed-width Ty
+
+  // Fallback scalar function generator
+  // Create all the required blocks.
+  SmallVector<BasicBlock *, 4> TestBlocks;
+  SmallVector<BasicBlock *, 4> LoadBlocks;
+  TestBlocks.push_back(Entry);
+  LoadBlocks.push_back(BasicBlock::Create(Ctx, "masked_load", F));
+  for (unsigned i = 1; i < Width; i++) {
+    TestBlocks.push_back(BasicBlock::Create(Ctx, "test_mask", F));
+    LoadBlocks.push_back(BasicBlock::Create(Ctx, "masked_load", F));
+  }
+  Exit = BasicBlock::Create(Ctx, "masked_load_exit", F);
+  // Lanes whose mask bit is clear produce undef (DefaultEleData).
+  SmallVector<Value *, 4> LoadedLanes;
+  SmallVector<Value *, 4> LanePhis;
+  for (unsigned i = 0; i < Width; i++) {
+    BasicBlock *Next = ((i + 1) < Width) ? TestBlocks[i + 1] : Exit;
+
+    // Extract the mask elements and branch.
+    B.SetInsertPoint(TestBlocks[i]);
+    if (i > 0) {
+      PHINode *LanePhi = B.CreatePHI(EleTy, 2, "result_lane");
+      LanePhi->addIncoming(LoadedLanes[i - 1], LoadBlocks[i - 1]);
+      LanePhi->addIncoming(DefaultEleData, TestBlocks[i - 1]);
+      LanePhis.push_back(LanePhi);
+    }
+
+    Value *MaskLane = B.CreateExtractElement(Mask, B.getInt32(i), "mask_lane");
+    B.CreateCondBr(MaskLane, LoadBlocks[i], Next);
+
+    // Load the element and branch.
+    B.SetInsertPoint(LoadBlocks[i]);
+    Value *PtrLane = B.CreateExtractElement(Ptr, B.getInt32(i), "ptr_lane");
+    LoadInst *Load = B.CreateLoad(EleTy, PtrLane, false, "masked_load");
+    Load->setAlignment(MaybeAlign(Alignment).valueOrOne());
+    LoadedLanes.push_back(Load);
+    B.CreateBr(Next);
+  }
+
+  // Aggregate the loaded lanes; the builder is left positioned in Exit.
+  B.SetInsertPoint(Exit);
+  PHINode *LastLanePhi = B.CreatePHI(EleTy, 2, "result_lane");
+  LastLanePhi->addIncoming(LoadedLanes[Width - 1], LoadBlocks[Width - 1]);
+  LastLanePhi->addIncoming(DefaultEleData, TestBlocks[Width - 1]);
+  LanePhis.push_back(LastLanePhi);
+  Value *Result = UndefValue::get(Ty);
+  for (unsigned i = 0; i < Width; i++) {
+    Result = B.CreateInsertElement(Result, LanePhis[i], B.getInt32(i));
+  }
+  return Result;
+}
+
+Value *TargetInfo::createMaskedScatterStore(IRBuilder<> &B, Value *Data,
+                                            Value *Ptr, Value *Mask, Value *EVL,
+                                            unsigned Alignment) const {
+  LLVMContext &Ctx = B.getContext();
+  BasicBlock *Entry = B.GetInsertBlock();
+  BasicBlock *Exit = nullptr;
+  StoreInst *FirstStore = nullptr;
+  Function *F = Entry->getParent();
+  VECZ_FAIL_IF(!F || !Data || !Ptr || !Mask);  // Data is dereferenced below
+  auto *DataTy = Data->getType();
+  // Vector data: use a VP scatter if possible, else an LLVM masked scatter.
+  if (DataTy->isVectorTy()) {
+    auto *VecPtrTy = dyn_cast<VectorType>(Ptr->getType());
+    VECZ_FAIL_IF(!VecPtrTy);
+    auto const Legality = isVPScatterLegal(F, DataTy, Alignment);
+    if (EVL && Legality.isVPLegal()) {
+      SmallVector<llvm::Value *, 3> Args = {Data, Ptr, Mask, EVL};
+      SmallVector<llvm::Type *, 2> Tys = {Data->getType(), VecPtrTy};
+      return B.CreateIntrinsic(llvm::Intrinsic::vp_scatter, Tys, Args);
+    } else if (Legality.isMaskLegal()) {
+      Function *MaskedScatter = Intrinsic::getDeclaration(
+          F->getParent(), Intrinsic::masked_scatter, {DataTy, VecPtrTy});
+
+      if (MaskedScatter) {
+        Mask = applyEVLToMask(B, EVL, Mask);  // fold the EVL into the mask
+        VECZ_FAIL_IF(!Mask);
+        // Call the intrinsic with (data, pointers, alignment, mask).
+        Value *Args[] = {Data, Ptr, B.getInt32(Alignment), Mask};
+        CallInst *CI = B.CreateCall(MaskedScatter, Args);
+        if (CI) {
+          CI->setCallingConv(MaskedScatter->getCallingConv());
+          CI->setAttributes(MaskedScatter->getAttributes());
+          return CI;
+        }
+      }
+    } else {
+      emitVeczRemarkMissed(F,
+                           "Could not create a masked scatter as the target "
+                           "reported it would be illegal");
+      VECZ_FAIL();
+    }
+  }
+
+  VECZ_FAIL_IF(EVL);  // the scalarized fallback has no EVL support
+  auto VecWidth = multi_llvm::getVectorElementCount(DataTy);
+  unsigned Width = VecWidth.getFixedValue();  // assumes fixed-width data
+
+  // Fallback scalar function generator
+  // Create all the required blocks.
+  SmallVector<BasicBlock *, 4> TestBlocks;
+  SmallVector<BasicBlock *, 4> StoreBlocks;
+  TestBlocks.push_back(Entry);
+  StoreBlocks.push_back(BasicBlock::Create(Ctx, "masked_store", F));
+  for (unsigned i = 1; i < Width; i++) {
+    TestBlocks.push_back(BasicBlock::Create(Ctx, "test_mask", F));
+    StoreBlocks.push_back(BasicBlock::Create(Ctx, "masked_store", F));
+  }
+  Exit = BasicBlock::Create(Ctx, "masked_store_exit", F);
+
+  for (unsigned i = 0; i < Width; i++) {
+    BasicBlock *Next = ((i + 1) < Width) ? TestBlocks[i + 1] : Exit;
+
+    // Extract the mask elements and branch.
+    B.SetInsertPoint(TestBlocks[i]);
+    Value *MaskLane = B.CreateExtractElement(Mask, B.getInt32(i), "mask_lane");
+    B.CreateCondBr(MaskLane, StoreBlocks[i], Next);
+
+    // Extract the data elements and store.
+    B.SetInsertPoint(StoreBlocks[i]);
+    Value *PtrLane = B.CreateExtractElement(Ptr, B.getInt32(i), "ptr_lane");
+    Value *DataLane = B.CreateExtractElement(Data, B.getInt32(i), "data_lane");
+    StoreInst *Store = B.CreateStore(DataLane, PtrLane);
+    if (i == 0) {
+      FirstStore = Store;
+    }
+    Store->setAlignment(MaybeAlign(Alignment).valueOrOne());
+    B.CreateBr(Next);
+  }
+
+  B.SetInsertPoint(Exit);  // leave the builder positioned in the exit block
+  return FirstStore;
+}
+
+/// @brief Lowers an extractelement from the packetized vector @a src by
+/// going through memory.
+///
+/// @a src is stored to an alloca created in the function's entry block, the
+/// slot is re-interpreted as a pointer to the element type, and the result is
+/// read back with an interleaved load (scalar @a index) or a gather (vector
+/// @a index). The builder must be positioned before an existing instruction.
+///
+/// @param[in] B Builder positioned where the lowering is emitted.
+/// @param[in] Ctx Vectorization context forwarded to the load helpers.
+/// @param[in] extract Original extract; only its source operand is inspected.
+/// @param[in] narrowTy Type of the value read back.
+/// @param[in] src Packetized source vector.
+/// @param[in] index Extraction index, either scalar or vector.
+/// @param[in] VL Unused.
+///
+/// @return The value returned by the interleaved-load or gather helper.
+Value *TargetInfo::createScalableExtractElement(IRBuilder<> &B,
+                                                VectorizationContext &Ctx,
+                                                Instruction *extract,
+                                                Type *narrowTy, Value *src,
+                                                Value *index, Value *VL) const {
+  (void)VL;
+  auto *const wideTy = src->getType();
+  auto *const eltTy = wideTy->getScalarType();
+
+  // Number of elements in the source operand of the original extract.
+  unsigned const origElts =
+      multi_llvm::getVectorNumElements(extract->getOperand(0)->getType());
+
+  // Both access forms are aligned to the element size in bytes (at least 1).
+  auto const alignment =
+      MaybeAlign(eltTy->getScalarSizeInBits() / 8).valueOrOne();
+
+  // Create the staging alloca at the beginning of the function.
+  auto const resumeIt = B.GetInsertPoint();
+  auto const entryIt =
+      B.GetInsertBlock()->getParent()->getEntryBlock().getFirstInsertionPt();
+  B.SetInsertPoint(&*entryIt);
+  auto *const alloc = B.CreateAlloca(wideTy, nullptr, "fixlen.alloc");
+
+  // Return to the original insertion point.
+  B.SetInsertPoint(&*resumeIt);
+
+  // Store the packetized vector to the allocation.
+  B.CreateStore(src, alloc);
+
+  // Re-interpret the allocation as a pointer to the element type.
+  auto *const base = B.CreatePointerBitCastOrAddrSpaceCast(
+      alloc, eltTy->getPointerTo(), "bcast.alloc");
+
+  if (!index->getType()->isVectorTy()) {
+    // A scalar (uniform) index permits a strided load that starts at
+    // '&alloc[index]' and steps by the original vector width:
+    // &alloc[index], &alloc[index+N], &alloc[index+2N], ...
+    auto *const stride = getSizeInt(B, origElts);
+    auto *const ptr = B.CreateInBoundsGEP(eltTy, base, index, "vec.alloc");
+    return ::createInterleavedLoad(Ctx, narrowTy, ptr, stride, /*Mask*/ nullptr,
+                                   /*EVL*/ nullptr, alignment.value(), "",
+                                   &*B.GetInsertPoint());
+  }
+
+  // A varying (vector) index needs a gather instead. Take the indices and
+  // add them to a step multiplied by the original vector width, then use
+  // that to create a vector of pointers into the allocation.
+  index = getGatherIndicesVector(B, index, index->getType(), origElts, "idx");
+  auto *const ptrs = B.CreateInBoundsGEP(eltTy, base, index, "vec.alloc");
+  return ::createGather(Ctx, narrowTy, ptrs, /*Mask*/ nullptr, /*EVL*/ nullptr,
+                        alignment.value(), "", &*B.GetInsertPoint());
+}
+
+Value *TargetInfo::createOuterScalableBroadcast(IRBuilder<> &builder,
+                                                Value *vector, Value *VL,
+                                                ElementCount factor) const {
+  // The outer broadcast forms its source indices with URem.
+  return createScalableBroadcast(builder, vector, VL, factor, /* URem */ true);
+}
+
+Value *TargetInfo::createInnerScalableBroadcast(IRBuilder<> &builder,
+                                                Value *vector, Value *VL,
+                                                ElementCount factor) const {
+  // The inner broadcast forms its source indices with UDiv rather than URem.
+  return createScalableBroadcast(builder, vector, VL, factor, /* URem */ false);
+}
+
+// Shared implementation of the inner and outer scalable broadcasts.
+Value *TargetInfo::createScalableBroadcast(IRBuilder<> &B, Value *vector,
+                                           Value *VL, ElementCount factor,
+                                           bool URem) const {
+  (void)VL;
+  auto *const srcTy = vector->getType();
+  auto const srcEC = multi_llvm::getVectorElementCount(srcTy);
+  // The result has `factor` times the (minimum) element count of the input.
+  auto *const wideTy = ScalableVectorType::get(
+      multi_llvm::getVectorElementType(srcTy),
+      factor.getKnownMinValue() * srcEC.getKnownMinValue());
+
+  // Place the staging alloca after any allocas leading the entry block.
+  auto allocaPos = B.GetInsertBlock()->getParent()->getEntryBlock().begin();
+  while (isa<AllocaInst>(*allocaPos)) {
+    ++allocaPos;
+  }
+  auto *const alloc =
+      IRBuilder<>(&*allocaPos).CreateAlloca(srcTy, nullptr, "fixlen.alloc");
+
+  // Store the vector to the allocation and view it as an array of elements.
+  B.CreateStore(vector, alloc);
+  auto *const eltTy = cast<llvm::VectorType>(srcTy)->getElementType();
+  auto *const base = B.CreatePointerBitCastOrAddrSpaceCast(
+      alloc, eltTy->getPointerTo(), "bcast.alloc");
+
+  // Gather every result lane from the source element chosen by the index
+  // vector, with all lanes enabled.
+  auto *const idxTy = ScalableVectorType::get(B.getInt32Ty(), wideTy);
+  auto *const indices =
+      TargetInfo::createBroadcastIndexVector(B, idxTy, factor, URem, "idx1");
+  auto *const ptrs = B.CreateInBoundsGEP(eltTy, base, indices, "vec.alloc");
+  auto *const allTrue = B.CreateVectorSplat(
+      multi_llvm::getVectorElementCount(wideTy),
+      ConstantInt::getTrue(B.getContext()), "truemask");
+  // Set the alignment to that of the vector element type.
+  auto const alignment =
+      MaybeAlign(eltTy->getScalarSizeInBits() / 8).valueOrOne();
+  return B.CreateMaskedGather(wideTy, ptrs, alignment, allTrue,
+                              UndefValue::get(wideTy));
+}
+
+Value *TargetInfo::createBroadcastIndexVector(IRBuilder<> &B, Type *ty,
+                                              ElementCount factor, bool URem,
+                                              const llvm::Twine &N) {
+  // Begin with the step vector <0, 1, 2, ...> in the requested index type.
+  auto *const stepVec = B.CreateStepVector(ty, "idx0");
+  auto const idxEC = multi_llvm::getVectorElementCount(ty);
+  unsigned const factorMin = factor.getKnownMinValue();
+
+  // With URem, lane i maps to i % (minimum lane count / factor); otherwise
+  // lane i maps to i / factor.
+  unsigned const operand =
+      URem ? idxEC.getKnownMinValue() / factorMin : factorMin;
+  Instruction::BinaryOps const opcode =
+      URem ? BinaryOperator::URem : BinaryOperator::UDiv;
+
+  // Apply the operation lane-wise against a splat of the constant operand.
+  auto *const operandSplat = B.CreateVectorSplat(
+      idxEC, ConstantInt::get(multi_llvm::getVectorElementType(ty), operand));
+  return B.CreateBinOp(opcode, stepVec, operandSplat, N);
+}
+
+Value *TargetInfo::createScalableInsertElement(IRBuilder<> &B,
+                                               VectorizationContext &Ctx,
+                                               Instruction *insert, Value *elt,
+                                               Value *into, Value *index,
+                                               Value *VL) const {
+  (void)VL;  // VL is not used by this implementation.
+  auto *eltTy = elt->getType();
+  auto *intoTy = into->getType();
+  auto *scalarTy = elt->getType()->getScalarType();
+
+  // The alloca must be inserted at the beginning of the function.
+  auto allocaIt =
+      B.GetInsertBlock()->getParent()->getEntryBlock().getFirstInsertionPt();
+  auto it = B.GetInsertPoint();
+
+  B.SetInsertPoint(&*allocaIt);
+  auto *const alloc = B.CreateAlloca(intoTy, nullptr);
+
+  // Reset the insertion point to wherever we must insert instructions
+  B.SetInsertPoint(&*it);
+
+  // Store the wide vector to the allocation
+  B.CreateStore(into, alloc);
+
+  // Re-interpret the allocation as a pointer to the element type
+  auto *const eltptrTy = scalarTy->getPointerTo();
+  auto *const bcastalloc =
+      B.CreatePointerBitCastOrAddrSpaceCast(alloc, eltptrTy, "bcast.alloc");
+
+  unsigned const fixedVecElts =
+      multi_llvm::getVectorNumElements(insert->getOperand(0)->getType());
+
+  // Write the new elements over the stored vector: a strided store when the
+  // index is a scalar, otherwise a scatter through a vector of pointers.
+  if (!index->getType()->isVectorTy()) {
+    // If the index remains a scalar (is uniform) then we can use a strided
+    // store starting from the address '&alloc[index]', strided by the original
+    // vector width: &alloc[index], &alloc[index+N], &alloc[index+2N], ...
+    auto *const stride = getSizeInt(B, fixedVecElts);
+    auto alignment =
+        MaybeAlign(scalarTy->getScalarSizeInBits() / 8).valueOrOne();
+    // Index into the allocation, coming back with the address '&alloc[index]'
+    // from which to begin our strided store. The index is a scalar here, so
+    // this is a scalar pointer.
+    auto *const gep =
+        B.CreateInBoundsGEP(scalarTy, bcastalloc, index, "vec.alloc");
+
+    Value *store = ::createInterleavedStore(
+        Ctx, elt, gep, stride, /*Mask*/ nullptr,
+        /*EVL*/ nullptr, alignment.value(), "", &*B.GetInsertPoint());
+    VECZ_FAIL_IF(!store);
+  } else {
+    // Else if we've got a varying, vector index, then we must use a scatter.
+    // Take our indices, and add them to a step multiplied by the original
+    // vector width. Use that to create a vector of pointers.
+    auto alignment =
+        MaybeAlign(scalarTy->getScalarSizeInBits() / 8).valueOrOne();
+
+    auto narrowEltCount = multi_llvm::getVectorElementCount(eltTy);
+
+    auto *steps = B.CreateStepVector(index->getType(), "idx0");
+    auto *const fixedVecEltsSplat = B.CreateVectorSplat(
+        narrowEltCount,
+        ConstantInt::get(index->getType()->getScalarType(), fixedVecElts));
+    auto *const stepsMul = B.CreateMul(steps, fixedVecEltsSplat, "idx.scale");
+    index = B.CreateAdd(stepsMul, index, "idx");
+
+    // Index into the allocation, coming back with the vector of pointers
+    // through which the scatter writes.
+    auto *const gep =
+        B.CreateInBoundsGEP(scalarTy, bcastalloc, index, "vec.alloc");
+
+    Value *store = ::createScatter(Ctx, elt, gep, /*Mask*/ nullptr,
+                                   /*EVL*/ nullptr, alignment.value(), "",
+                                   &*B.GetInsertPoint());
+    VECZ_FAIL_IF(!store);
+  }
+
+  // Load the vector back from the stack
+  return B.CreateLoad(intoTy, alloc);
+}
+
+bool TargetInfo::isVPVectorLegal(const Function &F, Type *Ty) const {
+  if (!TM_) return true;  // No TargetMachine to consult: treat as legal.
+  return TM_->getTargetTransformInfo(F).isElementTypeLegalForScalableVector(
+      multi_llvm::getVectorElementType(Ty));
+}
+
+TargetInfo::VPMemOpLegality TargetInfo::checkMemOpLegality(
+    const Function *F,
+    function_ref<bool(const llvm::TargetTransformInfo &, Type *, unsigned)>
+        Checker,
+    Type *Ty, unsigned Alignment) const {
+  assert(Ty->isVectorTy() && "Expected a vector type");
+  // Only scalable vectors on a known target are put to the masked-op check.
+  bool isMaskLegal = true;
+  if (TM_ && isa<ScalableVectorType>(Ty)) {
+    isMaskLegal = Checker(TM_->getTargetTransformInfo(*F), Ty, Alignment);
+  }
+  if (!isMaskLegal || !isVPVectorLegal(*F, Ty)) {
+    return {false, isMaskLegal};
+  }
+  // Also vet a vector of pointer-sized integers; assume 64 bits without TM_.
+  const unsigned PtrBitWidth =
+      TM_ ? TM_->createDataLayout().getPointerTypeSizeInBits(
+                Ty->getPointerTo())
+          : 64;
+  auto *const IntTy = IntegerType::get(Ty->getContext(), PtrBitWidth);
+  auto *const IntVecTy =
+      VectorType::get(IntTy, multi_llvm::getVectorElementCount(Ty));
+  return {isVPVectorLegal(*F, IntVecTy), isMaskLegal};
+}
+
+TargetInfo::VPMemOpLegality TargetInfo::isVPLoadLegal(
+    const Function *F, Type *Ty, unsigned Alignment) const {  // Masked load.
+  return checkMemOpLegality(F, isLegalMaskedLoad, Ty, Alignment);
+}
+
+TargetInfo::VPMemOpLegality TargetInfo::isVPStoreLegal(
+    const Function *F, Type *Ty, unsigned Alignment) const {  // Masked store.
+  return checkMemOpLegality(F, isLegalMaskedStore, Ty, Alignment);
+}
+
+TargetInfo::VPMemOpLegality TargetInfo::isVPGatherLegal(
+    const Function *F, Type *Ty, unsigned Alignment) const {  // Masked gather.
+  return checkMemOpLegality(F, isLegalMaskedGather, Ty, Alignment);
+}
+
+TargetInfo::VPMemOpLegality TargetInfo::isVPScatterLegal(
+    const Function *F, Type *Ty, unsigned Alignment) const {  // Masked scatter.
+  return checkMemOpLegality(F, isLegalMaskedScatter, Ty, Alignment);
+}
+
+bool TargetInfo::isLegalVPElementType(Type *) const { return true; }  // Generic
+
+llvm::Value *TargetInfo::createVectorShuffle(llvm::IRBuilder<> &B,
+                                             llvm::Value *src,
+                                             llvm::Value *mask,
+                                             llvm::Value *evl) const {
+  // Generic shuffle: result[i] = src[mask[i]], by way of a stack slot.
+  auto *const srcTy = dyn_cast<VectorType>(src->getType());
+  auto *const maskTy = dyn_cast<VectorType>(mask->getType());
+  assert(
+      srcTy && maskTy &&
+      "TargetInfo::createVectorShuffle: source and mask must have vector type");
+  if (isa<Constant>(mask)) {
+    // Special case if the mask happens to be a constant.
+    return B.CreateShuffleVector(src, UndefValue::get(srcTy), mask);
+  }
+
+  // The alloca must be inserted at the beginning of the function.
+  auto *const curBlock = B.GetInsertBlock();
+  auto &entryBlock = curBlock->getParent()->getEntryBlock();
+  auto const allocaIt = entryBlock.getFirstInsertionPt();
+  auto const it = B.GetInsertPoint();
+
+  B.SetInsertPoint(&entryBlock, allocaIt);
+  auto *const alloc = B.CreateAlloca(srcTy, nullptr);
+
+  // Reset the insertion point to wherever we must insert instructions
+  B.SetInsertPoint(curBlock, it);
+
+  // Store the wide vector to the allocation
+  B.CreateStore(src, alloc);
+
+  auto *const eltTy = srcTy->getElementType();
+
+  // Re-interpret the allocation as a pointer to the element type
+  auto *const eltptrTy = eltTy->getPointerTo();
+  auto *const bcastalloc =
+      B.CreatePointerBitCastOrAddrSpaceCast(alloc, eltptrTy, "bcast.alloc");
+
+  // Index into the allocation.
+  auto *const gep = B.CreateInBoundsGEP(eltTy, bcastalloc, mask, "vec.alloc");
+
+  auto const eltCount = maskTy->getElementCount();
+  auto *const dstTy = VectorType::get(eltTy, eltCount);
+  auto const alignment =
+      MaybeAlign(eltTy->getScalarSizeInBits() / 8).valueOrOne();
+
+  Value *gatherMask = nullptr;
+  if (evl) {
+    // One mask lane per result (shuffle-mask) element; lanes below EVL are on.
+    auto *const IndexTy = VectorType::get(evl->getType(), eltCount);
+    auto *const step = B.CreateStepVector(IndexTy);
+    gatherMask = B.CreateICmpULT(step, B.CreateVectorSplat(eltCount, evl));
+  } else {
+    gatherMask = B.CreateVectorSplat(eltCount, B.getTrue());
+  }
+
+  return B.CreateMaskedGather(dstTy, gep, alignment, gatherMask,
+                              UndefValue::get(dstTy));
+}
+
+llvm::Value *TargetInfo::createVectorSlideUp(llvm::IRBuilder<> &B,
+                                             llvm::Value *src,
+                                             llvm::Value *insert,
+                                             llvm::Value *) const {
+  auto *const srcTy = dyn_cast<VectorType>(src->getType());
+  assert(srcTy &&
+         "TargetInfo::createVectorSlideUp: source must have vector type");
+  // Result: lane 0 holds `insert`, and every lane i > 0 holds src[i - 1].
+  auto *const undef = UndefValue::get(srcTy);
+  auto const EC = srcTy->getElementCount();
+  if (!EC.isScalable()) {
+    // Special case for fixed-width vectors
+    auto const width = EC.getFixedValue();
+    SmallVector<int, 16> mask(width);
+    auto it = mask.begin();
+    *it++ = 0;  // Placeholder: lane 0 is overwritten by the insert below.
+    for (size_t i = 1; i < width; ++i) {
+      *it++ = i - 1;
+    }
+
+    auto *const rotate =
+        createOptimalShuffle(B, src, undef, mask, Twine("slide_up"));
+    return B.CreateInsertElement(rotate, insert, B.getInt64(0), "slide_in");
+  }
+  // Scalable vectors: splice the last (undef) lane in ahead of src's lanes.
+  auto *const rotate = B.CreateVectorSplice(undef, src, -1, "slide_up");
+  return B.CreateInsertElement(rotate, insert, B.getInt64(0), "slide_in");
+}
+
+bool TargetInfo::canOptimizeInterleavedGroup(const Instruction &val,
+                                             InterleavedOperation Kind,
+                                             int Stride,
+                                             unsigned GroupSize) const {
+  // Only 2-way and 4-way interleaved groups are handled here.
+  if ((Stride != 2) && (Stride != 4)) {
+    return false;
+  }
+  VECZ_FAIL_IF((int)GroupSize != abs(Stride));
+  VECZ_FAIL_IF((Kind != eInterleavedLoad) && (Kind != eInterleavedStore) &&
+               (Kind != eMaskedInterleavedLoad) &&
+               (Kind != eMaskedInterleavedStore));
+  // Stores carry their data in operand 0; loads produce it as their result.
+  bool const isStore =
+      (Kind == eInterleavedStore) || (Kind == eMaskedInterleavedStore);
+  Type *const DataType =
+      isStore ? val.getOperand(0)->getType() : val.getType();
+  VECZ_FAIL_IF(!DataType);
+  VECZ_FAIL_IF(!isa<FixedVectorType>(DataType));
+  return true;
+}
+
+bool TargetInfo::optimizeInterleavedGroup(IRBuilder<> &B,
+                                          InterleavedOperation Kind,
+                                          ArrayRef<Value *> Group,
+                                          ArrayRef<Value *> Masks,
+                                          Value *Address, int Stride) const {
+  // Rewrite the group as contiguous vector accesses plus (de)interleaves.
+  VECZ_FAIL_IF(Stride < 0);
+  VECZ_FAIL_IF(Group.empty());  // The first operation is inspected below.
+  // Validate the operations in the group.
+  SmallVector<CallInst *, 4> Calls;
+  for (unsigned i = 0; i < Group.size(); i++) {
+    CallInst *Op = dyn_cast<CallInst>(Group[i]);
+    VECZ_FAIL_IF(!Op);
+    Calls.push_back(Op);
+  }
+  PointerType *PtrTy = dyn_cast<PointerType>(Address->getType());
+  VECZ_FAIL_IF(!PtrTy);
+  CallInst *Op0 = Calls[0];
+  VECZ_FAIL_IF(!canOptimizeInterleavedGroup(*Op0, Kind, Stride, Group.size()));
+  // canOptimizeInterleavedGroup() performs several checks, including valid
+  // Kind and Op0 types. Thus, these casts are safe.
+  FixedVectorType *VecTy = nullptr;
+  if (Kind == eInterleavedStore || Kind == eMaskedInterleavedStore) {
+    VecTy = cast<FixedVectorType>(Op0->getOperand(0)->getType());
+  } else {  // eInterleavedLoad || eMaskedInterleavedLoad
+    VecTy = cast<FixedVectorType>(Op0->getType());
+  }
+
+  auto VecWidth = multi_llvm::getVectorElementCount(VecTy);
+  unsigned SimdWidth = VecWidth.getFixedValue();
+  Type *EleTy = VecTy->getElementType();
+  assert(multi_llvm::isOpaqueOrPointeeTypeMatches(PtrTy, EleTy) &&
+         "Unhandled interleaved accesses");
+  unsigned Align = EleTy->getScalarSizeInBits() / 8;
+
+  bool HasMask =
+      (Kind == eMaskedInterleavedLoad) || (Kind == eMaskedInterleavedStore);
+  VECZ_FAIL_IF(HasMask && (Masks.size() != Group.size()));  // One mask per op.
+  SmallVector<Value *, 4> Vectors;
+  SmallVector<Value *, 4> VecMasks(Masks.begin(), Masks.end());
+  if (Kind == eInterleavedLoad || Kind == eMaskedInterleavedLoad) {
+    // Create one regular vector load per interleaved load in the group.
+    if (HasMask) {
+      VECZ_FAIL_IF(!interleaveVectors(B, VecMasks, true));
+    }
+
+    for (unsigned i = 0; i < Group.size(); i++) {
+      Value *AddressN = Address;
+      if (i > 0) {
+        unsigned Offset = i * SimdWidth;  // The i-th contiguous vector.
+        AddressN = B.CreateGEP(EleTy, Address, B.getInt32(Offset));
+      }
+      Value *Load = nullptr;
+      if (!HasMask) {
+        Load = createLoad(B, VecTy, AddressN, getSizeInt(B, 1));
+      } else {
+        Value *Mask = VecMasks[i];
+        Load =
+            createMaskedLoad(B, VecTy, AddressN, Mask, /*EVL*/ nullptr, Align);
+      }
+      VECZ_FAIL_IF(!Load);
+      Vectors.push_back(Load);
+    }
+    // Transpose the loaded vectors and replace the original loads.
+    VECZ_FAIL_IF(!interleaveVectors(B, Vectors, false));
+    for (unsigned i = 0; i < Group.size(); i++) {
+      Value *Vector = Vectors[i];
+      Value *OrigLoad = Group[i];
+      OrigLoad->replaceAllUsesWith(Vector);
+    }
+  } else if (Kind == eInterleavedStore || Kind == eMaskedInterleavedStore) {
+    // Transpose the vectors to store with interleave.
+    for (unsigned i = 0; i < Group.size(); i++) {
+      CallInst *OrigStore = cast<CallInst>(Group[i]);
+      Vectors.push_back(OrigStore->getOperand(0));
+    }
+    VECZ_FAIL_IF(!interleaveVectors(B, Vectors, true));
+    if (HasMask) {
+      VECZ_FAIL_IF(!interleaveVectors(B, VecMasks, true));
+    }
+    // Create one regular vector store per interleaved store in the group.
+    for (unsigned i = 0; i < Group.size(); i++) {
+      Value *Vector = Vectors[i];
+      Value *AddressN = Address;
+      if (i > 0) {
+        unsigned Offset = i * SimdWidth;  // The i-th contiguous vector.
+        AddressN = B.CreateGEP(EleTy, Address, B.getInt32(Offset));
+      }
+      Value *Store = nullptr;
+      if (!HasMask) {
+        Store = createStore(B, Vector, AddressN, getSizeInt(B, 1), Align);
+      } else {
+        Value *Mask = VecMasks[i];
+        Store = createMaskedStore(B, Vector, AddressN, Mask, /*EVL*/ nullptr,
+                                  Align);
+      }
+      VECZ_FAIL_IF(!Store);
+    }
+  }
+
+  return true;
+}
+
+bool TargetInfo::interleaveVectors(IRBuilder<> &B,
+                                   MutableArrayRef<Value *> Vectors,
+                                   bool Forward) const {
+  const unsigned Stride = Vectors.size();  // N vectors => N-way operation.
+  if (Stride == 0) {
+    return true;
+  }
+  auto *VecTy = dyn_cast<FixedVectorType>(Vectors[0]->getType());
+  VECZ_FAIL_IF(!VecTy);  // Fixed-width vectors only.
+  if (Stride == 1) {
+    return true;
+  }
+  const unsigned Width = VecTy->getNumElements();
+  VECZ_FAIL_IF(Width < Stride);
+  VECZ_FAIL_IF((Width % Stride) != 0);
+  for (unsigned i = 1; i < Stride; i++) {
+    auto *VecTyN = dyn_cast<FixedVectorType>(Vectors[i]->getType());
+    VECZ_FAIL_IF(!VecTyN || (VecTyN != VecTy));  // All types must match.
+  }
+
+  // Prepare the 2-way shuffle masks (indices into the two sources, joined).
+  SmallVector<unsigned, 4> MaskLow2;
+  SmallVector<unsigned, 4> MaskHigh2;
+
+  StringRef Name;
+  if (Forward) {
+    Name = "interleave";  // Low/high masks zip the sources' low/high halves.
+    const unsigned Width2 = Width >> 1;
+    const unsigned Width3 = Width2 + Width;
+    for (unsigned i = 0; i < Width2; ++i) {
+      MaskLow2.push_back(i);
+      MaskHigh2.push_back(i + Width2);
+      MaskLow2.push_back(i + Width);
+      MaskHigh2.push_back(i + Width3);
+    }
+  } else {
+    Name = "deinterleave";  // Low/high masks pick the even/odd elements.
+    const unsigned Width2 = Width << 1;
+    for (unsigned i = 0; i < Width2; i += 2) {
+      MaskLow2.push_back(i);
+      MaskHigh2.push_back(i + 1);
+    }
+  }
+  Constant *CMaskLow2 = ConstantDataVector::get(B.getContext(), MaskLow2);
+  Constant *CMaskHigh2 = ConstantDataVector::get(B.getContext(), MaskHigh2);
+
+  if (Stride == 2) {
+    Value *Src0 = Vectors[0];
+    Value *Src1 = Vectors[1];
+    Vectors[0] = B.CreateShuffleVector(Src0, Src1, CMaskLow2, Name);
+    Vectors[1] = B.CreateShuffleVector(Src0, Src1, CMaskHigh2, Name);
+
+    return true;
+  } else if (Stride == 4) {
+    // For a 4-way interleave, we need two layers of shuffles.
+    // Starting with vectors   a..A : b..B : c..C : d..D
+    // first shuffle layer  -> ab.. : ..AB : cd.. : ..CD
+    // second shuffle layer -> abcd : .... : .... : ABCD
+    Value *Src0 = Vectors[0];
+    Value *Src1 = Vectors[1];
+    Value *Src2 = Vectors[2];
+    Value *Src3 = Vectors[3];
+
+    Constant *CMaskLow4 = nullptr;
+    Constant *CMaskHigh4 = nullptr;
+    if (Forward) {
+      SmallVector<unsigned, 4> MaskLow4;
+      SmallVector<unsigned, 4> MaskHigh4;
+      const unsigned Width2 = Width >> 1;
+      const unsigned Width3 = Width2 + Width;
+      for (unsigned i = 0; i < Width2; i += 2) {
+        MaskLow4.push_back(i);
+        MaskLow4.push_back(i + 1);
+        MaskLow4.push_back(i + Width);
+        MaskLow4.push_back(i + 1 + Width);
+        MaskHigh4.push_back(Width2 + i);
+        MaskHigh4.push_back(Width2 + i + 1);
+        MaskHigh4.push_back(Width3 + i);
+        MaskHigh4.push_back(Width3 + i + 1);
+      }
+      CMaskLow4 = ConstantDataVector::get(B.getContext(), MaskLow4);
+      CMaskHigh4 = ConstantDataVector::get(B.getContext(), MaskHigh4);
+    } else {
+      SmallVector<unsigned, 4> MaskLow4;
+      SmallVector<unsigned, 4> MaskHigh4;
+      const unsigned Width2 = Width << 1;
+      for (unsigned i = 0; i < Width2; i += 4) {
+        MaskLow4.push_back(i);
+        MaskLow4.push_back(i + 1);
+        MaskHigh4.push_back(i + 2);
+        MaskHigh4.push_back(i + 3);
+      }
+
+      // To de-interleave, swap roles: these masks go first, even/odd second.
+      CMaskLow4 = CMaskLow2;
+      CMaskHigh4 = CMaskHigh2;
+      CMaskLow2 = ConstantDataVector::get(B.getContext(), MaskLow4);
+      CMaskHigh2 = ConstantDataVector::get(B.getContext(), MaskHigh4);
+    }
+
+    Value *Tmp0 = B.CreateShuffleVector(Src0, Src1, CMaskLow2, Name);
+    Value *Tmp1 = B.CreateShuffleVector(Src0, Src1, CMaskHigh2, Name);
+    Value *Tmp2 = B.CreateShuffleVector(Src2, Src3, CMaskLow2, Name);
+    Value *Tmp3 = B.CreateShuffleVector(Src2, Src3, CMaskHigh2, Name);
+    Vectors[0] = B.CreateShuffleVector(Tmp0, Tmp2, CMaskLow4, Name);
+    Vectors[1] = B.CreateShuffleVector(Tmp0, Tmp2, CMaskHigh4, Name);
+    Vectors[2] = B.CreateShuffleVector(Tmp1, Tmp3, CMaskLow4, Name);
+    Vectors[3] = B.CreateShuffleVector(Tmp1, Tmp3, CMaskHigh4, Name);
+
+    return true;
+  }
+  return false;  // Only 2-way and 4-way operations are implemented.
+}
+
+unsigned TargetInfo::estimateSimdWidth(const TargetTransformInfo &TTI,
+                                       const ArrayRef<const Value *> vals,
+                                       unsigned width) const {
+  // Halve `width` until the values in `vals` fit the vector register file.
+  unsigned MaxVecRegBitWidth = multi_llvm::getFixedValue(
+      TTI.getRegisterBitWidth(llvm::TargetTransformInfo::RGK_FixedWidthVector));
+  unsigned NumVecRegs =
+      TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true));
+
+  unsigned VaryingUsage = 0;  // Summed bit size of the values in `vals`.
+  for (const auto *VI : vals) {
+    const auto *Ty = VI->getType();
+    // TM_ may be null: assume 64-bit pointers, as checkMemOpLegality does.
+    VaryingUsage +=
+        !Ty->isPointerTy() ? Ty->getPrimitiveSizeInBits()
+        : TM_ ? TM_->getPointerSizeInBits(Ty->getPointerAddressSpace())
+              : 64;
+  }
+  unsigned const MaxBits = MaxVecRegBitWidth * NumVecRegs;
+  while (VaryingUsage * width > MaxBits) {
+    width >>= 1;
+  }
+  return width;
+}
+
+unsigned TargetInfo::getVectorWidthForType(const llvm::TargetTransformInfo &TTI,
+                                           const llvm::Type &Ty) const {
+  unsigned const RegBits = multi_llvm::getFixedValue(
+      TTI.getRegisterBitWidth(llvm::TargetTransformInfo::RGK_FixedWidthVector));
+  if (RegBits == 0) {
+    return 0;
+  }
+  // A pointer's width comes from the TargetMachine; lacking one, it stays 0.
+  unsigned EltBits = 0;
+  if (Ty.isPtrOrPtrVectorTy()) {
+    if (TM_) {
+      EltBits = TM_->getPointerSizeInBits(Ty.getPointerAddressSpace());
+    }
+  } else {
+    EltBits = Ty.getScalarSizeInBits();
+  }
+  if (EltBits == 0) {
+    // Couldn't work out the vector width..
+    return 0;
+  }
+
+  // The floor of 8 prevents poor double precision performance.
+  // Not sure why (CA-3461 related?)
+  return std::max(RegBits / EltBits, 8u);
+}
+
+bool TargetInfo::canPacketize(const llvm::Value *, ElementCount) const {
+  return true;  // The generic TargetInfo places no restriction here.
+}
+
+namespace vecz {  // Forward declarations of the per-architecture factories.
+std::unique_ptr<TargetInfo> createTargetInfoArm(TargetMachine *tm);
+
+std::unique_ptr<TargetInfo> createTargetInfoAArch64(TargetMachine *tm);
+
+std::unique_ptr<TargetInfo> createTargetInfoRISCV(TargetMachine *tm);
+}  // namespace vecz
+
+std::unique_ptr<TargetInfo> vecz::createTargetInfoFromTargetMachine(
+    TargetMachine *tm) {
+  // The TargetMachine is allowed to be null, in which case there is no
+  // architecture to dispatch on and the generic TargetInfo is returned.
+  if (!tm) {
+    return std::make_unique<TargetInfo>(tm);
+  }
+  switch (tm->getTargetTriple().getArch()) {
+    case Triple::arm:
+      return createTargetInfoArm(tm);
+    case Triple::aarch64:
+      return createTargetInfoAArch64(tm);
+    case Triple::riscv32:
+    case Triple::riscv64:
+      return createTargetInfoRISCV(tm);
+    default:
+      // Just use the generic TargetInfo unless we know better
+      return std::make_unique<TargetInfo>(tm);
+  }
+}
+
+AnalysisKey TargetInfoAnalysis::Key;  // Definition of the static key member.
+
+TargetInfoAnalysis::TargetInfoAnalysis()
+    : TICallback([](const Module &) {  // No TargetMachine: generic info.
+        return std::make_unique<TargetInfo>(/*TM*/ nullptr);
+      }) {}
+
+TargetInfoAnalysis::TargetInfoAnalysis(TargetMachine *TM)
+    : TICallback([TM](const Module &) {  // Picks a TargetInfo based on TM.
+        return vecz::createTargetInfoFromTargetMachine(TM);
+      }) {}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_arm.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_arm.cpp
new file mode 100644
index 0000000000000..cb7b220cabd45
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_arm.cpp
@@ -0,0 +1,408 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/Instructions.h>
+#include <llvm/IR/IntrinsicsAArch64.h>
+#include <llvm/IR/IntrinsicsARM.h>
+
+#include <multi_llvm/vector_type_helper.h>
+
+#include "debugging.h"
+#include "vecz/vecz_target_info.h"
+
+using namespace vecz;
+using namespace llvm;
+
+namespace vecz {
+
+class TargetInfoArm final : public TargetInfo {  // Uses arm_neon vldN/vstN.
+ public:
+  TargetInfoArm(TargetMachine *tm) : TargetInfo(tm) {}
+
+  ~TargetInfoArm() = default;
+
+  bool canOptimizeInterleavedGroup(const Instruction &val,
+                                   InterleavedOperation kind, int stride,
+                                   unsigned groupSize) const override;
+
+  bool optimizeInterleavedGroup(IRBuilder<> &builder, InterleavedOperation kind,
+                                ArrayRef<Value *> group,
+                                ArrayRef<Value *> masks, Value *baseAddress,
+                                int stride) const override;
+
+ private:  // Shared legality check that also reports the intrinsic to emit.
+  bool canOptimizeInterleavedGroupImpl(const Instruction &val,
+                                       InterleavedOperation kind, int stride,
+                                       unsigned groupSize,
+                                       unsigned &intrinsicID) const;
+};
+
+class TargetInfoAArch64 final : public TargetInfo {  // aarch64_neon ldN/stN.
+ public:
+  TargetInfoAArch64(TargetMachine *tm) : TargetInfo(tm) {}
+
+  ~TargetInfoAArch64() = default;
+
+  bool canOptimizeInterleavedGroup(const Instruction &val,
+                                   InterleavedOperation kind, int stride,
+                                   unsigned groupSize) const override;
+
+  bool optimizeInterleavedGroup(IRBuilder<> &builder, InterleavedOperation kind,
+                                ArrayRef<Value *> group,
+                                ArrayRef<Value *> masks, Value *baseAddress,
+                                int stride) const override;
+
+ private:  // Shared legality check that also reports the intrinsic to emit.
+  bool canOptimizeInterleavedGroupImpl(const Instruction &val,
+                                       InterleavedOperation kind, int stride,
+                                       unsigned groupSize,
+                                       unsigned &intrinsicID) const;
+};
+
+std::unique_ptr<TargetInfo> createTargetInfoArm(TargetMachine *tm) {
+  return std::make_unique<TargetInfoArm>(tm);  // Owned by the caller.
+}
+
+std::unique_ptr<TargetInfo> createTargetInfoAArch64(TargetMachine *tm) {
+  return std::make_unique<TargetInfoAArch64>(tm);  // Owned by the caller.
+}
+
+}  // namespace vecz
+
+bool TargetInfoArm::canOptimizeInterleavedGroup(const Instruction &val,
+                                                InterleavedOperation kind,
+                                                int stride,
+                                                unsigned groupSize) const {
+  unsigned IntrID;  // Set by the Impl; only the yes/no answer is used here.
+  return canOptimizeInterleavedGroupImpl(val, kind, stride, groupSize, IntrID);
+}
+
+bool TargetInfoArm::canOptimizeInterleavedGroupImpl(const Instruction &val,
+                                                    InterleavedOperation kind,
+                                                    int stride,
+                                                    unsigned groupSize,
+                                                    unsigned &IntrID) const {
+  // Checks whether this group can be lowered to a single NEON vldN/vstN and
+  // reports the matching intrinsic through IntrID. Masked kinds are rejected.
+  IntrID = Intrinsic::not_intrinsic;
+  Type *dataType = nullptr;
+  // Strides other than 2, 3 and 4 leave IntrID as not_intrinsic.
+  if (kind == eInterleavedStore) {
+    switch (stride) {
+      default:
+        break;
+      case 2:
+        IntrID = Intrinsic::arm_neon_vst2;
+        break;
+      case 3:
+        IntrID = Intrinsic::arm_neon_vst3;
+        break;
+      case 4:
+        IntrID = Intrinsic::arm_neon_vst4;
+        break;
+    }
+    dataType = val.getOperand(0)->getType();
+  } else if (kind == eInterleavedLoad) {
+    switch (stride) {
+      default:
+        break;
+      case 2:
+        IntrID = Intrinsic::arm_neon_vld2;
+        break;
+      case 3:
+        IntrID = Intrinsic::arm_neon_vld3;
+        break;
+      case 4:
+        IntrID = Intrinsic::arm_neon_vld4;
+        break;
+    }
+    dataType = val.getType();
+  } else {
+    return false;
+  }
+  // vldN/vstN handle exactly N vectors, so the group must hold N operations.
+  if (IntrID == Intrinsic::not_intrinsic || groupSize != (unsigned)stride) {
+    return false;
+  }
+
+  auto *VecTy = dataType ? dyn_cast<FixedVectorType>(dataType) : nullptr;
+  if (!VecTy) {
+    return false;
+  }
+
+  // Only 64-bit and 128-bit vectors are accepted.
+  unsigned VecBits = VecTy->getPrimitiveSizeInBits();
+  if ((VecBits != 128) && (VecBits != 64)) {
+    return false;
+  }
+
+  // NEON interleave instructions only allow 8, 16, and 32 bit elements
+  unsigned ElementSize = VecTy->getScalarSizeInBits();
+  if ((ElementSize != 32) && (ElementSize != 16) && (ElementSize != 8)) {
+    return false;
+  }
+
+  return true;
+}
+
+bool TargetInfoArm::optimizeInterleavedGroup(IRBuilder<> &B,
+                                             InterleavedOperation kind,
+                                             ArrayRef<Value *> group,
+                                             ArrayRef<Value *>, Value *address,
+                                             int stride) const {
+  bool HasMask =
+      (kind == eMaskedInterleavedLoad) || (kind == eMaskedInterleavedStore);
+  // canOptimizeInterleavedGroup() should have returned false in this case.
+  // ARM does not have masked vector load or store instructions.
+  VECZ_FAIL_IF(HasMask);
+  VECZ_FAIL_IF(stride < 0);
+
+  // TODO CA-3100 fetch information on SubTargetInfo
+  // Load instructions seem to be easily split in the backend whereas stores
+  // generate a backend error because of invalid data type on vector operands.
+  // Vector operands are enabled in the backend only when SubTargetInfo ensures
+  // NEON instructions are supported.
+  const bool subTargetHasNeon = false;  // So stores are never optimized.
+  if (!subTargetHasNeon && kind == eInterleavedStore) {
+    return false;
+  }
+
+  // Validate the operations in the group.
+  SmallVector<CallInst *, 4> Calls;
+  for (unsigned i = 0; i < group.size(); i++) {
+    CallInst *Op = dyn_cast<CallInst>(group[i]);
+    if (!Op) {
+      return false;
+    }
+    Calls.push_back(Op);
+  }
+
+  PointerType *PtrTy = dyn_cast<PointerType>(address->getType());
+  if (!PtrTy) {
+    return false;
+  }
+
+  CallInst *Op0 = Calls[0];
+  // Determine the intrinsic to emit for this group.
+  unsigned IntrID = Intrinsic::not_intrinsic;
+  if (!canOptimizeInterleavedGroupImpl(*Op0, kind, stride, group.size(),
+                                       IntrID)) {
+    return false;
+  }
+
+  // canOptimizeInterleavedGroup() performs several checks, including valid
+  // Kind and Op0 types. Thus, these casts are safe.
+  FixedVectorType *VecTy = nullptr;
+  if (kind == eInterleavedStore) {
+    VecTy = cast<FixedVectorType>(Op0->getOperand(0)->getType());
+  } else {  // eInterleavedLoad
+    VecTy = cast<FixedVectorType>(Op0->getType());
+  }
+
+  Type *EleTy = VecTy->getElementType();
+  unsigned Alignment = (EleTy->getPrimitiveSizeInBits() / 8);
+
+  // Declare the intrinsic if needed.
+  SmallVector<Type *, 2> Tys;
+  if (kind == eInterleavedStore) {
+    Tys = {PtrTy, VecTy};
+  } else if (kind == eInterleavedLoad) {
+    Tys = {VecTy, PtrTy};
+  }
+
+  Function *IntrFn =
+      Intrinsic::getDeclaration(Op0->getModule(), (Intrinsic::ID)IntrID, Tys);
+  if (!IntrFn) {
+    return false;
+  }
+
+  // Create a NEON load or store to replace the interleaved calls.
+  SmallVector<Value *, 8> Ops;
+  Ops.push_back(address);
+  if (kind == eInterleavedStore) {
+    for (unsigned i = 0; i < group.size(); i++) {
+      CallInst *Op = Calls[i];
+      Ops.push_back(Op->getOperand(0));
+    }
+  }
+  Ops.push_back(B.getInt32(Alignment));
+  CallInst *CI = B.CreateCall(IntrFn, Ops, Op0->getName());
+  CI->setCallingConv(IntrFn->getCallingConv());
+  if (kind == eInterleavedLoad) {  // One result field per original load.
+    for (unsigned i = 0; i < Calls.size(); i++) {
+      CallInst *Op = Calls[i];
+      ArrayRef<unsigned> Indices(&i, 1);
+      Value *Extract = B.CreateExtractValue(CI, Indices);
+      Op->replaceAllUsesWith(Extract);
+    }
+  }
+  return true;
+}
+
+bool TargetInfoAArch64::canOptimizeInterleavedGroup(const Instruction &val,
+                                                    InterleavedOperation kind,
+                                                    int stride,
+                                                    unsigned groupSize) const {
+  unsigned IntrID;  // Set by the Impl; only the yes/no answer is used here.
+  return canOptimizeInterleavedGroupImpl(val, kind, stride, groupSize, IntrID);
+}
+
+bool TargetInfoAArch64::canOptimizeInterleavedGroupImpl(
+    const Instruction &val, InterleavedOperation kind, int stride,
+    unsigned groupSize, unsigned &IntrID) const {
+  // Checks whether this group can be lowered to a single NEON ldN/stN and
+  // reports the matching intrinsic through IntrID. Masked kinds are rejected.
+  IntrID = Intrinsic::not_intrinsic;
+  Type *dataType = nullptr;
+  // Strides other than 2, 3 and 4 leave IntrID as not_intrinsic.
+  if (kind == eInterleavedStore) {
+    switch (stride) {
+      default:
+        break;
+      case 2:
+        IntrID = Intrinsic::aarch64_neon_st2;
+        break;
+      case 3:
+        IntrID = Intrinsic::aarch64_neon_st3;
+        break;
+      case 4:
+        IntrID = Intrinsic::aarch64_neon_st4;
+        break;
+    }
+    dataType = val.getOperand(0)->getType();
+  } else if (kind == eInterleavedLoad) {
+    switch (stride) {
+      default:
+        break;
+      case 2:
+        IntrID = Intrinsic::aarch64_neon_ld2;
+        break;
+      case 3:
+        IntrID = Intrinsic::aarch64_neon_ld3;
+        break;
+      case 4:
+        IntrID = Intrinsic::aarch64_neon_ld4;
+        break;
+    }
+    dataType = val.getType();
+  } else {
+    return false;
+  }
+  // ldN/stN handle exactly N vectors, so the group must hold N operations.
+  if (IntrID == Intrinsic::not_intrinsic || groupSize != (unsigned)stride) {
+    return false;
+  }
+
+  auto *VecTy = dataType ? dyn_cast<FixedVectorType>(dataType) : nullptr;
+  if (!VecTy) {
+    return false;
+  }
+
+  // Only 64-bit and 128-bit vectors are accepted.
+  unsigned VecBits = VecTy->getPrimitiveSizeInBits();
+  if ((VecBits != 128) && (VecBits != 64)) {
+    return false;
+  }
+
+  // NEON interleave instructions only allow 8, 16, and 32 bit elements
+  unsigned ElementSize = VecTy->getScalarSizeInBits();
+  if ((ElementSize != 32) && (ElementSize != 16) && (ElementSize != 8)) {
+    return false;
+  }
+
+  return true;
+}
+
+// Replaces a group of interleaved memory operations with one NEON structured
+// load/store call, returning false if the group cannot be handled.
+bool TargetInfoAArch64::optimizeInterleavedGroup(
+    IRBuilder<> &B, InterleavedOperation kind, ArrayRef<Value *> group,
+    ArrayRef<Value *>, Value *address, int stride) const {
+  bool HasMask =
+      (kind == eMaskedInterleavedLoad) || (kind == eMaskedInterleavedStore);
+  // canOptimizeInterleavedGroup() should have returned false in this case.
+  // AArch64 does not have masked vector load or store instructions.
+  VECZ_FAIL_IF(HasMask);
+  VECZ_FAIL_IF(stride < 0);
+
+  // TODO CA-3100 fetch information on SubTargetInfo
+  // load instructions seems to be easily split in the backend whereas stores
+  // generate a backend error because of invalid data type on vector operands.
+  // Vector operands are enabled in the backend only when SubTargetInfo ensures
+  // NEON instructions are supported.
+  const bool subTargetHasNeon = false;
+  if (!subTargetHasNeon && kind == eInterleavedStore) {
+    return false;
+  }
+
+  // Validate the operations in the group: it must be non-empty (Calls[0] is
+  // inspected below) and every member must be a call.
+  if (group.empty()) {
+    return false;
+  }
+  SmallVector<CallInst *, 4> Calls;
+  for (Value *V : group) {
+    auto *Op = dyn_cast<CallInst>(V);
+    if (!Op) {
+      return false;
+    }
+    Calls.push_back(Op);
+  }
+
+  PointerType *PtrTy = dyn_cast<PointerType>(address->getType());
+  if (!PtrTy) {
+    return false;
+  }
+
+  CallInst *Op0 = Calls[0];
+  // Determine the intrinsic to emit for this group.
+  unsigned IntrID = Intrinsic::not_intrinsic;
+  if (!canOptimizeInterleavedGroupImpl(*Op0, kind, stride, group.size(),
+                                       IntrID)) {
+    return false;
+  }
+
+  // canOptimizeInterleavedGroup() performs several checks, including valid
+  // Kind and Op0 types. Thus, this cast is safe.
+  Type *const DataTy = (kind == eInterleavedStore)
+                           ? Op0->getOperand(0)->getType()  // stored value
+                           : Op0->getType();                // loaded value
+  auto *const VecTy = cast<FixedVectorType>(DataTy);
+
+  Function *IntrFn = Intrinsic::getDeclaration(
+      Op0->getModule(), (Intrinsic::ID)IntrID, {VecTy, PtrTy});
+  if (!IntrFn) {
+    return false;
+  }
+
+  // Create a NEON load or store to replace the interleaved calls.
+  SmallVector<Value *, 8> Ops;
+  if (kind == eInterleavedStore) {
+    for (CallInst *Op : Calls) {
+      Ops.push_back(Op->getOperand(0));
+    }
+  }
+  Ops.push_back(address);
+  CallInst *CI = B.CreateCall(IntrFn, Ops, Op0->getName());
+  CI->setCallingConv(IntrFn->getCallingConv());
+  if (kind == eInterleavedLoad) {
+    for (unsigned i = 0; i < Calls.size(); i++) {
+      Calls[i]->replaceAllUsesWith(B.CreateExtractValue(CI, i));
+    }
+  }
+  return true;
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp
new file mode 100644
index 0000000000000..a9a58cad5326c
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp
@@ -0,0 +1,752 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <llvm/IR/Instructions.h>
+#include <llvm/IR/IntrinsicsRISCV.h>
+#include <llvm/Support/MathExtras.h>
+#include <llvm/Target/TargetMachine.h>
+#include <multi_llvm/creation_apis_helper.h>
+#include <multi_llvm/multi_llvm.h>
+#include <multi_llvm/vector_type_helper.h>
+
+#include "transform/packetization_helpers.h"
+#include "vecz/vecz_target_info.h"
+
+using namespace vecz;
+using namespace llvm;
+
+namespace vecz {
+
+class TargetInfoRISCV final : public TargetInfo {
+ public:
+  TargetInfoRISCV(TargetMachine *tm) : TargetInfo(tm) {}
+
+  ~TargetInfoRISCV() = default;
+
+  bool canPacketize(const llvm::Value *Val, ElementCount Width) const override;
+
+  // These functions should only be overridden in LLVM >= 13.
+  llvm::Value *createScalableExtractElement(
+      llvm::IRBuilder<> &B, vecz::VectorizationContext &Ctx,
+      llvm::Instruction *extract, llvm::Type *narrowTy, llvm::Value *src,
+      llvm::Value *index, llvm::Value *evl) const override;
+
+  llvm::Value *createOuterScalableBroadcast(
+      llvm::IRBuilder<> &builder, llvm::Value *vector, llvm::Value *VL,
+      ElementCount factor) const override {
+    return createScalableBroadcast(builder, vector, VL, factor,
+                                   /* URem */ true);
+  }
+
+  llvm::Value *createInnerScalableBroadcast(
+      llvm::IRBuilder<> &builder, llvm::Value *vector, llvm::Value *VL,
+      ElementCount factor) const override {
+    return createScalableBroadcast(builder, vector, VL, factor,
+                                   /* URem */ false);
+  }
+
+  llvm::Value *createScalableInsertElement(llvm::IRBuilder<> &builder,
+                                           vecz::VectorizationContext &Ctx,
+                                           llvm::Instruction *insert,
+                                           llvm::Value *elt, llvm::Value *into,
+                                           llvm::Value *index,
+                                           llvm::Value *evl) const override;
+  bool isVPVectorLegal(const llvm::Function &F, llvm::Type *Ty) const override;
+
+  llvm::Value *createVectorShuffle(llvm::IRBuilder<> &builder, llvm::Value *src,
+                                   llvm::Value *mask,
+                                   llvm::Value *evl) const override;
+
+  llvm::Value *createVectorSlideUp(llvm::IRBuilder<> &builder, llvm::Value *src,
+                                   llvm::Value *insert,
+                                   llvm::Value *evl) const override;
+
+ private:
+  bool isOperationLegal(llvm::Intrinsic::ID ID,
+                        llvm::ArrayRef<llvm::Type *> Tys) const;
+
+  /// @brief Maximum vector type size in bits for VP intrinsics.
+  static constexpr unsigned MaxLegalVectorTypeBits = 8 * 64;
+
+  /// @return Whether @a Ty has a legal VP element type, a power-of-2 minimum
+  /// element count, and a minimum size of at most MaxLegalVectorTypeBits bits.
+  bool isVectorTypeLegal(llvm::Type *Ty) const;
+
+  llvm::Value *createScalableBroadcast(llvm::IRBuilder<> &builder,
+                                       llvm::Value *vector, llvm::Value *VL,
+                                       ElementCount factor, bool URem) const;
+
+  Value *createVPKernelWidth(IRBuilder<> &, Value *, unsigned,
+                             ElementCount) const override;
+};
+
+// VP legality is decided purely from the vector type; the function argument
+// is not consulted. (LLVM 14 introduced vp intrinsics legalization.)
+bool TargetInfoRISCV::isVPVectorLegal(const llvm::Function & /*F*/,
+                                      llvm::Type *Ty) const {
+  return isVectorTypeLegal(Ty);
+}
+
+// Should be target-dependent. Take RISCV legal types for now.
+// FIXME: LLVM 14 adds better support for legalization of vp intrinsics, but
+// not RISCV ones like vrgather_vv. See CA-4071.
+bool TargetInfoRISCV::isVectorTypeLegal(Type *Ty) const {
+  assert(Ty->isVectorTy() && "Expecting a vector type.");
+  // FIXME: VP boolean logical operators (and,or,xor) are not matched in the
+  // LLVM 13 RVV backend: we must backport https://reviews.llvm.org/D115546
+  // before we can enable this for Int1Ty as well.
+  if (!isLegalVPElementType(multi_llvm::getVectorElementType(Ty))) {
+    return false;
+  }
+  // For scalable vectors only the known-minimum element count is inspected:
+  // it must be a power of two and the resulting minimum size must fit.
+  uint32_t const MinElts =
+      multi_llvm::getVectorElementCount(Ty).getKnownMinValue();
+  return isPowerOf2_32(MinElts) &&
+         MinElts * Ty->getScalarSizeInBits() <= MaxLegalVectorTypeBits;
+}
+
+std::unique_ptr<TargetInfo> createTargetInfoRISCV(TargetMachine *tm) {
+  return std::make_unique<TargetInfoRISCV>(tm);  // upcast to TargetInfo
+}
+
+}  // namespace vecz
+
+bool TargetInfoRISCV::canPacketize(const llvm::Value *Val,
+                                   ElementCount Width) const {
+  // Fixed-width vectorization is always allowed: the backend is assumed to
+  // legalize whatever we produce.
+  if (!Width.isScalable()) {
+    return true;
+  }
+
+  // Only instructions are inspected; any other value is accepted as-is.
+  const auto *const Inst = llvm::dyn_cast<llvm::Instruction>(Val);
+  if (!Inst) {
+    return true;
+  }
+
+  // Integers (or vectors of integers) wider than 64 bits can't be legalized
+  // once turned into scalable vectors.
+  const auto HasOversizedInt = [](const llvm::Type *Ty) {
+    return Ty->isIntOrIntVectorTy() &&
+           llvm::cast<IntegerType>(Ty->getScalarType())->getBitWidth() > 64;
+  };
+
+  // Reject the instruction if its result or any operand has such a type.
+  if (HasOversizedInt(Inst->getType())) {
+    return false;
+  }
+  for (const auto *Operand : Inst->operand_values()) {
+    if (HasOversizedInt(Operand->getType())) {
+      return false;
+    }
+  }
+  return true;
+}
+
+/// @return Whether RISCV intrinsic @a ID is legal for types @a Tys.
+///
+/// Argument types of the eventual call are not validated here: only the
+/// types in @a Tys are tested with isVectorTypeLegal(), plus an element-count
+/// limit for the ei16 variants.
+///
+/// @param[in] ID The intrinsic ID
+/// @param[in] Tys A subset of the overloaded types of the intrinsic required to
+/// check whether it's legal.
+bool TargetInfoRISCV::isOperationLegal(llvm::Intrinsic::ID ID,
+                                       llvm::ArrayRef<llvm::Type *> Tys) const {
+  const bool IsGather =
+      ID == Intrinsic::RISCVIntrinsics::riscv_vrgather_vv ||
+      ID == Intrinsic::RISCVIntrinsics::riscv_vrgather_vv_mask;
+  if (IsGather) {
+    // riscv_vrgather_vv[_mask](RetTy, _IdxTy)
+    // The return type is at least as large as the index type, so it is the
+    // only one that needs checking.
+    assert(Tys.size() == 1 &&
+           "Only the return type is needed to check vrgather_vv intrinsics");
+    return isVectorTypeLegal(Tys.front());
+  }
+
+  const bool IsGatherEI16 =
+      ID == Intrinsic::RISCVIntrinsics::riscv_vrgatherei16_vv ||
+      ID == Intrinsic::RISCVIntrinsics::riscv_vrgatherei16_vv_mask;
+  if (IsGatherEI16) {
+    // riscv_vrgatherei16_vv[_mask](RetTy, _IdxTy)
+    // As above, but the index elements are always i16, so the element count
+    // must additionally be no greater than MaxLegalVectorTypeBits / 16.
+    assert(
+        Tys.size() == 1 &&
+        "Only the return type is needed to check vrgatherei16_vv intrinsics");
+    constexpr unsigned MaxVectorSize = MaxLegalVectorTypeBits / 16;
+    auto *const RetTy = Tys.front();
+    return isVectorTypeLegal(RetTy) &&
+           multi_llvm::getVectorElementCount(RetTy).getKnownMinValue() <=
+               MaxVectorSize;
+  }
+  llvm_unreachable("Don't know how to check whether this intrinsic is legal.");
+}
+
+namespace {
+static unsigned getRISCVBits(const TargetMachine *TM) {
+  // 32 for 32-bit target triples; every other triple is treated as 64-bit.
+  return TM->getTargetTriple().isArch32Bit() ? 32u : 64u;
+}
+
+/// @brief Get VL to be used as a parameter of a RISCV intrinsic.
+///
+/// The type of this value will depend on the architecture (RISCV32 or
+/// RISCV64).
+///
+/// @return The VL value, suitably typed for the target architecture.
+///
+/// @param[in] B Builder to use when creating the VL value.
+/// @param[in] VL Original VL. If non-nullptr, this value (zero-extended for
+/// RISCV64) will be returned.
+/// @param[in] wideTy Type of the vectors which will be used in the intrinsics.
+/// If no VL is provided and `<vscale x N x Ty>` is used here, `<call
+/// llvm.vscale> * N` will be returned.
+/// @param[in] TM Target machine.
+/// @param[in] N name of the instruction to generate. "xlen" by default.
+llvm::Value *getIntrinsicVL(llvm::IRBuilderBase &B, llvm::Value *VL,
+                            llvm::Type *wideTy, llvm::TargetMachine *TM,
+                            const Twine &N = "xlen") {
+  const unsigned XLenBits = getRISCVBits(TM);
+  if (!VL) {
+    // No explicit VL: create a 'default' one which covers the entire scalable
+    // vector, i.e. vscale times the known-minimum element count.
+    const auto MinElts =
+        cast<VectorType>(wideTy)->getElementCount().getKnownMinValue();
+    return B.CreateVScale(B.getIntN(XLenBits, MinElts), N);
+  }
+  // Our incoming VP VL type is always i32, so it is used as-is on RISCV32 and
+  // zero-extended to 64 bits otherwise.
+  if (XLenBits == 32) {
+    return VL;
+  }
+  return B.CreateZExt(VL, B.getIntNTy(XLenBits), N);
+}
+
+/// @brief Selects the `vrgather` intrinsic variant for a given source vector.
+///
+/// @return A pair of the intrinsic to use and the bitwidth of the elements of
+/// its `vs1` (index) parameter.
+///
+/// @param[in] vs2Ty Type of the source vector.
+/// @param[in] isMasked Whether the intrinsic should be masked.
+std::pair<llvm::Intrinsic::RISCVIntrinsics, unsigned> getGatherIntrinsic(
+    llvm::Type *vs2Ty, bool isMasked = false) {
+  assert(!vs2Ty->isPtrOrPtrVectorTy() &&
+         "Cannot get gather intrinsic for a vector of pointers");
+
+  auto *const eltTy = multi_llvm::getVectorElementType(vs2Ty);
+
+  // 8-bit integer elements use the variant with 16-bit indices.
+  if (eltTy->isIntegerTy() && eltTy->getIntegerBitWidth() == 8) {
+    return {isMasked ? Intrinsic::RISCVIntrinsics::riscv_vrgatherei16_vv_mask
+                     : Intrinsic::RISCVIntrinsics::riscv_vrgatherei16_vv,
+            16u};
+  }
+
+  // Otherwise the indices are as wide as the data elements.
+  return {isMasked ? Intrinsic::RISCVIntrinsics::riscv_vrgather_vv_mask
+                   : Intrinsic::RISCVIntrinsics::riscv_vrgather_vv,
+          eltTy->getScalarSizeInBits()};
+}
+
+/// @brief Returns the `v?slide1up.v?` intrinsic variation to use.
+///
+/// Floating-point element types select `vfslide1up`; all others `vslide1up`.
+///
+/// @param[in] vs2Ty Type of the source vector.
+llvm::Intrinsic::RISCVIntrinsics getSlideUpIntrinsic(llvm::Type *vs2Ty) {
+  assert(!vs2Ty->isPtrOrPtrVectorTy() &&
+         "Cannot get slide-up intrinsic for a vector of pointers");
+
+  // The slide1up intrinsics come in an integer and a floating-point flavour;
+  // pick the one matching the element type.
+  auto *const eltTy = multi_llvm::getVectorElementType(vs2Ty);
+  return eltTy->isFloatingPointTy()
+             ? Intrinsic::RISCVIntrinsics::riscv_vfslide1up
+             : Intrinsic::RISCVIntrinsics::riscv_vslide1up;
+}
+
+}  // namespace
+
+llvm::Value *TargetInfoRISCV::createScalableExtractElement(
+    llvm::IRBuilder<> &B, vecz::VectorizationContext &Ctx,
+    llvm::Instruction *origExtract, llvm::Type *narrowTy, llvm::Value *src,
+    llvm::Value *index, llvm::Value *VL) const {
+  // In RISCV, we can use vrgather_vv and vrgatherei16_vv to avoid going through
+  // memory when creating this operation.
+  //   vrgather: vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]];
+  // or,
+  //   vrgather: res[i] = (idxs[i] >= VLMAX) ? 0 : src[idxs[i]];
+  // An example: extractelement <A,B,C,D>, I - vectorized by <vscale x 1> - we
+  // receive here as packetized arguments:
+  //   src:  <A,B,C,D, E,F,G,H,    ...> (  <vscale x 4 x ty>    )
+  //   idxs: <I,       J,       K, ...> (  <vscale x 1 x idxty> )
+  // We want to construct operands such that we have:
+  //   srcs: as before
+  //   idxs: <I+0,J+4,K+8,...>           (  <vscale x 4 x idxty> )
+  // So that vrgather extracts the Ith element from the first 4 elements, the
+  // Jth element from the second 4, etc.
+  auto *srcTy = cast<ScalableVectorType>(src->getType());
+
+  Intrinsic::ID intrinsicID;
+  unsigned intrIdxBitWidth;
+  std::tie(intrinsicID, intrIdxBitWidth) = getGatherIntrinsic(srcTy);
+
+  auto const srcEC = multi_llvm::getVectorElementCount(srcTy);
+  auto const resEC = multi_llvm::getVectorElementCount(narrowTy);
+
+  auto *const indexEltTy = B.getIntNTy(intrIdxBitWidth);
+  Type *const indexVecTy = VectorType::get(indexEltTy, resEC);
+
+  // We cannot use this optimization if the types are not legal in the target
+  // machine.
+  if (!isOperationLegal(intrinsicID, {srcTy})) {
+    return TargetInfo::createScalableExtractElement(B, Ctx, origExtract,
+                                                    narrowTy, src, index, VL);
+  }
+
+  auto *const avl = getIntrinsicVL(B, VL, narrowTy, getTargetMachine());
+
+  auto *indexTy = index->getType();
+  bool const isIdxVector = indexTy->isVectorTy();
+  unsigned const idxBitWidth = indexTy->getScalarSizeInBits();
+
+  // The intrinsic may demand a different index width than we currently have;
+  // zero-extend or truncate to the right type.
+  if (idxBitWidth != intrIdxBitWidth) {
+    index = B.CreateZExtOrTrunc(index, isIdxVector ? indexVecTy : indexEltTy);
+  }
+
+  // If the index is uniform, it may not be a vector. We need one for the
+  // intrinsic, so splat it here.
+  if (!isIdxVector) {
+    index = B.CreateVectorSplat(resEC, index);
+  }
+
+  // Construct the indices such that each packetized index (still indexing into
+  // the original vector of 4 elements) is spread out such that each index
+  // indexes into its own 4-element slice: e.g., <I+0, J+4, K+8, ...>.
+  auto *indices = getGatherIndicesVector(
+      B, index, indexVecTy,
+      multi_llvm::getVectorNumElements(origExtract->getOperand(0)->getType()),
+      "vs1");
+
+  auto *const zero = B.getInt64(0);
+
+  // Our indices are still in the narrower vectorized type (e.g., <vscale x 1 x
+  // idxTy>), but the vrgather intrinsics need equally-sized vector types. So
+  // insert the indices into a wide dummy vector (e.g., <vscale x 4 x idxTy>),
+  // perform the vrgather, and extract the subvector back out again.
+  auto *const intrIndexTy = VectorType::get(indexEltTy, srcEC);
+  indices = B.CreateInsertVector(intrIndexTy, PoisonValue::get(intrIndexTy),
+                                 indices, zero);
+
+  SmallVector<Value *, 4> ops;
+#if LLVM_VERSION_GREATER_EQUAL(15, 0)
+  // LLVM 15+ has a pass-through operand - we set it to undef.
+  ops.push_back(UndefValue::get(srcTy));
+#endif
+  ops.push_back(src);
+  ops.push_back(indices);
+  ops.push_back(avl);
+
+  auto *const gather =
+      B.CreateIntrinsic(intrinsicID, {srcTy, avl->getType()}, ops);
+
+  return B.CreateExtractVector(narrowTy, gather, zero);
+}
+
+llvm::Value *TargetInfoRISCV::createScalableBroadcast(llvm::IRBuilder<> &B,
+                                                      llvm::Value *vector,
+                                                      llvm::Value *VL,
+                                                      ElementCount factor,
+                                                      bool URem) const {
+  // Using rvv instruction:
+  // vrgather.vv vd, vs2, vs1, vm s.t.
+  // vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]
+
+  auto *vectorTy = vector->getType();
+  auto *const origElTy = multi_llvm::getVectorElementType(vectorTy);
+
+  // The intrinsic does not support vectors of pointers: if we are broadcasting
+  // one, operate on a vector of XLEN-sized integers instead.
+  const bool isVectorOfPointers = origElTy->isPtrOrPtrVectorTy();
+  if (isVectorOfPointers) {
+    vectorTy = VectorType::get(B.getIntNTy(getRISCVBits(getTargetMachine())),
+                               multi_llvm::getVectorElementCount(vectorTy));
+  }
+
+  auto *const wideTy = ScalableVectorType::get(
+      multi_llvm::getVectorElementType(vectorTy),
+      factor.getKnownMinValue() *
+          multi_llvm::getVectorElementCount(vectorTy).getKnownMinValue());
+
+  Intrinsic::RISCVIntrinsics intrinsicID;
+  unsigned vs1Width;
+  std::tie(intrinsicID, vs1Width) = getGatherIntrinsic(wideTy);
+  auto *const vs1ElTy = B.getIntNTy(vs1Width);
+
+  // We cannot use this optimization if the types are not legal in the target
+  // machine.
+  if (!isOperationLegal(intrinsicID, {wideTy})) {
+    return URem
+               ? TargetInfo::createOuterScalableBroadcast(B, vector, VL, factor)
+               : TargetInfo::createInnerScalableBroadcast(B, vector, VL,
+                                                          factor);
+  }
+
+  // Cast the vector of pointers to a vector of integers if needed.
+  if (isVectorOfPointers) {
+    vector = B.CreatePtrToInt(vector, vectorTy);
+  }
+
+  // We grow the fixed vector to consume an entire RVV register.
+  auto *const vs2 = B.CreateInsertVector(wideTy, PoisonValue::get(wideTy),
+                                         vector, B.getInt64(0), "vs2");
+
+  auto *const vs1 = createBroadcastIndexVector(
+      B, VectorType::get(vs1ElTy, wideTy), factor, URem, "vs1");
+
+  auto *const avl = getIntrinsicVL(B, VL, wideTy, getTargetMachine());
+
+  SmallVector<Value *, 4> ops;
+#if LLVM_VERSION_GREATER_EQUAL(15, 0)
+  // LLVM 15+ has a pass-through operand - we set it to undef.
+  ops.push_back(UndefValue::get(vs2->getType()));
+#endif
+  ops.push_back(vs2);
+  ops.push_back(vs1);
+  ops.push_back(avl);
+
+  Value *gather =
+      B.CreateIntrinsic(intrinsicID, {vs2->getType(), avl->getType()}, ops);
+
+  // If we had to cast the vector before, we do the reverse operation
+  // on the result.
+  if (isVectorOfPointers) {
+    gather = B.CreateIntToPtr(gather, VectorType::get(origElTy, wideTy));
+  }
+
+  return gather;
+}
+
+llvm::Value *TargetInfoRISCV::createScalableInsertElement(
+    llvm::IRBuilder<> &B, vecz::VectorizationContext &Ctx,
+    llvm::Instruction *origInsert, llvm::Value *elt, llvm::Value *into,
+    llvm::Value *index, llvm::Value *VL) const {
+  // In RISCV, we can use vrgather_vv and vrgatherei16_vv to avoid going through
+  // memory when creating this operation.
+  //   vrgather: vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]];
+  // or,
+  //   vrgather: res[i] = (idxs[i] >= VLMAX) ? 0 : src[idxs[i]];
+  // An example: insertelement <A,B,C,D>, X, I - vectorized by <vscale x 1> -
+  // we receive here as packetized arguments:
+  //   into: <A,B,C,D, E,F,G,H,    ...> (  <vscale x 4 x ty>    )
+  //   elt:  <X,       Y,       Z, ...> (  <vscale x 1 x ty>    )
+  //   idxs: <I,       J,       K, ...> (  <vscale x 1 x idxty> )
+  // We want to construct operands such that we have:
+  //   into: as before (also the merge operand of the masked vrgather)
+  //   elt:  <X,Y,Z, ... > inserted into a wide vector ( <vscale x 4 x ty> )
+  //   idxs: <0,0,0,0, 1,1,1,1, 2,2,2,2, ... >    ( <vscale x 4 x idxty> )
+  //   mask: true where the elts are to be inserted according to the insert
+  //         indices, e.g., for I=1, J=3, K=0:
+  //         <0,1,0,0, 0,0,0,1, 1,0,0,0, ... >    ( <vscale x 4 x i1>    )
+  // So that an unmasked vrgather would splat each elt across its 4 elements:
+  //   res:  <X,X,X,X, Y,Y,Y,Y, Z,Z,Z,Z, ... >
+  // If instead we use a masked vrgather with the mask above and with a merge
+  // operand of 'into', we expect only the masked-on elements to be replaced,
+  // giving the blended result:
+  //   res:  <A,X,C,D, E,F,G,Y, Z,I,J,K, ... >
+  auto *const eltTy = elt->getType();
+  auto *const intoTy = into->getType();
+
+  Intrinsic::ID intrinsicID;
+  unsigned intrIdxBitWidth;
+  std::tie(intrinsicID, intrIdxBitWidth) =
+      getGatherIntrinsic(intoTy, /*isMasked*/ true);
+
+  auto const eltEC = multi_llvm::getVectorElementCount(eltTy);
+  auto const intoEC = multi_llvm::getVectorElementCount(intoTy);
+  auto const fixedAmt =
+      multi_llvm::getVectorElementCount(origInsert->getType());
+  assert(!fixedAmt.isScalable() && "Scalable pre-packetized value?");
+
+  auto *indexEltTy = B.getIntNTy(intrIdxBitWidth);
+  Type *const indexVecTy = VectorType::get(indexEltTy, eltEC);
+
+  // We cannot use this optimization if the types are not legal in the target
+  // machine.
+  if (!isOperationLegal(intrinsicID, {intoTy})) {
+    return TargetInfo::createScalableInsertElement(B, Ctx, origInsert, elt,
+                                                   into, index, VL);
+  }
+
+  auto *const avl = getIntrinsicVL(B, VL, intoTy, getTargetMachine());
+
+  auto *const indexTy = index->getType();
+  unsigned const idxBitWidth = indexTy->getScalarSizeInBits();
+  bool const indexIsVector = indexTy->isVectorTy();
+
+  // The intrinsic may demand a different index width than we currently have;
+  // zero-extend or truncate to the right type.
+  if (idxBitWidth != intrIdxBitWidth) {
+    index = B.CreateZExtOrTrunc(index, indexIsVector ? indexVecTy : indexEltTy);
+  }
+
+  // If the index is uniform, it may not be a vector. We need one for the
+  // intrinsic, so splat it here.
+  if (!indexIsVector) {
+    index = B.CreateVectorSplat(intoEC, index);
+  } else {
+    index = createInnerScalableBroadcast(B, index, VL, fixedAmt);
+  }
+
+  auto *const zero = B.getInt64(0);
+
+  auto *const intrEltTy =
+      VectorType::get(multi_llvm::getVectorElementType(elt->getType()), intoEC);
+  elt = B.CreateInsertVector(intrEltTy, PoisonValue::get(intrEltTy), elt, zero,
+                             "vs2");
+
+  auto *steps = B.CreateStepVector(VectorType::get(indexEltTy, intoEC));
+
+  // Create our inner indices, e.g.: <0,1,2,3, 0,1,2,3, 0,1,2,3, ... >
+  auto *const innerIndices = B.CreateURem(
+      steps,
+      ConstantVector::getSplat(
+          intoEC, ConstantInt::get(indexEltTy, fixedAmt.getFixedValue())));
+
+  // Create our outer indices, e.g., <0,0,0,0,1,1,1,1,2,2,2,2,...>
+  auto *const outerIndices = B.CreateUDiv(
+      steps,
+      ConstantVector::getSplat(
+          intoEC, ConstantInt::get(indexEltTy, fixedAmt.getFixedValue())));
+
+  // Now compare the insert indices with the inner index vector: only one per
+  // N-element slice will be 'on', depending on the exact indices, e.g., if we
+  // originally have:
+  //    <1,3,0, ...>
+  // we have prepared it when constructing the indices:
+  //    <1,1,1,1, 3,3,3,3, 0,0,0,0, ...>
+  // == <0,1,2,3, 0,1,2,3, 0,1,2,3, ...>
+  // -> <0,1,0,0, 0,0,0,1, 1,0,0,0, ...>
+  auto *const mask = B.CreateICmpEQ(index, innerIndices, "vm");
+
+  return multi_llvm::createRISCVMaskedIntrinsic(
+      B, intrinsicID, {intoTy, avl->getType()},
+      {into, elt, outerIndices, mask, avl},
+      /*TailUndisturbed*/ 1);
+}
+
+llvm::Value *TargetInfoRISCV::createVectorShuffle(llvm::IRBuilder<> &B,
+                                                  llvm::Value *src,
+                                                  llvm::Value *mask,
+                                                  llvm::Value *VL) const {
+  // In RISCV, we can use vrgather_vv and vrgatherei16_vv to avoid going through
+  // memory when creating this operation.
+  assert(isa<VectorType>(src->getType()) &&
+         "TargetInfoRISCV::createVectorShuffle: source must have vector type");
+  assert(isa<VectorType>(mask->getType()) &&
+         "TargetInfoRISCV::createVectorShuffle: mask must have vector type");
+
+  auto *const srcTy = cast<VectorType>(src->getType());
+  if (isa<Constant>(mask)) {
+    // Special case if the mask happens to be a constant.
+    return B.CreateShuffleVector(src, UndefValue::get(srcTy), mask);
+  }
+
+  if (isa<FixedVectorType>(srcTy)) {
+    // The gather intrinsics don't work with fixed vectors.
+    return TargetInfo::createVectorShuffle(B, src, mask, VL);
+  }
+
+  auto *const maskTy = cast<VectorType>(mask->getType());
+  auto const srcEC = multi_llvm::getVectorElementCount(srcTy);
+  auto const resEC = multi_llvm::getVectorElementCount(maskTy);
+
+  auto *const resTy = VectorType::get(srcTy->getElementType(), resEC);
+
+  // We can't create the intrinsics with a scalar size smaller than 8 bits, so
+  // extend it to i8, perform the shuffle, and truncate the result back.
+  if (srcTy->getScalarSizeInBits() < 8) {
+    auto *const fix = B.CreateZExt(src, VectorType::get(B.getInt8Ty(), srcEC));
+    auto *const res = createVectorShuffle(B, fix, mask, VL);
+    return B.CreateTrunc(res, resTy);
+  }
+
+  Intrinsic::ID intrinsicID;
+  unsigned intrIdxBitWidth;
+  std::tie(intrinsicID, intrIdxBitWidth) = getGatherIntrinsic(srcTy);
+
+  auto *const indexEltTy = B.getIntNTy(intrIdxBitWidth);
+  auto *const indexVecTy = VectorType::get(indexEltTy, resEC);
+
+  // We cannot use this optimization if the types are not legal in the target
+  // machine. NOTE(review): checks srcTy; 'widen' gathers on resTy - confirm.
+  if (!isOperationLegal(intrinsicID, {srcTy})) {
+    return TargetInfo::createVectorShuffle(B, src, mask, VL);
+  }
+
+  // The intrinsic may demand a different index type than we currently have;
+  // zero-extend or truncate to the right type.
+  if (indexVecTy != maskTy) {
+    mask = B.CreateZExtOrTrunc(mask, indexVecTy);
+  }
+
+  auto *const zero = B.getInt64(0);
+
+  bool const same = (resEC == srcEC);
+  bool const narrow = !same && (srcEC.isScalable() || !resEC.isScalable()) &&
+                      resEC.getKnownMinValue() <= srcEC.getKnownMinValue();
+  bool const widen = !same && (resEC.isScalable() || !srcEC.isScalable()) &&
+                     srcEC.getKnownMinValue() <= resEC.getKnownMinValue();
+
+  assert((srcTy == resTy || narrow || widen) &&
+         "TargetInfoRISCV::createVectorShuffle: "
+         "unexpected combination of source and mask vector types");
+
+  auto *gatherTy = resTy;
+  if (narrow) {
+    // The vrgather intrinsics need equally-sized vector types. So
+    // insert the indices into a wide dummy vector (e.g., <vscale x 4 x idxTy>),
+    // perform the vrgather, and extract the subvector back out again.
+    auto *const wideMaskTy = VectorType::get(indexEltTy, srcEC);
+    mask = B.CreateInsertVector(wideMaskTy, PoisonValue::get(wideMaskTy), mask,
+                                zero);
+    gatherTy = srcTy;
+  } else if (widen) {
+    // The result is wider than the source, so insert the source vector into a
+    // wider vector first.
+    src = B.CreateInsertVector(resTy, PoisonValue::get(resTy), src, zero);
+  }
+
+  auto *const avl = getIntrinsicVL(B, VL, gatherTy, getTargetMachine());
+
+  SmallVector<Value *, 4> ops;
+#if LLVM_VERSION_GREATER_EQUAL(15, 0)
+  // LLVM 15+ has a pass-through operand - we set it to undef.
+  ops.push_back(UndefValue::get(gatherTy));
+#endif
+  ops.push_back(src);
+  ops.push_back(mask);
+  ops.push_back(avl);
+
+  auto *const gather =
+      B.CreateIntrinsic(intrinsicID, {gatherTy, avl->getType()}, ops);
+
+  if (narrow) {
+    return B.CreateExtractVector(resTy, gather, zero);
+  }
+  return gather;
+}
+
+// Emits a RISC-V v[f]slide1up intrinsic call for scalable source vectors;
+// fixed-length vectors are delegated to the generic TargetInfo implementation.
+llvm::Value *TargetInfoRISCV::createVectorSlideUp(llvm::IRBuilder<> &B,
+                                                  llvm::Value *src,
+                                                  llvm::Value *insert,
+                                                  llvm::Value *VL) const {
+  auto *const srcTy = dyn_cast<VectorType>(src->getType());
+  assert(srcTy &&
+         "TargetInfoRISCV::createVectorSlideUp: source must have vector type");
+
+  if (isa<FixedVectorType>(srcTy)) {
+    // The slide1up intrinsics don't work with fixed vectors.
+    return TargetInfo::createVectorSlideUp(B, src, insert, VL);
+  }
+
+  auto const intrinsicID = getSlideUpIntrinsic(srcTy);
+  auto *const avl = getIntrinsicVL(B, VL, srcTy, getTargetMachine());
+
+  SmallVector<Value *, 4> ops;
+#if LLVM_VERSION_GREATER_EQUAL(15, 0)
+  // LLVM 15+ has a pass-through operand - we set it to undef.
+  ops.push_back(UndefValue::get(srcTy));
+#endif
+  ops.push_back(src);
+  ops.push_back(insert);
+  ops.push_back(avl);
+  return B.CreateIntrinsic(intrinsicID,
+                           {srcTy, insert->getType(), avl->getType()}, ops);
+}
+
+// LMUL operand encodings for vsetvli, copy/pasted from the RISCV backend
+enum VLMUL : uint8_t {
+  LMUL_1 = 0,
+  LMUL_2,         // = 1
+  LMUL_4,         // = 2
+  LMUL_8,         // = 3
+  LMUL_RESERVED,  // = 4
+  LMUL_F8,        // = 5
+  LMUL_F4,        // = 6
+  LMUL_F2         // = 7
+};
+
+// Emits a vsetvli intrinsic call on RemainingIters, with SEW derived from
+// WidestEltTy and LMUL from WidestEltTy * VF, returning its result truncated
+// to i32. Returns nullptr if the combination has no supported encoding.
+Value *TargetInfoRISCV::createVPKernelWidth(IRBuilder<> &B,
+                                            Value *RemainingIters,
+                                            unsigned WidestEltTy,
+                                            ElementCount VF) const {
+  // The widest element type can only be one of the supported legal RVV vector
+  // element types.
+  if (WidestEltTy < 8 || WidestEltTy > 64 || !isPowerOf2_32(WidestEltTy)) {
+    return nullptr;
+  }
+  auto const KnownMin = VF.getKnownMinValue();
+  // The vectorization factor must be scalable and a legal vsetvli amount: no
+  // greater than the maximum vector length for each element width:
+  // nxv64i8,nxv32i16,nxv16i32,nxv8i64
+  if (!VF.isScalable() || !isPowerOf2_32(KnownMin) ||
+      KnownMin > MaxLegalVectorTypeBits / WidestEltTy) {
+    return nullptr;
+  }
+
+  unsigned LMUL = 0;
+  unsigned const MaxLegalElementWidth = 64;
+
+  if ((WidestEltTy * KnownMin) / MaxLegalElementWidth) {
+    // Non-fractional LMULs
+    LMUL = Log2_64((WidestEltTy * KnownMin) / MaxLegalElementWidth);
+  } else {
+    // Fractional LMULs
+    auto const Fraction = MaxLegalElementWidth / (WidestEltTy * KnownMin);
+    if (Fraction == 2) {
+      LMUL = LMUL_F2;
+    } else if (Fraction == 4) {
+      LMUL = LMUL_F4;
+    } else if (Fraction == 8) {
+      LMUL = LMUL_F8;
+    } else {
+      return nullptr;
+    }
+  }
+
+  auto *const VLMul = B.getInt64(LMUL);
+  auto *const VSEW = B.getInt64(Log2_64(WidestEltTy) - 3);
+
+  auto *const VL =
+      B.CreateIntrinsic(Intrinsic::RISCVIntrinsics::riscv_vsetvli_opt,
+                        {B.getInt64Ty()}, {RemainingIters, VSEW, VLMul});
+
+  return B.CreateTrunc(VL, B.getInt32Ty());
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_choices.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_choices.cpp
new file mode 100644
index 0000000000000..1c48d0d0ddfd2
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_choices.cpp
@@ -0,0 +1,172 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <compiler/utils/mangling.h>
+#include <llvm/ADT/StringRef.h>
+#include <llvm/ADT/StringSwitch.h>
+#include <llvm/ADT/Twine.h>
+#include <llvm/Support/raw_ostream.h>
+
+#include "vecz/vecz_choices.h"
+
+using namespace llvm;
+
+namespace {
+using namespace vecz;
+using Choices = VectorizationChoices;
+// Every Choice that may be named in a choices string: its textual name, the
+// matching enumerator and a human-readable description.
+static const Choices::ChoiceInfo choicesArray[] = {
+    {"PacketizeUniform", Choices::eOptimizationPacketizeUniform,
+     "Packetizes all packetizable instructions whether they are varying or "
+     "not."},
+
+    {"PacketizeUniformInLoops", Choices::eOptimizationPacketizeUniformInLoops,
+     "Packetizes all packetizable instructions in loops, whether they are "
+     "varying or not."},
+
+    {"InstantiateCallsInLoops", Choices::eOptimizationInstantiateCallsInLoops,
+     "Uses loops to instantiate call instructions, instead of duplication."},
+
+    {"LinearizeBOSCC", Choices::eLinearizeBOSCC,
+     "Control Flow Conversion uses Branch On Superword Condition Code."},
+
+    {"FullScalarization", Choices::eFullScalarization,
+     "The scalarization pass scalarizes everything it can, regardless of any "
+     "performance benefit."},
+
+    {"DivisionExceptions", Choices::eDivisionExceptions,
+     "Specify this when the target throws hardware exceptions on integer "
+     "division by zero."},
+
+    {"VectorPredication", Choices::eVectorPredication,
+     "Generate a vector-predicated kernel safe to run on any workgroup size, "
+     "even those smaller than the vectorization width"},
+
+    {"TargetIndependentPacketization", Choices::eTargetIndependentPacketization,
+     "Force target-independent packetization choices (e.g., for testing "
+     "purposes)"},
+};
+
+}  // namespace
+
+namespace vecz {
+
+VectorizationChoices::VectorizationChoices() = default;
+
+// Parse a list of Choice names from Str, separated by ':', ';' or ','. A "no"
+// prefix on a name disables that Choice instead of enabling it, for example
+// "PacketizeUniform;noLinearizeBOSCC". Choices are only applied if the whole
+// string parses; otherwise an error is printed and false is returned.
+bool VectorizationChoices::parseChoicesString(StringRef Str) {
+  // An empty string trivially succeeds.
+  if (Str.empty()) {
+    return true;
+  }
+
+  // first = Choice, second = whether to enable it
+  using PendingChoice = std::pair<Choice, bool>;
+  // The lexer implementation from the name mangling module is fairly generic,
+  // so it is reused here.
+  compiler::utils::Lexer L(Str);
+  // Multiple separators are supported in case of platform-dependent issues.
+  const StringRef Separators = ":;,";
+  // Parsed choices are buffered in this vector and only enabled/disabled
+  // after the entire string has been parsed successfully.
+  SmallVector<PendingChoice, 4> Pending;
+
+  // Lex and parse the string one Choice at a time.
+  do {
+    // Strip any leading whitespace; an exhausted string ends the list.
+    L.ConsumeWhitespace();
+    if (L.Left() == 0) {
+      break;
+    }
+
+    // The optional "no" prefix disables the Choice that follows it.
+    const bool Negated = L.Consume("no");
+    // Read the Choice name itself.
+    StringRef Name;
+    if (!L.ConsumeAlphanumeric(Name)) {
+      printChoicesParseError(Str, L.CurrentPos(), "Expected Choice");
+      return false;
+    }
+
+    // Convert the name to a Choice value.
+    const Choice Parsed = fromString(Name);
+    if (Parsed == eInvalid) {
+      printChoicesParseError(Str, L.CurrentPos() - Name.size(),
+                             "Invalid Choice \"" + Name + "\"");
+      return false;
+    }
+    Pending.emplace_back(Parsed, !Negated);
+
+    // Strip any trailing whitespace, then require a separator to go on.
+    L.ConsumeWhitespace();
+    auto const Next = L.Current();
+    if (Next == -1 || !Separators.contains(char(Next))) {
+      break;
+    }
+    // Consume the separator.
+    L.Consume(1);
+  } while (L.Left() > 0);
+
+  // If there is any string left, there must be some kind of mistake.
+  if (L.Left() != 0) {
+    printChoicesParseError(Str, L.CurrentPos(), "Expected ';'");
+    return false;
+  }
+
+  // Parsing succeeded: apply the buffered choices in the order given.
+  for (const auto &Entry : Pending) {
+    if (Entry.second) {
+      enable(Entry.first);
+    } else {
+      disable(Entry.first);
+    }
+  }
+
+  // We have finished successfully.
+  return true;
+}
+
+// Look Str up in the choices table; yields eInvalid for an unknown name.
+VectorizationChoices::Choice VectorizationChoices::fromString(StringRef Str) {
+  for (const auto &Entry : choicesArray) {
+    if (Str == Entry.name) return Entry.number;
+  }
+  return eInvalid;
+}
+
+ArrayRef<VectorizationChoices::ChoiceInfo>
+VectorizationChoices::queryAvailableChoices() {
+  return choicesArray;  // implicitly viewed as an ArrayRef over the table
+}
+
+void VectorizationChoices::printChoicesParseError(StringRef Input,
+                                                  unsigned Position,
+                                                  Twine Msg) {
+  errs() << "CODEPLAY_VECZ_CHOICES parsing error: " << Msg << " at position "
+         << Position << "\n";
+  errs() << "    " << Input << "\n    ";
+  // We use the range [1, Position) instead of [0, Position - 1) to avoid
+  // an underflow in the case of Position = 0
+  for (unsigned i = 0; i < Position; ++i) {
+    errs() << ' ';
+  }
+  errs() << "^\n";
+}
+}  // namespace vecz
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
new file mode 100644
index 0000000000000..8bf445572b7b3
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
@@ -0,0 +1,894 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "vectorization_context.h"
+
+#include <compiler/utils/builtin_info.h>
+#include <compiler/utils/group_collective_helpers.h>
+#include <compiler/utils/mangling.h>
+#include <llvm/ADT/Statistic.h>
+#include <llvm/IR/Attributes.h>
+#include <llvm/Target/TargetMachine.h>
+#include <multi_llvm/creation_apis_helper.h>
+#include <multi_llvm/opaque_pointers.h>
+#include <multi_llvm/vector_type_helper.h>
+
+#include <algorithm>
+#include <cassert>
+
+#include "analysis/vectorization_unit_analysis.h"
+#include "debugging.h"
+#include "llvm_helpers.h"
+#include "memory_operations.h"
+#include "transform/packetization_helpers.h"
+#include "vectorization_helpers.h"
+#include "vectorization_unit.h"
+#include "vecz/vecz_choices.h"
+#include "vecz/vecz_target_info.h"
+
+#define DEBUG_TYPE "vecz"
+
+using namespace llvm;
+using namespace vecz;
+
+STATISTIC(VeczContextFailBuiltin,
+          "Context: builtins with no vector equivalent [ID#V84]");
+STATISTIC(VeczContextFailScalarizeCall,
+          "Context: non-scalarizable vector builtin [ID#V86]");
+
+/// @brief Prefix used to distinguish internal vecz builtins from OpenCL
+/// builtins and user functions.
+const char *VectorizationContext::InternalBuiltinPrefix = "__vecz_b_";
+
+// Bind the context to its module, target info and builtin info.
+VectorizationContext::VectorizationContext(llvm::Module &target,
+                                           TargetInfo &vti,
+                                           compiler::utils::BuiltinInfo &bi)
+    : VTI(vti), Module(target), BI(bi), DL(&Module.getDataLayout()) {}
+
+// Prefer the TargetMachine's TTI for F; else build one from the DataLayout.
+TargetTransformInfo VectorizationContext::getTargetTransformInfo(
+    Function &F) const {
+  if (auto *const TM = targetInfo().getTargetMachine()) {
+    return TM->getTargetTransformInfo(F);
+  }
+  return TargetTransformInfo(F.getParent()->getDataLayout());
+}
+
+// Return the unit currently registered as active for F, or nullptr if none.
+VectorizationUnit *VectorizationContext::getActiveVU(const Function *F) const {
+  const auto Found = ActiveVUs.find(F);
+  if (Found == ActiveVUs.end()) {
+    return nullptr;
+  }
+  assert(Found->second->vectorizedFunction() == F);
+  return Found->second;
+}
+
+// Mutable and read-only access to the BuiltinInfo this context was given.
+compiler::utils::BuiltinInfo &VectorizationContext::builtins() { return BI; }
+const compiler::utils::BuiltinInfo &VectorizationContext::builtins() const {
+  return BI;
+}
+
+VectorizationUnit *VectorizationContext::createVectorizationUnit(
+    llvm::Function &F, ElementCount VF, unsigned Dimension,
+    const VectorizationChoices &Ch) {
+  auto Unit = std::make_unique<VectorizationUnit>(F, VF, Dimension, *this, Ch);
+  KernelUnits.push_back(std::move(Unit));  // the context keeps ownership
+  return KernelUnits.back().get();
+}
+
+// Report whether I involves vectors: either its own type or the type of at
+// least one of its operands is a vector type.
+bool VectorizationContext::isVector(const Instruction &I) {
+  if (I.getType()->isVectorTy()) {
+    return true;
+  }
+  // Otherwise look for a vector among the operands.
+  return std::any_of(I.op_begin(), I.op_end(), [](const Use &Op) {
+    return Op->getType()->isVectorTy();
+  });
+}
+
+// Conservatively decide whether calls to ScalarFn may be expanded: it must
+// return a value and take no pointer arguments.
+bool VectorizationContext::canExpandBuiltin(const Function *ScalarFn) const {
+  // Builtins that return no value must have side-effects.
+  if (ScalarFn->getReturnType()->isVoidTy()) {
+    return false;
+  }
+  // Most builtins that take pointers have side-effects. Be conservative.
+  return std::none_of(ScalarFn->arg_begin(), ScalarFn->arg_end(),
+                      [](const Argument &Arg) {
+                        return Arg.getType()->isPointerTy();
+                      });
+}
+
+// Return the cached vectorization result for builtin F at width SimdWidth,
+// computing and caching it first if necessary. If no vector equivalent can
+// be found, the freshly created cache entry is returned unmodified.
+VectorizationResult &VectorizationContext::getOrCreateBuiltin(
+    llvm::Function &F, unsigned SimdWidth) {
+  auto const Cached = VectorizedBuiltins.find(&F);
+  if (Cached != VectorizedBuiltins.end()) {
+    auto const Found = Cached->second.find(SimdWidth);
+    if (Found != Cached->second.end()) {
+      return Found->second;
+    }
+  }
+
+  auto const Builtin = BI.analyzeBuiltin(F);
+
+  // Try to find a vector equivalent for the builtin.
+  Function *const VectorCallee =
+      isInternalBuiltin(&F)
+          ? getInternalVectorEquivalent(&F, SimdWidth)
+          : BI.getVectorEquivalent(Builtin, SimdWidth, &Module);
+
+  // The cache entry is created even on failure, so misses are cached too.
+  auto &result = VectorizedBuiltins[&F][SimdWidth];
+  if (!VectorCallee) {
+    ++VeczContextFailBuiltin;
+    return result;
+  }
+  result.func = VectorCallee;
+
+  // Gather information about the function's arguments.
+  auto const Props = Builtin.properties;
+  unsigned i = 0;
+  for (Argument &Arg : F.args()) {
+    Type *pointerRetPointeeTy = nullptr;
+    VectorizationResult::Arg::Kind kind = VectorizationResult::Arg::SCALAR;
+
+    // Pointer arguments are recorded as pointer returns, others as vectorized.
+    if (Arg.getType()->isPointerTy()) {
+      pointerRetPointeeTy =
+          compiler::utils::getPointerReturnPointeeTy(F, Props);
+      kind = VectorizationResult::Arg::POINTER_RETURN;
+    } else {
+      kind = VectorizationResult::Arg::VECTORIZED;
+    }
+    result.args.emplace_back(kind, VectorCallee->getArg(i)->getType(),
+                             pointerRetPointeeTy);
+    i++;
+  }
+  return result;
+}
+
+// Find a vectorized equivalent of `callee` for the given factor. Scalable
+// factors are unsupported and yield a default result. A callee returning a
+// fixed vector is first scalarized and then widened by factor * its width.
+VectorizationResult VectorizationContext::getVectorizedFunction(
+    Function &callee, ElementCount factor) {
+  // We can't vectorize builtins by a scalable factor yet.
+  if (factor.isScalable()) {
+    return VectorizationResult();
+  }
+  auto const simdWidth = factor.getFixedValue();
+
+  auto *const vecTy = dyn_cast<FixedVectorType>(callee.getReturnType());
+  if (!vecTy) {
+    return getOrCreateBuiltin(callee, simdWidth);
+  }
+  // Vector builtin: go through its scalar equivalent.
+  auto const Builtin = BI.analyzeBuiltin(callee);
+  Function *const scalarEquiv = BI.getScalarEquivalent(Builtin, &Module);
+  if (!scalarEquiv) {
+    ++VeczContextFailScalarizeCall;
+    return VectorizationResult();
+  }
+  return getOrCreateBuiltin(*scalarEquiv, simdWidth * vecTy->getNumElements());
+}
+
+bool VectorizationContext::isInternalBuiltin(const Function *F) {
+  return F->getName().startswith(InternalBuiltinPrefix);  // vecz-internal?
+}
+
+// Find function Name in the module, declaring it with type FT if necessary.
+Function *VectorizationContext::getOrCreateInternalBuiltin(StringRef Name,
+                                                           FunctionType *FT) {
+  if (Function *const Existing = Module.getFunction(Name)) {
+    return Existing;
+  }
+  return FT ? dyn_cast_or_null<Function>(
+                  Module.getOrInsertFunction(Name, FT).getCallee())
+            : nullptr;
+}
+
+// Get or create a wrapper around the function called by CI that takes one
+// extra trailing i1 mask argument: when the mask is true the wrapper forwards
+// its arguments to the callee, otherwise it skips the call and returns a
+// default value. Wrappers are cached by name in MaskedVersions.
+Function *VectorizationContext::getOrCreateMaskedFunction(CallInst *CI) {
+  Function *F = CI->getCalledFunction();
+  if (!F) {
+    F = dyn_cast<Function>(CI->getCalledOperand()->stripPointerCasts());
+  }
+  VECZ_FAIL_IF(!F);  // TODO: CA-1505: Support indirect function calls.
+  LLVMContext &ctx = F->getContext();
+
+  // We will handle printf statements, but handling every possible vararg
+  // function can become a bit too complex, among other things because name
+  // mangling with arbitrary types can become a bit complex. printf is the only
+  // vararg OpenCL builtin, so only user functions are affected by this.
+  bool isVarArg = F->isVarArg();
+  VECZ_FAIL_IF(isVarArg && F->getName() != "printf");
+  // Copy the argument types. This is done from the CallInst instead of the
+  // called Function because the called Function might be a VarArg function, in
+  // which case we need to create the wrapper with the expanded argument list.
+  SmallVector<Type *, 8> argTys;
+  for (auto const &U : CI->args()) {
+    argTys.push_back(U->getType());
+  }
+  AttributeList fnAttrs = F->getAttributes();
+  unsigned firstImmArg;
+  const bool hasImmArg =
+      F->isIntrinsic() &&
+      fnAttrs.hasAttrSomewhere(Attribute::ImmArg, &firstImmArg);
+  if (hasImmArg) {
+    firstImmArg -= AttributeList::FirstArgIndex;
+    // We can only handle a single `i1` `Immarg` parameter. If we outgrow this
+    // limitation we need a different approach to the single inner branch
+    int count = 0;
+    for (unsigned i = firstImmArg, n = argTys.size(); i < n; ++i) {
+      if (!fnAttrs.hasAttributeAtIndex(AttributeList::FirstArgIndex + i,
+                                       Attribute::ImmArg)) {
+        continue;
+      }
+      // We only support a single ImmArg, which must be of i1 type
+      if (count++ || argTys[i] != Type::getInt1Ty(ctx)) {
+        return nullptr;
+      }
+      fnAttrs = fnAttrs.removeAttributeAtIndex(
+          ctx, AttributeList::FirstArgIndex + i, Attribute::ImmArg);
+    }
+  }
+  // Add one extra argument for the mask
+  argTys.push_back(Type::getInt1Ty(ctx));
+  // Generate the function name
+  compiler::utils::NameMangler mangler(&ctx);
+  std::string newFName;
+  raw_string_ostream O(newFName);
+  O << VectorizationContext::InternalBuiltinPrefix << "masked_" << F->getName();
+  // We need to mangle the names of the vararg masked functions, since we will
+  // generate different masked functions for invocations with different argument
+  // types. For non-vararg functions, we don't need the mangling so we skip it.
+  if (isVarArg) {
+    O << "_";
+    for (auto T : argTys) {
+      VECZ_FAIL_IF(!mangler.mangleType(
+          O, T,
+          compiler::utils::TypeQualifiers(compiler::utils::eTypeQualNone)));
+    }
+  }
+  O.flush();
+  // Check if we have a masked version already
+  auto maskedVersion = MaskedVersions.find(newFName);
+  if (maskedVersion != MaskedVersions.end()) {
+    LLVM_DEBUG(dbgs() << "vecz: Found existing masked function " << newFName
+                      << "\n");
+    return maskedVersion->second;
+  }
+  // Create the function type
+  FunctionType *newFunctionTy =
+      FunctionType::get(F->getReturnType(), argTys, false);
+  Function *newFunction = Function::Create(
+      newFunctionTy, GlobalValue::PrivateLinkage, newFName, F->getParent());
+  const CallingConv::ID cc = CI->getCallingConv();
+  LLVM_DEBUG(dbgs() << "vecz: Created masked function " << newFName << "\n");
+
+  // Create the function's basic blocks
+  BasicBlock *entryBlock = BasicBlock::Create(ctx, "entry", newFunction);
+  BasicBlock *activeBlock = BasicBlock::Create(ctx, "active", newFunction);
+  BasicBlock *mergeBlock = BasicBlock::Create(ctx, "exit", newFunction);
+
+  // Create a new call instruction to call the original function
+  SmallVector<Value *, 8> CIArgs;
+  for (Value &arg : newFunction->args()) {
+    CIArgs.push_back(&arg);
+  }
+  // Remove the mask argument
+  CIArgs.pop_back();
+  FunctionType *FTy = CI->getFunctionType();
+  AttributeList callAttrs = CI->getAttributes();
+  SmallVector<std::pair<Value *, BasicBlock *>, 4> PhiOperands;
+  if (hasImmArg) {
+    Value *immArg = newFunction->getArg(firstImmArg);
+    BasicBlock *immTrue =
+        BasicBlock::Create(ctx, "active.imm.1", newFunction, mergeBlock);
+    CIArgs[firstImmArg] = ConstantInt::getTrue(ctx);
+    CallInst *c0 =
+        CallInst::Create(FTy, CI->getCalledOperand(), CIArgs, "", immTrue);
+    c0->setCallingConv(cc);
+    c0->setAttributes(callAttrs);
+    BranchInst::Create(mergeBlock, immTrue);
+
+    CIArgs[firstImmArg] = ConstantInt::getFalse(ctx);
+    // Now the false half
+    BasicBlock *immFalse =
+        BasicBlock::Create(ctx, "active.imm.0", newFunction, mergeBlock);
+    CallInst *c1 =
+        CallInst::Create(FTy, CI->getCalledOperand(), CIArgs, "", immFalse);
+    c1->setCallingConv(cc);
+    c1->setAttributes(callAttrs);
+    BranchInst::Create(mergeBlock, immFalse);
+    BranchInst::Create(immTrue, immFalse, immArg, activeBlock);
+    PhiOperands.push_back({c0, immTrue});
+    PhiOperands.push_back({c1, immFalse});
+
+    // Now fix up the new function's signature. It can't be inheriting illegal
+    // attributes; only intrinsics may have the `ImmArg` Attribute, and the
+    // verifier complains loudly otherwise.
+    for (unsigned i = 0, n = fnAttrs.getNumAttrSets(); i < n; ++i) {
+      fnAttrs = fnAttrs.removeAttributeAtIndex(ctx, i, Attribute::ImmArg);
+    }
+  } else {
+    // We are using the called Value instead of F because it might contain
+    // a bitcast or something, which makes the function types different.
+    CallInst *c =
+        CallInst::Create(FTy, CI->getCalledOperand(), CIArgs, "", activeBlock);
+    c->setCallingConv(cc);
+    c->setAttributes(callAttrs);
+    PhiOperands.push_back({c, activeBlock});
+    BranchInst::Create(mergeBlock, activeBlock);
+  }
+  newFunction->setCallingConv(cc);
+  newFunction->setAttributes(fnAttrs);
+
+  // Get the last argument (the mask) and use it as our branch predicate as to
+  // the live blocks or a no-op
+  Value *mask = newFunction->arg_end() - 1;
+  BranchInst::Create(activeBlock, mergeBlock, mask, entryBlock);
+
+  Type *returnTy = F->getReturnType();
+  if (returnTy != Type::getVoidTy(ctx)) {
+    PHINode *result = PHINode::Create(returnTy, 2, "", mergeBlock);
+    for (auto &phiOp : PhiOperands) {
+      result->addIncoming(phiOp.first, phiOp.second);
+    }
+    result->addIncoming(getDefaultValue(returnTy), entryBlock);
+    ReturnInst::Create(ctx, result, mergeBlock);
+  } else {
+    ReturnInst::Create(ctx, mergeBlock);
+  }
+
+  MaskedVersions.insert(std::make_pair(newFName, newFunction));
+  insertMaskedFunction(newFunction, F);
+  return newFunction;
+}
+
+namespace {
+// Decode the name of an internal subgroup scan builtin. fnName must start with
+// <prefix>sub_group_scan_{inclusive,exclusive}_<op>, optionally followed by
+// "_vp"; ty selects between the integer and floating-point forms of <op>.
+// Returns (isInclusive, recurrence kind, isVP), or None if nothing matches.
+multi_llvm::Optional<std::tuple<bool, RecurKind, bool>> isSubgroupScan(
+    StringRef fnName, Type *const ty) {
+  compiler::utils::Lexer L(fnName);
+  if (!L.Consume(VectorizationContext::InternalBuiltinPrefix) ||
+      !L.Consume("sub_group_scan_")) {
+    return multi_llvm::None;
+  }
+  bool const isInt = ty->isIntOrIntVectorTy();
+  bool const isInclusive = L.Consume("inclusive_");
+  StringRef OpKind;
+  if ((!isInclusive && !L.Consume("exclusive_")) || !L.ConsumeAlpha(OpKind)) {
+    return multi_llvm::None;
+  }
+  RecurKind opKind;
+  if (OpKind == "add") {
+    opKind = isInt ? RecurKind::Add : RecurKind::FAdd;
+  } else if (OpKind == "min") {
+    assert(!isInt && "unexpected internal scan builtin");
+    opKind = RecurKind::FMin;
+  } else if (OpKind == "max") {
+    assert(!isInt && "unexpected internal scan builtin");
+    opKind = RecurKind::FMax;
+  } else if (OpKind == "smin") {
+    opKind = RecurKind::SMin;
+  } else if (OpKind == "smax") {
+    opKind = RecurKind::SMax;
+  } else if (OpKind == "umin") {
+    opKind = RecurKind::UMin;
+  } else if (OpKind == "umax") {
+    opKind = RecurKind::UMax;
+  } else if (OpKind == "mul") {
+    opKind = isInt ? RecurKind::Mul : RecurKind::FMul;
+  } else if (OpKind == "and") {
+    assert(isInt && "unexpected internal scan builtin");
+    opKind = RecurKind::And;
+  } else if (OpKind == "or") {
+    assert(isInt && "unexpected internal scan builtin");
+    opKind = RecurKind::Or;
+  } else if (OpKind == "xor") {
+    assert(isInt && "unexpected internal scan builtin");
+    opKind = RecurKind::Xor;
+  } else {
+    return multi_llvm::None;
+  }
+  bool const isVP = L.Consume("_vp");
+  return std::make_tuple(isInclusive, opKind, isVP);
+}
+}  // namespace
+
+// Emit a body for the internal builtin declaration F. Memory operation
+// builtins are dispatched on the kind reported by their MemOpDesc; subgroup
+// scans are recognized by name. Returns false if F is not a builtin handled
+// here; otherwise the result of the selected emitter.
+bool VectorizationContext::defineInternalBuiltin(Function *F) {
+  assert(F->isDeclaration() && "builtin is already defined");
+
+  if (multi_llvm::Optional<MemOpDesc> Desc =
+          MemOpDesc::analyzeMemOpFunction(*F)) {
+    // Masked memory loads and stores.
+    if (Desc->isMaskedMemOp()) {
+      return emitMaskedMemOpBody(*F, *Desc);
+    }
+    // Interleaved memory loads and stores.
+    if (Desc->isInterleavedMemOp()) {
+      return emitInterleavedMemOpBody(*F, *Desc);
+    }
+    // Masked interleaved memory loads and stores.
+    if (Desc->isMaskedInterleavedMemOp()) {
+      return emitMaskedInterleavedMemOpBody(*F, *Desc);
+    }
+    // Scatter stores and gather loads.
+    if (Desc->isScatterGatherMemOp()) {
+      return emitScatterGatherMemOpBody(*F, *Desc);
+    }
+    // Masked scatter stores and gather loads.
+    if (Desc->isMaskedScatterGatherMemOp()) {
+      return emitMaskedScatterGatherMemOpBody(*F, *Desc);
+    }
+    // Any other kind of memory operation falls through to the checks below.
+  }
+
+  // Subgroup scan operations.
+  auto const scanInfo = isSubgroupScan(F->getName(), F->getReturnType());
+  if (!scanInfo) {
+    return false;
+  }
+  return emitSubgroupScanBody(*F, /*IsInclusive=*/std::get<0>(*scanInfo),
+                              /*OpKind=*/std::get<1>(*scanInfo),
+                              /*IsVP=*/std::get<2>(*scanInfo));
+}
+
+// Emit F's body as a single masked load or store created through the target
+// info. Fails, before any return is emitted, if the target creates nothing.
+bool VectorizationContext::emitMaskedMemOpBody(Function &F,
+                                               MemOpDesc const &Desc) const {
+  Value *Data = Desc.getDataOperand(&F);
+  Value *Ptr = Desc.getPointerOperand(&F);
+  Value *Mask = Desc.getMaskOperand(&F);
+  Value *VL = Desc.isVLOp() ? Desc.getVLOperand(&F) : nullptr;
+  Type *DataTy = Desc.isLoad() ? F.getReturnType() : Data->getType();
+  assert(multi_llvm::isOpaqueOrPointeeTypeMatches(
+      cast<PointerType>(Ptr->getType()), DataTy));
+  auto const Align = Desc.getAlignment();
+
+  IRBuilder<> B(BasicBlock::Create(F.getContext(), "entry", &F));
+  if (Desc.isLoad()) {
+    Value *Result = VTI.createMaskedLoad(B, DataTy, Ptr, Mask, VL, Align);
+    VECZ_FAIL_IF(!Result);
+    B.CreateRet(Result);
+  } else {
+    VECZ_FAIL_IF(!VTI.createMaskedStore(B, Data, Ptr, Mask, VL, Align));
+    B.CreateRetVoid();
+  }
+  return true;
+}
+
+bool VectorizationContext::emitInterleavedMemOpBody(
+    Function &F, MemOpDesc const &Desc) const {
+  return emitMaskedInterleavedMemOpBody(F, Desc);  // copes with a null mask
+}
+
+// Emit F's body as an interleaved load or store created through the target
+// info, using the masked form only when Desc provides a mask operand. Fails
+// on a non-vector data type, a missing pointer, or if nothing is created.
+bool VectorizationContext::emitMaskedInterleavedMemOpBody(
+    Function &F, MemOpDesc const &Desc) const {
+  Value *const Data = Desc.getDataOperand(&F);
+  auto *const Ptr = Desc.getPointerOperand(&F);
+  VECZ_FAIL_IF(!isa<VectorType>(Desc.getDataType()) || !Ptr);
+
+  auto *const Mask = Desc.getMaskOperand(&F);
+  auto *const VL = Desc.isVLOp() ? Desc.getVLOperand(&F) : nullptr;
+  auto const Align = Desc.getAlignment();
+  auto const Stride = Desc.getStride();
+  IRBuilder<> B(BasicBlock::Create(F.getContext(), "entry", &F));
+
+  if (!Desc.isLoad()) {
+    auto *const Stored =
+        Mask ? VTI.createMaskedInterleavedStore(B, Data, Ptr, Mask, Stride, VL,
+                                                Align)
+             : VTI.createInterleavedStore(B, Data, Ptr, Stride, VL, Align);
+    VECZ_FAIL_IF(!Stored);
+    B.CreateRetVoid();
+    return true;
+  }
+
+  Type *const RetTy = F.getReturnType();
+  auto *const Loaded =
+      Mask ? VTI.createMaskedInterleavedLoad(B, RetTy, Ptr, Mask, Stride, VL,
+                                             Align)
+           : VTI.createInterleavedLoad(B, RetTy, Ptr, Stride, VL, Align);
+  VECZ_FAIL_IF(!Loaded);
+  B.CreateRet(Loaded);
+  return true;
+}
+
+bool VectorizationContext::emitScatterGatherMemOpBody(
+    Function &F, MemOpDesc const &Desc) const {
+  return emitMaskedScatterGatherMemOpBody(F, Desc);  // copes with a null mask
+}
+
+// Emit F's body as a gather load or scatter store created through the target
+// info, using the masked form only when Desc provides a mask operand. Fails
+// on a non-vector data type, a missing pointer, or if nothing is created.
+bool VectorizationContext::emitMaskedScatterGatherMemOpBody(
+    Function &F, MemOpDesc const &Desc) const {
+  Value *const Data = Desc.getDataOperand(&F);
+  auto *const VecDataTy = dyn_cast<VectorType>(Desc.getDataType());
+  auto *const Ptr = Desc.getPointerOperand(&F);
+  VECZ_FAIL_IF(!VecDataTy || !Ptr);
+
+  auto *const Mask = Desc.getMaskOperand(&F);
+  auto *const VL = Desc.isVLOp() ? Desc.getVLOperand(&F) : nullptr;
+  auto const Align = Desc.getAlignment();
+  IRBuilder<> B(BasicBlock::Create(F.getContext(), "entry", &F));
+
+  if (!Desc.isLoad()) {
+    auto *const Stored =
+        Mask ? VTI.createMaskedScatterStore(B, Data, Ptr, Mask, VL, Align)
+             : VTI.createScatterStore(B, Data, Ptr, VL, Align);
+    VECZ_FAIL_IF(!Stored);
+    B.CreateRetVoid();
+    return true;
+  }
+
+  auto *const Loaded =
+      Mask ? VTI.createMaskedGatherLoad(B, VecDataTy, Ptr, Mask, VL, Align)
+           : VTI.createGatherLoad(B, VecDataTy, Ptr, VL, Align);
+  VECZ_FAIL_IF(!Loaded);
+  B.CreateRet(Loaded);
+  return true;
+}
+
+// Emit a subgroup scan operation.
+// If the vectorization factor is fixed, we can do a scan in log2(N) steps,
+// by noting that an inclusive scan can be split into two, and recombined into
+// a single result by adding the last element of the first half onto every
+// element of the second half. To deal with exclusive scans, we rotate the
+// result by one element and insert the neutral element at the beginning.
+//
+// For now, when using scalable vectorization factor, this takes the form of a
+// simple loop that accumulates the scan operation in scalar form, extracting
+// and inserting elements of the resulting vector on each iteration:
+//   %v = <A,B,C,D>
+//   Iteration 0:
+//     %e.0 = extractelement %v, 0          (A)
+//     %s.0 = add N, %e.0                   (A)
+//     %v.0 = insertelement undef, %s.0, 0  (<A,U,U,U>)
+//   Iteration 1:
+//     %e.1 = extractelement %v, 1          (B)
+//     %s.1 = add %s.0, %e.1                (A+B)
+//     %v.1 = insertelement  %v.0, %s.1, 1  (<A,A+B,U,U>)
+//   Iteration 2:
+//     %e.2 = extractelement %v, 2          (C)
+//     %s.2 = add %s.1, %e.2                (A+B+C)
+//     %v.2 = insertelement  %v.1, %s.2, 2  (<A,A+B,A+B+C,U>)
+//   Iteration 3:
+//     %e.3 = extractelement %v, 3          (D)
+//     %s.3 = add %s.2, %e.3                (A+B+C+D)
+//     %v.3 = insertelement  %v.2, %s.3, 3  (<A,A+B,A+B+C,A+B+C+D>)
+//   Result:
+//     %v.3 = <A,A+B,A+B+C,A+B+C+D>
+//
+// Exclusive scans operate by pre-filling the vector with the neutral value,
+// looping from 1 onwards, and extracting from one less than the current
+// iteration:
+//   %z = insertelement undef, N, 0
+//   Iteration 0:
+//     %e.0 = extractelement %v, 0          (A)
+//     %s.0 = add N, %e.0                   (A)
+//     %v.0 = insertelement %z, %s.0, 1     (<N,A,U,U>)
+// This loop operates up to the VL input, if it is a vector-predicated scan.
+// Elements past the vector length will receive a default zero value.
+// Note: This method is not optimal for fixed-length code, but serves as a way
+// of producing scalable- and fixed-length vector code equivalently.
+bool VectorizationContext::emitSubgroupScanBody(Function &F, bool IsInclusive,
+                                                RecurKind OpKind,
+                                                bool IsVP) const {
+  LLVMContext &Ctx = F.getContext();
+
+  auto *const Entry = BasicBlock::Create(Ctx, "entry", &F);
+  IRBuilder<> B(Entry);
+
+  Type *const VecTy = F.getReturnType();
+  Type *const EltTy = multi_llvm::getVectorElementType(VecTy);
+  ElementCount EC = multi_llvm::getVectorElementCount(VecTy);
+
+  Function::arg_iterator Arg = F.arg_begin();
+
+  Value *const Vec = Arg;
+  Value *const VL = IsVP ? ++Arg : nullptr;
+
+  // If it's not a scalable vector, we can do it the fast way.
+  if (!EC.isScalable() && !IsVP) {
+    auto *const NeutralVal = compiler::utils::getNeutralVal(OpKind, EltTy);
+    auto const Width = EC.getFixedValue();
+    auto *const UndefVal = UndefValue::get(VecTy);
+
+    // Put the Neutral element in a vector so we can shuffle it in.
+    auto *const NeutralVec =
+        B.CreateInsertElement(UndefVal, NeutralVal, B.getInt64(0));
+
+    auto *Result = Vec;
+    unsigned N = 1u;
+
+    SmallVector<int, 16> mask(Width);
+    while (N < Width) {
+      // Build shuffle mask.
+      // The sequence of masks will be, for a width of 16
+      // (in hexadecimal for concision, where x represents the neutral value
+      // element):
+      //
+      // x0x2x4x6x8xAxCxE
+      // xx11xx55xx99xxDD
+      // xxxx3333xxxxBBBB
+      // xxxxxxxx77777777
+      //
+      auto const N2 = N << 1u;
+      auto MaskIt = mask.begin();
+      for (size_t i = 0; i < Width; i += N2) {
+        for (size_t j = 0; j < N; ++j) {
+          *MaskIt++ = Width;
+        }
+
+        auto const k = i + N - 1;
+        for (size_t j = 0; j < N; ++j) {
+          *MaskIt++ = k;
+        }
+      }
+      N = N2;
+      auto *const Shuffle =
+          createOptimalShuffle(B, Result, NeutralVec, mask, Twine("scan_impl"));
+      Result = multi_llvm::createBinOpForRecurKind(B, Result, Shuffle, OpKind);
+    }
+
+    if (!IsInclusive) {
+      // If it is an exclusive scan, rotate the result.
+      auto *const IdentityVal = compiler::utils::getIdentityVal(OpKind, EltTy);
+      VECZ_FAIL_IF(!IdentityVal);
+      Result = VTI.createVectorSlideUp(B, Result, IdentityVal, VL);
+    }
+
+    B.CreateRet(Result);
+    return true;
+  }
+
+  // If the vector is scalable, we don't know the number of iterations required,
+  // so we have to use a loop and shuffle masks generated from the step vector.
+
+  auto *const IVTy = B.getInt32Ty();
+  auto *const IndexTy = VectorType::get(IVTy, EC);
+  auto *const Step = B.CreateStepVector(IndexTy, "step");
+  auto *const VZero = Constant::getNullValue(IndexTy);
+
+  auto *const Loop = BasicBlock::Create(Ctx, "loop", &F);
+  auto *const Exit = BasicBlock::Create(Ctx, "exit", &F);
+
+  // The length of the vector.
+  Value *Width = nullptr;
+  if (IsVP) {
+    Width = VL;
+  } else if (EC.isScalable()) {
+    Width = B.CreateVScale(ConstantInt::get(IVTy, EC.getKnownMinValue()));
+  } else {
+    Width = ConstantInt::get(IVTy, EC.getFixedValue());
+  }
+
+  B.CreateBr(Loop);
+
+  // Loop induction starts at 1 and doubles each time.
+  auto *const IVStart = ConstantInt::get(IVTy, 1);
+
+  // Create the loop instructions
+  B.SetInsertPoint(Loop);
+
+  // The induction variable (IV) which determines both our loop bounds and our
+  // vector indices.
+  auto *N = B.CreatePHI(IVTy, 2, "iv");
+  N->addIncoming(IVStart, Entry);
+
+  // A vector phi representing the vectorized value we're building up.
+  auto *VecPhi = B.CreatePHI(VecTy, 2, "vec");
+  VecPhi->addIncoming(Vec, Entry);
+
+  // A vector phi holding the shuffle mask indices, updated every iteration.
+  auto *MaskPhi = B.CreatePHI(IndexTy, 2, "mask.phi");
+  MaskPhi->addIncoming(Step, Entry);
+
+  // This will create shuffle masks like the following sequence:
+  //
+  // 1032547698BADCFE = (0123456789ABCDEF ^ splat(1))
+  // 33117755BB99FFDD = (1032547698BADCFE ^ splat(2)) | splat(1)
+  // 77773333FFFFBBBB = (33117755BB99FFDD ^ splat(4)) | splat(2)
+  // FFFFFFFF77777777 = (77773333FFFFBBBB ^ splat(8)) | splat(4)
+  //
+  // We don't mix the neutral element into the vector in this case, but use a
+  // Select instruction to choose between the updated or original value, so that
+  // backends can lower it as a masked binary operation. The select condition
+  // therefore needs to be like the following sequence:
+  //
+  // 0101010101010101
+  // 0011001100110011
+  // 0000111100001111
+  // 0000000011111111
+
+  auto *const SplatN = B.CreateVectorSplat(EC, N, "splatN");
+  auto *const Mask = B.CreateXor(MaskPhi, SplatN, "mask");
+  auto *const Shuffle = VTI.createVectorShuffle(B, VecPhi, Mask, VL);
+  auto *const Accum =
+      multi_llvm::createBinOpForRecurKind(B, VecPhi, Shuffle, OpKind);
+
+  auto *const NBit = B.CreateAnd(MaskPhi, SplatN, "isolate");
+  auto *const Which = B.CreateICmpNE(NBit, VZero, "which");
+  auto *const NewVec = B.CreateSelect(Which, Accum, VecPhi, "newvec");
+
+  auto *const NewMask = B.CreateOr(Mask, SplatN, "newmask");
+  auto *const N2 = B.CreateShl(N, ConstantInt::get(IVTy, 1), "N2",
+                               /*HasNUW*/ true, /*HasNSW*/ true);
+
+  VecPhi->addIncoming(NewVec, Loop);
+  MaskPhi->addIncoming(NewMask, Loop);
+  N->addIncoming(N2, Loop);
+
+  // Loop exit condition
+  auto *const Cond = B.CreateICmpULT(N2, Width, "iv.cmp");
+  B.CreateCondBr(Cond, Loop, Exit);
+
+  // Function exit instructions:
+  B.SetInsertPoint(Exit);
+
+  // Create an LCSSA PHI node.
+  auto *const ResultPhi = B.CreatePHI(VecTy, 1, "res.phi");
+  ResultPhi->addIncoming(NewVec, Loop);
+
+  Value *Result = ResultPhi;
+  if (!IsInclusive) {
+    // If it is an exclusive scan, rotate the result.
+    auto *const IdentityVal = compiler::utils::getIdentityVal(OpKind, EltTy);
+    VECZ_FAIL_IF(!IdentityVal);
+    Result = VTI.createVectorSlideUp(B, Result, IdentityVal, VL);
+  }
+
+  B.CreateRet(Result);
+  return true;
+}
+
+Function *VectorizationContext::getInternalVectorEquivalent(
+    Function *ScalarFn, unsigned SimdWidth) {
+  if (!ScalarFn) {
+    return nullptr;
+  }
+  // Only masked memory loads and stores are handled; bail out otherwise.
+  auto MemOp = MemOpDesc::analyzeMaskedMemOp(*ScalarFn);
+  if (!MemOp) {
+    return nullptr;
+  }
+  auto *WideTy = FixedVectorType::get(MemOp->getDataType(), SimdWidth);
+  auto *PtrTy = cast<PointerType>(MemOp->getPointerType());
+  return getOrCreateMaskedMemOpFn(*this, WideTy, PtrTy, MemOp->getAlignment(),
+                                  MemOp->isLoad(), MemOp->isVLOp());
+}
+
+bool VectorizationContext::isMaskedFunction(const llvm::Function *F) const {
+  return MaskedFunctionsMap.count(F) != 0;
+}
+
+bool VectorizationContext::insertMaskedFunction(llvm::Function *F,
+                                                llvm::Function *WrappedF) {
+  // Hand insert()'s `second` flag straight back to the caller.
+  return MaskedFunctionsMap.insert({F, WrappedF}).second;
+}
+
+llvm::Function *VectorizationContext::getOriginalMaskedFunction(
+    llvm::Function *F) {
+  // Functions with no entry in the masked-function map yield nullptr.
+  auto Entry = MaskedFunctionsMap.find(F);
+  if (Entry == MaskedFunctionsMap.end()) {
+    return nullptr;
+  }
+  return dyn_cast_or_null<llvm::Function>(Entry->second);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+char DefineInternalBuiltinsPass::PassID = 0;
+
+PreservedAnalyses DefineInternalBuiltinsPass::run(Module &M,
+                                                  ModuleAnalysisManager &AM) {
+  llvm::FunctionAnalysisManager &FAM =
+      AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+
+  // Unused internal builtin declarations, erased once the loop below is done.
+  SmallVector<Function *, 4> ToRemove;
+
+  bool NonePreserved = false;
+  // Implement internal builtins that we now know are needed.
+  // We find all declarations that should be builtins, and then define them if
+  // they have users that have associated vectorization units.
+  // On failure to define, we notify those vectorization units of failure
+  // and remove any partially defined body.
+  // Unused declarations are removed.
+  for (Function &F : M.functions()) {
+    if (!F.isDeclaration() || !VectorizationContext::isInternalBuiltin(&F)) {
+      continue;
+    }
+    if (F.use_empty()) {
+      ToRemove.push_back(&F);
+      NonePreserved = true;
+      continue;
+    }
+    llvm::SmallPtrSet<VectorizationUnit *, 1> UserVUs;
+    for (Use &U : F.uses()) {
+      if (CallInst *CI = dyn_cast<CallInst>(U.getUser())) {
+        auto R = FAM.getResult<VectorizationUnitAnalysis>(*CI->getFunction());
+        if (R.hasResult()) {
+          UserVUs.insert(&R.getVU());
+        }
+      }
+    }
+    if (std::all_of(UserVUs.begin(), UserVUs.end(),
+                    [](VectorizationUnit *VU) { return VU->failed(); })) {
+      // If every user VU has failed (or there are none), we do not define the
+      // internal builtin, both because it's a waste of time and because we
+      // might try to instantiate some invalid builtin that would have been
+      // replaced by the packetization process.
+      continue;
+    }
+
+    VectorizationContext &Ctx = (*UserVUs.begin())->context();
+    bool DefinedBuiltin = Ctx.defineInternalBuiltin(&F);
+    if (!DefinedBuiltin) {
+      // If we've failed to define this builtin, ensure we clean up the
+      // half-complete body. We can't simply delete it because it will have
+      // uses in the vector kernel. This will revert it to a declaration, which
+      // will be cleaned up later by the global optimizer.
+      if (!F.isDeclaration()) {
+        // defineInternalBuiltin may have partially defined the function body.
+        // Clean it up. FIXME defineInternalBuiltin should probably clean up
+        // after itself if there is a failure condition
+        F.deleteBody();
+      }
+      for (VectorizationUnit *VU : UserVUs) {
+        VU->setFailed("failed to define an internal builtin");
+      }
+      continue;
+    }
+    NonePreserved = true;
+  }
+
+  for (Function *F : ToRemove) {
+    F->eraseFromParent();
+  }
+
+  return NonePreserved ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_helpers.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_helpers.cpp
new file mode 100644
index 0000000000000..e8b7dc9360f7f
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_helpers.cpp
@@ -0,0 +1,395 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "vectorization_helpers.h"
+
+#include <compiler/utils/attributes.h>
+#include <compiler/utils/metadata.h>
+#include <llvm/IR/Attributes.h>
+#include <llvm/IR/DIBuilder.h>
+#include <llvm/IR/IntrinsicInst.h>
+#include <llvm/Support/Debug.h>
+#include <multi_llvm/multi_llvm.h>
+#include <multi_llvm/vector_type_helper.h>
+
+#include "debugging.h"
+#include "vectorization_context.h"
+#include "vectorization_unit.h"
+#include "vecz/vecz_choices.h"
+
+using namespace llvm;
+using namespace vecz;
+
+namespace {
+
+Function *declareFunction(VectorizationUnit const &VU) {
+  Function const *const ScalarFn = VU.scalarFunction();
+
+  // For kernels, the vectorized function type is the same as the original
+  // scalar function type, since function arguments are uniform. We no longer
+  // use Vectorization Units for builtins.
+  FunctionType *const FnTy = ScalarFn->getFunctionType();
+  VECZ_FAIL_IF(!FnTy);
+  std::string const Name =
+      getVectorizedFunctionName(ScalarFn->getName(), VU.width(), VU.choices());
+  Module &M = VU.context().module();
+  M.getOrInsertFunction(Name, FnTy);
+  Function *const VecFn = M.getFunction(Name);
+  if (!VecFn) {
+    return nullptr;
+  }
+  VecFn->setCallingConv(ScalarFn->getCallingConv());
+  return VecFn;
+}
+
+/// @brief Clone the OpenCL named metadata node with name NodeName
+/// @param[in] VU The vectorization unit supplying the scalar and vectorized
+/// kernels
+/// @param[in] NodeName The name of the node to clone
+///
+/// This function works with nodes that follow a specific pattern: their
+/// operands are other metadata nodes whose first operand is the OpenCL kernel
+/// Function. It searches for the node that refers to the scalar kernel and
+/// copies all of its metadata, with the exception of the Function itself,
+/// which is replaced by the vectorized kernel.
+void cloneOpenCLNamedMetadataHelper(VectorizationUnit const &VU,
+                                    const std::string &NodeName) {
+  Module &M = VU.context().module();
+
+  // Nothing to clone if the module lacks this named metadata.
+  NamedMDNode *const NamedMD = M.getNamedMetadata(NodeName);
+  if (!NamedMD) {
+    return;
+  }
+
+  // Locate the operand describing the scalar function.
+  MDNode *ScalarMD = nullptr;
+  for (auto *Candidate : NamedMD->operands()) {
+    // The function is expected to be the first operand.
+    if (Candidate->getNumOperands() == 0) {
+      continue;
+    }
+
+    // Get the Constant wrapping the function, if the operand is one.
+    auto *const FnMD =
+        dyn_cast_or_null<ConstantAsMetadata>(Candidate->getOperand(0));
+    // Stop at the first node whose function is the original scalar kernel.
+    if (FnMD && FnMD->getValue() == VU.scalarFunction()) {
+      ScalarMD = Candidate;
+      break;
+    }
+  }
+
+  // No metadata refers to the scalar function: nothing to do.
+  if (!ScalarMD) {
+    return;
+  }
+
+  // The new node starts with the vectorized function, followed by every
+  // remaining operand of the scalar node, unchanged.
+  SmallVector<llvm::Metadata *, 5> NewOps;
+  NewOps.push_back(llvm::ConstantAsMetadata::get(VU.vectorizedFunction()));
+  auto const OpsEnd = ScalarMD->op_end();
+  for (auto Op = ScalarMD->op_begin() + 1; Op != OpsEnd; ++Op) {
+    NewOps.push_back(*Op);
+  }
+
+  // Append the freshly built node to the named metadata.
+  llvm::MDNode *const VectorMD =
+      llvm::MDNode::get(VU.context().module().getContext(), NewOps);
+  NamedMD->addOperand(VectorMD);
+}
+
+/// @brief Create placeholder instructions for arguments that will be
+/// vectorized. This is necessary to clone the original function's scalar code
+/// into the vectorized function.
+///
+/// @param[in,out] ValueMap Map to update with the arguments.
+SmallVector<Instruction *, 2> createArgumentPlaceholders(
+    VectorizationUnit const &VU, Function *VecFunc,
+    ValueToValueMapTy &ValueMap) {
+  SmallVector<Instruction *, 2> Created;
+  auto const &VUArgs = VU.arguments();
+  unsigned ArgIdx = 0u;
+  for (Argument &NewArg : VecFunc->args()) {
+    Argument *const OldArg = VUArgs[ArgIdx++].OldArg;
+    NewArg.setName(OldArg->getName());
+    // Matching types: the old argument maps straight onto the new one.
+    if (NewArg.getType() == OldArg->getType()) {
+      ValueMap[OldArg] = &NewArg;
+      continue;
+    }
+    // Otherwise use a temporary placeholder to work around the difference in
+    // argument types. This usually happens when vectorizing builtin functions.
+    Type *const I32Ty = Type::getInt32Ty(VecFunc->getParent()->getContext());
+    auto *const Placeholder =
+        ExtractElementInst::Create(&NewArg, Constant::getNullValue(I32Ty));
+    ValueMap[OldArg] = Placeholder;
+    Created.push_back(Placeholder);
+  }
+  return Created;
+}
+
+}  // namespace
+
+namespace vecz {
+std::string getVectorizedFunctionName(StringRef ScalarName, ElementCount VF,
+                                      VectorizationChoices Choices) {
+  // Hold the pieces as StringRefs; Twines must not be stored in locals.
+  StringRef const Prefix = VF.isScalable() ? "nxv" : "v";
+  StringRef const Sep = Choices.vectorPredication() ? "_vp_" : "_";
+  return (Twine("__vecz_") + Prefix + Twine(VF.getKnownMinValue()) + Sep +
+          ScalarName).str();
+}
+
+Function *cloneFunctionToVector(VectorizationUnit const &VU) {
+  auto *const VectorizedFn = declareFunction(VU);
+  VECZ_ERROR_IF(!VectorizedFn, "declareFunction failed to initialize");
+
+  auto *const ScalarFn = VU.scalarFunction();
+
+  // Map the old arguments to the new ones.
+  ValueToValueMapTy ValueMap;
+  auto Placeholders = createArgumentPlaceholders(VU, VectorizedFn, ValueMap);
+
+  // Clone the function to preserve instructions that do not need vectorization.
+  SmallVector<ReturnInst *, 4> Returns;
+
+  // Setting `moduleChanges` to true allows `llvm::CloneFunctionInto()` to do
+  // the work of cloning debug info across translation unit boundaries.
+  // However there can be issues with inlined kernels if the inlined kernel
+  // still exists in the kernel, and also has a vectorized variant.
+  // This value was set to true in this code since LLVM_VERSION_MAJOR > 4 but as
+  // of llvm > 12 we need to be a bit more careful with that value as there is
+  // more nuance introduced in 22a52dfddc with requisite assertions
+  const bool moduleChanges = VectorizedFn->getParent() != ScalarFn->getParent();
+  auto cloneMode = moduleChanges ? CloneFunctionChangeType::DifferentModule
+                                 : CloneFunctionChangeType::LocalChangesOnly;
+  CloneFunctionInto(VectorizedFn, ScalarFn, ValueMap, cloneMode, Returns);
+
+  // Remove zext/sext return attributes when the return type is a vector.
+  if (VectorizedFn->getReturnType()->isVectorTy()) {
+    LLVMContext &Ctx = VectorizedFn->getContext();
+    AttributeList PAL = VectorizedFn->getAttributes();
+    bool RemovedAttribute = false;
+    for (Attribute::AttrKind Kind : {Attribute::ZExt, Attribute::SExt}) {
+      if (PAL.hasRetAttr(Kind)) {
+        PAL = PAL.removeRetAttribute(Ctx, Kind);
+        RemovedAttribute = true;
+      }
+    }
+    if (RemovedAttribute) {
+      VectorizedFn->setAttributes(PAL);
+    }
+  }
+
+  // Override the base function name component for the vectorized function.
+  compiler::utils::setBaseFnName(*VectorizedFn, VectorizedFn->getName());
+
+  // Drop any metadata marking the scalar kernel as the base or the result of
+  // vectorization. The vectorized function does not itself play either role
+  // (at least not directly in the case of 'derived' metadata: that
+  // relationship will be transitive).
+  compiler::utils::dropVeczOrigMetadata(*VectorizedFn);
+  compiler::utils::dropVeczDerivedMetadata(*VectorizedFn);
+
+  // Add any 'argument placeholder' instructions to the entry block.
+  // Skip over Alloca instructions if there are any.
+  BasicBlock &BB = VectorizedFn->getEntryBlock();
+  auto InsertPt = BB.getFirstInsertionPt();
+  while (isa<AllocaInst>(*InsertPt)) {
+    ++InsertPt;
+  }
+
+  for (auto *Placeholder : Placeholders) {
+    Placeholder->insertBefore(&*InsertPt);
+  }
+
+  return VectorizedFn;
+}
+
+void cloneDebugInfo(VectorizationUnit const &VU) {
+  DISubprogram *const ScalarDI = VU.scalarFunction()->getSubprogram();
+  // Nothing to do when the scalar function carries no debug info.
+  if (!ScalarDI) {
+    return;
+  }
+
+  // Create a DISubprogram entry for the vectorized kernel
+  DIBuilder DIB(*VU.scalarFunction()->getParent(), false);
+  DICompileUnit *CU =
+      DIB.createCompileUnit(dwarf::DW_LANG_OpenCL, ScalarDI->getFile(), "",
+                            ScalarDI->isOptimized(), "", 0);
+  DISubprogram *const VectorDI = DIB.createFunction(
+      CU->getFile(), ScalarDI->getName(),
+      StringRef(), /* Don't need a linkage name */
+      CU->getFile(), ScalarDI->getLine(), ScalarDI->getType(),
+      ScalarDI->getScopeLine(), ScalarDI->getFlags(), ScalarDI->getSPFlags());
+
+  // Attach the new subprogram to the scalar function's compile unit.
+  VectorDI->replaceUnit(ScalarDI->getUnit());
+
+  VU.vectorizedFunction()->setSubprogram(VectorDI);
+
+  DIB.finalize();
+
+  // Iterate over all the instructions in the kernel looking for
+  // intrinsics containing debug info metadata that must be updated.
+  // Changing the scope to point to the new vectorized function, rather
+  // than the scalar function.
+
+  std::vector<Instruction *> DIIntrinsicsToDelete;
+  std::vector<Metadata *> VectorizedLocals;
+
+  for (auto &BBItr : *VU.vectorizedFunction()) {
+    for (auto &InstItr : BBItr) {
+      // Instruction is a llvm.dbg.value() or llvm.dbg.declare() intrinsic
+      // TODO CA-1115 - Support llvm.dbg.addr() intrinsic
+      if (DbgInfoIntrinsic *const DII = dyn_cast<DbgInfoIntrinsic>(&InstItr)) {
+        // Queue this intrinsic for deletion once iteration has finished.
+        DIIntrinsicsToDelete.push_back(DII);
+
+        // Generate a new DebugLoc pointing to vectorized function
+        const DebugLoc &ScalarLoc = DII->getDebugLoc();
+
+        // If location is inlined, we need to change the function it's inlined
+        // into to our vectorized kernel, keeping the base location the same.
+        DebugLoc VectorLoc;
+        const DILocation *InlinedLoc = ScalarLoc.getInlinedAt();
+        DISubprogram *OriginalFunc = VectorDI;
+
+        if (InlinedLoc) {
+          OriginalFunc = ScalarLoc->getScope()->getSubprogram();
+          if (InlinedLoc->getInlinedAt()) {
+            // We don't support nested inlined locations currently, abandon
+            // creating dbg intrinsic as otherwise it will fail in validation.
+            continue;
+          }
+
+          const DebugLoc InlinedAtLoc = multi_llvm::getDILocation(
+              InlinedLoc->getLine(), InlinedLoc->getColumn(), VectorDI);
+          VectorLoc =
+              multi_llvm::getDILocation(ScalarLoc.getLine(), ScalarLoc.getCol(),
+                                        ScalarLoc.getScope(), InlinedAtLoc);
+        } else {
+          VectorLoc = multi_llvm::getDILocation(ScalarLoc.getLine(),
+                                                ScalarLoc.getCol(), VectorDI);
+        }
+
+        // New DILocalVariable in the scope of vectorized function
+        DILocalVariable *VectorLocal = nullptr;
+        if (DbgValueInst *const DVI = dyn_cast<DbgValueInst>(DII)) {
+          if (!DVI->getValue()) {
+            // Debug value has been optimized out
+            continue;
+          }
+
+          // Find DILocalVariable the intrinsic references
+          const DILocalVariable *const ScalarLocal = DVI->getVariable();
+
+          // Create a copy of DILocalVariable but in vectorized function scope
+          if (ScalarLocal->getArg() == 0) {
+            VectorLocal = DIB.createAutoVariable(
+                OriginalFunc, ScalarLocal->getName(), ScalarLocal->getFile(),
+                ScalarLocal->getLine(),
+                dyn_cast<DIType>(ScalarLocal->getType()));
+          } else {
+            VectorLocal = DIB.createParameterVariable(
+                OriginalFunc, ScalarLocal->getName(), ScalarLocal->getArg(),
+                ScalarLocal->getFile(), ScalarLocal->getLine(),
+                dyn_cast<DIType>(ScalarLocal->getType()));
+          }
+
+          // New llvm.dbg.value() with correct scope
+          DIB.insertDbgValueIntrinsic(DVI->getValue(), VectorLocal,
+                                      DVI->getExpression(), VectorLoc, DVI);
+        } else if (DbgDeclareInst *const DDI = dyn_cast<DbgDeclareInst>(DII)) {
+          // Find DILocalVariable the intrinsic references
+          const DILocalVariable *const ScalarLocal = DDI->getVariable();
+
+          // Create a copy of DILocalVariable but in vectorized function scope
+          if (ScalarLocal->getArg() == 0) {
+            VectorLocal = DIB.createAutoVariable(
+                OriginalFunc, ScalarLocal->getName(), ScalarLocal->getFile(),
+                ScalarLocal->getLine(),
+                dyn_cast<DIType>(ScalarLocal->getType()));
+          } else {
+            VectorLocal = DIB.createParameterVariable(
+                OriginalFunc, ScalarLocal->getName(), ScalarLocal->getArg(),
+                ScalarLocal->getFile(), ScalarLocal->getLine(),
+                dyn_cast<DIType>(ScalarLocal->getType()));
+          }
+
+          // New llvm.dbg.declare() with correct scope
+          DIB.insertDeclare(DDI->getAddress(), VectorLocal,
+                            DDI->getExpression(), VectorLoc, DDI);
+        } else {
+          continue;  // No other DbgInfoIntrinsic subclasses
+        }
+
+        // Record each newly created local variable only once.
+        if (VectorizedLocals.end() == std::find(VectorizedLocals.begin(),
+                                                VectorizedLocals.end(),
+                                                VectorLocal)) {
+          VectorizedLocals.push_back(VectorLocal);
+        }
+      } else if (InstItr.getDebugLoc()) {
+        // Update debug info line numbers to have vectorized kernel scope,
+        // taking care to preserve inlined locations.
+        const DebugLoc &ScalarLoc = InstItr.getDebugLoc();
+        DebugLoc VectorLoc;
+        if (DILocation *const InlinedLoc = ScalarLoc.getInlinedAt()) {
+          // Nested inlined locations are not supported for now: VectorLoc is
+          // left empty in that case, so the location set below is empty too.
+          if (!InlinedLoc->getInlinedAt()) {
+            const DebugLoc VectorKernel = multi_llvm::getDILocation(
+                InlinedLoc->getLine(), InlinedLoc->getColumn(), VectorDI);
+            VectorLoc = multi_llvm::getDILocation(
+                ScalarLoc.getLine(), ScalarLoc.getCol(), ScalarLoc.getScope(),
+                VectorKernel);
+          }
+        } else {
+          VectorLoc = multi_llvm::getDILocation(ScalarLoc.getLine(),
+                                                ScalarLoc.getCol(), VectorDI);
+        }
+        InstItr.setDebugLoc(VectorLoc);
+      }
+    }
+  }
+
+  // Delete intrinsics we have replaced
+  for (auto Instr : DIIntrinsicsToDelete) {
+    Instr->eraseFromParent();
+  }
+
+  // Replace temporary MDNode with list of vectorized DILocals we have created
+  // In LLVM 7.0 the variables attribute of DISubprogram was changed to
+  // retainedNodes
+  auto *VectorizedKernelVariables = VectorDI->getRetainedNodes().get();
+  assert(VectorizedKernelVariables && "Could not get retained nodes");
+  if (VectorizedKernelVariables->isTemporary()) {
+    auto NewLocals = MDTuple::getTemporary(
+        VectorizedKernelVariables->getContext(), VectorizedLocals);
+    VectorizedKernelVariables->replaceAllUsesWith(NewLocals.get());
+  }
+
+  return;
+}
+
+void cloneOpenCLMetadata(VectorizationUnit const &VU) {
+  for (const char *Node : {"opencl.kernels", "opencl.kernel_wg_size_info"})
+    cloneOpenCLNamedMetadataHelper(VU, Node);
+}
+
+}  // namespace vecz
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_heuristics.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_heuristics.cpp
new file mode 100644
index 0000000000000..24132efaabc1e
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_heuristics.cpp
@@ -0,0 +1,390 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "vectorization_heuristics.h"
+
+#include <compiler/utils/cl_builtin_info.h>
+#include <llvm/IR/CFG.h>
+#include <llvm/IR/Instructions.h>
+
+#include <unordered_set>
+
+#include "vectorization_context.h"
+
+#define DEBUG_TYPE "vecz"
+
+using namespace vecz;
+using namespace llvm;
+
+namespace {
+class Heuristics {
+  enum class BrClauseKind { None = 0, True, False };
+
+ public:
+  Heuristics(llvm::Function &F, VectorizationContext &Ctx, ElementCount VF,
+             unsigned SimdDimIdx)
+      : F(F), Ctx(Ctx), SimdWidth(VF), SimdDimIdx(SimdDimIdx) {}
+
+  /// @brief Look through the scalar code to find patterns that indicate
+  ///        we should not vectorize the kernel; e.g.:
+  ///        __kernel Type FuncName(Params) {
+  ///          if (get_global_id(0) == 0) {
+  ///            // Do something.
+  ///          }
+  ///          // Do nothing.
+  ///        }
+  /// @return Whether we should vectorize the function or not.
+  bool shouldVectorize();
+
+ private:
+  /// @brief Passthrough to CmpInst.
+  ///
+  /// @param[in] Comp The instruction to inspect.
+  ///
+  /// @return The branch's path not to vectorize, if any.
+  BrClauseKind shouldVectorizeVisitBr(const llvm::Value *Comp) const;
+  /// @brief Visit a Cmp to check if it involves a call to an OpenCL builtin.
+  ///
+  /// @param[in] Cmp The comparison instruction to inspect.
+  ///
+  /// @return The branch's path not to vectorize, if any.
+  BrClauseKind shouldVectorizeVisitCmp(const llvm::CmpInst *Cmp) const;
+  /// @brief Visit the operand of a Cmp to strip it down to a
+  ///        CallInst or ConstantInt, if possible.
+  ///
+  /// @param[in] Val The value to inspect.
+  /// @param[in] Cmp The comparison instruction Val belongs to.
+  /// @param[in,out] Cache A map from visited values to their results.
+  ///
+  /// @return A CallInst or ConstantInt, nullptr otherwise.
+  const llvm::Value *shouldVectorizeVisitCmpOperand(
+      const llvm::Value *Val, const llvm::CmpInst *Cmp,
+      DenseMap<const Value *, const Value *> &Cache) const;
+  /// @brief Inspect the predicate and the operand that is compared against an
+  ///        OpenCL builtin to determine if it's better not to vectorize the
+  ///        kernel.
+  ///
+  /// @param[in] RHS  The operand compared against an OpenCL builtin.
+  /// @param[in] Pred The kind of comparison.
+  ///
+  /// @return The branch's path not to vectorize, if any.
+  BrClauseKind shouldVectorizeVisitCmpOperands(
+      const llvm::Value *RHS, llvm::CmpInst::Predicate Pred) const;
+
+  /// @brief The function to analyze.
+  llvm::Function &F;
+
+  /// @brief The vectorization context.
+  VectorizationContext &Ctx;
+
+  /// @brief Vectorization factor to use.
+  ElementCount SimdWidth;
+
+  /// @brief Vectorization dimension to use.
+  unsigned SimdDimIdx;
+};
+
+Heuristics::BrClauseKind Heuristics::shouldVectorizeVisitCmpOperands(
+    const Value *RHS, CmpInst::Predicate Pred) const {
+  // Equality comparisons are decided by the predicate alone: for `EQ` the
+  // single lane computation happens on the true successor, for `NE` it
+  // happens on the false successor.
+  switch (Pred) {
+    case CmpInst::Predicate::ICMP_EQ:
+      return BrClauseKind::True;
+    case CmpInst::Predicate::ICMP_NE:
+      return BrClauseKind::False;
+    default:
+      break;
+  }
+
+  // Anything else is only judged when the value compared against the OpenCL
+  // builtin call is a constant; we then decide whether vectorizing is worth
+  // it based on the chances to hit a branch.
+  const ConstantInt *const Bound = dyn_cast_or_null<const ConstantInt>(RHS);
+  if (!Bound) {
+    return BrClauseKind::None;
+  }
+
+  // Sort the predicate into the `GT`/`GE` family or the `LT`/`LE` family;
+  // every other predicate gives no hint.
+  bool IsGreater = false;
+  switch (Pred) {
+    case CmpInst::Predicate::ICMP_UGT:
+    case CmpInst::Predicate::ICMP_UGE:
+    case CmpInst::Predicate::ICMP_SGT:
+    case CmpInst::Predicate::ICMP_SGE:
+      IsGreater = true;
+      break;
+    case CmpInst::Predicate::ICMP_ULT:
+    case CmpInst::Predicate::ICMP_ULE:
+    case CmpInst::Predicate::ICMP_SLT:
+    case CmpInst::Predicate::ICMP_SLE:
+      break;
+    default:
+      return BrClauseKind::None;
+  }
+
+  // A branch whose condition only applies for at most half of the simd width
+  // is not worth vectorizing. For `GT`/`GE`, a constant above half of the
+  // width means the true branch is taken less often than the false branch;
+  // for `LT`/`LE` the roles of the two branches are swapped.
+  auto const AboveHalf = IsGreater ? BrClauseKind::True : BrClauseKind::False;
+  auto const BelowHalf = IsGreater ? BrClauseKind::False : BrClauseKind::True;
+
+  // Scalable widths have no fixed value to compare against.
+  if (SimdWidth.isScalable()) {
+    return AboveHalf;
+  }
+
+  auto const HalfWidth = SimdWidth.getFixedValue() / 2;
+  if (Bound->getValue().sgt(HalfWidth)) {
+    return AboveHalf;
+  }
+  if (Bound->getValue().slt(HalfWidth)) {
+    return BelowHalf;
+  }
+  return BrClauseKind::None;
+}
+
+const Value *Heuristics::shouldVectorizeVisitCmpOperand(
+    const Value *Val, const CmpInst *Cmp,
+    DenseMap<const Value *, const Value *> &Cache) const {
+  auto const It = Cache.find(Val);
+  if (It != Cache.end()) {
+    return It->second;
+  }
+
+  // If we are visiting a binary operator, inspect both its operands.
+  if (const BinaryOperator *BO = dyn_cast<const BinaryOperator>(Val)) {
+    const Value *LHS =
+        shouldVectorizeVisitCmpOperand(BO->getOperand(0), Cmp, Cache);
+    const Value *RHS =
+        shouldVectorizeVisitCmpOperand(BO->getOperand(1), Cmp, Cache);
+
+    auto &Result = Cache[Val];  // fetched after recursing (inserts may rehash)
+
+    // If any of LHS and RHS are null and the comparison instruction is not
+    // an equality, Val is not constant and used in a relational comparison.
+    // We don't want to work with that.
+    if ((!LHS || !RHS) && !Cmp->isEquality()) {
+      return (Result = nullptr);
+    }
+
+    // If the operands of the BinaryOperator are a CallInst and anything else
+    // we do not want to keep going. We wish to avoid such comparisons:
+    // if ((get_local_id(0) & Constant) == Constant) {}
+    if (dyn_cast_or_null<const CallInst>(LHS)) {
+      return (Result = nullptr);
+    }
+    if (dyn_cast_or_null<const CallInst>(RHS)) {
+      return (Result = nullptr);
+    }
+
+    // Up to this point, LHS and RHS are either ConstantInt or null.
+    if (LHS) {
+      return (Result = LHS);
+    }
+    return (Result = RHS);
+  }
+
+  // If we are visiting a unary instruction, inspect its operand.
+  if (const UnaryInstruction *UI = dyn_cast<const UnaryInstruction>(Val)) {
+    return shouldVectorizeVisitCmpOperand(UI->getOperand(0), Cmp, Cache);
+  }
+
+  if (const CallInst *CI = dyn_cast<const CallInst>(Val)) {
+    // We only care about calls whose uniformity is (maybe) an instance ID.
+    compiler::utils::BuiltinInfo &BI = Ctx.builtins();
+    auto const Uniformity = BI.analyzeBuiltinCall(*CI, SimdDimIdx).uniformity;
+    if (Uniformity == compiler::utils::eBuiltinUniformityInstanceID ||
+        Uniformity == compiler::utils::eBuiltinUniformityMaybeInstanceID) {
+      return (Cache[Val] = CI);
+    }
+  }
+
+  if (const ConstantInt *CI = dyn_cast<const ConstantInt>(Val)) {
+    return (Cache[Val] = CI);
+  }
+
+  return (Cache[Val] = nullptr);
+}
+
+// Classifies a comparison between a work-item call and a constant; returns
+// BrClauseKind::None when nothing (or something contradictory) is learned.
+Heuristics::BrClauseKind Heuristics::shouldVectorizeVisitCmp(
+    const CmpInst *Cmp) const {
+  // Each operand resolves to a CallInst, a ConstantInt, or nullptr.
+  DenseMap<const Value *, const Value *> Cache;
+  const Value *LHS =
+      shouldVectorizeVisitCmpOperand(Cmp->getOperand(0), Cmp, Cache);
+  const Value *RHS =
+      shouldVectorizeVisitCmpOperand(Cmp->getOperand(1), Cmp, Cache);
+  const CmpInst::Predicate Pred = Cmp->getPredicate();
+
+  BrClauseKind Kind = BrClauseKind::None;
+  // The call may sit on either side of the comparison, or on both.
+  if (llvm::isa_and_nonnull<const CallInst>(LHS)) {
+    Kind = shouldVectorizeVisitCmpOperands(RHS, Pred);
+  }
+  if (llvm::isa_and_nonnull<const CallInst>(RHS)) {
+    // `X pred call` is `call swapped(pred) X`: the helper is handed only the
+    // non-call operand, so mirror the predicate to keep the meaning intact.
+    const BrClauseKind Mirrored = shouldVectorizeVisitCmpOperands(
+        LHS, CmpInst::getSwappedPredicate(Pred));
+    // Conflicting verdicts void the result, which means "do vectorize".
+    if (Kind != BrClauseKind::None && Kind != Mirrored) {
+      return BrClauseKind::None;
+    }
+    Kind = Mirrored;
+  }
+  return Kind;
+}
+
+// Classifies a branch condition, looking through binary operators, e.g.:
+//       %and = and ...
+//       br i1 %and, ...
+Heuristics::BrClauseKind Heuristics::shouldVectorizeVisitBr(
+    const Value *Comp) const {
+  if (auto const *Cmp = dyn_cast<const CmpInst>(Comp)) {
+    return shouldVectorizeVisitCmp(Cmp);
+  }
+  auto const *BinOp = dyn_cast<const BinaryOperator>(Comp);
+  if (!BinOp) {
+    return BrClauseKind::None;
+  }
+  // NOTE(review): `&&` yields 0 or 1, which is then cast back to
+  // BrClauseKind - confirm that is the intended combination rule.
+  return static_cast<BrClauseKind>(
+      static_cast<int>(shouldVectorizeVisitBr(BinOp->getOperand(0))) &&
+      static_cast<int>(shouldVectorizeVisitBr(BinOp->getOperand(1))));
+}
+
+// Decides whether vectorizing F is expected to be worthwhile. The answer is
+// "no" only when the entry block ends in a conditional branch on a recognised
+// work-item comparison, the successor not selected by that comparison is a
+// bare `ret` block, and the region behind the selected successor outweighs
+// the entry block.
+bool Heuristics::shouldVectorize() {
+  BasicBlock &Entry = F.getEntryBlock();
+
+  // Cost model: the weight of a block is its number of loads, stores and
+  // calls. Calls are not inspected any further, except that calls without
+  // a known callee and calls to builtins carrying the work-item property
+  // are not counted.
+  //
+  // Ultimately, it feels like this check should be done at some point during
+  // the vectorization process, so that we have a better overview on how bad
+  // the vectorized kernel is compared to the scalar one. We should most
+  // likely check only for instructions that have varying operands.
+  auto const blockWeight = [this](BasicBlock &Block) {
+    unsigned Weight = 0;
+    for (Instruction &Inst : Block) {
+      if (isa<LoadInst>(&Inst) || isa<StoreInst>(&Inst)) {
+        ++Weight;
+        continue;
+      }
+      auto *Call = dyn_cast<CallInst>(&Inst);
+      if (!Call) {
+        continue;
+      }
+      compiler::utils::BuiltinInfo &Builtins = Ctx.builtins();
+      Function *Callee = Call->getCalledFunction();
+      if (!Callee) {
+        continue;
+      }
+      auto const Info = Builtins.analyzeBuiltin(*Callee);
+      if (!(Info.properties & compiler::utils::eBuiltinPropertyWorkItem)) {
+        ++Weight;
+      }
+    }
+    return Weight;
+  };
+
+  // Rejecting a kernel on the strength of its entry-block comparison alone
+  // would be too eager: when the entry block does at least as much expensive
+  // work as the region guarded by the comparison, vectorizing may still pay
+  // off. Consider:
+  //
+  // __kernel void FuncName(Params) {
+  //   // (1) Do something.
+  //   // (2) Store that something.
+  //   if (get_global_id(0) == 0) {
+  //     // (3) Do something.
+  //   }
+  //   // (4) Do nothing.
+  // }
+  //
+  // (1) might be eligible for great vectorization improvements, so such a
+  // kernel is kept. Without (2), (1) is either never used or only used in
+  // (3), i.e. once per lane, which is not worth vectorizing.
+  const unsigned EntryWeight = blockWeight(Entry);
+
+  // Only an entry block terminated by a conditional branch is analysed.
+  auto *Br = dyn_cast<BranchInst>(Entry.getTerminator());
+  if (!Br || !Br->isConditional()) {
+    return true;
+  }
+
+  const BrClauseKind Clause = shouldVectorizeVisitBr(Br->getCondition());
+  if (Clause == BrClauseKind::None) {
+    return true;
+  }
+
+  // The clause selects the successor that starts the region to be weighed;
+  // the other successor bounds that region.
+  const unsigned StartIdx = (Clause == BrClauseKind::True) ? 0 : 1;
+  BasicBlock *Start = Br->getSuccessor(StartIdx);
+  BasicBlock *Terminating = Br->getSuccessor(1 - StartIdx);
+  assert(Terminating &&
+         "Failed to get terminating block of branch inst");
+
+  // Sum the weights of every block reachable from Start without passing
+  // through Terminating.
+  unsigned RegionWeight = 0;
+  std::unordered_set<BasicBlock *> Seen{Start};
+  std::vector<BasicBlock *> Pending{Start};
+  do {
+    BasicBlock *Block = Pending.back();
+    Pending.pop_back();
+    RegionWeight += blockWeight(*Block);
+    for (BasicBlock *Next : successors(Block)) {
+      if (Next != Terminating && Seen.insert(Next).second) {
+        Pending.push_back(Next);
+      }
+    }
+  } while (!Pending.empty());
+
+  // We don't want to vectorize if the path that will be taken the most is
+  // the exit block of the function and does nothing else but return. The
+  // weight comparison is an arbitrary limit.
+  const bool BareReturn = isa<ReturnInst>(Terminating->getTerminator()) &&
+                          Terminating->size() == 1;
+  if (BareReturn && EntryWeight < RegionWeight) {
+    return false;
+  }
+
+  return true;
+}
+}  // namespace
+
+namespace vecz {
+// Public entry point: runs the heuristics above on F for the given VF.
+bool shouldVectorize(llvm::Function &F, VectorizationContext &Ctx,
+                     ElementCount VF, unsigned SimdDimIdx) {
+  return Heuristics(F, Ctx, VF, SimdDimIdx).shouldVectorize();
+}
+}  // namespace vecz
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_unit.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_unit.cpp
new file mode 100644
index 0000000000000..98403fd40d7d8
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_unit.cpp
@@ -0,0 +1,175 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "vectorization_unit.h"
+
+#include <compiler/utils/builtin_info.h>
+#include <llvm/IR/CFG.h>
+#include <llvm/IR/DIBuilder.h>
+#include <llvm/IR/IntrinsicInst.h>
+#include <llvm/IR/PassManager.h>
+#include <llvm/IR/PassManagerImpl.h>
+#include <llvm/Support/Debug.h>
+#include <llvm/Target/TargetMachine.h>
+#include <multi_llvm/multi_llvm.h>
+#include <multi_llvm/vector_type_helper.h>
+
+#include <unordered_map>
+#include <unordered_set>
+
+#include "debugging.h"
+#include "vectorization_context.h"
+#include "vectorization_helpers.h"
+#include "vecz/vecz_choices.h"
+
+#define DEBUG_TYPE "vecz"
+
+using namespace vecz;
+using namespace llvm;
+
+// Builds the unit for scalar function F. Every argument of F gets a
+// not-yet-vectorized descriptor; the vectorized function is then looked up
+// by name as part of applying the requested width.
+VectorizationUnit::VectorizationUnit(Function &F, ElementCount Width,
+                                     unsigned Dimension,
+                                     VectorizationContext &Ctx,
+                                     const VectorizationChoices &Ch)
+    : Ctx(Ctx),
+      Choices(Ch),
+      ScalarFn(&F),
+      VectorizedFn(nullptr),
+      SimdWidth(ElementCount()),
+      LocalSize(0),
+      AutoSimdWidth(false),
+      SimdDimIdx(Dimension),
+      FnFlags(eFunctionNoFlag) {
+  for (Argument &ScalarArg : F.args()) {
+    VectorizerTargetArgument Desc;
+    Desc.OldArg = &ScalarArg;
+    Desc.NewArg = nullptr;
+    Desc.Placeholder = nullptr;
+    Desc.PointerRetPointeeTy = nullptr;
+    Desc.IsVectorized = false;
+    Arguments.push_back(Desc);
+  }
+  setWidth(Width);  // Also tries to find an existing vectorized function.
+}
+
+VectorizationUnit::~VectorizationUnit() = default;
+
+// Returns the function this unit currently stands for: the vectorized
+// function when one has been set (VectorizedFn is non-null), the scalar
+// function otherwise.
+Function &VectorizationUnit::function() {
+  Function *const Current = VectorizedFn ? VectorizedFn : ScalarFn;
+  return *Current;
+}
+
+// Read-only variant: returns the vectorized function when one has been set
+// (VectorizedFn is non-null), the scalar function otherwise.
+const Function &VectorizationUnit::function() const {
+  const Function *const Current = VectorizedFn ? VectorizedFn : ScalarFn;
+
+  return *Current;
+}
+
+// Changes the SIMD width. An already attached vectorized function is renamed
+// to match the new width; otherwise the module is searched for a function
+// carrying the expected name.
+void VectorizationUnit::setWidth(ElementCount NewWidth) {
+  if (SimdWidth != NewWidth) {
+    SimdWidth = NewWidth;
+    const std::string Name =
+        getVectorizedFunctionName(ScalarFn->getName(), SimdWidth, Choices);
+    if (!VectorizedFn) {
+      setVectorizedFunction(Ctx.module().getFunction(Name));
+    } else {
+      VectorizedFn->setName(Name);
+    }
+  }
+}
+
+// Rebinds the unit to a new scalar function; a null argument is ignored.
+// The i-th argument descriptor is pointed at the i-th argument of the new
+// function.
+void VectorizationUnit::setScalarFunction(llvm::Function *NewFunction) {
+  if (NewFunction) {
+    ScalarFn = NewFunction;
+    unsigned Idx = 0;
+    for (Argument &Arg : NewFunction->args()) {
+      Arguments[Idx++].OldArg = &Arg;
+    }
+  }
+}
+
+// Attaches (or, when NewFunction is null, detaches) the vectorized function
+// and refreshes the per-argument bookkeeping that depends on it.
+void VectorizationUnit::setVectorizedFunction(llvm::Function *NewFunction) {
+  VectorizedFn = NewFunction;
+  ArgumentPlaceholders.clear();
+  if (!NewFunction) {
+    // Detaching: no descriptor may keep pointing into the old function.
+    for (VectorizerTargetArgument &TargetArg : Arguments) {
+      TargetArg.NewArg = nullptr;
+      TargetArg.Placeholder = nullptr;
+    }
+    return;
+  }
+
+  unsigned Idx = 0;
+  for (Argument &Arg : NewFunction->args()) {
+    VectorizerTargetArgument &TargetArg = Arguments[Idx++];
+    TargetArg.NewArg = &Arg;
+
+    // A vectorized argument will be used only by its placeholder extract
+    // element instruction, so its first user is taken as the placeholder.
+    const bool HasPlaceholder = TargetArg.IsVectorized &&
+                                !TargetArg.PointerRetPointeeTy &&
+                                !Arg.user_empty();
+    TargetArg.Placeholder =
+        HasPlaceholder ? cast<Instruction>(*Arg.user_begin()) : nullptr;
+    if (HasPlaceholder) {
+      // Mark the extract to distinguish them from other instructions.
+      ArgumentPlaceholders.insert(TargetArg.Placeholder);
+    }
+  }
+}
+
+vecz::internal::AnalysisFailResult VectorizationUnit::setFailed(
+    const char *remark, const llvm::Function *F, const llvm::Value *V) {
+  setFlag(eFunctionVectorizationFailed);  // Mark this unit as failed.
+  emitVeczRemarkMissed(F ? F : &function(), V, remark);  // Null F: function()
+  return vecz::internal::AnalysisFailResult();
+}
+
+// Reports the vectorized function and how each argument was transformed.
+// NOTE(review): NewArg is dereferenced unconditionally - presumably only
+// called once a vectorized function is attached; confirm with callers.
+VectorizationResult VectorizationUnit::getResult() const {
+  VectorizationResult Result;
+  Result.func = VectorizedFn;
+  for (const VectorizerTargetArgument &TargetArg : Arguments) {
+    Type *const PointeeTy = TargetArg.PointerRetPointeeTy;
+    VectorizationResult::Arg::Kind ArgKind = VectorizationResult::Arg::SCALAR;
+    if (PointeeTy) {
+      ArgKind = VectorizationResult::Arg::POINTER_RETURN;
+    } else if (TargetArg.IsVectorized) {
+      ArgKind = VectorizationResult::Arg::VECTORIZED;
+    }
+    Result.args.emplace_back(ArgKind, TargetArg.NewArg->getType(), PointeeTy);
+  }
+  return Result;
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorizer.cpp
new file mode 100644
index 0000000000000..e5d875c644493
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorizer.cpp
@@ -0,0 +1,364 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "vectorizer.h"
+
+#include <compiler/utils/metadata.h>
+#include <llvm/ADT/Statistic.h>
+#include <llvm/IR/Module.h>
+#include <llvm/IR/PassManager.h>
+#include <llvm/Support/CommandLine.h>
+#include <llvm/Support/Debug.h>
+#include <llvm/Support/raw_ostream.h>
+#include <multi_llvm/vector_type_helper.h>
+
+#include <memory>
+#include <unordered_set>
+
+#include "analysis/vectorizable_function_analysis.h"
+#include "debugging.h"
+#include "memory_operations.h"
+#include "vectorization_context.h"
+#include "vectorization_helpers.h"
+#include "vectorization_heuristics.h"
+#include "vectorization_unit.h"
+#include "vecz/pass.h"
+#include "vecz/vecz_choices.h"
+
+#define DEBUG_TYPE "vecz"
+
+using namespace vecz;
+using namespace llvm;
+
+namespace {
+// Enables the report printed by trackVeczSuccessFailure(). Declared static in
+// an anonymous namespace, so it is only reachable from within this file.
+static cl::opt<bool> VeczDumpReport(
+    "vecz-dump-report", cl::desc("report the post-vectorization status"));
+
+}  // namespace
+
+// Statistics. NOTE(review): ID#V13 tags both ScalarMaxVectorWidth and Ratio.
+STATISTIC(VeczSuccess, "Number of kernels successfully vectorized [ID#V80]");
+STATISTIC(VeczFail, "Number of kernels that failed to vectorize [ID#V81]");
+STATISTIC(VeczBail,
+          "Number of kernels where vectorization was not attempted [ID#V82]");
+
+STATISTIC(ScalarInstructions,
+          "Number of instructions in the scalar kernel [ID#V00]");
+STATISTIC(ScalarLoadStores,
+          "Number of loads and stores in the scalar kernel [ID#V01]");
+STATISTIC(ScalarVectorInsts,
+          "Number of vector instructions in the scalar kernel [ID#V02]");
+STATISTIC(ScalarMaxVectorWidth,
+          "The width of the bigger vector instruction found in the scalar "
+          "kernel [ID#V13]");
+STATISTIC(VeczInstructions,
+          "Number of instructions in the vectorized kernel [ID#V03]");
+STATISTIC(VeczScalarInstructions,
+          "Number of scalar instructions in the vectorized kernel [ID#V04]");
+STATISTIC(VeczVectorInstructions,
+          "Number of vector instructions in the vectorized kernel [ID#V05]");
+STATISTIC(VeczInsertExtract,
+          "Number of insert/extractelement instructions in the vectorized "
+          "kernel [ID#V06]");
+STATISTIC(VeczSplats,
+          "Number of vector splats in the vectorized kernel [ID#V07]");
+STATISTIC(
+    VeczScalarMemOp,
+    "Number of scalar loads and stores in the vectorized kernel [ID#V0A]");
+STATISTIC(
+    VeczVectorMemOp,
+    "Number of vector loads and stores in the vectorized kernel [ID#V0B]");
+STATISTIC(
+    VeczMaskedMemOps,
+    "Number of masked memory operations in the vectorized kernel [ID#V0C]");
+STATISTIC(VeczInterleavedMemOps,
+          "Number of interleaved memory operations in the vectorized kernel "
+          "[ID#V0D]");
+STATISTIC(VeczMaskedInterleavedMemOps,
+          "Number of masked interleaved memory operations in the vectorized "
+          "kernel [ID#V0E]");
+STATISTIC(VeczScatterGatherMemOps,
+          "Number of scatter/gather memory operations in the vectorized kernel "
+          "[ID#V10]");
+STATISTIC(VeczMaskedScatterGatherMemOps,
+          "Number of masked scatter/gather operations in the vectorized "
+          "kernel [ID#V11]");
+STATISTIC(VeczVectorWidth, "Vector width of the vectorized kernel [ID#V12]");
+STATISTIC(Ratio, "Normalized ratio of theoretical speedup[ID#V13]");
+
+namespace {
+/// @brief Calculate vectorization related statistics from the kernels
+///
+/// @param[in] VU The Vectorization Unit we are working on
+/// @param[in] Scalar The scalar function that we have vectorized
+/// @param[in] Vectorized The vectorized version of the scalar function
+void collectStatistics(VectorizationUnit &VU, Function *Scalar,
+                       Function *Vectorized) {
+  // Do not gather statistics if we failed to vectorize, if we're doing
+  // scalable vectorization, or if statistics aren't enabled in the first
+  // place.
+  if (!Scalar || !Vectorized || !AreStatisticsEnabled() ||
+      VU.width().isScalable()) {
+    return;
+  }
+
+  VeczVectorWidth = VU.width().getFixedValue();
+
+  // Function to check if an instruction is a vector instruction or not
+  auto isVectorInst = [](Instruction &I) -> bool {
+    Type *Ty = I.getType();
+
+    // Insert/extractelement are not really vector instructions
+    if (isa<InsertElementInst>(I) || isa<ExtractElementInst>(I)) {
+      return false;
+    }
+    // Instructions that return a vector
+    if (isa<FixedVectorType>(Ty)) {
+      return true;
+    }
+    // Store instructions that store a vector value
+    if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
+      auto *ValOp = SI->getValueOperand();
+      assert(ValOp && "Could not get value operand");
+      return isa<FixedVectorType>(ValOp->getType());
+    }
+    // Internal builtins that work on vectors. This is relevant for stores only,
+    // as loads return a vector type and will be caught earlier on.
+    if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+      if (auto Op = MemOp::get(CI)) {
+        // With the exception of masked loads and stores, every other internal
+        // builtin works with vectors
+        if (!Op->isMaskedMemOp()) {
+          return true;
+        }
+        // Masked loads are handled earlier on as they return a vector type.
+        // We need to check if masked stores are storing vectors or not.
+        if (Op->isStore() && isa<FixedVectorType>(Op->getDataType())) {
+          return true;
+        }
+      }
+    }
+
+    return false;
+  };
+
+  unsigned MaxScalarVectorWidth = 0;
+  // Collect the scalar kernel's statistics
+  for (auto &BB : *Scalar) {
+    for (auto &I : BB) {
+      ++ScalarInstructions;
+      ScalarLoadStores += (isa<LoadInst>(I) || isa<StoreInst>(I));
+      ScalarVectorInsts += isVectorInst(I);
+      // Find out how wide is the widest vector used in the scalar kernel
+      if (auto *VecTy = dyn_cast<FixedVectorType>(I.getType())) {
+        if (VecTy->getNumElements() > MaxScalarVectorWidth) {
+          MaxScalarVectorWidth = VecTy->getNumElements();
+        }
+      }
+    }
+  }
+  ScalarMaxVectorWidth = MaxScalarVectorWidth;
+
+  // Collect the vectorized kernel's statistics
+  for (auto &BB : *Vectorized) {
+    for (auto &I : BB) {
+      ++VeczInstructions;  // Every instruction is counted once.
+
+      // Count insert/extractelement instructions, and the splats among them
+      if (isa<InsertElementInst>(I) || isa<ExtractElementInst>(I)) {
+        if (I.getName().startswith(".splatinsert")) {
+          ++VeczSplats;
+        }
+        ++VeczInsertExtract;
+      }
+
+      // Count vector and scalar instructions
+      if (isVectorInst(I)) {
+        ++VeczVectorInstructions;
+      } else {
+        ++VeczScalarInstructions;
+      }
+
+      // Count memory operation types
+      if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
+        // Normal scalar/vector loads and stores
+        if (isVectorInst(I)) {
+          ++VeczVectorMemOp;
+        } else {
+          ++VeczScalarMemOp;
+        }
+      } else if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+        Function *F = CI->getCalledFunction();
+        if (!F) {
+          continue;
+        }
+        // Subtract 1 for the call instruction, since we are inlining
+        --VeczInstructions;
+
+        for (auto &CalleeBB : *F) {
+          for (auto &Inst : CalleeBB) {
+            VeczInstructions += !isa<CallInst>(&Inst);
+          }
+        }
+        // Internal builtin memory operations
+        if (auto Op = MemOp::get(&I)) {
+          VeczMaskedMemOps += Op->isMaskedMemOp();
+          VeczInterleavedMemOps += Op->getDesc().isInterleavedMemOp();
+          VeczMaskedInterleavedMemOps += Op->isMaskedInterleavedMemOp();
+          VeczScatterGatherMemOps += Op->getDesc().isScatterGatherMemOp();
+          VeczMaskedScatterGatherMemOps += Op->isMaskedScatterGatherMemOp();
+        }
+      }
+    }
+  }
+
+  // Ratio = (Simd Width * Scalar Insts) / Vector Insts, where, for the input
+  // kernel, Scalar Insts = Scalar Insts + Vec Insts * widest vector width.
+  // The counters may read as zero (e.g. when LLVM is built without
+  // statistics support), so guard the division against an empty count.
+  unsigned SimdWidth = VU.width().getFixedValue();
+  if (VeczInstructions != 0) {
+    Ratio = (SimdWidth * (ScalarInstructions - ScalarVectorInsts +
+                          ScalarVectorInsts * MaxScalarVectorWidth)) /
+            VeczInstructions;
+  }
+}
+}  // namespace
+
+// Checks the requested options against Kernel and, if vectorization is to go
+// ahead, creates the matching VectorizationUnit. Returns nullptr otherwise.
+VectorizationUnit *vecz::createVectorizationUnit(VectorizationContext &Ctx,
+                                                 Function *Kernel,
+                                                 const VeczPassOptions &Opts,
+                                                 FunctionAnalysisManager &FAM,
+                                                 bool Check) {
+  const unsigned SimdDimIdx = Opts.vec_dim_idx;
+  const unsigned LocalSize = Opts.local_size;
+  const bool Auto = Opts.vecz_auto;
+  auto VF =
+      ElementCount::get(Opts.factor.getKnownMin(), Opts.factor.isScalable());
+
+  // A missing kernel or a scalar factor leaves nothing to vectorize.
+  if (!Kernel || VF.isScalar()) {
+    ++VeczBail;
+    VECZ_FAIL();
+  }
+
+  // Up to MAX_SIMD_DIM supported dimensions
+  VECZ_ERROR_IF(SimdDimIdx >= MAX_SIMD_DIM,
+                "Specified vectorization dimension is invalid");
+
+  VECZ_ERROR_IF(VF.getKnownMinValue() == 0, "Vectorization factor of zero");
+
+  // With a known local size and a fixed factor, shrink the factor until it
+  // no longer exceeds the local size: a vectorized loop that is wider than
+  // the local size would never be entered, so vectorizing by that factor
+  // would be a waste of time.
+  if (LocalSize && !VF.isScalable()) {
+    unsigned Narrowed = VF.getFixedValue();
+    // The starting width is either a power of two or 3. Halving a power of
+    // two gives a power of two and halving 3 gives 1, so after the first
+    // halving the width is always a power of two, with 1 as the worst case.
+    assert(Narrowed == 3 || llvm::isPowerOf2_32(Narrowed));
+    while (Narrowed != 1 && Narrowed > LocalSize) {
+      Narrowed /= 2;
+      assert(Narrowed > 0 && "Cannot vectorize (or modulo) by 0.");
+    }
+
+    // Ending up at a width of 1 means there is nothing left to vectorize.
+    if (Narrowed == 1) {
+      ++VeczBail;
+      emitVeczRemarkMissed(Kernel, nullptr,
+                           "requested Vectorization factor of 1");
+      return nullptr;
+    }
+    VF = ElementCount::get(Narrowed, false);
+  }
+
+  // Optional check through VectorizableFunctionAnalysis.
+  if (Check &&
+      !FAM.getResult<VectorizableFunctionAnalysis>(*Kernel).canVectorize) {
+    return nullptr;
+  }
+
+  // In auto mode the heuristics get a say on whether it is worth it.
+  if (Auto && !shouldVectorize(*Kernel, Ctx, VF, SimdDimIdx)) {
+    return nullptr;
+  }
+
+  auto VU = Ctx.createVectorizationUnit(*Kernel, VF, SimdDimIdx, Opts.choices);
+  VU->setAutoWidth(Auto);
+  VU->setLocalSize(Opts.local_size);
+  return VU;
+}
+
+// Updates the vectorization statistics for VU; optionally reports on errs().
+void vecz::trackVeczSuccessFailure(VectorizationUnit &VU) {
+  Function *Fn = VU.scalarFunction();
+  Function *vectorizedFn = VU.vectorizedFunction();
+  bool failed = VU.failed();
+  VeczFail += failed;
+  VeczSuccess += !failed;
+  collectStatistics(VU, Fn, vectorizedFn);
+  if (VeczDumpReport) {
+    auto const VF = VU.width();
+    auto FnName = Fn->getName();
+    if (vectorizedFn) {
+      errs() << "vecz: Vectorization succeeded for kernel '" << FnName
+             << "' (" << (VF.isScalable() ? "scalable-vector" : "SIMD")
+             << " factor: " << VF.getKnownMinValue() << ") "
+             << *vectorizedFn->getType() << "\n";
+    } else {
+      errs() << "vecz: Vectorization failed for kernel '" << FnName << "'\n";
+    }
+  }
+}
+
+// Records the vectorization outcome of vu as metadata on the functions
+// involved. Returns true when vu did not fail and has a vectorized function.
+bool vecz::createVectorizedFunctionMetadata(VectorizationUnit &vu) {
+  Function *const scalarFn = vu.scalarFunction();
+  Function *vectorizedFn = nullptr;
+  if (!vu.failed()) {
+    vectorizedFn = vu.vectorizedFunction();
+    // The OpenCL related metadata of the scalar kernel is cloned only now,
+    // on success: cloning it together with the kernel would leave metadata
+    // pointing to non-existing kernels whenever vectorization fails.
+    cloneOpenCLMetadata(vu);
+  }
+
+  // Describe the attempt: factor, dimension and the vectorPredication()
+  // choice.
+  auto const vf = vu.width();
+  compiler::utils::VectorizationInfo info{
+      compiler::utils::VectorizationFactor(vf.getKnownMinValue(),
+                                           vf.isScalable()),
+      vu.dimension(), vu.choices().vectorPredication()};
+
+  const bool succeeded = vectorizedFn && vectorizedFn != scalarFn;
+  if (!succeeded) {
+    // Fail or bail: only the scalar function gets annotated.
+    compiler::utils::encodeVectorizationFailedMetadata(*scalarFn, info);
+  } else {
+    // Success: link the original and the vectorized function to each other,
+    // in both directions.
+    compiler::utils::linkOrigToVeczFnMetadata(*scalarFn, *vectorizedFn, info);
+    compiler::utils::linkVeczToOrigFnMetadata(*vectorizedFn, *scalarFn, info);
+  }
+  return vectorizedFn != nullptr;
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vecz_pass_builder.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vecz_pass_builder.cpp
new file mode 100644
index 0000000000000..fa9d5cb1373cd
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vecz_pass_builder.cpp
@@ -0,0 +1,274 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "vecz_pass_builder.h"
+
+#include <llvm/Analysis/AssumptionCache.h>
+#include <llvm/Analysis/BasicAliasAnalysis.h>
+#include <llvm/Analysis/DominanceFrontier.h>
+#include <llvm/Analysis/LoopInfo.h>
+#include <llvm/Analysis/MemoryDependenceAnalysis.h>
+#include <llvm/Analysis/MemorySSA.h>
+#include <llvm/Analysis/PhiValues.h>
+#include <llvm/Analysis/PostDominators.h>
+#include <llvm/Analysis/ScalarEvolution.h>
+#include <llvm/Analysis/TargetLibraryInfo.h>
+#include <llvm/Analysis/TargetTransformInfo.h>
+#include <llvm/Config/llvm-config.h>
+#include <llvm/IR/PassManagerImpl.h>
+#include <llvm/IR/Verifier.h>
+#include <llvm/Passes/PassBuilder.h>
+#include <llvm/Target/TargetMachine.h>
+#include <llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h>
+#include <llvm/Transforms/InstCombine/InstCombine.h>
+#include <llvm/Transforms/Scalar.h>
+#include <llvm/Transforms/Scalar/ADCE.h>
+#include <llvm/Transforms/Scalar/DCE.h>
+#include <llvm/Transforms/Scalar/EarlyCSE.h>
+#include <llvm/Transforms/Scalar/FlattenCFG.h>
+#include <llvm/Transforms/Scalar/GVN.h>
+#include <llvm/Transforms/Scalar/IndVarSimplify.h>
+#include <llvm/Transforms/Scalar/SimplifyCFG.h>
+#include <llvm/Transforms/Scalar/Sink.h>
+#include <llvm/Transforms/Utils/BreakCriticalEdges.h>
+#include <llvm/Transforms/Utils/FixIrreducible.h>
+#include <llvm/Transforms/Utils/LowerSwitch.h>
+#include <llvm/Transforms/Utils/Mem2Reg.h>
+#include <llvm/Transforms/Utils/UnifyFunctionExitNodes.h>
+
+#include <cassert>
+
+#include "analysis/control_flow_analysis.h"
+#include "analysis/divergence_analysis.h"
+#include "analysis/liveness_analysis.h"
+#include "analysis/packetization_analysis.h"
+#include "analysis/simd_width_analysis.h"
+#include "analysis/stride_analysis.h"
+#include "analysis/uniform_value_analysis.h"
+#include "analysis/vectorizable_function_analysis.h"
+#include "analysis/vectorization_unit_analysis.h"
+#include "debugging.h"
+#include "transform/common_gep_elimination_pass.h"
+#include "transform/control_flow_conversion_pass.h"
+#include "transform/inline_post_vectorization_pass.h"
+#include "transform/interleaved_group_combine_pass.h"
+#include "transform/packetization_helpers.h"
+#include "transform/packetization_pass.h"
+#include "transform/passes.h"
+#include "transform/scalarization_pass.h"
+#include "transform/ternary_transform_pass.h"
+
+#define DEBUG_TYPE "vecz"
+using namespace llvm;
+using namespace vecz;
+
+VeczPassMachinery::VeczPassMachinery(
+    llvm::LLVMContext &llvmCtx, llvm::TargetMachine *TM,
+    VectorizationContext &Ctx, bool verifyEach,
+    compiler::utils::DebugLogging debugLogLevel)
+    : compiler::utils::PassMachinery(llvmCtx, TM, verifyEach, debugLogLevel),
+      Ctx(Ctx) {}  // Ctx feeds the vecz analyses and the pass-skip callback
+
+void VeczPassMachinery::registerPasses() {
+  // The generic pass machinery registers the standard passes first.
+  compiler::utils::PassMachinery::registerPasses();
+  // Vecz analyses; only the first two are built from the context.
+  FAM.registerPass([this] { return VectorizationContextAnalysis(Ctx); });
+  FAM.registerPass([this] { return VectorizationUnitAnalysis(Ctx); });
+  FAM.registerPass([] { return VectorizableFunctionAnalysis(); });
+  FAM.registerPass([] { return StrideAnalysis(); });
+  FAM.registerPass([] { return UniformValueAnalysis(); });
+  FAM.registerPass([] { return LivenessAnalysis(); });
+  FAM.registerPass([] { return PacketizationAnalysis(); });
+  FAM.registerPass([] { return CFGAnalysis(); });
+  FAM.registerPass([] { return DivergenceAnalysis(); });
+  // SimdWidthAnalysis is only registered when a target machine is present.
+  if (TM) {
+    FAM.registerPass(
+        [this] { return TargetIRAnalysis(TM->getTargetIRAnalysis()); });
+    FAM.registerPass([] { return SimdWidthAnalysis(); });
+  } else {
+    FAM.registerPass([] { return TargetIRAnalysis(); });
+  }
+}
+
+void VeczPassMachinery::addClassToPassNames() {
+  {  // Map each passes.def pass class name to its pipeline name.
+#define MODULE_PASS(NAME, CREATE_PASS) \
+  PIC.addClassToPassName(decltype(CREATE_PASS)::name(), NAME);
+#define FUNCTION_PASS(NAME, CREATE_PASS) \
+  PIC.addClassToPassName(decltype(CREATE_PASS)::name(), NAME);
+#define LOOP_PASS(NAME, CREATE_PASS) \
+  PIC.addClassToPassName(decltype(CREATE_PASS)::name(), NAME);
+#include "passes.def"
+  }
+
+  // Register a callback which skips every optional function and loop pass
+  // unless the function is being vectorized and has not (yet) failed.
+  PIC.registerShouldRunOptionalPassCallback([&](StringRef, llvm::Any IR) {
+#if LLVM_VERSION_GREATER_EQUAL(16, 0)
+    const Function **FPtr = any_cast<const Function *>(&IR);
+    const Function *F = FPtr ? *FPtr : nullptr;
+    if (!F) {
+      if (const auto **L = any_cast<const Loop *>(&IR)) {
+        F = (*L)->getHeader()->getParent();  // the loop's parent function
+      } else {
+        // Neither a function nor a loop: always run module passes
+        return true;
+      }
+    }
+#else  // LLVM < 16
+    const Function *F = nullptr;
+    if (any_isa<const Function *>(IR)) {
+      F = any_cast<const Function *>(IR);
+    } else if (any_isa<const Loop *>(IR)) {
+      F = any_cast<const Loop *>(IR)->getHeader()->getParent();
+    } else {
+      // Neither a function nor a loop: always run module passes
+      return true;
+    }
+#endif  // LLVM_VERSION_GREATER_EQUAL(16, 0)
+    // FIXME: This is repeating the job of the VectorizationUnitAnalysis.
+    // We should track 'failure' more directly in the
+    // Function/VectorizationContext?
+    auto const *const VU = Ctx.getActiveVU(F);
+    if (!VU) {
+      // Don't run on anything without a VU since it's not currently being
+      // vectorized.
+      return false;
+    }
+    return !VU->failed();  // stop running passes once the unit has failed
+  });
+}
+
+void VeczPassMachinery::registerPassCallbacks() {
+  // Let textual pipelines name the passes in passes.def (originally added as
+  // a compatibility shim for simplifycfg, called simplify-cfg before LLVM 12).
+  PB.registerPipelineParsingCallback(
+      [](StringRef Name, ModulePassManager &PM,
+         ArrayRef<PassBuilder::PipelineElement>) {
+#define MODULE_PASS(NAME, CREATE_PASS) \
+  if (Name == NAME) {                  \
+    PM.addPass(CREATE_PASS);           \
+    return true;                       \
+  }
+#define FUNCTION_PASS(NAME, CREATE_PASS)                        \
+  if (Name == NAME) {                                           \
+    PM.addPass(createModuleToFunctionPassAdaptor(CREATE_PASS)); \
+    return true;                                                \
+  }
+#define LOOP_PASS(NAME, CREATE_PASS)                    \
+  if (Name == NAME) {                                   \
+    PM.addPass(createModuleToFunctionPassAdaptor(       \
+        createFunctionToLoopPassAdaptor(CREATE_PASS))); \
+    return true;                                        \
+  }
+#include "passes.def"
+        return false;  // Name is not one of the passes.def entries
+      });
+}
+
+bool vecz::buildPassPipeline(ModulePassManager &PM) {
+  // Helpers wrapping function/loop passes in their module-level adaptors.
+  auto addFunctionPass = [&PM](auto Pass) {
+    PM.addPass(createModuleToFunctionPassAdaptor(std::move(Pass)));
+  };
+  auto addLoopPass = [&addFunctionPass](auto Pass) {
+    addFunctionPass(createFunctionToLoopPassAdaptor(std::move(Pass)));
+  };
+
+  // Preparation passes
+  PM.addPass(BuiltinInliningPass());
+  // Inlined builtins may contain switches, so lower them afterwards.
+  addFunctionPass(LowerSwitchPass());
+  addFunctionPass(FixIrreduciblePass());
+
+  // The front end may not have run Mem2Reg, so LLVM's version runs here...
+  addFunctionPass(PromotePass());
+  // ...followed by our own, as LLVM's doesn't always get everything.
+  addFunctionPass(BasicMem2RegPass());
+
+  addFunctionPass(InstCombinePass());
+  addFunctionPass(AggressiveInstCombinePass());
+  addFunctionPass(DCEPass());
+  addFunctionPass(PreLinearizePass());
+  // InstCombine handily cleans up unnecessary pre-linearization Hoist Guards.
+  addFunctionPass(InstCombinePass());
+  addFunctionPass(SimplifyCFGPass());
+  addFunctionPass(DCEPass());
+  addFunctionPass(UnifyFunctionExitNodesPass());
+  addFunctionPass(LoopSimplifyPass());
+  // CFG simplification can create switches, so lower them once more.
+  addFunctionPass(LowerSwitchPass());
+  addLoopPass(VeczLoopRotatePass());
+  // With unrolled loops, IndVarSimplify can create a lot of duplicate
+  // instructions, which EarlyCSE clears up. However, this can destroy
+  // LCSSA form, so we need to restore it.
+  addLoopPass(IndVarSimplifyPass());
+
+  addFunctionPass(EarlyCSEPass());
+  // EarlyCSE can actually create infinite loops (with a "conditional"
+  // branch on true), which is why this pass runs last.
+  addLoopPass(SimplifyInfiniteLoopPass());
+
+  // Verify that the preparation passes cleaned up after themselves.
+  PM.addPass(VerifierPass());
+
+  addFunctionPass(RemoveIntPtrPass());
+  addFunctionPass(SquashSmallVectorsPass());
+  addFunctionPass(UniformReassociationPass());
+  addFunctionPass(TernaryTransformPass());
+
+  addFunctionPass(BreakCriticalEdgesPass());
+  addFunctionPass(LCSSAPass());
+  addFunctionPass(ControlFlowConversionPass());
+  PM.addPass(VerifierPass());
+  addFunctionPass(DivergenceCleanupPass());
+
+  addFunctionPass(CommonGEPEliminationPass());
+  addFunctionPass(ScalarizationPass());
+
+  addFunctionPass(AggressiveInstCombinePass());
+  addFunctionPass(ADCEPass());
+  addFunctionPass(SimplifyCFGPass());
+  addFunctionPass(SimplifyMaskedMemOpsPass());
+
+  // Multiple GEP instructions performing the same operation greatly
+  // amplify the code generated by the packetizer, as it duplicates the
+  // extractelement instructions, so remove the unnecessary ones first.
+  addFunctionPass(CommonGEPEliminationPass());
+  addFunctionPass(PacketizationPass());
+
+  addFunctionPass(InlinePostVectorizationPass());
+  addFunctionPass(FlattenCFGPass());
+  addFunctionPass(GVNPass(GVNOptions().setMemDep(true)));
+  addFunctionPass(AggressiveInstCombinePass());
+  addFunctionPass(ADCEPass());
+  addFunctionPass(SinkingPass());
+  addFunctionPass(SimplifyCFGPass());
+  addFunctionPass(AggressiveInstCombinePass());
+
+  addFunctionPass(InterleavedGroupCombinePass(eInterleavedStore));
+  addFunctionPass(InterleavedGroupCombinePass(eInterleavedLoad));
+  addFunctionPass(InstCombinePass());
+  addFunctionPass(DCEPass());
+  addFunctionPass(SimplifyMaskedMemOpsPass());
+  PM.addPass(DefineInternalBuiltinsPass());
+
+  PM.addPass(VerifierPass());
+
+  return true;
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/lit.local.cfg b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/lit.local.cfg
new file mode 100644
index 0000000000000..90dd000ea6e29
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/lit.local.cfg
@@ -0,0 +1,18 @@
+# Copyright (C) Codeplay Software Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+# Exceptions; you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+if not 'AArch64' in config.root.targets:
+    config.unsupported = True
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_1.ll
new file mode 100644
index 0000000000000..193613154c125
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_1.ll
@@ -0,0 +1,55 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -vecz-target-triple=aarch64-unknown-unknown -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @load16(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %stride) {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %conv = trunc i64 %call to i32
+  %call1 = tail call spir_func i64 @_Z13get_global_idj(i32 1)
+  %conv2 = trunc i64 %call1 to i32
+  %mul = mul nsw i32 %conv2, %stride
+  %add = add nsw i32 %mul, %conv ; index = get_global_id(1) * stride + get_global_id(0)
+  %mul3 = shl nsw i32 %add, 1 ; 2 * index
+  %idx.ext = sext i32 %mul3 to i64
+  %add.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idx.ext
+  %0 = load i32, i32 addrspace(1)* %add.ptr, align 4 ; in[2 * index]
+  %arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr, i64 1 ; +1 applied on the pointer
+  %1 = load i32, i32 addrspace(1)* %arrayidx4, align 4 ; in[2 * index + 1]
+  %add5 = add nsw i32 %1, %0
+  %idxprom = sext i32 %add to i64
+  %arrayidx8 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %add5, i32 addrspace(1)* %arrayidx8, align 4 ; out[index] = sum of the pair
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CHECK: define {{(dso_local )?}}spir_kernel void @load16
+; CHECK: [[LOAD:%.+]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p1
+; CHECK-NOT: load <4 x i32>
+; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2
+; CHECK-NOT: call <4 x i32> @__vecz_b_interleaved_load
+; CHECK-NOT: call <4 x i32> @__vecz_b_gather_load
+; CHECK: extractvalue { <4 x i32>, <4 x i32> } [[LOAD]], 0
+; CHECK: extractvalue { <4 x i32>, <4 x i32> } [[LOAD]], 1
+; CHECK-NOT: extractvalue
+; CHECK-NOT: shufflevector
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_2.ll
new file mode 100644
index 0000000000000..ae54f068548a0
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_2.ll
@@ -0,0 +1,56 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -vecz-target-triple=aarch64-unknown-unknown -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @load16(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %stride) {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %conv = trunc i64 %call to i32
+  %call1 = tail call spir_func i64 @_Z13get_global_idj(i32 1)
+  %conv2 = trunc i64 %call1 to i32
+  %mul = mul nsw i32 %conv2, %stride
+  %add = add nsw i32 %mul, %conv ; index = get_global_id(1) * stride + get_global_id(0)
+  %mul3 = shl nsw i32 %add, 1 ; 2 * index
+  %conv4 = sext i32 %mul3 to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %conv4
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 ; in[2 * index]
+  %add5 = or i64 %conv4, 1 ; +1 as an 'or' on the 64-bit index (low bit is clear after the shl)
+  %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %add5
+  %1 = load i32, i32 addrspace(1)* %arrayidx6, align 4 ; in[2 * index + 1]
+  %add7 = add nsw i32 %1, %0
+  %idxprom = sext i32 %add to i64
+  %arrayidx10 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %add7, i32 addrspace(1)* %arrayidx10, align 4 ; out[index] = sum of the pair
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CHECK: define {{(dso_local )?}}spir_kernel void @load16
+; CHECK: [[LOAD:%.+]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p1
+; CHECK-NOT: load <4 x i32>
+; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2
+; CHECK-NOT: call <4 x i32> @__vecz_b_interleaved_load
+; CHECK-NOT: call <4 x i32> @__vecz_b_gather_load
+; CHECK: extractvalue { <4 x i32>, <4 x i32> } [[LOAD]], 0
+; CHECK: extractvalue { <4 x i32>, <4 x i32> } [[LOAD]], 1
+; CHECK-NOT: extractvalue
+; CHECK-NOT: shufflevector
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_3.ll
new file mode 100644
index 0000000000000..a87c25a1fc8da
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_3.ll
@@ -0,0 +1,57 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -vecz-target-triple=aarch64-unknown-unknown -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @load16(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %stride) {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %conv = trunc i64 %call to i32
+  %call1 = tail call spir_func i64 @_Z13get_global_idj(i32 1)
+  %conv2 = trunc i64 %call1 to i32
+  %mul = mul nsw i32 %conv2, %stride
+  %add = add nsw i32 %mul, %conv ; index = get_global_id(1) * stride + get_global_id(0)
+  %mul3 = shl nsw i32 %add, 1 ; 2 * index
+  %idxprom = sext i32 %mul3 to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 ; in[2 * index]
+  %add7 = or i32 %mul3, 1 ; +1 as an 'or' on the 32-bit index, before the sext
+  %idxprom8 = sext i32 %add7 to i64
+  %arrayidx9 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom8
+  %1 = load i32, i32 addrspace(1)* %arrayidx9, align 4 ; in[2 * index + 1]
+  %add10 = add nsw i32 %1, %0
+  %idxprom13 = sext i32 %add to i64
+  %arrayidx14 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom13
+  store i32 %add10, i32 addrspace(1)* %arrayidx14, align 4 ; out[index] = sum of the pair
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CHECK: define {{(dso_local )?}}spir_kernel void @load16
+; CHECK: [[LOAD:%.+]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p1
+; CHECK-NOT: load <4 x i32>
+; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2
+; CHECK-NOT: call <4 x i32> @__vecz_b_interleaved_load
+; CHECK-NOT: call <4 x i32> @__vecz_b_gather_load
+; CHECK: extractvalue { <4 x i32>, <4 x i32> } [[LOAD]], 0
+; CHECK: extractvalue { <4 x i32>, <4 x i32> } [[LOAD]], 1
+; CHECK-NOT: extractvalue
+; CHECK-NOT: shufflevector
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_4.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_4.ll
new file mode 100644
index 0000000000000..ec0539d5851c0
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_4.ll
@@ -0,0 +1,57 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -vecz-target-triple=aarch64-unknown-unknown -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @load16(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %stride) {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %conv = trunc i64 %call to i32
+  %call1 = tail call spir_func i64 @_Z13get_global_idj(i32 1)
+  %conv2 = trunc i64 %call1 to i32
+  %mul = mul nsw i32 %conv2, %stride
+  %add = add nsw i32 %mul, %conv ; index = get_global_id(1) * stride + get_global_id(0)
+  %mul3 = shl nsw i32 %add, 1 ; 2 * index
+  %add4 = or i32 %mul3, 1 ; 2 * index + 1 (low bit is clear after the shl)
+  %idxprom = sext i32 %add4 to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 ; odd element is loaded first: in[2 * index + 1]
+  %idxprom8 = sext i32 %mul3 to i64
+  %arrayidx9 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom8
+  %1 = load i32, i32 addrspace(1)* %arrayidx9, align 4 ; in[2 * index]
+  %sub = sub nsw i32 %0, %1 ; odd element minus even element
+  %idxprom12 = sext i32 %add to i64
+  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom12
+  store i32 %sub, i32 addrspace(1)* %arrayidx13, align 4 ; out[index] = difference
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CHECK: define {{(dso_local )?}}spir_kernel void @load16
+; CHECK: [[LOAD:%.+]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p1
+; CHECK-NOT: load <4 x i32>
+; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2
+; CHECK-NOT: call <4 x i32> @__vecz_b_interleaved_load
+; CHECK-NOT: call <4 x i32> @__vecz_b_gather_load
+; CHECK: extractvalue { <4 x i32>, <4 x i32> } [[LOAD]], 0
+; CHECK: extractvalue { <4 x i32>, <4 x i32> } [[LOAD]], 1
+; CHECK-NOT: extractvalue
+; CHECK-NOT: shufflevector
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_5.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_5.ll
new file mode 100644
index 0000000000000..793fdce9984a1
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_5.ll
@@ -0,0 +1,69 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -vecz-target-triple=aarch64-unknown-unknown -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @load16(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %stride) {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %conv = trunc i64 %call to i32
+  %call1 = tail call spir_func i64 @_Z13get_global_idj(i32 1)
+  %conv2 = trunc i64 %call1 to i32
+  %mul = mul nsw i32 %conv2, %stride
+  %add = add nsw i32 %mul, %conv ; index = get_global_id(1) * stride + get_global_id(0)
+  %mul3 = shl nsw i32 %add, 1 ; 2 * index
+  %idxprom = sext i32 %mul3 to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 ; in[2 * index]
+  %add7 = or i32 %mul3, 1
+  %idxprom8 = sext i32 %add7 to i64
+  %arrayidx9 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom8
+  %1 = load i32, i32 addrspace(1)* %arrayidx9, align 4 ; in[2 * index + 1]
+  %add13 = add nsw i32 %mul3, 2
+  %idxprom14 = sext i32 %add13 to i64
+  %arrayidx15 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom14
+  %2 = load i32, i32 addrspace(1)* %arrayidx15, align 4 ; in[2 * index + 2]
+  %add19 = add nsw i32 %mul3, 3
+  %idxprom20 = sext i32 %add19 to i64
+  %arrayidx21 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom20
+  %3 = load i32, i32 addrspace(1)* %arrayidx21, align 4 ; in[2 * index + 3]
+  %add22 = add nsw i32 %1, %0
+  %add23 = add nsw i32 %add22, %2
+  %add24 = add nsw i32 %add23, %3
+  %idxprom27 = sext i32 %add to i64
+  %arrayidx28 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom27
+  store i32 %add24, i32 addrspace(1)* %arrayidx28, align 4 ; out[index] = sum of the four loads
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CHECK: define {{(dso_local )?}}spir_kernel void @load16
+; CHECK: [[LOAD1:%.+]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p1
+; CHECK: extractvalue { <4 x i32>, <4 x i32> } [[LOAD1]], 0
+; CHECK: extractvalue { <4 x i32>, <4 x i32> } [[LOAD1]], 1
+; CHECK: [[LOAD2:%.+]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p1
+; CHECK: extractvalue { <4 x i32>, <4 x i32> } [[LOAD2]], 0
+; CHECK: extractvalue { <4 x i32>, <4 x i32> } [[LOAD2]], 1
+; CHECK-NOT: load <4 x i32>
+; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2
+; CHECK-NOT: call <4 x i32> @__vecz_b_interleaved_load
+; CHECK-NOT: call <4 x i32> @__vecz_b_gather_load
+; CHECK-NOT: shufflevector <4 x i32>
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_6.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_6.ll
new file mode 100644
index 0000000000000..4f20d63d9e6d7
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_6.ll
@@ -0,0 +1,59 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -vecz-target-triple=aarch64-unknown-unknown -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @load16(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %stride) {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %conv = trunc i64 %call to i32
+  %call1 = tail call spir_func i64 @_Z13get_global_idj(i32 1)
+  %conv2 = trunc i64 %call1 to i32
+  %mul = mul nsw i32 %conv2, %stride
+  %add = add nsw i32 %mul, %conv ; index = get_global_id(1) * stride + get_global_id(0)
+  %mul3 = shl nsw i32 %add, 1 ; 2 * index
+  %add4 = add nsw i32 %mul3, 3
+  %idxprom = sext i32 %add4 to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 ; in[2 * index + 3]
+  %shl = shl i32 %0, 1
+  %add8 = add nsw i32 %mul3, 2
+  %idxprom9 = sext i32 %add8 to i64
+  %arrayidx10 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom9
+  %1 = load i32, i32 addrspace(1)* %arrayidx10, align 4 ; in[2 * index + 2]
+  %sub = sub nsw i32 %shl, %1 ; (in[2 * index + 3] << 1) - in[2 * index + 2]
+  %idxprom13 = sext i32 %add to i64
+  %arrayidx14 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom13
+  store i32 %sub, i32 addrspace(1)* %arrayidx14, align 4 ; out[index] = that difference
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CHECK: define {{(dso_local )?}}spir_kernel void @load16
+; CHECK: [[LOAD:%.+]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p1
+; CHECK-NOT: load <4 x i32>
+; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2
+; CHECK-NOT: call <4 x i32> @__vecz_b_interleaved_load
+; CHECK-NOT: call <4 x i32> @__vecz_b_gather_load
+; CHECK: extractvalue { <4 x i32>, <4 x i32> } [[LOAD]], 0
+; CHECK: extractvalue { <4 x i32>, <4 x i32> } [[LOAD]], 1
+; CHECK-NOT: extractvalue
+; CHECK-NOT: shufflevector
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_killer.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_killer.ll
new file mode 100644
index 0000000000000..185d4863140ac
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_killer.ll
@@ -0,0 +1,150 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k boscc_killer -vecz-passes=vecz-loop-rotate,cfg-convert -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+
+; ModuleID = 'Unknown buffer'
+source_filename = "Unknown buffer"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z12get_local_idj(i32)
+declare spir_func i64 @_Z14get_local_sizej(i32)
+
+@boscc_killer.shared = internal unnamed_addr addrspace(3) global i32 undef, align 4
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @boscc_killer(float addrspace(1)* %A, float addrspace(1)* %B, i32 %N, i32 %lda) {
+entry:
+  %gid0 = tail call spir_func i64 @_Z12get_local_idj(i32 0) ; note: this is the local id, despite the value's name
+  %cmp0 = icmp eq i64 %gid0, 0
+  br i1 %cmp0, label %if.then, label %if.end
+
+if.then:                                        ; preds = %entry
+  store i32 %N, i32 addrspace(3)* @boscc_killer.shared, align 4
+  br label %if.end
+
+if.end:                                         ; preds = %if.then, %entry
+  %ldl.a = load i32, i32 addrspace(3)* @boscc_killer.shared, align 4
+  %ldl.b = trunc i64 %gid0 to i32
+  %ldl = add i32 %ldl.a, %ldl.b
+  %cmp1 = icmp eq i32 %ldl, 0
+  br i1 %cmp1, label %if.then2, label %if.else
+
+if.else:                                       ; preds = %if.end
+  %cmp2 = icmp slt i32 %ldl, %N
+  br i1 %cmp2, label %for.body, label %exit
+
+for.body:                                   ; preds = %for.inc, %if.else
+  %acc = phi i32 [ %update2, %for.inc ], [ 1, %if.else ]
+  %acc_shl = shl nuw nsw i32 %acc, 2
+  %update = add i32 %ldl, %acc_shl
+  %cmp3 = icmp slt i32 %update, %ldl
+  br i1 %cmp3, label %for.if.then, label %for.inc
+
+for.if.then:                                    ; preds = %for.body
+  %mul297.us = mul nsw i32 %update, %lda
+  %add298.us = add nsw i32 %mul297.us, %ldl
+  %idxprom299.us = sext i32 %add298.us to i64
+  %arrayidx300.us = getelementptr inbounds float, float addrspace(1)* %A, i64 %idxprom299.us
+  store float zeroinitializer, float addrspace(1)* %arrayidx300.us, align 16
+  br label %for.inc
+
+for.inc:                                     ; preds = %for.if.then, %for.body
+  %update2 = add nuw nsw i32 %acc, 1
+  %cmp4 = icmp ult i32 %acc, 4 ; tests the pre-increment value %acc, not %update2
+  br i1 %cmp4, label %for.body, label %exit
+
+if.then2:                                        ; preds = %if.end
+  %gid0_trunc = trunc i64 %gid0 to i32
+  %cmp5 = icmp sgt i32 %ldl, %gid0_trunc
+  br i1 %cmp5, label %if.then3, label %exit
+
+if.then3:                             ; preds = %if.then2
+  %arrayidxB = getelementptr inbounds float, float addrspace(1)* %B, i64 %gid0
+  %v23 = load float, float addrspace(1)* %arrayidxB, align 16
+  %arrayidxA = getelementptr inbounds float, float addrspace(1)* %A, i64 %gid0
+  store float %v23, float addrspace(1)* %arrayidxA, align 16
+  %call149 = tail call spir_func i64 @_Z14get_local_sizej(i32 0) #6
+  %conv152 = add i64 %call149, %gid0
+  %cmp71 = icmp slt i64 %conv152, 0 ; never used: the branch below is unconditional
+  br label %exit
+
+exit:                                          ; preds = %if.then3, %if.then2, %for.inc, %if.else
+  ret void
+}
+
+; We mostly want to check that it succeeded since this CFG crashed the block
+; ordering algorithm, however it seems it is not easy to create a UnitCL test
+; for this, since the CFG gets changed into something that doesn't cause the
+; crash. This bug was identified from an Ecosystem failure, however, so it must
+; be possible to do somehow.
+;
+; CHECK: spir_kernel void @__vecz_v4_boscc_killer
+; CHECK: entry:
+; CHECK: br i1 %{{.+}}, label %if.then.uniform, label %entry.boscc_indir
+; CHECK: if.then.uniform:
+; CHECK: br label %if.end
+; CHECK: entry.boscc_indir:
+; CHECK: br i1 %{{.+}}, label %if.end, label %if.then
+; CHECK: if.then:
+; CHECK: br label %if.end
+; CHECK: if.end:
+; CHECK: br i1 %{{.+}}, label %if.then2.uniform, label %if.end.boscc_indir
+; CHECK: if.else.uniform:
+; CHECK: br i1 %{{.+}}, label %for.body.preheader.uniform, label %if.else.uniform.boscc_indir
+; CHECK: for.body.preheader.uniform:
+; CHECK: br label %for.body.uniform
+; CHECK: if.else.uniform.boscc_indir:
+; CHECK: br i1 %{{.+}}, label %exit, label %for.body.preheader
+; CHECK: for.body.uniform:
+; CHECK: br i1 %{{.+}}, label %for.if.then.uniform, label %for.body.uniform.boscc_indir
+; CHECK: for.if.then.uniform:
+; CHECK: br label %for.inc.uniform
+; CHECK: for.body.uniform.boscc_indir:
+; CHECK: br i1 %{{.+}}, label %for.inc.uniform, label %for.body.uniform.boscc_store
+; CHECK: for.body.uniform.boscc_store:
+; CHECK: br label %for.if.then
+; CHECK: for.inc.uniform:
+; CHECK: br i1 %{{.+}}, label %for.body.uniform, label %exit.loopexit.uniform
+; CHECK: exit.loopexit.uniform:
+; CHECK: br label %exit
+; CHECK: if.then2.uniform:
+; CHECK: br i1 %{{.+}}, label %if.then3.uniform, label %if.then2.uniform.boscc_indir
+; CHECK: if.end.boscc_indir:
+; CHECK: br i1 %{{.+}}, label %if.else.uniform, label %if.else
+; CHECK: if.then3.uniform:
+; CHECK: br label %exit
+; CHECK: if.then2.uniform.boscc_indir:
+; CHECK: br i1 %{{.+}}, label %exit, label %if.then3
+; CHECK: if.else:
+; CHECK: br label %for.body.preheader
+; CHECK: for.body.preheader:
+; CHECK: br label %for.body
+; CHECK: for.body:
+; CHECK: br label %for.if.then
+; CHECK: for.if.then:
+; CHECK: br label %for.inc
+; CHECK: for.inc:
+; CHECK: br i1 %{{.+}}, label %for.body, label %exit.loopexit
+; CHECK: if.then2:
+; CHECK: br label %if.then3
+; CHECK: if.then3:
+; CHECK: br label %exit
+; CHECK: exit.loopexit:
+; CHECK: br label %if.then2
+; CHECK: exit:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge.ll
new file mode 100644
index 0000000000000..64b4c3539143e
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge.ll
@@ -0,0 +1,298 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k boscc_merge -vecz-passes="function(instcombine),function(simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+
+; ModuleID = 'Unknown buffer'
+source_filename = "Unknown buffer"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z12get_local_idj(i32) #0
+declare spir_func i64 @_Z14get_local_sizej(i32) #0
+
+define spir_kernel void @boscc_merge(i32 %n, float addrspace(1)* %out, i64 %x) {
+entry:
+  %lid = tail call spir_func i64 @_Z12get_local_idj(i32 0)
+  %lsize = tail call spir_func i64 @_Z14get_local_sizej(i32 0)
+  %out_ptr = getelementptr inbounds float, float addrspace(1)* %out, i64 %x
+  %lid_sum_lsize = add i64 %lid, %lsize
+  %cmp1 = icmp ult i64 %lsize, %x ; compares only the local size with %x; the local id is not involved
+  br i1 %cmp1, label %if.then, label %if.else
+
+if.then:                                              ; preds = %entry
+  %cmp2 = icmp ult i64 %lid, %x ; depends on the local id
+  br i1 %cmp2, label %if.then2, label %if.else2.preheader
+
+if.else2.preheader:                                   ; preds = %if.then
+  store float 0.000000e+00, float addrspace(1)* %out_ptr, align 4 ; just so it's non-trivial for BOSCC
+  br label %if.else2
+
+if.then2:                                              ; preds = %if.then
+  %cmp3 = icmp ugt i64 %lsize, %x ; like %cmp1, compares only the local size with %x
+  br i1 %cmp3, label %if.then3.preheader, label %if.else3.preheader
+
+if.else3.preheader:                                    ; preds = %if.then2
+  br label %if.else3
+
+if.then3.preheader:                                    ; preds = %if.then2
+  br label %if.then3
+
+if.then3:                                              ; preds = %if.then3.preheader, %if.else5
+  %cmp4 = icmp ugt i64 %lid, %x
+  br i1 %cmp4, label %if.then4.preheader, label %if.else4.preheader
+
+if.else4.preheader:                                    ; preds = %if.then3
+  br label %if.else4
+
+if.then4.preheader:                                    ; preds = %if.then3
+  br label %if.then4
+
+if.else4:                                              ; preds = %if.else4.preheader, %if.else4
+  %cmp5 = icmp ult i64 %lid, %x ; both operands are defined outside this self-loop, so the condition never changes
+  br i1 %cmp5, label %if.else4, label %if.else5.loopexit1
+
+if.else5.loopexit:                                     ; preds = %if.then4
+  br label %if.else5
+
+if.else5.loopexit1:                                    ; preds = %if.else4
+  br label %if.else5
+
+if.else5:                                              ; preds = %if.else5.loopexit1, %if.else5.loopexit
+  %cmp6 = icmp ult i64 %lid, %x ; latch of the outer loop headed by %if.then3
+  br i1 %cmp6, label %if.then3, label %if.else.loopexit
+
+if.then4:                                              ; preds = %if.then4.preheader, %if.then4
+  %cmp7 = icmp ult i64 %lid_sum_lsize, %x ; self-loop; operands are defined in %entry
+  br i1 %cmp7, label %if.then4, label %if.else5.loopexit
+
+if.else3:                                              ; preds = %if.else3.preheader, %if.else3
+  %cmp8 = icmp ult i64 %lid_sum_lsize, %x ; self-loop; operands are defined in %entry
+  br i1 %cmp8, label %if.else3, label %if.else.loopexit2
+
+if.else2:                                             ; preds = %if.else2.preheader, %if.else2
+  %cmp9 = icmp ult i64 %lid_sum_lsize, %x ; self-loop; operands are defined in %entry
+  br i1 %cmp9, label %if.else2, label %if.else.loopexit3
+
+if.else.loopexit:                                    ; preds = %if.else5
+  br label %if.else
+
+if.else.loopexit2:                                   ; preds = %if.else3
+  br label %if.else
+
+if.else.loopexit3:                                   ; preds = %if.else2
+  br label %if.else
+
+if.else:                                             ; preds = %if.else.loopexit3, %if.else.loopexit2, %if.else.loopexit, %entry
+  %cmp10 = icmp ult i64 %lid, %x ; all three loop nests and the %entry bypass rejoin here
+  br i1 %cmp10, label %if.then5, label %if.else6
+
+if.then5:                                             ; preds = %if.else
+  %cmp11 = icmp eq i64 %x, 0 ; depends only on the kernel argument %x
+  br i1 %cmp11, label %if.then6, label %if.else7
+
+if.else7:                                             ; preds = %if.then5
+  %load = load float, float addrspace(1)* %out, align 4
+  br label %if.then6
+
+if.then6:                                             ; preds = %if.else7, %if.then5
+  %ret = phi float [ 0.000000e+00, %if.then5 ], [ %load, %if.else7 ]
+  store float %ret, float addrspace(1)* %out_ptr, align 4
+  br label %if.else6
+
+if.else6:                                             ; preds = %if.then6, %if.else
+  ret void
+}
+
+; CHECK: spir_kernel void @__vecz_v4_boscc_merge
+; CHECK: %[[CMP1:.+]] = icmp
+; CHECK:  br i1 %[[CMP1]], label %[[IFTHEN:.+]], label %[[IFELSE:.+]]
+
+; CHECK: [[IFTHEN]]:
+; CHECK: br i1 %{{.+}}, label %[[IFTHEN2UNIFORM:.+]], label %[[IFTHENBOSCCINDIR:.+]]
+
+; CHECK: [[IFELSE2PREHEADERUNIFORM:.+]]:
+; CHECK: br label %[[IFELSE2UNIFORM:.+]]
+
+; CHECK: [[IFELSE2UNIFORM]]:
+; CHECK:  br i1 %{{.+}}, label %[[IFELSE2UNIFORM]], label %[[IFELSE2UNIFORMBOSCCINDIR:.+]]
+
+; CHECK: [[IFELSE2UNIFORMBOSCCINDIR]]:
+; CHECK:  br i1 %{{.+}}, label %[[IFELSELOOPEXIT3UNIFORM:.+]], label %[[IFELSE2UNIFORMBOSCCSTORE:.+]]
+
+; CHECK: [[IFELSE2UNIFORMBOSCCSTORE]]:
+; CHECK:  br label %[[IFELSE2:.+]]
+
+; CHECK: [[IFELSELOOPEXIT3UNIFORM]]:
+; CHECK: br label %[[IFELSEUNIFORM:.+]]
+
+; CHECK: [[IFTHEN2UNIFORM]]:
+; CHECK: %[[CMP3UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMP3UNIFORM]], label %[[IFTHEN3PREHEADERUNIFORM:.+]], label %[[IFELSE3PREHEADERUNIFORM:.+]]
+
+; CHECK: [[IFTHENBOSCCINDIR]]:
+; CHECK: br i1 %{{.+}}, label %[[IFELSE2PREHEADERUNIFORM]], label %[[IFELSE2PREHEADER:.+]]
+
+; CHECK: [[IFELSE3PREHEADERUNIFORM]]:
+; CHECK: br label %[[IFELSE3UNIFORM:.+]]
+
+; CHECK: [[IFELSE3UNIFORM]]:
+; CHECK: br i1 %{{.+}}, label %[[IFELSE3UNIFORM]], label %[[IFELSE3UNIFORMBOSCCINDIR:.+]]
+
+; CHECK: [[IFELSE3UNIFORMBOSCCINDIR]]:
+; CHECK: br i1 %{{.+}}, label %[[IFELSELOOPEXIT2UNIFORM:.+]], label %[[IFELSE3UNIFORMBOSCCSTORE:.+]]
+
+; CHECK: [[IFELSE3UNIFORMBOSCCSTORE]]:
+; CHECK: br label %[[IFELSE3:.+]]
+
+; CHECK: [[IFELSELOOPEXIT2UNIFORM]]:
+; CHECK: br label %[[IFELSEUNIFORM]]
+
+; CHECK: [[IFTHEN3PREHEADERUNIFORM]]:
+; CHECK: br label %[[IFTHEN3UNIFORM:.+]]
+
+; CHECK: [[IFTHEN3UNIFORM]]:
+; CHECK: br i1 %{{.+}}, label %[[IFTHEN4PREHEADERUNIFORM:.+]], label %[[IFTHEN3UNIFORMBOSCCINDIR:.+]]
+
+; CHECK: [[IFELSE5UNIFORMBOSCCINDIR:.+]]:
+; CHECK: br i1 %{{.+}}, label %[[IFELSELOOPEXITUNIFORM:.+]], label %[[IFELSE5UNIFORMBOSCCSTORE:.+]]
+
+; CHECK: [[IFELSE5UNIFORMBOSCCSTORE]]:
+; CHECK: br label %[[IFTHEN3:.+]]
+
+; CHECK: [[IFELSE4PREHEADERUNIFORM:.+]]:
+; CHECK: br label %[[IFELSE4UNIFORM:.+]]
+
+; CHECK: [[IFELSE4UNIFORM]]:
+; CHECK: br i1 %{{.+}}, label %[[IFELSE4UNIFORM]], label %[[IFELSE4UNIFORMBOSCCINDIR:.+]]
+
+; CHECK: [[IFELSE4UNIFORMBOSCCINDIR]]:
+; CHECK: br i1 %{{.+}}, label %[[IFELSE5LOOPEXIT1UNIFORM:.+]], label %[[IFELSE4UNIFORMBOSCCSTORE:.+]]
+
+; CHECK: [[IFELSE4UNIFORMBOSCCSTORE]]:
+; CHECK: br label %[[IFELSE4:.+]]
+
+; CHECK: [[IFELSE5LOOPEXIT1UNIFORM]]:
+; CHECK: br label %[[IFELSE5UNIFORM:.+]]
+
+; CHECK: [[IFTHEN4PREHEADERUNIFORM]]:
+; CHECK: br label %[[IFTHEN4UNIFORM:.+]]
+
+; CHECK: [[IFTHEN3UNIFORMBOSCCINDIR]]:
+; CHECK: br i1 %{{.+}}, label %[[IFELSE4PREHEADERUNIFORM]], label %[[IFTHEN3UNIFORMBOSCCSTORE:.+]]
+
+; CHECK: [[IFTHEN3UNIFORMBOSCCSTORE]]:
+; CHECK: br label %[[IFELSE4PREHEADER:.+]]
+
+; CHECK: [[IFTHEN4UNIFORM]]:
+; CHECK: br i1 %{{.+}}, label %[[IFTHEN4UNIFORM]], label %[[IFTHEN4UNIFORMBOSCCINDIR:.+]]
+
+; CHECK: [[IFTHEN4UNIFORMBOSCCINDIR]]:
+; CHECK: br i1 %{{.+}}, label %[[IFELSE5LOOPEXITUNIFORM:.+]], label %[[IFTHEN4UNIFORMBOSCCSTORE:.+]]
+
+; CHECK: [[IFTHEN4UNIFORMBOSCCSTORE]]:
+; CHECK: br label %[[IFTHEN4:.+]]
+
+; CHECK: [[IFELSE5LOOPEXITUNIFORM]]:
+; CHECK: br label %[[IFELSE5UNIFORM]]
+
+; CHECK: [[IFELSE5UNIFORM]]:
+; CHECK: br i1 %{{.+}}, label %[[IFTHEN3UNIFORM]], label %[[IFELSE5UNIFORMBOSCCINDIR]]
+
+; CHECK: [[IFELSELOOPEXITUNIFORM]]:
+; CHECK: br label %[[IFELSE]]
+
+; CHECK: [[IFELSE2PREHEADER]]:
+; CHECK: br label %[[IFELSE2]]
+
+; CHECK: [[IFTHEN2:.+]]:
+; CHECK: %[[CMP3:.+]] = icmp
+; CHECK: br i1 %[[CMP3]], label %[[IFTHEN3PREHEADER:.+]], label %[[IFELSE3PREHEADER:.+]]
+
+; CHECK: [[IFELSE3PREHEADER]]:
+; CHECK: br label %[[IFELSE3]]
+
+; CHECK: [[IFTHEN3PREHEADER]]:
+; CHECK: br label %[[IFTHEN3]]
+
+; CHECK: [[IFTHEN3]]:
+; CHECK: br label %[[IFELSE4PREHEADER]]
+
+; CHECK: [[IFELSE4PREHEADER]]:
+; CHECK: br label %[[IFELSE4]]
+
+; CHECK: [[IFTHEN4PREHEADER:.+]]:
+; CHECK: br label %[[IFTHEN4]]
+
+; CHECK: [[IFELSE4]]:
+; CHECK: br i1 %{{.+}}, label %[[IFELSE4]], label %[[IFELSE4PUREEXIT:.+]]
+
+; CHECK: [[IFELSE4PUREEXIT]]:
+; CHECK: br label %[[IFELSE5LOOPEXIT1:.+]]
+
+; CHECK: [[IFELSE5LOOPEXIT:.+]]:
+; CHECK: br label %[[IFELSE5:.+]]
+
+; CHECK: [[IFELSE5LOOPEXIT1]]:
+; CHECK: br label %[[IFTHEN4PREHEADER]]
+
+; CHECK: [[IFELSE5]]:
+; CHECK: br i1 %{{.+}}, label %[[IFTHEN3]], label %[[IFTHEN3PUREEXIT:.+]]
+
+; CHECK: [[IFTHEN3PUREEXIT]]:
+; CHECK: br label %[[IFELSELOOPEXIT:.+]]
+
+; CHECK: [[IFTHEN4]]:
+; CHECK: br i1 %{{.+}}, label %[[IFTHEN4]], label %[[IFTHEN4PUREEXIT:.+]]
+
+; CHECK: [[IFTHEN4PUREEXIT]]:
+; CHECK: br label %[[IFELSE5LOOPEXIT]]
+
+; CHECK: [[IFELSE3]]:
+; CHECK: br i1 %{{.+}}, label %[[IFELSE3]], label %[[IFELSE3PUREEXIT:.+]]
+
+; CHECK: [[IFELSE3PUREEXIT]]:
+; CHECK: br label %[[IFELSELOOPEXIT2:.+]]
+
+; CHECK: [[IFELSE2]]:
+; CHECK: br i1 %{{.+}}, label %[[IFELSE2]], label %[[IFELSE2PUREEXIT:.+]]
+
+; CHECK: [[IFELSE2PUREEXIT]]:
+; CHECK: br label %[[IFELSELOOPEXIT3:.+]]
+
+; CHECK: [[IFELSELOOPEXIT]]:
+; CHECK: br label %[[IFELSE]]
+
+; CHECK: [[IFELSELOOPEXIT2]]:
+; CHECK: br label %[[IFELSE]]
+
+; CHECK: [[IFELSELOOPEXIT3]]:
+; CHECK: br label %[[IFTHEN2]]
+
+; CHECK: [[IFELSE]]:
+; CHECK: br i1 %{{.+}}, label %[[IFELSE7UNIFORM:.+]], label %[[IFELSEUNIFORMBOSCCINDIR:.+]]
+
+; CHECK: [[IFELSE7UNIFORM]]:
+; CHECK: br label %[[IFELSE6:.+]]
+
+; CHECK: [[IFELSEUNIFORMBOSCCINDIR]]:
+; CHECK: br i1 %{{.+}}, label %[[IFELSE6]], label %[[IFELSE7:.+]]
+
+; CHECK: [[IFELSE7]]:
+; CHECK: br label %[[IFELSE6]]
+
+; CHECK: [[IFELSE6]]:
+; CHECK:  ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge2.ll
new file mode 100644
index 0000000000000..e9f805c0cf2a5
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge2.ll
@@ -0,0 +1,173 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k boscc_merge2 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+
+; ModuleID = 'Unknown buffer'
+source_filename = "kernel.opencl"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare float @llvm.fmuladd.f32(float, float, float) #2
+declare void @__mux_work_group_barrier(i32, i32, i32) #3
+declare spir_func float @_Z3maxff(float, float) #1
+declare spir_func i64 @_Z12get_local_idj(i32) #1
+declare spir_func i64 @_Z12get_group_idj(i32) #1
+
+@fuse_conv2d_broadcast_add_relu_1_kernel0.pad_temp_shared = internal addrspace(3) global [640 x float] undef, align 4
+@fuse_conv2d_broadcast_add_relu_1_kernel0.input1_shared = internal addrspace(3) global [1152 x float] undef, align 4
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @boscc_merge2(float addrspace(1)* noalias %input0, float addrspace(1)* noalias %input1, float addrspace(1)* noalias %tensor, float addrspace(1)* noalias %input2) #2 {
+entry:
+  %compute = alloca [28 x float], align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %storemerge = phi i32 [ 0, %entry ], [ %inc2, %for.inc ]
+  %cmp1 = icmp ult i32 %storemerge, 16 ; outer loop: 16 iterations, exit through %if.else
+  br i1 %cmp1, label %if.then, label %if.else
+
+if.then:                                      ; preds = %for.cond
+  %call1 = call spir_func i64 @_Z12get_local_idj(i32 0) #5
+  %call2 = call spir_func i64 @_Z12get_group_idj(i32 1) #5
+  %idx1 = getelementptr inbounds [640 x float], [640 x float] addrspace(3)* @fuse_conv2d_broadcast_add_relu_1_kernel0.pad_temp_shared, i64 0, i64 %call1
+  store float 0.000000e+00, float addrspace(3)* %idx1, align 4
+  %cmp2 = icmp sgt i64 %call2, %call1 ; first of four chained compares that all fall through to %if.then2
+  br i1 %cmp2, label %if.then2, label %land.lhs.true1
+
+land.lhs.true1:                                 ; preds = %if.then
+  %call3 = call spir_func i64 @_Z12get_group_idj(i32 1) #5
+  %call4 = call spir_func i64 @_Z12get_local_idj(i32 0) #5
+  %cmp3 = icmp slt i64 %call3, %call4
+  br i1 %cmp3, label %land.lhs.true2, label %if.then2
+
+land.lhs.true2:                                 ; preds = %land.lhs.true1
+  %call5 = call spir_func i64 @_Z12get_local_idj(i32 0) #5
+  %call6 = call spir_func i64 @_Z12get_group_idj(i32 0) #5
+  %cmp4 = icmp sgt i64 %call6, %call5
+  br i1 %cmp4, label %if.then2, label %land.lhs.true3
+
+land.lhs.true3:                                 ; preds = %land.lhs.true2
+  %call7 = call spir_func i64 @_Z12get_group_idj(i32 0) #5
+  %call8 = call spir_func i64 @_Z12get_local_idj(i32 0) #5
+  %cmp5 = icmp slt i64 %call7, %call8
+  br i1 %cmp5, label %cond.true4, label %if.then2
+
+cond.true4:                                     ; preds = %land.lhs.true3
+  %call9 = call spir_func i64 @_Z12get_local_idj(i32 1) #5
+  %idx2 = getelementptr inbounds float, float addrspace(1)* %input0, i64 %call9 ; never used afterwards
+  br label %if.then2
+
+if.then2:                                      ; preds = %cond.true4, %land.lhs.true3, %land.lhs.true2, %land.lhs.true1, %if.then
+  %call10 = call spir_func i64 @_Z12get_local_idj(i32 0) #5
+  %conv = trunc i64 %call10 to i32
+  %idx3 = sext i32 %conv to i64
+  %idx4 = getelementptr inbounds [1152 x float], [1152 x float] addrspace(3)* @fuse_conv2d_broadcast_add_relu_1_kernel0.input1_shared, i64 0, i64 %idx3
+  %idx5 = getelementptr inbounds float, float addrspace(1)* %input1, i64 %idx3
+  %load1 = load float, float addrspace(1)* %idx5, align 4
+  store float %load1, float addrspace(3)* %idx4, align 4
+  call void @__mux_work_group_barrier(i32 0, i32 1, i32 272) #4 ; barrier sits inside the outer loop body
+  br label %for.cond2
+
+for.cond2:                                     ; preds = %for.body, %if.then2
+  %storemerge1 = phi i32 [ 0, %if.then2 ], [ %inc1, %for.body ]
+  %cmp6 = icmp ult i32 %storemerge1, 4 ; inner loop: 4 iterations
+  br i1 %cmp6, label %for.body, label %for.inc
+
+for.body:                                     ; preds = %for.cond2
+  %load2 = load float, float addrspace(3)* %idx4, align 4
+  %fmul = call float @llvm.fmuladd.f32(float %load2, float %load2, float %load2)
+  %idx6 = getelementptr inbounds [28 x float], [28 x float]* %compute, i64 0, i64 27 ; always element 27, the only element stored to
+  store float %fmul, float* %idx6, align 4
+  %inc1 = add nuw nsw i32 %storemerge1, 1
+  br label %for.cond2
+
+for.inc:                                      ; preds = %for.cond2
+  %inc2 = add nuw nsw i32 %storemerge, 1
+  br label %for.cond
+
+if.else:                                      ; preds = %for.cond
+  %idx7 = getelementptr inbounds [28 x float], [28 x float]* %compute, i64 0, i64 0 ; element 0, which this function never stores to
+  %load3 = load float, float* %idx7, align 4
+  %storemerge_sext = sext i32 %storemerge to i64
+  %idx8 = getelementptr inbounds float, float addrspace(1)* %tensor, i64 %storemerge_sext
+  store float %load3, float addrspace(1)* %idx8, align 4
+  ret void
+}
+
+attributes #0 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { convergent nobuiltin nounwind readonly }
+
+; CHECK: spir_kernel void @__vecz_v4_boscc_merge2
+; CHECK:  br label %[[IFTHEN:.+]]
+
+; CHECK: [[IFTHEN]]:
+; CHECK: br i1 %{{.+}}, label %[[IFTHEN2:.+]], label %[[IFTHENBOSCCINDIR:.+]]
+
+; CHECK: [[LANDLHSTRUE1UNIFORM:.+]]:
+; CHECK: br i1 %{{.+}}, label %[[LANDLHSTRUE2UNIFORM:.+]], label %[[LANDLHSTRUE1UNIFORMBOSCCINDIR:.+]]
+
+; CHECK: [[LANDLHSTRUE2UNIFORM]]:
+; CHECK: br i1 %{{.+}}, label %[[IFTHEN2]], label %[[LANDLHSTRUE2UNIFORMBOSCCINDIR:.+]]
+
+; CHECK: [[LANDLHSTRUE1UNIFORMBOSCCINDIR]]:
+; CHECK: br i1 %{{.+}}, label %[[IFTHEN2]], label %[[LANDLHSTRUE2:.+]]
+
+; CHECK: [[LANDLHSTRUE3UNIFORM:.+]]:
+; CHECK: br i1 %{{.+}}, label %[[CONDTRUE4UNIFORM:.+]], label %[[LANDLHSTRUE3UNIFORMBOSCCINDIR:.+]]
+
+; CHECK: [[CONDTRUE4UNIFORM]]:
+; CHECK: br label %[[IFTHEN2]]
+
+; CHECK: [[LANDLHSTRUE3UNIFORMBOSCCINDIR]]:
+; CHECK: br i1 %{{.+}}, label %[[IFTHEN2]], label %[[CONDTRUE4:.+]]
+
+; CHECK: [[LANDLHSTRUE1:.+]]:
+; CHECK: br label %[[LANDLHSTRUE2]]
+
+; CHECK: [[LANDLHSTRUE2]]:
+; CHECK: br label %[[LANDLHSTRUE3:.+]]
+
+; CHECK: [[LANDLHSTRUE3]]:
+; CHECK: br label %[[CONDTRUE4]]
+
+; CHECK: [[CONDTRUE4]]:
+; CHECK: br label %[[IFTHEN2]]
+
+; CHECK: [[IFTHEN2]]:
+; CHECK: br label %[[FORCOND2:.+]]
+
+; CHECK: [[LANDLHSTRUE2UNIFORMBOSCCINDIR]]:
+; CHECK: br i1 %{{.+}}, label %[[LANDLHSTRUE3UNIFORM]], label %[[LANDLHSTRUE3]]
+
+; CHECK: [[IFTHENBOSCCINDIR]]:
+; CHECK: br i1 %{{.+}}, label %[[LANDLHSTRUE1UNIFORM]], label %[[LANDLHSTRUE1]]
+
+; CHECK: [[FORCOND2]]:
+; CHECK: %[[EXITCOND:.+]] = icmp
+; CHECK: br i1 %[[EXITCOND]], label %[[FORBODY:.+]], label %[[FORINC:.+]]
+
+; CHECK: [[FORBODY]]:
+; CHECK: br label %[[FORCOND2]]
+
+; CHECK: [[FORINC]]:
+; CHECK: %[[EXITCOND4:.+]] = icmp
+; CHECK: br i1 %[[EXITCOND4]], label %[[IFTHEN]], label %[[IFELSE:.+]]
+
+; CHECK: [[IFELSE]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge3.ll
new file mode 100644
index 0000000000000..26f1ee02a1fe4
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge3.ll
@@ -0,0 +1,130 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k boscc_merge3 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+
+; ModuleID = 'Unknown buffer'
+source_filename = "Unknown buffer"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind readnone
+declare spir_func i64 @_Z13get_global_idj(i32) #0
+
+; Function Attrs: nounwind readnone
+declare spir_func <4 x float> @_Z6vload4mPU3AS1Kf(i64, float addrspace(1)*)
+
+define spir_kernel void @boscc_merge3(float addrspace(1)* %out, i64 %n, float %m) {
+entry:
+  %gid0 = tail call spir_func i64 @_Z13get_global_idj(i32 0) #0
+  %gid1 = tail call spir_func i64 @_Z13get_global_idj(i32 1) #0
+  %cmp1 = icmp slt i64 %gid0, %n
+  br i1 %cmp1, label %if.then1, label %end
+
+if.then1:                                     ; preds = %entry
+  %gep1 = getelementptr inbounds float, float addrspace(1)* %out, i64 %gid1 ; also used by the store in %if.end1
+  %cmp2 = fcmp une float %m, 0.000000e+00 ; depends only on the kernel argument %m
+  br i1 %cmp2, label %if.then2, label %if.end1
+
+if.then2:                                     ; preds = %if.then1
+  %cmp3 = icmp sge i64 %gid1, %n
+  %gep2 = getelementptr inbounds float, float addrspace(1)* %gep1, i64 %gid0
+  br i1 %cmp3, label %if.then3, label %if.else3
+
+if.then3:                                     ; preds = %if.then2
+  %load1 = load float, float addrspace(1)* %gep2, align 4
+  %ie_load1 = insertelement <4 x float> undef, float %load1, i32 0
+  br label %if.end2
+
+if.else3:                                    ; preds = %if.then2
+  %vload1 = tail call spir_func <4 x float> @_Z6vload4mPU3AS1Kf(i64 0, float addrspace(1)* %gep2)
+  %cmp4 = icmp slt i64 %gid0, %n ; same predicate as %cmp1 in %entry
+  br i1 %cmp4, label %if.then4, label %if.end2
+
+if.then4:                                    ; preds = %if.else3
+  %vload2 = tail call spir_func <4 x float> @_Z6vload4mPU3AS1Kf(i64 4, float addrspace(1)* %gep2)
+  br label %if.end2
+
+if.end2:                                    ; preds = %if.then4, %if.else3, %if.then3
+  %phi_gep2_load = phi <4 x float> [ %ie_load1, %if.then3 ], [ %vload2, %if.then4 ], [ %vload1, %if.else3 ]
+  %ie_m = insertelement <4 x float> undef, float %m, i32 0
+  %shuffle_ie_m = shufflevector <4 x float> %ie_m, <4 x float> undef, <4 x i32> zeroinitializer
+  %fmul = fmul <4 x float> %shuffle_ie_m, %phi_gep2_load
+  br label %if.end1
+
+if.end1:                                    ; preds = %if.end2, %if.then1
+  %phi_fmul = phi <4 x float> [ %fmul, %if.end2 ], [ zeroinitializer, %if.then1 ]
+  %ee0 = extractelement <4 x float> %phi_fmul, i32 0
+  store float %ee0, float addrspace(1)* %gep1, align 4
+  br label %end
+
+end:                                        ; preds = %if.end1, %entry
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
+
+; CHECK: spir_kernel void @__vecz_v4_boscc_merge3
+; CHECK: entry:
+; CHECK: %[[BOSCC:.+]] = call i1 @__vecz_b_divergence_all(i1 %cmp1)
+; CHECK: br i1 %[[BOSCC]], label %if.then1.uniform, label %entry.boscc_indir
+
+; CHECK: if.then1.uniform:
+; CHECK: %gep1.uniform =
+; CHECK: br i1 %cmp2.uniform, label %if.then2.uniform, label %if.end1.uniform
+
+; CHECK: if.else3.uniform:
+; CHECK: %[[BOSCC2:.+]] = call i1 @__vecz_b_divergence_all(i1 %{{if.then4.uniform.exit_mask|cmp4.uniform}})
+; CHECK: br i1 %[[BOSCC2]], label %if.then4.uniform, label %if.else3.uniform.boscc_indir
+
+; CHECK: if.else3.uniform.boscc_indir:
+; CHECK: %[[BOSCC3:.+]] = call i1 @__vecz_b_divergence_all(i1 %if.end2.uniform.exit_mask)
+; CHECK: br i1 %[[BOSCC3]], label %if.end2.uniform, label %if.then4
+
+; CHECK: if.then1:
+; CHECK: %gep1 =
+; CHECK: br i1 %cmp2, label %if.then2, label %if.end1
+
+; Generalizing the expected %cmp3 value because the 'icmp' could go off
+; by one BB between LLVM versions. Therefore we can get %cmp3.not.
+; CHECK: if.then2:
+; CHECK: br i1 %cmp3{{(.+)?}}, label %if.else3, label %if.then3
+
+; CHECK: if.then3:
+; CHECK: br label %if.end2
+
+; CHECK: if.else3:
+; CHECK: br label %if.then4
+
+; CHECK: if.then4:
+; CHECK: %gep1.boscc_blend = phi ptr addrspace(1) [ %gep1.uniform, %if.else3.uniform.boscc_indir ], [ %gep1, %if.else3 ]
+; CHECK: br label %if.end2
+
+; CHECK: if.end2:
+
+; Check we have correctly blended the instruction during the BOSCC connection
+; rather than while repairing the SSA form.
+; CHECK-NOT: %gep1.boscc_blend.merge{{.*}} = phi
+; CHECK: %gep1.boscc_blend{{[0-9]*}} = phi ptr addrspace(1) [ %gep1.boscc_blend{{[0-9]*}}, %if.then4 ], [ %gep1, %if.then3 ]
+; CHECK: br label %if.end1
+
+; CHECK: if.end1:
+
+; Check we have correctly blended the instruction during the BOSCC connection
+; rather than while repairing the SSA form.
+; CHECK-NOT: %gep1.boscc_blend.merge{{.*}} = phi
+; CHECK: %gep1.boscc_blend{{[0-9]*}} = phi ptr addrspace(1) [ %gep1.boscc_blend{{[0-9]*}}, %if.end2 ], [ %gep1, %if.then1 ]
+; CHECK: br label %end
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/duplicate_preheader.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/duplicate_preheader.ll
new file mode 100644
index 0000000000000..153a1fe65b35d
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/duplicate_preheader.ll
@@ -0,0 +1,134 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; This test checks that we create a new preheader that blends the preheader
+; of the uniform and the predicated paths for a loop that has not been
+; duplicated (because of the barrier in it).
+
+; RUN: %veczc -k duplicate_preheader -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+
+; ModuleID = 'Unknown buffer'
+source_filename = "Unknown buffer"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: noduplicate
+declare void @__mux_work_group_barrier(i32, i32, i32) #1
+; Function Attrs: nounwind readnone
+declare spir_func i64 @_Z12get_local_idj(i32)
+
+define spir_kernel void @duplicate_preheader(i32 addrspace(1)* %out, i32 %n) {
+entry:
+  %id = tail call spir_func i64 @_Z12get_local_idj(i32 0)
+  %cmp = icmp sgt i64 %id, 3
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                     ; preds = %entry
+  br label %for.cond
+
+for.cond:
+  %ret.0 = phi i64 [ 0, %if.then ], [ %inc, %for.body ]
+  %storemerge8 = phi i32 [ 0, %if.then ], [ %inc4, %for.body ]
+  %mul = shl nsw i32 %n, 1
+  %cmp2 = icmp uge i32 %storemerge8, %mul
+  br i1 %cmp2, label %for.body, label %if.end
+
+for.body:
+  %inc = add nsw i64 %ret.0, 1
+  %inc4 = add nsw i32 %storemerge8, 1
+  br label %for.cond
+
+if.end:                                     ; preds = %for.cond, %entry
+  %idx.blend = phi i64 [ %id, %entry ], [ %ret.0, %for.cond ]
+  %gep_var = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idx.blend
+  br label %barrier
+
+barrier:                                     ; preds = %latch, %if.end
+  call void @__mux_work_group_barrier(i32 0, i32 1, i32 272)
+  br i1 %cmp, label %body, label %latch
+
+body:                                     ; preds = %barrier
+  %gep_uni = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
+  %ret = load i32, i32 addrspace(1)* %gep_uni, align 16
+  store i32 %ret, i32 addrspace(1)* %gep_var, align 16
+  br label %latch
+
+latch:                                     ; preds = %body, %barrier
+  %cmp3 = icmp sgt i32 %n, 10
+  br i1 %cmp3, label %exit, label %barrier
+
+exit:                                     ; preds = %latch
+  ret void
+}
+
+attributes #1 = { noduplicate }
+
+; CHECK: spir_kernel void @__vecz_v4_duplicate_preheader
+; CHECK: br i1 %{{.+}}, label %[[FORCONDPREHEADERUNIFORM:.+]], label %[[ENTRYBOSCCINDIR:.+]]
+
+; Make sure we have both the uniform and non-uniform versions of the for loop.
+; CHECK: [[FORCONDPREHEADERUNIFORM]]:
+; CHECK: br label %[[FORCONDUNIFORM:.+]]
+
+; CHECK: [[ENTRYBOSCCINDIR]]:
+; CHECK: br i1 %{{.+}}, label %[[IFEND:.+]], label %[[FORCONDPREHEADER:.+]]
+
+; CHECK: [[FORCONDUNIFORM]]:
+; CHECK: br i1 {{(%([0-9A-Za-z\.])+)|(false)}}, label %[[IFENDLOOPEXITUNIFORM:.+]], label %[[FORBODYUNIFORM:.+]]
+
+; CHECK: [[FORBODYUNIFORM]]:
+; CHECK: br label %[[FORCONDUNIFORM]]
+
+; CHECK: [[IFENDLOOPEXITUNIFORM]]:
+; CHECK: br label %[[IFEND]]
+
+; CHECK: [[FORCONDPREHEADER]]:
+; CHECK: br label %[[FORCOND:.+]]
+
+; CHECK: [[FORCOND]]:
+; CHECK: br i1 {{(%([0-9A-Za-z\.])+)|(false)}}, label %[[IFENDLOOPEXIT:.+]], label %[[FORBODY:.+]]
+
+; CHECK: [[FORBODY]]:
+; CHECK: br label %[[FORCOND]]
+
+; CHECK: [[IFENDLOOPEXIT]]:
+; CHECK: br label %[[IFEND]]
+
+; Make sure we're reconverging here from the uniform and predicated paths before
+; branching to the barrier.
+; CHECK: [[IFEND]]:{{.*}}preds
+; CHECK-DAG: %[[IFENDLOOPEXIT]]
+; CHECK-DAG: %[[IFENDLOOPEXITUNIFORM]]
+; CHECK: br label %[[BARRIER:.+]]
+
+; CHECK: [[BARRIER]]:
+; CHECK: br i1 %{{.+}}, label %[[BODYUNIFORM:.+]], label %[[BARRIERBOSCCINDIR:.+]]
+
+; CHECK: [[BODYUNIFORM]]:
+; CHECK: br label %[[LATCHUNIFORM:.+]]
+
+; CHECK: [[BARRIERBOSCCINDIR]]:
+; CHECK: br i1 %{{.+}}, label %[[LATCH:.+]], label %[[BODY:.+]]
+
+; CHECK: [[BODY]]:
+; CHECK: br label %[[LATCH]]
+
+; CHECK: [[LATCH]]:
+; CHECK: %[[CMP3:.+]] = icmp
+; CHECK: br i1 %[[CMP3]], label %[[EXIT:.+]], label %[[BARRIER]]
+
+; CHECK: [[EXIT]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops1.ll
new file mode 100644
index 0000000000000..ecfd84c823353
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops1.ll
@@ -0,0 +1,198 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k nested_loops1 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+
+; ModuleID = 'Unknown buffer'
+source_filename = "Unknown buffer"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind readnone
+declare spir_func i64 @_Z13get_global_idj(i32) #0
+
+; Function Attrs: nounwind readnone
+declare spir_func i64 @_Z15get_global_sizej(i32) #0
+
+; Function Attrs: nounwind readnone
+declare spir_func float @_Z3madfff(float, float, float) #0
+
+; Function Attrs: nounwind
+define spir_kernel void @nested_loops1(i32 %n, float addrspace(1)* %out) #1 {
+entry:
+  %gid = tail call spir_func i64 @_Z13get_global_idj(i32 0) #0
+  %gsize = tail call spir_func i64 @_Z15get_global_sizej(i32 0) #0
+  %trunc_gid = trunc i64 %gid to i32
+  %trunc_gsize = trunc i64 %gsize to i32
+  %cmp1 = icmp slt i32 %trunc_gid, %n
+  br i1 %cmp1, label %if.then1, label %end
+
+if.then1:                                     ; preds = %entry
+  %cmp2 = icmp slt i32 %n, 0
+  %cmp3 = icmp slt i32 %n, 0
+  %cmp4 = icmp sgt i32 %n, 0
+  %cmp5 = icmp slt i32 %n, 1
+  br label %for.cond
+
+for.cond:                                     ; preds = %if.else4, %if.then1
+  %trunc_gid_phi = phi i32 [ %trunc_gid, %if.then1 ], [ %add3, %if.else4 ]
+  %cmp6 = icmp eq i32 %trunc_gid_phi, -2147483648
+  %select1 = select i1 %cmp6, i32 1, i32 %n
+  %div1 = sdiv i32 %trunc_gid_phi, %select1
+  br i1 %cmp2, label %if.then2, label %if.else2
+
+if.else2:                                     ; preds = %for.cond
+  %cmp7 = icmp eq i32 %n, 0
+  %select2 = select i1 %cmp7, i32 1, i32 %n
+  %div2 = sdiv i32 %n, %select2
+  br label %if.then2
+
+if.then2:                                     ; preds = %if.else2, %for.cond
+  br i1 %cmp3, label %if.then3, label %if.else3
+
+if.else3:                                     ; preds = %if.then2
+  %cmp8 = icmp eq i32 %n, 0
+  %select3 = select i1 %cmp8, i32 1, i32 %n
+  %div3 = sdiv i32 %n, %select3
+  br label %if.then3
+
+if.then3:                                     ; preds = %if.else3, %if.then2
+  br i1 %cmp4, label %if.then4, label %if.else4
+
+if.then4:                                     ; preds = %if.then3
+  br i1 %cmp5, label %if.else4, label %if.else5
+
+if.else5:                                     ; preds = %if.then4
+  %sext_div1 = sext i32 %div1 to i64
+  %gep1 = getelementptr inbounds float, float addrspace(1)* %out, i64 %sext_div1
+  %gep2 = getelementptr inbounds float, float addrspace(1)* %out, i64 %sext_div1
+  br label %for.cond2
+
+for.cond2:                                    ; preds = %if.else6, %if.else5
+  %float_idx = phi float [ 0.000000e+00, %if.else5 ], [ %phi_phi_mad, %if.else6 ]
+  %phi_div1_1 = phi i32 [ %div1, %if.else5 ], [ %add2, %if.else6 ]
+  %i32_idx = phi i32 [ 0, %if.else5 ], [ %add2, %if.else6 ]
+  %cmp9 = icmp slt i32 %phi_div1_1, %n
+  br i1 %cmp9, label %if.then6, label %if.else6
+
+if.then6:                                    ; preds = %for.cond2
+  br label %for.cond3
+
+for.cond3:                                    ; preds = %if.else7, %if.then6
+  %phi_float_idx = phi float [ %float_idx, %if.then6 ], [ %phi_mad, %if.else7 ]
+  %phi_div1_2 = phi i32 [ %div1, %if.then6 ], [ %add1, %if.else7 ]
+  %phi_i32_idx = phi i32 [ %i32_idx, %if.then6 ], [ %add1, %if.else7 ]
+  %cmp10 = icmp sgt i32 %phi_div1_2, -1
+  br i1 %cmp10, label %if.then7, label %if.else7
+
+if.then7:                                    ; preds = %for.cond3
+  %sext_phi_div1_2 = sext i32 %phi_div1_2 to i64
+  %gep3 = getelementptr inbounds float, float addrspace(1)* %gep1, i64 %sext_phi_div1_2
+  %load1 = load float, float addrspace(1)* %gep3, align 4
+  %sext_phi_i32_idx = sext i32 %phi_i32_idx to i64
+  %gep4 = getelementptr inbounds float, float addrspace(1)* %gep2, i64 %sext_phi_i32_idx
+  %load2 = load float, float addrspace(1)* %gep4, align 4
+  %mad = tail call spir_func float @_Z3madfff(float %load1, float %load2, float %phi_float_idx) #0
+  br label %if.else7
+
+if.else7:                                    ; preds = %if.then7, %for.cond3
+  %phi_mad = phi float [ %mad, %if.then7 ], [ %phi_float_idx, %for.cond3 ]
+  %add1 = add nsw i32 %phi_i32_idx, %n
+  %cmp11 = icmp slt i32 %add1, %div1
+  br i1 %cmp11, label %for.cond3, label %if.else6
+
+if.else6:                                    ; preds = %if.else7, %for.cond2
+  %phi_phi_mad = phi float [ %float_idx, %for.cond2 ], [ %phi_mad, %if.else7 ]
+  %add2 = add nsw i32 %i32_idx, %div1
+  %cmp12 = icmp slt i32 %add2, %div1
+  br i1 %cmp12, label %for.cond2, label %if.else4
+
+if.else4:                                    ; preds = %if.else6, %if.then4, %if.then3
+  %phi_phi_float_idx = phi float [ 0.000000e+00, %if.then3 ], [ 0.000000e+00, %if.then4 ], [ %phi_phi_mad, %if.else6 ]
+  %sext_trunc_gid_phi = sext i32 %trunc_gid_phi to i64
+  %gep5 = getelementptr inbounds float, float addrspace(1)* %out, i64 %sext_trunc_gid_phi
+  store float %phi_phi_float_idx, float addrspace(1)* %gep5, align 4
+  %add3 = add nsw i32 %trunc_gid_phi, %trunc_gsize
+  %cmp13 = icmp slt i32 %add3, %n
+  br i1 %cmp13, label %for.cond, label %end
+
+end:                                    ; preds = %if.else4, %entry
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
+
+; The purpose of this test is to make sure we correctly blend all the loops
+; live through at each entry point of the divergent loops and don't create
+; merge instructions for them.
+
+; CHECK: spir_kernel void @__vecz_v4_nested_loops1
+; CHECK: entry:
+; CHECK: br i1 %{{.+}}, label %if.then1.uniform, label %entry.boscc_indir
+
+; CHECK: if.then1.uniform:
+; CHECK: br label %for.cond.uniform
+
+; CHECK: entry.boscc_indir:
+; CHECK: br i1 %{{.+}}, label %end, label %if.then1
+
+; CHECK: for.cond2.uniform:
+; CHECK: br i1 %{{.+}}, label %for.cond3.preheader.uniform, label %for.cond2.uniform.boscc_indir
+
+; CHECK: for.cond2.uniform.boscc_indir:
+; CHECK: br i1 %{{.+}}, label %if.else6.uniform, label %for.cond2.uniform.boscc_store
+
+; CHECK: for.cond2.uniform.boscc_store:
+; CHECK: br label %for.cond3.preheader
+
+; CHECK: for.cond3.uniform:
+; CHECK: br i1 %{{.+}}, label %if.then7.uniform, label %for.cond3.uniform.boscc_indir
+
+; CHECK: for.cond3.uniform.boscc_indir:
+; CHECK: br i1 %{{.+}}, label %if.else7.uniform, label %for.cond3.uniform.boscc_store
+
+; CHECK: for.cond3.uniform.boscc_store:
+; CHECK: br label %if.then7
+
+; CHECK: end.loopexit.uniform:
+; CHECK: br label %end
+
+; CHECK: for.cond:
+; CHECK-NOT: %{{.+}}.boscc_blend{{.+}}.merge{{.+}} =
+; CHECK: br
+
+; CHECK: for.cond2:
+; CHECK-NOT: %{{.+}}.boscc_blend{{.+}}.merge{{.+}} =
+; CHECK: br
+
+; CHECK: for.cond3:
+; CHECK-NOT: %{{.+}}.boscc_blend{{.+}}.merge{{.+}} =
+; CHECK: br
+
+; CHECK: if.then7:
+; CHECK-NOT: %{{.+}}.boscc_blend{{.+}}.merge{{.+}} =
+; CHECK: br
+
+; CHECK: if.else4:
+; CHECK-NOT: %{{.+}}.boscc_blend{{.+}}.merge{{.+}} =
+; CHECK: br
+
+; CHECK: end.loopexit:
+; CHECK: br label %end
+
+; CHECK: end:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops2.ll
new file mode 100644
index 0000000000000..97ca530d4c5ae
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops2.ll
@@ -0,0 +1,140 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k nested_loops2 -vecz-passes=vecz-loop-rotate,cfg-convert -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+source_filename = "kernel.opencl"
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @nested_loops2(i32 addrspace(1)* %out, i32 %n) #0 {
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %conv = trunc i64 %call to i32
+  %cmp = icmp slt i32 %conv, 16
+  br i1 %cmp, label %if.then, label %if.end25
+
+if.then:                                          ; preds = %entry
+  %mul2 = mul nsw i32 %conv, %n
+  %0 = icmp eq i32 %mul2, -2147483648
+  %1 = icmp eq i32 %n, -1
+  %2 = and i1 %1, %0
+  %3 = icmp eq i32 %n, 0
+  %4 = or i1 %3, %2
+  %5 = select i1 %4, i32 1, i32 %n
+  %div3 = sdiv i32 %mul2, %5
+  %add = add nsw i32 %div3, %conv
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %if.then
+  %ret.0 = phi i32 [ 0, %if.then ], [ %ret.2, %for.inc ]
+  %storemerge = phi i32 [ 0, %if.then ], [ %inc24, %for.inc ]
+  %cmp7 = icmp slt i32 %storemerge, %n
+  br i1 %cmp7, label %for.body, label %if.end25
+
+for.body:                                         ; preds = %for.cond
+  %cmp9 = icmp slt i32 %conv, 9
+  br i1 %cmp9, label %while.body, label %for.inc
+
+while.body:                                       ; preds = %while.body, %for.body
+  %ret.1 = phi i32 [ %ret.0, %for.body ], [ %add17, %while.body ]
+  %j.0 = phi i32 [ 0, %for.body ], [ %inc18, %while.body ]
+  %mul13 = mul nsw i32 %mul2, %mul2
+  %6 = icmp eq i32 %n, 0
+  %7 = select i1 %6, i32 1, i32 %n
+  %div14 = sdiv i32 %mul13, %7
+  %reass.add = add i32 %div14, %add
+  %reass.mul = mul i32 %reass.add, 8
+  %add6 = add i32 %mul2, 1
+  %add16 = add i32 %add6, %add
+  %inc = add i32 %add16, %ret.1
+  %add17 = add i32 %inc, %reass.mul
+  %inc18 = add nuw nsw i32 %j.0, 1
+  %add19 = add nsw i32 %j.0, %conv
+  %cmp20 = icmp sgt i32 %add19, 3
+  br i1 %cmp20, label %for.inc, label %while.body
+
+for.inc:                                          ; preds = %for.body, %while.body
+  %ret.2 = phi i32 [ %ret.0, %for.body ], [ %add17, %while.body ]
+  %inc24 = add nuw nsw i32 %storemerge, 1
+  br label %for.cond
+
+if.end25:                                         ; preds = %for.cond, %entry
+  %ret.3 = phi i32 [ 0, %entry ], [ %ret.0, %for.cond ]
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %ret.3, i32 addrspace(1)* %arrayidx, align 4
+  ret void
+}
+
+; Function Attrs: convergent nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { convergent nobuiltin nounwind readonly }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!llvm.ident = !{!2}
+!opencl.kernels = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{!"clang version 8.0.0 (https://github.com/llvm-mirror/clang.git bfbe338a893dde6ba65b2bed6ffea1652a592819) (https://github.com/llvm-mirror/llvm.git a99d6d2122ca2f208e1c4bcaf02ff5930f244f34)"}
+!3 = !{void (i32 addrspace(1)*, i32)* @nested_loops2, !4, !5, !6, !7, !8, !9}
+!4 = !{!"kernel_arg_addr_space", i32 1, i32 0}
+!5 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!6 = !{!"kernel_arg_type", !"int*", !"int"}
+!7 = !{!"kernel_arg_base_type", !"int*", !"int"}
+!8 = !{!"kernel_arg_type_qual", !"", !""}
+!9 = !{!"kernel_arg_name", !"out", !"n"}
+
+; The purpose of this test is to make sure we correctly add a BOSCC connection
+; at a divergence-causing latch from the uniform region.
+
+; CHECK: spir_kernel void @__vecz_v4_nested_loops2
+; CHECK: entry:
+; CHECK: %[[BOSCC:.+]] = call i1 @__vecz_b_divergence_all(i1 %cmp)
+; CHECK: br i1 %[[BOSCC]], label %if.then.uniform, label %entry.boscc_indir
+
+; CHECK: if.then.uniform:
+; CHECK: br i1 %cmp71.uniform, label %for.body.lr.ph.uniform, label %if.end25.loopexit.uniform
+
+; CHECK: entry.boscc_indir:
+; CHECK: %[[BOSCC2:.+]] = call i1 @__vecz_b_divergence_all(i1 %cmp.not{{.*}})
+; CHECK: br i1 %[[BOSCC2]], label %if.end25, label %if.then
+
+; CHECK: for.body.lr.ph.uniform:
+; CHECK: br label %for.body.uniform
+
+; CHECK: for.body.uniform:
+; CHECK: br i1 %[[LBLCOND:.+]], label %while.body.preheader.uniform, label %for.body.uniform.boscc_indir
+
+; CHECK: while.body.preheader.uniform:
+; CHECK: br label %while.body.uniform
+
+; CHECK: for.body.uniform.boscc_indir:
+; CHECK: %[[BOSCC3:.+]] = call i1 @__vecz_b_divergence_all(i1 %for.inc.uniform.exit_mask)
+; CHECK: br i1 %[[BOSCC3]], label %for.inc.uniform, label %for.body.uniform.boscc_store
+
+; CHECK: while.body.uniform:
+; CHECK: %cmp20.uniform = icmp sgt i32 %add19.uniform, 3
+; CHECK-NOT: br i1 %[[LBLCOND3:.+]], label %for.inc.loopexit.uniform, label %while.body.uniform
+; CHECK: br i1 %[[LBLCOND2:.+]], label %for.inc.loopexit.uniform, label %while.body.uniform.boscc_indir
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops3.ll
new file mode 100644
index 0000000000000..95236e3d1aad1
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops3.ll
@@ -0,0 +1,149 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k nested_loops3 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+
+; ModuleID = 'Unknown buffer'
+source_filename = "kernel.opencl"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #0
+
+; Function Attrs: nounwind readnone speculatable
+declare float @llvm.fmuladd.f32(float, float, float) #1
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @nested_loops3(float addrspace(1)* %symmat, float addrspace(1)* %data, i32 %m, i32 %n) #2 {
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #3
+  %conv = trunc i64 %call to i32
+  %sub = add nsw i32 %m, -1
+  %cmp = icmp sgt i32 %sub, %conv
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  %mul = mul nsw i32 %conv, %m
+  %add = add nsw i32 %mul, %conv
+  %idxprom = sext i32 %add to i64
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %symmat, i64 %idxprom
+  store float 1.000000e+00, float addrspace(1)* %arrayidx, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.end, %if.then
+  %storemerge.in = phi i32 [ %conv, %if.then ], [ %storemerge, %for.end ]
+  %storemerge = add nsw i32 %storemerge.in, 1
+  %cmp3 = icmp slt i32 %storemerge, %m
+  br i1 %cmp3, label %for.cond5, label %if.end
+
+for.cond5:                                        ; preds = %for.body8, %for.cond
+  %storemerge1 = phi i32 [ %inc, %for.body8 ], [ 0, %for.cond ]
+  %cmp6 = icmp slt i32 %storemerge1, %n
+  br i1 %cmp6, label %for.body8, label %for.end
+
+for.body8:                                        ; preds = %for.cond5
+  %mul9 = mul nsw i32 %storemerge1, %m
+  %add10 = add nsw i32 %mul9, %conv
+  %idxprom11 = sext i32 %add10 to i64
+  %arrayidx12 = getelementptr inbounds float, float addrspace(1)* %data, i64 %idxprom11
+  %0 = load float, float addrspace(1)* %arrayidx12, align 4
+  %mul13 = mul nsw i32 %storemerge1, %m
+  %add14 = add nsw i32 %mul13, %storemerge
+  %idxprom15 = sext i32 %add14 to i64
+  %arrayidx16 = getelementptr inbounds float, float addrspace(1)* %data, i64 %idxprom15
+  %1 = load float, float addrspace(1)* %arrayidx16, align 4
+  %mul18 = mul nsw i32 %conv, %m
+  %add19 = add nsw i32 %storemerge, %mul18
+  %idxprom20 = sext i32 %add19 to i64
+  %arrayidx21 = getelementptr inbounds float, float addrspace(1)* %symmat, i64 %idxprom20
+  %2 = load float, float addrspace(1)* %arrayidx21, align 4
+  %3 = call float @llvm.fmuladd.f32(float %0, float %1, float %2)
+  store float %3, float addrspace(1)* %arrayidx21, align 4
+  %inc = add nuw nsw i32 %storemerge1, 1
+  br label %for.cond5
+
+for.end:                                          ; preds = %for.cond5
+  %mul22 = mul nsw i32 %conv, %m
+  %add23 = add nsw i32 %storemerge, %mul22
+  %idxprom24 = sext i32 %add23 to i64
+  %arrayidx25 = getelementptr inbounds float, float addrspace(1)* %symmat, i64 %idxprom24
+  %4 = load float, float addrspace(1)* %arrayidx25, align 4
+  %mul26 = mul nsw i32 %storemerge, %m
+  %add27 = add nsw i32 %mul26, %conv
+  %idxprom28 = sext i32 %add27 to i64
+  %arrayidx29 = getelementptr inbounds float, float addrspace(1)* %symmat, i64 %idxprom28
+  store float %4, float addrspace(1)* %arrayidx29, align 4
+  br label %for.cond
+
+if.end:                                           ; preds = %for.cond, %entry
+  ret void
+}
+
+; The purpose of this test is to make sure we correctly set the incoming value
+; of a boscc_blend instruction (in a loop header) from the latch as being the
+; value defined in the latch itself.
+
+; CHECK: spir_kernel void @__vecz_v4_nested_loops3
+; CHECK: entry:
+; CHECK: br i1 %{{.+}}, label %if.then.uniform, label %entry.boscc_indir
+
+; CHECK: if.then.uniform:
+; CHECK: br i1 %{{.+}}, label %for.cond5.preheader.lr.ph.uniform, label %if.then.uniform.boscc_indir
+
+; CHECK: entry.boscc_indir:
+; CHECK: br i1 %{{.+}}, label %if.end, label %if.then
+
+; CHECK: for.cond5.preheader.lr.ph.uniform:
+; CHECK: br label %for.cond5.preheader.uniform
+
+; CHECK: if.then.uniform.boscc_indir:
+; CHECK: br i1 %{{.+}}, label %if.end.loopexit.uniform, label %for.cond5.preheader.lr.ph
+
+; CHECK: for.cond5.preheader.uniform:
+; CHECK: br label %for.cond5.uniform
+
+; CHECK: for.end.uniform.boscc_indir:
+; CHECK: br i1 %{{.+}}, label %for.cond.if.end.loopexit_crit_edge.uniform, label %for.end.uniform.boscc_store
+
+; CHECK: for.end.uniform.boscc_store:
+; CHECK: br label %for.cond5.preheader
+
+; CHECK: if.then:
+; CHECK: br label %for.cond5.preheader.lr.ph
+
+; CHECK: for.cond5.preheader.lr.ph:
+; CHECK: br label %for.cond5.preheader
+
+; CHECK: for.cond5.preheader:
+
+; This is the important bit of the test
+; Note that the LCSSA PHI node got cleaned up!
+; For some reason LIT needs these checks to be split across two lines
+; CHECK: %[[LATCH_VALUE1:.*\.boscc_blend[0-9]*]] = phi i{{32|64}} [ %{{.+}}, %for.end.uniform.boscc_store ],
+; CHECK-SAME: [ %[[LATCH_VALUE1]], %for.end ], [ %{{.+}}, %for.cond5.preheader.lr.ph ]
+
+; CHECK: %[[LATCH_VALUE2:.*\.boscc_blend[0-9]*]] = phi i{{32|64}} [ %{{.+}}, %for.end.uniform.boscc_store ],
+; CHECK-SAME: [ %[[LATCH_VALUE2]], %for.end ], [ %{{.+}}, %for.cond5.preheader.lr.ph ]
+
+; CHECK: %[[LATCH_VALUE3:.*\.boscc_blend[0-9]*]] = phi i{{32|64}} [ %{{.+}}, %for.end.uniform.boscc_store ],
+; CHECK-SAME: [ %[[LATCH_VALUE3]], %for.end ], [ %{{.+}}, %for.cond5.preheader.lr.ph ]
+
+; CHECK: %[[LATCH_VALUE4:.*\.boscc_blend[0-9]*]] = phi i{{32|64}} [ %{{.+}}, %for.end.uniform.boscc_store ],
+; CHECK-SAME: [ %[[LATCH_VALUE4]], %for.end ], [ %{{.+}}, %for.cond5.preheader.lr.ph ]
+
+; CHECK: %[[LATCH_VALUE5:.+\.boscc_blend[0-9]*]] = phi i1 [ true, %for.end.uniform.boscc_store ],
+; CHECK-SAME: [ %[[LATCH_VALUE5]], %for.end ], [ %{{.+}}, %for.cond5.preheader.lr.ph ]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops4.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops4.ll
new file mode 100644
index 0000000000000..726f2619141d1
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops4.ll
@@ -0,0 +1,190 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k nested_loops4 -vecz-passes=vecz-loop-rotate,cfg-convert -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+
+source_filename = "Unknown buffer"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind readnone
+declare spir_func i64 @_Z13get_global_idj(i32) #0
+
+; Function Attrs: nounwind readnone
+declare spir_func i64 @_Z15get_global_sizej(i32) #0
+
+; Function Attrs: nounwind readnone
+declare spir_func float @_Z3dotDv2_fS_(<2 x float>, <2 x float>) #0
+
+declare spir_func <2 x float> @_Z6vload2mPU3AS1Kf(i64, float addrspace(1)*)
+
+; Function Attrs: nounwind readnone
+declare spir_func i32 @_Z6mul_hijj(i32, i32) #0
+
+define spir_kernel void @nested_loops4(i32 %n, float addrspace(1)* %out) { ; kernel selected by -k nested_loops4 in the RUN line
+entry:
+  %gid = tail call spir_func i64 @_Z13get_global_idj(i32 0) #0
+  %gsize = tail call spir_func i64 @_Z15get_global_sizej(i32 0) #0
+  %trunc_gid = trunc i64 %gid to i32
+  %trunc_gsize = trunc i64 %gsize to i32
+  %cmp1 = icmp slt i32 %trunc_gid, %n ; entry guard: skip all loops unless the truncated id is below %n
+  br i1 %cmp1, label %for.cond1, label %end
+
+for.cond1:                                     ; preds = %entry, %for.cond1.end
+  %phi_trunc_gid = phi i32 [ %trunc_gid, %entry ], [ %add2, %for.cond1.end ] ; starts at %trunc_gid, stepped by %add2 per outer iteration
+  %mul_hi = tail call spir_func i32 @_Z6mul_hijj(i32 %phi_trunc_gid, i32 %n) #0
+  %wrong = sdiv i32 %mul_hi, %n ; defined in the outermost loop; seeds %phi_wrong_correct_correct in for.cond3
+  %sext_mul_hi = sext i32 %mul_hi to i64
+  %gep1 = getelementptr inbounds float, float addrspace(1)* %out, i64 %sext_mul_hi
+  %cmp2 = icmp slt i32 %mul_hi, %n ; the same comparison is repeated as %cmp3 and %cmp6
+  br i1 %cmp2, label %for.cond2, label %for.cond1.end
+
+for.cond2:                                    ; preds = %for.cond1, %for.cond2.end
+  %phi4_fadd = phi float [ %phi3_fadd, %for.cond2.end ], [ 0.000000e+00, %for.cond1 ] ; float accumulator, 0.0 on entry from for.cond1
+  %cmp3 = icmp slt i32 %mul_hi, %n
+  br i1 %cmp3, label %for.cond3.preheader, label %for.cond2.end
+
+for.cond3.preheader:                                    ; preds = %for.cond2
+  %add1 = add nsw i32 %mul_hi, %wrong ; only consumed through %phi_add1
+  br label %for.cond3
+
+for.cond3:                                    ; preds = %for.cond3.preheader, %for.cond3.end
+  %phi_wrong_correct_correct = phi i32 [ %wrong, %for.cond3.preheader ], [ %correct, %for.cond3.end ] ; the phi matched by the final check of this test
+  %phi_add1 = phi i32 [ %add1, %for.cond3.preheader ], [ %phi_add1, %for.cond3.end ] ; back-edge value is the phi itself, so it stays %add1
+  %phi2_fadd = phi float [ %phi4_fadd, %for.cond3.preheader ], [ %phi1_fadd, %for.cond3.end ]
+  %cmp4 = icmp slt i32 %phi_wrong_correct_correct, %n
+  br i1 %cmp4, label %for.cond3.body, label %for.cond3.end
+
+for.cond3.body:                                    ; preds = %for.cond3
+  %sext_phi_add1 = sext i32 %phi_add1 to i64
+  %gep2 = getelementptr inbounds float, float addrspace(1)* %gep1, i64 %sext_phi_add1
+  %vload = tail call spir_func <2 x float> @_Z6vload2mPU3AS1Kf(i64 0, float addrspace(1)* %gep2)
+  %dot = tail call spir_func float @_Z3dotDv2_fS_(<2 x float> %vload, <2 x float> %vload) #0
+  %fadd = fadd float %phi2_fadd, %dot ; accumulate the dot call result (both operands are %vload)
+  br label %for.cond3.end
+
+for.cond3.end:                                    ; preds = %for.cond3.body, %for.cond3
+  %phi1_fadd = phi float [ %phi2_fadd, %for.cond3 ], [ %fadd, %for.cond3.body ]
+  %correct = add nsw i32 %phi_wrong_correct_correct, 1 ; back-edge value of %phi_wrong_correct_correct
+  %cmp5 = icmp slt i32 %wrong, %n ; compares %wrong (not %correct), so this latch condition is invariant inside for.cond3
+  br i1 %cmp5, label %for.cond3, label %for.cond2.end
+
+for.cond2.end:                                    ; preds = %for.cond3.end, %for.cond2
+  %phi3_fadd = phi float [ %phi4_fadd, %for.cond2 ], [ %phi1_fadd, %for.cond3.end ]
+  %cmp6 = icmp slt i32 %mul_hi, %n
+  br i1 %cmp6, label %for.cond2, label %for.cond1.end
+
+for.cond1.end:                                    ; preds = %for.cond2.end, %for.cond1
+  %ret = phi float [ 0.000000e+00, %for.cond1 ], [ %phi3_fadd, %for.cond2.end ]
+  %sext_phi_trunc_gid = sext i32 %phi_trunc_gid to i64
+  %gep3 = getelementptr inbounds float, float addrspace(1)* %out, i64 %sext_phi_trunc_gid
+  store float %ret, float addrspace(1)* %gep3, align 4 ; out[%phi_trunc_gid] = %ret
+  %add2 = add nsw i32 %phi_trunc_gid, %trunc_gsize ; step by the truncated global size
+  %cmp7 = icmp slt i32 %add2, %n
+  br i1 %cmp7, label %for.cond1, label %end
+
+end:                                    ; preds = %for.cond1.end, %entry
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
+
+; The purpose of this test is to make sure we choose the correct incoming value
+; for a boscc blend instruction.
+
+; CHECK: spir_kernel void @__vecz_v4_nested_loops4
+; CHECK: entry:
+; CHECK: br i1 %{{.+}}, label %for.cond1.preheader.uniform, label %entry.boscc_indir
+
+; CHECK: for.cond1.preheader.uniform:
+; CHECK: br label %for.cond1.uniform
+
+; CHECK: entry.boscc_indir:
+; CHECK: br i1 %{{.+}}, label %end, label %for.cond1.preheader
+
+; CHECK: for.cond1.uniform:
+; CHECK: %wrong.uniform = sdiv i32 %mul_hi.uniform, %n
+; CHECK: br i1 %{{.+}}, label %for.cond2.preheader.uniform, label %for.cond1.uniform.boscc_indir
+
+; CHECK: for.cond1.end.uniform.boscc_indir:
+; CHECK: br i1 %{{.+}}, label %end.loopexit.uniform, label %for.cond1.end.uniform.boscc_store
+
+; CHECK: for.cond1.end.uniform.boscc_store:
+; CHECK: br label %for.cond1
+
+; CHECK: for.cond2.preheader.uniform:
+; CHECK: br label %for.cond2.uniform
+
+; CHECK: for.cond1.uniform.boscc_indir:
+; CHECK: br i1 %{{.+}}, label %for.cond1.end.uniform, label %for.cond1.uniform.boscc_store
+
+; CHECK: for.cond1.uniform.boscc_store:
+;    LCSSA PHI nodes got cleaned up:
+; CHECK-NOT: %{{.*\.boscc_lcssa.*}}
+; CHECK: br label %for.cond2.preheader
+
+; CHECK: for.cond2.uniform:
+; CHECK: br i1 %{{.+}}, label %for.cond3.preheader.uniform, label %for.cond2.uniform.boscc_indir
+
+; CHECK: for.cond2.end.uniform.boscc_indir:
+; CHECK: br i1 %{{.+}}, label %for.cond1.end.loopexit.uniform, label %for.cond2.end.uniform.boscc_store
+
+; CHECK: for.cond3.preheader.uniform:
+; CHECK: br label %for.cond3.uniform
+
+; CHECK: for.cond2.uniform.boscc_indir:
+; CHECK: br i1 %{{.+}}, label %for.cond2.end.uniform, label %for.cond2.uniform.boscc_store
+
+; CHECK: for.cond3.uniform:
+; CHECK: br i1 %{{.+}}, label %for.cond3.body.uniform, label %for.cond3.uniform.boscc_indir
+
+; CHECK: for.cond3.end.uniform.boscc_indir:
+; CHECK: br i1 %{{.+}}, label %for.cond2.end.loopexit.uniform, label %for.cond3.end.uniform.boscc_store
+
+; CHECK: for.cond3.end.uniform.boscc_store:
+;    LCSSA PHI nodes got cleaned up:
+; CHECK-NOT: %{{.*\.boscc_lcssa.*}}
+; CHECK: br label %for.cond3
+
+; CHECK: for.cond3.body.uniform:
+; CHECK: br label %for.cond3.end.uniform
+
+; CHECK: for.cond3.uniform.boscc_indir:
+; CHECK: %[[BOSCC:.+]] = call i1 @__vecz_b_divergence_all(i1 %for.cond3.end.uniform.exit_mask)
+; CHECK: br i1 %[[BOSCC]], label %for.cond3.end.uniform, label %for.cond3.uniform.boscc_store
+
+; CHECK: for.cond3.end.uniform:
+; CHECK: br i1 %{{.+}}, label %for.cond3.uniform, label %for.cond3.end.uniform.boscc_indir
+
+; CHECK: for.cond1.preheader:
+; CHECK: br label %for.cond1
+
+; CHECK: for.cond1:
+; CHECK: br label %for.cond2.preheader
+
+; CHECK: for.cond2.preheader:
+; CHECK: br label %for.cond2
+
+; CHECK: for.cond2:
+; CHECK: br label %for.cond3.preheader
+
+; CHECK: for.cond3.preheader:
+; CHECK: br label %for.cond3
+
+; CHECK: for.cond3:
+
+; This is the important part of the test.
+; CHECK: %phi_wrong_correct_correct = phi i32 [ %wrong.boscc_blend{{.+}}, %for.cond3.preheader ], [ %correct, %for.cond3.end ], [ %correct.uniform, %for.cond3.end.uniform.boscc_store ]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops5.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops5.ll
new file mode 100644
index 0000000000000..3eb72a6a10b6e
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops5.ll
@@ -0,0 +1,117 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k nested_loops5 -vecz-passes=vecz-loop-rotate,cfg-convert -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+
+; ModuleID = 'Unknown buffer'
+source_filename = "Unknown buffer"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z12get_local_idj(i32)
+
+declare spir_func i64 @_Z14get_local_sizej(i32)
+
+define spir_kernel void @nested_loops5(float addrspace(1)*) { ; the only argument is unnamed, hence %0 below
+entry:
+  %lid = tail call spir_func i64 @_Z12get_local_idj(i32 0)
+  %lsize = tail call spir_func i64 @_Z14get_local_sizej(i32 0)
+  %cmp1 = icmp ult i64 %lid, %lsize ; entry guard: skip the loop unless %lid is below %lsize
+  br i1 %cmp1, label %loop, label %end
+
+loop:                                             ; preds = %if.end, %entry
+  %livethrough = phi i64 [ %add2, %if.end ], [ %lsize, %entry ] ; not used inside the if.then loop, but used after it by %add2 in if.end
+  %add1 = add i64 %livethrough, %lsize
+  %cmp2 = icmp ult i64 %add1, %lsize
+  br i1 %cmp2, label %if.then, label %if.else
+
+if.then:                                          ; preds = %if.then, %loop
+  %phi = phi i64 [ %add3, %if.then ], [ %lid, %loop ] ; induction value of the if.then self-loop, seeded with %lid
+  %add3 = add i64 %phi, %lsize
+  %cmp4 = icmp ult i64 %add3, %lsize
+  br i1 %cmp4, label %if.then, label %if.end ; if.then is its own latch: it branches back to itself
+
+if.else:                                          ; preds = %loop
+  %gep = getelementptr inbounds float, float addrspace(1)* %0, i64 %add1
+  store float 0.000000e+00, float addrspace(1)* %gep, align 4 ; %0[%add1] = 0.0
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %if.else
+  %add2 = add i64 %livethrough, %lsize ; same operands as %add1 in loop
+  %cmp3 = icmp ult i64 %add2, %lsize
+  br i1 %cmp3, label %loop, label %end
+
+end:                                              ; preds = %if.end, %entry
+  ret void
+}
+
+; The purpose of this test is to make sure we choose the correct incoming value
+; for a boscc blend instruction.
+
+; CHECK: spir_kernel void @__vecz_v4_nested_loops5
+; CHECK: entry:
+; CHECK: br i1 %{{.+}}, label %loop.preheader.uniform, label %entry.boscc_indir
+
+; CHECK: loop.preheader.uniform:
+; CHECK: br label %loop.uniform
+
+; CHECK: entry.boscc_indir:
+; CHECK: br i1 %{{.+}}, label %end, label %loop.preheader
+
+; CHECK: loop.uniform:
+; CHECK: %livethrough.uniform = phi i64 [ %add2.uniform, %if.end.uniform ], [ %lsize, %loop.preheader.uniform ]
+; CHECK: br i1 %{{.+}}, label %if.then.preheader.uniform, label %if.else.uniform
+
+; CHECK: if.then.preheader.uniform:
+; CHECK: br label %if.then.uniform
+
+; CHECK: if.then.uniform:
+; CHECK: br i1 %{{.+}}, label %if.then.uniform, label %if.then.uniform.boscc_indir
+
+; CHECK: if.then.uniform.boscc_indir:
+; CHECK: br i1 %{{.+}}, label %if.end.loopexit.uniform, label %if.then.uniform.boscc_store
+
+; CHECK: if.then.uniform.boscc_store:
+;    LCSSA PHI nodes got cleaned up:
+; CHECK-NOT: %{{.*\.boscc_lcssa.*}}
+; CHECK: br label %if.then
+
+; CHECK: loop.preheader:
+; CHECK: br label %loop
+
+; CHECK: loop:
+; CHECK: %livethrough = phi i64 [ %add2, %if.end ], [ %lsize, %loop.preheader ]
+; CHECK: br i1 %{{.+}}, label %if.then.preheader, label %if.else
+
+; CHECK: if.then.preheader:
+; CHECK: br label %if.then
+
+; CHECK: if.then:
+; CHECK: %livethrough.boscc_blend = phi i64 [ %livethrough.uniform, %if.then.uniform.boscc_store ], [ %livethrough.boscc_blend, %if.then ], [ %livethrough, %if.then.preheader ]
+; CHECK: br i1 %{{.+}}, label %if.then, label %if.then.pure_exit
+
+; CHECK: if.then.pure_exit:
+; CHECK: br label %if.end.loopexit
+
+; CHECK: if.else:
+; CHECK: br label %if.end
+
+; CHECK: if.end.loopexit:
+; CHECK: br label %if.end
+
+; CHECK: if.end:
+; CHECK-NOT: %livethrough.boscc_blend{{.+}}.merge = phi i64 [ %livethrough.boscc_blend, %if.end.loopexit ], [ 0, %if.else ]
+; CHECK: %livethrough.boscc_blend{{.+}} = phi i64 [ %livethrough.boscc_blend, %if.end.loopexit ], [ %livethrough, %if.else ]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization0.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization0.ll
new file mode 100644
index 0000000000000..ab362edb0f0db
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization0.ll
@@ -0,0 +1,436 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k partial_linearization0 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+
+; The CFG of the following kernel is:
+;
+;        a
+;       / \
+;      b   c
+;       \ /
+;        d
+;        |
+;        e
+;       / \
+;      /   \
+;     f     g
+;    / \   / \
+;   h   i j   k
+;    \ /   \ /
+;     l     m
+;      \   /
+;       \ /
+;        n
+;
+; * where node e is a uniform branch, and nodes a, f and g are varying
+;   branches.
+; * where nodes b, c, d, h, i, j, k, l, m are divergent.
+;
+; With BOSCC, it will be transformed as follows:
+;
+;         a___
+;        / \  \
+;       b   c  c'
+;        \ /   |
+;         d    b'
+;         |    |
+;         |    d'
+;         |   /
+;          \ /
+;           e
+;          / \
+;         /   \
+;     ___f     g___
+;    /  / \   / \  \
+;   i' h   i j   k  k'
+;   |   \ /   \ /   |
+;   h'   l     m    j'
+;   |    |     |    |
+;   l'   |     |    m'
+;    \   |     |   /
+;     \ /       \ /
+;      & -> n <- &
+;
+; where '&' represents merge blocks of BOSCC regions.
+;
+; __kernel void partial_linearization0(__global int *out, int n) {
+;   int id = get_global_id(0);
+;   int ret = 0;
+;
+;   if (id % 5 == 0) {
+;     for (int i = 0; i < n * 2; i++) ret++;
+;   } else {
+;     for (int i = 0; i < n / 4; i++) ret++;
+;   }
+;
+;   if (n > 10) { // uniform
+;     if (id % 2 == 0) { // varying
+;       for (int i = 0; i < n + 10; i++) ret++;
+;     } else { // varying
+;       for (int i = 0; i < n + 10; i++) ret *= 2;
+;     }
+;     ret += id * 10;
+;   } else { // uniform
+;     if (id % 2 == 0) { // varying
+;       for (int i = 0; i < n + 8; i++) ret++;
+;     } else { // varying
+;       for (int i = 0; i < n + 8; i++) ret *= 2;
+;     }
+;     ret += id / 2;
+;   }
+;   out[id] = ret;
+; }
+
+; ModuleID = 'Unknown buffer'
+source_filename = "Unknown buffer"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @partial_linearization0(i32 addrspace(1)* %out, i32 %n) #0 {
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2 ; get_global_id(0) in the C source above
+  %conv = trunc i64 %call to i32 ; int id
+  %rem = srem i32 %conv, 5 ; id % 5
+  %cmp = icmp eq i32 %rem, 0 ; id % 5 == 0
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %if.then
+  %ret.0 = phi i32 [ 0, %if.then ], [ %inc, %for.body ]
+  %storemerge8 = phi i32 [ 0, %if.then ], [ %inc4, %for.body ]
+  %mul = shl nsw i32 %n, 1 ; n * 2
+  %cmp2 = icmp slt i32 %storemerge8, %mul ; i < n * 2
+  br i1 %cmp2, label %for.body, label %if.end
+
+for.body:                                         ; preds = %for.cond
+  %inc = add nsw i32 %ret.0, 1 ; ret++
+  %inc4 = add nsw i32 %storemerge8, 1 ; i++
+  br label %for.cond
+
+if.else:                                          ; preds = %entry
+  br label %for.cond6
+
+for.cond6:                                        ; preds = %for.body9, %if.else
+  %ret.1 = phi i32 [ 0, %if.else ], [ %inc10, %for.body9 ]
+  %storemerge = phi i32 [ 0, %if.else ], [ %inc12, %for.body9 ]
+  %div = sdiv i32 %n, 4 ; n / 4
+  %cmp7 = icmp slt i32 %storemerge, %div ; i < n / 4
+  br i1 %cmp7, label %for.body9, label %if.end
+
+for.body9:                                        ; preds = %for.cond6
+  %inc10 = add nsw i32 %ret.1, 1
+  %inc12 = add nsw i32 %storemerge, 1
+  br label %for.cond6
+
+if.end:                                           ; preds = %for.cond6, %for.cond
+  %ret.2 = phi i32 [ %ret.0, %for.cond ], [ %ret.1, %for.cond6 ] ; ret after the first if/else
+  %cmp14 = icmp sgt i32 %n, 10 ; n > 10 (the branch marked uniform in the C source)
+  %rem175 = and i32 %conv, 1 ; low bit of id
+  %cmp18 = icmp eq i32 %rem175, 0 ; id % 2 == 0, shared by if.then16 and if.else44
+  br i1 %cmp14, label %if.then16, label %if.else44
+
+if.then16:                                        ; preds = %if.end
+  br i1 %cmp18, label %if.then20, label %if.else30
+
+if.then20:                                        ; preds = %if.then16
+  br label %for.cond22
+
+for.cond22:                                       ; preds = %for.body25, %if.then20
+  %ret.3 = phi i32 [ %ret.2, %if.then20 ], [ %inc26, %for.body25 ]
+  %storemerge7 = phi i32 [ 0, %if.then20 ], [ %inc28, %for.body25 ]
+  %add = add nsw i32 %n, 10 ; n + 10
+  %cmp23 = icmp slt i32 %storemerge7, %add
+  br i1 %cmp23, label %for.body25, label %if.end41
+
+for.body25:                                       ; preds = %for.cond22
+  %inc26 = add nsw i32 %ret.3, 1 ; ret++
+  %inc28 = add nsw i32 %storemerge7, 1
+  br label %for.cond22
+
+if.else30:                                        ; preds = %if.then16
+  br label %for.cond32
+
+for.cond32:                                       ; preds = %for.body36, %if.else30
+  %ret.4 = phi i32 [ %ret.2, %if.else30 ], [ %mul37, %for.body36 ]
+  %storemerge6 = phi i32 [ 0, %if.else30 ], [ %inc39, %for.body36 ]
+  %add33 = add nsw i32 %n, 10 ; n + 10
+  %cmp34 = icmp slt i32 %storemerge6, %add33
+  br i1 %cmp34, label %for.body36, label %if.end41
+
+for.body36:                                       ; preds = %for.cond32
+  %mul37 = shl nsw i32 %ret.4, 1 ; ret *= 2
+  %inc39 = add nsw i32 %storemerge6, 1
+  br label %for.cond32
+
+if.end41:                                         ; preds = %for.cond32, %for.cond22
+  %ret.5 = phi i32 [ %ret.3, %for.cond22 ], [ %ret.4, %for.cond32 ]
+  %mul42 = mul nsw i32 %conv, 10 ; id * 10
+  %add43 = add nsw i32 %ret.5, %mul42 ; ret += id * 10
+  br label %if.end73
+
+if.else44:                                        ; preds = %if.end
+  br i1 %cmp18, label %if.then48, label %if.else59
+
+if.then48:                                        ; preds = %if.else44
+  br label %for.cond50
+
+for.cond50:                                       ; preds = %for.body54, %if.then48
+  %ret.6 = phi i32 [ %ret.2, %if.then48 ], [ %inc55, %for.body54 ]
+  %storemerge4 = phi i32 [ 0, %if.then48 ], [ %inc57, %for.body54 ]
+  %add51 = add nsw i32 %n, 8 ; n + 8
+  %cmp52 = icmp slt i32 %storemerge4, %add51
+  br i1 %cmp52, label %for.body54, label %if.end70
+
+for.body54:                                       ; preds = %for.cond50
+  %inc55 = add nsw i32 %ret.6, 1 ; ret++
+  %inc57 = add nsw i32 %storemerge4, 1
+  br label %for.cond50
+
+if.else59:                                        ; preds = %if.else44
+  br label %for.cond61
+
+for.cond61:                                       ; preds = %for.body65, %if.else59
+  %ret.7 = phi i32 [ %ret.2, %if.else59 ], [ %mul66, %for.body65 ]
+  %storemerge2 = phi i32 [ 0, %if.else59 ], [ %inc68, %for.body65 ]
+  %add62 = add nsw i32 %n, 8 ; n + 8
+  %cmp63 = icmp slt i32 %storemerge2, %add62
+  br i1 %cmp63, label %for.body65, label %if.end70
+
+for.body65:                                       ; preds = %for.cond61
+  %mul66 = shl nsw i32 %ret.7, 1 ; ret *= 2
+  %inc68 = add nsw i32 %storemerge2, 1
+  br label %for.cond61
+
+if.end70:                                         ; preds = %for.cond61, %for.cond50
+  %ret.8 = phi i32 [ %ret.6, %for.cond50 ], [ %ret.7, %for.cond61 ]
+  %div71 = sdiv i32 %conv, 2 ; id / 2
+  %add72 = add nsw i32 %ret.8, %div71 ; ret += id / 2
+  br label %if.end73
+
+if.end73:                                         ; preds = %if.end70, %if.end41
+  %storemerge3 = phi i32 [ %add72, %if.end70 ], [ %add43, %if.end41 ]
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %storemerge3, i32 addrspace(1)* %arrayidx, align 4 ; out[id] = ret
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nobuiltin nounwind readonly }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.kernels = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization0, !3, !4, !5, !6, !7, !8}
+!3 = !{!"kernel_arg_addr_space", i32 1, i32 0}
+!4 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!5 = !{!"kernel_arg_type", !"int*", !"int"}
+!6 = !{!"kernel_arg_base_type", !"int*", !"int"}
+!7 = !{!"kernel_arg_type_qual", !"", !""}
+!8 = !{!"kernel_arg_name", !"out", !"n"}
+
+; CHECK: spir_kernel void @__vecz_v4_partial_linearization0
+; CHECK: br i1 %{{.+}}, label %[[FORCONDPREHEADERUNIFORM:.+]], label %[[ENTRYBOSCCINDIR:.+]]
+
+; CHECK: [[FORCOND6PREHEADERUNIFORM:.+]]:
+; CHECK: br label %[[FORCOND6UNIFORM:.+]]
+
+; CHECK: [[FORCOND6UNIFORM]]:
+; CHECK: %[[CMP7UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMP7UNIFORM]], label %[[FORBODY9UNIFORM:.+]], label %[[IFENDLOOPEXIT3UNIFORM:.+]]
+
+; CHECK: [[FORBODY9UNIFORM]]:
+; CHECK: br label %[[FORCOND6UNIFORM]]
+
+; CHECK: [[IFENDLOOPEXIT3UNIFORM]]:
+; CHECK: br label %[[IFEND:.+]]
+
+; CHECK: [[FORCONDPREHEADERUNIFORM]]:
+; CHECK: br label %[[FORCONDUNIFORM:.+]]
+
+; CHECK: [[ENTRYBOSCCINDIR]]:
+; CHECK: br i1 %{{.+}}, label %[[FORCOND6PREHEADERUNIFORM]], label %[[FORCOND6PREHEADER:.+]]
+
+; CHECK: [[FORCONDUNIFORM]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODYUNIFORM:.+]], label %[[IFENDLOOPEXITUNIFORM:.+]]
+
+; CHECK: [[FORBODYUNIFORM]]:
+; CHECK: br label %[[FORCONDUNIFORM]]
+
+; CHECK: [[IFENDLOOPEXITUNIFORM]]:
+; CHECK: br label %[[IFEND]]
+
+; CHECK: [[FORCOND6PREHEADER]]:
+; CHECK: br label %[[FORCOND6:.+]]
+
+; CHECK: [[FORCONDPREHEADER:.+]]:
+; CHECK: br label %[[FORCOND:.+]]
+
+; CHECK: [[FORCOND]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[IFENDLOOPEXIT:.+]]
+
+; CHECK: [[FORBODY]]:
+; CHECK: br label %[[FORCOND]]
+
+; CHECK: [[FORCOND6]]:
+; CHECK: %[[CMP7:.+]] = icmp
+; CHECK: br i1 %[[CMP7]], label %[[FORBODY9:.+]], label %[[IFENDLOOPEXIT3:.+]]
+
+; CHECK: [[FORBODY9]]:
+; CHECK: br label %[[FORCOND6]]
+
+; CHECK: [[IFENDLOOPEXIT]]:
+; CHECK: br label %[[IFEND]]
+
+; CHECK: [[IFENDLOOPEXIT3]]:
+; CHECK: br label %[[FORCONDPREHEADER]]
+
+; CHECK: [[IFEND]]:
+; CHECK: %[[CMP14:.+]] = icmp
+; CHECK: br i1 %[[CMP14]], label %[[IFTHEN16:.+]], label %[[IFELSE44:.+]]
+
+; CHECK: [[IFTHEN16]]:
+; CHECK: br i1 %{{.+}}, label %[[FORCOND22PREHEADERUNIFORM:.+]], label %[[IFTHEN16BOSCCINDIR:.+]]
+
+; CHECK: [[FORCOND32PREHEADERUNIFORM:.+]]:
+; CHECK: br label %[[FORCOND32UNIFORM:.+]]
+
+; CHECK: [[FORCOND32UNIFORM]]:
+; CHECK: %[[CMP34UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMP34UNIFORM]], label %[[FORBODY36UNIFORM:.+]], label %[[IFEND41LOOPEXIT1UNIFORM:.+]]
+
+; CHECK: [[FORBODY36UNIFORM]]:
+; CHECK: br label %[[FORCOND32UNIFORM]]
+
+; CHECK: [[IFEND41LOOPEXIT1UNIFORM]]:
+; CHECK: br label %[[IFEND41UNIFORM:.+]]
+
+; CHECK: [[FORCOND22PREHEADERUNIFORM]]:
+; CHECK: br label %[[FORCOND22UNIFORM:.+]]
+
+; CHECK: [[IFTHEN16BOSCCINDIR]]:
+; CHECK: br i1 %{{.+}}, label %[[FORCOND32PREHEADERUNIFORM]], label %[[FORCOND32PREHEADER:.+]]
+
+; CHECK: [[FORCOND22UNIFORM]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY25UNIFORM:.+]], label %[[IFEND41LOOPEXITUNIFORM:.+]]
+
+; CHECK: [[FORBODY25UNIFORM]]:
+; CHECK: br label %[[FORCOND22UNIFORM]]
+
+; CHECK: [[IFEND41LOOPEXITUNIFORM]]:
+; CHECK: br label %[[IFEND41:.+]]
+
+; CHECK: [[FORCOND32PREHEADER]]:
+; CHECK: br label %[[FORCOND32:.+]]
+
+; CHECK: [[FORCOND22PREHEADER:.+]]:
+; CHECK: br label %[[FORCOND22:.+]]
+
+; CHECK: [[FORCOND22]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY25:.+]], label %[[IFEND41LOOPEXIT:.+]]
+
+; CHECK: [[FORBODY25]]:
+; CHECK: br label %[[FORCOND22]]
+
+; CHECK: [[FORCOND32]]:
+; CHECK: %[[CMP34:.+]] = icmp
+; CHECK: br i1 %[[CMP34]], label %[[FORBODY36:.+]], label %[[IFEND41LOOPEXIT1:.+]]
+
+; CHECK: [[FORBODY36]]:
+; CHECK: br label %[[FORCOND32]]
+
+; CHECK: [[IFEND41LOOPEXIT]]:
+; CHECK: br label %[[IFEND41]]
+
+; CHECK: [[IFEND41LOOPEXIT1]]:
+; CHECK: br label %[[FORCOND22PREHEADER]]
+
+; CHECK: [[IFEND41]]:
+; CHECK: br label %[[IFEND73:.+]]
+
+; CHECK: [[IFELSE44]]:
+; CHECK: br i1 %{{.+}}, label %[[FORCOND50PREHEADERUNIFORM:.+]], label %[[IFELSE44BOSCCINDIR:.+]]
+
+; CHECK: [[FORCOND61PREHEADERUNIFORM:.+]]:
+; CHECK: br label %[[FORCOND61UNIFORM:.+]]
+
+; CHECK: [[FORCOND61UNIFORM]]:
+; CHECK: %[[CMP63UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMP63UNIFORM]], label %[[FORBODY65UNIFORM:.+]], label %[[IFEND70LOOPEXIT2UNIFORM:.+]]
+
+; CHECK: [[FORBODY65UNIFORM]]:
+; CHECK: br label %[[FORCOND61UNIFORM]]
+
+; CHECK: [[IFEND70LOOPEXIT2UNIFORM]]:
+; CHECK: br label %[[IFEND70UNIFORM:.+]]
+
+; CHECK: [[FORCOND50PREHEADERUNIFORM]]:
+; CHECK: br label %[[FORCOND50UNIFORM:.+]]
+
+; CHECK: [[IFELSE44BOSCCINDIR]]:
+; CHECK: br i1 %{{.+}}, label %[[FORCOND61PREHEADERUNIFORM]], label %[[FORCOND61PREHEADER:.+]]
+
+; CHECK: [[FORCOND50UNIFORM]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY54UNIFORM:.+]], label %[[IFEND70LOOPEXITUNIFORM:.+]]
+
+; CHECK: [[FORBODY54UNIFORM]]:
+; CHECK: br label %[[FORCOND50UNIFORM]]
+
+; CHECK: [[IFEND70LOOPEXITUNIFORM]]:
+; CHECK: br label %[[IFEND70:.+]]
+
+; CHECK: [[FORCOND61PREHEADER]]:
+; CHECK: br label %[[FORCOND61:.+]]
+
+; CHECK: [[FORCOND50PREHEADER:.+]]:
+; CHECK: br label %[[FORCOND50:.+]]
+
+; CHECK: [[FORCOND50]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY54:.+]], label %[[IFEND70LOOPEXIT:.+]]
+
+; CHECK: [[FORBODY54]]:
+; CHECK: br label %[[FORCOND50]]
+
+; CHECK: [[FORCOND61]]:
+; CHECK: %[[CMP63:.+]] = icmp
+; CHECK: br i1 %[[CMP63]], label %[[FORBODY65:.+]], label %[[IFEND70LOOPEXIT2:.+]]
+
+; CHECK: [[FORBODY65]]:
+; CHECK: br label %[[FORCOND61]]
+
+; CHECK: [[IFEND70LOOPEXIT]]:
+; CHECK: br label %[[IFEND70]]
+
+; CHECK: [[IFEND70LOOPEXIT2]]:
+; CHECK: br label %[[FORCOND50PREHEADER]]
+
+; CHECK: [[IFEND70]]:
+; CHECK: br label %[[IFEND73]]
+
+; CHECK: [[IFEND73]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization1.ll
new file mode 100644
index 0000000000000..a49b7b97e2cab
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization1.ll
@@ -0,0 +1,320 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k partial_linearization1 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+
+; The CFG of the following kernel is:
+;
+;       a
+;       |
+;       b <-.
+;      / \  |
+;     c   d |
+;    / \ /  |
+;   e   f --'
+;    \  |
+;     \ g
+;      \|
+;       h
+;
+; * where nodes c and f are uniform branches, and node b is a varying
+;   branch.
+; * where nodes c, d, e, f, g and h are divergent.
+;
+; With BOSCC, it will be transformed as follows:
+;
+;       a
+;       |
+;       b <-.  b' <.
+;      / \__|_ |   |
+;     c   d | `d'  |
+;    / \ /  |  |   |
+;   e   f --'  c'  |
+;    \  |      |   |
+;     \ g      f' -'
+;      \|      |
+;       h      g'
+;       |      |
+;       |      e'
+;       |      |
+;       |      h'
+;       \     /
+;        \   /
+;         \ /
+;          &
+;
+; where '&' represents merge blocks of BOSCC regions.
+;
+; __kernel void partial_linearization1(__global int *out, int n) {
+;   int id = get_global_id(0);
+;   int ret = 0;
+;   int i = 0;
+;
+;   while (1) {
+;     if (id + i % 2 == 0) {
+;       if (n > 2) {
+;         goto e;
+;       }
+;     } else {
+;       for (int i = 0; i < n + 10; i++) ret++;
+;     }
+;     if (n <= 2) break;
+;   }
+;
+;   ret += n * 2;
+;   for (int i = 0; i < n * 2; i++) ret -= i;
+;   ret /= n;
+;   goto early;
+;
+; e:
+;   for (int i = 0; i < n + 5; i++) ret /= 2;
+;   ret -= n;
+;
+; early:
+;   out[id] = ret;
+; }
+
+; ModuleID = 'Unknown buffer'
+source_filename = "Unknown buffer"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @partial_linearization1(i32 addrspace(1)* %out, i32 %n) #0 {
+entry:                                            ; CFG node a (node letters refer to the diagram in this test's header comment)
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2  ; get_global_id(0) in the kernel source above
+  %conv = trunc i64 %call to i32
+  br label %while.body
+
+while.body:                                       ; preds = %if.end14, %entry
+  %ret.0 = phi i32 [ 0, %entry ], [ %ret.2, %if.end14 ]
+  %cmp = icmp eq i32 %conv, 0  ; computed from the work-item id
+  br i1 %cmp, label %if.then, label %if.else  ; node b: the varying branch
+
+if.then:                                          ; preds = %while.body
+  %cmp2 = icmp sgt i32 %n, 2  ; depends only on the kernel argument %n
+  br i1 %cmp2, label %e, label %if.end10  ; node c: uniform branch (c -> e or c -> f)
+
+if.else:                                          ; preds = %while.body
+  br label %for.cond  ; node d: the "i < n + 10" loop
+
+for.cond:                                         ; preds = %for.body, %if.else
+  %ret.1 = phi i32 [ %ret.0, %if.else ], [ %inc, %for.body ]
+  %storemerge = phi i32 [ 0, %if.else ], [ %inc9, %for.body ]
+  %add6 = add nsw i32 %n, 10
+  %cmp7 = icmp slt i32 %storemerge, %add6
+  br i1 %cmp7, label %for.body, label %if.end10  ; the loop exit is the d -> f edge
+
+for.body:                                         ; preds = %for.cond
+  %inc = add nsw i32 %ret.1, 1
+  %inc9 = add nsw i32 %storemerge, 1
+  br label %for.cond
+
+if.end10:                                         ; preds = %for.cond, %if.then
+  %ret.2 = phi i32 [ %ret.0, %if.then ], [ %ret.1, %for.cond ]
+  %cmp11 = icmp slt i32 %n, 3  ; "n <= 2" in the kernel source; depends only on %n
+  br i1 %cmp11, label %while.end, label %if.end14  ; node f: uniform exit test of the while loop
+
+if.end14:                                         ; preds = %if.end10
+  br label %while.body  ; back edge f -> b
+
+while.end:                                        ; preds = %if.end10
+  %mul = mul i32 %n, 2  ; node g starts here: "ret += n * 2"
+  %add15 = add nsw i32 %ret.2, %mul
+  br label %for.cond17
+
+for.cond17:                                       ; preds = %for.body21, %while.end
+  %ret.3 = phi i32 [ %add15, %while.end ], [ %sub, %for.body21 ]
+  %storemerge1 = phi i32 [ 0, %while.end ], [ %inc23, %for.body21 ]
+  %mul18 = shl nsw i32 %n, 1
+  %cmp19 = icmp slt i32 %storemerge1, %mul18
+  br i1 %cmp19, label %for.body21, label %for.end24
+
+for.body21:                                       ; preds = %for.cond17
+  %sub = sub nsw i32 %ret.3, %storemerge1
+  %inc23 = add nsw i32 %storemerge1, 1
+  br label %for.cond17
+
+for.end24:                                        ; preds = %for.cond17
+  %0 = icmp eq i32 %ret.3, -2147483648  ; %0..%5 select a safe divisor for the sdiv below
+  %1 = icmp eq i32 %n, -1  ; minimum i32 value divided by -1 ...
+  %2 = and i1 %1, %0
+  %3 = icmp eq i32 %n, 0  ; ... or a zero divisor
+  %4 = or i1 %3, %2
+  %5 = select i1 %4, i32 1, i32 %n  ; in either case divide by 1 instead of %n
+  %div = sdiv i32 %ret.3, %5  ; "ret /= n"
+  br label %early  ; edge g -> h
+
+e:                                                ; preds = %if.then
+  br label %for.cond26  ; node e: the "i < n + 5" loop
+
+for.cond26:                                       ; preds = %for.body30, %e
+  %ret.4 = phi i32 [ %ret.0, %e ], [ %div31, %for.body30 ]
+  %storemerge3 = phi i32 [ 0, %e ], [ %inc33, %for.body30 ]
+  %add27 = add nsw i32 %n, 5
+  %cmp28 = icmp slt i32 %storemerge3, %add27
+  br i1 %cmp28, label %for.body30, label %for.end34
+
+for.body30:                                       ; preds = %for.cond26
+  %div31 = sdiv i32 %ret.4, 2
+  %inc33 = add nsw i32 %storemerge3, 1
+  br label %for.cond26
+
+for.end34:                                        ; preds = %for.cond26
+  %sub35 = sub nsw i32 %ret.4, %n  ; "ret -= n"
+  br label %early  ; edge e -> h
+
+early:                                            ; preds = %for.end34, %for.end24
+  %storemerge2 = phi i32 [ %div, %for.end24 ], [ %sub35, %for.end34 ]
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %storemerge2, i32 addrspace(1)* %arrayidx, align 4  ; node h: "out[id] = ret"
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nobuiltin nounwind readonly }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.kernels = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization1, !3, !4, !5, !6, !7, !8}
+!3 = !{!"kernel_arg_addr_space", i32 1, i32 0}
+!4 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!5 = !{!"kernel_arg_type", !"int*", !"int"}
+!6 = !{!"kernel_arg_base_type", !"int*", !"int"}
+!7 = !{!"kernel_arg_type_qual", !"", !""}
+!8 = !{!"kernel_arg_name", !"out", !"n"}
+
+; CHECK: spir_kernel void @__vecz_v4_partial_linearization1
+; CHECK: br i1 true, label %[[WHILEBODYUNIFORM:.+]], label %[[WHILEBODY:.+]]
+
+; CHECK: [[WHILEBODY]]:
+; CHECK: br label %[[FORCONDPREHEADER:.+]]
+
+; CHECK: [[WHILEBODYUNIFORM:.+]]:
+; CHECK: br i1 %{{.+}}, label %[[IFTHENUNIFORM:.+]], label %[[WHILEBODYUNIFORMBOSCCINDIR:.+]]
+
+; CHECK: [[FORCONDPREHEADERUNIFORM:.+]]:
+; CHECK: br label %[[FORCONDUNIFORM:.+]]
+
+; CHECK: [[FORCONDUNIFORM]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODYUNIFORM:.+]], label %[[IFEND10LOOPEXITUNIFORM:.+]]
+
+; CHECK: [[FORBODYUNIFORM]]:
+; CHECK: br label %[[FORCONDUNIFORM]]
+
+; CHECK: [[IFEND10LOOPEXITUNIFORM]]:
+; CHECK: br label %[[IFEND10UNIFORM:.+]]
+
+; CHECK: [[IFTHENUNIFORM]]:
+; CHECK: %[[CMP2UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMP2UNIFORM]], label %[[FORCOND26PREHEADERUNIFORM:.+]], label %[[IFEND10UNIFORM]]
+
+; CHECK: [[WHILEBODYUNIFORMBOSCCINDIR]]:
+; CHECK: br i1 %{{.+}}, label %[[FORCONDPREHEADERUNIFORM]], label %[[WHILEBODYUNIFORMBOSCCSTORE:.+]]
+
+; CHECK: [[WHILEBODYUNIFORMBOSCCSTORE]]:
+; CHECK: br label %[[FORCONDPREHEADER]]
+
+; CHECK: [[IFEND10UNIFORM]]:
+; CHECK: %[[CMP11UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMP11UNIFORM]], label %[[WHILEENDUNIFORM:.+]], label %[[WHILEBODYUNIFORM]]
+
+; CHECK: [[WHILEENDUNIFORM]]:
+; CHECK: br label %[[FORCOND17UNIFORM:.+]]
+
+; CHECK: [[FORCOND17UNIFORM]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY21UNIFORM:.+]], label %[[FOREND24UNIFORM:.+]]
+
+; CHECK: [[FORBODY21UNIFORM]]:
+; CHECK: br label %[[FORCOND17UNIFORM]]
+
+; CHECK: [[FOREND24UNIFORM]]:
+; CHECK: br label %[[EARLYUNIFORM:.+]]
+
+; CHECK: [[FORCOND26PREHEADERUNIFORM]]:
+; CHECK: br label %[[FORCOND26UNIFORM:.+]]
+
+; CHECK: [[FORCOND26UNIFORM]]:
+; CHECK: %[[CMP28UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMP28UNIFORM]], label %[[FORBODY30UNIFORM:.+]], label %[[FOREND34UNIFORM:.+]]
+
+; CHECK: [[FORBODY30UNIFORM]]:
+; CHECK: br label %[[FORCOND26UNIFORM]]
+
+; CHECK: [[FOREND34UNIFORM]]:
+; CHECK: br label %[[EARLY:.+]]
+
+; CHECK: [[FORCONDPREHEADER]]:
+; CHECK: br label %[[FORCOND:.+]]
+
+; CHECK: [[IFTHEN:.+]]:
+; CHECK: br label %[[IFEND10:.+]]
+
+; CHECK: [[FORCOND26PREHEADER:.+]]:
+; CHECK: br label %[[FORCOND26:.+]]
+
+; CHECK: [[FORCOND]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[IFEND10LOOPEXIT:.+]]
+
+; CHECK: [[FORBODY]]:
+; CHECK: br label %[[FORCOND]]
+
+; CHECK: [[IFEND10LOOPEXIT]]:
+; CHECK: br label %[[IFTHEN]]
+
+; CHECK: [[IFEND10]]:
+; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]]
+
+; CHECK: [[WHILEBODYPUREEXIT]]:
+; CHECK: br label %[[WHILEEND:.+]]
+
+; CHECK: [[WHILEEND]]:
+; CHECK: br label %[[FORCOND17:.+]]
+
+; CHECK: [[WHILEENDELSE:.+]]:
+; CHECK: br label %[[FORCOND26PREHEADER]]
+
+; CHECK: [[FORCOND17]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY21:.+]], label %[[FOREND24:.+]]
+
+; CHECK: [[FORBODY21]]:
+; CHECK: br label %[[FORCOND17]]
+
+; CHECK: [[FOREND24]]:
+; CHECK: br label %[[WHILEENDELSE]]
+
+; CHECK: [[FORCOND26]]:
+; CHECK: %[[CMP28:.+]] = icmp
+; CHECK: br i1 %[[CMP28]], label %[[FORBODY30:.+]], label %[[FOREND34:.+]]
+
+; CHECK: [[FORBODY30]]:
+; CHECK: br label %[[FORCOND26]]
+
+; CHECK: [[FOREND34]]:
+; CHECK: br label %[[EARLY]]
+
+; CHECK: [[EARLY]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization10.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization10.ll
new file mode 100644
index 0000000000000..683031b2e4574
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization10.ll
@@ -0,0 +1,568 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k partial_linearization10 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+
+; The CFG of the following kernel is:
+;
+;            a
+;            |
+;            b <-----.
+;           / \      |
+;          c   d     |
+;         / \ /      |
+;        /   e       |
+;       /    |       |
+;      /     g <---. |
+;     /     / \    | |
+;    /     h   i   | |
+;   f     / \ / \  | |
+;   |    j   k   l | |
+;   |   /|  / \ /  | |
+;   |  m | n   o --' |
+;   | /  |/          |
+;   |/   q ----------'
+;   p    |
+;    \   r
+;     \ /
+;      s
+;
+; * where nodes b, c, g, h, j, k and q are uniform branches, and node i is a
+;   varying branch.
+; * where nodes k, l, o, n, m, p, q, r and s are divergent.
+;
+; With BOSCC, it will be transformed as follows:
+;
+;            a
+;            |
+;            b <-----.        b' <-----.
+;           / \      |       / \       |
+;          c   d     |      c'  d'     |
+;         / \ /      |     / \ /       |
+;        /   e       |    /   e'       |
+;       /    |       |   /    |        |
+;      /     g <---. |  /     g' <---. |
+;     /     / \    | | f'    / \     | |
+;    /     h   i___|_|_|____/__ \    | |
+;   f     / \ / \  | | |   h'  \ i'  | |
+;   |    j   k   l | | |  / \   \|   | |
+;   |   /|  / \ /  | | | j'  |   l'  | |
+;   |  m | n   o --' | | |    \ /    | |
+;   | /  |/          | | |     k'    | |
+;   |/   q ----------' |  \    |     | |
+;   p    |             |   \   o' ---' |
+;    \   r             |    \ /        |
+;     \ /              |     n'        |
+;      s                \    |         |
+;      |                 \   q' -------'
+;      |                  \ /
+;      |                   m'
+;      |                   |
+;      |                   r'
+;      |                   |
+;      |                   p'
+;      |                   |
+;      `-------> & <------ s'
+;
+; where '&' represents merge blocks of BOSCC regions.
+;
+; __kernel void partial_linearization10(__global int *out, int n) {
+;   int id = get_global_id(0);
+;   int ret = 0;
+;
+;   while (1) {
+;     if (n > 0) { // b
+;       // c
+;       for (int i = 0; i < n * 2; i++) ret++;
+;       if (n <= 10) {
+;         // f
+;         goto f;
+;       }
+;     } else {
+;       // d
+;       for (int i = 0; i < n / 4; i++) ret++;
+;     }
+;     // e
+;     ret++;
+;     while (1) {
+;       if (n & 1) { // g
+;         // h
+;         if (n < 3) {
+;           // j
+;           goto j;
+;         }
+;       } else {
+;         // i
+;         if (ret + id >= n) {
+;           // l
+;           ret /= n * n + ret;
+;           goto o;
+;         }
+;       }
+;       // k
+;       if (n & 1) {
+;         // n
+;         ret += n * ret;
+;         goto n;
+;       }
+;       // o
+; o:
+;       ret++;
+;     }
+; j:
+;     if (n < 2) {
+;       // m
+;       ret += n * 2 + 20;
+;       goto p;
+;     } else {
+;       goto q;
+;     }
+; n:
+;     ret *= 4;
+; q:
+;     if (n & 1) {
+;       // r
+;       ret++;
+;       goto r;
+;     }
+;   }
+;
+; r:
+;   for (int i = 0; i < n / 4; i++) ret++;
+;   goto s;
+;
+; f:
+;   ret /= n;
+;   goto p;
+;
+; p:
+;   for (int i = 0; i < n * 2; i++) ret++;
+;
+; s:
+;   out[id] = ret;
+; }
+
+; ModuleID = 'Unknown buffer'
+source_filename = "Unknown buffer"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @partial_linearization10(i32 addrspace(1)* %out, i32 %n) #0 {
+entry:                                            ; CFG node a (node letters refer to the diagram in this test's header comment)
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2  ; get_global_id(0) in the kernel source above
+  %conv = trunc i64 %call to i32
+  br label %while.body
+
+while.body:                                       ; preds = %if.end55, %entry
+  %ret.0 = phi i32 [ 0, %entry ], [ %ret.5, %if.end55 ]
+  %cmp = icmp sgt i32 %n, 0
+  br i1 %cmp, label %if.then, label %if.else  ; node b: uniform branch, depends only on %n
+
+if.then:                                          ; preds = %while.body
+  br label %for.cond  ; node c: the "i < n * 2" loop
+
+for.cond:                                         ; preds = %for.body, %if.then
+  %ret.1 = phi i32 [ %ret.0, %if.then ], [ %inc, %for.body ]
+  %storemerge5 = phi i32 [ 0, %if.then ], [ %inc4, %for.body ]
+  %mul = shl nsw i32 %n, 1
+  %cmp2 = icmp slt i32 %storemerge5, %mul
+  br i1 %cmp2, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %inc = add nsw i32 %ret.1, 1
+  %inc4 = add nsw i32 %storemerge5, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %cmp5 = icmp slt i32 %n, 11  ; "n <= 10" in the kernel source
+  br i1 %cmp5, label %f, label %if.end17  ; uniform branch of node c (c -> f or c -> e)
+
+if.else:                                          ; preds = %while.body
+  br label %for.cond9  ; node d: the "i < n / 4" loop
+
+for.cond9:                                        ; preds = %for.body12, %if.else
+  %ret.2 = phi i32 [ %ret.0, %if.else ], [ %inc13, %for.body12 ]
+  %storemerge = phi i32 [ 0, %if.else ], [ %inc15, %for.body12 ]
+  %div = sdiv i32 %n, 4
+  %cmp10 = icmp slt i32 %storemerge, %div
+  br i1 %cmp10, label %for.body12, label %if.end17  ; the loop exit is the d -> e edge
+
+for.body12:                                       ; preds = %for.cond9
+  %inc13 = add nsw i32 %ret.2, 1
+  %inc15 = add nsw i32 %storemerge, 1
+  br label %for.cond9
+
+if.end17:                                         ; preds = %for.cond9, %for.end
+  %ret.3 = phi i32 [ %ret.1, %for.end ], [ %ret.2, %for.cond9 ]
+  br label %while.body20  ; node e
+
+while.body20:                                     ; preds = %o, %if.end17
+  %storemerge1.in = phi i32 [ %ret.3, %if.end17 ], [ %ret.4, %o ]
+  %storemerge1 = add nsw i32 %storemerge1.in, 1  ; the "ret++" of nodes e and o, applied once at the inner loop header
+  %and = and i32 %n, 1
+  %tobool = icmp eq i32 %and, 0
+  br i1 %tobool, label %if.else26, label %if.then21  ; node g: uniform branch on "n & 1"
+
+if.then21:                                        ; preds = %while.body20
+  %cmp22 = icmp slt i32 %n, 3
+  br i1 %cmp22, label %j, label %if.end34  ; node h: uniform branch, depends only on %n
+
+if.else26:                                        ; preds = %while.body20
+  %add = add nsw i32 %storemerge1, %conv  ; "ret + id": uses the work-item id
+  %cmp27 = icmp slt i32 %add, %n
+  br i1 %cmp27, label %if.end34, label %if.then29  ; node i: the varying branch
+
+if.then29:                                        ; preds = %if.else26
+  %mul30 = mul nsw i32 %n, %n
+  %add31 = add nsw i32 %storemerge1, %mul30
+  %0 = icmp eq i32 %add31, 0  ; node l: a zero divisor is replaced by 1 before the sdiv
+  %1 = select i1 %0, i32 1, i32 %add31
+  %div32 = sdiv i32 %storemerge1, %1  ; "ret /= n * n + ret"
+  br label %o
+
+if.end34:                                         ; preds = %if.else26, %if.then21
+  %and35 = and i32 %n, 1
+  %tobool36 = icmp eq i32 %and35, 0
+  br i1 %tobool36, label %o, label %if.then37  ; node k: uniform branch on "n & 1"
+
+if.then37:                                        ; preds = %if.end34
+  %mul38 = mul nsw i32 %storemerge1, %n
+  %add39 = add nsw i32 %mul38, %storemerge1  ; "ret += n * ret"
+  %mul50 = shl nsw i32 %add39, 2  ; "ret *= 4"
+  br label %q  ; node n -> q
+
+o:                                                ; preds = %if.end34, %if.then29
+  %ret.4 = phi i32 [ %div32, %if.then29 ], [ %storemerge1, %if.end34 ]
+  br label %while.body20  ; back edge o -> g
+
+j:                                                ; preds = %if.then21
+  %cmp42 = icmp eq i32 %n, 2
+  br i1 %cmp42, label %q, label %if.then44  ; node j: uniform branch, depends only on %n
+
+if.then44:                                        ; preds = %j
+  %mul45 = mul i32 %n, 2
+  %add46 = add nsw i32 %mul45, 20
+  %add47 = add nsw i32 %add46, %storemerge1  ; "ret += n * 2 + 20"
+  br label %p  ; node m -> p
+
+q:                                                ; preds = %j, %if.then37
+  %ret.5 = phi i32 [ %mul50, %if.then37 ], [ %storemerge1, %j ]
+  %and51 = and i32 %n, 1
+  %tobool52 = icmp eq i32 %and51, 0
+  br i1 %tobool52, label %if.end55, label %if.then53  ; node q: uniform branch on "n & 1"
+
+if.then53:                                        ; preds = %q
+  br label %for.cond57  ; node r
+
+if.end55:                                         ; preds = %q
+  br label %while.body  ; back edge q -> b
+
+for.cond57:                                       ; preds = %for.body61, %if.then53
+  %ret.6.in = phi i32 [ %ret.5, %if.then53 ], [ %ret.6, %for.body61 ]
+  %storemerge2 = phi i32 [ 0, %if.then53 ], [ %inc64, %for.body61 ]
+  %ret.6 = add nsw i32 %ret.6.in, 1
+  %div58 = sdiv i32 %n, 4
+  %cmp59 = icmp slt i32 %storemerge2, %div58
+  br i1 %cmp59, label %for.body61, label %s  ; the loop exit is the r -> s edge
+
+for.body61:                                       ; preds = %for.cond57
+  %inc64 = add nsw i32 %storemerge2, 1
+  br label %for.cond57
+
+f:                                                ; preds = %for.end
+  %2 = icmp eq i32 %ret.1, -2147483648  ; %2..%7 select a safe divisor for the sdiv below
+  %3 = icmp eq i32 %n, -1  ; minimum i32 value divided by -1 ...
+  %4 = and i1 %3, %2
+  %5 = icmp eq i32 %n, 0  ; ... or a zero divisor
+  %6 = or i1 %5, %4
+  %7 = select i1 %6, i32 1, i32 %n  ; in either case divide by 1 instead of %n
+  %div66 = sdiv i32 %ret.1, %7  ; node f: "ret /= n"
+  br label %p
+
+p:                                                ; preds = %f, %if.then44
+  %storemerge3 = phi i32 [ %add47, %if.then44 ], [ %div66, %f ]
+  br label %for.cond68  ; node p: the final "i < n * 2" loop
+
+for.cond68:                                       ; preds = %for.body72, %p
+  %ret.7 = phi i32 [ %storemerge3, %p ], [ %inc73, %for.body72 ]
+  %storemerge4 = phi i32 [ 0, %p ], [ %inc75, %for.body72 ]
+  %mul69 = shl nsw i32 %n, 1
+  %cmp70 = icmp slt i32 %storemerge4, %mul69
+  br i1 %cmp70, label %for.body72, label %s  ; the loop exit is the p -> s edge
+
+for.body72:                                       ; preds = %for.cond68
+  %inc73 = add nsw i32 %ret.7, 1
+  %inc75 = add nsw i32 %storemerge4, 1
+  br label %for.cond68
+
+s:                                                ; preds = %for.cond68, %for.cond57
+  %ret.8 = phi i32 [ %ret.6, %for.cond57 ], [ %ret.7, %for.cond68 ]
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %ret.8, i32 addrspace(1)* %arrayidx, align 4  ; node s: "out[id] = ret"
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nobuiltin nounwind readonly }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.kernels = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization10, !3, !4, !5, !6, !7, !8}
+!3 = !{!"kernel_arg_addr_space", i32 1, i32 0}
+!4 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!5 = !{!"kernel_arg_type", !"int*", !"int"}
+!6 = !{!"kernel_arg_base_type", !"int*", !"int"}
+!7 = !{!"kernel_arg_type_qual", !"", !""}
+!8 = !{!"kernel_arg_name", !"out", !"n"}
+
+; CHECK: spir_kernel void @__vecz_v4_partial_linearization10
+; CHECK: br i1 true, label %[[WHILEBODYUNIFORM:.+]], label %[[WHILEBODY:.+]]
+
+; CHECK: [[WHILEBODY]]:
+; CHECK: %[[CMP:.+]] = icmp
+; CHECK: br i1 %[[CMP]], label %[[FORCONDPREHEADER:.+]], label %[[FORCOND9PREHEADER:.+]]
+
+; CHECK: [[FORCOND9PREHEADER]]:
+; CHECK: br label %[[FORCOND9:.+]]
+
+; CHECK: [[FORCONDPREHEADER]]:
+; CHECK: br label %[[FORCOND:.+]]
+
+; CHECK: [[FORCOND]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[FOREND:.+]]
+
+; CHECK: [[FORBODY]]:
+; CHECK: br label %[[FORCOND]]
+
+; CHECK: [[FOREND]]:
+; CHECK: %[[CMP5:.+]] = icmp
+; CHECK: br i1 %[[CMP5]], label %[[F:.+]], label %[[IFEND17:.+]]
+
+; CHECK: [[FORCOND9]]:
+; CHECK: %[[CMP10:.+]] = icmp
+; CHECK: br i1 %[[CMP10]], label %[[FORBODY12:.+]], label %[[IFEND17LOOPEXIT:.+]]
+
+; CHECK: [[FORBODY12]]:
+; CHECK: br label %[[FORCOND9]]
+
+; CHECK: [[IFEND17LOOPEXIT]]:
+; CHECK: br label %[[IFEND17]]
+
+; CHECK: [[IFEND17]]:
+; CHECK: br label %[[WHILEBODY20:.+]]
+
+; CHECK: [[WHILEBODY20]]:
+; CHECK: %[[TOBOOL:.+]] = icmp
+; CHECK: br i1 %[[TOBOOL]], label %[[IFELSE26:.+]], label %[[IFTHEN21:.+]]
+
+; CHECK: [[IFTHEN21]]:
+; CHECK: %[[CMP22:.+]] = icmp
+; CHECK: br i1 %[[CMP22]], label %[[J:.+]], label %[[IFEND34:.+]]
+
+; CHECK: [[IFELSE26]]:
+; CHECK: br label %[[IFTHEN29:.+]]
+
+; CHECK: [[WHILEBODYUNIFORM:.+]]:
+; CHECK: %[[CMPUNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMPUNIFORM]], label %[[FORCONDPREHEADERUNIFORM:.+]], label %[[FORCOND9PREHEADERUNIFORM:.+]]
+
+; CHECK: [[FORCOND9PREHEADERUNIFORM]]:
+; CHECK: br label %[[FORCOND9UNIFORM:.+]]
+
+; CHECK: [[FORCOND9UNIFORM]]:
+; CHECK: %[[CMP10UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMP10UNIFORM]], label %[[FORBODY12UNIFORM:.+]], label %[[IFEND17LOOPEXITUNIFORM:.+]]
+
+; CHECK: [[FORBODY12UNIFORM]]:
+; CHECK: br label %[[FORCOND9UNIFORM]]
+
+; CHECK: [[IFEND17LOOPEXITUNIFORM]]:
+; CHECK: br label %[[IFEND17UNIFORM:.+]]
+
+; CHECK: [[FORCONDPREHEADERUNIFORM]]:
+; CHECK: br label %[[FORCONDUNIFORM:.+]]
+
+; CHECK: [[FORCONDUNIFORM]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODYUNIFORM:.+]], label %[[FORENDUNIFORM:.+]]
+
+; CHECK: [[FORBODYUNIFORM]]:
+; CHECK: br label %[[FORCONDUNIFORM]]
+
+; CHECK: [[FORENDUNIFORM]]:
+; CHECK: %[[CMP5UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMP5UNIFORM]], label %[[FUNIFORM:.+]], label %[[IFEND17UNIFORM]]
+
+; CHECK: [[IFEND17UNIFORM]]:
+; CHECK: br label %[[WHILEBODY20UNIFORM:.+]]
+
+; CHECK: [[WHILEBODY20UNIFORM]]:
+; CHECK: %[[TOBOOLUNIFORM:.+]] = icmp
+; CHECK: br i1 %[[TOBOOLUNIFORM]], label %[[IFELSE26UNIFORM:.+]], label %[[IFTHEN21UNIFORM:.+]]
+
+; CHECK: [[IFTHEN21UNIFORM]]:
+; CHECK: %[[CMP22UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMP22UNIFORM]], label %[[JUNIFORM:.+]], label %[[IFEND34UNIFORM:.+]]
+
+; CHECK: [[IFELSE26UNIFORM]]:
+; CHECK: br i1 %{{.+}}, label %[[IFEND34UNIFORM]], label %[[IFELSE26UNIFORMBOSCCINDIR:.+]]
+
+; CHECK: [[IFTHEN29UNIFORM:.+]]:
+; CHECK: br label %[[OUNIFORM:.+]]
+
+; CHECK: [[IFEND34UNIFORM]]:
+; CHECK: %[[TOBOOL36UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[TOBOOL36UNIFORM]], label %[[OUNIFORM]], label %[[IFTHEN37UNIFORM:.+]]
+
+; CHECK: [[IFELSE26UNIFORMBOSCCINDIR]]:
+; CHECK: br i1 %{{.+}}, label %[[IFTHEN29UNIFORM]], label %[[IFELSE26UNIFORMBOSCCSTORE:.+]]
+
+; CHECK: [[IFELSE26UNIFORMBOSCCSTORE]]:
+; CHECK: br label %[[IFTHEN29]]
+
+; CHECK: [[OUNIFORM]]:
+; CHECK: br label %[[WHILEBODY20UNIFORM]]
+
+; CHECK: [[JUNIFORM]]:
+; CHECK: %[[CMP42UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMP42UNIFORM]], label %[[QUNIFORM:.+]], label %[[IFTHEN44UNIFORM:.+]]
+
+; CHECK: [[IFTHEN37UNIFORM]]:
+; CHECK: br label %[[QUNIFORM]]
+
+; CHECK: [[QUNIFORM]]:
+; CHECK: %[[TOBOOL52UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[TOBOOL52UNIFORM]], label %[[WHILEBODYUNIFORM]], label %[[FORCOND57PREHEADERUNIFORM:.+]]
+
+; CHECK: [[IFTHEN44UNIFORM]]:
+; CHECK: br label %[[PUNIFORM:.+]]
+
+; CHECK: [[FORCOND57PREHEADERUNIFORM]]:
+; CHECK: br label %[[FORCOND57UNIFORM:.+]]
+
+; CHECK: [[FORCOND57UNIFORM]]:
+; CHECK: %[[CMP59UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMP59UNIFORM]], label %[[FORBODY61UNIFORM:.+]], label %[[SLOOPEXIT1UNIFORM:.+]]
+
+; CHECK: [[FORBODY61UNIFORM]]:
+; CHECK: br label %[[FORCOND57UNIFORM]]
+
+; CHECK: [[SLOOPEXIT1UNIFORM]]:
+; CHECK: br label %[[SUNIFORM:.+]]
+
+; CHECK: [[FUNIFORM]]:
+; CHECK: br label %[[PUNIFORM]]
+
+; CHECK: [[PUNIFORM]]:
+; CHECK: br label %[[FORCOND68UNIFORM:.+]]
+
+; CHECK: [[FORCOND68UNIFORM]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY72UNIFORM:.+]], label %[[SLOOPEXITUNIFORM:.+]]
+
+; CHECK: [[FORBODY72UNIFORM]]:
+; CHECK: br label %[[FORCOND68UNIFORM]]
+
+; CHECK: [[SLOOPEXITUNIFORM]]:
+; CHECK: br label %[[S:.+]]
+
+; CHECK: [[IFTHEN29]]:
+; CHECK: br label %[[IFEND34]]
+
+; CHECK: [[IFEND34]]:
+; CHECK: br label %[[O:.+]]
+
+; CHECK: [[IFTHEN37:.+]]:
+; CHECK: br label %[[IFTHEN37ELSE:.+]]
+
+; CHECK: [[IFTHEN37ELSE]]:
+; CHECK: br i1 %{{.+}}, label %[[JELSE:.+]], label %[[JSPLIT:.+]]
+
+; CHECK: [[O]]:
+; CHECK: br i1 %{{.+}}, label %[[WHILEBODY20]], label %[[WHILEBODY20PUREEXIT:.+]]
+
+; CHECK: [[WHILEBODY20PUREEXIT]]:
+; CHECK: br label %[[IFTHEN37]]
+
+; CHECK: [[J]]:
+; CHECK: br label %[[WHILEBODY20PUREEXIT]]
+
+; CHECK: [[JELSE]]:
+; CHECK: br label %[[Q:.+]]
+
+; CHECK: [[JSPLIT]]:
+; CHECK: br label %[[Q]]
+
+; CHECK: [[IFTHEN44:.+]]:
+; CHECK: br label %[[IFTHEN44ELSE:.+]]
+
+; CHECK: [[IFTHEN44ELSE]]:
+; CHECK: br label %[[FORCOND57PREHEADER:.+]]
+
+; CHECK: [[Q]]:
+; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]]
+
+; CHECK: [[WHILEBODYPUREEXIT]]:
+; CHECK: br label %[[IFTHEN44]]
+
+; CHECK: [[FORCOND57PREHEADER]]:
+; CHECK: br label %[[FORCOND57:.+]]
+
+; CHECK: [[FORCOND57PREHEADERELSE:.+]]:
+; CHECK: br i1 %{{.+}}, label %[[FELSE:.+]], label %[[FSPLIT:.+]]
+
+; CHECK: [[FORCOND57]]:
+; CHECK: %[[CMP59:.+]] = icmp
+; CHECK: br i1 %[[CMP59]], label %[[FORBODY61:.+]], label %[[SLOOPEXIT1:.+]]
+
+; CHECK: [[FORBODY61]]:
+; CHECK: br label %[[FORCOND57]]
+
+; CHECK: [[F]]:
+; CHECK: br label %[[WHILEBODYPUREEXIT]]
+
+; CHECK: [[FELSE]]:
+; CHECK: br label %[[P:.+]]
+
+; CHECK: [[FSPLIT]]:
+; CHECK: br label %[[P]]
+
+; CHECK: [[P]]:
+; CHECK: br label %[[FORCOND68:.+]]
+
+; CHECK: [[FORCOND68]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY72:.+]], label %[[SLOOPEXIT:.+]]
+
+; CHECK: [[FORBODY72]]:
+; CHECK: br label %[[FORCOND68]]
+
+; CHECK: [[SLOOPEXIT]]:
+; CHECK: br label %[[S]]
+
+; CHECK: [[SLOOPEXIT1]]:
+; CHECK: br label %[[FORCOND57PREHEADERELSE]]
+
+; CHECK: [[S]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization11.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization11.ll
new file mode 100644
index 0000000000000..34a686ff4d7a6
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization11.ll
@@ -0,0 +1,425 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k partial_linearization11 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+
+; The CFG of the following kernel is:
+;
+;       a
+;       |
+;       b <-------.
+;       |         |
+;       c <---.   |
+;      / \    |   |
+;     d   e   |   |
+;    / \ / \  |   |
+;   i   f   g |   |
+;   |  / \ / \|   |
+;   | j   h --'   |
+;   | |        \  |
+;   | |         k |
+;   |  \       /  |
+;   |   \     /   |
+;   |    \   /    |
+;   |     \ /     |
+;   |      l -----'
+;   |     /
+;    \   m
+;     \ /
+;      n
+;
+; * where nodes c, d, f, g, and l are uniform branches, and node e is a
+;   varying branch.
+; * where nodes i, f, g, j, h, k, l, m and n are divergent.
+;
+; With BOSCC, it will be transformed as follows:
+;
+;       a
+;       |
+;       b <-------.    b' <----.
+;       |         |    |       |
+;       c <---.   |    c' <--. |
+;      / \    |   |   / \    | |
+;     d   e___|___|_ d'  e'  | |
+;    / \ / \  |   | \|__ |   | |
+;   i   f   g |   |  |  `g'  | |
+;   |  / \ / \|   |   \ /    | |
+;   | j   h --'   |    f'    | |
+;   | |        \  |    |     | |
+;   | |         k |    h' ---' |
+;   |  \       /  |    |       |
+;   |   \     /   |    k'      |
+;   |    \   /    |    |       |
+;   |     \ /     |    j'      |
+;   |      l -----'    |       |
+;   |     /            l' -----'
+;    \   m             |
+;     \ /              m'
+;      n               |
+;      |               i'
+;      |               |
+;      `-----> & <---- n'
+;
+; where '&' represents merge blocks of BOSCC regions.
+;
+; __kernel void partial_linearization11(__global int *out, int n) {
+;   // a
+;   int id = get_global_id(0);
+;   int ret = 0;
+;
+;   while (1) {
+;     // b
+;     while (1) {
+;       if (n < 5) { // c
+;         // d
+;         for (int i = 0; i < n * 2; i++) ret++;
+;         if (n <= 3) {
+;           // i
+;           goto i;
+;         }
+;       } else {
+;         // e
+;         if (ret + id >= n) {
+;           // g
+;           ret /= n * n + ret;
+;           if (n <= 10) {
+;             goto k;
+;           } else {
+;             goto h;
+;           }
+;         }
+;       }
+;       // f
+;       ret *= n;
+;       if (n & 1) {
+;         goto j;
+;       }
+;
+;       // h
+; h:
+;       ret++;
+;     }
+;
+; j:
+;     ret += n * 2 + 20;
+;     goto l;
+;
+; k:
+;     ret *= n;
+;     goto l;
+;
+; l:
+;     if (n & 1) {
+;       // m
+;       ret++;
+;       goto m;
+;     }
+;   }
+;
+; m:
+;   for (int i = 0; i < n / 4; i++) ret++;
+;   goto n;
+;
+; i:
+;   ret /= n;
+;
+; n:
+;   out[id] = ret;
+; }
+
+; ModuleID = 'Unknown buffer'
+source_filename = "Unknown buffer"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @partial_linearization11(i32 addrspace(1)* %out, i32 %n) #0 { ; test input kernel; "node" notes below refer to the CFG sketch in this file's header comment
+entry:                                            ; node a
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %conv = trunc i64 %call to i32 ; "id" in the C sketch
+  br label %while.body
+
+while.body:                                       ; preds = %if.end33, %entry; node b (outer loop header)
+  %ret.0 = phi i32 [ 0, %entry ], [ %storemerge, %if.end33 ]
+  br label %while.body2
+
+while.body2:                                      ; preds = %h, %while.body; node c (inner loop header)
+  %ret.1 = phi i32 [ %ret.0, %while.body ], [ %inc24, %h ]
+  %cmp = icmp slt i32 %n, 5
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %while.body2; node d starts here
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %if.then
+  %ret.2 = phi i32 [ %ret.1, %if.then ], [ %inc, %for.body ]
+  %storemerge2 = phi i32 [ 0, %if.then ], [ %inc6, %for.body ]
+  %mul = shl nsw i32 %n, 1 ; n * 2, the loop bound
+  %cmp4 = icmp slt i32 %storemerge2, %mul
+  br i1 %cmp4, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %inc = add nsw i32 %ret.2, 1
+  %inc6 = add nsw i32 %storemerge2, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %cmp7 = icmp slt i32 %n, 4 ; "n <= 3" in the C sketch
+  br i1 %cmp7, label %i44, label %if.end20 ; node d's exits: to i or to f
+
+if.else:                                          ; preds = %while.body2; node e
+  %add = add nsw i32 %ret.1, %conv
+  %cmp10 = icmp slt i32 %add, %n ; ret + id < n; uses %conv, so this is the branch the header calls varying
+  br i1 %cmp10, label %if.end20, label %if.then12
+
+if.then12:                                        ; preds = %if.else; node g
+  %mul13 = mul nsw i32 %n, %n
+  %add14 = add nsw i32 %ret.1, %mul13
+  %0 = icmp eq i32 %ret.1, -2147483648 ; dividend is the minimum i32
+  %1 = icmp eq i32 %add14, -1
+  %2 = and i1 %0, %1
+  %3 = icmp eq i32 %add14, 0
+  %4 = or i1 %3, %2
+  %5 = select i1 %4, i32 1, i32 %add14 ; divisor becomes 1 if it is 0, or if it is -1 while the dividend is the minimum i32
+  %div = sdiv i32 %ret.1, %5
+  %cmp15 = icmp slt i32 %n, 11 ; "n <= 10" in the C sketch
+  br i1 %cmp15, label %k, label %h
+
+if.end20:                                         ; preds = %if.else, %for.end; node f
+  %ret.3 = phi i32 [ %ret.2, %for.end ], [ %ret.1, %if.else ]
+  %mul21 = mul nsw i32 %ret.3, %n
+  %and = and i32 %n, 1
+  %tobool = icmp eq i32 %and, 0 ; true when (n & 1) is 0, which takes the edge to h
+  br i1 %tobool, label %h, label %j
+
+h:                                                ; preds = %if.end20, %if.then12; inner loop latch
+  %ret.4 = phi i32 [ %div, %if.then12 ], [ %mul21, %if.end20 ]
+  %inc24 = add nsw i32 %ret.4, 1
+  br label %while.body2
+
+j:                                                ; preds = %if.end20
+  %mul25 = mul i32 %n, 2
+  %add26 = add nsw i32 %mul25, 20
+  %add27 = add nsw i32 %add26, %mul21 ; ret + n * 2 + 20
+  br label %l
+
+k:                                                ; preds = %if.then12
+  %mul28 = mul nsw i32 %div, %n
+  br label %l
+
+l:                                                ; preds = %k, %j
+  %storemerge = phi i32 [ %add27, %j ], [ %mul28, %k ]
+  %and29 = and i32 %n, 1
+  %tobool30 = icmp eq i32 %and29, 0
+  br i1 %tobool30, label %if.end33, label %if.then31 ; back to node b when (n & 1) is 0, otherwise on to node m
+
+if.then31:                                        ; preds = %l; node m starts here
+  br label %for.cond35
+
+if.end33:                                         ; preds = %l; outer loop latch
+  br label %while.body
+
+for.cond35:                                       ; preds = %for.body39, %if.then31
+  %ret.5.in = phi i32 [ %storemerge, %if.then31 ], [ %ret.5, %for.body39 ]
+  %storemerge1 = phi i32 [ 0, %if.then31 ], [ %inc42, %for.body39 ]
+  %ret.5 = add nsw i32 %ret.5.in, 1 ; incremented on every visit of this header, including the first
+  %div36 = sdiv i32 %n, 4
+  %cmp37 = icmp slt i32 %storemerge1, %div36
+  br i1 %cmp37, label %for.body39, label %n46
+
+for.body39:                                       ; preds = %for.cond35
+  %inc42 = add nsw i32 %storemerge1, 1
+  br label %for.cond35
+
+i44:                                              ; preds = %for.end; node i
+  %6 = icmp eq i32 %ret.2, -2147483648
+  %7 = icmp eq i32 %n, -1
+  %8 = and i1 %7, %6
+  %9 = icmp eq i32 %n, 0
+  %10 = or i1 %9, %8
+  %11 = select i1 %10, i32 1, i32 %n ; same divisor guard as in if.then12, here for ret / n
+  %div45 = sdiv i32 %ret.2, %11
+  br label %n46
+
+n46:                                              ; preds = %i44, %for.cond35; node n
+  %ret.6 = phi i32 [ %div45, %i44 ], [ %ret.5, %for.cond35 ]
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %ret.6, i32 addrspace(1)* %arrayidx, align 4 ; out[id] = ret
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nobuiltin nounwind readonly }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.kernels = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization11, !3, !4, !5, !6, !7, !8}
+!3 = !{!"kernel_arg_addr_space", i32 1, i32 0}
+!4 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!5 = !{!"kernel_arg_type", !"int*", !"int"}
+!6 = !{!"kernel_arg_base_type", !"int*", !"int"}
+!7 = !{!"kernel_arg_type_qual", !"", !""}
+!8 = !{!"kernel_arg_name", !"out", !"n"}
+
+; CHECK: spir_kernel void @__vecz_v4_partial_linearization11
+; CHECK: br i1 true, label %[[WHILEBODYUNIFORM:.+]], label %[[WHILEBODY:.+]]
+
+; CHECK: [[WHILEBODY]]:
+; CHECK: br label %[[WHILEBODY2:.+]]
+
+; CHECK: [[WHILEBODY2]]:
+; CHECK: %[[CMP:.+]] = icmp
+; CHECK: br i1 %[[CMP]], label %[[FORCONDPREHEADER:.+]], label %[[IFELSE:.+]]
+
+; CHECK: [[FORCONDPREHEADER]]:
+; CHECK: br label %[[FORCOND:.+]]
+
+; CHECK: [[FORCOND]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[FOREND:.+]]
+
+; CHECK: [[FORBODY]]:
+; CHECK: br label %[[FORCOND]]
+
+; CHECK: [[FOREND]]:
+; CHECK: br label %[[IFEND20:.+]]
+
+; CHECK: [[IFELSE]]:
+; CHECK: br label %[[IFTHEN12:.+]]
+
+; CHECK: [[WHILEBODYUNIFORM]]:
+; CHECK: br label %[[WHILEBODY2UNIFORM:.+]]
+
+; CHECK: [[WHILEBODY2UNIFORM]]:
+; CHECK: %[[CMPUNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMPUNIFORM]], label %[[FORCONDPREHEADERUNIFORM:.+]], label %[[IFELSEUNIFORM:.+]]
+
+; CHECK: [[IFELSEUNIFORM]]:
+; CHECK: br i1 %{{.+}}, label %[[IFEND20UNIFORM:.+]], label %[[IFELSEUNIFORMBOSCCINDIR:.+]]
+
+; CHECK: [[IFTHEN12UNIFORM:.+]]:
+; CHECK: %[[CMP15UNIFORM:cmp.+]] = icmp
+; CHECK: br i1 %[[CMP15UNIFORM]], label %[[KUNIFORM:.+]], label %[[HUNIFORM:.+]]
+
+; CHECK: [[FORCONDPREHEADERUNIFORM]]:
+; CHECK: br label %[[FORCONDUNIFORM:.+]]
+
+; CHECK: [[FORCONDUNIFORM]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODYUNIFORM:.+]], label %[[FORENDUNIFORM:.+]]
+
+; CHECK: [[FORBODYUNIFORM]]:
+; CHECK: br label %[[FORCONDUNIFORM]]
+
+; CHECK: [[FORENDUNIFORM]]:
+; CHECK: %[[CMP7UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMP7UNIFORM]], label %[[I44UNIFORM:.+]], label %[[IFEND20UNIFORM]]
+
+; CHECK: [[IFEND20UNIFORM]]:
+; CHECK: %[[TOBOOLUNIFORM:.+]] = icmp
+; CHECK: br i1 %[[TOBOOLUNIFORM]], label %[[HUNIFORM]], label %[[JUNIFORM:.+]]
+
+; CHECK: [[IFELSEUNIFORMBOSCCINDIR]]:
+; CHECK: br i1 %{{.+}}, label %[[IFTHEN12UNIFORM]], label %[[IFELSEUNIFORMBOSCCSTORE:.+]]
+
+; CHECK: [[IFELSEUNIFORMBOSCCSTORE]]:
+; CHECK: br label %[[IFTHEN12]]
+
+; CHECK: [[HUNIFORM]]:
+; CHECK: br label %[[WHILEBODY2UNIFORM]]
+
+; CHECK: [[KUNIFORM]]:
+; CHECK: br label %[[LUNIFORM:.+]]
+
+; CHECK: [[JUNIFORM]]:
+; CHECK: br label %[[LUNIFORM]]
+
+; CHECK: [[LUNIFORM]]:
+; CHECK: %[[TOBOOL30UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[TOBOOL30UNIFORM]], label %[[WHILEBODYUNIFORM]], label %[[FORCOND35PREHEADERUNIFORM:.+]]
+
+; CHECK: [[FORCOND35PREHEADERUNIFORM]]:
+; CHECK: br label %[[FORCOND35UNIFORM:.+]]
+
+; CHECK: [[FORCOND35UNIFORM]]:
+; CHECK: %[[CMP37UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMP37UNIFORM]], label %[[FORBODY39UNIFORM:.+]], label %[[N46LOOPEXITUNIFORM:.+]]
+
+; CHECK: [[FORBODY39UNIFORM]]:
+; CHECK: br label %[[FORCOND35UNIFORM]]
+
+; CHECK: [[N46LOOPEXITUNIFORM]]:
+; CHECK: br label %[[N46UNIFORM:.+]]
+
+; CHECK: [[I44UNIFORM]]:
+; CHECK: br label %[[N46:.+]]
+
+; CHECK: [[IFTHEN12]]:
+; CHECK: br label %[[IFEND20]]
+
+; CHECK: [[IFEND20]]:
+; CHECK: br label %[[H:.+]]
+
+; CHECK: [[H]]:
+; CHECK: br i1 %{{.+}}, label %[[WHILEBODY2]], label %[[WHILEBODY2PUREEXIT:.+]]
+
+; CHECK: [[WHILEBODY2PUREEXIT]]:
+; CHECK: br label %[[K:.+]]
+
+; CHECK: [[J:.+]]:
+; CHECK: br label %[[L:.+]]
+
+; CHECK: [[K]]:
+; CHECK: br label %[[KELSE:.+]]
+
+; CHECK: [[KELSE]]:
+; CHECK: br label %[[J]]
+
+; CHECK: [[L]]:
+; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]]
+
+; CHECK: [[WHILEBODYPUREEXIT]]:
+; CHECK: br label %[[FORCOND35PREHEADER:.+]]
+
+; CHECK: [[FORCOND35PREHEADER]]:
+; CHECK: br label %[[FORCOND35:.+]]
+
+; CHECK: [[FORCOND35PREHEADERELSE:.+]]:
+; CHECK: br label %[[I44:.+]]
+
+; CHECK: [[FORCOND35]]:
+; CHECK: %[[CMP37:.+]] = icmp
+; CHECK: br i1 %[[CMP37]], label %[[FORBODY39:.+]], label %[[N46LOOPEXIT:.+]]
+
+; CHECK: [[FORBODY39]]:
+; CHECK: br label %[[FORCOND35]]
+
+; CHECK: [[I44]]:
+; CHECK: br label %[[N46]]
+
+; CHECK: [[N46LOOPEXIT]]:
+; CHECK: br label %[[FORCOND35PREHEADERELSE]]
+
+; CHECK: [[N46]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization12.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization12.ll
new file mode 100644
index 0000000000000..145b4a11627d0
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization12.ll
@@ -0,0 +1,782 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k partial_linearization12 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+
+; The CFG of the following kernel is:
+;
+;              a
+;              |
+;              b <-----.
+;             / \      |
+;            c   d     |
+;           / \ /      |
+;          /   e       |
+;         /    |       |
+;        /     g <---. |
+;       f     / \    | |
+;       |    h   i   | |
+;       |   /   / \  | |
+;       |  /   k   l | |
+;       | /    |\ /| | |
+;       |/     |/ \| | |
+;       j      m   n | |
+;      /|     / \ /  | |
+;     / |    o   p --' |
+;    /  |   /   /      |
+;   |   |  /   r       |
+;   |   | /    |       |
+;   |   |/     s ------'
+;   |   |     /
+;   |  /|    t
+;   | / |   /
+;   |/  |  /
+;   q   | /
+;   |   |/
+;   |   u
+;    \ /
+;     v
+;
+; * where nodes b, c, g, j, k, l, m, p and s are uniform branches,
+;   and node i is a varying branch.
+; * where nodes k, l, o, n, m, p, q, s, r, t and v are divergent.
+;
+; With BOSCC, it will be transformed as follows:
+;
+;              a
+;              |
+;              b <-----.        b' <----.
+;             / \      |       / \      |
+;            c   d     |      c'  d'    |
+;           / \ /      |     / \ /      |
+;          /   e       |    /   e'      |
+;         /    |       |   /    |       |
+;        /     g <---. |  f'    g' <--. |
+;       f     / \    | |  |    / \    | |
+;       |    h   i___|_|__|_  h   i'  | |
+;       |   /   / \  | |  | \/___ |   | |
+;       |  /   k   l | |  | /    `l'  | |
+;       | /    |\ /| | |  |/      |   | |
+;       |/     |/ \| | |  j'      k'  | |
+;       j      m   n | |  |\      |   | |
+;      /|     / \ /  | |  | \     n'  | |
+;     / |    o   p --' |  |  \    |   | |
+;    /  |   /   /      |  |   |   m'  | |
+;   |   |  /   r       |  |   |   |   | |
+;   |   | /    |       |  |   |   p' -' |
+;   |   |/     s ------'  |   |  /      |
+;   |   |     /           |   | r'      |
+;   |  /|    t            |   | |       |
+;   | / |   /             |   | s' -----'
+;   |/  |  /              |   |/
+;   q   | /               |   o'
+;   |   |/                |  /
+;   |   u                 | t'
+;    \ /                  |/
+;     v                   u'
+;     |                   |
+;     |                   q'
+;     |                   |
+;     `-------> & <------ v'
+;
+; where '&' represents merge blocks of BOSCC regions.
+;
+; __kernel void partial_linearization12(__global int *out, int n) {
+;   // a
+;   int id = get_global_id(0);
+;   int ret = 0;
+;
+;   while (1) {
+;     if (n > 0) { // b
+;       // c
+;       for (int i = 0; i < n * 2; i++) ret++;
+;       if (n < 5) {
+;         // f
+;         goto f;
+;       }
+;     } else {
+;       // d
+;       for (int i = 0; i < n / 4; i++) ret++;
+;     }
+;     // e
+;     ret++;
+;     while (1) {
+;       if (n <= 2) { // g
+;         // h
+;         ret -= n * ret;
+;         for (int i = 0; i < n * 2; i++) ret++;
+;         // j
+;         goto j;
+;       } else {
+;         // i
+;         if (ret + id >= n) {
+;           // k
+;           ret /= n * n + ret;
+;           if (n < 5) {
+;             // m
+;             ret -= n;
+;             goto m;
+;           } else {
+;             // n
+;             ret += n;
+;             goto n;
+;           }
+;         } else {
+;           // l
+;           if (n >= 5) {
+;             // m
+;             ret += n;
+;             goto m;
+;           } else {
+;             // n
+;             ret -= n;
+;             goto n;
+;           }
+;         }
+;       }
+;       // m
+; m:
+;       if (n & 1) {
+;         // o
+;         ret *= n;
+;         goto q;
+;       } else {
+;         // p
+;         goto p;
+;       }
+;
+;       // n
+; n:
+;       ret *= ret;
+;       // p
+; p:
+;       if (n > 3) {
+;         goto r;
+;       }
+;       ret++;
+;     }
+;
+;     // r
+; r:
+;     ret *= 4;
+;     for (int i = 0; i < n / 4; i++) ret++;
+;
+;     // s
+;     if (n & 1) {
+;       goto t;
+;     }
+;     ret++;
+;   }
+;
+; f:
+;   ret /= n;
+;   goto j;
+;
+; j:
+;   if (n == 2) {
+;     goto q;
+;   } else {
+;     goto u;
+;   }
+;
+; t:
+;   for (int i = 0; i < n + 1; i++) ret++;
+;   goto u;
+;
+; q:
+;   for (int i = 0; i < n / 4; i++) ret++;
+;   goto v;
+;
+; u:
+;   for (int i = 0; i < n * 2; i++) ret++;
+;
+; v:
+;   out[id] = ret;
+; }
+
+; ModuleID = 'Unknown buffer'
+source_filename = "Unknown buffer"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @partial_linearization12(i32 addrspace(1)* %out, i32 %n) #0 { ; test input kernel; "node" notes below refer to the CFG sketch in this file's header comment
+entry:                                            ; node a
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %conv = trunc i64 %call to i32 ; "id" in the C sketch
+  br label %while.body
+
+while.body:                                       ; preds = %if.end79, %entry; node b (outer loop header)
+  %storemerge = phi i32 [ 0, %entry ], [ %inc80, %if.end79 ]
+  %cmp = icmp sgt i32 %n, 0
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %while.body; node c starts here
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %if.then
+  %ret.0 = phi i32 [ %storemerge, %if.then ], [ %inc, %for.body ]
+  %storemerge10 = phi i32 [ 0, %if.then ], [ %inc4, %for.body ]
+  %mul = shl nsw i32 %n, 1 ; n * 2, the loop bound
+  %cmp2 = icmp slt i32 %storemerge10, %mul
+  br i1 %cmp2, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %inc = add nsw i32 %ret.0, 1
+  %inc4 = add nsw i32 %storemerge10, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %cmp5 = icmp slt i32 %n, 5
+  br i1 %cmp5, label %f, label %if.end17 ; node c's exits: to f or to e
+
+if.else:                                          ; preds = %while.body; node d starts here
+  br label %for.cond9
+
+for.cond9:                                        ; preds = %for.body12, %if.else
+  %ret.1 = phi i32 [ %storemerge, %if.else ], [ %inc13, %for.body12 ]
+  %storemerge1 = phi i32 [ 0, %if.else ], [ %inc15, %for.body12 ]
+  %div = sdiv i32 %n, 4
+  %cmp10 = icmp slt i32 %storemerge1, %div
+  br i1 %cmp10, label %for.body12, label %if.end17
+
+for.body12:                                       ; preds = %for.cond9
+  %inc13 = add nsw i32 %ret.1, 1
+  %inc15 = add nsw i32 %storemerge1, 1
+  br label %for.cond9
+
+if.end17:                                         ; preds = %for.cond9, %for.end; node e
+  %ret.2 = phi i32 [ %ret.0, %for.end ], [ %ret.1, %for.cond9 ]
+  br label %while.body20
+
+while.body20:                                     ; preds = %if.end63, %if.end17; node g (inner loop header)
+  %storemerge2.in = phi i32 [ %ret.2, %if.end17 ], [ %ret.4, %if.end63 ]
+  %storemerge2 = add nsw i32 %storemerge2.in, 1 ; one add serves both the ret++ of node e and the ret++ on the inner back edge
+  %cmp21 = icmp slt i32 %n, 3 ; "n <= 2" in the C sketch
+  br i1 %cmp21, label %if.then23, label %if.else35
+
+if.then23:                                        ; preds = %while.body20; node h starts here
+  %mul24 = mul nsw i32 %storemerge2, %n
+  %sub = sub nsw i32 %storemerge2, %mul24
+  br label %for.cond26
+
+for.cond26:                                       ; preds = %for.body30, %if.then23
+  %ret.3 = phi i32 [ %sub, %if.then23 ], [ %inc31, %for.body30 ]
+  %storemerge9 = phi i32 [ 0, %if.then23 ], [ %inc33, %for.body30 ]
+  %mul27 = shl nsw i32 %n, 1
+  %cmp28 = icmp slt i32 %storemerge9, %mul27
+  br i1 %cmp28, label %for.body30, label %j
+
+for.body30:                                       ; preds = %for.cond26
+  %inc31 = add nsw i32 %ret.3, 1
+  %inc33 = add nsw i32 %storemerge9, 1
+  br label %for.cond26
+
+if.else35:                                        ; preds = %while.body20; node i
+  %add = add nsw i32 %storemerge2, %conv
+  %cmp36 = icmp slt i32 %add, %n ; ret + id < n; uses %conv, so this is the branch the header calls varying
+  br i1 %cmp36, label %if.else48, label %if.then38
+
+if.then38:                                        ; preds = %if.else35; node k
+  %mul39 = mul nsw i32 %n, %n
+  %add40 = add nsw i32 %storemerge2, %mul39
+  %0 = icmp eq i32 %add40, 0
+  %1 = select i1 %0, i32 1, i32 %add40 ; a zero divisor is replaced by 1 before the sdiv
+  %div41 = sdiv i32 %storemerge2, %1
+  %cmp42 = icmp slt i32 %n, 5
+  br i1 %cmp42, label %if.then44, label %if.else46
+
+if.then44:                                        ; preds = %if.then38; on the k -> m edge
+  %sub45 = sub nsw i32 %div41, %n
+  br label %m
+
+if.else46:                                        ; preds = %if.then38; on the k -> n edge
+  %add47 = add nsw i32 %div41, %n
+  br label %n58
+
+if.else48:                                        ; preds = %if.else35; node l
+  %cmp49 = icmp sgt i32 %n, 4 ; "n >= 5" in the C sketch
+  br i1 %cmp49, label %if.then51, label %if.else53
+
+if.then51:                                        ; preds = %if.else48; on the l -> m edge
+  %add52 = add nsw i32 %storemerge2, %n
+  br label %m
+
+if.else53:                                        ; preds = %if.else48; on the l -> n edge
+  %sub54 = sub nsw i32 %storemerge2, %n
+  br label %n58
+
+m:                                                ; preds = %if.then51, %if.then44
+  %storemerge7 = phi i32 [ %add52, %if.then51 ], [ %sub45, %if.then44 ]
+  %and = and i32 %n, 1
+  %tobool = icmp eq i32 %and, 0
+  br i1 %tobool, label %p, label %if.then55 ; to p when (n & 1) is 0, otherwise to node o
+
+if.then55:                                        ; preds = %m; node o
+  %mul56 = mul nsw i32 %storemerge7, %n
+  br label %q
+
+n58:                                              ; preds = %if.else53, %if.else46; node n
+  %storemerge3 = phi i32 [ %sub54, %if.else53 ], [ %add47, %if.else46 ]
+  %mul59 = mul nsw i32 %storemerge3, %storemerge3
+  br label %p
+
+p:                                                ; preds = %n58, %m
+  %ret.4 = phi i32 [ %mul59, %n58 ], [ %storemerge7, %m ]
+  %cmp60 = icmp sgt i32 %n, 3
+  br i1 %cmp60, label %r, label %if.end63
+
+if.end63:                                         ; preds = %p; inner loop latch
+  br label %while.body20
+
+r:                                                ; preds = %p
+  %mul65 = shl nsw i32 %ret.4, 2 ; ret * 4
+  br label %for.cond67
+
+for.cond67:                                       ; preds = %for.body71, %r
+  %ret.5 = phi i32 [ %mul65, %r ], [ %inc72, %for.body71 ]
+  %storemerge4 = phi i32 [ 0, %r ], [ %inc74, %for.body71 ]
+  %div68 = sdiv i32 %n, 4
+  %cmp69 = icmp slt i32 %storemerge4, %div68
+  br i1 %cmp69, label %for.body71, label %for.end75
+
+for.body71:                                       ; preds = %for.cond67
+  %inc72 = add nsw i32 %ret.5, 1
+  %inc74 = add nsw i32 %storemerge4, 1
+  br label %for.cond67
+
+for.end75:                                        ; preds = %for.cond67; node s
+  %and76 = and i32 %n, 1
+  %tobool77 = icmp eq i32 %and76, 0
+  br i1 %tobool77, label %if.end79, label %t ; back to node b when (n & 1) is 0, otherwise on to t
+
+if.end79:                                         ; preds = %for.end75; outer loop latch
+  %inc80 = add nsw i32 %ret.5, 1
+  br label %while.body
+
+f:                                                ; preds = %for.end
+  %2 = icmp eq i32 %n, 0
+  %3 = select i1 %2, i32 1, i32 %n ; a zero divisor is replaced by 1 before the sdiv
+  %div81 = sdiv i32 %ret.0, %3
+  br label %j
+
+j:                                                ; preds = %f, %for.cond26
+  %ret.6 = phi i32 [ %div81, %f ], [ %ret.3, %for.cond26 ]
+  %cmp82 = icmp eq i32 %n, 2
+  br i1 %cmp82, label %q, label %u
+
+t:                                                ; preds = %for.end75
+  br label %for.cond87
+
+for.cond87:                                       ; preds = %for.body91, %t
+  %ret.7 = phi i32 [ %ret.5, %t ], [ %inc92, %for.body91 ]
+  %storemerge5 = phi i32 [ 0, %t ], [ %inc94, %for.body91 ]
+  %cmp89 = icmp sgt i32 %storemerge5, %n ; leaves the loop once i > n, matching "i < n + 1" in the C sketch
+  br i1 %cmp89, label %u, label %for.body91
+
+for.body91:                                       ; preds = %for.cond87
+  %inc92 = add nsw i32 %ret.7, 1
+  %inc94 = add nsw i32 %storemerge5, 1
+  br label %for.cond87
+
+q:                                                ; preds = %j, %if.then55
+  %ret.8 = phi i32 [ %mul56, %if.then55 ], [ %ret.6, %j ]
+  br label %for.cond97
+
+for.cond97:                                       ; preds = %for.body101, %q
+  %ret.9 = phi i32 [ %ret.8, %q ], [ %inc102, %for.body101 ]
+  %storemerge8 = phi i32 [ 0, %q ], [ %inc104, %for.body101 ]
+  %div98 = sdiv i32 %n, 4
+  %cmp99 = icmp slt i32 %storemerge8, %div98
+  br i1 %cmp99, label %for.body101, label %v
+
+for.body101:                                      ; preds = %for.cond97
+  %inc102 = add nsw i32 %ret.9, 1
+  %inc104 = add nsw i32 %storemerge8, 1
+  br label %for.cond97
+
+u:                                                ; preds = %for.cond87, %j
+  %ret.10 = phi i32 [ %ret.6, %j ], [ %ret.7, %for.cond87 ]
+  br label %for.cond107
+
+for.cond107:                                      ; preds = %for.body111, %u
+  %ret.11 = phi i32 [ %ret.10, %u ], [ %inc112, %for.body111 ]
+  %storemerge6 = phi i32 [ 0, %u ], [ %inc114, %for.body111 ]
+  %mul108 = shl nsw i32 %n, 1
+  %cmp109 = icmp slt i32 %storemerge6, %mul108
+  br i1 %cmp109, label %for.body111, label %v
+
+for.body111:                                      ; preds = %for.cond107
+  %inc112 = add nsw i32 %ret.11, 1
+  %inc114 = add nsw i32 %storemerge6, 1
+  br label %for.cond107
+
+v:                                                ; preds = %for.cond107, %for.cond97
+  %ret.12 = phi i32 [ %ret.9, %for.cond97 ], [ %ret.11, %for.cond107 ]
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %ret.12, i32 addrspace(1)* %arrayidx, align 4 ; out[id] = ret
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nobuiltin nounwind readonly }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.kernels = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization12, !3, !4, !5, !6, !7, !8}
+!3 = !{!"kernel_arg_addr_space", i32 1, i32 0}
+!4 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!5 = !{!"kernel_arg_type", !"int*", !"int"}
+!6 = !{!"kernel_arg_base_type", !"int*", !"int"}
+!7 = !{!"kernel_arg_type_qual", !"", !""}
+!8 = !{!"kernel_arg_name", !"out", !"n"}
+
+; CHECK: spir_kernel void @__vecz_v4_partial_linearization12
+; CHECK: br i1 true, label %[[WHILEBODYUNIFORM:.+]], label %[[WHILEBODY:.+]]
+
+; CHECK: [[WHILEBODY]]:
+; CHECK: %[[CMP:.+]] = icmp
+; CHECK: br i1 %[[CMP]], label %[[FORCONDPREHEADER:.+]], label %[[FORCOND9PREHEADER:.+]]
+
+; CHECK: [[FORCOND9PREHEADER]]:
+; CHECK: br label %[[FORCOND9:.+]]
+
+; CHECK: [[FORCONDPREHEADER]]:
+; CHECK: br label %[[FORCOND:.+]]
+
+; CHECK: [[FORCOND]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[FOREND:.+]]
+
+; CHECK: [[FORBODY]]:
+; CHECK: br label %[[FORCOND]]
+
+; CHECK: [[FOREND]]:
+; CHECK: %[[CMP5:.+]] = icmp
+; CHECK: br i1 %[[CMP5]], label %[[F:.+]], label %[[IFEND17:.+]]
+
+; CHECK: [[FORCOND9]]:
+; CHECK: %[[CMP10:.+]] = icmp
+; CHECK: br i1 %[[CMP10]], label %[[FORBODY12:.+]], label %[[IFEND17LOOPEXIT:.+]]
+
+; CHECK: [[FORBODY12]]:
+; CHECK: br label %[[FORCOND9]]
+
+; CHECK: [[IFEND17LOOPEXIT]]:
+; CHECK: br label %[[IFEND17]]
+
+; CHECK: [[IFEND17]]:
+; CHECK: br label %[[WHILEBODY20:.+]]
+
+; CHECK: [[WHILEBODY20]]:
+; CHECK: %[[CMP21:.+]] = icmp
+; CHECK: br i1 %[[CMP21]], label %[[IFTHEN23:.+]], label %[[IFELSE35:.+]]
+
+; CHECK: [[IFTHEN23]]:
+; CHECK: br label %[[WHILEBODYPUREEXIT:.+]]
+
+; CHECK: [[IFTHEN23ELSE:.+]]:
+; CHECK: br i1 %{{.+}}, label %[[FELSE:.+]], label %[[FSPLIT:.+]]
+
+; CHECK: [[IFTHEN23SPLIT:.+]]:
+; CHECK: br label %[[FORCOND26:.+]]
+
+; CHECK: [[FORCOND26]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY30:.+]], label %[[JLOOPEXIT:.+]]
+
+; CHECK: [[FORBODY30]]:
+; CHECK: br label %[[FORCOND26]]
+
+; CHECK: [[IFELSE35]]:
+; CHECK: br label %[[IFTHEN38:.+]]
+
+; CHECK: [[WHILEBODYUNIFORM]]:
+; CHECK: %[[CMPUNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMPUNIFORM]], label %[[FORCONDPREHEADERUNIFORM:.+]], label %[[FORCOND9PREHEADERUNIFORM:.+]]
+
+; CHECK: [[FORCOND9PREHEADERUNIFORM]]:
+; CHECK: br label %[[FORCOND9UNIFORM:.+]]
+
+; CHECK: [[FORCOND9UNIFORM]]:
+; CHECK: %[[CMP10UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMP10UNIFORM]], label %[[FORBODY12UNIFORM:.+]], label %[[IFEND17LOOPEXITUNIFORM:.+]]
+
+; CHECK: [[FORBODY12UNIFORM]]:
+; CHECK: br label %[[FORCOND9UNIFORM]]
+
+; CHECK: [[IFEND17LOOPEXITUNIFORM]]:
+; CHECK: br label %[[IFEND17UNIFORM:.+]]
+
+; CHECK: [[FORCONDPREHEADERUNIFORM]]:
+; CHECK: br label %[[FORCONDUNIFORM:.+]]
+
+; CHECK: [[FORCONDUNIFORM]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODYUNIFORM:.+]], label %[[FORENDUNIFORM:.+]]
+
+; CHECK: [[FORBODYUNIFORM]]:
+; CHECK: br label %[[FORCONDUNIFORM]]
+
+; CHECK: [[FORENDUNIFORM]]:
+; CHECK: %[[CMP5UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMP5UNIFORM]], label %[[FUNIFORM:.+]], label %[[IFEND17UNIFORM]]
+
+; CHECK: [[IFEND17UNIFORM]]:
+; CHECK: br label %[[WHILEBODY20UNIFORM:.+]]
+
+; CHECK: [[WHILEBODY20UNIFORM]]:
+; CHECK: %[[CMP21UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMP21UNIFORM]], label %[[IFTHEN23UNIFORM:.+]], label %[[IFELSE35UNIFORM:.+]]
+
+; CHECK: [[IFELSE35UNIFORM]]:
+; CHECK: br i1 %{{.+}}, label %[[IFELSE48UNIFORM:.+]], label %[[IFELSE35UNIFORMBOSCCINDIR:.+]]
+
+; CHECK: [[IFTHEN38UNIFORM:.+]]:
+; CHECK: %[[CMP42UNIFORM:cmp.+]] = icmp
+; CHECK: br i1 %[[CMP42UNIFORM]], label %[[IFTHEN44UNIFORM:.+]], label %[[IFELSE46UNIFORM:.+]]
+
+; CHECK: [[IFELSE46UNIFORM]]:
+; CHECK: br label %[[N58UNIFORM:.+]]
+
+; CHECK: [[IFTHEN44UNIFORM]]:
+; CHECK: br label %[[MUNIFORM:.+]]
+
+; CHECK: [[IFELSE48UNIFORM]]:
+; CHECK: %[[CMP49UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMP49UNIFORM]], label %[[IFTHEN51UNIFORM:.+]], label %[[IFELSE53UNIFORM:.+]]
+
+; CHECK: [[IFELSE35UNIFORMBOSCCINDIR]]:
+; CHECK: br i1 %{{.+}}, label %[[IFTHEN38UNIFORM]], label %[[IFELSE35UNIFORMBOSCCSTORE:.+]]
+
+; CHECK: [[IFELSE35UNIFORMBOSCCSTORE]]:
+; CHECK: br label %[[IFTHEN38]]
+
+; CHECK: [[IFELSE53UNIFORM]]:
+; CHECK: br label %[[N58UNIFORM]]
+
+; CHECK: [[IFTHEN51UNIFORM]]:
+; CHECK: br label %[[MUNIFORM]]
+
+; CHECK: [[N58UNIFORM]]:
+; CHECK: br label %[[PUNIFORM:.+]]
+
+; CHECK: [[MUNIFORM]]:
+; CHECK: %[[TOBOOLUNIFORM:.+]] = icmp
+; CHECK: br i1 %[[TOBOOLUNIFORM]], label %[[PUNIFORM]], label %[[IFTHEN55UNIFORM:.+]]
+
+; CHECK: [[PUNIFORM]]:
+; CHECK: %[[CMP60UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMP60UNIFORM]], label %[[RUNIFORM:.+]], label %[[WHILEBODY20UNIFORM]]
+
+; CHECK: [[RUNIFORM]]:
+; CHECK: br label %[[FORCOND67UNIFORM:.+]]
+
+; CHECK: [[FORCOND67UNIFORM]]:
+; CHECK: %[[CMP69UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMP69UNIFORM]], label %[[FORBODY71UNIFORM:.+]], label %[[FOREND75UNIFORM:.+]]
+
+; CHECK: [[FORBODY71UNIFORM]]:
+; CHECK: br label %[[FORCOND67UNIFORM]]
+
+; CHECK: [[FOREND75UNIFORM]]:
+; CHECK: %[[TOBOOL77UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[TOBOOL77UNIFORM]], label %[[IFEND79UNIFORM:.+]], label %[[FORCOND87PREHEADERUNIFORM:.+]]
+
+; CHECK: [[IFEND79UNIFORM]]:
+; CHECK: br label %[[WHILEBODYUNIFORM]]
+
+; CHECK: [[IFEND55UNIFORM:.+]]:
+; CHECK: br label %[[QUNIFORM:.+]]
+
+; CHECK: [[FORCOND87PREHEADERUNIFORM]]:
+; CHECK: br label %[[FORCOND87UNIFORM:.+]]
+
+; CHECK: [[FORCOND87UNIFORM]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(true)}}, label %[[ULOOPEXITUNIFORM:.+]], label %[[FORBODY91UNIFORM:.+]]
+
+; CHECK: [[FORBODY91UNIFORM]]:
+; CHECK: br label %[[FORCOND87UNIFORM]]
+
+; CHECK: [[ULOOPEXITUNIFORM]]:
+; CHECK: br label %[[UUNIFORM:.+]]
+
+; CHECK: [[IFTHEN23UNIFORM]]:
+; CHECK: br label %[[FORCOND26UNIFORM:.+]]
+
+; CHECK: [[FORCOND26UNIFORM]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY30UNIFORM:.+]], label %[[JLOOPEXITUNIFORM:.+]]
+
+; CHECK: [[FORBODY30UNIFORM]]:
+; CHECK: br label %[[FORCOND26UNIFORM]]
+
+; CHECK: [[JLOOPEXITUNIFORM]]:
+; CHECK: br label %[[JUNIFORM:.+]]
+
+; CHECK: [[FUNIFORM]]:
+; CHECK: br label %[[JUNIFORM]]
+
+; CHECK: [[JUNIFORM]]:
+; CHECK: %[[CMP82UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMP82UNIFORM]], label %[[QUNIFORM]], label %[[UUNIFORM]]
+
+; CHECK: [[UUNIFORM]]:
+; CHECK: br label %[[FORCOND107UNIFORM:.+]]
+
+; CHECK: [[FORCOND107UNIFORM]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY111UNIFORM:.+]], label %[[ULOOPEXIT1UNIFORM:.+]]
+
+; CHECK: [[FORBODY111UNIFORM]]:
+; CHECK: br label %[[FORCOND107UNIFORM]]
+
+; CHECK: [[VLOOPEXIT1UNIFORM:.+]]:
+; CHECK: br label %[[VUNIFORM:.+]]
+
+; CHECK: [[QUNIFORM]]:
+; CHECK: br label %[[FORCOND97UNIFORM:.+]]
+
+; CHECK: [[FORCOND97UNIFORM]]:
+; CHECK: %[[CMP99UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMP99UNIFORM]], label %[[FORBODY101UNIFORM:.+]], label %[[VLOOPEXITUNIFORM:.+]]
+
+; CHECK: [[FORBODY101UNIFORM]]:
+; CHECK: br label %[[FORCOND97UNIFORM]]
+
+; CHECK: [[VLOOPEXITUNIFORM]]:
+; CHECK: br label %[[V:.+]]
+
+; CHECK: [[IFTHEN38]]:
+; CHECK: %[[CMP42:cmp.+]] = icmp
+; CHECK: br i1 %[[CMP42]], label %[[IFTHEN44:.+]], label %[[IFELSE46:.+]]
+
+; CHECK: [[IFTHEN44]]:
+; CHECK: br label %[[IFELSE48:.+]]
+
+; CHECK: [[IFELSE46]]:
+; CHECK: br label %[[IFELSE48]]
+
+; CHECK: [[IFELSE48]]:
+; CHECK: %[[CMP49:.+]] = icmp
+; CHECK: br i1 %[[CMP49]], label %[[IFTHEN51:.+]], label %[[IFELSE53:.+]]
+
+; CHECK: [[IFTHEN51]]:
+; CHECK: br label %[[N58:.+]]
+
+; CHECK: [[IFELSE53]]:
+; CHECK: br label %[[N58]]
+
+; CHECK: [[M:.+]]:
+; CHECK: br label %[[P:.+]]
+
+; CHECK: [[IFTHEN55:.+]]:
+; CHECK: br label %[[IFTHEN55ELSE:.+]]
+
+; CHECK: [[IFTHEN55ELSE]]:
+; CHECK: br label %[[FORCOND87PREHEADER:.+]]
+
+; CHECK: [[N58]]:
+; CHECK: br label %[[M]]
+
+; CHECK: [[P]]:
+; CHECK: br i1 %{{.+}}, label %[[WHILEBODY20]], label %[[WHILEBODY20PUREEXIT:.+]]
+
+; CHECK: [[WHILEBODY20PUREEXIT]]:
+; CHECK: br label %[[R:.+]]
+
+; CHECK: [[R]]:
+; CHECK: br label %[[FORCOND67:.+]]
+
+; CHECK: [[FORCOND67]]:
+; CHECK: %[[CMP69:.+]] = icmp
+; CHECK: br i1 %[[CMP69]], label %[[FORBODY71:.+]], label %[[FOREND75:.+]]
+
+; CHECK: [[FORBODY71]]:
+; CHECK: br label %[[FORCOND67]]
+
+; CHECK: [[FOREND75]]:
+; CHECK: br label %[[IFEND79:.+]]
+
+; CHECK: [[FORCOND87PREHEADER]]:
+; CHECK: br label %[[FORCOND87:.+]]
+
+; CHECK: [[FORCOND87PREHEADERELSE:.+]]:
+; CHECK: br i1 %{{.+}}, label %[[IFTHEN23ELSE]], label %[[IFTHEN23SPLIT]]
+
+; CHECK: [[IFEND79]]:
+; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT]]
+
+; CHECK: [[WHILEBODYPUREEXIT]]:
+; CHECK: br label %[[IFTHEN55]]
+
+; CHECK: [[F]]:
+; CHECK: br label %[[WHILEBODYPUREEXIT]]
+
+; CHECK: [[FELSE]]:
+; CHECK: br label %[[U:.+]]
+
+; CHECK: [[FSPLIT]]:
+; CHECK: br label %[[J:.+]]
+
+; CHECK: [[JLOOPEXIT]]:
+; CHECK: br label %[[J]]
+
+; CHECK: [[J]]:
+; CHECK: %[[CMP82:.+]] = icmp
+; CHECK: br i1 %[[CMP82]], label %[[Q:.+]], label %[[U]]
+
+; CHECK: [[FORCOND87]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(true)}}, label %[[ULOOPEXIT:.+]], label %[[FORBODY91:.+]]
+
+; CHECK: [[FORBODY91]]:
+; CHECK: br label %[[FORCOND87]]
+
+; CHECK: [[Q]]:
+; CHECK: br label %[[FORCOND97:.+]]
+
+; CHECK: [[FORCOND97]]:
+; CHECK: %[[CMP99:.+]] = icmp
+; CHECK: br i1 %[[CMP99]], label %[[FORBODY101:.+]], label %[[VLOOPEXIT:.+]]
+
+; CHECK: [[FORBODY101]]:
+; CHECK: br label %[[FORCOND97]]
+
+; CHECK: [[ULOOPEXIT]]:
+; CHECK: br label %[[FORCOND87PREHEADERELSE]]
+
+; CHECK: [[U]]:
+; CHECK: br label %[[FORCOND107:.+]]
+
+; CHECK: [[FORCOND107]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY111:.+]], label %[[VLOOPEXIT1:.+]]
+
+; CHECK: [[FORBODY111]]:
+; CHECK: br label %[[FORCOND107]]
+
+; CHECK: [[VLOOPEXIT]]:
+; CHECK: br label %[[V]]
+
+; CHECK: [[VLOOPEXIT1]]:
+; CHECK: br label %[[Q]]
+
+; CHECK: [[V]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization13.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization13.ll
new file mode 100644
index 0000000000000..67d9468d3bc33
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization13.ll
@@ -0,0 +1,247 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k partial_linearization13 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+
+; The CFG of the following kernel is:
+;
+;     a
+;    / \
+;   b   c
+;    \ / \
+;     |   \
+;     |    d
+;     |   / \
+;     |  |   e
+;     |   \ /
+;     |    f
+;     |   / \
+;     |  |   g
+;     |   \ /
+;      \   h
+;       \ /
+;        i
+;
+; * where nodes d and f are uniform branches, and nodes a and c are varying
+;   branches.
+; * where nodes b, c, i are divergent.
+;
+; With BOSCC, it will be transformed as follows:
+;
+;     a___________
+;    / \          \
+;   b   c_________ c'
+;    \ / \        \|
+;     |   \        d'
+;     |    d      / \
+;     |   / \    |   e'
+;     |  |   e    \ /
+;     |   \ /      f'
+;     |    f      / \
+;     |   / \    |   g'
+;     |  |   g    \ /
+;     |   \ /      h'
+;      \   h       |
+;       \ /        b'
+;        i         |
+;        `--> & <- i'
+;
+; where '&' represents merge blocks of BOSCC regions.
+;
+; __kernel void partial_linearization13(__global int *out, int n) {
+;   size_t tid = get_global_id(0);
+;   size_t size = get_global_size(0);
+;   // a
+;   if (tid + 1 < size) {
+;     // b
+;     out[tid] = n;
+;   } else if (tid + 1 == size) { // c
+;     size_t leftovers = 1 + (size & 1);
+;     switch (leftovers) { // d
+;       case 2: // e
+;         out[tid] = 2 * n + 1;
+;         // fall through
+;       case 1: // f
+;         out[tid] += 3 * n - 1;
+;         break;
+;     }
+;     switch (leftovers) { // still f (uniform branch)
+;       case 2: // g
+;         out[tid] /= n;
+;         // fall through
+;       case 1: // h
+;         out[tid]--;
+;         break;
+;     }
+;   }
+;   // i
+; }
+
+; ModuleID = 'Unknown buffer'
+source_filename = "Unknown buffer"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+; Input kernel for this test: IR form of the OpenCL C source quoted above; block labels are annotated with the CFG node (a-i) they implement.
+; Function Attrs: nounwind
+define spir_kernel void @partial_linearization13(i32 addrspace(1)* %out, i32 %n) #0 {
+entry: ; CFG node a
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2 ; tid
+  %call1 = call spir_func i64 @_Z15get_global_sizej(i32 0) #2 ; size
+  %add = add i64 %call, 1 ; tid + 1
+  %cmp = icmp ult i64 %add, %call1 ; tid + 1 < size (unsigned)
+  br i1 %cmp, label %if.then, label %if.else ; condition depends on tid: varying branch
+
+if.then:                                          ; preds = %entry -- CFG node b
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call ; &out[tid]
+  store i32 %n, i32 addrspace(1)* %arrayidx, align 4 ; out[tid] = n
+  br label %if.end17
+
+if.else:                                          ; preds = %entry -- CFG node c
+  %add2 = add i64 %call, 1 ; tid + 1 again (same value as %add)
+  %cmp3 = icmp eq i64 %add2, %call1 ; tid + 1 == size
+  br i1 %cmp3, label %if.then4, label %if.end17 ; condition depends on tid: varying branch
+
+if.then4:                                         ; preds = %if.else -- CFG node d
+  %0 = and i64 %call1, 1 ; size & 1
+  %trunc = icmp eq i64 %0, 0 ; true when size is even, i.e. leftovers == 1
+  br i1 %trunc, label %sw.bb8, label %sw.bb ; depends only on size: uniform branch; even size skips case 2
+
+sw.bb:                                            ; preds = %if.then4 -- CFG node e (first switch, case 2)
+  %mul = shl nsw i32 %n, 1 ; 2 * n
+  %add6 = or i32 %mul, 1 ; 2 * n + 1 (low bit of %mul is zero, so or behaves as add)
+  %arrayidx7 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call ; &out[tid]
+  store i32 %add6, i32 addrspace(1)* %arrayidx7, align 4 ; out[tid] = 2 * n + 1
+  br label %sw.bb8 ; fall through into case 1
+
+sw.bb8:                                           ; preds = %sw.bb, %if.then4 -- CFG node f (first switch, case 1, plus dispatch of the second switch)
+  %mul9 = mul nsw i32 %n, 3 ; 3 * n
+  %sub = add nsw i32 %mul9, -1 ; 3 * n - 1
+  %arrayidx10 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call ; &out[tid]
+  %1 = load i32, i32 addrspace(1)* %arrayidx10, align 4 ; current out[tid]
+  %add11 = add nsw i32 %sub, %1 ; out[tid] + 3 * n - 1
+  store i32 %add11, i32 addrspace(1)* %arrayidx10, align 4 ; out[tid] += 3 * n - 1
+  %2 = and i64 %call1, 1 ; size & 1
+  %trunc2 = icmp ne i64 %2, 0 ; true when size is odd, i.e. leftovers == 2
+  %trunc2.off = add i1 %trunc2, true ; adding true in i1 arithmetic flips the bit
+  %switch = icmp ult i1 %trunc2.off, true ; true iff %trunc2.off is false, i.e. size is odd
+  br i1 %switch, label %sw.bb12, label %sw.bb14 ; depends only on size: uniform branch; odd size takes case 2
+
+sw.bb12:                                          ; preds = %sw.bb8 -- CFG node g (second switch, case 2)
+  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call ; &out[tid]
+  %3 = load i32, i32 addrspace(1)* %arrayidx13, align 4 ; dividend: out[tid]
+  %4 = icmp eq i32 %3, -2147483648 ; dividend is the minimum i32
+  %5 = icmp eq i32 %n, -1 ; divisor is -1
+  %6 = and i1 %5, %4 ; minimum i32 divided by -1 would overflow
+  %7 = icmp eq i32 %n, 0 ; divisor is zero
+  %8 = or i1 %7, %6 ; either case makes sdiv undefined
+  %9 = select i1 %8, i32 1, i32 %n ; substitute a divisor of 1 in those cases
+  %div = sdiv i32 %3, %9 ; out[tid] / n with the guarded divisor
+  store i32 %div, i32 addrspace(1)* %arrayidx13, align 4 ; out[tid] /= n
+  br label %sw.bb14 ; fall through into case 1
+
+sw.bb14:                                          ; preds = %sw.bb12, %sw.bb8 -- CFG node h (second switch, case 1)
+  %arrayidx15 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call ; &out[tid]
+  %10 = load i32, i32 addrspace(1)* %arrayidx15, align 4 ; current out[tid]
+  %dec = add nsw i32 %10, -1 ; out[tid] - 1
+  store i32 %dec, i32 addrspace(1)* %arrayidx15, align 4 ; out[tid]--
+  br label %if.end17
+
+if.end17:                                         ; preds = %sw.bb14, %if.else, %if.then -- CFG node i (join of all paths)
+  ret void
+}
+; Builtin declarations, attribute groups and OpenCL kernel metadata referenced by @partial_linearization13 above.
+; Function Attrs: nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1 ; Itanium-mangled get_global_id(unsigned int)
+
+; Function Attrs: nounwind readonly
+declare spir_func i64 @_Z15get_global_sizej(i32) #1 ; Itanium-mangled get_global_size(unsigned int)
+
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nobuiltin nounwind readonly } ; attached to the two builtin call sites in the kernel
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.kernels = !{!2} ; the module's only kernel entry
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2} ; shared by the ocl and spir version nodes -- presumably major 1, minor 2
+!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization13, !3, !4, !5, !6, !7, !8} ; kernel function followed by its per-argument info nodes
+!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} ; one entry per argument: out -> 1, n -> 0 (matches the addrspace(1) pointer in the signature)
+!4 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!5 = !{!"kernel_arg_type", !"int*", !"int"}
+!6 = !{!"kernel_arg_base_type", !"int*", !"int"}
+!7 = !{!"kernel_arg_type_qual", !"", !""}
+!8 = !{!"kernel_arg_name", !"out", !"n"}
+
+; CHECK: spir_kernel void @__vecz_v4_partial_linearization13
+; CHECK: br i1 %{{.+}}, label %[[IFTHENUNIFORM:.+]], label %[[ENTRYBOSCCINDIR:.+]]
+
+; CHECK: [[IFELSEUNIFORM:.+]]:
+; CHECK: br i1 %{{.+}}, label %[[IFTHEN4UNIFORM:.+]], label %[[IFELSEUNIFORMBOSCCINDIR:.+]]
+
+; CHECK: [[IFTHEN4UNIFORM]]:
+; CHECK: %[[TRUNCUNIFORM:.+]] = icmp
+; CHECK: br i1 %[[TRUNCUNIFORM]], label %[[SWBB8UNIFORM:.+]], label %[[SWBBUNIFORM:.+]]
+
+; CHECK: [[IFELSEUNIFORMBOSCCINDIR]]:
+; CHECK: br i1 %{{.+}}, label %[[IFEND17UNIFORM:.+]], label %[[IFTHEN4:.+]]
+
+; CHECK: [[SWBBUNIFORM]]:
+; CHECK: br label %[[SWBB8UNIFORM]]
+
+; CHECK: [[SWBB8UNIFORM]]:
+; CHECK: %[[TRUNC2UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[TRUNC2UNIFORM]], label %[[SWBB14UNIFORM:.+]], label %[[SWBB12UNIFORM:.+]]
+
+; CHECK: [[SWBB12UNIFORM]]:
+; CHECK: br label %[[SWBB14UNIFORM]]
+
+; CHECK: [[SWBB14UNIFORM]]:
+; CHECK: br label %[[IFEND17UNIFORM]]
+
+; CHECK: [[IFTHENUNIFORM]]:
+; CHECK: br label %[[IFEND17:.+]]
+
+; CHECK: [[ENTRYBOSCCINDIR]]:
+; CHECK: br i1 %{{.+}}, label %[[IFELSEUNIFORM]], label %[[IFELSE:.+]]
+
+; CHECK: [[IFTHEN:.+]]:
+; CHECK: br label %[[IFEND17]]
+
+; CHECK: [[IFELSE]]:
+; CHECK: br label %[[IFTHEN4]]
+
+; CHECK: [[IFTHEN4]]:
+; CHECK: %[[TRUNC:.+]] = icmp
+; CHECK: br i1 %[[TRUNC]], label %[[SWBB8:.+]], label %[[SWBB:.+]]
+
+; CHECK: [[SWBB]]:
+; CHECK: br label %[[SWBB8]]
+
+; CHECK: [[SWBB8]]:
+; CHECK: %[[TRUNC2:.+]] = icmp
+; CHECK: br i1 %[[TRUNC2]], label %[[SWBB14:.+]], label %[[SWBB12:.+]]
+
+; CHECK: [[SWBB12]]:
+; CHECK: br label %[[SWBB14]]
+
+; CHECK: [[SWBB14]]:
+; CHECK: br label %[[IFTHEN]]
+
+; CHECK: [[IFEND17]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization14.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization14.ll
new file mode 100644
index 0000000000000..dcadd516f3ae2
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization14.ll
@@ -0,0 +1,356 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k partial_linearization14 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+
+; The CFG of the following kernel is:
+;
+;     a
+;    / \
+;   b   c <-.
+;   |  / \  |
+;   | d   e |
+;   |/ \ /  |
+;   f   g --'
+;    \  |
+;     \ h
+;      \|
+;       i
+;
+; * where nodes a, d and g are uniform branches, and node c is a varying
+;   branch.
+; * where nodes d, e, f, g, h and i are divergent.
+;
+; With BOSCC, it will be transformed as follows:
+;
+;       a
+;      / \
+;     b   c <-.   c' <.
+;    /   / \__|__ |   |
+;   |   d   e |  `e'  |
+;   |  / \ /  |   |   |
+;   | f   g --'   d'  |
+;    \ \  |       |   |
+;     \ \ h       g' -'
+;      \ \|       |
+;       \ i       h'
+;        \|      /
+;         \     /
+;        / \   /
+;        |  \ /
+;        |   f'
+;        |   |
+;        |   i'
+;         \ /
+;          &
+;
+; where '&' represents merge blocks of BOSCC regions.
+;
+; __kernel void partial_linearization14(__global int *out, int n) {
+;   int id = get_global_id(0);
+;   int ret = 0;
+;   int i = 0;
+;
+;   if (n < 5) {
+;     for (int i = 0; i < n + 10; i++) ret++;
+;     goto f;
+;   } else {
+;     while (1) {
+;       if (id + i % 2 == 0) {
+;         if (n > 2) {
+;           goto f;
+;         }
+;       } else {
+;         for (int i = 0; i < n + 10; i++) ret++;
+;       }
+;       if (n <= 2) break;
+;     }
+;   }
+;
+;   ret += n * 2;
+;   for (int i = 0; i < n * 2; i++) ret -= i;
+;   ret /= n;
+;   goto early;
+;
+; f:
+;   for (int i = 0; i < n + 5; i++) ret /= 2;
+;   ret -= n;
+;
+; early:
+;   out[id] = ret;
+; }
+
+; ModuleID = 'Unknown buffer'
+source_filename = "kernel.opencl"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+; Input kernel for this test: IR form of the OpenCL C source quoted above; block labels are annotated with the CFG node (a-i) they implement.
+; Function Attrs: convergent nounwind
+define spir_kernel void @partial_linearization14(i32 addrspace(1)* %out, i32 %n) #0 {
+entry: ; CFG node a
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2 ; get_global_id(0)
+  %conv = trunc i64 %call to i32 ; id
+  %cmp = icmp slt i32 %n, 5 ; n < 5
+  br i1 %cmp, label %for.cond, label %while.body ; depends only on the kernel argument n: uniform branch
+
+for.cond:                                         ; preds = %for.body, %entry -- CFG node b (header of the first for loop)
+  %ret.0 = phi i32 [ %inc, %for.body ], [ 0, %entry ] ; ret
+  %storemerge4 = phi i32 [ %inc5, %for.body ], [ 0, %entry ] ; loop counter i
+  %add = add nsw i32 %n, 10 ; n + 10
+  %cmp3 = icmp slt i32 %storemerge4, %add ; i < n + 10
+  br i1 %cmp3, label %for.body, label %f ; on loop exit: goto f
+
+for.body:                                         ; preds = %for.cond
+  %inc = add nuw nsw i32 %ret.0, 1 ; ret++
+  %inc5 = add nuw nsw i32 %storemerge4, 1 ; i++
+  br label %for.cond
+
+while.body:                                       ; preds = %if.end24, %entry -- CFG node c (header of the while (1) loop)
+  %ret.1 = phi i32 [ 0, %entry ], [ %ret.3, %if.end24 ] ; ret carried around the while loop
+  %cmp7 = icmp eq i32 %conv, 0 ; id == 0 (source: id + i % 2 == 0 with the outer i equal to 0)
+  br i1 %cmp7, label %if.then9, label %for.cond15 ; condition depends on id: varying branch
+
+if.then9:                                         ; preds = %while.body -- CFG node d
+  %cmp10 = icmp sgt i32 %n, 2 ; n > 2
+  br i1 %cmp10, label %f, label %if.end24 ; uniform branch: goto f (leaves the while loop) or continue to the latch
+
+for.cond15:                                       ; preds = %for.body19, %while.body -- CFG node e (header of the inner for loop)
+  %ret.2 = phi i32 [ %inc20, %for.body19 ], [ %ret.1, %while.body ] ; ret
+  %storemerge = phi i32 [ %inc22, %for.body19 ], [ 0, %while.body ] ; loop counter i
+  %add16 = add nsw i32 %n, 10 ; n + 10
+  %cmp17 = icmp slt i32 %storemerge, %add16 ; i < n + 10
+  br i1 %cmp17, label %for.body19, label %if.end24
+
+for.body19:                                       ; preds = %for.cond15
+  %inc20 = add nsw i32 %ret.2, 1 ; ret++
+  %inc22 = add nuw nsw i32 %storemerge, 1 ; i++
+  br label %for.cond15
+
+if.end24:                                         ; preds = %for.cond15, %if.then9 -- CFG node g (latch of the while loop)
+  %ret.3 = phi i32 [ %ret.1, %if.then9 ], [ %ret.2, %for.cond15 ] ; ret merged from nodes d and e
+  %cmp25 = icmp slt i32 %n, 3 ; n < 3, i.e. n <= 2
+  br i1 %cmp25, label %if.end29, label %while.body ; uniform branch: break out of the loop, or take the back edge
+
+if.end29:                                         ; preds = %if.end24 -- CFG node h (presumably the diagram folds the following loop into h)
+  %mul = mul i32 %n, 2 ; n * 2
+  %add30 = add nsw i32 %ret.3, %mul ; ret += n * 2
+  br label %for.cond32
+
+for.cond32:                                       ; preds = %for.body36, %if.end29
+  %ret.4 = phi i32 [ %add30, %if.end29 ], [ %sub, %for.body36 ] ; ret
+  %storemerge1 = phi i32 [ 0, %if.end29 ], [ %inc38, %for.body36 ] ; loop counter i
+  %mul33 = shl nsw i32 %n, 1 ; n * 2
+  %cmp34 = icmp slt i32 %storemerge1, %mul33 ; i < n * 2
+  br i1 %cmp34, label %for.body36, label %for.end39
+
+for.body36:                                       ; preds = %for.cond32
+  %sub = sub nsw i32 %ret.4, %storemerge1 ; ret -= i
+  %inc38 = add nuw nsw i32 %storemerge1, 1 ; i++
+  br label %for.cond32
+
+for.end39:                                        ; preds = %for.cond32
+  %0 = icmp eq i32 %ret.4, -2147483648 ; dividend is the minimum i32
+  %1 = icmp eq i32 %n, -1 ; divisor is -1
+  %2 = and i1 %1, %0 ; minimum i32 divided by -1 would overflow
+  %3 = icmp eq i32 %n, 0 ; divisor is zero
+  %4 = or i1 %3, %2 ; either case makes sdiv undefined
+  %5 = select i1 %4, i32 1, i32 %n ; substitute a divisor of 1 in those cases
+  %div = sdiv i32 %ret.4, %5 ; ret /= n with the guarded divisor
+  br label %early ; goto early
+
+f:                                                ; preds = %if.then9, %for.cond -- CFG node f (target of both goto f statements)
+  %ret.5 = phi i32 [ %ret.0, %for.cond ], [ %ret.1, %if.then9 ] ; ret merged from nodes b and d
+  br label %for.cond41
+
+for.cond41:                                       ; preds = %for.body45, %f
+  %ret.6 = phi i32 [ %ret.5, %f ], [ %div46, %for.body45 ] ; ret
+  %storemerge3 = phi i32 [ 0, %f ], [ %inc48, %for.body45 ] ; loop counter i
+  %add42 = add nsw i32 %n, 5 ; n + 5
+  %cmp43 = icmp slt i32 %storemerge3, %add42 ; i < n + 5
+  br i1 %cmp43, label %for.body45, label %for.end49
+
+for.body45:                                       ; preds = %for.cond41
+  %div46 = sdiv i32 %ret.6, 2 ; ret /= 2
+  %inc48 = add nuw nsw i32 %storemerge3, 1 ; i++
+  br label %for.cond41
+
+for.end49:                                        ; preds = %for.cond41
+  %sub50 = sub nsw i32 %ret.6, %n ; ret -= n
+  br label %early
+
+early:                                            ; preds = %for.end49, %for.end39 -- CFG node i (join of the h and f paths)
+  %storemerge2 = phi i32 [ %div, %for.end39 ], [ %sub50, %for.end49 ] ; final ret
+  %idxprom = sext i32 %conv to i64 ; id widened for indexing
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom ; &out[id]
+  store i32 %storemerge2, i32 addrspace(1)* %arrayidx, align 4 ; out[id] = ret
+  ret void
+}
+; Builtin declaration, attribute groups and OpenCL kernel metadata referenced by @partial_linearization14 above.
+; Function Attrs: convergent nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1 ; Itanium-mangled get_global_id(unsigned int)
+
+attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { convergent nobuiltin nounwind readonly } ; attached to the builtin call site in the kernel
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.kernels = !{!2} ; the module's only kernel entry
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2} ; shared by the ocl and spir version nodes -- presumably major 1, minor 2
+!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization14, !3, !4, !5, !6, !7, !8} ; kernel function followed by its per-argument info nodes
+!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} ; one entry per argument: out -> 1, n -> 0 (matches the addrspace(1) pointer in the signature)
+!4 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!5 = !{!"kernel_arg_type", !"int*", !"int"}
+!6 = !{!"kernel_arg_base_type", !"int*", !"int"}
+!7 = !{!"kernel_arg_type_qual", !"", !""}
+!8 = !{!"kernel_arg_name", !"out", !"n"}
+
+; CHECK: spir_kernel void @__vecz_v4_partial_linearization14
+; CHECK: %[[CMP:.+]] = icmp
+; CHECK: br i1 %[[CMP]], label %[[FORCONDPREHEADER:.+]], label %[[WHILEBODYPREHEADER:.+]]
+
+; CHECK: [[WHILEBODYPREHEADER]]:
+; CHECK: br i1 true, label %[[WHILEBODYUNIFORM:.+]], label %[[WHILEBODY:.+]]
+
+; CHECK: [[FORCONDPREHEADER]]:
+; CHECK: br label %[[FORCOND:.+]]
+
+; CHECK: [[FORCOND]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[FLOOPEXIT:.+]]
+
+; CHECK: [[FORBODY]]:
+; CHECK: br label %[[FORCOND]]
+
+; CHECK: [[WHILEBODY]]:
+; CHECK: br label %[[FORCOND15PREHEADER:.+]]
+
+; CHECK: [[WHILEBODYUNIFORM]]:
+; CHECK: br i1 %{{.+}}, label %[[IFTHEN9UNIFORM:.+]], label %[[WHILEBODYUNIFORMBOSCCINDIR:.+]]
+
+; CHECK: [[FORCOND15PREHEADERUNIFORM:.+]]:
+; CHECK: br label %[[FORCOND15UNIFORM:.+]]
+
+; CHECK: [[FORCOND15UNIFORM]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY19UNIFORM:.+]], label %[[IFEND24LOOPEXITUNIFORM:.+]]
+
+; CHECK: [[FORBODY19UNIFORM]]:
+; CHECK: br label %[[FORCOND15UNIFORM]]
+
+; CHECK: [[IFEND24LOOPEXITUNIFORM]]:
+; CHECK: br label %[[IFEND24UNIFORM:.+]]
+
+; CHECK: [[IFTHEN9UNIFORM]]:
+; CHECK: %[[CMP10UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMP10UNIFORM]], label %[[FLOOPEXIT1UNIFORM:.+]], label %[[IFEND24UNIFORM]]
+
+; CHECK: [[WHILEBODYUNIFORMBOSCCINDIR]]:
+; CHECK: br i1 %{{.+}}, label %[[FORCOND15PREHEADERUNIFORM]], label %[[WHILEBODYUNIFORMBOSCCSTORE:.+]]
+
+; CHECK: [[WHILEBODYUNIFORMBOSCCSTORE]]:
+; CHECK: br label %[[FORCOND15PREHEADER]]
+
+; CHECK: [[IFEND24UNIFORM]]:
+; CHECK: %[[CMP25UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMP25UNIFORM]], label %[[IFEND29UNIFORM:.+]], label %[[WHILEBODYUNIFORM]]
+
+; CHECK: [[IFEND29UNIFORM]]:
+; CHECK: br label %[[FORCOND32UNIFORM:.+]]
+
+; CHECK: [[FORCOND32UNIFORM]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY36UNIFORM:.+]], label %[[FOREND39UNIFORM:.+]]
+
+; CHECK: [[FORBODY36UNIFORM]]:
+; CHECK: br label %[[FORCOND32UNIFORM]]
+
+; CHECK: [[FOREND39UNIFORM]]:
+; CHECK: br label %[[EARLYUNIFORM:.+]]
+
+; CHECK: [[FLOOPEXIT1UNIFORM]]:
+; CHECK: br label %[[FUNIFORM:.+]]
+
+; CHECK: [[FUNIFORM]]:
+; CHECK: br label %[[FORCOND41UNIFORM:.+]]
+
+; CHECK: [[FORCOND41UNIFORM]]:
+; CHECK: %[[CMP43UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMP43UNIFORM]], label %[[FORBODY45UNIFORM:.+]], label %[[FOREND49UNIFORM:.+]]
+
+; CHECK: [[FORBODY45UNIFORM]]:
+; CHECK: br label %[[FORCOND41UNIFORM]]
+
+; CHECK: [[FOREND49UNIFORM]]:
+; CHECK: br label %[[EARLY:.+]]
+
+; CHECK: [[FORCOND15PREHEADER]]:
+; CHECK: br label %[[FORCOND15:.+]]
+
+; CHECK: [[IFTHEN9:.+]]:
+; CHECK: br label %[[IFEND24:.+]]
+
+; CHECK: [[FORCOND15]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY19:.+]], label %[[IFEND24LOOPEXIT:.+]]
+
+; CHECK: [[FORBODY19]]:
+; CHECK: br label %[[FORCOND15]]
+
+; CHECK: [[IFEND24LOOPEXIT]]:
+; CHECK: br label %[[IFTHEN9]]
+
+; CHECK: [[IFEND24]]:
+; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]]
+
+; CHECK: [[WHILEBODYPUREEXIT]]:
+; CHECK: br label %[[IFEND29:.+]]
+
+; CHECK: [[IFEND29]]:
+; CHECK: br label %[[FORCOND32:.+]]
+
+; CHECK: [[IFEND29ELSE:.+]]:
+; CHECK: br label %[[FLOOPEXIT2:.+]]
+
+; CHECK: [[FORCOND32]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY36:.+]], label %[[FOREND39:.+]]
+
+; CHECK: [[FORBODY36]]:
+; CHECK: br label %[[FORCOND32]]
+
+; CHECK: [[FOREND39]]:
+; CHECK: br label %[[IFEND29ELSE]]
+
+; CHECK: [[FLOOPEXIT]]:
+; CHECK: br label %[[F:.+]]
+
+; CHECK: [[FLOOPEXIT2]]:
+; CHECK: br label %[[F]]
+
+; CHECK: [[F]]:
+; CHECK: br label %[[FORCOND41:.+]]
+
+; CHECK: [[FORCOND41]]:
+; CHECK: %[[CMP43:.+]] = icmp
+; CHECK: br i1 %[[CMP43]], label %[[FORBODY45:.+]], label %[[FOREND49:.+]]
+
+; CHECK: [[FORBODY45]]:
+; CHECK: br label %[[FORCOND41]]
+
+; CHECK: [[FOREND49]]:
+; CHECK: br label %[[EARLY]]
+
+; CHECK: [[EARLY]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization15.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization15.ll
new file mode 100644
index 0000000000000..81580111e2e4a
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization15.ll
@@ -0,0 +1,415 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k partial_linearization15 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+
+; The CFG of the following kernel is:
+;
+;           a
+;           |
+;           b <-----.
+;          / \      |
+;         c   d     |
+;        / \ /      |
+;       /   e       |
+;      /    |       |
+;     /     g <---. |
+;    /     / \    | |
+;   f     h   i   | |
+;   |    / \ / \  | |
+;   |   |   j   k | |
+;   |    \ / \ /  | |
+;   |     l   m --' |
+;   |    /          |
+;   |   o ----------'
+;   |   |
+;   n   p
+;    \ /
+;     q
+;
+; * where nodes b, c, g, h, j and o are uniform branches, and node i is a
+;   varying branch.
+; * where nodes j, k, m, l, and o are divergent.
+;
+; With BOSCC, it will be transformed as follows:
+;
+;         a
+;         |
+;         b <------------------.
+;        / \                   |
+;       c   d                  |
+;      / \ /                   |
+;     /   e                    |
+;    /    |                    |
+;   f     g <---.     g' <---. |
+;   |    / \    |    / \     | |
+;   |   h   i___|__ h'  i'   | |
+;   |  / \ / \  |  \|__ |    | |
+;   | |   j   k |   |  `k'   | |
+;   |  \ / \ /  |    \ /     | |
+;   |   l   m --'     j'     | |
+;   |   |             |      | |
+;   |   |             m'-----' |
+;    \  |             |        |
+;     \ `----> & <--- l'       |
+;      \      /                |
+;       \    o ----------------'
+;        |   |
+;        n   p
+;         \ /
+;          q
+;
+; where '&' represents merge blocks of BOSCC regions.
+;
+; __kernel void partial_linearization15(__global int *out, int n) {
+;   int id = get_global_id(0);
+;   int ret = 0;
+;
+;   while (1) {
+;     if (n > 0) { // b
+;       // c
+;       for (int i = 0; i < n * 2; i++) ret++;
+;       if (n <= 10) {
+;         // f
+;         goto f;
+;       }
+;     } else {
+;       // d
+;       for (int i = 0; i < n / 4; i++) ret++;
+;     }
+;     // e
+;     ret++;
+;     while (1) {
+;       if (n & 1) { // g
+;         // h
+;         if (n < 3) {
+;           goto l;
+;         }
+;       } else {
+;         // i
+;         if (ret + id >= n) {
+;           // k
+;           ret /= n * n + ret;
+;           goto m;
+;         }
+;       }
+;       // j
+;       if (n & 1) {
+;         goto l;
+;       }
+;       // m
+; m:
+;       ret++;
+;     }
+; l:
+;     ret *= 4;
+; o:
+;     if (n & 1) {
+;       // p
+;       ret++;
+;       goto p;
+;     }
+;   }
+;
+; p:
+;   for (int i = 0; i < n / 4; i++) ret++;
+;   goto q;
+;
+; f:
+;   ret /= n;
+;   goto n;
+;
+; n:
+;   for (int i = 0; i < n * 2; i++) ret++;
+;
+; q:
+;   out[id] = ret;
+; }
+
+; ModuleID = 'Unknown buffer'
+source_filename = "kernel.opencl"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @partial_linearization15(i32 addrspace(1)* %out, i32 %n) #0 { ; IR form of the OpenCL C kernel quoted in the header comment
+entry:                                            ; CFG node a
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %conv = trunc i64 %call to i32                  ; id = (int)get_global_id(0)
+  br label %while.body
+
+while.body:                                       ; preds = %l, %entry -- node b: outer while(1) header
+  %ret.0 = phi i32 [ 0, %entry ], [ %mul40, %l ]
+  %cmp = icmp sgt i32 %n, 0                       ; n > 0
+  br i1 %cmp, label %for.cond, label %for.cond9
+
+for.cond:                                         ; preds = %for.body, %while.body -- node c: loop "i < n * 2"
+  %ret.1 = phi i32 [ %inc, %for.body ], [ %ret.0, %while.body ]
+  %storemerge3 = phi i32 [ %inc4, %for.body ], [ 0, %while.body ]
+  %mul = shl nsw i32 %n, 1                        ; n * 2
+  %cmp2 = icmp slt i32 %storemerge3, %mul
+  br i1 %cmp2, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %inc = add nsw i32 %ret.1, 1                    ; ret++
+  %inc4 = add nuw nsw i32 %storemerge3, 1         ; i++
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond -- exit test of node c
+  %cmp5 = icmp slt i32 %n, 11                     ; n <= 10 expressed as n < 11
+  br i1 %cmp5, label %f, label %if.end17
+
+for.cond9:                                        ; preds = %for.body12, %while.body -- node d: loop "i < n / 4"
+  %ret.2 = phi i32 [ %inc13, %for.body12 ], [ %ret.0, %while.body ]
+  %storemerge = phi i32 [ %inc15, %for.body12 ], [ 0, %while.body ]
+  %div = sdiv i32 %n, 4
+  %cmp10 = icmp slt i32 %storemerge, %div
+  br i1 %cmp10, label %for.body12, label %if.end17
+
+for.body12:                                       ; preds = %for.cond9
+  %inc13 = add nsw i32 %ret.2, 1                  ; ret++
+  %inc15 = add nuw nsw i32 %storemerge, 1         ; i++
+  br label %for.cond9
+
+if.end17:                                         ; preds = %for.cond9, %for.end -- node e
+  %ret.3 = phi i32 [ %ret.1, %for.end ], [ %ret.2, %for.cond9 ]
+  br label %while.body20
+
+while.body20:                                     ; preds = %m, %if.end17 -- node g: inner while(1) header
+  %storemerge1.in = phi i32 [ %ret.3, %if.end17 ], [ %ret.4, %m ]
+  %storemerge1 = add nsw i32 %storemerge1.in, 1   ; ret + 1, applied on entry from both %if.end17 and %m
+  %and = and i32 %n, 1
+  %tobool = icmp eq i32 %and, 0                   ; true when (n & 1) == 0, i.e. the else side
+  br i1 %tobool, label %if.else26, label %if.then21
+
+if.then21:                                        ; preds = %while.body20 -- node h
+  %cmp22 = icmp slt i32 %n, 3                     ; n < 3 -> goto l
+  br i1 %cmp22, label %l, label %if.end34
+
+if.else26:                                        ; preds = %while.body20 -- node i: condition depends on %conv (the work-item id)
+  %add = add nsw i32 %storemerge1, %conv          ; ret + id
+  %cmp27 = icmp slt i32 %add, %n                  ; inverse of "ret + id >= n"
+  br i1 %cmp27, label %if.end34, label %if.then29
+
+if.then29:                                        ; preds = %if.else26 -- node k
+  %mul30 = mul nsw i32 %n, %n
+  %add31 = add nsw i32 %storemerge1, %mul30       ; n * n + ret
+  %0 = icmp eq i32 %add31, 0
+  %1 = select i1 %0, i32 1, i32 %add31            ; divisor replaced by 1 when it would be 0
+  %div32 = sdiv i32 %storemerge1, %1              ; ret /= n * n + ret
+  br label %m
+
+if.end34:                                         ; preds = %if.else26, %if.then21 -- node j
+  %and35 = and i32 %n, 1
+  %tobool36 = icmp eq i32 %and35, 0
+  br i1 %tobool36, label %m, label %l
+
+m:                                                ; preds = %if.end34, %if.then29 -- node m: inner loop latch
+  %ret.4 = phi i32 [ %div32, %if.then29 ], [ %storemerge1, %if.end34 ]
+  br label %while.body20
+
+l:                                                ; preds = %if.end34, %if.then21 -- labels l and o of the C source share this block
+  %mul40 = shl nsw i32 %storemerge1, 2            ; ret *= 4
+  %and41 = and i32 %n, 1
+  %tobool42 = icmp eq i32 %and41, 0               ; (n & 1) == 0 -> back to the outer loop header
+  br i1 %tobool42, label %while.body, label %if.then43
+
+if.then43:                                        ; preds = %l -- node p
+  %inc44 = or i32 %mul40, 1                       ; ret++: low bits of %mul40 are zero after the shl by 2, so "or 1" adds 1
+  br label %for.cond47
+
+for.cond47:                                       ; preds = %for.body51, %if.then43 -- loop "i < n / 4" after label p
+  %ret.5 = phi i32 [ %inc44, %if.then43 ], [ %inc52, %for.body51 ]
+  %storemerge2 = phi i32 [ 0, %if.then43 ], [ %inc54, %for.body51 ]
+  %div48 = sdiv i32 %n, 4
+  %cmp49 = icmp slt i32 %storemerge2, %div48
+  br i1 %cmp49, label %for.body51, label %q
+
+for.body51:                                       ; preds = %for.cond47
+  %inc52 = add nsw i32 %ret.5, 1                  ; ret++
+  %inc54 = add nuw nsw i32 %storemerge2, 1        ; i++
+  br label %for.cond47
+
+f:                                                ; preds = %for.end -- node f: ret /= n with a guarded divisor
+  %2 = icmp eq i32 %ret.1, -2147483648            ; ret == INT_MIN
+  %3 = icmp eq i32 %n, -1
+  %4 = and i1 %3, %2                              ; INT_MIN / -1 case
+  %5 = icmp eq i32 %n, 0                          ; division-by-zero case
+  %6 = or i1 %5, %4
+  %7 = select i1 %6, i32 1, i32 %n                ; use 1 as divisor in either of the two cases above
+  %div56 = sdiv i32 %ret.1, %7
+  br label %for.cond59
+
+for.cond59:                                       ; preds = %for.body63, %f -- node n: loop "i < n * 2"
+  %ret.6 = phi i32 [ %div56, %f ], [ %inc64, %for.body63 ]
+  %storemerge4 = phi i32 [ 0, %f ], [ %inc66, %for.body63 ]
+  %mul60 = shl nsw i32 %n, 1                      ; n * 2
+  %cmp61 = icmp slt i32 %storemerge4, %mul60
+  br i1 %cmp61, label %for.body63, label %q
+
+for.body63:                                       ; preds = %for.cond59
+  %inc64 = add nsw i32 %ret.6, 1                  ; ret++
+  %inc66 = add nuw nsw i32 %storemerge4, 1        ; i++
+  br label %for.cond59
+
+q:                                                ; preds = %for.cond59, %for.cond47 -- node q: single exit
+  %ret.7 = phi i32 [ %ret.5, %for.cond47 ], [ %ret.6, %for.cond59 ]
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %ret.7, i32 addrspace(1)* %arrayidx, align 4 ; out[id] = ret
+  ret void
+}
+
+; Function Attrs: convergent nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { convergent nobuiltin nounwind readonly }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.kernels = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization15, !3, !4, !5, !6, !7, !8}
+!3 = !{!"kernel_arg_addr_space", i32 1, i32 0}
+!4 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!5 = !{!"kernel_arg_type", !"int*", !"int"}
+!6 = !{!"kernel_arg_base_type", !"int*", !"int"}
+!7 = !{!"kernel_arg_type_qual", !"", !""}
+!8 = !{!"kernel_arg_name", !"out", !"n"}
+
+; CHECK: spir_kernel void @__vecz_v4_partial_linearization15
+; CHECK: br label %[[WHILEBODY:.+]]
+
+; CHECK: [[WHILEBODY]]:
+; CHECK: %[[CMP:.+]] = icmp
+; CHECK: br i1 %[[CMP]], label %[[FORCONDPREHEADER:.+]], label %[[FORCOND9PREHEADER:.+]]
+
+; CHECK: [[FORCOND9PREHEADER]]:
+; CHECK: br label %[[FORCOND9:.+]]
+
+; CHECK: [[FORCONDPREHEADER]]:
+; CHECK: br label %[[FORCOND:.+]]
+
+; CHECK: [[FORCOND]]:
+; CHECK: br i1 false, label %[[FORBODY:.+]], label %[[FOREND:.+]]
+
+; CHECK: [[FORBODY]]:
+; CHECK: br label %[[FORCOND]]
+
+; CHECK: [[FOREND]]:
+; CHECK: %[[CMP5:.+]] = icmp
+; CHECK: br i1 %[[CMP5]], label %[[F:.+]], label %[[IFEND17:.+]]
+
+; CHECK: [[FORCOND9]]:
+; CHECK: %[[CMP10:.+]] = icmp
+; CHECK: br i1 %[[CMP10]], label %[[FORBODY12:.+]], label %[[IFEND17LOOPEXIT:.+]]
+
+; CHECK: [[FORBODY12]]:
+; CHECK: br label %[[FORCOND9]]
+
+; CHECK: [[IFEND17LOOPEXIT]]:
+; CHECK: br label %[[IFEND17]]
+
+; CHECK: [[IFEND17]]:
+; CHECK: br i1 true, label %[[WHILEBODY20UNIFORM:.+]], label %[[WHILEBODY20:.+]]
+
+; CHECK: [[WHILEBODY20]]:
+; CHECK: %[[TOBOOL:.+]] = icmp
+; CHECK: br i1 %[[TOBOOL]], label %[[IFELSE26:.+]], label %[[IFTHEN21:.+]]
+
+; CHECK: [[IFTHEN21]]:
+; CHECK: br label %[[M:.+]]
+
+; CHECK: [[IFELSE26]]:
+; CHECK: br label %[[IFTHEN29:.+]]
+
+; CHECK: [[WHILEBODY20UNIFORM]]:
+; CHECK: %[[TOBOOLUNIFORM:.+]] = icmp
+; CHECK: br i1 %[[TOBOOLUNIFORM]], label %[[IFELSE26UNIFORM:.+]], label %[[IFTHEN21UNIFORM:.+]]
+
+; CHECK: [[IFTHEN21UNIFORM]]:
+; CHECK: %[[CMP22UNIFORM:.+]] = icmp
+; CHECK: %[[TOBOOLNEW36UNIFORM:.+]] = icmp
+; CHECK: %[[ORCONDUNIFORM:.+]] = and i1 %[[CMP22UNIFORM]], %[[TOBOOLNEW36UNIFORM]]
+; CHECK: br i1 %[[ORCONDUNIFORM]], label %[[MUNIFORM:.+]], label %[[L:.+]]
+
+; CHECK: [[IFELSE26UNIFORM]]:
+; CHECK: br i1 %{{.+}}, label %[[IFEND34UNIFORM:.+]], label %[[IFELSE26UNIFORMBOSCCINDIR:.+]]
+
+; CHECK: [[IFTHEN29UNIFORM:.+]]:
+; CHECK: br label %[[MUNIFORM:.+]]
+
+; CHECK: [[IFEND34UNIFORM]]:
+; CHECK: %[[TOBOOL36UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[TOBOOL36UNIFORM]], label %[[MUNIFORM]], label %[[L:.+]]
+
+; CHECK: [[IFELSE26UNIFORMBOSCCINDIR]]:
+; CHECK: br i1 %{{.+}}, label %[[IFTHEN29UNIFORM]], label %[[IFELSE26UNIFORMBOSCCSTORE:.+]]
+
+; CHECK: [[IFELSE26UNIFORMBOSCCSTORE]]:
+; CHECK: br label %[[IFTHEN29]]
+
+; CHECK: [[MUNIFORM]]:
+; CHECK: br label %[[WHILEBODY20UNIFORM]]
+
+; CHECK: [[IFTHEN29]]:
+; CHECK: br label %[[IFEND34:.+]]
+
+; CHECK: [[IFEND34]]:
+; CHECK: br label %[[M:.+]]
+
+; CHECK: [[M]]:
+; CHECK: br i1 %{{.+}}, label %[[WHILEBODY20]], label %[[WHILEBODY20PUREEXIT:.+]]
+
+; CHECK: [[WHILEBODY20PUREEXIT]]:
+; CHECK: br label %[[L]]
+
+; CHECK: [[L]]:
+; CHECK: %[[TOBOOL42:.+]] = icmp
+; CHECK: br i1 %[[TOBOOL42]], label %[[WHILEBODY]], label %[[IFTHEN43:.+]]
+
+; CHECK: [[IFTHEN43]]:
+; CHECK: br label %[[FORCOND47:.+]]
+
+; CHECK: [[FORCOND47]]:
+; CHECK: %[[CMP49:.+]] = icmp
+; CHECK: br i1 %[[CMP49]], label %[[FORBODY51:.+]], label %[[QLOOPEXIT2:.+]]
+
+; CHECK: [[FORBODY51]]:
+; CHECK: br label %[[FORCOND47]]
+
+; CHECK: [[F]]:
+; CHECK: br label %[[FORCOND59:.+]]
+
+; CHECK: [[FORCOND59]]:
+; CHECK: br i1 false, label %[[FORBODY63:.+]], label %[[QLOOPEXIT:.+]]
+
+; CHECK: [[FORBODY63]]:
+; CHECK: br label %[[FORCOND59]]
+
+; CHECK: [[QLOOPEXIT]]:
+; CHECK: br label %[[Q:.+]]
+
+; CHECK: [[QLOOPEXIT2]]:
+; CHECK: br label %[[Q]]
+
+; CHECK: [[Q]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization16.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization16.ll
new file mode 100644
index 0000000000000..02be40e3ae5ae
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization16.ll
@@ -0,0 +1,394 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k partial_linearization16 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+
+; The CFG of the following kernel is:
+;
+;       a
+;      / \
+;     b   c <-.
+;    /   / \  |
+;   |   d   e |
+;   |  / \ /  |
+;   | f   g --'
+;   |/    |
+;   h     i
+;    \   /
+;     \ /
+;      j
+;
+; * where nodes a, d and g are uniform branches, and node c is a varying
+;   branch.
+; * where nodes d, e, f, g, i and j are divergent.
+;
+; With BOSCC, it will be transformed as follows:
+;
+;         a
+;        / \
+;       b   c <-.  c' <.
+;      /   / \__|_ |   |
+;     /   d   e | `e'  |
+;    /   / \ /  |  |   |
+;   /   f   g --'  d'  |
+;  |   /    |      |   |
+;   \ h     i      g' -'
+;    \ \   /       |
+;     \ \ /        i'
+;      \ j         |
+;       \|         f'
+;        \         |
+;        /\       /
+;       |  \     /
+;       |   \   /
+;       |    \ /
+;       |     h'
+;       |     |
+;       |     j'
+;        \   /
+;         \ /
+;          &
+;
+; where '&' represents merge blocks of BOSCC regions.
+;
+; __kernel void partial_linearization16(__global int *out, int n) {
+;   int id = get_global_id(0);
+;   int ret = 0;
+;   int i = 0;
+;
+;   if (n < 5) {
+;     for (int i = 0; i < n + 10; i++) ret++;
+;     goto h;
+;   } else {
+;     while (1) {
+;       if (id + i % 2 == 0) {
+;         if (n > 2) {
+;           goto f;
+;         }
+;       } else {
+;         for (int i = 0; i < n + 10; i++) ret++;
+;       }
+;       if (n <= 2) break;
+;     }
+;   }
+;
+;   ret += n * 2;
+;   for (int i = 0; i < n * 2; i++) ret -= i;
+;   ret /= n;
+;   goto early;
+;
+; f:
+;   for (int i = 0; i < n + 5; i++) ret /= 2;
+;   ret -= n;
+;
+; h:
+;   for (int i = 0; i < n * 2; i++) ret -= i;
+;
+; early:
+;   out[id] = ret;
+; }
+
+; ModuleID = 'Unknown buffer'
+source_filename = "kernel.opencl"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @partial_linearization16(i32 addrspace(1)* %out, i32 %n) #0 { ; IR form of the OpenCL C kernel quoted in the header comment
+entry:                                            ; CFG node a
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %conv = trunc i64 %call to i32                  ; id = (int)get_global_id(0)
+  %cmp = icmp slt i32 %n, 5                       ; n < 5
+  br i1 %cmp, label %for.cond, label %while.body
+
+for.cond:                                         ; preds = %for.body, %entry -- node b: loop "i < n + 10", exits to h
+  %ret.0 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %storemerge4 = phi i32 [ %inc5, %for.body ], [ 0, %entry ]
+  %add = add nsw i32 %n, 10
+  %cmp3 = icmp slt i32 %storemerge4, %add
+  br i1 %cmp3, label %for.body, label %h
+
+for.body:                                         ; preds = %for.cond
+  %inc = add nuw nsw i32 %ret.0, 1                ; ret++
+  %inc5 = add nuw nsw i32 %storemerge4, 1         ; i++
+  br label %for.cond
+
+while.body:                                       ; preds = %if.end24, %entry -- node c: while(1) header
+  %ret.1 = phi i32 [ 0, %entry ], [ %ret.3, %if.end24 ]
+  %cmp7 = icmp eq i32 %conv, 0                    ; condition depends on %conv (the work-item id)
+  br i1 %cmp7, label %if.then9, label %for.cond15
+
+if.then9:                                         ; preds = %while.body -- node d
+  %cmp10 = icmp sgt i32 %n, 2                     ; n > 2 -> leave the loop towards label f
+  br i1 %cmp10, label %for.cond41, label %if.end24
+
+for.cond15:                                       ; preds = %for.body19, %while.body -- node e: loop "i < n + 10"
+  %ret.2 = phi i32 [ %inc20, %for.body19 ], [ %ret.1, %while.body ]
+  %storemerge = phi i32 [ %inc22, %for.body19 ], [ 0, %while.body ]
+  %add16 = add nsw i32 %n, 10
+  %cmp17 = icmp slt i32 %storemerge, %add16
+  br i1 %cmp17, label %for.body19, label %if.end24
+
+for.body19:                                       ; preds = %for.cond15
+  %inc20 = add nsw i32 %ret.2, 1                  ; ret++
+  %inc22 = add nuw nsw i32 %storemerge, 1         ; i++
+  br label %for.cond15
+
+if.end24:                                         ; preds = %for.cond15, %if.then9 -- node g: loop latch and exit test
+  %ret.3 = phi i32 [ %ret.1, %if.then9 ], [ %ret.2, %for.cond15 ]
+  %cmp25 = icmp slt i32 %n, 3                     ; n <= 2 expressed as n < 3 -> break
+  br i1 %cmp25, label %if.end29, label %while.body
+
+if.end29:                                         ; preds = %if.end24 -- node i
+  %mul = mul i32 %n, 2
+  %add30 = add nsw i32 %ret.3, %mul               ; ret += n * 2
+  br label %for.cond32
+
+for.cond32:                                       ; preds = %for.body36, %if.end29 -- loop "i < n * 2" inside node i
+  %ret.4 = phi i32 [ %add30, %if.end29 ], [ %sub, %for.body36 ]
+  %storemerge1 = phi i32 [ 0, %if.end29 ], [ %inc38, %for.body36 ]
+  %mul33 = shl nsw i32 %n, 1                      ; n * 2
+  %cmp34 = icmp slt i32 %storemerge1, %mul33
+  br i1 %cmp34, label %for.body36, label %for.end39
+
+for.body36:                                       ; preds = %for.cond32
+  %sub = sub nsw i32 %ret.4, %storemerge1         ; ret -= i
+  %inc38 = add nuw nsw i32 %storemerge1, 1        ; i++
+  br label %for.cond32
+
+for.end39:                                        ; preds = %for.cond32 -- ret /= n with a guarded divisor
+  %0 = icmp eq i32 %ret.4, -2147483648            ; ret == INT_MIN
+  %1 = icmp eq i32 %n, -1
+  %2 = and i1 %1, %0                              ; INT_MIN / -1 case
+  %3 = icmp eq i32 %n, 0                          ; division-by-zero case
+  %4 = or i1 %3, %2
+  %5 = select i1 %4, i32 1, i32 %n                ; use 1 as divisor in either of the two cases above
+  %div = sdiv i32 %ret.4, %5
+  br label %early
+
+for.cond41:                                       ; preds = %for.body45, %if.then9 -- node f: loop "i < n + 5"
+  %ret.5 = phi i32 [ %div46, %for.body45 ], [ %ret.1, %if.then9 ]
+  %storemerge2 = phi i32 [ %inc48, %for.body45 ], [ 0, %if.then9 ]
+  %add42 = add nsw i32 %n, 5
+  %cmp43 = icmp slt i32 %storemerge2, %add42
+  br i1 %cmp43, label %for.body45, label %for.end49
+
+for.body45:                                       ; preds = %for.cond41
+  %div46 = sdiv i32 %ret.5, 2                     ; ret /= 2
+  %inc48 = add nuw nsw i32 %storemerge2, 1        ; i++
+  br label %for.cond41
+
+for.end49:                                        ; preds = %for.cond41
+  %sub50 = sub nsw i32 %ret.5, %n                 ; ret -= n
+  br label %h
+
+h:                                                ; preds = %for.end49, %for.cond -- node h
+  %ret.6 = phi i32 [ %sub50, %for.end49 ], [ %ret.0, %for.cond ]
+  br label %for.cond52
+
+for.cond52:                                       ; preds = %for.body56, %h -- loop "i < n * 2" after label h
+  %ret.7 = phi i32 [ %ret.6, %h ], [ %sub57, %for.body56 ]
+  %storemerge3 = phi i32 [ 0, %h ], [ %inc59, %for.body56 ]
+  %mul53 = shl nsw i32 %n, 1                      ; n * 2
+  %cmp54 = icmp slt i32 %storemerge3, %mul53
+  br i1 %cmp54, label %for.body56, label %early
+
+for.body56:                                       ; preds = %for.cond52
+  %sub57 = sub nsw i32 %ret.7, %storemerge3       ; ret -= i
+  %inc59 = add nuw nsw i32 %storemerge3, 1        ; i++
+  br label %for.cond52
+
+early:                                            ; preds = %for.cond52, %for.end39 -- node j: single exit
+  %ret.8 = phi i32 [ %div, %for.end39 ], [ %ret.7, %for.cond52 ]
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %ret.8, i32 addrspace(1)* %arrayidx, align 4 ; out[id] = ret
+  ret void
+}
+
+; Function Attrs: convergent nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { convergent nobuiltin nounwind readonly }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.kernels = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization16, !3, !4, !5, !6, !7, !8}
+!3 = !{!"kernel_arg_addr_space", i32 1, i32 0}
+!4 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!5 = !{!"kernel_arg_type", !"int*", !"int"}
+!6 = !{!"kernel_arg_base_type", !"int*", !"int"}
+!7 = !{!"kernel_arg_type_qual", !"", !""}
+!8 = !{!"kernel_arg_name", !"out", !"n"}
+
+; CHECK: spir_kernel void @__vecz_v4_partial_linearization16
+; CHECK: %[[CMP:.+]] = icmp
+; CHECK: br i1 %[[CMP]], label %[[FORCONDPREHEADER:.+]], label %[[WHILEBODYPREHEADER:.+]]
+
+; CHECK: [[WHILEBODYPREHEADER]]:
+; CHECK: br i1 true, label %[[WHILEBODYUNIFORM:.+]], label %[[WHILEBODY:.+]]
+
+; CHECK: [[FORCONDPREHEADER]]:
+; CHECK: br label %[[FORCOND:.+]]
+
+; CHECK: [[FORCOND]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[HLOOPEXIT:.+]]
+
+; CHECK: [[FORBODY]]:
+; CHECK: br label %[[FORCOND]]
+
+; CHECK: [[WHILEBODY]]:
+; CHECK: br label %[[FORCOND15PREHEADER:.+]]
+
+; CHECK: [[WHILEBODYUNIFORM:.+]]:
+; CHECK: br i1 %{{.+}}, label %[[IFTHEN9UNIFORM:.+]], label %[[WHILEBODYUNIFORMBOSCCINDIR:.+]]
+
+; CHECK: [[FORCOND15PREHEADERUNIFORM:.+]]:
+; CHECK: br label %[[FORCOND15UNIFORM:.+]]
+
+; CHECK: [[FORCOND15UNIFORM]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY19UNIFORM:.+]], label %[[IFEND24LOOPEXITUNIFORM:.+]]
+
+; CHECK: [[FORBODY19UNIFORM]]:
+; CHECK: br label %[[FORCOND15UNIFORM]]
+
+; CHECK: [[IFEND24LOOPEXITUNIFORM]]:
+; CHECK: br label %[[IFEND24UNIFORM:.+]]
+
+; CHECK: [[IFTHEN9UNIFORM:.+]]:
+; CHECK: %[[CMP10UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMP10UNIFORM]], label %[[FORCOND41PREHEADERUNIFORM:.+]], label %[[IFEND24UNIFORM]]
+
+; CHECK: [[WHILEBODYUNIFORMBOSCCINDIR]]:
+; CHECK: br i1 %{{.+}}, label %[[FORCOND15PREHEADERUNIFORM]], label %[[WHILEBODYUNIFORMBOSCCSTORE:.+]]
+
+; CHECK: [[WHILEBODYUNIFORMBOSCCSTORE]]:
+; CHECK: br label %[[FORCOND15PREHEADER]]
+
+; CHECK: [[IFEND24UNIFORM]]:
+; CHECK: %[[CMP25UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMP25UNIFORM]], label %[[IFEND29UNIFORM:.+]], label %[[WHILEBODYUNIFORM]]
+
+; CHECK: [[IFEND29UNIFORM]]:
+; CHECK: br label %[[FORCOND32UNIFORM:.+]]
+
+; CHECK: [[FORCOND32UNIFORM]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY36UNIFORM:.+]], label %[[FOREND39UNIFORM:.+]]
+
+; CHECK: [[FORBODY36UNIFORM]]:
+; CHECK: br label %[[FORCOND32UNIFORM]]
+
+; CHECK: [[FOREND39UNIFORM]]:
+; CHECK: br label %[[EARLYUNIFORM:.+]]
+
+; CHECK: [[FORCOND41PREHEADERUNIFORM]]:
+; CHECK: br label %[[FORCOND41UNIFORM:.+]]
+
+; CHECK: [[FORCOND41UNIFORM]]:
+; CHECK: %[[CMP43UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMP43UNIFORM]], label %[[FORBODY45UNIFORM:.+]], label %[[FOREND49UNIFORM:.+]]
+
+; CHECK: [[FORBODY45UNIFORM]]:
+; CHECK: br label %[[FORCOND41UNIFORM]]
+
+; CHECK: [[FOREND49UNIFORM]]:
+; CHECK: br label %[[HUNIFORM:.+]]
+
+; CHECK: [[HUNIFORM]]:
+; CHECK: br label %[[FORCOND52UNIFORM:.+]]
+
+; CHECK: [[FORCOND52UNIFORM]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY56UNIFORM:.+]], label %[[EARLYLOOPEXITUNIFORM:.+]]
+
+; CHECK: [[FORBODY56UNIFORM]]:
+; CHECK: br label %[[FORCOND52UNIFORM]]
+
+; CHECK: [[EARLYLOOPEXITUNIFORM]]:
+; CHECK: br label %[[EARLY:.+]]
+
+; CHECK: [[FORCOND15PREHEADER]]:
+; CHECK: br label %[[FORCOND15:.+]]
+
+; CHECK: [[IFTHEN9:.+]]:
+; CHECK: br label %[[IFEND24:.+]]
+
+; CHECK: [[FORCOND41PREHEADER:.+]]:
+; CHECK: br label %[[FORCOND41:.+]]
+
+; CHECK: [[FORCOND15]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY19:.+]], label %[[IFEND24LOOPEXIT:.+]]
+
+; CHECK: [[FORBODY19]]:
+; CHECK: br label %[[FORCOND15]]
+
+; CHECK: [[IFEND24LOOPEXIT]]:
+; CHECK: br label %[[IFTHEN9]]
+
+; CHECK: [[IFEND24]]:
+; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]]
+
+; CHECK: [[WHILEBODYPUREEXIT]]:
+; CHECK: br label %[[IFEND29:.+]]
+
+; CHECK: [[IFEND29]]:
+; CHECK: br label %[[FORCOND32:.+]]
+
+; CHECK: [[IFEND29ELSE:.+]]:
+; CHECK: br label %[[FORCOND41PREHEADER]]
+
+; CHECK: [[FORCOND32]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY36:.+]], label %[[FOREND39:.+]]
+
+; CHECK: [[FORBODY36]]:
+; CHECK: br label %[[FORCOND32]]
+
+; CHECK: [[FOREND39]]:
+; CHECK: br label %[[IFEND29ELSE]]
+
+; CHECK: [[FORCOND41]]:
+; CHECK: %[[CMP43:.+]] = icmp
+; CHECK: br i1 %[[CMP43]], label %[[FORBODY45:.+]], label %[[FOREND49:.+]]
+
+; CHECK: [[FORBODY45]]:
+; CHECK: br label %[[FORCOND41]]
+
+; CHECK: [[FOREND49]]:
+; CHECK: br label %[[H:.+]]
+
+; CHECK: [[HLOOPEXIT]]:
+; CHECK: br label %[[H]]
+
+; CHECK: [[H]]:
+; CHECK: br label %[[FORCOND52:.+]]
+
+; CHECK: [[FORCOND52]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY56:.+]], label %[[EARLYLOOPEXIT:.+]]
+
+; CHECK: [[FORBODY56]]:
+; CHECK: br label %[[FORCOND52]]
+
+; CHECK: [[EARLYLOOPEXIT]]:
+; CHECK: br label %[[EARLY]]
+
+; CHECK: [[EARLY]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization17.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization17.ll
new file mode 100644
index 0000000000000..e3d20bdef7686
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization17.ll
@@ -0,0 +1,468 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k partial_linearization17 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+
+; The CFG of the following kernel is:
+;
+;              a
+;              |
+;              b <----.
+;             / \     |
+;            c   d    |
+;           /   / \   |
+;          e   f   g -'
+;         / \  |   |
+;   .--> h   | i   j
+;   |   / \  |  \ /
+;   '- k   l '-> m
+;      |    \   /
+;      n     \ /
+;       \     o
+;        \   /
+;         \ /
+;          p
+;
+; * where nodes b, d, and h are uniform branches, and nodes e and g are varying
+;   branches.
+; * where nodes h, j, m, o, and p are divergent.
+;
+; With BOSCC, it will be transformed as follows:
+;
+;              a
+;              |
+;              b <----. .-----------> b' <----.
+;             / \     | |            / \      |
+;            c   d    | |           c'  d'    |
+;           /   / \   | |          /   / \    |
+;          e   f   g -' |         e'  f'  g' -'
+;         / \__|___|\___' _____  /    |   |
+;   .--> h   | i   j\____/ .-->`h'    i'  |
+;   |   / \  |  \ /        |   / \    |   |
+;   '- k   l '-> m         '- k'  l'  |   |
+;      |    \   /              \   \  |  /
+;      n     \ /                n'  \ | /
+;       \     o                  \   \|/
+;        \   /                    `-> j'
+;         \ /                         |
+;          p                          m'
+;          |                          |
+;          |                          o'
+;          |                          |
+;           `----------> & <--------- p'
+;
+; where '&' represents merge blocks of BOSCC regions.
+;
+; __kernel void partial_linearization17(__global int *out, int n) {
+;   int id = get_global_id(0);
+;   int ret = 0;
+;   int i = 0;
+;
+;   while (1) {
+;     if (n > 10) {
+;       goto c;
+;     } else if (n > 5) {
+;       goto f;
+;     }
+;     if (id + i++ % 2 == 0) {
+;       break;
+;     }
+;   }
+;
+;   // j
+;   for (int i = 0; i < n + 10; i++) ret++;
+;   goto m;
+;
+; f:
+;   ret += n * 2;
+;   for (int i = 0; i < n * 2; i++) ret += i;
+;   goto m;
+;
+; c:
+;   for (int i = 0; i < n + 5; i++) ret += 2;
+;   // e
+;   if (id % 2 == 0) {
+;     goto h;
+;   } else {
+;     goto m;
+;   }
+;
+; m:
+;   ret <<= 2;
+;   goto o;
+;
+; h:
+;   for (int i = 0; i < n * 2; i++) {
+;     if (n > 5) {
+;       goto l;
+;     }
+;   }
+;   // n
+;   ret += id << 3;
+;   goto p;
+;
+; l:
+;   ret += id << 3;
+;
+; o:
+;   for (int i = 0; i < n * 2; i++) ret += i;
+;
+; p:
+;   out[id] = ret;
+; }
+
+; ModuleID = 'Unknown buffer'
+source_filename = "kernel.opencl"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @partial_linearization17(i32 addrspace(1)* %out, i32 %n) #0 {  ; test input: IR for the OpenCL C kernel listed in the comment above
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2  ; get_global_id(0)
+  %conv = trunc i64 %call to i32  ; `int id`: the 64-bit id truncated to i32
+  br label %while.body
+
+while.body:                                       ; preds = %if.end5, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %if.end5 ]  ; while-loop counter `i`
+  %cmp = icmp sgt i32 %n, 10
+  br i1 %cmp, label %for.cond28, label %if.else  ; n > 10 -> `goto c`
+
+if.else:                                          ; preds = %while.body
+  %cmp2 = icmp sgt i32 %n, 5
+  br i1 %cmp2, label %f, label %if.end5  ; n > 5 -> `goto f`
+
+if.end5:                                          ; preds = %if.else
+  %inc = add nuw nsw i32 %i.0, 1  ; i++
+  %rem = and i32 %i.0, 1  ; low bit of the pre-increment i, standing in for `i++ % 2`
+  %add = sub nsw i32 0, %rem  ; -(i % 2)
+  %cmp6 = icmp eq i32 %conv, %add  ; id == -(i % 2), i.e. `id + i++ % 2 == 0`
+  br i1 %cmp6, label %for.cond, label %while.body  ; true -> `break`, false -> next while iteration
+
+for.cond:                                         ; preds = %for.body, %if.end5
+  %ret.0 = phi i32 [ %inc14, %for.body ], [ 0, %if.end5 ]  ; `ret` inside the loop marked `// j`
+  %storemerge = phi i32 [ %inc15, %for.body ], [ 0, %if.end5 ]  ; counter of that loop
+  %add11 = add nsw i32 %n, 10
+  %cmp12 = icmp slt i32 %storemerge, %add11  ; i < n + 10
+  br i1 %cmp12, label %for.body, label %m  ; loop exit -> `goto m`
+
+for.body:                                         ; preds = %for.cond
+  %inc14 = add nuw nsw i32 %ret.0, 1  ; ret++
+  %inc15 = add nuw nsw i32 %storemerge, 1
+  br label %for.cond
+
+f:                                                ; preds = %if.else
+  %mul = shl i32 %n, 1  ; n * 2; seeds %ret.1 below (`ret += n * 2`)
+  br label %for.cond18
+
+for.cond18:                                       ; preds = %for.body22, %f
+  %ret.1 = phi i32 [ %mul, %f ], [ %add23, %for.body22 ]
+  %storemerge3 = phi i32 [ 0, %f ], [ %inc25, %for.body22 ]
+  %mul19 = shl nsw i32 %n, 1
+  %cmp20 = icmp slt i32 %storemerge3, %mul19  ; i < n * 2
+  br i1 %cmp20, label %for.body22, label %m  ; loop exit -> `goto m`
+
+for.body22:                                       ; preds = %for.cond18
+  %add23 = add nsw i32 %storemerge3, %ret.1  ; ret += i
+  %inc25 = add nuw nsw i32 %storemerge3, 1
+  br label %for.cond18
+
+for.cond28:                                       ; preds = %for.body32, %while.body
+  %ret.2 = phi i32 [ %add33, %for.body32 ], [ 0, %while.body ]  ; `ret` inside the loop at label `c:`
+  %storemerge4 = phi i32 [ %inc35, %for.body32 ], [ 0, %while.body ]
+  %add29 = add nsw i32 %n, 5
+  %cmp30 = icmp slt i32 %storemerge4, %add29  ; i < n + 5
+  br i1 %cmp30, label %for.body32, label %for.end36
+
+for.body32:                                       ; preds = %for.cond28
+  %add33 = add nuw nsw i32 %ret.2, 2  ; ret += 2
+  %inc35 = add nuw nsw i32 %storemerge4, 1
+  br label %for.cond28
+
+for.end36:                                        ; preds = %for.cond28
+  %rem375 = and i32 %conv, 1  ; low bit of id
+  %cmp38 = icmp eq i32 %rem375, 0  ; `id % 2 == 0` (code marked `// e`)
+  br i1 %cmp38, label %for.cond43, label %m  ; true -> `goto h`, false -> `goto m`
+
+m:                                                ; preds = %for.end36, %for.cond18, %for.cond
+  %ret.3 = phi i32 [ %ret.0, %for.cond ], [ %ret.1, %for.cond18 ], [ %ret.2, %for.end36 ]
+  %shl = shl i32 %ret.3, 2  ; ret <<= 2
+  br label %o  ; `goto o`
+
+for.cond43:                                       ; preds = %for.inc52, %for.end36
+  %storemerge6 = phi i32 [ %inc53, %for.inc52 ], [ 0, %for.end36 ]  ; counter of the loop at label `h:`
+  %mul44 = shl nsw i32 %n, 1
+  %cmp45 = icmp slt i32 %storemerge6, %mul44  ; i < n * 2
+  br i1 %cmp45, label %for.body47, label %for.end54
+
+for.body47:                                       ; preds = %for.cond43
+  %cmp48 = icmp sgt i32 %n, 5
+  br i1 %cmp48, label %l, label %for.inc52  ; n > 5 -> `goto l`
+
+for.inc52:                                        ; preds = %for.body47
+  %inc53 = add nuw nsw i32 %storemerge6, 1
+  br label %for.cond43
+
+for.end54:                                        ; preds = %for.cond43
+  %shl55 = mul i32 %conv, 8  ; `id << 3` written as a multiply by 8
+  %add56 = add nsw i32 %ret.2, %shl55  ; ret += id << 3 (code marked `// n`)
+  br label %p  ; `goto p`
+
+l:                                                ; preds = %for.body47
+  %shl57 = mul i32 %conv, 8  ; `id << 3` written as a multiply by 8
+  %add58 = add nsw i32 %ret.2, %shl57  ; ret += id << 3
+  br label %o  ; falls through to label `o:`
+
+o:                                                ; preds = %l, %m
+  %storemerge1 = phi i32 [ %shl, %m ], [ %add58, %l ]  ; `ret` on entry to `o:`
+  br label %for.cond60
+
+for.cond60:                                       ; preds = %for.body64, %o
+  %ret.4 = phi i32 [ %storemerge1, %o ], [ %add65, %for.body64 ]
+  %storemerge2 = phi i32 [ 0, %o ], [ %inc67, %for.body64 ]
+  %mul61 = shl nsw i32 %n, 1
+  %cmp62 = icmp slt i32 %storemerge2, %mul61  ; i < n * 2
+  br i1 %cmp62, label %for.body64, label %p
+
+for.body64:                                       ; preds = %for.cond60
+  %add65 = add nsw i32 %storemerge2, %ret.4  ; ret += i
+  %inc67 = add nuw nsw i32 %storemerge2, 1
+  br label %for.cond60
+
+p:                                                ; preds = %for.cond60, %for.end54
+  %ret.5 = phi i32 [ %add56, %for.end54 ], [ %ret.4, %for.cond60 ]  ; final `ret`
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %ret.5, i32 addrspace(1)* %arrayidx, align 4  ; out[id] = ret
+  ret void
+}
+
+; Function Attrs: convergent nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { convergent nobuiltin nounwind readonly }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.kernels = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization17, !3, !4, !5, !6, !7, !8}
+!3 = !{!"kernel_arg_addr_space", i32 1, i32 0}
+!4 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!5 = !{!"kernel_arg_type", !"int*", !"int"}
+!6 = !{!"kernel_arg_base_type", !"int*", !"int"}
+!7 = !{!"kernel_arg_type_qual", !"", !""}
+!8 = !{!"kernel_arg_name", !"out", !"n"}
+
+; CHECK: spir_kernel void @__vecz_v4_partial_linearization17
+; CHECK: br i1 true, label %[[WHILEBODYUNIFORM:.+]], label %[[WHILEBODY:.+]]
+
+; CHECK: [[WHILEBODY]]:
+; CHECK: %[[CMP:.+]] = icmp
+; CHECK: br i1 %[[CMP]], label %[[FORCOND28PREHEADER:.+]], label %[[IFELSE:.+]]
+
+; CHECK: [[FORCOND28PREHEADER]]:
+; CHECK: br label %[[WHILEBODYPUREEXIT:.+]]
+
+; CHECK: [[FORCOND28PREHEADERELSE:.+]]:
+; CHECK: br label %[[M:.+]]
+
+; CHECK: [[FORCOND28PREHEADERSPLIT:.+]]:
+; CHECK: br label %[[FORCOND28:.+]]
+
+; CHECK: [[IFELSE]]:
+; CHECK: %[[CMP2:.+]] = icmp
+; CHECK: br i1 %[[CMP2]], label %[[F:.+]], label %[[IFEND5:.+]]
+
+; CHECK: [[IFEND5]]:
+; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT]]
+
+; CHECK: [[WHILEBODYPUREEXIT]]:
+; CHECK: br label %[[FORCONDPREHEADER:.+]]
+
+; CHECK: [[WHILEBODYUNIFORM]]:
+; CHECK: %[[CMPUNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMPUNIFORM]], label %[[FORCOND28PREHEADERUNIFORM:.+]], label %[[IFELSEUNIFORM:.+]]
+
+; CHECK: [[IFELSEUNIFORM]]:
+; CHECK: %[[CMP2UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMP2UNIFORM]], label %[[FUNIFORM:.+]], label %[[IFEND5UNIFORM:.+]]
+
+; CHECK: [[IFEND5UNIFORM]]:
+; CHECK: br i1 %{{.+}}, label %[[FORCONDPREHEADERUNIFORM:.+]], label %[[IFEND5UNIFORMBOSCCINDIR:.+]]
+
+; CHECK: [[FORCONDPREHEADERUNIFORM]]:
+; CHECK: br label %[[FORCONDUNIFORM:.+]]
+
+; CHECK: [[IFEND5UNIFORMBOSCCINDIR]]:
+; CHECK: br i1 %{{.+}}, label %[[WHILEBODYUNIFORM]], label %[[IFEND5UNIFORMBOSCCSTORE:.+]]
+
+; CHECK: [[IFEND5UNIFORMBOSCCSTORE]]:
+; CHECK: br label %[[WHILEBODY]]
+
+; CHECK: [[FORCONDUNIFORM]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODYUNIFORM:.+]], label %[[MLOOPEXIT1UNIFORM:.+]]
+
+; CHECK: [[FORBODYUNIFORM]]:
+; CHECK: br label %[[FORCONDUNIFORM]]
+
+; CHECK: [[MLOOPEXIT1UNIFORM]]:
+; CHECK: br label %[[MUNIFORM:.+]]
+
+; CHECK: [[FUNIFORM]]:
+; CHECK: br label %[[FORCOND18UNIFORM:.+]]
+
+; CHECK: [[FORCOND18UNIFORM]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY22UNIFORM:.+]], label %[[MLOOPEXITUNIFORM:.+]]
+
+; CHECK: [[FORBODY22UNIFORM]]:
+; CHECK: br label %[[FORCOND18UNIFORM]]
+
+; CHECK: [[MLOOPEXITUNIFORM]]:
+; CHECK: br label %[[MUNIFORM]]
+
+; CHECK: [[FORCOND28PREHEADERUNIFORM]]:
+; CHECK: br label %[[FORCOND28UNIFORM:.+]]
+
+; CHECK: [[FORCOND28UNIFORM]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY32UNIFORM:.+]], label %[[FOREND36UNIFORM:.+]]
+
+; CHECK: [[FORBODY32UNIFORM]]:
+; CHECK: br label %[[FORCOND28UNIFORM]]
+
+; CHECK: [[FOREND36UNIFORM]]:
+; CHECK: br i1 %{{.+}}, label %[[FORCOND43PREHEADERUNIFORM:.+]], label %[[FOREND36UNIFORMBOSCCINDIR:.+]]
+
+; CHECK: [[FORCOND43PREHEADERUNIFORM]]:
+; CHECK: %[[CMP18UNIFORM:.+]] = icmp
+; CHECK: br label %[[FORCOND43UNIFORM:.+]]
+
+; CHECK: [[FOREND36UNIFORMBOSCCINDIR]]:
+; CHECK: br i1 %{{.+}}, label %[[MUNIFORM]], label %[[FORCOND43PREHEADER:.+]]
+
+; CHECK: [[FORCOND43UNIFORM]]:
+; CHECK: br i1 %[[CMP18UNIFORM]], label %[[FORBODY47UNIFORM:.+]], label %[[FOREND54UNIFORM:.+]]
+
+; CHECK: [[FORBODY47UNIFORM]]:
+; CHECK: br i1 true, label %[[LUNIFORM:.+]], label %[[FORINC52UNIFORM:.+]]
+
+; CHECK: [[FORINC52UNIFORM]]:
+; CHECK: br label %[[FORCOND43UNIFORM]]
+
+; CHECK: [[FOREND54UNIFORM]]:
+; CHECK: br label %[[PUNIFORM:.+]]
+
+; CHECK: [[LUNIFORM]]:
+; CHECK: br label %[[OUNIFORM:.+]]
+
+; CHECK: [[MUNIFORM]]:
+; CHECK: br label %[[OUNIFORM]]
+
+; CHECK: [[OUNIFORM]]:
+; CHECK: br label %[[FORCOND60UNIFORM:.+]]
+
+; CHECK: [[FORCOND60UNIFORM]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY64UNIFORM:.+]], label %[[PLOOPEXITUNIFORM:.+]]
+
+; CHECK: [[FORBODY64UNIFORM]]:
+; CHECK: br label %[[FORCOND60UNIFORM]]
+
+; CHECK: [[PLOOPEXITUNIFORM]]:
+; CHECK: br label %[[P:.+]]
+
+; CHECK: [[FORCONDPREHEADER]]:
+; CHECK: br label %[[FORCOND:.+]]
+
+; CHECK: [[FORCONDPREHEADERELSE:.+]]:
+; CHECK: br i1 %{{.+}}, label %[[FELSE:.+]], label %[[FSPLIT:.+]]
+
+; CHECK: [[FORCOND]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[MLOOPEXIT2:.+]]
+
+; CHECK: [[FORBODY]]:
+; CHECK: br label %[[FORCOND]]
+
+; CHECK: [[F]]:
+; CHECK: br label %[[WHILEBODYPUREEXIT]]
+
+; CHECK: [[FELSE]]:
+; CHECK: br i1 %{{.+}}, label %[[FORCOND28PREHEADERELSE]], label %[[FORCOND28PREHEADERSPLIT]]
+
+; CHECK: [[FSPLIT]]:
+; CHECK: br label %[[FORCOND18:.+]]
+
+; CHECK: [[FORCOND18]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY22:.+]], label %[[MLOOPEXIT:.+]]
+
+; CHECK: [[FORBODY22]]:
+; CHECK: br label %[[FORCOND18]]
+
+; CHECK: [[FORCOND28]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY32:.+]], label %[[FOREND36:.+]]
+
+; CHECK: [[FORBODY32]]:
+; CHECK: br label %[[FORCOND28]]
+
+; CHECK: [[FOREND36]]:
+; CHECK: br label %[[FORCOND43PREHEADER]]
+
+; CHECK: [[FORCOND43PREHEADER]]:
+; CHECK: %[[CMP14:.+]] = icmp
+; CHECK: br label %[[FORCOND43:.+]]
+
+; CHECK: [[MLOOPEXIT]]:
+; CHECK: br label %[[M]]
+
+; CHECK: [[MLOOPEXIT2]]:
+; CHECK: br label %[[FORCONDPREHEADERELSE]]
+
+; CHECK: [[M]]:
+; CHECK: br label %[[O:.+]]
+
+; CHECK: [[FORCOND43]]:
+; CHECK: br i1 %[[CMP14]], label %[[FORBODY47:.+]], label %[[FOREND54:.+]]
+
+; CHECK: [[FORBODY47]]:
+; CHECK: br i1 true, label %[[L:.+]], label %[[FORINC52:.+]]
+
+; CHECK: [[FORINC52]]:
+; CHECK: br label %[[FORCOND43]]
+
+; CHECK: [[FOREND54]]:
+; CHECK: br label %[[M]]
+
+; CHECK: [[L]]:
+; CHECK: br label %[[M]]
+
+; CHECK: [[O]]:
+; CHECK: br label %[[FORCOND60:.+]]
+
+; CHECK: [[FORCOND60]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY64:.+]], label %[[PLOOPEXIT:.+]]
+
+; CHECK: [[FORBODY64]]:
+; CHECK: br label %[[FORCOND60]]
+
+; CHECK: [[PLOOPEXIT]]:
+; CHECK: br label %[[P]]
+
+; CHECK: [[P]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization18.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization18.ll
new file mode 100644
index 0000000000000..1a3f9de02d562
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization18.ll
@@ -0,0 +1,357 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k partial_linearization18 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+
+; The CFG of the following kernel is:
+;
+;       a
+;       |
+;       b <--.
+;      / \   |
+;     c   d -'
+;    / \  |
+;   e   f |
+;   |    \|
+;   |     g
+;   |    /
+;   |   h
+;    \ / \
+;     i   j
+;      \ /
+;       k
+;
+; * where nodes b and h are uniform branches, and nodes c and d are varying
+;   branches.
+; * where nodes e, f, g, i and k are divergent.
+;
+; With BOSCC, it will be transformed as follows:
+;
+;       a
+;       |
+;       b <--. .-> b' <--.
+;      / \   | |  / \    |
+;     c   d -' | c'  d' -'
+;    / \__|\___' |   |
+;   e   f |`---> f'  |
+;   |    \|      |   |
+;   |     g      e'  |
+;   |    /        \ /
+;   |   h          g'
+;    \ / \         |
+;     i   j        h'
+;      \ /        / \
+;       k        |   j'
+;       |         \ /
+;       |          i'
+;       |          |
+;       `--> & <-- k'
+;
+; where '&' represents merge blocks of BOSCC regions.
+;
+; __kernel void partial_linearization18(__global int *out, int n) {
+;   int id = get_global_id(0);
+;   int ret = 0;
+;   int i = 0;
+;
+;   while (1) {
+;     if (n > 5) {
+;       if (id + i % 2 == 0) {
+;         goto e;
+;       } else {
+;         goto f;
+;       }
+;     }
+;     if (++i + id > 3) {
+;       goto g;
+;     }
+;   }
+;
+; f:
+;   for (int i = 0; i < n + 5; i++) ret += 2;
+;   goto g;
+;
+; g:
+;   for (int i = 1; i < n * 2; i++) ret *= i;
+;   goto h;
+;
+; e:
+;   for (int i = 0; i < n + 5; i++) ret++;
+;   goto i;
+;
+; h:
+;   if (n > 3) {
+; i:
+;     ret++;
+;   } else {
+;     ret *= 3;
+;   }
+;
+;   out[id] = ret;
+; }
+
+; ModuleID = 'Unknown buffer'
+source_filename = "kernel.opencl"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @partial_linearization18(i32 addrspace(1)* %out, i32 %n) #0 {  ; test input: IR for the OpenCL C kernel listed in the comment above
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2  ; get_global_id(0)
+  %conv = trunc i64 %call to i32  ; `int id`: the 64-bit id truncated to i32
+  br label %while.body
+
+while.body:                                       ; preds = %if.end, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %if.end ]  ; while-loop counter `i`
+  %cmp = icmp sgt i32 %n, 5
+  br i1 %cmp, label %if.then, label %if.end  ; node b: n > 5 -> c (if.then), else d (if.end)
+
+if.then:                                          ; preds = %while.body
+  %rem = and i32 %i.0, 1  ; low bit of i, standing in for `i % 2`
+  %add = sub nsw i32 0, %rem  ; -(i % 2)
+  %cmp2 = icmp eq i32 %conv, %add  ; id == -(i % 2), i.e. `id + i % 2 == 0`
+  br i1 %cmp2, label %for.cond26, label %for.cond  ; node c: true -> `goto e`, false -> `goto f`
+
+if.end:                                           ; preds = %while.body
+  %inc = add nuw nsw i32 %i.0, 1  ; ++i
+  %add5 = add nsw i32 %inc, %conv  ; ++i + id
+  %cmp6 = icmp sgt i32 %add5, 3
+  br i1 %cmp6, label %g, label %while.body  ; node d: true -> `goto g`, false -> back to b
+
+for.cond:                                         ; preds = %for.body, %if.then
+  %ret.0 = phi i32 [ %add14, %for.body ], [ 0, %if.then ]  ; `ret` inside the loop at label `f:`
+  %storemerge2 = phi i32 [ %inc15, %for.body ], [ 0, %if.then ]
+  %add11 = add nsw i32 %n, 5
+  %cmp12 = icmp slt i32 %storemerge2, %add11  ; i < n + 5
+  br i1 %cmp12, label %for.body, label %g  ; loop exit -> `goto g`
+
+for.body:                                         ; preds = %for.cond
+  %add14 = add nuw nsw i32 %ret.0, 2  ; ret += 2
+  %inc15 = add nuw nsw i32 %storemerge2, 1
+  br label %for.cond
+
+g:                                                ; preds = %for.cond, %if.end
+  %ret.1 = phi i32 [ 0, %if.end ], [ %ret.0, %for.cond ]  ; `ret` on entry to `g:`
+  br label %for.cond17
+
+for.cond17:                                       ; preds = %for.body20, %g
+  %ret.2 = phi i32 [ %ret.1, %g ], [ %mul21, %for.body20 ]
+  %storemerge = phi i32 [ 1, %g ], [ %inc23, %for.body20 ]  ; loop counter, starts at 1 (`int i = 1`)
+  %mul = shl nsw i32 %n, 1
+  %cmp18 = icmp slt i32 %storemerge, %mul  ; i < n * 2
+  br i1 %cmp18, label %for.body20, label %h  ; loop exit -> `goto h`
+
+for.body20:                                       ; preds = %for.cond17
+  %mul21 = mul nsw i32 %storemerge, %ret.2  ; ret *= i
+  %inc23 = add nuw nsw i32 %storemerge, 1
+  br label %for.cond17
+
+for.cond26:                                       ; preds = %for.body30, %if.then
+  %ret.3 = phi i32 [ %inc31, %for.body30 ], [ 0, %if.then ]  ; `ret` inside the loop at label `e:`
+  %storemerge3 = phi i32 [ %inc33, %for.body30 ], [ 0, %if.then ]
+  %add27 = add nsw i32 %n, 5
+  %cmp28 = icmp slt i32 %storemerge3, %add27  ; i < n + 5
+  br i1 %cmp28, label %for.body30, label %i38  ; loop exit -> `goto i`
+
+for.body30:                                       ; preds = %for.cond26
+  %inc31 = add nuw nsw i32 %ret.3, 1  ; ret++
+  %inc33 = add nuw nsw i32 %storemerge3, 1
+  br label %for.cond26
+
+h:                                                ; preds = %for.cond17
+  %cmp35 = icmp sgt i32 %n, 3
+  br i1 %cmp35, label %i38, label %if.else40  ; node h: n > 3 -> i (i38), else j (if.else40)
+
+i38:                                              ; preds = %h, %for.cond26
+  %ret.4 = phi i32 [ %ret.3, %for.cond26 ], [ %ret.2, %h ]
+  %inc39 = add nsw i32 %ret.4, 1  ; ret++
+  br label %if.end42
+
+if.else40:                                        ; preds = %h
+  %mul41 = mul nsw i32 %ret.2, 3  ; ret *= 3
+  br label %if.end42
+
+if.end42:                                         ; preds = %if.else40, %i38
+  %storemerge1 = phi i32 [ %mul41, %if.else40 ], [ %inc39, %i38 ]  ; final `ret`
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %storemerge1, i32 addrspace(1)* %arrayidx, align 4  ; out[id] = ret
+  ret void
+}
+
+; Function Attrs: convergent nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { convergent nobuiltin nounwind readonly }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.kernels = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization18, !3, !4, !5, !6, !7, !8}
+!3 = !{!"kernel_arg_addr_space", i32 1, i32 0}
+!4 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!5 = !{!"kernel_arg_type", !"int*", !"int"}
+!6 = !{!"kernel_arg_base_type", !"int*", !"int"}
+!7 = !{!"kernel_arg_type_qual", !"", !""}
+!8 = !{!"kernel_arg_name", !"out", !"n"}
+
+; CHECK: spir_kernel void @__vecz_v4_partial_linearization18
+; CHECK: br i1 true, label %[[WHILEBODYUNIFORM:.+]], label %[[WHILEBODY:.+]]
+
+; CHECK: [[WHILEBODY]]:
+; CHECK: %[[CMP:.+]] = icmp
+; CHECK: br i1 %[[CMP]], label %[[IFTHEN:.+]], label %[[IFEND:.+]]
+
+; CHECK: [[IFTHEN]]:
+; CHECK: br label %[[WHILEBODYPUREEXIT:.+]]
+
+; CHECK: [[IFTHENELSE:.+]]:
+; CHECK: br label %[[G:.+]]
+
+; CHECK: [[IFTHENSPLIT:.+]]:
+; CHECK: br label %[[FORCONDPREHEADER:.+]]
+
+; CHECK: [[FORCONDPREHEADER]]:
+; CHECK: br label %[[FORCOND:.+]]
+
+; CHECK: [[FORCOND26PREHEADER:.+]]:
+; CHECK: br label %[[FORCOND26:.+]]
+
+; CHECK: [[IFEND]]:
+; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT]]
+
+; CHECK: [[WHILEBODYPUREEXIT]]:
+; CHECK: br label %[[GLOOPEXIT2:.+]]
+
+; CHECK: [[WHILEBODYUNIFORM]]:
+; CHECK: %[[CMPUNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMPUNIFORM]], label %[[IFTHENUNIFORM:.+]], label %[[IFENDUNIFORM:.+]]
+
+; CHECK: [[IFENDUNIFORM]]:
+; CHECK: br i1 %{{.+}}, label %[[GLOOPEXIT1UNIFORM:.+]], label %[[IFENDUNIFORMBOSCCINDIR:.+]]
+
+; CHECK: [[GLOOPEXIT1UNIFORM]]:
+; CHECK: br label %[[GUNIFORM:.+]]
+
+; CHECK: [[IFENDUNIFORMBOSCCINDIR]]:
+; CHECK: br i1 %{{.+}}, label %[[WHILEBODYUNIFORM]], label %[[IFENDUNIFORMBOSCCSTORE:.+]]
+
+; CHECK: [[IFENDUNIFORMBOSCCSTORE]]:
+; CHECK: br label %[[WHILEBODY]]
+
+; CHECK: [[IFTHENUNIFORM]]:
+; CHECK: br i1 %{{.+}}, label %[[FORCOND26PREHEADERUNIFORM:.+]], label %[[IFTHENUNIFORMBOSCCINDIR:.+]]
+
+; CHECK: [[FORCONDPREHEADERUNIFORM:.+]]:
+; CHECK: br label %[[FORCONDUNIFORM:.+]]
+
+; CHECK: [[FORCONDUNIFORM]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODYUNIFORM:.+]], label %[[GLOOPEXITUNIFORM:.+]]
+
+; CHECK: [[FORBODYUNIFORM]]:
+; CHECK: br label %[[FORCONDUNIFORM]]
+
+; CHECK: [[GLOOPEXITUNIFORM]]:
+; CHECK: br label %[[GUNIFORM]]
+
+; CHECK: [[FORCOND26PREHEADERUNIFORM]]:
+; CHECK: br label %[[FORCOND26UNIFORM:.+]]
+
+; CHECK: [[IFTHENUNIFORMBOSCCINDIR]]:
+; CHECK: br i1 %{{.+}}, label %[[FORCONDPREHEADERUNIFORM]], label %[[WHILEBODYPUREEXIT]]
+
+; CHECK: [[FORCOND26UNIFORM]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY30UNIFORM:.+]], label %[[I38LOOPEXITUNIFORM:.+]]
+
+; CHECK: [[FORBODY30UNIFORM]]:
+; CHECK: br label %[[FORCOND26UNIFORM]]
+
+; CHECK: [[I38LOOPEXITUNIFORM]]:
+; CHECK: br label %[[I38UNIFORM:.+]]
+
+; CHECK: [[GUNIFORM]]:
+; CHECK: br label %[[FORCOND17UNIFORM:.+]]
+
+; CHECK: [[FORCOND17UNIFORM]]:
+; CHECK: %[[CMP18UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMP18UNIFORM]], label %[[FORBODY20UNIFORM:.+]], label %[[HUNIFORM:.+]]
+
+; CHECK: [[FORBODY20UNIFORM]]:
+; CHECK: br label %[[FORCOND17UNIFORM]]
+
+; CHECK: [[HUNIFORM]]:
+; CHECK: %[[CMP35UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMP35UNIFORM]], label %[[I38UNIFORM]], label %[[IFELSE40UNIFORM:.+]]
+
+; CHECK: [[IFELSE40UNIFORM]]:
+; CHECK: br label %[[IFEND42UNIFORM:.+]]
+
+; CHECK: [[I38UNIFORM]]:
+; CHECK: br label %[[IFEND42:.+]]
+
+; CHECK: [[FORCOND]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[GLOOPEXIT:.+]]
+
+; CHECK: [[FORBODY]]:
+; CHECK: br label %[[FORCOND]]
+
+; CHECK: [[GLOOPEXIT]]:
+; CHECK: br label %[[FORCOND26PREHEADER]]
+
+; CHECK: [[GLOOPEXIT2]]:
+; CHECK: br label %[[GLOOPEXIT2ELSE:.+]]
+
+; CHECK: [[GLOOPEXIT2ELSE]]:
+; CHECK: br i1 %{{.+}}, label %[[IFTHENELSE]], label %[[IFTHENSPLIT]]
+
+; CHECK: [[G]]:
+; CHECK: br label %[[FORCOND17:.+]]
+
+; CHECK: [[FORCOND17]]:
+; CHECK: %[[CMP18:.+]] = icmp
+; CHECK: br i1 %[[CMP18]], label %[[FORBODY20:.+]], label %[[H:.+]]
+
+; CHECK: [[FORBODY20]]:
+; CHECK: br label %[[FORCOND17]]
+
+; CHECK: [[FORCOND26]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY30:.+]], label %[[I38LOOPEXIT:.+]]
+
+; CHECK: [[FORBODY30]]:
+; CHECK: br label %[[FORCOND26]]
+
+; CHECK: [[H]]:
+; CHECK: %[[CMP35:.+]] = icmp
+; CHECK: br i1 %[[CMP35]], label %[[I38:.+]], label %[[IFELSE40:.+]]
+
+; CHECK: [[I38LOOPEXIT]]:
+; CHECK: br label %[[G]]
+
+; CHECK: [[I38]]:
+; CHECK: br label %[[IFEND42]]
+
+; CHECK: [[IFELSE40]]:
+; CHECK: br label %[[I38]]
+
+; CHECK: [[IFEND42]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization19.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization19.ll
new file mode 100644
index 0000000000000..ed8617e84ac95
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization19.ll
@@ -0,0 +1,379 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k partial_linearization19 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+
+; The CFG of the following kernel is:
+;
+;       a
+;       |
+;       b <----.
+;      / \     |
+;     c   \    |
+;    / \   \   |
+;   d   e   f -'
+;   |   |   |
+;    \  \   g
+;     \  \ / \
+;      \  h   i <,
+;       \  \ /  /
+;        \  j  /
+;         \   /
+;          `-'
+;
+; * where nodes b, c, and g are uniform branches, and node f is a varying
+;   branch.
+; * where nodes g, h, i and j are divergent.
+;
+; With BOSCC, it will be transformed as follows:
+;
+;       a
+;       |
+;       b <----. .---> b' <----.
+;      / \     | |    / \      |
+;     c   \    | |   c'  \     |
+;    / \   \   | |  / \   \    |
+;   d   e   f -' | d'  e'  f' -'
+;   |   |   |\___' |   |   |
+;    \  \   g       \  |  /
+;     \  \ / \       \ | /
+;      \  h   i <,    \|/
+;       \  \ /  /      g'
+;        \  j  /       |
+;         \ | /        i'
+;          `-'         |
+;           |          h'
+;           |          |
+;            `--> & <- j'
+;
+; where '&' represents merge blocks of BOSCC regions.
+;
+; The uniform branch `g` has been linearized because both its successors are
+; divergent. Leaving `g` as a branch would mean that only one of its two
+; successors could ever be executed, depending on how the uniform condition
+; evaluates, whereas what we want is to possibly execute both successors no
+; matter what the uniform condition evaluates to.
+;
+; __kernel void partial_linearization19(__global int *out, int n) {
+;   int id = get_global_id(0);
+;   int ret = 0;
+;   int i = 0;
+;
+;   while (1) {
+;     if (n > 5) {
+;       if (n == 6) {
+;         goto d;
+;       } else {
+;         goto e;
+;       }
+;     }
+;     if (++i + id > 3) {
+;       break;
+;     }
+;   }
+;
+;   // g
+;   if (n == 3) {
+;     goto h;
+;   } else {
+;     goto i;
+;   }
+;
+; d:
+;   for (int i = 0; i < n + 5; i++) ret += 2;
+;   goto i;
+;
+; e:
+;   for (int i = 1; i < n * 2; i++) ret += i;
+;   goto h;
+;
+; i:
+;   for (int i = 0; i < n + 5; i++) ret++;
+;   goto j;
+;
+; h:
+;   for (int i = 0; i < n; i++) ret++;
+;   goto j;
+;
+; j:
+;   out[id] = ret;
+; }
+
+; ModuleID = 'Unknown buffer'
+source_filename = "kernel.opencl"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @partial_linearization19(i32 addrspace(1)* %out, i32 %n) #0 {  ; test input: IR for the OpenCL C kernel listed in the comment above
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2  ; get_global_id(0)
+  %conv = trunc i64 %call to i32  ; `int id`: the 64-bit id truncated to i32
+  br label %while.body
+
+while.body:                                       ; preds = %if.end, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %if.end ]  ; while-loop counter `i`
+  %cmp = icmp sgt i32 %n, 5
+  br i1 %cmp, label %if.then, label %if.end  ; node b: n > 5 -> c (if.then), else f (if.end)
+
+if.then:                                          ; preds = %while.body
+  %cmp2 = icmp eq i32 %n, 6
+  br i1 %cmp2, label %for.cond, label %for.cond20  ; node c: n == 6 -> `goto d`, else `goto e`
+
+if.end:                                           ; preds = %while.body
+  %inc = add nuw nsw i32 %i.0, 1  ; ++i
+  %add = add nsw i32 %inc, %conv  ; ++i + id
+  %cmp5 = icmp sgt i32 %add, 3
+  br i1 %cmp5, label %while.end, label %while.body  ; node f: true -> `break`, false -> back to b
+
+while.end:                                        ; preds = %if.end
+  %cmp9 = icmp eq i32 %n, 3
+  br i1 %cmp9, label %h, label %i28  ; node g: n == 3 -> `goto h`, else `goto i`
+
+for.cond:                                         ; preds = %for.body, %if.then
+  %ret.0 = phi i32 [ %add17, %for.body ], [ 0, %if.then ]  ; `ret` inside the loop at label `d:`
+  %storemerge3 = phi i32 [ %inc18, %for.body ], [ 0, %if.then ]
+  %add14 = add nsw i32 %n, 5
+  %cmp15 = icmp slt i32 %storemerge3, %add14  ; i < n + 5
+  br i1 %cmp15, label %for.body, label %i28  ; loop exit -> `goto i`
+
+for.body:                                         ; preds = %for.cond
+  %add17 = add nuw nsw i32 %ret.0, 2  ; ret += 2
+  %inc18 = add nuw nsw i32 %storemerge3, 1
+  br label %for.cond
+
+for.cond20:                                       ; preds = %for.body23, %if.then
+  %ret.1 = phi i32 [ %add24, %for.body23 ], [ 0, %if.then ]  ; `ret` inside the loop at label `e:`
+  %storemerge2 = phi i32 [ %inc26, %for.body23 ], [ 1, %if.then ]  ; loop counter, starts at 1 (`int i = 1`)
+  %mul = shl nsw i32 %n, 1
+  %cmp21 = icmp slt i32 %storemerge2, %mul  ; i < n * 2
+  br i1 %cmp21, label %for.body23, label %h  ; loop exit -> `goto h`
+
+for.body23:                                       ; preds = %for.cond20
+  %add24 = add nuw nsw i32 %storemerge2, %ret.1  ; ret += i
+  %inc26 = add nuw nsw i32 %storemerge2, 1
+  br label %for.cond20
+
+i28:                                              ; preds = %for.cond, %while.end
+  %ret.2 = phi i32 [ 0, %while.end ], [ %ret.0, %for.cond ]  ; `ret` on entry to `i:`
+  br label %for.cond30
+
+for.cond30:                                       ; preds = %for.body34, %i28
+  %ret.3 = phi i32 [ %ret.2, %i28 ], [ %inc35, %for.body34 ]
+  %storemerge = phi i32 [ 0, %i28 ], [ %inc37, %for.body34 ]
+  %add31 = add nsw i32 %n, 5
+  %cmp32 = icmp slt i32 %storemerge, %add31  ; i < n + 5
+  br i1 %cmp32, label %for.body34, label %j  ; loop exit -> `goto j`
+
+for.body34:                                       ; preds = %for.cond30
+  %inc35 = add nuw nsw i32 %ret.3, 1  ; ret++
+  %inc37 = add nuw nsw i32 %storemerge, 1
+  br label %for.cond30
+
+h:                                                ; preds = %for.cond20, %while.end
+  %ret.4 = phi i32 [ 0, %while.end ], [ %ret.1, %for.cond20 ]  ; `ret` on entry to `h:`
+  br label %for.cond40
+
+for.cond40:                                       ; preds = %for.body43, %h
+  %ret.5 = phi i32 [ %ret.4, %h ], [ %inc44, %for.body43 ]
+  %storemerge1 = phi i32 [ 0, %h ], [ %inc46, %for.body43 ]
+  %cmp41 = icmp slt i32 %storemerge1, %n  ; i < n
+  br i1 %cmp41, label %for.body43, label %j  ; loop exit -> `goto j`
+
+for.body43:                                       ; preds = %for.cond40
+  %inc44 = add nsw i32 %ret.5, 1  ; ret++
+  %inc46 = add nuw nsw i32 %storemerge1, 1
+  br label %for.cond40
+
+j:                                                ; preds = %for.cond40, %for.cond30
+  %ret.6 = phi i32 [ %ret.3, %for.cond30 ], [ %ret.5, %for.cond40 ]  ; final `ret`
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %ret.6, i32 addrspace(1)* %arrayidx, align 4  ; out[id] = ret
+  ret void
+}
+
+; Function Attrs: convergent nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { convergent nobuiltin nounwind readonly }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.kernels = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization19, !3, !4, !5, !6, !7, !8}
+!3 = !{!"kernel_arg_addr_space", i32 1, i32 0}
+!4 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!5 = !{!"kernel_arg_type", !"int*", !"int"}
+!6 = !{!"kernel_arg_base_type", !"int*", !"int"}
+!7 = !{!"kernel_arg_type_qual", !"", !""}
+!8 = !{!"kernel_arg_name", !"out", !"n"}
+
+; CHECK: spir_kernel void @__vecz_v4_partial_linearization19
+; CHECK: br i1 true, label %[[WHILEBODYUNIFORM:.+]], label %[[WHILEBODY:.+]]
+
+; CHECK: [[WHILEBODY]]:
+; CHECK: %[[CMP:.+]] = icmp
+; CHECK: br i1 %[[CMP]], label %[[IFTHEN:.+]], label %[[IFEND:.+]]
+
+; CHECK: [[IFTHEN]]:
+; CHECK: %[[CMP2:.+]] = icmp
+; CHECK: br label %[[WHILEBODYPUREEXIT:.+]]
+
+; CHECK: [[IFTHENELSE:.+]]:
+; CHECK: br label %[[H:.+]]
+
+; CHECK: [[IFTHENSPLIT:.+]]:
+; CHECK: br i1 %[[CMP2MERGE:.+]], label %[[FORCONDPREHEADER:.+]], label %[[FORCOND20PREHEADER:.+]]
+
+; CHECK: [[FORCOND20PREHEADER]]:
+; CHECK: br label %[[FORCOND20:.+]]
+
+; CHECK: [[FORCONDPREHEADER]]:
+; CHECK: br label %[[FORCOND:.+]]
+
+; CHECK: [[IFEND]]:
+; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT]]
+
+; CHECK: [[WHILEBODYPUREEXIT]]:
+; CHECK: %[[CMP2MERGE]] = phi i1 [ %[[CMP2]], %[[IFTHEN]] ], [ false, %[[IFEND]] ]
+; CHECK: br label %[[WHILEEND:.+]]
+
+; CHECK: [[WHILEBODYUNIFORM]]:
+; CHECK: %[[CMPUNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMPUNIFORM]], label %[[IFTHENUNIFORM:.+]], label %[[IFENDUNIFORM:.+]]
+
+; CHECK: [[IFENDUNIFORM]]:
+; CHECK: br i1 %{{.+}}, label %[[WHILEENDUNIFORM:.+]], label %[[IFENDUNIFORMBOSCCINDIR:.+]]
+
+; CHECK: [[WHILEENDUNIFORM]]:
+; CHECK: %[[CMP9UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMP9UNIFORM]], label %[[HUNIFORM:.+]], label %[[I28UNIFORM:.+]]
+
+; CHECK: [[IFENDUNIFORMBOSCCINDIR]]:
+; CHECK: br i1 %{{.+}}, label %[[WHILEBODYUNIFORM]], label %[[IFENDUNIFORMBOSCCSTORE:.+]]
+
+; CHECK: [[IFENDUNIFORMBOSCCSTORE]]:
+; CHECK: br label %[[WHILEBODY]]
+
+; CHECK: [[IFTHENUNIFORM]]:
+; CHECK: %[[CMP2UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMP2UNIFORM]], label %[[FORCONDPREHEADERUNIFORM:.+]], label %[[FORCOND20PREHEADERUNIFORM:.+]]
+
+; CHECK: [[FORCOND20PREHEADERUNIFORM]]:
+; CHECK: br label %[[FORCOND20UNIFORM:.+]]
+
+; CHECK: [[FORCOND20UNIFORM]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY23UNIFORM:.+]], label %[[HLOOPEXITUNIFORM:.+]]
+
+; CHECK: [[FORBODY23UNIFORM]]:
+; CHECK: br label %[[FORCOND20UNIFORM]]
+
+; CHECK: [[HLOOPEXITUNIFORM]]:
+; CHECK: br label %[[HUNIFORM]]
+
+; CHECK: [[FORCONDPREHEADERUNIFORM]]:
+; CHECK: br label %[[FORCONDUNIFORM:.+]]
+
+; CHECK: [[FORCONDUNIFORM]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODYUNIFORM:.+]], label %[[I28LOOPEXITUNIFORM:.+]]
+
+; CHECK: [[FORBODYUNIFORM]]:
+; CHECK: br label %[[FORCONDUNIFORM]]
+
+; CHECK: [[I28LOOPEXITUNIFORM]]:
+; CHECK: br label %[[I28UNIFORM]]
+
+; CHECK: [[HUNIFORM]]:
+; CHECK: br label %[[FORCOND40UNIFORM:.+]]
+
+; CHECK: [[FORCOND40UNIFORM]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY43UNIFORM:.+]], label %[[JLOOPEXIT1UNIFORM:.+]]
+
+; CHECK: [[FORBODY43UNIFORM]]:
+; CHECK: br label %[[FORCOND40UNIFORM]]
+
+; CHECK: [[JLOOPEXIT1UNIFORM]]:
+; CHECK: br label %[[JUNIFORM:.+]]
+
+; CHECK: [[I28UNIFORM]]:
+; CHECK: br label %[[FORCOND30UNIFORM:.+]]
+
+; CHECK: [[FORCOND30UNIFORM]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY34UNIFORM:.+]], label %[[JLOOPEXITUNIFORM:.+]]
+
+; CHECK: [[FORBODY34UNIFORM]]:
+; CHECK: br label %[[FORCOND30UNIFORM]]
+
+; CHECK: [[JLOOPEXITUNIFORM]]:
+; CHECK: br label %[[J:.+]]
+
+; CHECK: [[WHILEEND]]:
+; CHECK: br label %[[WHILEENDELSE:.+]]
+
+; CHECK: [[WHILEENDELSE]]:
+; CHECK: br i1 %{{.+}}, label %[[IFTHENELSE]], label %[[IFTHENSPLIT]]
+
+; CHECK: [[FORCOND]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[I28LOOPEXIT:.+]]
+
+; CHECK: [[FORBODY]]:
+; CHECK: br label %[[FORCOND]]
+
+; CHECK: [[FORCOND20]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY23:.+]], label %[[HLOOPEXIT:.+]]
+
+; CHECK: [[FORBODY23]]:
+; CHECK: br label %[[FORCOND20]]
+
+; CHECK: [[I28LOOPEXIT]]:
+; CHECK: br label %[[H:.+]]
+
+; CHECK: [[I28:.+]]:
+; CHECK: br label %[[FORCOND30:.+]]
+
+; CHECK: [[FORCOND30]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY34:.+]], label %[[JLOOPEXIT:.+]]
+
+; CHECK: [[FORBODY34]]:
+; CHECK: br label %[[FORCOND30]]
+
+; CHECK: [[HLOOPEXIT]]:
+; CHECK: br label %[[H]]
+
+; CHECK: [[H]]:
+; CHECK: br label %[[FORCOND40:.+]]
+
+; CHECK: [[FORCOND40]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY43:.+]], label %[[JLOOPEXIT2:.+]]
+
+; CHECK: [[FORBODY43]]:
+; CHECK: br label %[[FORCOND40]]
+
+; CHECK: [[JLOOPEXIT]]:
+; CHECK: br label %[[J]]
+
+; CHECK: [[JLOOPEXIT2]]:
+; CHECK: br label %[[I28]]
+
+; CHECK: [[J]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization2.ll
new file mode 100644
index 0000000000000..24f059dc49073
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization2.ll
@@ -0,0 +1,340 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k partial_linearization2 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+
+; The CFG of the following kernel is:
+;
+;         a
+;        / \
+;       /   \
+;      /     \
+;     b       c
+;    / \     / \
+;   d   e   f   g
+;    \   \ /   /
+;     \   X   /
+;      \ / \ /
+;       h   i
+;        \ /
+;         j
+;
+; * where node a is a uniform branch, and nodes b and c are varying branches.
+; * where nodes d, e, f, g are divergent.
+;
+; With BOSCC, it will be transformed as follows:
+;
+;         a
+;        / \
+;       /   \
+;      /     \
+;     b__     c________
+;    / \ \___/_\___    \
+;   d   e   f   g  `e'  g'
+;    \   \ /   /    |   |
+;     \   X   /     d'  f'
+;      \ / \ /       \ /
+;       h   i         i'
+;        \ /          |
+;         j           h'
+;          \          |
+;           `--> & <- j'
+;
+; where '&' represents merge blocks of BOSCC regions.
+;
+; __kernel void partial_linearization2(__global int *out, int n) {
+;   int id = get_global_id(0);
+;   int ret = 0;
+;
+;   if (n > 10) { // uniform
+;     if (id % 3 == 0) { // varying
+;       for (int i = 0; i < n - 1; i++) { ret /= 2; } goto h;
+;     } else { // varying
+;       for (int i = 0; i < n / 3; i++) { ret -= 2; } goto i;
+;     }
+;   } else { // uniform
+;     if (id % 2 == 0) { // varying
+;       for (int i = 0; i < n * 2; i++) { ret += 1; } goto h;
+;     } else { // varying
+;       for (int i = 0; i < n + 5; i++) { ret *= 2; } goto i;
+;     }
+;   }
+;
+; h:
+;   ret += 5;
+;   goto end;
+;
+; i:
+;   ret *= 10;
+;   goto end;
+;
+; end:
+;   out[id] = ret;
+; }
+
+; ModuleID = 'Unknown buffer'
+source_filename = "Unknown buffer"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @partial_linearization2(i32 addrspace(1)* %out, i32 %n) #0 { ; IR for the OpenCL C kernel listed above
+entry: ; CFG node a
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2 ; get_global_id(0)
+  %conv = trunc i64 %call to i32 ; id
+  %cmp = icmp sgt i32 %n, 10 ; n > 10, computed from the kernel argument only
+  br i1 %cmp, label %if.then, label %if.else17 ; the uniform branch of node a
+
+if.then:                                          ; preds = %entry (CFG node b)
+  %rem = srem i32 %conv, 3
+  %cmp2 = icmp eq i32 %rem, 0 ; id % 3 == 0
+  br i1 %cmp2, label %if.then4, label %if.else ; varying branch: the condition derives from id
+
+if.then4:                                         ; preds = %if.then
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %if.then4 (loop of CFG node d)
+  %ret.0 = phi i32 [ 0, %if.then4 ], [ %div, %for.body ] ; running value of ret
+  %storemerge5 = phi i32 [ 0, %if.then4 ], [ %inc, %for.body ] ; loop counter i
+  %sub = add nsw i32 %n, -1
+  %cmp5 = icmp slt i32 %storemerge5, %sub ; i < n - 1
+  br i1 %cmp5, label %for.body, label %h ; leaving the loop goes to h
+
+for.body:                                         ; preds = %for.cond
+  %div = sdiv i32 %ret.0, 2 ; ret /= 2
+  %inc = add nsw i32 %storemerge5, 1
+  br label %for.cond
+
+if.else:                                          ; preds = %if.then
+  br label %for.cond8
+
+for.cond8:                                        ; preds = %for.body12, %if.else (loop of CFG node e)
+  %ret.1 = phi i32 [ 0, %if.else ], [ %sub13, %for.body12 ] ; running value of ret
+  %storemerge4 = phi i32 [ 0, %if.else ], [ %inc15, %for.body12 ] ; loop counter i
+  %div9 = sdiv i32 %n, 3
+  %cmp10 = icmp slt i32 %storemerge4, %div9 ; i < n / 3
+  br i1 %cmp10, label %for.body12, label %i42 ; leaving the loop goes to i
+
+for.body12:                                       ; preds = %for.cond8
+  %sub13 = add nsw i32 %ret.1, -2 ; ret -= 2
+  %inc15 = add nsw i32 %storemerge4, 1
+  br label %for.cond8
+
+if.else17:                                        ; preds = %entry (CFG node c)
+  %rem181 = and i32 %conv, 1 ; low bit of id stands in for id % 2
+  %cmp19 = icmp eq i32 %rem181, 0 ; id % 2 == 0
+  br i1 %cmp19, label %if.then21, label %if.else30 ; varying branch: the condition derives from id
+
+if.then21:                                        ; preds = %if.else17
+  br label %for.cond23
+
+for.cond23:                                       ; preds = %for.body26, %if.then21 (loop of CFG node f)
+  %ret.2 = phi i32 [ 0, %if.then21 ], [ %add, %for.body26 ] ; running value of ret
+  %storemerge3 = phi i32 [ 0, %if.then21 ], [ %inc28, %for.body26 ] ; loop counter i
+  %mul = shl nsw i32 %n, 1 ; n * 2
+  %cmp24 = icmp slt i32 %storemerge3, %mul ; i < n * 2
+  br i1 %cmp24, label %for.body26, label %h ; leaving the loop goes to h
+
+for.body26:                                       ; preds = %for.cond23
+  %add = add nsw i32 %ret.2, 1 ; ret += 1
+  %inc28 = add nsw i32 %storemerge3, 1
+  br label %for.cond23
+
+if.else30:                                        ; preds = %if.else17
+  br label %for.cond32
+
+for.cond32:                                       ; preds = %for.body36, %if.else30 (loop of CFG node g)
+  %ret.3 = phi i32 [ 0, %if.else30 ], [ %mul37, %for.body36 ] ; running value of ret
+  %storemerge = phi i32 [ 0, %if.else30 ], [ %inc39, %for.body36 ] ; loop counter i
+  %add33 = add nsw i32 %n, 5
+  %cmp34 = icmp slt i32 %storemerge, %add33 ; i < n + 5
+  br i1 %cmp34, label %for.body36, label %i42 ; leaving the loop goes to i
+
+for.body36:                                       ; preds = %for.cond32
+  %mul37 = shl nsw i32 %ret.3, 1 ; ret *= 2
+  %inc39 = add nsw i32 %storemerge, 1
+  br label %for.cond32
+
+h:                                                ; preds = %for.cond23, %for.cond (CFG node h)
+  %ret.4 = phi i32 [ %ret.0, %for.cond ], [ %ret.2, %for.cond23 ] ; ret from whichever of d / f ran
+  %add41 = add nsw i32 %ret.4, 5 ; ret += 5
+  br label %end
+
+i42:                                              ; preds = %for.cond32, %for.cond8 (CFG node i)
+  %ret.5 = phi i32 [ %ret.1, %for.cond8 ], [ %ret.3, %for.cond32 ] ; ret from whichever of e / g ran
+  %mul43 = mul nsw i32 %ret.5, 10 ; ret *= 10
+  br label %end
+
+end:                                              ; preds = %i42, %h (CFG node j)
+  %storemerge2 = phi i32 [ %mul43, %i42 ], [ %add41, %h ] ; final ret
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom ; &out[id]
+  store i32 %storemerge2, i32 addrspace(1)* %arrayidx, align 4 ; out[id] = ret
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nobuiltin nounwind readonly }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.kernels = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization2, !3, !4, !5, !6, !7, !8}
+!3 = !{!"kernel_arg_addr_space", i32 1, i32 0}
+!4 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!5 = !{!"kernel_arg_type", !"int*", !"int"}
+!6 = !{!"kernel_arg_base_type", !"int*", !"int"}
+!7 = !{!"kernel_arg_type_qual", !"", !""}
+!8 = !{!"kernel_arg_name", !"out", !"n"}
+
+; CHECK: spir_kernel void @__vecz_v4_partial_linearization2
+; CHECK: %[[CMP:.+]] = icmp
+; CHECK: br i1 %[[CMP]], label %[[IFTHEN:.+]], label %[[IFELSE17:.+]]
+
+; CHECK: [[IFTHEN]]:
+; CHECK: br i1 %{{.+}}, label %[[FORCONDPREHEADERUNIFORM:.+]], label %[[IFTHENBOSCCINDIR:.+]]
+
+; CHECK: [[FORCOND8PREHEADERUNIFORM:.+]]:
+; CHECK: br label %[[FORCOND8UNIFORM:.+]]
+
+; CHECK: [[FORCOND8UNIFORM]]:
+; CHECK: %[[CMP10UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMP10UNIFORM]], label %[[FORBODY12UNIFORM:.+]], label %[[I42LOOPEXITUNIFORM:.+]]
+
+; CHECK: [[FORBODY12UNIFORM]]:
+; CHECK: br label %[[FORCOND8UNIFORM]]
+
+; CHECK: [[I42LOOPEXITUNIFORM]]:
+; CHECK: br label %[[I42UNIFORM:.+]]
+
+; CHECK: [[FORCONDPREHEADERUNIFORM]]:
+; CHECK: br label %[[FORCONDUNIFORM:.+]]
+
+; CHECK: [[IFTHENBOSCCINDIR]]:
+; CHECK: br i1 %{{.+}}, label %[[FORCOND8PREHEADERUNIFORM]], label %[[FORCOND8PREHEADER:.+]]
+
+; CHECK: [[FORCONDUNIFORM]]:
+; CHECK: %[[CMP5UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMP5UNIFORM]], label %[[FORBODYUNIFORM:.+]], label %[[HLOOPEXITUNIFORM:.+]]
+
+; CHECK: [[FORBODYUNIFORM]]:
+; CHECK: br label %[[FORCONDUNIFORM]]
+
+; CHECK: [[HLOOPEXITUNIFORM]]:
+; CHECK: br label %[[HUNIFORM:.+]]
+
+; CHECK: [[FORCOND8PREHEADER]]:
+; CHECK: br label %[[FORCOND8:.+]]
+
+; CHECK: [[FORCONDPREHEADER:.+]]:
+; CHECK: br label %[[FORCOND:.+]]
+
+; CHECK: [[FORCOND]]:
+; CHECK: %[[CMP5:.+]] = icmp
+; CHECK: br i1 %[[CMP5]], label %[[FORBODY:.+]], label %[[HLOOPEXIT:.+]]
+
+; CHECK: [[FORBODY]]:
+; CHECK: br label %[[FORCOND]]
+
+; CHECK: [[FORCOND8]]:
+; CHECK: %[[CMP10:.+]] = icmp
+; CHECK: br i1 %[[CMP10]], label %[[FORBODY12:.+]], label %[[I42LOOPEXIT:.+]]
+
+; CHECK: [[FORBODY12]]:
+; CHECK: br label %[[FORCOND8]]
+
+; CHECK: [[IFELSE17]]:
+; CHECK: br i1 %{{.+}}, label %[[FORCOND23PREHEADERUNIFORM:.+]], label %[[IFELSE17BOSCCINDIR:.+]]
+
+; CHECK: [[FORCOND32PREHEADERUNIFORM:.+]]:
+; CHECK: br label %[[FORCOND32UNIFORM:.+]]
+
+; CHECK: [[FORCOND32UNIFORM]]:
+; CHECK: br i1 false, label %[[FORBODY36UNIFORM:.+]], label %[[I42LOOPEXIT2UNIFORM:.+]]
+
+; CHECK: [[FORBODY36UNIFORM]]:
+; CHECK: br label %[[FORCOND32UNIFORM]]
+
+; CHECK: [[I42LOOPEXIT2UNIFORM]]:
+; CHECK: br label %[[I42UNIFORM]]
+
+; CHECK: [[FORCOND23PREHEADERUNIFORM]]:
+; CHECK: br label %[[FORCOND23UNIFORM:.+]]
+
+; CHECK: [[IFELSE17BOSCCINDIR]]:
+; CHECK: br i1 %{{.+}}, label %[[FORCOND32PREHEADERUNIFORM]], label %[[FORCOND32PREHEADER:.+]]
+
+; CHECK: [[FORCOND23UNIFORM]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY26UNIFORM:.+]], label %[[HLOOPEXIT1UNIFORM:.+]]
+
+; CHECK: [[FORBODY26UNIFORM]]:
+; CHECK: br label %[[FORCOND23UNIFORM]]
+
+; CHECK: [[HLOOPEXIT1UNIFORM]]:
+; CHECK: br label %[[HUNIFORM]]
+
+; CHECK: [[I42UNIFORM]]:
+; CHECK: br label %[[ENDUNIFORM:.+]]
+
+; CHECK: [[HUNIFORM]]:
+; CHECK: br label %[[END:.+]]
+
+; CHECK: [[FORCOND32PREHEADER]]:
+; CHECK: br label %[[FORCOND32:.+]]
+
+; CHECK: [[FORCOND23PREHEADER:.+]]:
+; CHECK: br label %[[FORCOND23:.+]]
+
+; CHECK: [[FORCOND23]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY26:.+]], label %[[HLOOPEXIT1:.+]]
+
+; CHECK: [[FORBODY26]]:
+; CHECK: br label %[[FORCOND23]]
+
+; CHECK: [[FORCOND32]]:
+; CHECK: br i1 false, label %[[FORBODY36:.+]], label %[[I42LOOPEXIT2:.+]]
+
+; CHECK: [[FORBODY36]]:
+; CHECK: br label %[[FORCOND32]]
+
+; CHECK: [[HLOOPEXIT]]:
+; CHECK: br label %[[I42:.+]]
+
+; CHECK: [[HLOOPEXIT1]]:
+; CHECK: br label %[[I42]]
+
+; CHECK: [[H:.+]]:
+; CHECK: br label %[[END]]
+
+; CHECK: [[I42LOOPEXIT]]:
+; CHECK: br label %[[FORCONDPREHEADER]]
+
+; CHECK: [[I42LOOPEXIT2]]:
+; CHECK: br label %[[FORCOND23PREHEADER]]
+
+; CHECK: [[I42]]:
+; CHECK: br label %[[H]]
+
+; CHECK: [[END]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization20.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization20.ll
new file mode 100644
index 0000000000000..67926b955c666
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization20.ll
@@ -0,0 +1,288 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k partial_linearization20 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+
+; The CFG of the following kernel is:
+;
+;     a
+;     |
+;     b <--------.
+;    / \         |
+;   |   c        |
+;   |  / \       |
+;   | f   h <--. |
+;   | |  / \   | |
+;   | | |   d -' |
+;   | | |   |    |
+;   | | |   e ---'
+;   | | |  /
+;   | | | /
+;   | | |/
+;   | | /
+;    \|/
+;     g
+;
+; * where nodes b, d, and e are uniform branches, and node h is a varying
+;   branch.
+; * where nodes b, d and g are divergent.
+;
+; With BOSCC, it will be transformed as follows:
+;
+;     a
+;     |
+;     b <--------.      b' <--.
+;    / \         |      |     |
+;   |   c        | .-.  c'    |
+;   |  / \       | |  \/|     |
+;   | f   h <--. | |  / h' <. |
+;   | |  / \   | | | f' |   | |
+;   | | |   d -' | | |  d' -' |
+;   | | |   |\___|_' |  |     |
+;   | | |   e ---'   |  e' ---'
+;   | | |  /          \ |
+;   | | | /            \|
+;   | | |/              g'
+;   | | /               |
+;    \|/               /
+;     g ----> & <-----'
+;
+; where '&' represents merge blocks of BOSCC regions.
+;
+; __kernel void partial_linearization20(__global int *out, int n) {
+;   int id = get_global_id(0);
+;   int ret = 0;
+;
+;   while (1) {
+;     if (n > 0 && n < 5) {
+;       goto g;
+;     }
+;     if (n == 6) {
+;       goto f;
+;     }
+;     while (1) {
+;       if (ret++ + id >= n) {
+;         goto d;
+;       }
+;       if (n & 1) {
+;         goto g;
+;       }
+;
+; d:
+;       if (n > 3) {
+;         goto e;
+;       }
+;     }
+; e:
+;     if (n & 1) {
+;       goto g;
+;     }
+;   }
+;
+; f:
+;   for (int i = 0; i < n + 1; i++) ret++;
+; g:
+;   out[id] = ret;
+; }
+
+; ModuleID = 'kernel.opencl'
+source_filename = "kernel.opencl"
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @partial_linearization20(i32 addrspace(1)* %out, i32 %n) #0 { ; IR for the OpenCL C kernel listed above
+entry: ; CFG node a
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2 ; get_global_id(0)
+  %conv = trunc i64 %call to i32 ; id
+  br label %while.body
+
+while.body:                                       ; preds = %e, %entry (CFG node b, outer loop header)
+  %ret.0 = phi i32 [ 0, %entry ], [ %inc, %e ] ; ret on entry to each outer iteration
+  %n.off = add i32 %n, -1
+  %0 = icmp ult i32 %n.off, 4 ; n - 1 <u 4, i.e. n > 0 && n < 5
+  br i1 %0, label %g, label %if.end ; condition uses only n
+
+if.end:                                           ; preds = %while.body (CFG node c)
+  %cmp4 = icmp eq i32 %n, 6 ; n == 6 selects the "goto f" path
+  br i1 %cmp4, label %for.cond, label %while.body9
+
+while.body9:                                      ; preds = %d, %if.end (CFG node h, inner loop header)
+  %ret.1 = phi i32 [ %ret.0, %if.end ], [ %inc, %d ]
+  %inc = add nsw i32 %ret.1, 1 ; ret++
+  %add = add nsw i32 %ret.1, %conv ; pre-increment ret + id
+  %cmp10 = icmp sge i32 %add, %n ; ret + id >= n
+  %and = and i32 %n, 1
+  %tobool = icmp eq i32 %and, 0 ; !(n & 1)
+  %or.cond1 = or i1 %tobool, %cmp10 ; the two source-level tests folded into one condition
+  br i1 %or.cond1, label %d, label %g ; varying branch: %cmp10 depends on id
+
+d:                                                ; preds = %while.body9 (CFG node d)
+  %cmp16 = icmp sgt i32 %n, 3 ; n > 3 leaves the inner loop
+  br i1 %cmp16, label %e, label %while.body9 ; false edge is the inner back-edge
+
+e:                                                ; preds = %d (CFG node e)
+  %and20 = and i32 %n, 1
+  %tobool21 = icmp eq i32 %and20, 0 ; !(n & 1)
+  br i1 %tobool21, label %while.body, label %g ; true edge is the outer back-edge
+
+for.cond:                                         ; preds = %for.body, %if.end (loop of CFG node f)
+  %ret.2 = phi i32 [ %inc27, %for.body ], [ %ret.0, %if.end ] ; running value of ret
+  %storemerge = phi i32 [ %inc28, %for.body ], [ 0, %if.end ] ; loop counter i
+  %cmp25 = icmp sgt i32 %storemerge, %n ; exit once i > n, i.e. iterate while i < n + 1
+  br i1 %cmp25, label %g, label %for.body
+
+for.body:                                         ; preds = %for.cond
+  %inc27 = add nsw i32 %ret.2, 1 ; ret++
+  %inc28 = add nuw nsw i32 %storemerge, 1
+  br label %for.cond
+
+g:                                                ; preds = %for.cond, %e, %while.body9, %while.body (CFG node g)
+  %ret.3 = phi i32 [ %ret.0, %while.body ], [ %inc, %e ], [ %ret.2, %for.cond ], [ %inc, %while.body9 ] ; ret as seen on each path into g
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom ; &out[id]
+  store i32 %ret.3, i32 addrspace(1)* %arrayidx, align 4 ; out[id] = ret
+  ret void
+}
+
+; Function Attrs: convergent nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { convergent nobuiltin nounwind readonly }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.kernels = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization20, !3, !4, !5, !6, !7, !8}
+!3 = !{!"kernel_arg_addr_space", i32 1, i32 0}
+!4 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!5 = !{!"kernel_arg_type", !"int*", !"int"}
+!6 = !{!"kernel_arg_base_type", !"int*", !"int"}
+!7 = !{!"kernel_arg_type_qual", !"", !""}
+!8 = !{!"kernel_arg_name", !"out", !"n"}
+
+; CHECK: spir_kernel void @__vecz_v4_partial_linearization20
+; CHECK: br i1 true, label %[[WHILEBODYUNIFORM:.+]], label %[[WHILEBODY:.+]]
+
+; CHECK: [[WHILEBODY]]:
+; CHECK: br label %[[IFEND:.+]]
+
+; CHECK: [[IFEND]]:
+; CHECK: %[[CMP4:.+]] = icmp
+; CHECK: br i1 %[[CMP4]], label %[[FORCONDPREHEADER:.+]], label %[[WHILEBODY9PREHEADER:.+]]
+
+; CHECK: [[WHILEBODY9PREHEADER]]:
+; CHECK: br label %[[WHILEBODY9:.+]]
+
+; CHECK: [[FORCONDPREHEADER]]:
+; CHECK: br label %[[WHILEBODYPUREEXIT:.+]]
+
+; CHECK: [[FORCONDPREHEADERELSE:.+]]:
+; CHECK: br label %[[G:.+]]
+
+; CHECK: [[FORCONDPREHEADERSPLIT:.+]]:
+; CHECK: br label %[[FORCOND:.+]]
+
+; CHECK: [[WHILEBODY9]]:
+; CHECK: br label %[[D:.+]]
+
+; CHECK: [[WHILEBODYUNIFORM]]:
+; CHECK: br i1 %{{.+}}, label %[[GLOOPEXIT2UNIFORM:.+]], label %[[IFENDUNIFORM:.+]]
+
+; CHECK: [[IFENDUNIFORM]]:
+; CHECK: %[[CMP4UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMP4UNIFORM]], label %[[FORCONDPREHEADERUNIFORM:.+]], label %[[WHILEBODY9PREHEADERUNIFORM:.+]]
+
+; CHECK: [[WHILEBODY9PREHEADERUNIFORM]]:
+; CHECK: br label %[[WHILEBODY9UNIFORM:.+]]
+
+; CHECK: [[WHILEBODY9UNIFORM]]:
+; CHECK: br i1 %{{.+}}, label %[[DUNIFORM:.+]], label %[[WHILEBODY9UNIFORMBOSCCINDIR:.+]]
+
+; CHECK: [[DUNIFORM]]:
+; CHECK: %[[CMP16UNIFORM:.+]] = icmp
+; CHECK: br i1 %{{.+}}, label %[[EUNIFORM:.+]], label %[[WHILEBODY9UNIFORM]]
+
+; CHECK: [[WHILEBODY9UNIFORMBOSCCINDIR]]:
+; CHECK: br i1 %{{.+}}, label %[[GLOOPEXIT1UNIFORM:.+]], label %[[WHILEBODY9UNIFORMBOSCCSTORE:.+]]
+
+; CHECK: [[WHILEBODY9UNIFORMBOSCCSTORE]]:
+; CHECK: br label %[[D]]
+
+; CHECK: [[EUNIFORM]]:
+; CHECK: %[[TOBOOL21UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[TOBOOL21UNIFORM]], label %[[WHILEBODYUNIFORM]], label %[[GLOOPEXIT2UNIFORM]]
+
+
+; CHECK: [[GLOOPEXIT1UNIFORM]]:
+; CHECK: br label %[[GUNIFORM:.+]]
+
+; CHECK: [[FORCONDPREHEADERUNIFORM]]:
+; CHECK: br label %[[FORCONDUNIFORM:.+]]
+
+; CHECK: [[FORCONDUNIFORM]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(true)}}, label %[[GLOOPEXITUNIFORM:.+]], label %[[FORBODYUNIFORM:.+]]
+
+; CHECK: [[FORBODYUNIFORM]]:
+; CHECK: br label %[[FORCONDUNIFORM]]
+
+; CHECK: [[GLOOPEXITUNIFORM]]:
+; CHECK: br label %[[GUNIFORM]]
+
+; CHECK: [[GLOOPEXIT2UNIFORM]]:
+; CHECK: br label %[[G]]
+
+; CHECK: [[D]]:
+; CHECK: br i1 %{{.+}}, label %[[WHILEBODY9]], label %[[WHILEBODY9PUREEXIT:.+]]
+
+; CHECK: [[WHILEBODY9PUREEXIT]]:
+; CHECK: br label %[[E:.+]]
+
+; CHECK: [[E]]:
+; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT]]
+
+; CHECK: [[WHILEBODYPUREEXIT]]:
+; CHECK: br label %[[GLOOPEXIT1:.+]]
+
+; CHECK: [[FORCOND]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(true)}}, label %[[GLOOPEXIT:.+]], label %[[FORBODY:.+]]
+
+; CHECK: [[FORBODY]]:
+; CHECK: br label %[[FORCOND]]
+
+; CHECK: [[GLOOPEXIT]]:
+; CHECK: br label %[[G]]
+
+; CHECK: [[GLOOPEXIT1]]:
+; CHECK: br label %[[GLOOPEXIT1ELSE:.+]]
+
+; CHECK: [[GLOOPEXIT1ELSE]]:
+; CHECK: br label %[[GLOOPEXIT2:.+]]
+
+; CHECK: [[GLOOPEXIT2]]:
+; CHECK: br label %[[GLOOPEXIT2ELSE:.+]]
+
+; CHECK: [[GLOOPEXIT2ELSE]]:
+; CHECK: br i1 %{{.+}}, label %[[FORCONDPREHEADERELSE]], label %[[FORCONDPREHEADERSPLIT]]
+
+; CHECK: [[G]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization21.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization21.ll
new file mode 100644
index 0000000000000..59e104a589feb
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization21.ll
@@ -0,0 +1,239 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k partial_linearization21 -vecz-passes=vecz-loop-rotate,cfg-convert -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+
+; The CFG of the following kernel is:
+;
+;     a
+;     |
+;     b <------.
+;    / \       |
+;   |   c <--. |
+;   |  / \   | |
+;   | |   d -' |
+;   | |  / \   |
+;   | | |   e -'
+;   | | |  /
+;   | | | /
+;   | | |/
+;   | | /
+;    \|/
+;     f
+;
+; * where nodes b, d, and e are uniform branches, and node c is a varying
+;   branch.
+; * where nodes b, d, e and f are divergent.
+;
+; With BOSCC, it will be transformed as follows:
+;
+;     a
+;     |
+;     b <------.   b' <--.
+;    / \       |   |     |
+;   |   c <--. |   c' <. |
+;   |  / \___|_|__ |   | |
+;   | |   d -' |  `d' -' |
+;   | |  / \   |   |     |
+;   | | |   e -'   e' ---'
+;   | | |  /       |
+;   | | | /        f'
+;   | | |/         |
+;   | | /          |
+;    \|/          /
+;     f --> & <--'
+;
+; where '&' represents merge blocks of BOSCC regions.
+;
+; __kernel void partial_linearization21(__global int *out, int n) {
+;   int id = get_global_id(0);
+;   int ret = 0;
+;
+;   while (1) {
+;     if (n > 0 && n < 5) {
+;       goto f;
+;     }
+;     while (1) {
+;       if (n <= 2) {
+;         goto f;
+;       } else {
+;         if (ret + id >= n) {
+;           goto d;
+;         }
+;       }
+;       if (n & 1) {
+;         goto f;
+;       }
+;
+; d:
+;       if (n > 3) {
+;         goto e;
+;       }
+;     }
+;
+; e:
+;     if (n & 1) {
+;       goto f;
+;     }
+;   }
+;
+; f:
+;   out[id] = ret;
+; }
+
+; ModuleID = 'Unknown buffer'
+source_filename = "kernel.opencl"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @partial_linearization21(i32 addrspace(1)* %out, i32 %n) #0 { ; IR for the OpenCL C kernel listed above
+entry: ; CFG node a
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2 ; get_global_id(0)
+  %conv = trunc i64 %call to i32 ; id
+  br label %while.body
+
+while.body:                                       ; preds = %e, %entry (CFG node b, outer loop header)
+  %n.off = add i32 %n, -1
+  %0 = icmp ult i32 %n.off, 4 ; n - 1 <u 4, i.e. n > 0 && n < 5
+  %cmp6 = icmp slt i32 %n, 3 ; n <= 2, the first test of the inner loop
+  %or.cond1 = or i1 %cmp6, %0 ; either test sends the work-item to f
+  br i1 %or.cond1, label %f, label %if.else ; condition uses only n
+
+while.body5:                                      ; preds = %d (re-entry test of the inner loop)
+  %cmp6.old = icmp eq i32 %n, 3 ; reached only when d found n > 3 false, so n == 3 is the sole value not caught by "n <= 2"
+  br i1 %cmp6.old, label %if.else, label %f
+
+if.else:                                          ; preds = %while.body5, %while.body (CFG node c)
+  %cmp9 = icmp sge i32 %conv, %n ; ret + id >= n with ret == 0
+  %and = and i32 %n, 1
+  %tobool = icmp eq i32 %and, 0 ; !(n & 1)
+  %or.cond2 = or i1 %tobool, %cmp9 ; the two source-level tests folded into one condition
+  br i1 %or.cond2, label %d, label %f ; varying branch: %cmp9 depends on id
+
+d:                                                ; preds = %if.else (CFG node d)
+  %cmp16 = icmp sgt i32 %n, 3 ; n > 3 leaves the inner loop
+  br i1 %cmp16, label %e, label %while.body5
+
+e:                                                ; preds = %d (CFG node e)
+  %and20 = and i32 %n, 1
+  %tobool21 = icmp eq i32 %and20, 0 ; !(n & 1)
+  br i1 %tobool21, label %while.body, label %f ; true edge is the outer back-edge
+
+f:                                                ; preds = %e, %if.else, %while.body5, %while.body (CFG node f)
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom ; &out[id]
+  store i32 0, i32 addrspace(1)* %arrayidx, align 4 ; out[id] = ret; ret is never modified in the source, so the constant 0 is stored
+  ret void
+}
+
+; Function Attrs: convergent nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { convergent nobuiltin nounwind readonly }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.kernels = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization21, !3, !4, !5, !6, !7, !8}
+!3 = !{!"kernel_arg_addr_space", i32 1, i32 0}
+!4 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!5 = !{!"kernel_arg_type", !"int*", !"int"}
+!6 = !{!"kernel_arg_base_type", !"int*", !"int"}
+!7 = !{!"kernel_arg_type_qual", !"", !""}
+!8 = !{!"kernel_arg_name", !"out", !"n"}
+
+; CHECK: spir_kernel void @__vecz_v4_partial_linearization21
+; CHECK: br i1 true, label %[[WHILEBODYUNIFORM:.+]], label %[[WHILEBODY:.+]]
+
+; CHECK: [[WHILEBODY]]:
+; CHECK: br label %[[IFELSEPREHEADER:.+]]
+
+; CHECK: [[IFELSEPREHEADER]]:
+; CHECK: br label %[[IFELSE:.+]]
+
+; CHECK: [[WHILEBODY5:.+]]:
+
+; CHECK: br i1 %{{.+}}, label %[[IFELSE]], label %[[IFELSEPUREEXIT:.+]]
+
+; CHECK: [[IFELSEPUREEXIT]]:
+; CHECK: br label %[[E:.+]]
+
+; CHECK: [[IFELSE]]:
+; CHECK: br label %[[D:.+]]
+
+; CHECK: [[WHILEBODYUNIFORM]]:
+; CHECK: %[[CMP6UNIFORM:cmp.+]] = icmp
+; CHECK: %[[ORCOND1UNIFORM:.+]] = or i1 %[[CMP6UNIFORM]]
+; CHECK: br i1 %[[ORCOND1UNIFORM]], label %[[FLOOPEXIT1UNIFORM:.+]], label %[[IFELSEPREHEADERUNIFORM:.+]]
+
+; CHECK: [[IFELSEPREHEADERUNIFORM]]:
+; CHECK: br label %[[IFELSEUNIFORM:.+]]
+
+; CHECK: [[IFELSEUNIFORM]]:
+; CHECK: br i1 %{{.+}}, label %[[DUNIFORM:.+]], label %[[IFELSEUNIFORMBOSCCINDIR:.+]]
+
+; CHECK: [[DUNIFORM]]:
+; CHECK: %[[CMP16UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMP16UNIFORM]], label %[[EUNIFORM:.+]], label %[[WHILEBODY5UNIFORM:.+]]
+
+; CHECK: [[IFELSEUNIFORMBOSCCINDIR]]:
+; CHECK: br i1 %{{.+}}, label %[[FLOOPEXITUNIFORM:.+]], label %[[IFELSEUNIFORMBOSCCSTORE:.+]]
+
+; CHECK: [[IFELSEUNIFORMBOSCCSTORE]]:
+; CHECK: br label %[[D]]
+
+; CHECK: [[WHILEBODY5UNIFORM]]:
+; CHECK: %[[CMP6OLDUNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMP6OLDUNIFORM]], label %[[IFELSEUNIFORM]], label %[[FLOOPEXITUNIFORM]]
+
+; CHECK: [[EUNIFORM]]:
+; CHECK: %[[TOBOOL21UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[TOBOOL21UNIFORM]], label %[[WHILEBODYUNIFORM]], label %[[FLOOPEXIT1UNIFORM]]
+
+
+; CHECK: [[FLOOPEXITUNIFORM]]:
+; CHECK: br label %[[FUNIFORM:.+]]
+
+; CHECK: [[FLOOPEXIT1UNIFORM]]:
+; CHECK: br label %[[F:.+]]
+
+; CHECK: [[D]]:
+; CHECK: br label %[[WHILEBODY5]]
+
+; CHECK: [[E]]:
+; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]]
+
+; CHECK: [[WHILEBODYPUREEXIT]]:
+; CHECK: br label %[[FLOOPEXIT:.+]]
+
+; CHECK: [[FLOOPEXIT]]:
+; CHECK: br label %[[FLOOPEXITELSE:.+]]
+
+; CHECK: [[FLOOPEXITELSE]]:
+; CHECK: br label %[[FLOOPEXIT1:.+]]
+
+; CHECK: [[FLOOPEXIT1]]:
+; CHECK: br label %[[F]]
+
+; CHECK: [[F]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization22.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization22.ll
new file mode 100644
index 0000000000000..6a05753fd764c
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization22.ll
@@ -0,0 +1,348 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; REQUIRES: llvm-8.0-only
+; RUN: %veczc -k partial_linearization22 -vecz-passes="function(lowerswitch),vecz-loop-rotate,function(loop(indvars)),cfg-convert" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+
+; The CFG of the following kernel is:
+;
+;     a
+;     |
+;     b <------.
+;    / \       |
+;   f   c <--. |
+;   |\ / \   | |
+;   | |   d -' |
+;   | |\ / \   |
+;   | | |   e -'
+;   | | |\ /
+;   | | | g
+;   | | |/
+;   | | /
+;    \|/
+;     h
+;
+; * where nodes b, d, and e are uniform branches, and node c is a varying
+;   branch.
+; * where nodes b, d, e and f are divergent.
+;
+; With BOSCC, it will be transformed as follows:
+;
+;     a
+;     |
+;     b <------.     b' <--.
+;    / \       |    /|     |
+;   f   c <--. |   / c' <. |
+;   |\ / \___|_|_ f' |   | |
+;   | |   d -' | `|- d' -' |
+;   | |\ / \   |  |  |     |
+;   | | |   e -'  |  e' ---'
+;   | | |\ /       \ |
+;   | | | g         \|
+;   | | |/           g'
+;   | | /            |
+;    \|/             h'
+;     h ----> & <---'
+;
+; where '&' represents merge blocks of BOSCC regions.
+;
+; __kernel void partial_linearization22(__global int *out, int n) {
+;   int id = get_global_id(0);
+;   int ret = 0;
+;
+;   while (1) {
+;     if (n > 0 && n < 5) {
+;       goto f;
+;     }
+;     while (1) {
+;       if (n <= 2) {
+;         goto f;
+;       } else {
+;         if (ret + id >= n) {
+;           goto d;
+;         }
+;       }
+;       if (n & 1) {
+;         goto h;
+;       }
+;
+; d:
+;       if (n > 3) {
+;         goto e;
+;       }
+;     }
+;
+; e:
+;     if (n & 1) {
+;       goto g;
+;     }
+;   }
+;
+; f:
+;   if (n == 2) {
+;     goto h;
+;   }
+;
+; g:
+;   for (int i = 0; i < n + 1; i++) ret++;
+;   goto h;
+;
+; h:
+;   out[id] = ret;
+; }
+
+; ModuleID = 'Unknown buffer'
+source_filename = "kernel.opencl"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @partial_linearization22(i32 addrspace(1)* %out, i32 %n) #0 { ; test kernel: nested loops with exits to %f, %g and %h; stores ret to out[id]
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %conv = trunc i64 %call to i32                  ; id: global id truncated to i32
+  br label %while.body
+
+while.body:                                       ; preds = %e, %entry
+  %n.off = add i32 %n, -1
+  %0 = icmp ult i32 %n.off, 4                     ; unsigned range check: true iff 1 <= n <= 4 (source: n > 0 && n < 5)
+  %cmp6 = icmp slt i32 %n, 3                      ; n <= 2
+  %or.cond1 = or i1 %cmp6, %0
+  br i1 %or.cond1, label %f, label %if.else       ; condition is computed from %n only
+
+while.body5:                                      ; preds = %d
+  switch i32 %n, label %g [                       ; 3 -> back to %if.else, 2 -> %h, any other value -> %g
+    i32 3, label %if.else
+    i32 2, label %h
+  ]
+
+if.else:                                          ; preds = %while.body5, %while.body
+  %cmp9 = icmp sge i32 %conv, %n                  ; id >= n: the only comparison that reads %conv
+  %and = and i32 %n, 1
+  %tobool = icmp eq i32 %and, 0                   ; n is even
+  %or.cond2 = or i1 %tobool, %cmp9
+  br i1 %or.cond2, label %d, label %h
+
+d:                                                ; preds = %if.else
+  %cmp16 = icmp sgt i32 %n, 3
+  br i1 %cmp16, label %e, label %while.body5      ; n > 3 leaves the %if.else/%d/%while.body5 cycle
+
+e:                                                ; preds = %d
+  %and20 = and i32 %n, 1
+  %tobool21 = icmp eq i32 %and20, 0
+  br i1 %tobool21, label %while.body, label %g    ; even n takes the outer back edge, odd n exits to %g
+
+f:                                                ; preds = %while.body
+  %cmp24 = icmp eq i32 %n, 2
+  br i1 %cmp24, label %h, label %g                ; n == 2 skips the counting loop
+
+g:                                                ; preds = %f, %e, %while.body5
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %g
+  %ret.0 = phi i32 [ 0, %g ], [ %inc, %for.body ]
+  %storemerge = phi i32 [ 0, %g ], [ %inc31, %for.body ]
+  %cmp29 = icmp sgt i32 %storemerge, %n           ; exit once i > n (source: loop while i < n + 1)
+  br i1 %cmp29, label %h, label %for.body
+
+for.body:                                         ; preds = %for.cond
+  %inc = add nuw nsw i32 %ret.0, 1                ; ret++
+  %inc31 = add nuw nsw i32 %storemerge, 1         ; i++
+  br label %for.cond
+
+h:                                                ; preds = %for.cond, %f, %if.else, %while.body5
+  %ret.1 = phi i32 [ 0, %f ], [ %ret.0, %for.cond ], [ 0, %if.else ], [ 0, %while.body5 ] ; ret is 0 unless the counting loop ran
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %ret.1, i32 addrspace(1)* %arrayidx, align 4 ; out[id] = ret
+  ret void
+}
+
+; Function Attrs: convergent nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { convergent nobuiltin nounwind readonly }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.kernels = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization22, !3, !4, !5, !6, !7, !8}
+!3 = !{!"kernel_arg_addr_space", i32 1, i32 0}
+!4 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!5 = !{!"kernel_arg_type", !"int*", !"int"}
+!6 = !{!"kernel_arg_base_type", !"int*", !"int"}
+!7 = !{!"kernel_arg_type_qual", !"", !""}
+!8 = !{!"kernel_arg_name", !"out", !"n"}
+
+; CHECK: spir_kernel void @__vecz_v4_partial_linearization22
+; CHECK: br i1 true, label %[[WHILEBODYUNIFORM:.+]], label %[[WHILEBODY:.+]]
+
+; CHECK: [[WHILEBODY]]:
+; CHECK: %[[CMP6:.+]] = icmp slt
+; CHECK: %[[ORCOND1:.+]] = or i1 %[[CMP6]]
+; CHECK: %[[ORCOND2:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[ORCOND1]])
+; CHECK: br i1 %[[ORCOND2]], label %[[F:.+]], label %[[IFELSEPREHEADER:.+]]
+
+; CHECK: [[IFELSEPREHEADER]]:
+; CHECK: br label %[[IFELSE:.+]]
+
+; CHECK: [[WHILEBODY5:.+]]:
+; CHECK: br label %[[NODEBLOCK:.+]]
+
+; CHECK: [[NODEBLOCK]]:
+; CHECK: br label %[[LEAFBLOCK1:.+]]
+
+; CHECK: [[LEAFBLOCK1]]:
+; CHECK: br i1 %{{.+}}, label %[[IFELSE]], label %[[IFELSEPUREEXIT:.+]]
+
+; CHECK: [[IFELSEPUREEXIT]]:
+; CHECK: br label %[[E:.+]]
+
+; CHECK: [[LEAFBLOCK:.+]]:
+; CHECK: %[[SWITCHLEAF:.+]] = icmp
+; CHECK: %[[SWITCHLEAFANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[SWITCHLEAF]])
+; CHECK: br i1 %[[SWITCHLEAFANY]], label %[[LEAFBLOCKELSE:.+]], label %[[NEWDEFAULT:.+]]
+
+; CHECK: [[LEAFBLOCKELSE]]:
+; CHECK: br label %[[GLOOPEXIT:.+]]
+
+; CHECK: [[IFELSE]]:
+; CHECK: br label %[[D:.+]]
+
+; CHECK: [[WHILEBODYUNIFORM]]:
+; CHECK: %[[CMP6UNIFORM:cmp.+]] = icmp
+; CHECK: %[[ORCOND1UNIFORM:.+]] = or i1 %[[CMP6UNIFORM]]
+; CHECK: br i1 %[[ORCOND1UNIFORM]], label %[[FUNIFORM:.+]], label %[[IFELSEPREHEADERUNIFORM:.+]]
+
+; CHECK: [[IFELSEPREHEADERUNIFORM]]:
+; CHECK: br label %[[IFELSEUNIFORM:.+]]
+
+; CHECK: [[IFELSEUNIFORM]]:
+; CHECK: br i1 %{{.+}}, label %[[DUNIFORM:.+]], label %[[IFELSEUNIFORMBOSCCINDIR:.+]]
+
+; CHECK: [[DUNIFORM]]:
+; CHECK: %[[CMP16UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMP16UNIFORM]], label %[[EUNIFORM:.+]], label %[[NODEBLOCKUNIFORM:.+]]
+
+; CHECK: [[IFELSEUNIFORMBOSCCINDIR]]:
+; CHECK: br i1 %{{.+}}, label %[[HLOOPEXIT3UNIFORM:.+]], label %[[IFELSEUNIFORMBOSCCSTORE:.+]]
+
+; CHECK: [[IFELSEUNIFORMBOSCCSTORE]]:
+; CHECK: br label %[[D]]
+
+; CHECK: [[NODEBLOCKUNIFORM]]:
+; CHECK: %[[PIVOTUNIFORM:.+]] = icmp
+; CHECK: br i1 %[[PIVOTUNIFORM]], label %[[LEAFBLOCKUNIFORM:.+]], label %[[LEAFBLOCK1UNIFORM:.+]]
+
+; CHECK: [[LEAFBLOCK1UNIFORM]]:
+; CHECK: %[[SWITCHLEAF2UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[SWITCHLEAF2UNIFORM]], label %[[IFELSEUNIFORM]], label %[[NEWDEFAULTLOOPEXITUNIFORM:.+]]
+
+; CHECK: [[EUNIFORM]]:
+; CHECK: %[[TOBOOL21UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[TOBOOL21UNIFORM]], label %[[WHILEBODYUNIFORM]], label %[[GLOOPEXITUNIFORM:.+]]
+
+; CHECK: [[HLOOPEXIT3UNIFORM]]:
+; CHECK: br label %[[H:.+]]
+
+; CHECK: [[NEWDEFAULTLOOPEXITUNIFORM]]:
+; CHECK: br label %[[NEWDEFAULTUNIFORM:.+]]
+
+; CHECK: [[LEAFBLOCKUNIFORM]]:
+; CHECK: %[[SWITCHLEAFUNIFORM:.+]] = icmp
+; CHECK: br i1 %[[SWITCHLEAFUNIFORM]], label %[[H]], label %[[NEWDEFAULTUNIFORM]]
+
+; CHECK: [[NEWDEFAULTUNIFORM]]:
+; CHECK: br label %[[GUNIFORM:.+]]
+
+; CHECK: [[GLOOPEXITUNIFORM]]:
+; CHECK: br label %[[GUNIFORM]]
+
+; CHECK: [[FUNIFORM]]:
+; CHECK: %[[CMP24UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMP24UNIFORM]], label %[[H]], label %[[GUNIFORM]]
+
+; CHECK: [[GUNIFORM]]:
+; CHECK: br label %[[FORCONDUNIFORM:.+]]
+
+; CHECK: [[FORCONDUNIFORM]]:
+; CHECK: %[[CMP29UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMP29UNIFORM]], label %[[HLOOPEXITUNIFORM:.+]], label %[[FORBODYUNIFORM:.+]]
+
+; CHECK: [[FORBODYUNIFORM]]:
+; CHECK: br label %[[FORCONDUNIFORM]]
+
+; CHECK: [[HLOOPEXITUNIFORM]]:
+; CHECK: br label %[[H]]
+
+; CHECK: [[D]]:
+; CHECK: br label %[[WHILEBODY5]]
+
+; CHECK: [[E]]:
+; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]]
+
+; CHECK: [[WHILEBODYPUREEXIT]]:
+; CHECK: %[[CMP24MERGE:.+]] = phi i1 [ %[[CMP24:cmp24]], %[[F]] ], [ false, %[[E]] ]
+; CHECK: br label %[[HLOOPEXIT3:.+]]
+
+; CHECK: [[F]]:
+; CHECK: %[[CMP24]] = icmp
+; CHECK: br label %[[WHILEBODYPUREEXIT]]
+
+; CHECK: [[FELSE:.+]]:
+; CHECK: br label %[[NEWDEFAULT]]
+
+; CHECK: [[FSPLIT:.+]]:
+; CHECK: %[[CMP24MERGEANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[CMP24MERGE]])
+; CHECK: br i1 %[[CMP24MERGEANY]], label %[[NEWDEFAULT]], label %[[G:.+]]
+
+; CHECK: [[NEWDEFAULT]]:
+; CHECK: br label %[[G]]
+
+; CHECK: [[GLOOPEXIT]]:
+; CHECK: br label %[[GLOOPEXITELSE:.+]]
+
+; CHECK: [[GLOOPEXITELSE]]:
+; CHECK: br i1 %{{.+}}, label %[[FELSE]], label %[[FSPLIT]]
+
+; CHECK: [[G]]:
+; CHECK: br label %[[FORCOND:.+]]
+
+; CHECK: [[FORCOND]]:
+; CHECK: %[[CMP29:.+]] = icmp
+; CHECK: %[[CMP29ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[CMP29]])
+; CHECK: br i1 %[[CMP29ANY]], label %[[HLOOPEXIT:.+]], label %[[FORBODY:.+]]
+
+; CHECK: [[FORBODY]]:
+; CHECK: br label %[[FORCOND]]
+
+; CHECK: [[HLOOPEXIT]]:
+; CHECK: br label %[[H]]
+
+; CHECK: [[HLOOPEXIT3]]:
+; CHECK: br label %[[HLOOPEXIT3ELSE:.+]]
+
+; CHECK: [[HLOOPEXIT3ELSE]]:
+; CHECK: br label %[[NEWDEFAULTLOOPEXIT:.+]]
+
+; CHECK: [[H]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization3.ll
new file mode 100644
index 0000000000000..02b0cb286fddb
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization3.ll
@@ -0,0 +1,332 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k partial_linearization3 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+
+; The CFG of the following kernel is:
+;
+;         a
+;        / \
+;       /   \
+;      /     \
+;     b       c
+;    / \     / \
+;   d   e   f   g
+;    \   \ /   /
+;     \   h   /
+;      \   \ /
+;       \   i
+;        \ /
+;         j
+;
+; * where node a is a uniform branch, and nodes b and c are varying branches.
+; * where nodes d, e, f, g, i and j are divergent.
+;
+; With BOSCC, it will be transformed as follows:
+;
+;         a
+;        / \
+;       /   \
+;      /     \
+;     b__     c________
+;    / \ \___/_\___    \
+;   d   e   f   g  `e'  g'
+;   |    \ /   /    |   |
+;   j     h   /     d'  f'
+;   |      \ /       \ /
+;   |       i         h'
+;   |       |         |
+;   |       `--> & <- i'
+;   |            |
+;    `---> & <-- j'
+;
+; where '&' represents merge blocks of BOSCC regions.
+;
+; __kernel void partial_linearization3(__global int *out, int n) {
+;   int id = get_global_id(0);
+;   int ret = 0;
+;
+;   if (n > 10) { // uniform
+;     if (id % 3 == 0) { // varying
+;       for (int i = 0; i < n - 1; i++) { ret /= 2; } goto end;
+;     } else { // varying
+;       for (int i = 0; i < n / 3; i++) { ret -= 2; } goto h;
+;     }
+;   } else { // uniform
+;     if (id % 2 == 0) { // varying
+;       for (int i = 0; i < n * 2; i++) { ret += 1; } goto h;
+;     } else { // varying
+;       for (int i = 0; i < n + 5; i++) { ret *= 2; } goto i;
+;     }
+;   }
+;
+; h:
+;   ret += 5;
+;
+; i:
+;   ret *= 10;
+;
+; end:
+;   out[id] = ret;
+; }
+
+; ModuleID = 'Unknown buffer'
+source_filename = "Unknown buffer"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @partial_linearization3(i32 addrspace(1)* %out, i32 %n) #0 { ; test kernel: one branch on %n, then a branch on id in each arm, each leading to its own counted loop
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %conv = trunc i64 %call to i32                  ; id: global id truncated to i32
+  %cmp = icmp sgt i32 %n, 10                      ; n > 10, computed from %n only
+  br i1 %cmp, label %if.then, label %if.else17
+
+if.then:                                          ; preds = %entry
+  %rem = srem i32 %conv, 3
+  %cmp2 = icmp eq i32 %rem, 0                     ; id % 3 == 0, depends on %conv
+  br i1 %cmp2, label %if.then4, label %if.else
+
+if.then4:                                         ; preds = %if.then
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %if.then4
+  %ret.0 = phi i32 [ 0, %if.then4 ], [ %div, %for.body ]
+  %storemerge4 = phi i32 [ 0, %if.then4 ], [ %inc, %for.body ]
+  %sub = add nsw i32 %n, -1
+  %cmp5 = icmp slt i32 %storemerge4, %sub         ; i < n - 1
+  br i1 %cmp5, label %for.body, label %end        ; this loop exits straight to %end, bypassing %h and %i42
+
+for.body:                                         ; preds = %for.cond
+  %div = sdiv i32 %ret.0, 2                       ; ret /= 2
+  %inc = add nsw i32 %storemerge4, 1
+  br label %for.cond
+
+if.else:                                          ; preds = %if.then
+  br label %for.cond8
+
+for.cond8:                                        ; preds = %for.body12, %if.else
+  %ret.1 = phi i32 [ 0, %if.else ], [ %sub13, %for.body12 ]
+  %storemerge3 = phi i32 [ 0, %if.else ], [ %inc15, %for.body12 ]
+  %div9 = sdiv i32 %n, 3
+  %cmp10 = icmp slt i32 %storemerge3, %div9       ; i < n / 3
+  br i1 %cmp10, label %for.body12, label %h
+
+for.body12:                                       ; preds = %for.cond8
+  %sub13 = add nsw i32 %ret.1, -2                 ; ret -= 2
+  %inc15 = add nsw i32 %storemerge3, 1
+  br label %for.cond8
+
+if.else17:                                        ; preds = %entry
+  %rem181 = and i32 %conv, 1
+  %cmp19 = icmp eq i32 %rem181, 0                 ; low bit of id is clear (source: id % 2 == 0), depends on %conv
+  br i1 %cmp19, label %if.then21, label %if.else30
+
+if.then21:                                        ; preds = %if.else17
+  br label %for.cond23
+
+for.cond23:                                       ; preds = %for.body26, %if.then21
+  %ret.2 = phi i32 [ 0, %if.then21 ], [ %add, %for.body26 ]
+  %storemerge2 = phi i32 [ 0, %if.then21 ], [ %inc28, %for.body26 ]
+  %mul = shl nsw i32 %n, 1
+  %cmp24 = icmp slt i32 %storemerge2, %mul        ; i < n * 2
+  br i1 %cmp24, label %for.body26, label %h
+
+for.body26:                                       ; preds = %for.cond23
+  %add = add nsw i32 %ret.2, 1                    ; ret += 1
+  %inc28 = add nsw i32 %storemerge2, 1
+  br label %for.cond23
+
+if.else30:                                        ; preds = %if.else17
+  br label %for.cond32
+
+for.cond32:                                       ; preds = %for.body36, %if.else30
+  %ret.3 = phi i32 [ 0, %if.else30 ], [ %mul37, %for.body36 ]
+  %storemerge = phi i32 [ 0, %if.else30 ], [ %inc39, %for.body36 ]
+  %add33 = add nsw i32 %n, 5
+  %cmp34 = icmp slt i32 %storemerge, %add33       ; i < n + 5
+  br i1 %cmp34, label %for.body36, label %i42     ; this loop exits to %i42, bypassing %h
+
+for.body36:                                       ; preds = %for.cond32
+  %mul37 = shl nsw i32 %ret.3, 1                  ; ret *= 2
+  %inc39 = add nsw i32 %storemerge, 1
+  br label %for.cond32
+
+h:                                                ; preds = %for.cond23, %for.cond8
+  %ret.4 = phi i32 [ %ret.1, %for.cond8 ], [ %ret.2, %for.cond23 ]
+  %add41 = add nsw i32 %ret.4, 5                  ; ret += 5
+  br label %i42
+
+i42:                                              ; preds = %h, %for.cond32
+  %ret.5 = phi i32 [ %add41, %h ], [ %ret.3, %for.cond32 ]
+  %mul43 = mul nsw i32 %ret.5, 10                 ; ret *= 10
+  br label %end
+
+end:                                              ; preds = %i42, %for.cond
+  %ret.6 = phi i32 [ %mul43, %i42 ], [ %ret.0, %for.cond ]
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %ret.6, i32 addrspace(1)* %arrayidx, align 4 ; out[id] = ret
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nobuiltin nounwind readonly }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.kernels = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization3, !3, !4, !5, !6, !7, !8}
+!3 = !{!"kernel_arg_addr_space", i32 1, i32 0}
+!4 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!5 = !{!"kernel_arg_type", !"int*", !"int"}
+!6 = !{!"kernel_arg_base_type", !"int*", !"int"}
+!7 = !{!"kernel_arg_type_qual", !"", !""}
+!8 = !{!"kernel_arg_name", !"out", !"n"}
+
+; CHECK: spir_kernel void @__vecz_v4_partial_linearization3
+; CHECK: %[[CMP:.+]] = icmp
+; CHECK: br i1 %[[CMP]], label %[[IFTHEN:.+]], label %[[IFELSE17:.+]]
+
+; CHECK: [[IFTHEN]]:
+; CHECK: br i1 %{{.+}}, label %[[FORCONDPREHEADERUNIFORM:.+]], label %[[IFTHENBOSCCINDIR:.+]]
+
+; CHECK: [[FORCOND8PREHEADERUNIFORM:.+]]:
+; CHECK: br label %[[FORCOND8UNIFORM:.+]]
+
+; CHECK: [[FORCOND8UNIFORM]]:
+; CHECK: %[[CMP10UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMP10UNIFORM]], label %[[FORBODY12UNIFORM:.+]], label %[[HLOOPEXITUNIFORM:.+]]
+
+; CHECK: [[FORBODY12UNIFORM]]:
+; CHECK: br label %[[FORCOND8UNIFORM]]
+
+; CHECK: [[HLOOPEXITUNIFORM]]:
+; CHECK: br label %[[HUNIFORM:.+]]
+
+; CHECK: [[FORCONDPREHEADERUNIFORM]]:
+; CHECK: br label %[[FORCONDUNIFORM:.+]]
+
+; CHECK: [[IFTHENBOSCCINDIR]]:
+; CHECK: br i1 %{{.+}}, label %[[FORCOND8PREHEADERUNIFORM]], label %[[FORCOND8PREHEADER:.+]]
+
+; CHECK: [[FORCONDUNIFORM]]:
+; CHECK: %[[CMP5UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMP5UNIFORM]], label %[[FORBODYUNIFORM:.+]], label %[[ENDLOOPEXITUNIFORM:.+]]
+
+; CHECK: [[FORBODYUNIFORM]]:
+; CHECK: br label %[[FORCONDUNIFORM]]
+
+; CHECK: [[ENDLOOPEXITUNIFORM]]:
+; CHECK: br label %[[END:.+]]
+
+; CHECK: [[FORCOND8PREHEADER]]:
+; CHECK: br label %[[FORCOND8:.+]]
+
+; CHECK: [[FORCONDPREHEADER:.+]]:
+; CHECK: br label %[[FORCOND:.+]]
+
+; CHECK: [[FORCOND]]:
+; CHECK: %[[EXITCOND:.+]] = icmp
+; CHECK: br i1 %[[EXITCOND]], label %[[FORBODY:.+]], label %[[ENDLOOPEXIT:.+]]
+
+; CHECK: [[FORBODY]]:
+; CHECK: br label %[[FORCOND]]
+
+; CHECK: [[FORCOND8]]:
+; CHECK: %[[CMP10:.+]] = icmp
+; CHECK: br i1 %[[CMP10]], label %[[FORBODY12:.+]], label %[[HLOOPEXIT:.+]]
+
+; CHECK: [[FORBODY12]]:
+; CHECK: br label %[[FORCOND8]]
+
+; CHECK: [[IFELSE17]]:
+; CHECK: br i1 %{{.+}}, label %[[FORCOND23PREHEADERUNIFORM:.+]], label %[[IFELSE17BOSCCINDIR:.+]]
+
+; CHECK: [[FORCOND32PREHEADERUNIFORM:.+]]:
+; CHECK: br label %[[FORCOND32UNIFORM:.+]]
+
+; CHECK: [[FORCOND32UNIFORM]]:
+; CHECK: br i1 false, label %[[FORBODY36UNIFORM:.+]], label %[[ENDLOOPEXIT2UNIFORM:.+]]
+
+; CHECK: [[FORBODY36UNIFORM]]:
+; CHECK: br label %[[FORCOND32UNIFORM]]
+
+; CHECK: [[ENDLOOPEXIT2UNIFORM]]:
+; CHECK: br label %[[END]]
+
+; CHECK: [[FORCOND23PREHEADERUNIFORM]]:
+; CHECK: br label %[[FORCOND23UNIFORM:.+]]
+
+; CHECK: [[IFELSE17BOSCCINDIR]]:
+; CHECK: br i1 %{{.+}}, label %[[FORCOND32PREHEADERUNIFORM]], label %[[FORCOND32PREHEADER:.+]]
+
+; CHECK: [[FORCOND23UNIFORM]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY26UNIFORM:.+]], label %[[HLOOPEXIT1UNIFORM:.+]]
+
+; CHECK: [[FORBODY26UNIFORM]]:
+; CHECK: br label %[[FORCOND23UNIFORM]]
+
+; CHECK: [[HLOOPEXIT1UNIFORM]]:
+; CHECK: br label %[[HUNIFORM]]
+
+; CHECK: [[HUNIFORM]]:
+; CHECK: br label %[[END]]
+
+; CHECK: [[FORCOND32PREHEADER]]:
+; CHECK: br label %[[FORCOND32:.+]]
+
+; CHECK: [[FORCOND23PREHEADER:.+]]:
+; CHECK: br label %[[FORCOND23:.+]]
+
+; CHECK: [[FORCOND23]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY26:.+]], label %[[HLOOPEXIT1:.+]]
+
+; CHECK: [[FORBODY26]]:
+; CHECK: br label %[[FORCOND23]]
+
+; CHECK: [[FORCOND32]]:
+; CHECK: br i1 false, label %[[FORBODY36:.+]], label %[[ENDLOOPEXIT2:.+]]
+
+; CHECK: [[FORBODY36]]:
+; CHECK: br label %[[FORCOND32]]
+
+; CHECK: [[HLOOPEXIT]]:
+; CHECK: br label %[[FORCONDPREHEADER]]
+
+; CHECK: [[HLOOPEXIT1]]:
+; CHECK: br label %[[H:.+]]
+
+; CHECK: [[H]]:
+; CHECK: br label %[[I42:.+]]
+
+; CHECK: [[ENDLOOPEXIT]]:
+; CHECK: br label %[[H]]
+
+; CHECK: [[ENDLOOPEXIT2]]:
+; CHECK: br label %[[FORCOND23PREHEADER]]
+
+; CHECK: [[END]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization4.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization4.ll
new file mode 100644
index 0000000000000..10120d5f5bd10
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization4.ll
@@ -0,0 +1,219 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k partial_linearization4 -vecz-passes=cfg-convert,cleanup-divergence -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+
+; The CFG of the following kernel is:
+;
+;     a
+;     |
+;     b <-.
+;    / \  |
+;   e   c |
+;   |  / \|
+;   | f   d
+;   |/
+;   g
+;
+; * where node b is a uniform branch, and node c is a varying branch.
+; * where nodes f, d and g are divergent.
+;
+; With BOSCC, it will be transformed as follows:
+;
+;     a
+;     |
+;     b <-.     b' <--.
+;    / \  |    / \    |
+;   e   c_|_  e'  c'  |
+;   |  / \| \_|__ |   |
+;   | f   d   |  `d' -'
+;   |/         \ /
+;   g           f'
+;   |           |
+;   `---> & <-- g'
+;
+; where '&' represents merge blocks of BOSCC regions.
+;
+; __kernel void partial_linearization4(__global int *out, int n) {
+;   int id = get_global_id(0);
+;
+;   int x = id / n;
+;   int y = id % n;
+;   int i = 0;
+;   for (;;) {
+;     if (n > 20) goto e;
+;     if (x + y > n) goto f;
+;     y++;
+;     x++;
+;     i++;
+;   }
+;
+; goto g;
+;
+; e:
+;   i *= 2 + n;
+;   goto g;
+;
+; f:
+;   i /= i + n;
+;
+; g:
+;   out[id] = x + y + i;
+; }
+
+; ModuleID = 'Unknown buffer'
+source_filename = "Unknown buffer"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @partial_linearization4(i32 addrspace(1)* %out, i32 %n) #0 { ; test kernel: loop with one exit decided by %n (%e) and one decided by x + y (%f)
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %conv = trunc i64 %call to i32                  ; id: global id truncated to i32
+  %0 = icmp eq i32 %conv, -2147483648             ; id is the minimum i32 value
+  %1 = icmp eq i32 %n, -1
+  %2 = and i1 %1, %0
+  %3 = icmp eq i32 %n, 0
+  %4 = or i1 %3, %2
+  %5 = select i1 %4, i32 1, i32 %n                ; divisor becomes 1 when n == 0 or (n == -1 and id == INT_MIN)
+  %div = sdiv i32 %conv, %5                       ; x = id / n, using the guarded divisor
+  %6 = icmp eq i32 %conv, -2147483648             ; the same guard sequence is repeated for the remainder
+  %7 = icmp eq i32 %n, -1
+  %8 = and i1 %7, %6
+  %9 = icmp eq i32 %n, 0
+  %10 = or i1 %9, %8
+  %11 = select i1 %10, i32 1, i32 %n
+  %rem = srem i32 %conv, %11                      ; y = id % n, using the guarded divisor
+  br label %for.cond
+
+for.cond:                                         ; preds = %if.end5, %entry
+  %x.0 = phi i32 [ %div, %entry ], [ %inc6, %if.end5 ]
+  %y.0 = phi i32 [ %rem, %entry ], [ %inc, %if.end5 ]
+  %storemerge = phi i32 [ 0, %entry ], [ %inc7, %if.end5 ]
+  %cmp = icmp sgt i32 %n, 20                      ; n > 20, computed from %n only
+  br i1 %cmp, label %e, label %if.end
+
+if.end:                                           ; preds = %for.cond
+  %add = add nsw i32 %y.0, %x.0
+  %cmp2 = icmp sgt i32 %add, %n                   ; x + y > n; %x.0 and %y.0 derive from %conv
+  br i1 %cmp2, label %f, label %if.end5
+
+if.end5:                                          ; preds = %if.end
+  %inc = add nsw i32 %y.0, 1                      ; y++
+  %inc6 = add nsw i32 %x.0, 1                     ; x++
+  %inc7 = add nsw i32 %storemerge, 1              ; i++
+  br label %for.cond
+
+e:                                                ; preds = %for.cond
+  %add8 = add nsw i32 %n, 2
+  %mul = mul nsw i32 %storemerge, %add8           ; i *= 2 + n
+  br label %g
+
+f:                                                ; preds = %if.end
+  %add9 = add nsw i32 %storemerge, %n
+  %12 = icmp eq i32 %add9, 0
+  %13 = select i1 %12, i32 1, i32 %add9           ; divisor becomes 1 when i + n == 0
+  %div10 = sdiv i32 %storemerge, %13              ; i /= i + n, using the guarded divisor
+  br label %g
+
+g:                                                ; preds = %f, %e
+  %storemerge1 = phi i32 [ %div10, %f ], [ %mul, %e ]
+  %add11 = add i32 %y.0, %x.0                     ; uses the loop-header values of x and y at the time of exit
+  %add12 = add i32 %add11, %storemerge1
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %add12, i32 addrspace(1)* %arrayidx, align 4 ; out[id] = x + y + i
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nobuiltin nounwind readonly }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.kernels = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization4, !3, !4, !5, !6, !7, !8}
+!3 = !{!"kernel_arg_addr_space", i32 1, i32 0}
+!4 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!5 = !{!"kernel_arg_type", !"int*", !"int"}
+!6 = !{!"kernel_arg_base_type", !"int*", !"int"}
+!7 = !{!"kernel_arg_type_qual", !"", !""}
+!8 = !{!"kernel_arg_name", !"out", !"n"}
+
+; CHECK: spir_kernel void @__vecz_v4_partial_linearization4
+; CHECK: br i1 true, label %[[FORCONDUNIFORM:.+]], label %[[FORCOND:.+]]
+
+; CHECK: [[FORCOND]]:
+; CHECK: %[[CMP:.+]] = icmp
+; CHECK: br i1 %[[CMP]], label %[[E:.+]], label %[[IFEND:.+]]
+
+; CHECK: [[IFEND]]:
+; CHECK: br label %[[IFEND5:.+]]
+
+; CHECK: [[FORCONDUNIFORM]]:
+; CHECK: %[[CMPUNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMPUNIFORM]], label %[[EUNIFORM:.+]], label %[[IFENDUNIFORM:.+]]
+
+; CHECK: [[IFENDUNIFORM]]:
+; CHECK: br i1 %{{.+}}, label %[[FUNIFORM:.+]], label %[[IFENDUNIFORMBOSCCINDIR:.+]]
+
+; CHECK: [[IFEND5UNIFORM:.+]]:
+; CHECK: br label %[[FORCONDUNIFORM]]
+
+; CHECK: [[FUNIFORM]]:
+; CHECK: br label %[[GUNIFORM:.+]]
+
+; CHECK: [[IFENDUNIFORMBOSCCINDIR]]:
+; CHECK: br i1 %{{.+}}, label %[[IFEND5UNIFORM]], label %[[IFENDUNIFORMBOSCCSTORE:.+]]
+
+; CHECK: [[IFENDUNIFORMBOSCCSTORE]]:
+; CHECK: br label %[[IFEND5]]
+
+; CHECK: [[EUNIFORM]]:
+; CHECK: br label %[[G:.+]]
+
+; CHECK: [[IFEND5]]:
+; CHECK: br i1 %{{.+}}, label %[[FORCOND]], label %[[FORCONDPUREEXIT:.+]]
+
+; CHECK: [[FORCONDPUREEXIT]]:
+; CHECK: br label %[[F:.+]]
+
+; CHECK: [[E]]:
+; CHECK: br label %[[FORCONDPUREEXIT]]
+
+; CHECK: [[EELSE:.+]]:
+; CHECK: br label %[[G]]
+
+; CHECK: [[ESPLIT:.+]]:
+; CHECK: br label %[[G]]
+
+; CHECK: [[F]]:
+; CHECK: br label %[[FELSE:.+]]
+
+; CHECK: [[FELSE]]:
+; CHECK: br i1 %{{.+}}, label %[[EELSE]], label %[[ESPLIT]]
+
+; CHECK: [[G]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization5.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization5.ll
new file mode 100644
index 0000000000000..81508c657eab6
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization5.ll
@@ -0,0 +1,264 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k partial_linearization5 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+
+; The CFG of the following kernel is:
+;
+;     a
+;    / \
+;   b   c
+;   |\ / \
+;   | d   e
+;   |  \ /
+;   |   f
+;    \ /
+;     g
+;
+; * where node c is a uniform branch, and nodes a and b are varying branches.
+; * where nodes b, c, d, f, g are divergent.
+;
+; With BOSCC, it will be transformed as follows:
+;
+;     a________
+;    / \       \
+;   b   c       c'
+;   |\_/_\__   / \
+;   | d   e \ |   e'
+;   |  \ /   \ \ /
+;   |   f     \ b'
+;    \ /       \|
+;     g         d'
+;     |         |
+;     |         f'
+;     |         |
+;     `--> & <- g'
+;
+; where '&' represents merge blocks of BOSCC regions.
+;
+; __kernel void partial_linearization5(__global int *out, int n) {
+;   int id = get_global_id(0);
+;   int ret = 0;
+;
+;   if (id % 2 == 0) { // a
+;     if (id == 4) { // b
+;       goto g;
+;     } else {
+;       goto d;
+;     }
+;   } else { // c
+;     if (n % 2 == 0) {
+;       goto d;
+;     } else {
+;       goto e;
+;     }
+;   }
+;
+; d:
+;   for (int i = 0; i < n / 4; i++) { ret += i - 2; }
+;   goto f;
+;
+; e:
+;   for (int i = 0; i < n + 5; i++) { ret += i + 5; }
+;
+; f:
+;   ret *= ret % n;
+;   ret *= ret + 4;
+;
+; g:
+;   out[id] = ret;
+; }
+
+; ModuleID = 'Unknown buffer'
+source_filename = "Unknown buffer"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @partial_linearization5(i32 addrspace(1)* %out, i32 %n) #0 { ; IR form of the OpenCL C kernel quoted in this test's header comment
+entry:                                            ; CFG node a
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %conv = trunc i64 %call to i32                  ; id = (int)get_global_id(0)
+  %rem1 = and i32 %conv, 1                        ; low bit of id, i.e. the parity tested by "id % 2 == 0"
+  %cmp = icmp eq i32 %rem1, 0
+  br i1 %cmp, label %if.then, label %if.else5     ; varying branch: the condition depends on id
+
+if.then:                                          ; preds = %entry -- CFG node b
+  %cmp2 = icmp eq i32 %conv, 4
+  br i1 %cmp2, label %g, label %d                 ; varying branch: "id == 4"
+
+if.else5:                                         ; preds = %entry -- CFG node c
+  %rem62 = and i32 %n, 1
+  %cmp7 = icmp eq i32 %rem62, 0
+  br i1 %cmp7, label %d, label %e                 ; uniform branch: "n % 2 == 0" only reads the kernel argument n
+
+d:                                                ; preds = %if.else5, %if.then -- CFG node d
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %d -- header of "for (i = 0; i < n / 4; i++)"
+  %ret.0 = phi i32 [ 0, %d ], [ %add, %for.body ]
+  %storemerge3 = phi i32 [ 0, %d ], [ %inc, %for.body ] ; loop counter i
+  %div = sdiv i32 %n, 4
+  %cmp11 = icmp slt i32 %storemerge3, %div
+  br i1 %cmp11, label %for.body, label %f
+
+for.body:                                         ; preds = %for.cond -- "ret += i - 2"
+  %sub = add i32 %ret.0, -2
+  %add = add i32 %sub, %storemerge3
+  %inc = add nsw i32 %storemerge3, 1
+  br label %for.cond
+
+e:                                                ; preds = %if.else5 -- CFG node e
+  br label %for.cond14
+
+for.cond14:                                       ; preds = %for.body18, %e -- header of "for (i = 0; i < n + 5; i++)"
+  %ret.1 = phi i32 [ 0, %e ], [ %add20, %for.body18 ]
+  %storemerge = phi i32 [ 0, %e ], [ %inc22, %for.body18 ] ; loop counter i
+  %add15 = add nsw i32 %n, 5
+  %cmp16 = icmp slt i32 %storemerge, %add15
+  br i1 %cmp16, label %for.body18, label %f
+
+for.body18:                                       ; preds = %for.cond14 -- "ret += i + 5"
+  %add19 = add i32 %ret.1, 5
+  %add20 = add i32 %add19, %storemerge
+  %inc22 = add nsw i32 %storemerge, 1
+  br label %for.cond14
+
+f:                                                ; preds = %for.cond14, %for.cond -- CFG node f
+  %ret.2 = phi i32 [ %ret.0, %for.cond ], [ %ret.1, %for.cond14 ]
+  %0 = icmp eq i32 %ret.2, -2147483648            ; ret == INT_MIN
+  %1 = icmp eq i32 %n, -1
+  %2 = and i1 %1, %0                              ; the INT_MIN % -1 case
+  %3 = icmp eq i32 %n, 0                          ; the zero-divisor case
+  %4 = or i1 %3, %2
+  %5 = select i1 %4, i32 1, i32 %n                ; substitute divisor 1 in either case to guard the srem below
+  %rem24 = srem i32 %ret.2, %5                    ; ret % n
+  %mul = mul nsw i32 %rem24, %ret.2               ; ret *= ret % n
+  %add25 = add nsw i32 %mul, 4
+  %mul26 = mul nsw i32 %add25, %mul               ; ret *= ret + 4
+  br label %g
+
+g:                                                ; preds = %f, %if.then -- CFG node g
+  %ret.3 = phi i32 [ %mul26, %f ], [ 0, %if.then ]
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %ret.3, i32 addrspace(1)* %arrayidx, align 4 ; out[id] = ret
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nobuiltin nounwind readonly }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.kernels = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization5, !3, !4, !5, !6, !7, !8}
+!3 = !{!"kernel_arg_addr_space", i32 1, i32 0}
+!4 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!5 = !{!"kernel_arg_type", !"int*", !"int"}
+!6 = !{!"kernel_arg_base_type", !"int*", !"int"}
+!7 = !{!"kernel_arg_type_qual", !"", !""}
+!8 = !{!"kernel_arg_name", !"out", !"n"}
+
+; CHECK: spir_kernel void @__vecz_v4_partial_linearization5
+; CHECK: br i1 %{{.+}}, label %[[IFTHENUNIFORM:.+]], label %[[ENTRYBOSCCINDIR:.+]]
+
+; CHECK: [[IFELSE5UNIFORM:.+]]:
+; CHECK: %[[CMP7UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMP7UNIFORM]], label %[[DUNIFORM:.+]], label %[[FORCOND14PREHEADERUNIFORM:.+]]
+
+; CHECK: [[FORCOND14PREHEADERUNIFORM]]:
+; CHECK: br label %[[FORCOND14UNIFORM:.+]]
+
+; CHECK: [[FORCOND14UNIFORM]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY18UNIFORM:.+]], label %[[FLOOPEXIT1UNIFORM:.+]]
+
+; CHECK: [[FORBODY18UNIFORM]]:
+; CHECK: br label %[[FORCOND14UNIFORM]]
+
+; CHECK: [[FLOOPEXIT1UNIFORM]]:
+; CHECK: br label %[[FUNIFORM:.+]]
+
+; CHECK: [[IFTHENUNIFORM]]:
+; CHECK: br i1 %{{.+}}, label %[[GUNIFORM:.+]], label %[[IFTHENUNIFORMBOSCCINDIR:.+]]
+
+; CHECK: [[ENTRYBOSCCINDIR]]:
+; CHECK: br i1 %{{.+}}, label %[[IFELSE5UNIFORM]], label %[[IFELSE5:.+]]
+
+; CHECK: [[DUNIFORM]]:
+; CHECK: br label %[[FORCONDUNIFORM:.+]]
+
+; CHECK: [[FORCONDUNIFORM]]:
+; CHECK: %[[CMP11UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMP11UNIFORM]], label %[[FORBODYUNIFORM:.+]], label %[[FLOOPEXITUNIFORM:.+]]
+
+; CHECK: [[FORBODYUNIFORM]]:
+; CHECK: br label %[[FORCONDUNIFORM]]
+
+; CHECK: [[FLOOPEXITUNIFORM]]:
+; CHECK: br label %[[FUNIFORM]]
+
+; CHECK: [[FUNIFORM]]:
+; CHECK: br label %[[G:.+]]
+
+; CHECK: [[IFTHEN:.+]]:
+; CHECK: br label %[[D:.+]]
+
+; CHECK: [[IFELSE5]]:
+; CHECK: %[[CMP7:.+]] = icmp
+; CHECK: br i1 %[[CMP7]], label %[[IFTHEN]], label %[[FORCOND14PREHEADER:.+]]
+
+; CHECK: [[FORCOND14PREHEADER]]:
+; CHECK: br label %[[FORCOND14:.+]]
+
+; CHECK: [[D]]:
+; CHECK: br label %[[FORCOND:.+]]
+
+; CHECK: [[FORCOND]]:
+; CHECK: %[[CMP11:.+]] = icmp
+; CHECK: br i1 %[[CMP11]], label %[[FORBODY:.+]], label %[[FLOOPEXIT:.+]]
+
+; CHECK: [[FORBODY]]:
+; CHECK: br label %[[FORCOND]]
+
+; CHECK: [[FORCOND14]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY18:.+]], label %[[FLOOPEXIT1:.+]]
+
+; CHECK: [[FORBODY18]]:
+; CHECK: br label %[[FORCOND14]]
+
+; CHECK: [[FLOOPEXIT]]:
+; CHECK: br label %[[F:.+]]
+
+; CHECK: [[FLOOPEXIT1]]:
+; CHECK: br label %[[IFTHEN]]
+
+; CHECK: [[F]]:
+; CHECK: br label %[[G]]
+
+; CHECK: [[G]]:
+; CHECK: ret void
+
+; CHECK: [[IFTHENUNIFORMBOSCCINDIR]]:
+; CHECK: br i1 %{{.+}}, label %[[DUNIFORM]], label %[[D]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization6.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization6.ll
new file mode 100644
index 0000000000000..fa7033c12f26a
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization6.ll
@@ -0,0 +1,228 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k partial_linearization6 -vecz-passes="function(simplifycfg),vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+
+; The CFG of the following kernel is:
+;
+;       a
+;       |
+;       b <-.
+;      / \  |
+;     c   d |
+;    / \ /  |
+;   e   f --'
+;    \  |
+;     \ g
+;      \|
+;       h
+;
+; * where nodes b and c are uniform branches, and node f is a varying
+;   branch.
+; * where nodes g and h are divergent.
+;
+; With BOSCC, it will be transformed as follows:
+;
+;       a
+;       |
+;       b <-. .---> b' <-.
+;      / \  | |    / \   |
+;     c   d | |   c'  d' |
+;    / \ /  | |  / \ /   |
+;   e   f --' | e'  f' --'
+;    \  |\____'  \  |
+;     \ g         \ |
+;      \|          \|
+;       h           g'
+;       |           |
+;       `---> & <-- h'
+;
+; where '&' represents merge blocks of BOSCC regions.
+;
+; __kernel void partial_linearization6(__global int *out, int n) {
+;   int id = get_global_id(0);
+;   int ret = 0;
+;
+;   while (1) {
+;     if (n % 2 == 0) {
+;       if (n > 2) {
+;         goto e;
+;       }
+;     } else {
+;       ret += n + 1;
+;     }
+;     if (id == n) break;
+;   }
+;
+;   ret += n * 2;
+;   ret /= n;
+;   goto early;
+;
+; e:
+;   ret += n * 4;
+;   ret -= n;
+;
+; early:
+;   out[id] = ret;
+; }
+
+; ModuleID = 'Unknown buffer'
+source_filename = "Unknown buffer"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @partial_linearization6(i32 addrspace(1)* %out, i32 %n) #0 { ; IR form of the OpenCL C kernel quoted in this test's header comment
+entry:                                            ; CFG node a
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %conv = trunc i64 %call to i32                  ; id = (int)get_global_id(0)
+  br label %while.body
+
+while.body:                                       ; preds = %if.end10, %entry -- CFG node b, header of "while (1)"
+  %ret.0 = phi i32 [ 0, %entry ], [ %ret.1, %if.end10 ]
+  %rem1 = and i32 %n, 1
+  %cmp = icmp eq i32 %rem1, 0
+  br i1 %cmp, label %if.then, label %if.else      ; uniform branch: "n % 2 == 0" only reads the kernel argument n
+
+if.then:                                          ; preds = %while.body -- CFG node c
+  %cmp2 = icmp sgt i32 %n, 2
+  br i1 %cmp2, label %e, label %if.end6           ; uniform branch: "n > 2"; taking %e leaves the loop
+
+if.else:                                          ; preds = %while.body -- CFG node d
+  %add = add nsw i32 %n, 1
+  %add5 = add nsw i32 %add, %ret.0                ; ret += n + 1
+  br label %if.end6
+
+if.end6:                                          ; preds = %if.else, %if.then -- CFG node f
+  %ret.1 = phi i32 [ %add5, %if.else ], [ %ret.0, %if.then ]
+  %cmp7 = icmp eq i32 %conv, %n
+  br i1 %cmp7, label %while.end, label %if.end10  ; varying branch: "id == n" depends on the work-item id
+
+if.end10:                                         ; preds = %if.end6 -- loop latch
+  br label %while.body
+
+while.end:                                        ; preds = %if.end6 -- CFG node g
+  %mul = shl nsw i32 %n, 1                        ; n * 2
+  %add11 = add nsw i32 %ret.1, %mul               ; ret += n * 2
+  %0 = icmp eq i32 %add11, -2147483648            ; ret == INT_MIN
+  %1 = icmp eq i32 %n, -1
+  %2 = and i1 %1, %0                              ; the INT_MIN / -1 case
+  %3 = icmp eq i32 %n, 0                          ; the zero-divisor case
+  %4 = or i1 %3, %2
+  %5 = select i1 %4, i32 1, i32 %n                ; substitute divisor 1 in either case to guard the sdiv below
+  %div = sdiv i32 %add11, %5                      ; ret /= n
+  br label %early
+
+e:                                                ; preds = %if.then -- CFG node e
+  %mul12 = mul i32 %n, 4
+  %n.neg = sub i32 0, %n
+  %add13 = add i32 %mul12, %n.neg                 ; n * 4 - n
+  %sub = add i32 %add13, %ret.0                   ; combined effect of "ret += n * 4; ret -= n"
+  br label %early
+
+early:                                            ; preds = %e, %while.end -- CFG node h
+  %storemerge = phi i32 [ %div, %while.end ], [ %sub, %e ]
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %storemerge, i32 addrspace(1)* %arrayidx, align 4 ; out[id] = ret
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nobuiltin nounwind readonly }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.kernels = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization6, !3, !4, !5, !6, !7, !8}
+!3 = !{!"kernel_arg_addr_space", i32 1, i32 0}
+!4 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!5 = !{!"kernel_arg_type", !"int*", !"int"}
+!6 = !{!"kernel_arg_base_type", !"int*", !"int"}
+!7 = !{!"kernel_arg_type_qual", !"", !""}
+!8 = !{!"kernel_arg_name", !"out", !"n"}
+
+; CHECK: spir_kernel void @__vecz_v4_partial_linearization6
+; CHECK: br i1 true, label %[[WHILEBODYUNIFORM:.+]], label %[[WHILEBODY:.+]]
+
+; CHECK: [[WHILEBODY]]:
+; CHECK: %[[CMP:.+]] = icmp
+; CHECK: br i1 %[[CMP]], label %[[IFTHEN:.+]], label %[[IFELSE:.+]]
+
+; CHECK: [[IFTHEN]]:
+; CHECK: %[[CMP2:.+]] = icmp
+; CHECK: br i1 %[[CMP2]], label %[[E:.+]], label %[[IFEND6:.+]]
+
+; CHECK: [[IFELSE]]:
+; CHECK: br label %[[IFEND6]]
+
+; CHECK: [[IFEND6]]:
+; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]]
+
+; CHECK: [[WHILEBODYPUREEXIT]]:
+; CHECK: br label %[[WHILEEND:.+]]
+
+; CHECK: [[WHILEBODYUNIFORM]]:
+; CHECK: %[[CMPUNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMPUNIFORM]], label %[[IFTHENUNIFORM:.+]], label %[[IFELSEUNIFORM:.+]]
+
+; CHECK: [[IFELSEUNIFORM]]:
+; CHECK: br label %[[IFEND6UNIFORM:.+]]
+
+; CHECK: [[IFTHENUNIFORM]]:
+; CHECK: %[[CMP2UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMP2UNIFORM]], label %[[EUNIFORM:.+]], label %[[IFEND6UNIFORM]]
+
+; CHECK: [[IFEND6UNIFORM]]:
+; CHECK: br i1 %{{.+}}, label %[[WHILEENDUNIFORM:.+]], label %[[IFEND6UNIFORMBOSCCINDIR:.+]]
+
+; CHECK: [[WHILEENDUNIFORM]]:
+; CHECK: br label %[[EARLYUNIFORM:.+]]
+
+; CHECK: [[IFEND6UNIFORMBOSCCINDIR]]:
+; CHECK: br i1 %{{.+}}, label %[[WHILEBODYUNIFORM]], label %[[IFEND6UNIFORMBOSCCSTORE:.+]]
+
+; CHECK: [[IFEND6UNIFORMBOSCCSTORE]]:
+; CHECK: br label %[[WHILEBODY]]
+
+; CHECK: [[EUNIFORM]]:
+; CHECK: br label %[[EARLY:.+]]
+
+; CHECK: [[WHILEEND]]:
+; CHECK: br label %[[WHILEENDELSE:.+]]
+
+; CHECK: [[WHILEENDELSE]]:
+; CHECK: br i1 %{{.+}}, label %[[EELSE:.+]], label %[[ESPLIT:.+]]
+
+; CHECK: [[E]]:
+; CHECK: br label %[[WHILEBODYPUREEXIT]]
+
+; CHECK: [[EELSE]]:
+; CHECK: br label %[[EARLY]]
+
+; CHECK: [[ESPLIT]]:
+; CHECK: br label %[[EARLY]]
+
+; CHECK: [[EARLY]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization7.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization7.ll
new file mode 100644
index 0000000000000..257ec93334f3b
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization7.ll
@@ -0,0 +1,262 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k partial_linearization7 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+
+; The CFG of the following kernel is:
+;
+;       a
+;      / \
+;     b   c
+;    / \ / \
+;   d   e   f
+;    \ / \ /
+;     g   h
+;      \ /
+;       i
+;
+; * where nodes a, c and e are uniform branches, and node b is a varying
+;   branch.
+; * where nodes d, e, g and i are divergent.
+;
+; With BOSCC, it will be transformed as follows:
+;
+;          a
+;         / \
+;        /   \
+;       /     \
+;      /       \
+;     b____     c
+;    / \   \   / \
+;   d   e   d'|   |
+;    \ / \   \|   |
+;     g   h   e'  f
+;      \ /     \ /
+;       i       h'
+;       |       |
+;       |       g'
+;       |       |
+;       |       i'
+;        \     /
+;         \   /
+;          \ /
+;           &
+;
+; where '&' represents merge blocks of BOSCC regions.
+;
+; __kernel void partial_linearization7(__global int *out, int n) {
+;   int id = get_global_id(0);
+;   int i = 0;
+;
+;   if (n > 10) { // a
+;     if (n + id > 10) { // b
+;       i = n * 10; // d
+;       goto g;
+;     } else {
+;       goto e;
+;     }
+;   } else {
+;     if (n < 5) { // c
+;       goto e;
+;     } else {
+;       for (int j = 0; j < n; j++) { i++; }
+;       goto h;
+;     }
+;   }
+;
+; e:
+;   if (n > 5) {
+;     goto g;
+;   } else {
+;     i = n * 3 / 5;
+;     goto h;
+;   }
+;
+; g:
+;   for (int j = 0; j < n; j++) { i++; }
+;   goto i;
+;
+; h:
+;   i = n + id / 3;
+;
+; i:
+;   out[id] = i;
+; }
+
+; ModuleID = 'Unknown buffer'
+source_filename = "Unknown buffer"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @partial_linearization7(i32 addrspace(1)* %out, i32 %n) #0 { ; IR form of the OpenCL C kernel quoted in this test's header comment
+entry:                                            ; CFG node a
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %conv = trunc i64 %call to i32                  ; id = (int)get_global_id(0)
+  %cmp = icmp sgt i32 %n, 10
+  br i1 %cmp, label %if.then, label %if.else5     ; uniform branch: "n > 10" only reads the kernel argument n
+
+if.then:                                          ; preds = %entry -- CFG node b
+  %add = add nsw i32 %conv, %n
+  %cmp2 = icmp sgt i32 %add, 10
+  br i1 %cmp2, label %if.then4, label %e          ; varying branch: "n + id > 10" depends on the work-item id
+
+if.then4:                                         ; preds = %if.then -- CFG node d
+  %mul = mul nsw i32 %n, 10                       ; i = n * 10
+  br label %g
+
+if.else5:                                         ; preds = %entry -- CFG node c
+  %cmp6 = icmp slt i32 %n, 5
+  br i1 %cmp6, label %e, label %if.else9          ; uniform branch: "n < 5"
+
+if.else9:                                         ; preds = %if.else5 -- CFG node f (preheader of the first "for (j ...)" loop)
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %if.else9
+  %storemerge = phi i32 [ 0, %if.else9 ], [ %inc12, %for.body ] ; loop counter j; no phi for i is kept on this path
+  %cmp10 = icmp slt i32 %storemerge, %n
+  br i1 %cmp10, label %for.body, label %h
+
+for.body:                                         ; preds = %for.cond
+  %inc12 = add nsw i32 %storemerge, 1
+  br label %for.cond
+
+e:                                                ; preds = %if.else5, %if.then -- CFG node e
+  %cmp13 = icmp sgt i32 %n, 5
+  br i1 %cmp13, label %g, label %h                ; uniform branch: "n > 5"
+
+g:                                                ; preds = %e, %if.then4 -- CFG node g
+  %i.1 = phi i32 [ %mul, %if.then4 ], [ 0, %e ]   ; i is n * 10 when coming from d, 0 when coming from e
+  br label %for.cond19
+
+for.cond19:                                       ; preds = %for.body22, %g -- header of the second "for (j = 0; j < n; j++)"
+  %i.2 = phi i32 [ %i.1, %g ], [ %inc23, %for.body22 ]
+  %storemerge1 = phi i32 [ 0, %g ], [ %inc25, %for.body22 ] ; loop counter j
+  %cmp20 = icmp slt i32 %storemerge1, %n
+  br i1 %cmp20, label %for.body22, label %i29
+
+for.body22:                                       ; preds = %for.cond19
+  %inc23 = add nsw i32 %i.2, 1                    ; i++
+  %inc25 = add nsw i32 %storemerge1, 1            ; j++
+  br label %for.cond19
+
+h:                                                ; preds = %e, %for.cond -- CFG node h
+  %div27 = sdiv i32 %conv, 3
+  %add28 = add nsw i32 %div27, %n                 ; i = n + id / 3
+  br label %i29
+
+i29:                                              ; preds = %h, %for.cond19 -- CFG node i
+  %i.3 = phi i32 [ %add28, %h ], [ %i.2, %for.cond19 ]
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %i.3, i32 addrspace(1)* %arrayidx, align 4 ; out[id] = i
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nobuiltin nounwind readonly }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.kernels = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization7, !3, !4, !5, !6, !7, !8}
+!3 = !{!"kernel_arg_addr_space", i32 1, i32 0}
+!4 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!5 = !{!"kernel_arg_type", !"int*", !"int"}
+!6 = !{!"kernel_arg_base_type", !"int*", !"int"}
+!7 = !{!"kernel_arg_type_qual", !"", !""}
+!8 = !{!"kernel_arg_name", !"out", !"n"}
+
+; CHECK: spir_kernel void @__vecz_v4_partial_linearization7
+; CHECK: %[[CMP:.+]] = icmp
+; CHECK: br i1 %[[CMP]], label %[[IFTHEN:.+]], label %[[IFELSE5:.+]]
+
+; CHECK: [[IFTHEN]]:
+; CHECK: br i1 %{{.+}}, label %[[IFTHEN4UNIFORM:.+]], label %[[IFTHENBOSCCINDIR:.+]]
+
+; CHECK: [[IFTHEN4UNIFORM]]:
+; CHECK: br label %[[GUNIFORM:.+]]
+
+; CHECK: [[IFTHENBOSCCINDIR]]:
+; CHECK: br i1 %{{.+}}, label %[[EUNIFORM:.+]], label %[[IFTHEN4:.+]]
+
+; CHECK: [[EUNIFORM]]:
+; CHECK: %[[CMP13UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMP13UNIFORM]], label %[[GUNIFORM]], label %[[HUNIFORM:.+]]
+
+; CHECK: [[HUNIFORM]]:
+; CHECK: br label %[[I29UNIFORM:.+]]
+
+; CHECK: [[GUNIFORM]]:
+; CHECK: br label %[[FORCOND19UNIFORM:.+]]
+
+; CHECK: [[FORCOND19UNIFORM]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY22UNIFORM:.+]], label %[[I29LOOPEXITUNIFORM:.+]]
+
+; CHECK: [[FORBODY22UNIFORM]]:
+; CHECK: br label %[[FORCOND19UNIFORM]]
+
+; CHECK: [[I29LOOPEXITUNIFORM]]:
+; CHECK: br label %[[I29:.+]]
+
+; CHECK: [[IFTHEN4]]:
+; CHECK: br label %[[E:.+]]
+
+; CHECK: [[IFELSE5]]:
+; CHECK: %[[CMP6:.+]] = icmp
+; CHECK: br i1 %[[CMP6]], label %[[E]], label %[[FORCONDPREHEADER:.+]]
+
+; CHECK: [[FORCONDPREHEADER]]:
+; CHECK: br label %[[FORCOND:.+]]
+
+; CHECK: [[FORCOND]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[HLOOPEXIT:.+]]
+
+; CHECK: [[FORBODY]]:
+; CHECK: br label %[[FORCOND]]
+
+; CHECK: [[E]]:
+; CHECK: %[[CMP13:.+]] = icmp
+; CHECK: br i1 %[[CMP13]], label %[[G:.+]], label %[[H:.+]]
+
+; CHECK: [[G]]:
+; CHECK: br label %[[FORCOND19:.+]]
+
+; CHECK: [[FORCOND19]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY22:.+]], label %[[I29LOOPEXIT:.+]]
+
+; CHECK: [[FORBODY22]]:
+; CHECK: br label %[[FORCOND19]]
+
+; CHECK: [[HLOOPEXIT]]:
+; CHECK: br label %[[H]]
+
+; CHECK: [[H]]:
+; CHECK: br label %[[G]]
+
+; CHECK: [[I29LOOPEXIT]]:
+; CHECK: br label %[[I29]]
+
+; CHECK: [[I29]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization8.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization8.ll
new file mode 100644
index 0000000000000..3932ca05554f6
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization8.ll
@@ -0,0 +1,220 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k partial_linearization8 -vecz-passes=cfg-convert -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+
+; The CFG of the following kernel is:
+;
+;     a
+;     |
+;     b <-.
+;    / \  |
+;   e   c |
+;   |  / \|
+;   | f   d
+;   |/
+;   g
+;
+; * where nodes b and c are varying branches.
+; * where nodes e, f, d and g are divergent.
+;
+; With BOSCC, it will be transformed as follows:
+;
+;     a
+;     |
+;     b <-.   b' <.
+;    / \__|__ |   |
+;   e   c_|__`c'  |
+;   |  / \|  \|   |
+;   | f   d   d' -'
+;   |/        |
+;   g         f'
+;   |         |
+;   |         e'
+;   |         |
+;   `--> & <- g'
+;
+; where '&' represents merge blocks of BOSCC regions.
+;
+; __kernel void partial_linearization8(__global int *out, int n) {
+;   int id = get_global_id(0);
+;
+;   int x = id / n;
+;   int y = id % n;
+;   int i = 0;
+;   for (;;) {
+;     if (i + id > n) goto e;
+;     if (x + y > n) goto f;
+;     y++;
+;     x++;
+;     i++;
+;   }
+;
+; goto g;
+;
+; e:
+;   i *= 2 + n;
+;   goto g;
+;
+; f:
+;   i /= i + n;
+;
+; g:
+;   out[id] = x + y + i;
+; }
+
+; ModuleID = 'Unknown buffer'
+source_filename = "Unknown buffer"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @partial_linearization8(i32 addrspace(1)* %out, i32 %n) #0 { ; IR form of the OpenCL C kernel quoted in this test's header comment
+entry:                                            ; CFG node a
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %conv = trunc i64 %call to i32                  ; id = (int)get_global_id(0)
+  %0 = icmp eq i32 %conv, -2147483648             ; id == INT_MIN
+  %1 = icmp eq i32 %n, -1
+  %2 = and i1 %1, %0                              ; the INT_MIN / -1 case
+  %3 = icmp eq i32 %n, 0                          ; the zero-divisor case
+  %4 = or i1 %3, %2
+  %5 = select i1 %4, i32 1, i32 %n                ; substitute divisor 1 in either case to guard the sdiv below
+  %div = sdiv i32 %conv, %5                       ; x = id / n
+  %6 = icmp eq i32 %conv, -2147483648             ; same guard sequence repeated for the srem below
+  %7 = icmp eq i32 %n, -1
+  %8 = and i1 %7, %6
+  %9 = icmp eq i32 %n, 0
+  %10 = or i1 %9, %8
+  %11 = select i1 %10, i32 1, i32 %n
+  %rem = srem i32 %conv, %11                      ; y = id % n
+  br label %for.cond
+
+for.cond:                                         ; preds = %if.end6, %entry -- CFG node b, header of "for (;;)"
+  %x.0 = phi i32 [ %div, %entry ], [ %inc7, %if.end6 ]
+  %y.0 = phi i32 [ %rem, %entry ], [ %inc, %if.end6 ]
+  %storemerge = phi i32 [ 0, %entry ], [ %inc8, %if.end6 ] ; i
+  %add = add nsw i32 %storemerge, %conv
+  %cmp = icmp sgt i32 %add, %n
+  br i1 %cmp, label %e, label %if.end             ; varying branch: "i + id > n" depends on the work-item id
+
+if.end:                                           ; preds = %for.cond -- CFG node c
+  %add2 = add nsw i32 %y.0, %x.0
+  %cmp3 = icmp sgt i32 %add2, %n
+  br i1 %cmp3, label %f, label %if.end6           ; varying branch: "x + y > n", x and y are derived from id
+
+if.end6:                                          ; preds = %if.end -- CFG node d, loop latch
+  %inc = add nsw i32 %y.0, 1                      ; y++
+  %inc7 = add nsw i32 %x.0, 1                     ; x++
+  %inc8 = add nsw i32 %storemerge, 1              ; i++
+  br label %for.cond
+
+e:                                                ; preds = %for.cond -- CFG node e
+  %add9 = add nsw i32 %n, 2
+  %mul = mul nsw i32 %storemerge, %add9           ; i *= 2 + n
+  br label %g
+
+f:                                                ; preds = %if.end -- CFG node f
+  %add10 = add nsw i32 %storemerge, %n            ; i + n
+  %12 = icmp eq i32 %add10, 0
+  %13 = select i1 %12, i32 1, i32 %add10          ; substitute divisor 1 when i + n == 0
+  %div11 = sdiv i32 %storemerge, %13              ; i /= i + n
+  br label %g
+
+g:                                                ; preds = %f, %e -- CFG node g
+  %storemerge1 = phi i32 [ %div11, %f ], [ %mul, %e ]
+  %add12 = add i32 %y.0, %x.0
+  %add13 = add i32 %add12, %storemerge1           ; x + y + i
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %add13, i32 addrspace(1)* %arrayidx, align 4 ; out[id] = x + y + i
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nobuiltin nounwind readonly }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.kernels = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization8, !3, !4, !5, !6, !7, !8}
+!3 = !{!"kernel_arg_addr_space", i32 1, i32 0}
+!4 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!5 = !{!"kernel_arg_type", !"int*", !"int"}
+!6 = !{!"kernel_arg_base_type", !"int*", !"int"}
+!7 = !{!"kernel_arg_type_qual", !"", !""}
+!8 = !{!"kernel_arg_name", !"out", !"n"}
+
+; CHECK: spir_kernel void @__vecz_v4_partial_linearization8
+; CHECK: br i1 true, label %[[FORCONDUNIFORM:.+]], label %[[FORCOND:.+]]
+
+; CHECK: [[FORCOND]]:
+; CHECK: br label %[[IFEND:.+]]
+
+; CHECK: [[FORCONDUNIFORM]]:
+; CHECK: br i1 %{{.+}}, label %[[EUNIFORM:.+]], label %[[FORCONDUNIFORMBOSCCINDIR:.+]]
+
+; CHECK: [[IFENDUNIFORM:.+]]:
+; CHECK: br i1 %{{.+}}, label %[[FUNIFORM:.+]], label %[[IFENDUNIFORMBOSCCINDIR:.+]]
+
+; CHECK: [[IFEND6UNIFORM:.+]]:
+; CHECK: br label %[[FORCONDUNIFORM]]
+
+; CHECK: [[FUNIFORM]]:
+; CHECK: br label %[[G:.+]]
+
+; CHECK: [[IFENDUNIFORMBOSCCINDIR]]:
+; CHECK: br i1 %{{.+}}, label %[[IFEND6UNIFORM]], label %[[IFENDUNIFORMBOSCCSTORE:.+]]
+
+; CHECK: [[IFENDUNIFORMBOSCCSTORE]]:
+; CHECK: br label %[[IFEND6:.+]]
+
+; CHECK: [[EUNIFORM]]:
+; CHECK: br label %[[G]]
+
+; CHECK: [[FORCONDUNIFORMBOSCCINDIR]]:
+; CHECK: br i1 %{{.+}}, label %[[IFENDUNIFORM]], label %[[FORCONDUNIFORMBOSCCSTORE:.+]]
+
+; CHECK: [[FORCONDUNIFORMBOSCCSTORE]]:
+; CHECK: br label %[[IFEND]]
+
+; CHECK: [[IFEND]]:
+; CHECK: br label %[[IFEND6]]
+
+; CHECK: [[IFEND6]]:
+; CHECK: br i1 %{{.+}}, label %[[FORCOND]], label %[[FORCONDPUREEXIT:.+]]
+
+; CHECK: [[FORCONDPUREEXIT]]:
+; CHECK: br label %[[F:.+]]
+
+; CHECK: [[E:.+]]:
+; CHECK: br label %[[G]]
+
+; CHECK: [[F]]:
+; CHECK: br label %[[FELSE:.+]]
+
+; CHECK: [[FELSE]]:
+; CHECK: br label %[[E]]
+
+; CHECK: [[G]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization9.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization9.ll
new file mode 100644
index 0000000000000..057e704c3cee9
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization9.ll
@@ -0,0 +1,173 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k partial_linearization9 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+
+; The CFG of the following kernel is:
+;
+;   a
+;   |
+;   b <--.
+;   |    |
+;   c <. |
+;   |  | |
+;   d -' |
+;   |    |
+;   e ---'
+;   |
+;   f
+;
+; * where node e is a varying branch.
+; * where node f is divergent.
+;
+; With BOSCC, it will be transformed as follows:
+;
+;   a
+;   |
+;   b <--. .> b' <--.
+;   |    | |  |     |
+;   c <. | |  c' <. |
+;   |  | | |  |   | |
+;   d -' | |  d' -' |
+;   |    | |  |     |
+;   e ---' |  e' ---'
+;   |\_____'  |
+;   f         f'
+;    \       /
+;     \     /
+;      \   /
+;       \ /
+;        &
+;
+; where '&' represents merge blocks of BOSCC regions.
+;
+; __kernel void partial_linearization9(__global int *out, int n) {
+;   int id = get_global_id(0);
+;   int i = 0;
+;
+;   while (1) {
+;     int j = 0;
+;     for (; ; i++) {
+;       if (j++ > n) break;
+;     }
+;     if (i++ + id > n) break;
+;   }
+;
+;   out[id] = i;
+; }
+
+; ModuleID = 'Unknown buffer'
+source_filename = "Unknown buffer"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @partial_linearization9(i32 addrspace(1)* %out, i32 %n) #0 {
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %conv = trunc i64 %call to i32  ; `id` in the OpenCL C source quoted above
+  br label %while.body
+
+while.body:                                       ; preds = %if.end7, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc3, %if.end7 ]
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %while.body
+  %i.1 = phi i32 [ %i.0, %while.body ], [ %inc3, %for.inc ]
+  %j.0 = phi i32 [ 0, %while.body ], [ %inc, %for.inc ]
+  %cmp = icmp sgt i32 %j.0, %n  ; inner-loop exit test: compares only %j.0 and the kernel argument %n
+  %inc3 = add nsw i32 %i.1, 1  ; single increment of i; feeds both loop phis and the final store
+  br i1 %cmp, label %for.end, label %for.inc
+
+for.inc:                                          ; preds = %for.cond
+  %inc = add nsw i32 %j.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %add = add nsw i32 %i.1, %conv
+  %cmp4 = icmp sgt i32 %add, %n  ; outer-loop exit test: depends on the global id through %conv
+  br i1 %cmp4, label %while.end, label %if.end7
+
+if.end7:                                          ; preds = %for.end
+  br label %while.body
+
+while.end:                                        ; preds = %for.end
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %inc3, i32 addrspace(1)* %arrayidx, align 4  ; out[id] = i
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nobuiltin nounwind readonly }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.kernels = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization9, !3, !4, !5, !6, !7, !8}
+!3 = !{!"kernel_arg_addr_space", i32 1, i32 0}
+!4 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!5 = !{!"kernel_arg_type", !"int*", !"int"}
+!6 = !{!"kernel_arg_base_type", !"int*", !"int"}
+!7 = !{!"kernel_arg_type_qual", !"", !""}
+!8 = !{!"kernel_arg_name", !"out", !"n"}
+
+; CHECK: spir_kernel void @__vecz_v4_partial_linearization9
+; CHECK: br i1 true, label %[[WHILEBODYUNIFORM:.+]], label %[[WHILEBODY:.+]]
+
+; CHECK: [[WHILEBODY]]:
+; CHECK: br label %[[FORCOND:.+]]
+
+; CHECK: [[FORCOND]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(true)}}, label %[[FOREND:.+]], label %[[FORINC:.+]]
+
+; CHECK: [[FORINC]]:
+; CHECK: br label %[[FORCOND]]
+
+; CHECK: [[FOREND]]:
+; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]]
+
+; CHECK: [[WHILEBODYPUREEXIT]]:
+; CHECK: br label %[[WHILEEND:.+]]
+
+; CHECK: [[WHILEBODYUNIFORM]]:
+; CHECK: br label %[[FORCONDUNIFORM:.+]]
+
+; CHECK: [[FORCONDUNIFORM]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(true)}}, label %[[FORENDUNIFORM:.+]], label %[[FORINCUNIFORM:.+]]
+
+; CHECK: [[FORINCUNIFORM]]:
+; CHECK: br label %[[FORCONDUNIFORM]]
+
+; CHECK: [[FORENDUNIFORM]]:
+; CHECK: br i1 %{{.+}}, label %[[WHILEEND]], label %[[FORENDUNIFORMBOSCCINDIR:.+]]
+
+; CHECK: [[WHILEEND]]:
+; CHECK: ret void
+
+; CHECK: [[FORENDUNIFORMBOSCCINDIR]]:
+; CHECK: br i1 %{{.+}}, label %[[WHILEBODYUNIFORM]], label %[[FORENDUNIFORMBOSCCSTORE:.+]]
+
+; CHECK: [[FORENDUNIFORMBOSCCSTORE]]:
+; CHECK: br label %[[WHILEBODY]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/printf.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/printf.ll
new file mode 100644
index 0000000000000..621ddf10503e0
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/printf.ll
@@ -0,0 +1,125 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; TODO(CA-1981): Using `not` in qemu does not work.
+; REQUIRES: native
+; RUN: %not %veczc -k printf_add -vecz-simd-width=4 -S -vecz-passes=cfg-convert -vecz-choices=LinearizeBOSCC < %s 2>&1 | %filecheck %s
+
+; This test just checks that we don't crash while converting the control flow.
+; LinearizeBOSCC would leave behind an invalid function when control flow
+; conversion failed some time afterwards. This could trigger verification
+; failures or crashes depending on which passes were run later.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @printf_add(i32 addrspace(1)* %in1, i32 addrspace(1)* %in2, i32 addrspace(1)* %out, i32 addrspace(1)* %status, i8 addrspace(1)* %x) {
+entry:
+  %in1.addr = alloca i32 addrspace(1)*, align 8
+  %in2.addr = alloca i32 addrspace(1)*, align 8
+  %out.addr = alloca i32 addrspace(1)*, align 8
+  %status.addr = alloca i32 addrspace(1)*, align 8
+  %tid = alloca i64, align 8
+  %sum = alloca i32, align 4
+  store i32 addrspace(1)* %in1, i32 addrspace(1)** %in1.addr, align 8
+  store i32 addrspace(1)* %in2, i32 addrspace(1)** %in2.addr, align 8
+  store i32 addrspace(1)* %out, i32 addrspace(1)** %out.addr, align 8
+  store i32 addrspace(1)* %status, i32 addrspace(1)** %status.addr, align 8
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #4  ; NOTE(review): this file defines no `attributes #4` group
+  store i64 %call, i64* %tid, align 8
+  %0 = load i32 addrspace(1)*, i32 addrspace(1)** %in1.addr, align 8
+  %1 = load i64, i64* %tid, align 8
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 %1
+  %2 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %3 = load i32 addrspace(1)*, i32 addrspace(1)** %in2.addr, align 8
+  %4 = load i64, i64* %tid, align 8
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %3, i64 %4
+  %5 = load i32, i32 addrspace(1)* %arrayidx1, align 4
+  %add = add nsw i32 %2, %5  ; in1[tid] + in2[tid]
+  store i32 %add, i32* %sum, align 4
+  %6 = load i32, i32* %sum, align 4
+  %7 = load i32 addrspace(1)*, i32 addrspace(1)** %out.addr, align 8
+  %8 = load i64, i64* %tid, align 8
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %7, i64 %8
+  store i32 %6, i32 addrspace(1)* %arrayidx2, align 4  ; out[tid] = sum
+  %9 = load i64, i64* %tid, align 8
+  %conv = trunc i64 %9 to i32
+  %10 = load i32, i32* %sum, align 4
+  %11 = call spir_func i64 @_Z14get_num_groupsj(i32 0)
+  %12 = trunc i64 %11 to i32
+  %13 = call spir_func i64 @_Z14get_num_groupsj(i32 1)
+  %14 = trunc i64 %13 to i32
+  %15 = call spir_func i64 @_Z14get_num_groupsj(i32 2)
+  %16 = trunc i64 %15 to i32
+  %17 = call spir_func i64 @_Z12get_group_idj(i32 0)
+  %18 = trunc i64 %17 to i32
+  %19 = call spir_func i64 @_Z12get_group_idj(i32 1)
+  %20 = trunc i64 %19 to i32
+  %21 = call spir_func i64 @_Z12get_group_idj(i32 2)
+  %22 = trunc i64 %21 to i32
+  %23 = mul i32 %12, %20
+  %24 = mul i32 %14, %16
+  %25 = mul i32 %22, %24
+  %26 = add i32 %23, %25
+  %27 = add i32 %18, %26
+  %28 = mul i32 %14, %16
+  %29 = mul i32 %12, %28  ; product of the three get_num_groups results
+  %30 = udiv i32 1048576, %29
+  %31 = and i32 %30, -4  ; clears the two low bits, i.e. rounds down to a multiple of 4
+  %32 = mul i32 %27, %31
+  %33 = getelementptr i8, i8 addrspace(1)* %x, i32 %32
+  %34 = bitcast i8 addrspace(1)* %33 to i32 addrspace(1)*
+  %35 = bitcast i8 addrspace(1)* %33 to i32 addrspace(1)*
+  %36 = atomicrmw add i32 addrspace(1)* %35, i32 12 acq_rel  ; adds 12 to the i32 at %33; %36 is the value before the add
+  %37 = add i32 %36, 12
+  %38 = icmp ugt i32 %37, %31  ; true when the updated value exceeds %31
+  br i1 %38, label %early_return.i, label %store.i
+
+early_return.i:                                   ; preds = %entry
+  %39 = bitcast i8 addrspace(1)* %33 to i32 addrspace(1)*
+  %40 = getelementptr i32, i32 addrspace(1)* %39, i32 1
+  %41 = atomicrmw add i32 addrspace(1)* %40, i32 12 acq_rel  ; second counter, one i32 past %33
+  br label %.exit
+
+store.i:                                          ; preds = %entry
+  %42 = getelementptr i8, i8 addrspace(1)* %33, i32 %36
+  %43 = bitcast i8 addrspace(1)* %42 to i32 addrspace(1)*
+  store i32 0, i32 addrspace(1)* %43, align 1
+  %44 = add i32 %36, 4
+  %45 = getelementptr i8, i8 addrspace(1)* %33, i32 %44
+  %46 = bitcast i8 addrspace(1)* %45 to i32 addrspace(1)*
+  store i32 %conv, i32 addrspace(1)* %46, align 1
+  %47 = add i32 %36, 8
+  %48 = getelementptr i8, i8 addrspace(1)* %33, i32 %47
+  %49 = bitcast i8 addrspace(1)* %48 to i32 addrspace(1)*
+  store i32 %10, i32 addrspace(1)* %49, align 1
+  br label %.exit
+
+.exit:                                            ; preds = %store.i, %early_return.i
+  %call31 = phi i32 [ -1, %early_return.i ], [ 0, %store.i ]  ; -1 on the early-return path, 0 after the three stores
+  %50 = load i32 addrspace(1)*, i32 addrspace(1)** %status.addr, align 8
+  %51 = load i64, i64* %tid, align 8
+  %arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %50, i64 %51
+  store i32 %call31, i32 addrspace(1)* %arrayidx4, align 4  ; status[tid] = %call31
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func i64 @_Z12get_group_idj(i32)
+declare spir_func i64 @_Z14get_num_groupsj(i32)
+
+; We can't vectorize this control flow
+; CHECK: Error: Failed to vectorize function 'printf_add'
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/scalable_linearization.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/scalable_linearization.ll
new file mode 100644
index 0000000000000..ad4f75c240fad
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/scalable_linearization.ll
@@ -0,0 +1,25 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; Check that we don't crash when costing a scalable reduction
+; RUN: %veczc -vecz-scalable -vecz-passes="pre-linearize" -vecz-choices=LinearizeBOSCC -S < %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @boscc_merge() {
+  ret void  ; the kernel body is empty: no arguments, no control flow
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/basic_mem2reg.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/basic_mem2reg.ll
new file mode 100644
index 0000000000000..b1fb99891c7f0
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/basic_mem2reg.ll
@@ -0,0 +1,63 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -vecz-passes="function(mem2reg),vecz-mem2reg" -vecz-simd-width=4 -vecz-handle-declaration-only-calls -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test(i32 %a, i32 %b, i32* %c, float %rf) {
+entry:
+  %d = alloca i32  ; only accessed by the direct store and load below
+  %e = alloca i32  ; its address is passed to @foo below
+  %f = alloca float  ; written as a float, read back as an i32 through %ri
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %sum = add i32 %a, %b
+  store i32 %sum, i32* %d, align 4
+  store i32 %sum, i32* %e, align 4
+  %call = call spir_func i32 @foo(i32* %e)
+  %d.load = load i32, i32* %d, align 4
+  %e.load = load i32, i32* %e, align 4
+  %c0 = getelementptr i32, i32* %c, i64 %gid
+  store i32 %d.load, i32* %c0, align 4
+  %c1 = getelementptr i32, i32* %c0, i64 1
+  store i32 %e.load, i32* %c1, align 4
+  store float %rf, float* %f
+  %ri = bitcast float* %f to i32*
+  %ri.load = load i32, i32* %ri, align 4
+  %c2 = getelementptr i32, i32* %c1, i64 2
+  store i32 %ri.load, i32* %c2, align 4
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func i32 @foo(i32*)
+
+; CHECK: define spir_kernel void @__vecz_v4_test(i32 %a, i32 %b, ptr %c, float %rf)
+; CHECK: %e = alloca i32
+; CHECK: %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: %sum = add i32 %a, %b
+; CHECK: store i32 %sum, ptr %e
+; CHECK: %call = call spir_func i32 @foo(ptr{{.*}} %e)
+; CHECK: %e.load = load i32, ptr %e
+; CHECK: %c0 = getelementptr i32, ptr %c, i64 %gid
+; CHECK: store i32 %sum, ptr %c0
+; CHECK: %c1 = getelementptr i32, ptr %c0, i64 1
+; CHECK: store i32 %e.load, ptr %c1
+; CHECK: %0 = bitcast float %rf to i32
+; CHECK: %c2 = getelementptr i32, ptr %c1, i64 2
+; CHECK: store i32 %0, ptr %c2, align 4
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/basic_vecz_mem2reg.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/basic_vecz_mem2reg.ll
new file mode 100644
index 0000000000000..eed794573ca46
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/basic_vecz_mem2reg.ll
@@ -0,0 +1,73 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; Note: *not* running LLVM's mem2reg pass as before LLVM 15 it crashes for the
+; same reason we used to!
+; RUN: %veczc -vecz-passes=vecz-mem2reg -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @load_store_type_mismatch_no_bitcast(ptr addrspace(1) %p) {
+  %data = alloca i32, align 4
+  %1 = tail call spir_func i64 @_Z13get_global_idj(i32 0) #4  ; NOTE(review): this file defines no `attributes #4` group
+  %2 = getelementptr inbounds i32, ptr addrspace(1) %p, i64 %1
+  %3 = load i32, ptr addrspace(1) %2, align 4
+  store i32 %3, ptr %data, align 4
+  %4 = load <2 x i16>, ptr %data, align 2  ; same 32-bit size as the stored i32, but a different type
+  ret void
+}
+
+define spir_kernel void @load_type_size_mismatch_no_bitcast(ptr addrspace(1) %p) {
+  %data = alloca i32, align 4
+  %1 = tail call spir_func i64 @_Z13get_global_idj(i32 0) #4
+  %2 = getelementptr inbounds i32, ptr addrspace(1) %p, i64 %1
+  %3 = load i32, ptr addrspace(1) %2, align 4
+  store i32 %3, ptr %data, align 4
+  %4 = load i16, ptr %data, align 2  ; narrower than the i32 stored to %data above
+  ret void
+}
+
+define spir_kernel void @store_type_size_mismatch_no_bitcast(ptr addrspace(1) %p) {
+  %data = alloca i32, align 4  ; 4-byte slot; only an i16 is ever stored to it
+  %1 = tail call spir_func i64 @_Z13get_global_idj(i32 0) #4
+  %2 = getelementptr inbounds i16, ptr addrspace(1) %p, i64 %1
+  %3 = load i16, ptr addrspace(1) %2, align 2  ; i16 elements at an i16 stride are only guaranteed 2-byte alignment
+  store i16 %3, ptr %data, align 2
+  %4 = load i32, ptr %data, align 4  ; wider than the i16 stored to %data above
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CHECK: define spir_kernel void @__vecz_v4_load_store_type_mismatch_no_bitcast(ptr addrspace(1) %p)
+; CHECK-NOT: alloca i32
+; CHECK:  %3 = load i32, ptr addrspace(1) %2, align 4
+; CHECK:  %4 = bitcast i32 %3 to <2 x i16>
+
+; Note: we can't optimize this as the allocated type size and loaded type sizes
+; don't match. Maybe we could trunc %3 from i32 to i16? See CA-4382.
+
+; CHECK: define spir_kernel void @__vecz_v4_load_type_size_mismatch_no_bitcast(ptr addrspace(1) %p)
+; CHECK:  %data = alloca i32, align 4
+; CHECK:  %4 = load i16, ptr %data, align 2
+
+; Note: we can't optimize this as the allocated type size and stored type sizes
+; don't match (an i16 is stored into an i32 alloca). See CA-4382.
+
+; CHECK: define spir_kernel void @__vecz_v4_store_type_size_mismatch_no_bitcast(ptr addrspace(1) %p)
+; CHECK:  %data = alloca i32, align 4
+; CHECK:  %4 = load i32, ptr %data, align 4
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/builtin_inlining_mem.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/builtin_inlining_mem.ll
new file mode 100644
index 0000000000000..fab368e4f87ef
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/builtin_inlining_mem.ll
@@ -0,0 +1,122 @@
+
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -vecz-passes=builtin-inlining,verify -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test_memset_i8(i64* %z) {
+  %dst = bitcast i64* %z to i8*
+  call void @llvm.memset.p0i8.i64(i8* %dst, i8 42, i64 18, i32 8, i1 false)  ; legacy 5-operand form: dest, value 42, length 18, alignment 8, not volatile
+  ret void
+}
+
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_test_memset_i8(ptr %z)
+; CHECK:  %dst = bitcast ptr %z to ptr
+; CHECK:  %1 = getelementptr inbounds i8, ptr %dst, i64 0
+; CHECK:  store i64 3038287259199220266, ptr %1, align 8
+; CHECK:  %2 = getelementptr inbounds i8, ptr %dst, i64 8
+; CHECK:  store i64 3038287259199220266, ptr %2, align 8
+; CHECK:  %dst1 = getelementptr inbounds i8, ptr %dst, i64 16
+; CHECK:  store i8 42, ptr %dst1, align 1
+; CHECK:  %dst2 = getelementptr inbounds i8, ptr %dst, i64 17
+; CHECK:  store i8 42, ptr %dst2, align 1
+; CHECK:  ret void
+; CHECK: }
+
+define spir_kernel void @test_memset_i16(i64* %z) {
+  %dst = bitcast i64* %z to i16*  ; same operands as test_memset_i8, but the destination is typed i16*
+  call void @llvm.memset.p0i16.i64(i16* %dst, i8 42, i64 18, i32 8, i1 false)  ; legacy 5-operand form: dest, value 42, length 18, alignment 8, not volatile
+  ret void
+}
+
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_test_memset_i16(ptr %z)
+; CHECK:  %dst = bitcast ptr %z to ptr
+; CHECK:  %1 = getelementptr inbounds i8, ptr %dst, i64 0
+; CHECK:  store i64 3038287259199220266, ptr %1, align 8
+; CHECK:  %2 = getelementptr inbounds i8, ptr %dst, i64 8
+; CHECK:  store i64 3038287259199220266, ptr %2, align 8
+; CHECK:  %dst1 = getelementptr inbounds i8, ptr %dst, i64 16
+; CHECK:  store i8 42, ptr %dst1, align 1
+; CHECK:  %dst2 = getelementptr inbounds i8, ptr %dst, i64 17
+; CHECK:  store i8 42, ptr %dst2, align 1
+; CHECK:  ret void
+; CHECK: }
+
+define spir_kernel void @test_memcpy_i8(i64* %a, i64* %z) {
+  %src = bitcast i64* %a to i8*
+  %dst = bitcast i64* %z to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 18, i32 8, i1 false)  ; legacy 5-operand form: dest, src, length 18, alignment 8, not volatile
+  ret void
+}
+
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_test_memcpy_i8(ptr %a, ptr %z)
+; CHECK:  %src = bitcast ptr %a to ptr
+; CHECK:  %dst = bitcast ptr %z to ptr
+; CHECK:  %1 = getelementptr inbounds i8, ptr %src, i64 0
+; CHECK:  %2 = getelementptr inbounds i8, ptr %dst, i64 0
+; CHECK:  %src1 = load i64, ptr %1, align 8
+; CHECK:  store i64 %src1, ptr %2, align 8
+; CHECK:  %3 = getelementptr inbounds i8, ptr %src, i64 8
+; CHECK:  %4 = getelementptr inbounds i8, ptr %dst, i64 8
+; CHECK:  %src2 = load i64, ptr %3, align 8
+; CHECK:  store i64 %src2, ptr %4, align 8
+; CHECK:  %5 = getelementptr inbounds i8, ptr %src, i64 16
+; CHECK:  %dst3 = getelementptr inbounds i8, ptr %dst, i64 16
+; CHECK:  %src4 = load i8, ptr %5, align 1
+; CHECK:  store i8 %src4, ptr %dst3, align 1
+; CHECK:  %6 = getelementptr inbounds i8, ptr %src, i64 17
+; CHECK:  %dst5 = getelementptr inbounds i8, ptr %dst, i64 17
+; CHECK:  %src6 = load i8, ptr %6, align 1
+; CHECK:  store i8 %src6, ptr %dst5, align 1
+; CHECK:  ret void
+; CHECK: }
+
+define spir_kernel void @test_memcpy_i16(i64* %a, i64* %z) {
+  %src = bitcast i64* %a to i16*  ; same operands as test_memcpy_i8, but both pointers are typed i16*
+  %dst = bitcast i64* %z to i16*
+  call void @llvm.memcpy.p0i16.p0i16.i64(i16* %dst, i16* %src, i64 18, i32 8, i1 false)  ; legacy 5-operand form: dest, src, length 18, alignment 8, not volatile
+  ret void
+}
+
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_test_memcpy_i16(ptr %a, ptr %z)
+; CHECK:  %src = bitcast ptr %a to ptr
+; CHECK:  %dst = bitcast ptr %z to ptr
+; CHECK:  %1 = getelementptr inbounds i8, ptr %src, i64 0
+; CHECK:  %2 = getelementptr inbounds i8, ptr %dst, i64 0
+; CHECK:  %src1 = load i64, ptr %1, align 8
+; CHECK:  store i64 %src1, ptr %2, align 8
+; CHECK:  %3 = getelementptr inbounds i8, ptr %src, i64 8
+; CHECK:  %4 = getelementptr inbounds i8, ptr %dst, i64 8
+; CHECK:  %src2 = load i64, ptr %3, align 8
+; CHECK:  store i64 %src2, ptr %4, align 8
+; CHECK:  %5 = getelementptr inbounds i8, ptr %src, i64 16
+; CHECK:  %dst3 = getelementptr inbounds i8, ptr %dst, i64 16
+; CHECK:  %src4 = load i8, ptr %5, align 1
+; CHECK:  store i8 %src4, ptr %dst3, align 1
+; CHECK:  %6 = getelementptr inbounds i8, ptr %src, i64 17
+; CHECK:  %dst5 = getelementptr inbounds i8, ptr %dst, i64 17
+; CHECK:  %src6 = load i8, ptr %6, align 1
+; CHECK:  store i8 %src6, ptr %dst5, align 1
+; CHECK:  ret void
+; CHECK: }
+
+declare void @llvm.memset.p0i8.i64(i8*, i8, i64, i32, i1)
+declare void @llvm.memset.p0i16.i64(i16*, i8, i64, i32, i1)
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8*, i64, i32, i1)
+declare void @llvm.memcpy.p0i16.p0i16.i64(i16*, i16*, i64, i32, i1)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/builtin_pointer_return.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/builtin_pointer_return.ll
new file mode 100644
index 0000000000000..3e6ffbe2dcdd9
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/builtin_pointer_return.ll
@@ -0,0 +1,66 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; RUN: %veczc -vecz-simd-width=4 -S < %s | %filecheck %s
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+declare spir_func float @_Z5fractfPf(float, float*)
+declare spir_func <2 x float> @_Z5fractDv2_fPS_(<2 x float>, <2 x float>*)
+declare spir_func <4 x float> @_Z5fractDv4_fPS_(<4 x float>, <4 x float>*)
+declare spir_func <8 x float> @_Z5fractDv8_fPS_(<8 x float>, <8 x float>*)
+
+; FIXME: Both of these are instantiating when we have vector equivalents: see
+; CA-4046.
+
+define spir_kernel void @fract_v1(float* %xptr, float* %outptr, float* %ioutptr) {
+  %iouta = alloca float  ; passed to the fract call as its pointer operand, then read back below
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx.x = getelementptr inbounds float, float* %xptr, i64 %idx
+  %x = load float, float* %arrayidx.x, align 4
+  %out = call spir_func float @_Z5fractfPf(float %x, float* %iouta)
+  %arrayidx.out = getelementptr inbounds float, float* %outptr, i64 %idx
+  %arrayidx.iout = getelementptr inbounds float, float* %ioutptr, i64 %idx
+  store float %out, float* %arrayidx.out, align 4  ; outptr[idx] = value returned by the call
+  %iout = load float, float* %iouta, align 4
+  store float %iout, float* %arrayidx.iout, align 4  ; ioutptr[idx] = value the call left in %iouta
+  ret void
+; CHECK: call spir_func float @_Z5fractfPf(float {{%.*}}, ptr nonnull {{%.*}})
+; CHECK: call spir_func float @_Z5fractfPf(float {{%.*}}, ptr nonnull {{%.*}})
+; CHECK: call spir_func float @_Z5fractfPf(float {{%.*}}, ptr nonnull {{%.*}})
+; CHECK: call spir_func float @_Z5fractfPf(float {{%.*}}, ptr nonnull {{%.*}})
+}
+
+define spir_kernel void @fract_v2(<2 x float>* %xptr, <2 x float>* %outptr, <2 x float>* %ioutptr) {
+  %iouta = alloca <2 x float>  ; passed to the fract call as its pointer operand, then read back below
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx.x = getelementptr inbounds <2 x float>, <2 x float>* %xptr, i64 %idx
+  %x = load <2 x float>, <2 x float>* %arrayidx.x, align 8
+  %out = call spir_func <2 x float> @_Z5fractDv2_fPS_(<2 x float> %x, <2 x float>* %iouta)
+  %arrayidx.out = getelementptr inbounds <2 x float>, <2 x float>* %outptr, i64 %idx
+  %arrayidx.iout = getelementptr inbounds <2 x float>, <2 x float>* %ioutptr, i64 %idx
+  store <2 x float> %out, <2 x float>* %arrayidx.out, align 8  ; outptr[idx] = value returned by the call
+  %iout = load <2 x float>, <2 x float>* %iouta, align 8
+  store <2 x float> %iout, <2 x float>* %arrayidx.iout, align 8  ; ioutptr[idx] = value the call left in %iouta
+  ret void
+; CHECK: call spir_func <2 x float> @_Z5fractDv2_fPS_(<2 x float> {{%.*}}, ptr nonnull {{%.*}})
+; CHECK: call spir_func <2 x float> @_Z5fractDv2_fPS_(<2 x float> {{%.*}}, ptr nonnull {{%.*}})
+; CHECK: call spir_func <2 x float> @_Z5fractDv2_fPS_(<2 x float> {{%.*}}, ptr nonnull {{%.*}})
+; CHECK: call spir_func <2 x float> @_Z5fractDv2_fPS_(<2 x float> {{%.*}}, ptr nonnull {{%.*}})
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/control_flow_conversion_ptrs.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/control_flow_conversion_ptrs.ll
new file mode 100644
index 0000000000000..0e14a786bfb7a
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/control_flow_conversion_ptrs.ll
@@ -0,0 +1,52 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -vecz-passes=cfg-convert,define-builtins -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+define spir_kernel void @test_varying_if_ptr(i32 %a, ptr %b, ptr %on_true, ptr %on_false) {
+entry:
+  %conv = sext i32 %a to i64
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %cmp = icmp eq i64 %conv, %call  ; compares the kernel argument with the global id
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+  %idxprom = sext i32 %a to i64
+  %arrayidx = getelementptr inbounds ptr, ptr %b, i64 %idxprom
+  store ptr %on_true, ptr %arrayidx, align 4  ; the stored value is itself a pointer: b[a] = on_true
+  br label %if.end
+
+if.else:
+  %arrayidx2 = getelementptr inbounds ptr, ptr %b, i64 42
+  store ptr %on_false, ptr %arrayidx2, align 4  ; b[42] = on_false
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+; CHECK:     define void @__vecz_b_masked_store4_u3ptru3ptrb(ptr [[A:%.*]], ptr [[B:%.*]], i1 [[MASK:%.*]]) {
+; CHECK:       br i1 [[MASK]], label %[[IF:.*]], label %[[EXIT:.*]]
+; CHECK:     [[IF]]:
+; CHECK-NEXT:  store ptr [[A]], ptr [[B]], align 4
+; CHECK-NEXT:  br label %[[EXIT]]
+; CHECK:     [[EXIT]]:
+; CHECK-NEXT:  ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/interleaved_load_ooo.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/interleaved_load_ooo.ll
new file mode 100644
index 0000000000000..be956f806b372
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/interleaved_load_ooo.ll
@@ -0,0 +1,57 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc --vecz-passes=interleave-combine-loads -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; This test checks that we can optimize interleaved accesses out of order.
+
+define dso_local spir_kernel void @interleaved_load_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %stride) {
+entry:                                            ; two interleaved loads from %in, subtracted and stored to %out
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %conv = trunc i64 %call to i32
+  %call1 = tail call spir_func i64 @_Z13get_global_idj(i32 1)
+  %conv2 = trunc i64 %call1 to i32
+  %mul = mul nsw i32 %conv2, %stride
+  %add = add nsw i32 %conv, %mul                  ; id(0) + id(1) * %stride
+  %mul3 = shl nsw i32 %add, 1                     ; 2 * %add: the even index
+  %add4 = or i32 %mul3, 1                         ; 2 * %add + 1: the odd index (low bit is clear after the shift)
+  %idxprom = sext i32 %add4 to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom
+  %0 = call <4 x i32> @__vecz_b_interleaved_load4_2_Dv4_jPU3AS1j(i32 addrspace(1)* %arrayidx)  ; first load starts at the odd index
+  %idxprom8 = sext i32 %mul3 to i64
+  %arrayidx9 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom8
+  %1 = call <4 x i32> @__vecz_b_interleaved_load4_2_Dv4_jPU3AS1j(i32 addrspace(1)* %arrayidx9)  ; second load starts one element earlier, at the even index
+  %sub1 = sub nsw <4 x i32> %0, %1                ; odd-index values minus even-index values
+  %idxprom12 = sext i32 %add to i64
+  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom12
+  %2 = bitcast i32 addrspace(1)* %arrayidx13 to <4 x i32> addrspace(1)*
+  store <4 x i32> %sub1, <4 x i32> addrspace(1)* %2, align 4
+  ret void
+}
+
+; CHECK: __vecz_v4_interleaved_load_4(
+; CHECK:  [[TMP0:%.*]] = load <4 x i32>, ptr addrspace(1) [[PTR:%.*]], align 4
+; CHECK:  [[TMP1:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i32 4
+; CHECK:  [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4
+; CHECK:  %deinterleave = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK:  %deinterleave1 = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP2]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK:  %sub1 = sub nsw <4 x i32> %deinterleave1, %deinterleave
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare <4 x i32> @__vecz_b_interleaved_load4_2_Dv4_jPU3AS1j(i32 addrspace(1)*)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/load_add_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/load_add_store.ll
new file mode 100644
index 0000000000000..d7d30c23d963d
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/load_add_store.ll
@@ -0,0 +1,45 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+define spir_kernel void @load_add_store(ptr %aptr, ptr %bptr, ptr %zptr) {
+entry:                                            ; zptr[idx] = aptr[idx] + bptr[idx] on i32 elements
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)            ; index comes from the get_global_id(0) call
+  %arrayidxa = getelementptr inbounds i32, ptr %aptr, i64 %idx
+  %arrayidxb = getelementptr inbounds i32, ptr %bptr, i64 %idx
+  %arrayidxz = getelementptr inbounds i32, ptr %zptr, i64 %idx
+  %a = load i32, ptr %arrayidxa, align 4
+  %b = load i32, ptr %arrayidxb, align 4
+  %sum = add i32 %a, %b
+  store i32 %sum, ptr %arrayidxz, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_load_add_store(ptr %aptr, ptr %bptr, ptr %zptr)
+; CHECK: %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: %arrayidxa = getelementptr inbounds i32, ptr %aptr, i64 %idx
+; CHECK: %arrayidxb = getelementptr inbounds i32, ptr %bptr, i64 %idx
+; CHECK: %arrayidxz = getelementptr inbounds i32, ptr %zptr, i64 %idx
+; CHECK: %[[TMP0:.*]] = load <4 x i32>, ptr %arrayidxa, align 4
+; CHECK: %[[TMP1:.*]] = load <4 x i32>, ptr %arrayidxb, align 4
+; CHECK: %sum1 = add <4 x i32> %[[TMP0]], %[[TMP1]]
+; CHECK: store <4 x i32> %sum1, ptr %arrayidxz, align 4
+; CHECK: ret void
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/masked_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/masked_store.ll
new file mode 100644
index 0000000000000..9334d060cd878
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/masked_store.ll
@@ -0,0 +1,82 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -vecz-passes=cfg-convert,define-builtins -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+define spir_kernel void @test_varying_if(i32 %a, ptr %b, float %on_true, float %on_false) {
+entry:                                            ; kernel stores %on_true or %on_false through %b, depending on a branch
+  %conv = sext i32 %a to i64
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %cmp = icmp eq i64 %conv, %call                 ; compares %a with the result of the get_global_id(0) call
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; %cmp true: store the float %on_true at index %a
+  %idxprom = sext i32 %a to i64
+  %arrayidx = getelementptr inbounds ptr, ptr %b, i64 %idxprom    ; NOTE(review): indexes by pointer-sized elements although a float is stored - confirm intended
+  store float %on_true, ptr %arrayidx, align 4
+  br label %if.end
+
+if.else:                                          ; %cmp false: store the float %on_false at index 42
+  %arrayidx2 = getelementptr inbounds ptr, ptr %b, i64 42
+  store float %on_false, ptr %arrayidx2, align 4
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define spir_kernel void @test_varying_if_as3(i32 %a, ptr addrspace(3) %b, float %on_true, float %on_false) {
+entry:                                            ; same shape as @test_varying_if, but %b is a pointer in address space 3
+  %conv = sext i32 %a to i64
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %cmp = icmp eq i64 %conv, %call                 ; compares %a with the result of the get_global_id(0) call
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; %cmp true: store the float %on_true at index %a
+  %idxprom = sext i32 %a to i64
+  %arrayidx = getelementptr inbounds ptr, ptr addrspace(3) %b, i64 %idxprom    ; NOTE(review): indexes by pointer-sized elements although a float is stored - confirm intended
+  store float %on_true, ptr addrspace(3) %arrayidx, align 4
+  br label %if.end
+
+if.else:                                          ; %cmp false: store the float %on_false at index 42
+  %arrayidx2 = getelementptr inbounds ptr, ptr addrspace(3) %b, i64 42
+  store float %on_false, ptr addrspace(3) %arrayidx2, align 4
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+; CHECK:     define void @__vecz_b_masked_store4_fu3ptrb(float [[A:%.*]], ptr [[B:%.*]], i1 [[MASK:%.*]]) {
+; CHECK:       br i1 [[MASK]], label %[[IF:.*]], label %[[EXIT:.*]]
+; CHECK:     [[IF]]:
+; CHECK-NEXT:  store float [[A]], ptr [[B]], align 4
+; CHECK-NEXT:  br label %[[EXIT]]
+; CHECK:     [[EXIT]]:
+; CHECK-NEXT:  ret void
+
+; CHECK:     define void @__vecz_b_masked_store4_fu3ptrU3AS3b(float [[A:%.*]], ptr addrspace(3) [[B:%.*]], i1 [[MASK:%.*]]) {
+; CHECK:       br i1 [[MASK]], label %[[IF:.*]], label %[[EXIT:.*]]
+; CHECK:     [[IF]]:
+; CHECK-NEXT:  store float [[A]], ptr addrspace(3) [[B]], align 4
+; CHECK-NEXT:  br label %[[EXIT]]
+; CHECK:     [[EXIT]]:
+; CHECK-NEXT:  ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/remove_intptr.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/remove_intptr.ll
new file mode 100644
index 0000000000000..899c0a195ed21
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/remove_intptr.ll
@@ -0,0 +1,54 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -vecz-passes=remove-int-ptr -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_intptr_cast_i8(
+; CHECK: %shl = shl i64 %call, 2
+; CHECK: %remove_intptr = getelementptr i8, ptr addrspace(1) %in, i64 %shl
+; CHECK: %remove_intptr1 = ptrtoint ptr addrspace(1) %remove_intptr to i64
+; CHECK: store i64 %remove_intptr1, ptr addrspace(1) %out, align 8
+define spir_kernel void @intptr_cast_i8(i8 addrspace(1)* %in, i64 addrspace(1)* %out) {
+entry:                                            ; stores (integer value of %in) + (get_global_id(0) << 2) to %out
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %0 = ptrtoint i8 addrspace(1)* %in to i64       ; the pointer is converted to an integer first ...
+  %shl = shl i64 %call, 2
+  %add = add i64 %shl, %0                         ; ... and the offset is applied with integer arithmetic
+  store i64 %add, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; Note that unlike with typed pointers, we don't need a bitcast to i8 here.
+
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_intptr_cast_i16(
+; CHECK: %shl = shl i64 %call, 2
+; CHECK: %remove_intptr = getelementptr i8, ptr addrspace(1) %in, i64 %shl
+; CHECK: %remove_intptr1 = ptrtoint ptr addrspace(1) %remove_intptr to i64
+; CHECK: store i64 %remove_intptr1, ptr addrspace(1) %out, align 8
+define spir_kernel void @intptr_cast_i16(i16 addrspace(1)* %in, i64 addrspace(1)* %out) {
+entry:                                            ; same computation as the i8 kernel, but %in is declared as an i16 pointer
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %0 = ptrtoint i16 addrspace(1)* %in to i64      ; the pointer is converted to an integer first ...
+  %shl = shl i64 %call, 2
+  %add = add i64 %shl, %0                         ; ... and the offset is applied with integer arithmetic
+  store i64 %add, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/ternary_transform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/ternary_transform.ll
new file mode 100644
index 0000000000000..b0d03c44b89bf
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/ternary_transform.ll
@@ -0,0 +1,118 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -vecz-passes=ternary-transform,verify -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test_positive(i64 %a, i64 %b, i64* %c) {
+entry:                                            ; selects between two addresses, then indexes the selected pointer and stores through it
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %cond = icmp eq i64 %a, %gid                    ; condition computed from the get_global_id(0) result
+  %c0 = getelementptr i64, i64* %c, i64 %gid
+  store i64 %b, i64* %c0, align 4
+  %c1 = getelementptr i64, i64* %c, i64 0
+  store i64 0, i64* %c1, align 4
+  %c2 = select i1 %cond, i64* %c0, i64* %c1       ; pointer select between the two addresses
+  %c3 = getelementptr i64, i64* %c2, i64 %gid     ; address derived from the selected pointer
+  store i64 1, i64* %c3, align 4                  ; the checks at the end of the file expect two masked stores of this value
+  ret void
+}
+
+define spir_kernel void @test_positive_gep_different_type(i64 %a, i64 %b, i8* %c) {
+entry:                                            ; like @test_positive, but the GEP after the select uses i8 elements
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %cond = icmp eq i64 %a, %gid                    ; condition computed from the get_global_id(0) result
+  %c0 = getelementptr i64, i64* %c, i64 %gid
+  store i64 %b, i64* %c0, align 4
+  %c1 = getelementptr i64, i64* %c, i64 0
+  store i64 0, i64* %c1, align 4
+  %c2 = select i1 %cond, i64* %c0, i64* %c1       ; pointer select between two addresses computed with i64 GEPs
+  %c3 = getelementptr i8, i8* %c2, i64 %gid       ; i8 GEP on the selected pointer
+  store i8 1, i8* %c3, align 4                    ; the checks at the end of the file expect two masked i8 stores of this value
+  ret void
+}
+
+define spir_kernel void @test_negative(i64 %a, i64 %b, i64* %c) {
+entry:                                            ; stores directly through the selected pointer, with no GEP after the select
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %cond = icmp eq i64 %a, %gid                    ; condition computed from the get_global_id(0) result
+  %c0 = getelementptr i64, i64* %c, i64 %gid
+  %c1 = getelementptr i64, i64* %c, i64 0
+  %c2 = select i1 %cond, i64* %c0, i64* %c1       ; the checks at the end of the file expect this select to remain
+  store i64 %b, i64* %c2, align 4
+  ret void
+}
+
+
+define spir_kernel void @test_vector_scalar_cond(i64 %a, <2 x i32> %b, <2 x i32>* %c) {
+entry:                                            ; select + GEP + store pattern on <2 x i32> elements with a scalar i1 condition
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %cond = icmp eq i64 %a, %gid                    ; condition computed from the get_global_id(0) result
+  %c0 = getelementptr <2 x i32>, <2 x i32>* %c, i64 %gid
+  %c1 = getelementptr <2 x i32>, <2 x i32>* %c, i64 0
+  %c2 = select i1 %cond, <2 x i32>* %c0, <2 x i32>* %c1
+  %c3 = getelementptr <2 x i32>, <2 x i32>* %c2, i64 %gid
+  store <2 x i32> <i32 1, i32 0>, <2 x i32>* %c3, align 4   ; vector store; the checks at the end of the file expect the select and this store to remain
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_test_positive(i64 %a, i64 %b, ptr %c)
+; CHECK: %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: %cond = icmp eq i64 %a, %gid
+; CHECK: %c0 = getelementptr i64, ptr %c, i64 %gid
+; CHECK: store i64 %b, ptr %c0, align 4
+; CHECK: %c1 = getelementptr i64, ptr %c, i64 0
+; CHECK: store i64 0, ptr %c1, align 4
+; CHECK: %[[XOR:.+]] = xor i1 %cond, true
+; CHECK: %[[GEP1:.+]] = getelementptr i64, ptr %c0, i64 %gid
+; CHECK: %[[GEP2:.+]] = getelementptr i64, ptr %c1, i64 %gid
+; CHECK: call void @__vecz_b_masked_store4_mu3ptrb(i64 1, ptr %[[GEP1]], i1 %cond)
+; CHECK: call void @__vecz_b_masked_store4_mu3ptrb(i64 1, ptr %[[GEP2]], i1 %[[XOR]])
+
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_test_positive_gep_different_type(i64 %a, i64 %b, ptr %c)
+; CHECK: %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: %cond = icmp eq i64 %a, %gid
+; CHECK: %c0 = getelementptr i64, ptr %c, i64 %gid
+; CHECK: store i64 %b, ptr %c0, align 4
+; CHECK: %c1 = getelementptr i64, ptr %c, i64 0
+; CHECK: store i64 0, ptr %c1, align 4
+; CHECK: %[[XOR:.+]] = xor i1 %cond, true
+; CHECK: %[[GEP1:.+]] = getelementptr i8, ptr %c0, i64 %gid
+; CHECK: %[[GEP2:.+]] = getelementptr i8, ptr %c1, i64 %gid
+; CHECK: call void @__vecz_b_masked_store4_hu3ptrb(i8 1, ptr %[[GEP1]], i1 %cond)
+; CHECK: call void @__vecz_b_masked_store4_hu3ptrb(i8 1, ptr %[[GEP2]], i1 %[[XOR]])
+
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_test_negative(i64 %a, i64 %b, ptr %c)
+; CHECK: %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: %cond = icmp eq i64 %a, %gid
+; CHECK: %c0 = getelementptr i64, ptr %c, i64 %gid
+; CHECK: %c1 = getelementptr i64, ptr %c, i64 0
+; CHECK: %c2 = select i1 %cond, ptr %c0, ptr %c1
+; CHECK: store i64 %b, ptr %c2, align 4
+
+; Note: we don't perform this transform on vector accesses - see CA-4337.
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_test_vector_scalar_cond(i64 %a, <2 x i32> %b, ptr %c)
+; CHECK:   %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK:   %cond = icmp eq i64 %a, %gid
+; CHECK:   %c0 = getelementptr <2 x i32>, ptr %c, i64 %gid
+; CHECK:   %c1 = getelementptr <2 x i32>, ptr %c, i64 0
+; CHECK:   %c2 = select i1 %cond, ptr %c0, ptr %c1
+; CHECK:   %c3 = getelementptr <2 x i32>, ptr %c2, i64 %gid
+; CHECK:   store <2 x i32> <i32 1, i32 0>, ptr %c3, align 4
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store.ll
new file mode 100644
index 0000000000000..e2e11f816701d
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store.ll
@@ -0,0 +1,82 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k f -vecz-simd-width=4 -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) #0 {
+entry:                                            ; a[id] -= fmuladd(mix of two loads of b[id], c[id], d[id] / e[id]), with a scalar store into b[id] between the two loads
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #3
+  %add.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call  ; %b indexed by the get_global_id(0) result
+  %.cast = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %add.ptr, i64 0, i64 0  ; address of element 0 of that vector
+  %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
+  call spir_func void @_Z7barrierj(i32 2) #3
+  store double 1.600000e+01, double addrspace(1)* %.cast, align 8  ; scalar store to element 0 only; presumably the source of the interleaved store checked at the end of the file - TODO confirm
+  %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32  ; same address loaded again after the scalar store
+  %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>  ; lanes 0-1 from %0, lane 2 from %1, lane 3 undefined
+  %vecins7 = shufflevector <4 x double> %vecins5, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 7>  ; lane 3 filled from %1
+  %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call
+  %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+  %arrayidx8 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %d, i64 %call
+  %3 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx8, align 32
+  %arrayidx9 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %e, i64 %call
+  %4 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx9, align 32
+  %div = fdiv <4 x double> %3, %4
+  %5 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %vecins7, <4 x double> %2, <4 x double> %div)
+  %arrayidx10 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %a, i64 %call
+  %6 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx10, align 32
+  %sub = fsub <4 x double> %6, %5
+  store <4 x double> %sub, <4 x double> addrspace(1)* %arrayidx10, align 32  ; result written back to the %a element it was read from
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+declare spir_func void @_Z7barrierj(i32) #1
+
+; Function Attrs: nounwind readnone
+declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) #2
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone }
+attributes #3 = { nobuiltin nounwind }
+
+!opencl.kernels = !{!0}
+!llvm.ident = !{!6}
+
+!0 = !{void (<4 x double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x double> addrspace(1)*, i8 addrspace(1)*)* @f, !1, !2, !3, !4, !5}
+!1 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1, i32 1, i32 1, i32 1}
+!2 = !{!"kernel_arg_access_qual", !"none", !"none", !"none", !"none", !"none", !"none"}
+!3 = !{!"kernel_arg_type", !"double4*", !"double4*", !"double4*", !"double4*", !"double4*", !"char*"}
+!4 = !{!"kernel_arg_base_type", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"char*"}
+!5 = !{!"kernel_arg_type_qual", !"", !"", !"", !"", !"", !""}
+!6 = !{!"clang version 3.8.1 "}
+
+; Check that the interleaved store builtin is defined as a masked scatter over addresses 4 doubles apart
+; CHECK: define void @__vecz_b_interleaved_store8_4_Dv4_du3ptrU3AS1(<4 x double>{{( %0)?}}, ptr addrspace(1){{( %1)?}})
+; CHECK: entry:
+; CHECK: %BroadcastAddr.splatinsert = insertelement <4 x ptr addrspace(1)> {{poison|undef}}, ptr addrspace(1) %1, {{i32|i64}} 0
+; CHECK: %BroadcastAddr.splat = shufflevector <4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <4 x ptr addrspace(1)> {{poison|undef}}, <4 x i32> zeroinitializer
+; CHECK: %2 = getelementptr double, <4 x ptr addrspace(1)> %BroadcastAddr.splat, <4 x i64> <i64 0, i64 4, i64 8, i64 12>
+; CHECK: call void @llvm.masked.scatter.v4f64.v4p1(<4 x double> %0, <4 x ptr addrspace(1)> %2, i32{{( immarg)?}} 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) #[[ATTRS:[0-9]+]]
+; CHECK: ret void
+
+; CHECK: attributes #[[ATTRS]] = {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store_as_masked.ll
new file mode 100644
index 0000000000000..a612d7601767f
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store_as_masked.ll
@@ -0,0 +1,82 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k f -vecz-simd-width=4 -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) #0 {
+entry:                                            ; a[id] -= fmuladd(mix of two loads of b[id], c[id], d[id] / e[id]), with a scalar store into b[id] between the two loads
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #3
+  %add.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call  ; %b indexed by the get_global_id(0) result
+  %.cast = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %add.ptr, i64 0, i64 0  ; address of element 0 of that vector
+  %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
+  call spir_func void @_Z7barrierj(i32 2) #3
+  store double 1.600000e+01, double addrspace(1)* %.cast, align 8  ; scalar store to element 0 only; presumably the source of the interleaved store checked at the end of the file - TODO confirm
+  %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32  ; same address loaded again after the scalar store
+  %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>  ; lanes 0-1 from %0, lane 2 from %1, lane 3 undefined
+  %vecins7 = shufflevector <4 x double> %vecins5, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 7>  ; lane 3 filled from %1
+  %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call
+  %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+  %arrayidx8 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %d, i64 %call
+  %3 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx8, align 32
+  %arrayidx9 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %e, i64 %call
+  %4 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx9, align 32
+  %div = fdiv <4 x double> %3, %4
+  %5 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %vecins7, <4 x double> %2, <4 x double> %div)
+  %arrayidx10 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %a, i64 %call
+  %6 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx10, align 32
+  %sub = fsub <4 x double> %6, %5
+  store <4 x double> %sub, <4 x double> addrspace(1)* %arrayidx10, align 32  ; result written back to the %a element it was read from
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+declare spir_func void @_Z7barrierj(i32) #1
+
+; Function Attrs: nounwind readnone
+declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) #2
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone }
+attributes #3 = { nobuiltin nounwind }
+
+!opencl.kernels = !{!0}
+!llvm.ident = !{!6}
+
+!0 = !{void (<4 x double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x double> addrspace(1)*, i8 addrspace(1)*)* @f, !1, !2, !3, !4, !5}
+!1 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1, i32 1, i32 1, i32 1}
+!2 = !{!"kernel_arg_access_qual", !"none", !"none", !"none", !"none", !"none", !"none"}
+!3 = !{!"kernel_arg_type", !"double4*", !"double4*", !"double4*", !"double4*", !"double4*", !"char*"}
+!4 = !{!"kernel_arg_base_type", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"char*"}
+!5 = !{!"kernel_arg_type_qual", !"", !"", !"", !"", !"", !""}
+!6 = !{!"clang version 3.8.1 "}
+
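+; Check that the interleaved store builtin is defined as a masked scatter. NOTE(review): the run line, kernel and checks match define_interleaved_store.ll - confirm whether a distinct configuration was intended.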
+; CHECK: define void @__vecz_b_interleaved_store8_4_Dv4_du3ptrU3AS1(<4 x double>{{( %0)?}}, ptr addrspace(1){{( %1)?}})
+; CHECK: entry:
+; CHECK: %BroadcastAddr.splatinsert = insertelement <4 x ptr addrspace(1)> {{poison|undef}}, ptr addrspace(1) %1, {{i32|i64}} 0
+; CHECK:  %BroadcastAddr.splat = shufflevector <4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <4 x ptr addrspace(1)> {{poison|undef}}, <4 x i32> zeroinitializer
+; CHECK:  %2 = getelementptr double, <4 x ptr addrspace(1)> %BroadcastAddr.splat, <4 x i64> <i64 0, i64 4, i64 8, i64 12>
+; CHECK:  call void @llvm.masked.scatter.v4f64.v4p1(<4 x double> %0, <4 x ptr addrspace(1)> %2, i32{{( immarg)?}} 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) #[[ATTRS:[0-9]+]]
+; CHECK: ret void
+
+; CHECK: attributes #[[ATTRS]] = {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_uniform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_uniform.ll
new file mode 100644
index 0000000000000..3440be62739c1
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_uniform.ll
@@ -0,0 +1,87 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k vector_loop -vecz-simd-width=4 -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @vector_loop(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+entry:                                            ; loop over a <4 x i32> counter; each iteration copies in[v] to out[v] for every lane v of the counter
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %cmp = icmp eq i64 %call, 0
+  br i1 %cmp, label %for.end, label %for.cond    ; the loop is skipped when the get_global_id(0) result is 0
+
+for.cond:                                         ; preds = %entry, %for.body
+  %storemerge = phi <4 x i32> [ %inc, %for.body ], [ zeroinitializer, %entry ]  ; vector counter: starts at zero, advanced by %inc; neither incoming value is computed from get_global_id
+  %call1 = call spir_func i64 @_Z15get_global_sizej(i32 0)
+  %conv = trunc i64 %call1 to i32
+  %splat.splatinsert = insertelement <4 x i32> undef, i32 %conv, i32 0
+  %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer  ; truncated get_global_size(0) result broadcast to all four lanes
+  %cmp2 = icmp slt <4 x i32> %storemerge, %splat.splat
+  %0 = extractelement <4 x i1> %cmp2, i64 0       ; only lane 0 of the comparison decides whether the loop continues
+  br i1 %0, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = extractelement <4 x i32> %storemerge, i64 0  ; lane 0: out[v0] = in[v0]
+  %idxprom = sext i32 %1 to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom
+  %2 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %3 = extractelement <4 x i32> %storemerge, i64 0
+  %idxprom3 = sext i32 %3 to i64
+  %arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom3
+  store i32 %2, i32 addrspace(1)* %arrayidx4, align 4
+  %4 = extractelement <4 x i32> %storemerge, i64 1  ; lane 1: out[v1] = in[v1]
+  %idxprom5 = sext i32 %4 to i64
+  %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom5
+  %5 = load i32, i32 addrspace(1)* %arrayidx6, align 4
+  %6 = extractelement <4 x i32> %storemerge, i64 1
+  %idxprom7 = sext i32 %6 to i64
+  %arrayidx8 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom7
+  store i32 %5, i32 addrspace(1)* %arrayidx8, align 4
+  %7 = extractelement <4 x i32> %storemerge, i64 2  ; lane 2: out[v2] = in[v2]
+  %idxprom9 = sext i32 %7 to i64
+  %arrayidx10 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom9
+  %8 = load i32, i32 addrspace(1)* %arrayidx10, align 4
+  %9 = extractelement <4 x i32> %storemerge, i64 2
+  %idxprom11 = sext i32 %9 to i64
+  %arrayidx12 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom11
+  store i32 %8, i32 addrspace(1)* %arrayidx12, align 4
+  %10 = extractelement <4 x i32> %storemerge, i64 3  ; lane 3: out[v3] = in[v3]
+  %idxprom13 = sext i32 %10 to i64
+  %arrayidx14 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom13
+  %11 = load i32, i32 addrspace(1)* %arrayidx14, align 4
+  %12 = extractelement <4 x i32> %storemerge, i64 3
+  %idxprom15 = sext i32 %12 to i64
+  %arrayidx16 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom15
+  store i32 %11, i32 addrspace(1)* %arrayidx16, align 4
+  %inc = add <4 x i32> %storemerge, <i32 1, i32 1, i32 1, i32 1>  ; every lane of the counter is incremented by 1
+  br label %for.cond
+
+for.end:                                          ; preds = %entry, %for.cond
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func i64 @_Z15get_global_sizej(i32)
+
+; This test checks that a uniform <4 x i32> phi is not scalarized
+; CHECK: define spir_kernel void @__vecz_v4_vector_loop
+; CHECK: %[[STOREMERGE:.+]] = phi <4 x i32> [ %[[INC:.+]], %for.body ], [ zeroinitializer, %entry.ROSCC ]
+; CHECK: %[[INC]] = add <4 x i32> %[[STOREMERGE]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_varying.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_varying.ll
new file mode 100644
index 0000000000000..a5583b224d23c
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_varying.ll
@@ -0,0 +1,94 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k vector_loop -vecz-simd-width=4 -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @vector_loop(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %in2, i32 addrspace(1)* %out) {
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %initaddr = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in2, i64 %call
+  %init = load <4 x i32>, <4 x i32> addrspace(1)* %initaddr ; address is indexed by the global id
+  %cmp = icmp eq i64 %call, 0
+  br i1 %cmp, label %for.end, label %for.cond
+
+for.cond:                                         ; preds = %entry, %for.body
+  %storemerge = phi <4 x i32> [ %inc, %for.body ], [ %init, %entry ] ; loop-carried vector, seeded from %init
+  %call1 = call spir_func i64 @_Z15get_global_sizej(i32 0)
+  %conv = trunc i64 %call1 to i32
+  %0 = extractelement <4 x i32> %storemerge, i64 0 ; only element 0 drives the loop exit test
+  %cmp2 = icmp slt i32 %0, %conv
+  br i1 %cmp2, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = extractelement <4 x i32> %storemerge, i64 0 ; element 0: out[idx] = in[idx]
+  %idxprom = sext i32 %1 to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom
+  %2 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %3 = extractelement <4 x i32> %storemerge, i64 0
+  %idxprom3 = sext i32 %3 to i64
+  %arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom3
+  store i32 %2, i32 addrspace(1)* %arrayidx4, align 4
+  %4 = extractelement <4 x i32> %storemerge, i64 1 ; element 1: out[idx] = in[idx]
+  %idxprom5 = sext i32 %4 to i64
+  %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom5
+  %5 = load i32, i32 addrspace(1)* %arrayidx6, align 4
+  %6 = extractelement <4 x i32> %storemerge, i64 1
+  %idxprom7 = sext i32 %6 to i64
+  %arrayidx8 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom7
+  store i32 %5, i32 addrspace(1)* %arrayidx8, align 4
+  %7 = extractelement <4 x i32> %storemerge, i64 2 ; element 2: out[idx] = in[idx]
+  %idxprom9 = sext i32 %7 to i64
+  %arrayidx10 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom9
+  %8 = load i32, i32 addrspace(1)* %arrayidx10, align 4
+  %9 = extractelement <4 x i32> %storemerge, i64 2
+  %idxprom11 = sext i32 %9 to i64
+  %arrayidx12 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom11
+  store i32 %8, i32 addrspace(1)* %arrayidx12, align 4
+  %10 = extractelement <4 x i32> %storemerge, i64 3 ; element 3: out[idx] = in[idx]
+  %idxprom13 = sext i32 %10 to i64
+  %arrayidx14 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom13
+  %11 = load i32, i32 addrspace(1)* %arrayidx14, align 4
+  %12 = extractelement <4 x i32> %storemerge, i64 3
+  %idxprom15 = sext i32 %12 to i64
+  %arrayidx16 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom15
+  store i32 %11, i32 addrspace(1)* %arrayidx16, align 4
+  %inc = add <4 x i32> %storemerge, <i32 1, i32 1, i32 1, i32 1> ; whole-vector use: bump every element by 1
+  br label %for.cond
+
+for.end:                                          ; preds = %entry, %for.cond
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func i64 @_Z15get_global_sizej(i32)
+
+; This test checks that a varying <4 x i32> phi gets scalarized
+; when it is only accessed through individually extracted elements.
+; CHECK: define spir_kernel void @__vecz_v4_vector_loop
+; CHECK: %storemerge{{[0-9]+}} = phi <4 x i32> [ %{{[0-9]+}}, %entry.ROSCC ], [ %inc{{[0-9]+}}, %for.cond ]
+; CHECK: %storemerge{{[0-9]+}} = phi <4 x i32> [ %{{[0-9]+}}, %entry.ROSCC ], [ %inc{{[0-9]+}}, %for.cond ]
+; CHECK: %storemerge{{[0-9]+}} = phi <4 x i32> [ %{{[0-9]+}}, %entry.ROSCC ], [ %inc{{[0-9]+}}, %for.cond ]
+; CHECK: %storemerge{{[0-9]+}} = phi <4 x i32> [ %{{[0-9]+}}, %entry.ROSCC ], [ %inc{{[0-9]+}}, %for.cond ]
+; CHECK: %inc{{[0-9]+}} = add <4 x i32> %storemerge{{[0-9]+}}, <i32 1, i32 1, i32 1, i32 1>
+; CHECK: %inc{{[0-9]+}} = add <4 x i32> %storemerge{{[0-9]+}}, <i32 1, i32 1, i32 1, i32 1>
+; CHECK: %inc{{[0-9]+}} = add <4 x i32> %storemerge{{[0-9]+}}, <i32 1, i32 1, i32 1, i32 1>
+; CHECK: %inc{{[0-9]+}} = add <4 x i32> %storemerge{{[0-9]+}}, <i32 1, i32 1, i32 1, i32 1>
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/broadcast_vector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/broadcast_vector.ll
new file mode 100644
index 0000000000000..9b330a3408c9f
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/broadcast_vector.ll
@@ -0,0 +1,209 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; REQUIRES: llvm-13+
+; RUN: %veczc -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+define dso_local spir_kernel void @vector_broadcast_const(<4 x float> addrspace(1)* nocapture readonly %in, <4 x float> addrspace(1)* nocapture %out) local_unnamed_addr #0 {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x float> addrspace(1)*
+  %1 = load <4 x float>, <4 x float> addrspace(1)* %0, align 16
+  %2 = fadd <4 x float> %1, <float 0x7FF0000020000000, float 0x7FF0000020000000, float 0x7FF0000020000000, float 0x7FF0000020000000>
+  %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call
+  store <4 x float> %2, <4 x float> addrspace(1)* %arrayidx3, align 16
+  ret void
+}
+
+define dso_local spir_kernel void @vector_broadcast(<4 x float> addrspace(1)* nocapture readonly %in, <4 x float> %addend, <4 x float> addrspace(1)* nocapture %out) local_unnamed_addr #0 {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x float> addrspace(1)*
+  %1 = load <4 x float>, <4 x float> addrspace(1)* %0, align 16
+  %2 = fadd <4 x float> %1, %addend
+  %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call
+  store <4 x float> %2, <4 x float> addrspace(1)* %arrayidx3, align 16
+  ret void
+}
+
+define dso_local spir_kernel void @vector_broadcast_illegal(<32 x float> addrspace(1)* nocapture readonly %in, <32 x float> %addend, <32 x float> addrspace(1)* nocapture %out) local_unnamed_addr #0 {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %arrayidx = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %in, i64 %call
+  %0 = bitcast <32 x float> addrspace(1)* %arrayidx to <32 x float> addrspace(1)*
+  %1 = load <32 x float>, <32 x float> addrspace(1)* %0, align 64
+  %2 = fadd <32 x float> %1, %addend
+  %arrayidx3 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %out, i64 %call
+  store <32 x float> %2, <32 x float> addrspace(1)* %arrayidx3, align 64
+  ret void
+}
+
+define dso_local spir_kernel void @vector_broadcast_regression(<4 x float> addrspace(1)* nocapture readonly %in, i32 %nancode, <4 x float> addrspace(1)* nocapture %out) local_unnamed_addr #0 {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x i32> addrspace(1)*
+  %1 = load <4 x i32>, <4 x i32> addrspace(1)* %0, align 16
+  %and1.i.i.i1.i = and <4 x i32> %1, <i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040>
+  %cmp.i.i.i2.i = icmp ne <4 x i32> %and1.i.i.i1.i, <i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040>
+  %and2.i.i.i3.i = and <4 x i32> %1, <i32 8388607, i32 8388607, i32 8388607, i32 8388607>
+  %cmp3.i.i.i4.i = icmp eq <4 x i32> %and2.i.i.i3.i, zeroinitializer
+  %2 = or <4 x i1> %cmp.i.i.i2.i, %cmp3.i.i.i4.i
+  %3 = bitcast <4 x i32> %1 to <4 x float>
+  %4 = select <4 x i1> %2, <4 x float> %3, <4 x float> <float 0x7FF0000020000000, float 0x7FF0000020000000, float 0x7FF0000020000000, float 0x7FF0000020000000>
+  %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call
+  store <4 x float> %4, <4 x float> addrspace(1)* %arrayidx3, align 16
+  ret void
+}
+
+; Check that new instructions aren't inserted before pre-existing allocas
+define dso_local spir_kernel void @vector_broadcast_insertpt(<4 x float> addrspace(1)* nocapture readonly %in, <4 x float> %addend, i32 %nancode, <4 x float> addrspace(1)* nocapture %out, <4 x i32> addrspace(1)* nocapture %out2) local_unnamed_addr #0 {
+entry:
+  %existing.alloc = alloca <4 x i32>
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  store <4 x i32> zeroinitializer, <4 x i32>* %existing.alloc
+  %scalar = bitcast <4 x i32>* %existing.alloc to i32*
+  store i32 1, i32* %scalar
+  %v = load <4 x i32>, <4 x i32>* %existing.alloc
+  %arrayidx4 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out2, i64 %call
+  store <4 x i32> %v, <4 x i32> addrspace(1)* %arrayidx4, align 16
+
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %op = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
+  %v4 = fadd <4 x float> %op, %addend
+  %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call
+  store <4 x float> %v4, <4 x float> addrspace(1)* %arrayidx3, align 16
+  ret void
+}
+
+define dso_local spir_kernel void @vector_mask_broadcast(<4 x float> addrspace(1)* nocapture readonly %in, <4 x i1> %input, <4 x float> %woof, <4 x float> addrspace(1)* nocapture %out) local_unnamed_addr #0 {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x float> addrspace(1)*
+  %1 = load <4 x float>, <4 x float> addrspace(1)* %0, align 16
+  %2 = fcmp oeq <4 x float> %1, <float 1.0, float 1.0, float 1.0, float 1.0>
+  %3 = and <4 x i1> %2, %input
+  %4 = select <4 x i1> %3, <4 x float> %1, <4 x float> %woof
+  %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call
+  store <4 x float> %4, <4 x float> addrspace(1)* %arrayidx3, align 16
+  ret void
+}
+; CHECK-LABEL: @__vecz_nxv4_vector_broadcast_const(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CALL:%.*]] = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]]
+; CHECK-NEXT:    store <vscale x 16 x float> shufflevector (<vscale x 16 x float> insertelement (<vscale x 16 x float> {{(undef|poison)}}, float 0x7FF0000020000000, {{i32|i64}} 0), <vscale x 16 x float> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer), ptr addrspace(1) [[ARRAYIDX3]], align 16
+; CHECK-NEXT:    ret void
+;
+; CHECK-LABEL: @__vecz_nxv4_vector_broadcast(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:  [[VS2:%.*]] = call <vscale x 16 x float> @llvm.{{(experimental.)?}}vector.insert.nxv16f32.v4f32(<vscale x 16 x float> poison, <4 x float> [[ADDEND:%.*]], i64 0)
+; CHECK-NEXT:  [[IDX0:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
+; CHECK-NEXT:  [[VS1:%.*]] = and <vscale x 16 x i32> [[IDX0]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 3, {{i32|i64}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:  [[XLEN:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:  [[TMP0:%.*]] = shl i64 [[XLEN]], 4
+; CHECK-NEXT:  [[TMP1:%.*]] = call <vscale x 16 x float> @llvm.riscv.vrgather.vv.nxv16f32.i64(<vscale x 16 x float> undef, <vscale x 16 x float> [[VS2]], <vscale x 16 x i32> [[VS1]], i64 [[TMP0]])
+; CHECK-NEXT:  [[CALL:%.*]] = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK-NEXT:  [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]]
+; CHECK-NEXT:  [[TMP3:%.*]] = load <vscale x 16 x float>, ptr addrspace(1) [[ARRAYIDX]], align 4
+; CHECK-NEXT:  [[TMP4:%.*]] = fadd <vscale x 16 x float> [[TMP3]], [[TMP1]]
+; CHECK-NEXT:  [[ARRAYIDX3:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]]
+; CHECK-NEXT:  store <vscale x 16 x float> [[TMP4]], ptr addrspace(1) [[ARRAYIDX3]], align 16
+; CHECK-NEXT:  ret void
+;
+; CHECK-LABEL: @__vecz_nxv4_vector_broadcast_illegal(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[FIXLEN_ALLOC:%.*]] = alloca <32 x float>, align 128
+; CHECK-NEXT:    store <32 x float> [[ADDEND:%.*]], ptr [[FIXLEN_ALLOC]], align 128
+; CHECK-NEXT:    [[IDX0:%.*]] = call <vscale x 128 x i32> @llvm.experimental.stepvector.nxv128i32()
+; CHECK-NEXT:    [[IDX1:%.*]] = and <vscale x 128 x i32> [[IDX0]], shufflevector (<vscale x 128 x i32> insertelement (<vscale x 128 x i32> {{(undef|poison)}}, i32 31, {{i32|i64}} 0), <vscale x 128 x i32> {{(undef|poison)}}, <vscale x 128 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP0:%.*]] = {{s|z}}ext <vscale x 128 x i32> [[IDX1]] to <vscale x 128 x i64>
+; CHECK-NEXT:    [[VEC_ALLOC:%.*]] = getelementptr inbounds float, ptr [[FIXLEN_ALLOC]], <vscale x 128 x i64> [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 128 x float> @llvm.masked.gather.nxv128f32.nxv128p0(<vscale x 128 x ptr> [[VEC_ALLOC]], i32 4, <vscale x 128 x i1> shufflevector (<vscale x 128 x i1> insertelement (<vscale x 128 x i1> poison, i1 true, {{i32|i64}} 0), <vscale x 128 x i1> poison, <vscale x 128 x i32> zeroinitializer), <vscale x 128 x float> undef)
+; CHECK-NEXT:    [[CALL:%.*]] = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds <32 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load <vscale x 128 x float>, ptr addrspace(1) [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <vscale x 128 x float> [[TMP3]], [[TMP1]]
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds <32 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]]
+; CHECK-NEXT:    store <vscale x 128 x float> [[TMP4]], ptr addrspace(1) [[ARRAYIDX3]], align 64
+; CHECK-NEXT:    ret void
+;
+; CHECK-LABEL: @__vecz_nxv4_vector_broadcast_regression(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CALL:%.*]] = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 16 x i32>, ptr addrspace(1) [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[AND1_I_I_I1_I1:%.*]] = and <vscale x 16 x i32> [[TMP1]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 2139095040, {{i32|i64}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[CMP_I_I_I2_I2:%.*]] = icmp ne <vscale x 16 x i32> [[AND1_I_I_I1_I1]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 2139095040, {{i32|i64}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[AND2_I_I_I3_I3:%.*]] = and <vscale x 16 x i32> [[TMP1]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 8388607, {{i32|i64}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[CMP3_I_I_I4_I4:%.*]] = icmp eq <vscale x 16 x i32> [[AND2_I_I_I3_I3]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = or <vscale x 16 x i1> [[CMP_I_I_I2_I2]], [[CMP3_I_I_I4_I4]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <vscale x 16 x i32> [[TMP1]] to <vscale x 16 x float>
+; CHECK-NEXT:    [[TMP4:%.*]] = select <vscale x 16 x i1> [[TMP2]], <vscale x 16 x float> [[TMP3]], <vscale x 16 x float> shufflevector (<vscale x 16 x float> insertelement (<vscale x 16 x float> {{(undef|poison)}}, float 0x7FF0000020000000, {{i32|i64}} 0), <vscale x 16 x float> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]]
+; CHECK-NEXT:    store <vscale x 16 x float> [[TMP4]], ptr addrspace(1) [[ARRAYIDX3]], align 16
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: @__vecz_nxv4_vector_broadcast_insertpt(
+; CHECK-NEXT: entry:
+; CHECK-NEXT:  [[EXISTINGALLOC:%.*]] = alloca <4 x i32>, align 16
+; CHECK-NEXT:  [[VS21:%.*]] = call <vscale x 16 x float> @llvm.{{(experimental.)?}}vector.insert.nxv16f32.v4f32(<vscale x 16 x float> poison, <4 x float> [[ADDEND:%.*]], i64 0)
+; CHECK-NEXT:  [[IDX02:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
+; CHECK-NEXT:  [[VS13:%.*]] = and <vscale x 16 x i32> [[IDX02]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 3, {{i32|i64}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:  [[XLEN:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:  [[TMP0:%.*]] = shl i64 [[XLEN]], 4
+; CHECK-NEXT:  [[TMP1:%.*]] = call <vscale x 16 x float> @llvm.riscv.vrgather.vv.nxv16f32.i64(<vscale x 16 x float> undef, <vscale x 16 x float> [[VS21]], <vscale x 16 x i32> [[VS13]], i64 [[TMP0]])
+; CHECK-NEXT:  [[CALL:%.*]] = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK-NEXT:  store <4 x i32> zeroinitializer, ptr [[EXISTINGALLOC]], align 16
+; CHECK-NEXT:  store i32 1, ptr [[EXISTINGALLOC]], align 16
+; CHECK-NEXT:  [[V:%.*]] = load <4 x i32>, ptr [[EXISTINGALLOC]], align 16
+; CHECK-NEXT:  [[VS2:%.*]] = call <vscale x 16 x i32> @llvm.{{(experimental.)?}}vector.insert.nxv16i32.v4i32(<vscale x 16 x i32> poison, <4 x i32> [[V]], i64 0)
+; CHECK-NEXT:  [[TMP2:%.*]] = call <vscale x 16 x i32> @llvm.riscv.vrgather.vv.nxv16i32.i64(<vscale x 16 x i32> undef, <vscale x 16 x i32> [[VS2]], <vscale x 16 x i32> [[VS13]], i64 [[TMP0]])
+; CHECK-NEXT:  [[ARRAYIDX4:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[OUT2:%.*]], i64 [[CALL]]
+; CHECK-NEXT:  store <vscale x 16 x i32> [[TMP2]], ptr addrspace(1) [[ARRAYIDX4]], align 16
+; CHECK-NEXT:  [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]]
+; CHECK-NEXT:  [[TMP5:%.*]] = load <vscale x 16 x float>, ptr addrspace(1) [[ARRAYIDX]], align 4
+; CHECK-NEXT:  [[V44:%.*]] = fadd <vscale x 16 x float> [[TMP5]], [[TMP1]]
+; CHECK-NEXT:  [[ARRAYIDX3:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]]
+; CHECK-NEXT:  store <vscale x 16 x float> [[V44]], ptr addrspace(1) [[ARRAYIDX3]], align 16
+; CHECK-NEXT:  ret void
+;
+; CHECK-LABEL: @__vecz_nxv4_vector_mask_broadcast(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[VS21:%.*]] = call <vscale x 16 x float> @llvm.{{(experimental.)?}}vector.insert.nxv16f32.v4f32(<vscale x 16 x float> poison, <4 x float> [[WOOF:%.*]], i64 0)
+; CHECK-NEXT:    [[IDX02:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
+; CHECK-NEXT:    [[VS13:%.*]] = and <vscale x 16 x i32> [[IDX02]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 3, {{i32|i64}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[XLEN4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP0:%.*]] = shl i64 [[XLEN4]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 16 x float> @llvm.riscv.vrgather.vv.nxv16f32.i64(<vscale x 16 x float> undef, <vscale x 16 x float> [[VS21]], <vscale x 16 x i32> [[VS13]], i64 [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = sext <4 x i1> [[INPUT:%.*]] to <4 x i8>
+; CHECK-NEXT:    [[VS2:%.*]] = call <vscale x 16 x i8> @llvm.{{(experimental.)?}}vector.insert.nxv16i8.v4i8(<vscale x 16 x i8> poison, <4 x i8> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[IDX0:%.*]] = call <vscale x 16 x i16> @llvm.experimental.stepvector.nxv16i16()
+; CHECK-NEXT:    [[VS1:%.*]] = and <vscale x 16 x i16> [[IDX0]], shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> {{(undef|poison)}}, i16 3, {{i32|i64}} 0), <vscale x 16 x i16> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 16 x i8> @llvm.riscv.vrgatherei16.vv.nxv16i8.i64(<vscale x 16 x i8> undef, <vscale x 16 x i8> [[VS2]], <vscale x 16 x i16> [[VS1]], i64 [[TMP0]])
+; CHECK: [[TMP4:%.*]] = trunc <vscale x 16 x i8> [[TMP3]] to <vscale x 16 x i1>
+; CHECK: [[TMP5:%.*]] = fcmp oeq <vscale x 16 x float>
+; CHECK: [[TMP8:%.*]] = and <vscale x 16 x i1> [[TMP5]], [[TMP4]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans.ll
new file mode 100644
index 0000000000000..2b90bb3d118a8
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans.ll
@@ -0,0 +1,183 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; REQUIRES: llvm-13+
+; RUN: %veczc -k dummy -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -vecz-passes=define-builtins -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "riscv64-unknown-unknown"
+
+define spir_kernel void @dummy(i32 addrspace(2)* %in, i32 addrspace(1)* %out) {
+  ; Dummy calls to each scan builtin under test: builtins with zero uses are not defined.
+  %a = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_add_u5nxv4j(<vscale x 4 x i32> zeroinitializer)
+  %b = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_add_u5nxv4j(<vscale x 4 x i32> zeroinitializer)
+  %c = call <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_add_u5nxv4f(<vscale x 4 x float> zeroinitializer)
+  %d = call <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_add_u5nxv4f(<vscale x 4 x float> zeroinitializer)
+  %e = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_u5nxv4j(<vscale x 4 x i32> zeroinitializer)
+  %f = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_smin_u5nxv4j(<vscale x 4 x i32> zeroinitializer)
+  %g = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smax_u5nxv4j(<vscale x 4 x i32> zeroinitializer)
+  %h = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umin_u5nxv4j(<vscale x 4 x i32> zeroinitializer)
+  %i = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umax_u5nxv4j(<vscale x 4 x i32> zeroinitializer)
+  %j = call <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_min_u5nxv4f(<vscale x 4 x float> zeroinitializer)
+  %k = call <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_max_u5nxv4f(<vscale x 4 x float> zeroinitializer)
+  %l = call <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_min_u5nxv4f(<vscale x 4 x float> zeroinitializer)
+  %m = call <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_max_u5nxv4f(<vscale x 4 x float> zeroinitializer)
+  ret void
+}
+
+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_add_u5nxv4j(<vscale x 4 x i32>)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_add_u5nxv4j(<vscale x 4 x i32>{{.*}}) {
+; CHECK: entry:
+; CHECK:   %[[STEP:.+]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; CHECK:   %[[SCALE:.+]] = call i32 @llvm.vscale.i32()
+; CHECK:   %[[SIZE:.+]] = mul i32 %[[SCALE]], 4
+; CHECK:   br label %loop
+; CHECK: loop:
+; CHECK:   %[[IV:.+]] = phi i32 [ 1, %entry ], [ %[[N2:.+]], %loop ]
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ], [ %[[NEWVEC:.+]], %loop ]
+; CHECK:   %[[MASKPHI:.+]] = phi <vscale x 4 x i32> [ %[[STEP]], %entry ], [ %[[NEWMASK:.+]], %loop ]
+; CHECK:   %[[N_INS:.+]] = insertelement <vscale x 4 x i32> poison, i32 %[[IV]], {{i32|i64}} 0
+; CHECK:   %[[N_SPLAT:.+]] = shufflevector <vscale x 4 x i32> %[[N_INS]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK:   %[[MASK:.+]] = xor <vscale x 4 x i32> %[[MASKPHI]], %[[N_SPLAT]]
+
+;------- target-dependent dynamic shuffle code:
+; CHECK:   %[[VLSCALE:.+]] = call i64 @llvm.vscale.i64()
+; CHECK:   %[[VL:.+]] = mul i64 %[[VLSCALE]], 4
+; CHECK:   %[[SHUFFLE:.+]] = call <vscale x 4 x i32> @llvm.riscv.vrgather.vv.nxv4i32.i64({{(<vscale x 4 x i32> undef, )?}}<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %[[MASK]], i64 %[[VL]])
+
+; CHECK:   %[[ACCUM:.+]] = add <vscale x 4 x i32> %[[VEC]], %{{.+}}
+; CHECK:   %[[BIT:.+]] = and <vscale x 4 x i32> %[[MASKPHI]], %[[N_SPLAT]]
+; CHECK:   %[[WHICH:.+]] = icmp ne <vscale x 4 x i32> %[[BIT]], zeroinitializer
+; CHECK:   %[[NEWVEC]] = select <vscale x 4 x i1> %[[WHICH]], <vscale x 4 x i32> %[[ACCUM]], <vscale x 4 x i32> %[[VEC]]
+; CHECK:   %[[NEWMASK]] = or <vscale x 4 x i32> %[[MASK]], %[[N_SPLAT]]
+; CHECK:   %[[N2]] = shl nuw nsw i32 %[[IV]], 1
+; CHECK:   %[[CMP:.+]] = icmp ult i32 %[[N2]], %[[SIZE]]
+; CHECK:   br i1 %[[CMP]], label %loop, label %exit
+; CHECK: exit:
+; CHECK:   %[[RESULT:.+]] = phi <vscale x 4 x i32> [ %[[NEWVEC]], %loop ]
+; CHECK:   ret <vscale x 4 x i32> %[[RESULT]]
+; CHECK: }
+
+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_add_u5nxv4j(<vscale x 4 x i32>)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_add_u5nxv4j(<vscale x 4 x i32>{{.*}}) {
+; CHECK: entry:
+; CHECK:   %[[STEP:.+]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; CHECK:   %[[SCALE:.+]] = call i32 @llvm.vscale.i32()
+; CHECK:   %[[SIZE:.+]] = mul i32 %[[SCALE]], 4
+; CHECK:   br label %loop
+; CHECK: loop:
+; CHECK:   %[[IV:.+]] = phi i32 [ 1, %entry ], [ %[[N2:.+]], %loop ]
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ], [ %[[NEWVEC:.+]], %loop ]
+; CHECK:   %[[MASKPHI:.+]] = phi <vscale x 4 x i32> [ %[[STEP]], %entry ], [ %[[NEWMASK:.+]], %loop ]
+; CHECK:   %[[N_INS:.+]] = insertelement <vscale x 4 x i32> poison, i32 %[[IV]], {{i32|i64}} 0
+; CHECK:   %[[N_SPLAT:.+]] = shufflevector <vscale x 4 x i32> %[[N_INS]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK:   %[[MASK:.+]] = xor <vscale x 4 x i32> %[[MASKPHI]], %[[N_SPLAT]]
+
+;------- target-dependent dynamic shuffle code:
+; CHECK:   %[[VLSCALE:.+]] = call i64 @llvm.vscale.i64()
+; CHECK:   %[[VL:.+]] = mul i64 %[[VLSCALE]], 4
+; CHECK:   %[[SHUFFLE:.+]] = call <vscale x 4 x i32> @llvm.riscv.vrgather.vv.nxv4i32.i64({{(<vscale x 4 x i32> undef, )?}}<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %[[MASK]], i64 %[[VL]])
+
+; CHECK:   %[[ACCUM:.+]] = add <vscale x 4 x i32> %[[VEC]], %{{.+}}
+; CHECK:   %[[BIT:.+]] = and <vscale x 4 x i32> %[[MASKPHI]], %[[N_SPLAT]]
+; CHECK:   %[[WHICH:.+]] = icmp ne <vscale x 4 x i32> %[[BIT]], zeroinitializer
+; CHECK:   %[[NEWVEC]] = select <vscale x 4 x i1> %[[WHICH]], <vscale x 4 x i32> %[[ACCUM]], <vscale x 4 x i32> %[[VEC]]
+; CHECK:   %[[NEWMASK]] = or <vscale x 4 x i32> %[[MASK]], %[[N_SPLAT]]
+; CHECK:   %[[N2]] = shl nuw nsw i32 %[[IV]], 1
+; CHECK:   %[[CMP:.+]] = icmp ult i32 %[[N2]], %[[SIZE]]
+; CHECK:   br i1 %[[CMP]], label %loop, label %exit
+; CHECK: exit:
+; CHECK:   %[[SCAN:.+]] = phi <vscale x 4 x i32> [ %[[NEWVEC]], %loop ]
+
+;------- target-dependent slide-up code:
+; CHECK:   %[[VLSCALE2:.+]] = call i64 @llvm.vscale.i64()
+; CHECK:   %[[VL2:.+]] = mul i64 %[[VLSCALE2]], 4
+; CHECK:   %[[RESULT:.+]] = call <vscale x 4 x i32> @llvm.riscv.vslide1up.nxv4i32.i32.i64({{(<vscale x 4 x i32> undef, )?}}<vscale x 4 x i32> %[[SCAN]], i32 0, i64 %[[VL2]])
+
+; CHECK:   ret <vscale x 4 x i32> %[[RESULT]]
+; CHECK: }
+
+; We know the generated code is correct for one scan type,
+; now verify that all the others use the correct binary operations.
+
+declare <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_add_u5nxv4f(<vscale x 4 x float>)
+; CHECK-LABEL: define <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_add_u5nxv4f(<vscale x 4 x float>{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x float> [ %0, %entry ],
+; CHECK:   %{{.+}} = fadd <vscale x 4 x float> %[[VEC]], %{{.+}}
+
+declare <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_add_u5nxv4f(<vscale x 4 x float>)
+; CHECK-LABEL: define <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_add_u5nxv4f(<vscale x 4 x float>{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x float> [ %0, %entry ],
+; CHECK:   %{{.+}} = fadd <vscale x 4 x float> %[[VEC]], %{{.+}}
+
+; Make sure the floating-point version of the slide1up intrinsic is created
+; CHECK:   call <vscale x 4 x float> @llvm.riscv.vfslide1up.nxv4f32.f32.i64({{(<vscale x 4 x float> undef, )?}}<vscale x 4 x float> %{{.+}}, float 0.000000e+00, i64 %{{.+}})
+
+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_u5nxv4j(<vscale x 4 x i32>)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_u5nxv4j(<vscale x 4 x i32>{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ],
+; CHECK:   %{{.+}} = call <vscale x 4 x i32> @llvm.smin.nxv4i32(<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %{{.+}})
+
+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_smin_u5nxv4j(<vscale x 4 x i32>)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_smin_u5nxv4j(<vscale x 4 x i32>{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ],
+; CHECK:   %{{.+}} = call <vscale x 4 x i32> @llvm.smin.nxv4i32(<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %{{.+}})
+
+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smax_u5nxv4j(<vscale x 4 x i32>)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smax_u5nxv4j(<vscale x 4 x i32>{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ],
+; CHECK:   %{{.+}} = call <vscale x 4 x i32> @llvm.smax.nxv4i32(<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %{{.+}})
+
+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umin_u5nxv4j(<vscale x 4 x i32>)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umin_u5nxv4j(<vscale x 4 x i32>{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ],
+; CHECK:   %{{.+}} = call <vscale x 4 x i32> @llvm.umin.nxv4i32(<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %{{.+}})
+
+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umax_u5nxv4j(<vscale x 4 x i32>)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umax_u5nxv4j(<vscale x 4 x i32>{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ],
+; CHECK:   %{{.+}} = call <vscale x 4 x i32> @llvm.umax.nxv4i32(<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %{{.+}})
+
+declare <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_min_u5nxv4f(<vscale x 4 x float>)
+; CHECK-LABEL: define <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_min_u5nxv4f(<vscale x 4 x float>{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x float> [ %0, %entry ],
+; CHECK:   %{{.+}} = call <vscale x 4 x float> @llvm.minnum.nxv4f32(<vscale x 4 x float> %[[VEC]], <vscale x 4 x float> %{{.+}})
+
+declare <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_max_u5nxv4f(<vscale x 4 x float>)
+; CHECK-LABEL: define <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_max_u5nxv4f(<vscale x 4 x float>{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x float> [ %0, %entry ],
+; CHECK:   %{{.+}} = call <vscale x 4 x float> @llvm.maxnum.nxv4f32(<vscale x 4 x float> %[[VEC]], <vscale x 4 x float> %{{.+}})
+
+declare <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_min_u5nxv4f(<vscale x 4 x float>)
+; CHECK-LABEL: define <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_min_u5nxv4f(<vscale x 4 x float>{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x float> [ %0, %entry ],
+; CHECK:   %{{.+}} = call <vscale x 4 x float> @llvm.minnum.nxv4f32(<vscale x 4 x float> %[[VEC]], <vscale x 4 x float> %{{.+}})
+
+declare <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_max_u5nxv4f(<vscale x 4 x float>)
+; CHECK-LABEL: define <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_max_u5nxv4f(<vscale x 4 x float>{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x float> [ %0, %entry ],
+; CHECK:   %{{.+}} = call <vscale x 4 x float> @llvm.maxnum.nxv4f32(<vscale x 4 x float> %[[VEC]], <vscale x 4 x float> %{{.+}})
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans_vp.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans_vp.ll
new file mode 100644
index 0000000000000..4fa347583ea33
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans_vp.ll
@@ -0,0 +1,184 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; REQUIRES: llvm-13+
+; RUN: %veczc -k dummy -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -vecz-passes=define-builtins -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "riscv64-unknown-unknown"
+
+define spir_kernel void @dummy(i32 addrspace(2)* %in, i32 addrspace(1)* %out) {
+  ; One dummy call per builtin under test: builtins with zero uses are not given a definition.
+  %a = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4jj(<vscale x 4 x i32> zeroinitializer, i32 0)
+  %b = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_add_vp_u5nxv4jj(<vscale x 4 x i32> zeroinitializer, i32 0)
+  %c = call <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4fj(<vscale x 4 x float> zeroinitializer, i32 0)
+  %d = call <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_add_vp_u5nxv4fj(<vscale x 4 x float> zeroinitializer, i32 0)
+  %e = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_vp_u5nxv4jj(<vscale x 4 x i32> zeroinitializer, i32 0)
+  %f = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_smin_vp_u5nxv4jj(<vscale x 4 x i32> zeroinitializer, i32 0)
+  %g = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smax_vp_u5nxv4jj(<vscale x 4 x i32> zeroinitializer, i32 0)
+  %h = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umin_vp_u5nxv4jj(<vscale x 4 x i32> zeroinitializer, i32 0)
+  %i = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umax_vp_u5nxv4jj(<vscale x 4 x i32> zeroinitializer, i32 0)
+  %j = call <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_min_vp_u5nxv4fj(<vscale x 4 x float> zeroinitializer, i32 0)
+  %k = call <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_max_vp_u5nxv4fj(<vscale x 4 x float> zeroinitializer, i32 0)
+  %l = call <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_min_vp_u5nxv4fj(<vscale x 4 x float> zeroinitializer, i32 0)
+  %m = call <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_max_vp_u5nxv4fj(<vscale x 4 x float> zeroinitializer, i32 0)
+  %n = call <vscale x 4 x double> @__vecz_b_sub_group_scan_exclusive_min_vp_u5nxv4dj(<vscale x 4 x double> zeroinitializer, i32 0)  ; the only double-precision scan exercised here
+  ret void  ; none of the call results are used
+}
+
+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4jj(<vscale x 4 x i32>, i32)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4jj(<vscale x 4 x i32>{{.*}}, i32{{.*}}) {
+; CHECK: entry:
+; CHECK:   %[[STEP:.+]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; CHECK:   br label %loop
+; CHECK: loop:
+; CHECK:   %[[IV:.+]] = phi i32 [ 1, %entry ], [ %[[N2:.+]], %loop ]
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ], [ %[[NEWVEC:.+]], %loop ]
+; CHECK:   %[[MASKPHI:.+]] = phi <vscale x 4 x i32> [ %[[STEP]], %entry ], [ %[[NEWMASK:.+]], %loop ]
+; CHECK:   %[[N_INS:.+]] = insertelement <vscale x 4 x i32> poison, i32 %[[IV]], {{i32|i64}} 0
+; CHECK:   %[[N_SPLAT:.+]] = shufflevector <vscale x 4 x i32> %[[N_INS]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK:   %[[MASK:.+]] = xor <vscale x 4 x i32> %[[MASKPHI]], %[[N_SPLAT]]
+
+;------- target-dependent dynamic shuffle code:
+; CHECK:   %[[VL:.+]] = zext i32 %1 to i64
+; CHECK:   %[[SHUFFLE:.+]] = call <vscale x 4 x i32> @llvm.riscv.vrgather.vv.nxv4i32.i64({{(<vscale x 4 x i32> undef, )?}}<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %[[MASK]], i64 %[[VL]])
+
+; CHECK:   %[[ACCUM:.+]] = add <vscale x 4 x i32> %[[VEC]], %[[SHUFFLE]]
+; CHECK:   %[[BIT:.+]] = and <vscale x 4 x i32> %[[MASKPHI]], %[[N_SPLAT]]
+; CHECK:   %[[WHICH:.+]] = icmp ne <vscale x 4 x i32> %[[BIT]], zeroinitializer
+; CHECK:   %[[NEWVEC]] = select <vscale x 4 x i1> %[[WHICH]], <vscale x 4 x i32> %[[ACCUM]], <vscale x 4 x i32> %[[VEC]]
+; CHECK:   %[[NEWMASK]] = or <vscale x 4 x i32> %[[MASK]], %[[N_SPLAT]]
+; CHECK:   %[[N2]] = shl nuw nsw i32 %[[IV]], 1
+; CHECK:   %[[CMP:.+]] = icmp ult i32 %[[N2]], %1
+; CHECK:   br i1 %[[CMP]], label %loop, label %exit
+; CHECK: exit:
+; CHECK:   %[[RESULT:.+]] = phi <vscale x 4 x i32> [ %[[NEWVEC]], %loop ]
+; CHECK:   ret <vscale x 4 x i32> %[[RESULT]]
+; CHECK: }
+
+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_add_vp_u5nxv4jj(<vscale x 4 x i32>, i32)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_add_vp_u5nxv4jj(<vscale x 4 x i32>{{.*}}, i32{{.*}}) {
+; CHECK: entry:
+; CHECK:   %[[STEP:.+]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; CHECK:   br label %loop
+; CHECK: loop:
+; CHECK:   %[[IV:.+]] = phi i32 [ 1, %entry ], [ %[[N2:.+]], %loop ]
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ], [ %[[NEWVEC:.+]], %loop ]
+; CHECK:   %[[MASKPHI:.+]] = phi <vscale x 4 x i32> [ %[[STEP]], %entry ], [ %[[NEWMASK:.+]], %loop ]
+; CHECK:   %[[N_INS:.+]] = insertelement <vscale x 4 x i32> poison, i32 %[[IV]], {{i32|i64}} 0
+; CHECK:   %[[N_SPLAT:.+]] = shufflevector <vscale x 4 x i32> %[[N_INS]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK:   %[[MASK:.+]] = xor <vscale x 4 x i32> %[[MASKPHI]], %[[N_SPLAT]]
+
+;------- target-dependent dynamic shuffle code:
+; CHECK:   %[[VL:.+]] = zext i32 %1 to i64
+; CHECK:   %[[SHUFFLE:.+]] = call <vscale x 4 x i32> @llvm.riscv.vrgather.vv.nxv4i32.i64({{(<vscale x 4 x i32> undef, )?}}<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %[[MASK]], i64 %[[VL]])
+
+; CHECK:   %[[ACCUM:.+]] = add <vscale x 4 x i32> %[[VEC]], %{{.+}}
+; CHECK:   %[[BIT:.+]] = and <vscale x 4 x i32> %[[MASKPHI]], %[[N_SPLAT]]
+; CHECK:   %[[WHICH:.+]] = icmp ne <vscale x 4 x i32> %[[BIT]], zeroinitializer
+; CHECK:   %[[NEWVEC]] = select <vscale x 4 x i1> %[[WHICH]], <vscale x 4 x i32> %[[ACCUM]], <vscale x 4 x i32> %[[VEC]]
+; CHECK:   %[[NEWMASK]] = or <vscale x 4 x i32> %[[MASK]], %[[N_SPLAT]]
+; CHECK:   %[[N2]] = shl nuw nsw i32 %[[IV]], 1
+; CHECK:   %[[CMP:.+]] = icmp ult i32 %[[N2]], %1
+; CHECK:   br i1 %[[CMP]], label %loop, label %exit
+; CHECK: exit:
+; CHECK:   %[[SCAN:.+]] = phi <vscale x 4 x i32> [ %[[NEWVEC]], %loop ]
+
+;------- target-dependent slide-up code:
+; CHECK:   %[[VL2:.+]] = zext i32 %1 to i64
+; CHECK:   %[[RESULT:.+]] = call <vscale x 4 x i32> @llvm.riscv.vslide1up.nxv4i32.i32.i64({{(<vscale x 4 x i32> undef, )?}}<vscale x 4 x i32> %[[SCAN]], i32 0, i64 %[[VL2]])
+
+; CHECK:   ret <vscale x 4 x i32> %[[RESULT]]
+; CHECK: }
+
+; We know the generated code is correct for one scan type,
+; now verify that all the others use the correct binary operations.
+
+declare <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4fj(<vscale x 4 x float>, i32)
+; CHECK-LABEL: define <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4fj(<vscale x 4 x float>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x float> [ %0, %entry ],
+; CHECK:   %{{.+}} = fadd <vscale x 4 x float> %[[VEC]], %{{.+}}
+
+declare <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_add_vp_u5nxv4fj(<vscale x 4 x float>, i32)
+; CHECK-LABEL: define <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_add_vp_u5nxv4fj(<vscale x 4 x float>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x float> [ %0, %entry ],
+; CHECK:   %{{.+}} = fadd <vscale x 4 x float> %[[VEC]], %{{.+}}
+
+; Make sure the floating point version of the slide1up intrinsic is created
+; CHECK:   call <vscale x 4 x float> @llvm.riscv.vfslide1up.nxv4f32.f32.i64({{(<vscale x 4 x float> undef, )?}}<vscale x 4 x float> %{{.+}}, float 0.000000e+00, i64 %{{.+}})
+
+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_vp_u5nxv4jj(<vscale x 4 x i32>, i32)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_vp_u5nxv4jj(<vscale x 4 x i32>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ],
+; CHECK:   %{{.+}} = call <vscale x 4 x i32> @llvm.smin.nxv4i32(<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %{{.+}})
+
+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_smin_vp_u5nxv4jj(<vscale x 4 x i32>, i32)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_smin_vp_u5nxv4jj(<vscale x 4 x i32>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ],
+; CHECK:   %{{.+}} = call <vscale x 4 x i32> @llvm.smin.nxv4i32(<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %{{.+}})
+
+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smax_vp_u5nxv4jj(<vscale x 4 x i32>, i32)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smax_vp_u5nxv4jj(<vscale x 4 x i32>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ],
+; CHECK:   %{{.+}} = call <vscale x 4 x i32> @llvm.smax.nxv4i32(<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %{{.+}})
+
+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umin_vp_u5nxv4jj(<vscale x 4 x i32>, i32)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umin_vp_u5nxv4jj(<vscale x 4 x i32>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ],
+; CHECK:   %{{.+}} = call <vscale x 4 x i32> @llvm.umin.nxv4i32(<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %{{.+}})
+
+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umax_vp_u5nxv4jj(<vscale x 4 x i32>, i32)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umax_vp_u5nxv4jj(<vscale x 4 x i32>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ],
+; CHECK:   %{{.+}} = call <vscale x 4 x i32> @llvm.umax.nxv4i32(<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %{{.+}})
+
+declare <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_min_vp_u5nxv4fj(<vscale x 4 x float>, i32)
+; CHECK-LABEL: define <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_min_vp_u5nxv4fj(<vscale x 4 x float>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x float> [ %0, %entry ],
+; CHECK:   %{{.+}} = call <vscale x 4 x float> @llvm.minnum.nxv4f32(<vscale x 4 x float> %[[VEC]], <vscale x 4 x float> %{{.+}})
+
+declare <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_max_vp_u5nxv4fj(<vscale x 4 x float>, i32)
+; CHECK-LABEL: define <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_max_vp_u5nxv4fj(<vscale x 4 x float>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x float> [ %0, %entry ],
+; CHECK:   %{{.+}} = call <vscale x 4 x float> @llvm.maxnum.nxv4f32(<vscale x 4 x float> %[[VEC]], <vscale x 4 x float> %{{.+}})
+
+declare <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_min_vp_u5nxv4fj(<vscale x 4 x float>, i32)
+; CHECK-LABEL: define <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_min_vp_u5nxv4fj(<vscale x 4 x float>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x float> [ %0, %entry ],
+; CHECK:   %{{.+}} = call <vscale x 4 x float> @llvm.minnum.nxv4f32(<vscale x 4 x float> %[[VEC]], <vscale x 4 x float> %{{.+}})
+
+declare <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_max_vp_u5nxv4fj(<vscale x 4 x float>, i32)
+; CHECK-LABEL: define <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_max_vp_u5nxv4fj(<vscale x 4 x float>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x float> [ %0, %entry ],
+; CHECK:   %{{.+}} = call <vscale x 4 x float> @llvm.maxnum.nxv4f32(<vscale x 4 x float> %[[VEC]], <vscale x 4 x float> %{{.+}})
+
+declare <vscale x 4 x double> @__vecz_b_sub_group_scan_exclusive_min_vp_u5nxv4dj(<vscale x 4 x double>, i32)
+; CHECK-LABEL: define <vscale x 4 x double> @__vecz_b_sub_group_scan_exclusive_min_vp_u5nxv4dj(<vscale x 4 x double>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x double> [ %0, %entry ],
+; CHECK:   %{{.+}} = call <vscale x 4 x double> @llvm.minnum.nxv4f64(<vscale x 4 x double> %[[VEC]], <vscale x 4 x double> %{{.+}})
+; CHECK:   call <vscale x 4 x double> @llvm.riscv.vfslide1up.nxv4f64.f64.i64({{(<vscale x 4 x double> undef, )?}}<vscale x 4 x double> %{{.+}}, double 0x7FF0000000000000, i64 %{{.+}})
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/extract_element.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/extract_element.ll
new file mode 100644
index 0000000000000..f851e2e9f5c3e
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/extract_element.ll
@@ -0,0 +1,169 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; REQUIRES: llvm-13+
+; RUN: %veczc -k extract_element -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s --check-prefix=EE
+; RUN: %not %veczc -k extract_element_illegal -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s
+; RUN: %veczc -k extract_element_uniform -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s --check-prefix=EE-UNI
+; RUN: %veczc -k extract_element_uniform_vec -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s --check-prefix=EE-UNI-VEC
+; RUN: %veczc -k extract_element_varying_indices -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s --check-prefix=EE-INDICES
+; RUN: %veczc -k extract_element_bool -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s --check-prefix=EE-BOOL
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+define spir_kernel void @extract_element(<4 x float> addrspace(1)* nocapture readonly %in, i32 %idx, float addrspace(1)* nocapture %out) {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x float> addrspace(1)*  ; same-type bitcast (no-op)
+  %1 = load <4 x float>, <4 x float> addrspace(1)* %0, align 16  ; vector is loaded per work-item
+  %2 = extractelement <4 x float> %1, i32 %idx  ; per-work-item vector, index taken from a kernel argument
+  %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call
+  store float %2, float addrspace(1)* %arrayidx3, align 4
+  ret void
+}
+
+; NOTE: Base packetization is expected to fail for this kernel, so veczc is run under %not and its output is not checked.
+
+define spir_kernel void @extract_element_illegal(<32 x float> addrspace(1)* nocapture readonly %in, i32 %idx, float addrspace(1)* nocapture %out) {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %arrayidx = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %in, i64 %call
+  %0 = bitcast <32 x float> addrspace(1)* %arrayidx to <32 x float> addrspace(1)*
+  %1 = load <32 x float>, <32 x float> addrspace(1)* %0, align 64
+  %2 = extractelement <32 x float> %1, i32 %idx  ; same shape as @extract_element, but on a <32 x float> source
+  %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call
+  store float %2, float addrspace(1)* %arrayidx3, align 4
+  ret void
+}
+
+define spir_kernel void @extract_element_uniform(<4 x float> %in, i32 %idx, float addrspace(1)* nocapture %out) {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %0 = extractelement <4 x float> %in, i32 %idx  ; both the vector and the index are by-value kernel arguments
+  %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call  ; only the destination depends on the global id
+  store float %0, float addrspace(1)* %arrayidx3, align 4
+  ret void
+}
+
+define spir_kernel void @extract_element_uniform_vec(<4 x float> %in, float addrspace(1)* nocapture %out) {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %i = urem i64 %call, 4  ; index derived from the global id, reduced modulo 4 to the range [0, 3]
+  %0 = extractelement <4 x float> %in, i64 %i  ; vector is a by-value kernel argument, index differs per work-item
+  %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call
+  store float %0, float addrspace(1)* %arrayidx3, align 4
+  ret void
+}
+
+define spir_kernel void @extract_element_varying_indices(<4 x float> addrspace(1)* %in, i32 addrspace(1)* %idxs, float addrspace(1)* nocapture %out) {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidxidx = getelementptr inbounds i32, i32 addrspace(1)* %idxs, i64 %call
+  %idx = load i32, i32 addrspace(1)* %arrayidxidx  ; the index is read from memory per work-item
+  %i = urem i32 %idx, 4  ; reduce the loaded index modulo 4 so it addresses one of the four elements
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx  ; the vector is also read per work-item
+  %1 = extractelement <4 x float> %0, i32 %i  ; per-work-item vector and per-work-item index
+  %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call
+  store float %1, float addrspace(1)* %arrayidx3, align 4
+  ret void
+}
+
+define spir_kernel void @extract_element_bool(<4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b, i32 %idx, i32 addrspace(1)* nocapture %out, <4 x i32> addrspace(1)* nocapture %out2) {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %arrayidxa = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %a, i64 %call
+  %arrayidxb = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %b, i64 %call
+  %0 = load <4 x i32>, <4 x i32> addrspace(1)* %arrayidxa, align 4
+  %1 = load <4 x i32>, <4 x i32> addrspace(1)* %arrayidxb, align 4
+  %2 = icmp slt <4 x i32> %0, %1  ; the <4 x i1> comparison result is the vector being indexed
+  %i = urem i64 %call, 4  ; index derived from the global id, reduced modulo 4; the %idx argument is not used
+  %3 = extractelement <4 x i1> %2, i64 %i  ; i1 element extract with a per-work-item index
+  %4 = sext i1 %3 to i32
+  %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %4, i32 addrspace(1)* %arrayidx3, align 4
+  %5 = sext <4 x i1> %2 to <4 x i32>  ; the whole comparison mask is additionally written out through %out2
+  %arrayidx4 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out2, i64 %call
+  store <4 x i32> %5, <4 x i32> addrspace(1)* %arrayidx4, align 4
+  ret void
+}
+
+; EE-LABEL: @__vecz_nxv4_extract_element(
+; EE:         [[XLEN:%.*]] = call i64 @llvm.vscale.i64()
+; EE-NEXT:    [[TMP2:%.*]] = shl i64 [[XLEN]], 2
+; EE-NEXT:    [[SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[IDX:%.*]], {{(i32|i64)}} 0
+; EE-NEXT:    [[SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; EE-NEXT:    [[IDX0:%.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; EE-NEXT:    [[IDXSCALE:%.*]] = shl <vscale x 4 x i32> [[IDX0]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> {{(undef|poison)}}, i32 2, {{(i32|i64)}} 0), <vscale x 4 x i32> {{(undef|poison)}}, <vscale x 4 x i32> zeroinitializer)
+; EE-NEXT:    [[VS1:%.*]] = add <vscale x 4 x i32> [[IDXSCALE]], [[SPLAT]]
+; EE-NEXT:    [[T3:%.*]] = call <vscale x 16 x i32> @llvm.{{(experimental.)?}}vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[VS1]], i64 0)
+; EE-NEXT:    [[T4:%.*]] = call <vscale x 16 x float> @llvm.riscv.vrgather.vv.nxv16f32.i64(<vscale x 16 x float> undef, <vscale x 16 x float> [[T1:%.*]], <vscale x 16 x i32> [[T3]], i64 [[TMP2]])
+; EE-NEXT:    [[T5:%.*]] = call <vscale x 4 x float> @llvm.{{(experimental.)?}}vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[T4]], i64 0)
+
+; Both the vector and index are uniform, so check we're not unnecessarily packetizing
+
+; EE-UNI-LABEL: @__vecz_nxv4_extract_element_uniform(
+; EE-UNI: [[T0:%.*]] = extractelement <4 x float> %in, i32 %idx
+; EE-UNI: [[T1:%.*]] = insertelement <vscale x 4 x float> poison, float [[T0]], {{(i32|i64)}} 0
+; EE-UNI: [[T2:%.*]] = shufflevector <vscale x 4 x float> [[T1]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+; EE-UNI: store <vscale x 4 x float> [[T2]], ptr addrspace(1) {{%.*}}, align 4
+
+; The vector is uniform and the index is varying, so we must broadcast the vector
+; FIXME: Do we really need to broadcast? Can we mod the indices with the original vector length?
+
+; EE-UNI-VEC-LABEL: @__vecz_nxv4_extract_element_uniform_vec(
+; EE-UNI-VEC:         [[XLEN:%.*]] = call i64 @llvm.vscale.i64()
+; EE-UNI-VEC:         [[T3:%.*]] = shl i64 [[XLEN]], 2
+; EE-UNI-VEC-NEXT:    [[T:%.*]] = trunc <vscale x 4 x i64> [[T2:%.*]] to <vscale x 4 x i32>
+; EE-UNI-VEC-NEXT:    [[I1:%.*]] = and <vscale x 4 x i32> [[T]], trunc (<vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> {{(undef|poison)}}, i64 3, {{(i32|i64)}} 0), <vscale x 4 x i64> {{(undef|poison)}}, <vscale x 4 x i32> zeroinitializer) to <vscale x 4 x i32>)
+; EE-UNI-VEC-NEXT:    [[IDX02:%.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; EE-UNI-VEC-NEXT:    [[IDXSCALE:%.*]] = shl <vscale x 4 x i32> [[IDX02]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> {{(undef|poison)}}, i32 2, {{(i32|i64)}} 0), <vscale x 4 x i32> {{(undef|poison)}}, <vscale x 4 x i32> zeroinitializer)
+
+; LLVM 16 deduces add/or equivalence and uses `or` instead.
+; EE-UNI-VEC-NEXT:    [[VS1:%.*]] = {{add|or}} <vscale x 4 x i32> [[IDXSCALE]], [[I1]]
+
+; EE-UNI-VEC-NEXT:    [[T4:%.*]] = call <vscale x 16 x i32> @llvm.{{(experimental.)?}}vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[VS1]], i64 0)
+; EE-UNI-VEC-NEXT:    [[T5:%.*]] = call <vscale x 16 x float> @llvm.riscv.vrgather.vv.nxv16f32.i64(<vscale x 16 x float> undef, <vscale x 16 x float> [[T1:%.*]], <vscale x 16 x i32> [[T4]], i64 [[T3]])
+; EE-UNI-VEC-NEXT:    [[T6:%.*]] = call <vscale x 4 x float> @llvm.{{(experimental.)?}}vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[T5]], i64 0)
+
+; EE-INDICES-LABEL: @__vecz_nxv4_extract_element_varying_indices(
+; EE-INDICES:         [[XLEN:%.*]] = call i64 @llvm.vscale.i64()
+; EE-INDICES-NEXT:    [[T4:%.*]] = shl i64 [[XLEN]], 2
+; EE-INDICES-NEXT:    [[IDX0:%.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; EE-INDICES-NEXT:    [[IDXSCALE:%.*]] = shl <vscale x 4 x i32> [[IDX0]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> {{(undef|poison)}}, i32 2, {{(i32|i64)}} 0), <vscale x 4 x i32> {{(undef|poison)}}, <vscale x 4 x i32> zeroinitializer)
+; EE-INDICES-NEXT:    [[VS1:%.*]] = {{add|or}} <vscale x 4 x i32> [[IDXSCALE]], [[I1:%.*]]
+; EE-INDICES-NEXT:    [[T5:%.*]] = call <vscale x 16 x i32> @llvm.{{(experimental.)?}}vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[VS1]], i64 0)
+; EE-INDICES-NEXT:    [[T6:%.*]] = call <vscale x 16 x float> @llvm.riscv.vrgather.vv.nxv16f32.i64(<vscale x 16 x float> undef, <vscale x 16 x float> [[T3:%.*]], <vscale x 16 x i32> [[T5]], i64 [[T4]])
+; EE-INDICES-NEXT:    [[T7:%.*]] = call <vscale x 4 x float> @llvm.{{(experimental.)?}}vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[T6]], i64 0)
+
+; Check we promote from i1 to i8 before doing our memops and use vrgatherei16.
+; EE-BOOL-LABEL: @__vecz_nxv4_extract_element_bool(
+; EE-BOOL:       [[T6:%.*]] = sext <vscale x 16 x i1> [[T5:%.*]] to <vscale x 16 x i8>
+; EE-BOOL-NEXT:  [[XLEN:%.*]] = call i64 @llvm.vscale.i64()
+; EE-BOOL-NEXT:  [[T7:%.*]] = shl i64 [[XLEN]], 2
+; EE-BOOL-NEXT:  [[T8:%.*]] = trunc <vscale x 4 x i64> [[T0:%.*]] to <vscale x 4 x i16>
+; EE-BOOL-NEXT:  [[T9:%.*]] = and <vscale x 4 x i16> [[T8]], trunc (<vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> {{(undef|poison)}}, i64 3, {{(i32|i64)}} 0), <vscale x 4 x i64> {{(undef|poison)}}, <vscale x 4 x i32> zeroinitializer) to <vscale x 4 x i16>)
+; EE-BOOL-NEXT:  [[T10:%.*]] = call <vscale x 4 x i16> @llvm.experimental.stepvector.nxv4i16()
+; EE-BOOL-NEXT:  [[T11:%.*]] = shl <vscale x 4 x i16> [[T10]], shufflevector (<vscale x 4 x i16> insertelement (<vscale x 4 x i16> {{(undef|poison)}}, i16 2, {{(i32|i64)}} 0), <vscale x 4 x i16> {{(undef|poison)}}, <vscale x 4 x i32> zeroinitializer)
+; EE-BOOL-NEXT:  [[VS1:%.*]] = {{add|or}} <vscale x 4 x i16> [[T11]], [[T9]]
+; EE-BOOL-NEXT:  [[T12:%.*]] = call <vscale x 16 x i16> @llvm.{{(experimental.)?}}vector.insert.nxv16i16.nxv4i16(<vscale x 16 x i16> poison, <vscale x 4 x i16> [[VS1]], i64 0)
+; EE-BOOL-NEXT:  [[T13:%.*]] = call <vscale x 16 x i8> @llvm.riscv.vrgatherei16.vv.nxv16i8.i64(<vscale x 16 x i8> undef, <vscale x 16 x i8> [[T6]], <vscale x 16 x i16> [[T12]], i64 [[T7]])
+; EE-BOOL-NEXT:  [[T14:%.*]] = call <vscale x 4 x i8> @llvm.{{(experimental.)?}}vector.extract.nxv4i8.nxv16i8(<vscale x 16 x i8> [[T13]], i64 0)
+; EE-BOOL-NEXT:  [[T15:%.*]] = trunc <vscale x 4 x i8> [[T14]] to <vscale x 4 x i1>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/insert_element.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/insert_element.ll
new file mode 100644
index 0000000000000..3f91c699cdf23
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/insert_element.ll
@@ -0,0 +1,137 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; REQUIRES: llvm-13+
+; RUN: %veczc -k insert_element -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s --check-prefix=IE
+; RUN: %veczc -k insert_element_uniform -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s --check-prefix=IE-UNI
+; RUN: %veczc -k insert_element_varying_indices -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s --check-prefix=IE-INDICES
+; RUN: %not %veczc -k insert_element_illegal -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s
+; RUN: %veczc -k insert_element_bool -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s --check-prefix=IE-BOOL
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+define spir_kernel void @insert_element(<4 x float> addrspace(1)* nocapture readonly %in, float %val, i32 %idx, <4 x float> addrspace(1)* nocapture %out) {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x float> addrspace(1)*  ; same-type bitcast (no-op)
+  %1 = load <4 x float>, <4 x float> addrspace(1)* %0, align 16  ; vector is loaded per work-item
+  %2 = insertelement <4 x float> %1, float %val, i32 %idx  ; per-work-item vector; value and index are kernel arguments
+  %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call
+  store <4 x float> %2, <4 x float> addrspace(1)* %arrayidx3, align 4
+  ret void
+}
+
+define spir_kernel void @insert_element_uniform(<4 x float> %in, float %val, i32 %idx, <4 x float> addrspace(1)* nocapture %out) {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %0 = insertelement <4 x float> %in, float %val, i32 %idx  ; vector, value and index are all by-value kernel arguments
+  %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call  ; only the destination depends on the global id
+  store <4 x float> %0, <4 x float> addrspace(1)* %arrayidx3, align 4
+  ret void
+}
+
+define spir_kernel void @insert_element_varying_indices(<4 x float> addrspace(1)* nocapture readonly %in, i32 addrspace(1)* %idxs, <4 x float> addrspace(1)* nocapture %out) {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %arrayidxidx = getelementptr inbounds i32, i32 addrspace(1)* %idxs, i64 %call
+  %idx = load i32, i32 addrspace(1)* %arrayidxidx  ; the index is read from memory per work-item
+  %i = urem i32 %idx, 4  ; reduce the loaded index modulo 4 so it addresses one of the four elements
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x float> addrspace(1)*  ; same-type bitcast (no-op)
+  %1 = load <4 x float>, <4 x float> addrspace(1)* %0, align 16
+  %fidx = uitofp i64 %call to float  ; the inserted value is the global id converted to float
+  %2 = insertelement <4 x float> %1, float %fidx, i32 %i  ; vector, value and index all differ per work-item
+  %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call
+  store <4 x float> %2, <4 x float> addrspace(1)* %arrayidx3, align 4
+  ret void
+}
+
+define spir_kernel void @insert_element_illegal(<32 x float> addrspace(1)* nocapture readonly %in, i32 addrspace(1)* %idxs, <32 x float> addrspace(1)* nocapture %out) {
+entry:  ; veczc is expected to fail on this kernel: its invocation is wrapped in %not and the output is not checked
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %arrayidxidx = getelementptr inbounds i32, i32 addrspace(1)* %idxs, i64 %call
+  %idx = load i32, i32 addrspace(1)* %arrayidxidx, align 4
+  %i = urem i32 %idx, 32  ; reduce the loaded index modulo 32, the element count of the vector
+  %arrayidx = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %in, i64 %call
+  %0 = bitcast <32 x float> addrspace(1)* %arrayidx to <32 x float> addrspace(1)*  ; same-type bitcast (no-op)
+  %1 = load <32 x float>, <32 x float> addrspace(1)* %0, align 64
+  %fidx = uitofp i64 %call to float
+  %2 = insertelement <32 x float> %1, float %fidx, i32 %i  ; same shape as the varying-indices kernel, but on <32 x float>
+  %arrayidx3 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %out, i64 %call
+  store <32 x float> %2, <32 x float> addrspace(1)* %arrayidx3, align 64
+  ret void
+}
+
+define spir_kernel void @insert_element_bool(<4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b, i32 %val, i32 %idx, <4 x i32> addrspace(1)* nocapture %out) {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %arrayidxa = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %a, i64 %call
+  %arrayidxb = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %b, i64 %call
+  %0 = load <4 x i32>, <4 x i32> addrspace(1)* %arrayidxa, align 4
+  %1 = load <4 x i32>, <4 x i32> addrspace(1)* %arrayidxb, align 4
+  %2 = icmp slt <4 x i32> %0, %1  ; the <4 x i1> comparison result is the vector being modified
+  %i = urem i64 %call, 4  ; index derived from the global id, reduced modulo 4; the %idx argument is not used
+  %v = trunc i32 %val to i1  ; the inserted bit comes from the %val kernel argument
+  %3 = insertelement <4 x i1> %2, i1 %v, i64 %i  ; i1 element insert with a per-work-item index
+  %4 = sext <4 x i1> %3 to <4 x i32>  ; widen the mask to i32 elements before storing it
+  %arrayidx4 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call
+  store <4 x i32> %4, <4 x i32> addrspace(1)* %arrayidx4, align 4
+  ret void
+}
+
+; IE-LABEL: @__vecz_nxv4_insert_element(
+; IE:         [[SPLATINSERT:%.*]] = insertelement <vscale x 4 x float> poison, float [[VAL:%.*]], {{(i32|i64)}} 0
+; IE:         [[SPLAT:%.*]] = shufflevector <vscale x 4 x float> [[SPLATINSERT]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+; IE:         [[XLEN:%.*]] = call i64 @llvm.vscale.i64()
+; IE-NEXT:    [[TMP2:%.*]] = shl i64 [[XLEN]], 4
+; IE-NEXT:    [[SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[IDX:%.*]], {{(i32|i64)}} 0
+; IE-NEXT:    [[SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; IE-NEXT:    [[ELTS:%.*]] = call <vscale x 16 x float> @llvm.{{(experimental.)?}}vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[SPLAT]], i64 0)
+; IE-NEXT:    [[STEP:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
+; IE-NEXT:    [[INNER:%.*]] = and <vscale x 16 x i32> [[STEP]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 3, {{i32|i64}} 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
+; IE-NEXT:    [[OUTER:%.*]] = lshr <vscale x 16 x i32> [[STEP]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 2, {{i32|i64}} 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
+; IE-NEXT:    [[VM:%.*]] = icmp eq <vscale x 16 x i32> [[SPLAT2]], [[INNER]]
+; IE-NEXT:    [[TMP8:%.*]] = call <vscale x 16 x float> @llvm.riscv.vrgather.vv.mask.nxv16f32.i64(<vscale x 16 x float> [[TMP1:%.*]], <vscale x 16 x float> [[ELTS]], <vscale x 16 x i32> [[OUTER]], <vscale x 16 x i1> [[VM]], i64 [[TMP2]]{{(, i64 1)?}})
+
+; Both the vector and index are uniform, so check we're not unnecessarily packetizing
+
+; IE-UNI-LABEL: @__vecz_nxv4_insert_element_uniform(
+; IE-UNI: {{%.*}} = insertelement <4 x float> %in, float %val, {{(i32|i64)}} %idx
+
+; IE-INDICES-LABEL: @__vecz_nxv4_insert_element_varying_indices(
+; IE-INDICES:         [[FIDX2:%.*]] = uitofp <vscale x 4 x i64> [[TMP0:%.*]] to <vscale x 4 x float>
+; IE-INDICES-NEXT:    [[XLEN:%.*]] = call i64 @llvm.vscale.i64()
+; IE-INDICES-NEXT:    [[TMP5:%.*]] = shl i64 [[XLEN]], 4
+; IE-INDICES-NEXT:    [[VS2:%.*]] = call <vscale x 16 x i32> @llvm.{{(experimental.)?}}vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> {{%.*}}, i64 0)
+; IE-INDICES:         [[IDX0:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
+; IE-INDICES-NEXT:    [[IDX1:%.*]] = lshr <vscale x 16 x i32> [[IDX0]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 2, {{(i32|i64)}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
+; IE-INDICES-NEXT:    [[TMP9:%.*]] = call <vscale x 16 x i32> @llvm.riscv.vrgather.vv.nxv16i32.i64(<vscale x 16 x i32> undef, <vscale x 16 x i32> [[VS2:%.*]], <vscale x 16 x i32> [[IDX1]], i64 [[TMP5]])
+; IE-INDICES-NEXT:    [[VS25:%.*]] = call <vscale x 16 x float> @llvm.{{(experimental.)?}}vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[FIDX2]], i64 0)
+; IE-INDICES-NEXT:    [[INNER:%.*]] = and <vscale x 16 x i32> [[IDX0]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 3, {{i32|i64}} 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
+; IE-INDICES-NEXT:    [[VM:%.*]] = icmp eq <vscale x 16 x i32> [[TMP9]], [[INNER]]
+; IE-INDICES-NEXT:    [[TMP11:%.*]] = call <vscale x 16 x float> @llvm.riscv.vrgather.vv.mask.nxv16f32.i64(<vscale x 16 x float> [[TMP4:%.*]], <vscale x 16 x float> [[VS25]], <vscale x 16 x i32> [[IDX1]], <vscale x 16 x i1> [[VM]], i64 [[TMP5]]{{(, i64 1)?}})
+
+; Check we promote from i1 to i8 before the gather, and truncate back to i1 afterwards
+; IE-BOOL-LABEL: @__vecz_nxv4_insert_element_bool(
+; IE-BOOL-DAG:     [[T1:%.*]] = sext <vscale x 16 x i1> {{%.*}} to <vscale x 16 x i8>
+; IE-BOOL-DAG:     [[T0:%.*]] = sext <vscale x 4 x i1> {{%.*}} to <vscale x 4 x i8>
+; IE-BOOL:         [[TMP18:%.*]] = call <vscale x 16 x i8> @llvm.riscv.vrgatherei16.vv.mask.nxv16i8.i64(<vscale x 16 x i8> [[TMP7:%.*]], <vscale x 16 x i8> {{%.*}}, <vscale x 16 x i16> [[TMP16:%.*]], <vscale x 16 x i1> [[VM:%.*]], i64 [[TMP8:%.*]])
+;                            %12 = call <vscale x 16 x i8> @llvm.riscv.vrgatherei16.vv.mask.nxv16i8.i64(<vscale x 16 x i8> %6, <vscale x 16 x i8> %vs25, <vscale x 16 x i16> %vs16, <vscale x 16 x i1> %vm, i64 %7, i64 1)
+; IE-BOOL-NEXT:    [[TMP19:%.*]] = trunc <vscale x 16 x i8> [[TMP18]] to <vscale x 16 x i1>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/lit.local.cfg b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/lit.local.cfg
new file mode 100644
index 0000000000000..6b200207cf85a
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/lit.local.cfg
@@ -0,0 +1,23 @@
+# Copyright (C) Codeplay Software Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+# Exceptions; you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+if not 'RISCV' in config.root.targets:
+    config.unsupported = True
+
+if config.llvm_version_major >= 14:
+    config.substitutions.append(('%vattr', '+v'))
+else:
+    config.substitutions.append(('%vattr', '+experimental-v'))
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle.ll
new file mode 100644
index 0000000000000..edf3c93c53cfc
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle.ll
@@ -0,0 +1,43 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; REQUIRES: llvm-13+
+; RUN: %veczc -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -vecz-passes=packetizer -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @f(<4 x i32> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { ; out[gid] = in[gid] with its four lanes reversed
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %in.ptr = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in, i64 %gid
+  %in.data = load <4 x i32>, <4 x i32> addrspace(1)* %in.ptr
+  %out.data = shufflevector <4 x i32> %in.data, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> ; only the first operand is read; result length equals source length
+  %out.ptr = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %gid
+  store <4 x i32> %out.data, <4 x i32> addrspace(1)* %out.ptr, align 32
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+; It checks that a single-operand shuffle that doesn't change the length is packetized to a gather intrinsic.
+; CHECK: define spir_kernel void @__vecz_nxv4_f({{.*}}) {{.*}} {
+; CHECK: entry:
+; CHECK:  %[[DATA:.+]] = load <vscale x 16 x i32>, {{(<vscale x 16 x i32> addrspace\(1\)\*)|(ptr addrspace\(1\))}} %{{.*}}
+; CHECK:  %[[GATHER:.+]] = call <vscale x 16 x i32> @llvm.riscv.vrgather.vv.nxv16i32.i64(<vscale x 16 x i32> undef, <vscale x 16 x i32> %[[DATA]], <vscale x 16 x i32> %{{.+}}, i64 %{{.+}})
+; CHECK:  store <vscale x 16 x i32> %[[GATHER]]
+; CHECK:  ret void
+; CHECK: }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_bool.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_bool.ll
new file mode 100644
index 0000000000000..9ccda9f36ecdc
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_bool.ll
@@ -0,0 +1,50 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; REQUIRES: llvm-13+
+; RUN: %veczc -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -vecz-passes=packetizer -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @f(<4 x i32> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { ; reverses the lanes of a <4 x i1> compare result and stores it sign-extended
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %in.ptr = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in, i64 %gid
+  %in.data = load <4 x i32>, <4 x i32> addrspace(1)* %in.ptr
+  %in.bool = icmp ne <4 x i32> %in.data, zeroinitializer ; i1 vector that feeds the shuffle
+  %out.data = shufflevector <4 x i1> %in.bool, <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> ; lane reversal on an i1 vector
+  %out.sext = sext <4 x i1> %out.data to <4 x i32> ; widen to i32 for the store
+  %out.ptr = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %gid
+  store <4 x i32> %out.sext, <4 x i32> addrspace(1)* %out.ptr, align 32
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+; It checks that a single-operand shuffle that doesn't change the length is packetized to a gather intrinsic,
+; and that it works with a vector of i1 type by temporarily extending to i8.
+; CHECK: define spir_kernel void @__vecz_nxv4_f({{.*}}) {{.*}} {
+; CHECK: entry:
+; CHECK:  %[[DATA:.+]] = load <vscale x 16 x i32>, {{(<vscale x 16 x i32> addrspace\(1\)\*)|(ptr addrspace\(1\))}} %{{.*}}
+; CHECK:  %[[DATA_i1:.+]] = icmp ne <vscale x 16 x i32> %[[DATA]], zeroinitializer
+; CHECK:  %[[DATA_i8:.+]] = zext <vscale x 16 x i1> %[[DATA_i1]] to <vscale x 16 x i8>
+; CHECK:  %[[GATHER:.+]] = call <vscale x 16 x i8> @llvm.riscv.vrgatherei16.vv.nxv16i8.i64(<vscale x 16 x i8> undef, <vscale x 16 x i8> %[[DATA_i8]], <vscale x 16 x i16> %{{.+}}, i64 %{{.+}})
+; CHECK:  %[[GATHER_i1:.+]] = trunc <vscale x 16 x i8> %[[GATHER]] to <vscale x 16 x i1>
+; CHECK:  %[[RESULT:.+]] = sext <vscale x 16 x i1> %[[GATHER_i1]] to <vscale x 16 x i32>
+; CHECK:  store <vscale x 16 x i32> %[[RESULT]]
+; CHECK:  ret void
+; CHECK: }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_concat.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_concat.ll
new file mode 100644
index 0000000000000..afed931f2b1e6
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_concat.ll
@@ -0,0 +1,50 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; REQUIRES: llvm-13+
+; RUN: %veczc -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -vecz-passes=packetizer -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @f(<2 x i32> addrspace(1)* %a, <2 x i32> addrspace(1)* %b, <4 x i32> addrspace(1)* %out) { ; out[gid] = a[gid] followed by b[gid]
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %a.ptr = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %a, i64 %gid
+  %a.data = load <2 x i32>, <2 x i32> addrspace(1)* %a.ptr
+  %b.ptr = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %b, i64 %gid
+  %b.data = load <2 x i32>, <2 x i32> addrspace(1)* %b.ptr
+  %out.data = shufflevector <2 x i32> %a.data, <2 x i32> %b.data, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; mask 0,1 selects %a.data and 2,3 selects %b.data: a two-operand concatenation
+  %out.ptr = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %gid
+  store <4 x i32> %out.data, <4 x i32> addrspace(1)* %out.ptr, align 32
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+; It checks that a two-operand shuffle is packetized to two gather intrinsics and a select.
+; CHECK: define spir_kernel void @__vecz_nxv4_f({{.*}}) {{.*}} {
+; CHECK: entry:
+; CHECK:  %[[DATA:.+]] = load <vscale x 8 x i32>, {{(<vscale x 8 x i32> addrspace\(1\)\*)|(ptr addrspace\(1\))}} %{{.*}}
+; CHECK:  %[[DATB:.+]] = load <vscale x 8 x i32>, {{(<vscale x 8 x i32> addrspace\(1\)\*)|(ptr addrspace\(1\))}} %{{.*}}
+; CHECK:  %[[WIDENA:.+]] = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv8i32(<vscale x 16 x i32> poison, <vscale x 8 x i32> %[[DATA]], i64 0)
+; CHECK:  %[[GATHERA:.+]] = call <vscale x 16 x i32> @llvm.riscv.vrgather.vv.nxv16i32.i64(<vscale x 16 x i32> undef, <vscale x 16 x i32> %[[WIDENA]], <vscale x 16 x i32> %{{.+}}, i64 %{{.+}})
+; CHECK:  %[[WIDENB:.+]] = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv8i32(<vscale x 16 x i32> poison, <vscale x 8 x i32> %[[DATB]], i64 0)
+; CHECK:  %[[GATHERB:.+]] = call <vscale x 16 x i32> @llvm.riscv.vrgather.vv.nxv16i32.i64(<vscale x 16 x i32> undef, <vscale x 16 x i32> %[[WIDENB]], <vscale x 16 x i32> %{{.+}}, i64 %{{.+}})
+; CHECK:  %[[SELECT:.+]] = select <vscale x 16 x i1> %{{.+}}, <vscale x 16 x i32> %[[GATHERB]], <vscale x 16 x i32> %[[GATHERA]]
+; CHECK:  store <vscale x 16 x i32> %[[SELECT]]
+; CHECK:  ret void
+; CHECK: }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_narrow.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_narrow.ll
new file mode 100644
index 0000000000000..cf1961a1d9208
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_narrow.ll
@@ -0,0 +1,44 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; REQUIRES: llvm-13+
+; RUN: %veczc -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -vecz-passes=packetizer -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @f(<4 x i32> addrspace(1)* %in, <2 x i32> addrspace(1)* %out) { ; out[gid] = lanes 0 and 2 of in[gid]
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %in.ptr = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in, i64 %gid
+  %in.data = load <4 x i32>, <4 x i32> addrspace(1)* %in.ptr
+  %out.data = shufflevector <4 x i32> %in.data, <4 x i32> undef, <2 x i32> <i32 0, i32 2> ; result has 2 lanes, fewer than the 4-lane source
+  %out.ptr = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i64 %gid
+  store <2 x i32> %out.data, <2 x i32> addrspace(1)* %out.ptr, align 32
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+; It checks that a single-operand shuffle that narrows the vector is packetized to a gather intrinsic.
+; CHECK: define spir_kernel void @__vecz_nxv4_f({{.*}}) {{.*}} {
+; CHECK: entry:
+; CHECK:  %[[DATA:.+]] = load <vscale x 16 x i32>, {{(<vscale x 16 x i32> addrspace\(1\)\*)|(ptr addrspace\(1\))}} %{{.*}}
+; CHECK:  %[[GATHER:.+]] = call <vscale x 16 x i32> @llvm.riscv.vrgather.vv.nxv16i32.i64(<vscale x 16 x i32> undef, <vscale x 16 x i32> %[[DATA]], <vscale x 16 x i32> %{{.+}}, i64 %{{.+}})
+; CHECK:  %[[EXTRACT:.+]] = call <vscale x 8 x i32> @llvm.vector.extract.nxv8i32.nxv16i32(<vscale x 16 x i32> %[[GATHER]], i64 0)
+; CHECK:  store <vscale x 8 x i32> %[[EXTRACT]]
+; CHECK:  ret void
+; CHECK: }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_wider.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_wider.ll
new file mode 100644
index 0000000000000..87145169d836e
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_wider.ll
@@ -0,0 +1,44 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; REQUIRES: llvm-13+
+; RUN: %veczc -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -vecz-passes=packetizer -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @f(<2 x i32> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { ; out[gid] = in[gid] lanes 1,0 repeated twice
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %in.ptr = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %in, i64 %gid
+  %in.data = load <2 x i32>, <2 x i32> addrspace(1)* %in.ptr
+  %out.data = shufflevector <2 x i32> %in.data, <2 x i32> undef, <4 x i32> <i32 1, i32 0, i32 1, i32 0> ; result has 4 lanes, more than the 2-lane source
+  %out.ptr = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %gid
+  store <4 x i32> %out.data, <4 x i32> addrspace(1)* %out.ptr, align 32
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+; It checks that a single-operand shuffle that widens the vector is packetized to a gather intrinsic.
+; CHECK: define spir_kernel void @__vecz_nxv4_f({{.*}}) {{.*}} {
+; CHECK: entry:
+; CHECK:  %[[DATA:.+]] = load <vscale x 8 x i32>, {{(<vscale x 8 x i32> addrspace\(1\)\*)|(ptr addrspace\(1\))}} %{{.*}}
+; CHECK:  %[[WIDEN:.+]] = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv8i32(<vscale x 16 x i32> poison, <vscale x 8 x i32> %[[DATA]], i64 0)
+; CHECK:  %[[GATHER:.+]] = call <vscale x 16 x i32> @llvm.riscv.vrgather.vv.nxv16i32.i64(<vscale x 16 x i32> undef, <vscale x 16 x i32> %[[WIDEN]], <vscale x 16 x i32> %{{.+}}, i64 %{{.+}})
+; CHECK:  store <vscale x 16 x i32> %[[GATHER]]
+; CHECK:  ret void
+; CHECK: }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/select_scalar_vector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/select_scalar_vector.ll
new file mode 100644
index 0000000000000..62eddff9c4af3
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/select_scalar_vector.ll
@@ -0,0 +1,50 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; REQUIRES: llvm-13+
+; RUN: %veczc -k select_scalar_vector -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+define spir_kernel void @select_scalar_vector(i32* %aptr, i32* %bptr, <2 x i32>* %cptr, <2 x i32>* %zptr) { ; z[gid] = (a[gid] < b[gid]) ? c[gid] : <4, 4>
+entry:
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+  %arrayidxb = getelementptr inbounds i32, i32* %bptr, i64 %idx
+  %arrayidxc = getelementptr inbounds <2 x i32>, <2 x i32>* %cptr, i64 %idx
+  %arrayidxz = getelementptr inbounds <2 x i32>, <2 x i32>* %zptr, i64 %idx
+  %a = load i32, i32* %arrayidxa, align 4
+  %b = load i32, i32* %arrayidxb, align 4
+  %c = load <2 x i32>, <2 x i32>* %arrayidxc, align 4
+  %cmp = icmp slt i32 %a, %b ; scalar i1 condition
+  %sel = select i1 %cmp, <2 x i32> %c, <2 x i32> <i32 4, i32 4> ; one scalar condition chooses between two whole vectors
+  store <2 x i32> %sel, <2 x i32>* %arrayidxz, align 4
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_nxv4_select_scalar_vector
+; CHECK: [[rhs:%.*]] = load <vscale x 8 x i32>, ptr
+; CHECK: [[cmp1:%.*]] = icmp slt <vscale x 4 x i32>
+; CHECK: [[sext:%.*]] = sext <vscale x 4 x i1> [[cmp1]] to <vscale x 4 x i8>
+; CHECK: [[idx0:%.*]] = call <vscale x 8 x i16> @llvm.experimental.stepvector.nxv8i16()
+; CHECK: [[idx1:%.*]] = lshr <vscale x 8 x i16> [[idx0]], shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> {{(undef|poison)}}, i16 1, {{i32|i64}} 0), <vscale x 8 x i16> {{(undef|poison)}}, <vscale x 8 x i32> zeroinitializer)
+; CHECK: [[gather:%.*]] = call <vscale x 8 x i8> @llvm.riscv.vrgatherei16.vv.nxv8i8.i64(<vscale x 8 x i8> undef, <vscale x 8 x i8> [[vs2:%.*]], <vscale x 8 x i16> [[vs1:%.*]], i64 [[xlen:%.*]])
+; CHECK: [[cmp:%.*]] = trunc <vscale x 8 x i8> [[gather]] to <vscale x 8 x i1>
+; CHECK: [[sel:%.*]] = select <vscale x 8 x i1> [[cmp]], <vscale x 8 x i32> [[rhs]], <vscale x 8 x i32> shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> {{(undef|poison)}}, i32 4, {{i32|i64}} 0), <vscale x 8 x i32> {{(undef|poison)}}, <vscale x 8 x i32> zeroinitializer)
+; CHECK: store <vscale x 8 x i32> [[sel]],
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_memops.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_memops.ll
new file mode 100644
index 0000000000000..b6062349ff2cd
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_memops.ll
@@ -0,0 +1,100 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; REQUIRES: llvm-13+
+; RUN: %veczc -k store_element -vecz-target-triple="riscv64-unknown-unknown" -vecz-target-features=+f,+d,%vattr -vecz-simd-width=4 -vecz-scalable -vecz-choices=VectorPredication -S < %s | %filecheck %s --check-prefix CHECK-STORE-4
+; RUN: %veczc -k store_element -vecz-target-triple="riscv64-unknown-unknown" -vecz-target-features=+f,+d,%vattr -vecz-simd-width=8 -vecz-scalable -vecz-choices=VectorPredication -S < %s | %filecheck %s --check-prefix CHECK-STORE-8
+; RUN: %veczc -k store_element -vecz-target-triple="riscv64-unknown-unknown" -vecz-target-features=+f,+d,%vattr -vecz-simd-width=16 -vecz-scalable -vecz-choices=VectorPredication -S < %s | %filecheck %s --check-prefix CHECK-STORE-16
+; RUN: %veczc -k load_element -vecz-target-triple="riscv64-unknown-unknown" -vecz-target-features=+f,+d,%vattr -vecz-simd-width=4 -vecz-scalable -vecz-choices=VectorPredication -S < %s | %filecheck %s --check-prefix CHECK-LOAD-4
+; RUN: %veczc -k load_element -vecz-target-triple="riscv64-unknown-unknown" -vecz-target-features=+f,+d,%vattr -vecz-simd-width=8 -vecz-scalable -vecz-choices=VectorPredication -S < %s | %filecheck %s --check-prefix CHECK-LOAD-8
+; RUN: %veczc -k load_element -vecz-target-triple="riscv64-unknown-unknown" -vecz-target-features=+f,+d,%vattr -vecz-simd-width=16 -vecz-scalable -vecz-choices=VectorPredication -S < %s | %filecheck %s --check-prefix CHECK-LOAD-16
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @store_element(i32 %0, i32 addrspace(1)* %b) { ; stores %0 to b[gid] for every work-item whose global id is non-zero
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %cond = icmp ne i64 %call, 0 ; condition depends on the global id
+  br i1 %cond, label %do, label %ret
+
+do: ; reached only when %cond is true
+  %dest = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %call
+  store i32 %0, i32 addrspace(1)* %dest, align 4 ; the conditional store under test
+  br label %ret
+
+ret:
+  ret void
+}
+
+; CHECK-STORE-4:       define void @__vecz_b_masked_store4_vp_u5nxv4ju3ptrU3AS1u5nxv4bj(<vscale x 4 x i32> [[TMP0:%.*]], ptr addrspace(1) [[TMP1:%.*]], <vscale x 4 x i1> [[TMP2:%.*]], i32 [[TMP3:%.*]]) {
+; CHECK-STORE-4-NEXT:  entry:
+; CHECK-STORE-4-NEXT:    call void @llvm.vp.store.nxv4i32.p1(<vscale x 4 x i32> [[TMP0]], ptr addrspace(1) [[TMP1]], <vscale x 4 x i1> [[TMP2]], i32 [[TMP3]])
+; CHECK-STORE-4-NEXT:    ret void
+
+; CHECK-STORE-8:       define void @__vecz_b_masked_store4_vp_u5nxv8ju3ptrU3AS1u5nxv8bj(<vscale x 8 x i32> [[TMP0:%.*]], ptr addrspace(1) [[TMP1:%.*]], <vscale x 8 x i1> [[TMP2:%.*]], i32 [[TMP3:%.*]]) {
+; CHECK-STORE-8-NEXT:  entry:
+; CHECK-STORE-8-NEXT:    call void @llvm.vp.store.nxv8i32.p1(<vscale x 8 x i32> [[TMP0]], ptr addrspace(1) [[TMP1]], <vscale x 8 x i1> [[TMP2]], i32 [[TMP3]])
+; CHECK-STORE-8-NEXT:    ret void
+
+; CHECK-STORE-16:       define void @__vecz_b_masked_store4_vp_u6nxv16ju3ptrU3AS1u6nxv16bj(<vscale x 16 x i32> [[TMP0:%.*]], ptr addrspace(1) [[TMP1:%.*]], <vscale x 16 x i1> [[TMP2:%.*]], i32 [[TMP3:%.*]]) {
+; CHECK-STORE-16-NEXT:  entry:
+; CHECK-STORE-16-NEXT:    [[TMP5:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
+; CHECK-STORE-16-NEXT:    [[SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP3]], {{i32|i64}} 0
+; CHECK-STORE-16-NEXT:    [[SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-STORE-16-NEXT:    [[TMP6:%.*]] = icmp ult <vscale x 16 x i32> [[TMP5]], [[SPLAT]]
+; CHECK-STORE-16-NEXT:    [[TMP7:%.*]] = select <vscale x 16 x i1> [[TMP2]], <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> zeroinitializer
+; CHECK-STORE-16-NEXT:    call void @llvm.masked.store.nxv16i32.p1(<vscale x 16 x i32> [[TMP0]], ptr addrspace(1) [[TMP1]], i32 4, <vscale x 16 x i1> [[TMP7]])
+; CHECK-STORE-16-NEXT:    ret void
+
+define spir_kernel void @load_element(i32 addrspace(1)* %a, i32 addrspace(1)* %b) { ; copies a[gid] to b[gid] for every work-item whose global id is non-zero
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %cond = icmp ne i64 %call, 0 ; condition depends on the global id
+  br i1 %cond, label %do, label %ret
+
+do: ; reached only when %cond is true
+  %src = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %call
+  %dest = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %call
+  %do.res = load i32, i32 addrspace(1)* %src, align 4 ; the conditional load under test
+  store i32 %do.res, i32 addrspace(1)* %dest, align 4
+  br label %ret
+
+ret:
+  ret void
+}
+
+; CHECK-LOAD-4:      define <vscale x 4 x i32> @__vecz_b_masked_load4_vp_u5nxv4ju3ptrU3AS1u5nxv4bj(ptr addrspace(1) [[TMP0:%.*]], <vscale x 4 x i1> [[TMP1:%.*]], i32 [[TMP2:%.*]]) {
+; CHECK-LOAD-4-NEXT: entry:
+; CHECK-LOAD-4-NEXT:   [[TMP4:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p1(ptr addrspace(1) [[TMP0]], <vscale x 4 x i1> [[TMP1]], i32 [[TMP2]])
+; CHECK-LOAD-4-NEXT:   ret <vscale x 4 x i32> [[TMP4]]
+
+; CHECK-LOAD-8:      define <vscale x 8 x i32> @__vecz_b_masked_load4_vp_u5nxv8ju3ptrU3AS1u5nxv8bj(ptr addrspace(1) [[TMP0:%.*]], <vscale x 8 x i1> [[TMP1:%.*]], i32 [[TMP2:%.*]]) {
+; CHECK-LOAD-8-NEXT: entry:
+; CHECK-LOAD-8-NEXT:   [[TMP4:%.*]] = call <vscale x 8 x i32> @llvm.vp.load.nxv8i32.p1(ptr addrspace(1) [[TMP0]], <vscale x 8 x i1> [[TMP1]], i32 [[TMP2]])
+; CHECK-LOAD-8-NEXT:   ret <vscale x 8 x i32> [[TMP4]]
+
+; CHECK-LOAD-16:      define <vscale x 16 x i32> @__vecz_b_masked_load4_vp_u6nxv16ju3ptrU3AS1u6nxv16bj(ptr addrspace(1) [[TMP0:%.*]], <vscale x 16 x i1> [[TMP1:%.*]], i32 [[TMP2:%.*]]) {
+; CHECK-LOAD-16-NEXT: entry:
+; CHECK-LOAD-16-NEXT: [[TMP4:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
+; CHECK-LOAD-16-NEXT: [[TMPSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP2]], {{i32|i64}} 0
+; CHECK-LOAD-16-NEXT: [[TMPSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[TMPSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-LOAD-16-NEXT: [[TMP5:%.*]] = icmp ult <vscale x 16 x i32> [[TMP4]], [[TMPSPLAT]]
+; CHECK-LOAD-16-NEXT: [[TMP6:%.*]] = select <vscale x 16 x i1> [[TMP1]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> zeroinitializer
+; CHECK-LOAD-16-NEXT: [[TMP7:%.*]] = call <vscale x 16 x i32> @llvm.masked.load.nxv16i32.p1(ptr addrspace(1) [[TMP0]], i32 4, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i32> {{undef|poison}})
+; CHECK-LOAD-16-NEXT: ret <vscale x 16 x i32> [[TMP7]]
+
+declare spir_func i64 @_Z13get_global_idj(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_vsetvli.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_vsetvli.ll
new file mode 100644
index 0000000000000..e7de572a7d473
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_vsetvli.ll
@@ -0,0 +1,48 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; REQUIRES: llvm-14+
+
+; RUN: %veczc -vecz-target-triple="riscv64-unknown-unknown" -vecz-target-features=+v -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+define spir_kernel void @load_add_store(i32* %aptr, i32* %bptr, i32* %zptr) { ; scalar kernel under test: zptr[i] = aptr[i] + bptr[i]
+entry:
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0) ; i = result of the get_global_id call for dimension 0
+  %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+  %arrayidxb = getelementptr inbounds i32, i32* %bptr, i64 %idx
+  %arrayidxz = getelementptr inbounds i32, i32* %zptr, i64 %idx
+  %a = load i32, i32* %arrayidxa, align 4 ; expected to become an llvm.vp.load in the vectorized kernel
+  %b = load i32, i32* %arrayidxb, align 4 ; likewise expected to become an llvm.vp.load
+  %sum = add i32 %a, %b ; expected to become llvm.vp.add
+  store i32 %sum, i32* %arrayidxz, align 4 ; expected to become llvm.vp.store
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_nxv4_vp_load_add_store
+; CHECK: %local.id = call i64 @__mux_get_local_id(i32 0)
+; CHECK: %local.size = call i64 @__mux_get_local_size(i32 0)
+; CHECK: %work.remaining = sub nuw nsw i64 %local.size, %local.id
+; CHECK: %[[vli64:.+]] = call i64 @llvm.riscv.vsetvli.opt.i64(i64 %work.remaining, i64 2, i64 1)
+; CHECK: %[[vl:.+]] = trunc i64 %[[vli64]] to i32
+; CHECK: %[[lhs:.+]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0({{.*}}, i32 %[[vl]])
+; CHECK: %[[rhs:.+]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0({{.*}}, i32 %[[vl]])
+; CHECK: %[[sum:.+]] = call <vscale x 4 x i32> @llvm.vp.add.nxv4i32(<vscale x 4 x i32> %[[lhs]], <vscale x 4 x i32> %[[rhs]], {{.*}}, i32 %[[vl]])
+; CHECK: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> %[[sum]], {{.*}}, i32 %[[vl]])
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll
new file mode 100644
index 0000000000000..bf89c3b4fb613
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll
@@ -0,0 +1,181 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; NOTE: Assertions have been autogenerated by scripts/testing/update_veczc_checks.py
+; REQUIRES: llvm-13+
+; RUN: %veczc -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+define dso_local spir_kernel void @vector_broadcast_const(<4 x float> addrspace(1)* nocapture readonly %in, <4 x float> addrspace(1)* nocapture %out) local_unnamed_addr #0 { ; adds a constant splat to in[gid] and stores the sum to out[gid]
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x float> addrspace(1)* ; source and destination types are identical
+  %1 = load <4 x float>, <4 x float> addrspace(1)* %0, align 16
+  %2 = fadd <4 x float> %1, <float 0x7FF0000020000000, float 0x7FF0000020000000, float 0x7FF0000020000000, float 0x7FF0000020000000> ; the addend is a NaN bit pattern; the expected output stores the widened splat directly, with no load or fadd left
+  %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call
+  store <4 x float> %2, <4 x float> addrspace(1)* %arrayidx3, align 16
+  ret void
+}
+
+define dso_local spir_kernel void @vector_broadcast(<4 x float> addrspace(1)* nocapture readonly %in, <4 x float> %addend, <4 x float> addrspace(1)* nocapture %out) local_unnamed_addr #0 { ; adds the kernel argument %addend to in[gid] and stores the sum to out[gid]
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x float> addrspace(1)* ; source and destination types are identical
+  %1 = load <4 x float>, <4 x float> addrspace(1)* %0, align 16
+  %2 = fadd <4 x float> %1, %addend ; %addend has to be repeated across the widened vector; the expected output does so via a stack slot and a masked gather
+  %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call
+  store <4 x float> %2, <4 x float> addrspace(1)* %arrayidx3, align 16
+  ret void
+}
+
+define dso_local spir_kernel void @vector_broadcast_regression(<4 x float> addrspace(1)* nocapture readonly %in, i32 %nancode, <4 x float> addrspace(1)* nocapture %out) local_unnamed_addr #0 { ; replaces NaN lanes of in[gid] with a constant; %nancode is not used in the body
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x i32> addrspace(1)* ; reinterpret the floats as raw 32-bit integers
+  %1 = load <4 x i32>, <4 x i32> addrspace(1)* %0, align 16
+  %and1.i.i.i1.i = and <4 x i32> %1, <i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040> ; 2139095040 = 0x7F800000: isolate the exponent bits
+  %cmp.i.i.i2.i = icmp ne <4 x i32> %and1.i.i.i1.i, <i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040> ; true when the exponent is not all ones
+  %and2.i.i.i3.i = and <4 x i32> %1, <i32 8388607, i32 8388607, i32 8388607, i32 8388607> ; 8388607 = 0x007FFFFF: isolate the mantissa bits
+  %cmp3.i.i.i4.i = icmp eq <4 x i32> %and2.i.i.i3.i, zeroinitializer ; true when the mantissa is zero
+  %2 = or <4 x i1> %cmp.i.i.i2.i, %cmp3.i.i.i4.i ; true for every lane that is not a NaN
+  %3 = bitcast <4 x i32> %1 to <4 x float>
+  %4 = select <4 x i1> %2, <4 x float> %3, <4 x float> <float 0x7FF0000020000000, float 0x7FF0000020000000, float 0x7FF0000020000000, float 0x7FF0000020000000> ; NaN lanes are replaced by this constant NaN bit pattern
+  %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call
+  store <4 x float> %4, <4 x float> addrspace(1)* %arrayidx3, align 16
+  ret void
+}
+
+; Check that new instructions aren't inserted before pre-existing allocas
+define dso_local spir_kernel void @vector_broadcast_insertpt(<4 x float> addrspace(1)* nocapture readonly %in, <4 x float> %addend, i32 %nancode, <4 x float> addrspace(1)* nocapture %out, <4 x i32> addrspace(1)* nocapture %out2) local_unnamed_addr #0 { ; %nancode is not used in the body
+entry:
+  %existing.alloc = alloca <4 x i32> ; pre-existing alloca; the expected output keeps it as the first instruction of the entry block
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  store <4 x i32> zeroinitializer, <4 x i32>* %existing.alloc
+  %scalar = bitcast <4 x i32>* %existing.alloc to i32* ; i32 view of the start of the same stack slot
+  store i32 1, i32* %scalar
+  %v = load <4 x i32>, <4 x i32>* %existing.alloc ; the value that is written to out2[gid]
+  %arrayidx4 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out2, i64 %call
+  store <4 x i32> %v, <4 x i32> addrspace(1)* %arrayidx4, align 16
+
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %op = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
+  %v4 = fadd <4 x float> %op, %addend ; %addend is a kernel argument; the expected output spills it to a new alloca placed after %existing.alloc
+  %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call
+  store <4 x float> %v4, <4 x float> addrspace(1)* %arrayidx3, align 16
+  ret void
+}
+
+define dso_local spir_kernel void @vector_mask_broadcast(<4 x float> addrspace(1)* nocapture readonly %in, <4 x i1> %input, <4 x float> %woof, <4 x float> addrspace(1)* nocapture %out) local_unnamed_addr #0 { ; combines a per-work-item comparison with the <4 x i1> kernel argument %input
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x float> addrspace(1)* ; source and destination types are identical
+  %1 = load <4 x float>, <4 x float> addrspace(1)* %0, align 16
+  %2 = fcmp oeq <4 x float> %1, <float 1.0, float 1.0, float 1.0, float 1.0>
+  %3 = and <4 x i1> %2, %input ; the expected output sign-extends %input to <4 x i8>, spills it, gathers it back and truncates to i1 before this and
+  %4 = select <4 x i1> %3, <4 x float> %1, <4 x float> %woof
+  %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call
+  store <4 x float> %4, <4 x float> addrspace(1)* %arrayidx3, align 16
+  ret void
+}
+; CHECK-LABEL: @__vecz_nxv4_vector_broadcast_const(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CALL:%.*]] = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]]
+; CHECK-NEXT:    store <vscale x 16 x float> shufflevector (<vscale x 16 x float> insertelement (<vscale x 16 x float> {{(undef|poison)}}, float 0x7FF0000020000000, {{(i32|i64)}} 0), <vscale x 16 x float> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer), ptr addrspace(1) [[ARRAYIDX3]], align 16
+; CHECK-NEXT:    ret void
+; The gather passthru operand below may print as undef or poison, matching the other gathers in this file.
+; CHECK-LABEL: @__vecz_nxv4_vector_broadcast(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[FIXLEN_ALLOC:%.*]] = alloca <4 x float>, align 16
+; CHECK-NEXT:    store <4 x float> [[ADDEND:%.*]], ptr [[FIXLEN_ALLOC]], align 16
+; CHECK-NEXT:    [[IDX0:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
+; CHECK-NEXT:    [[IDX1:%.*]] = and <vscale x 16 x i32> [[IDX0]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 3, {{(i32|i64)}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP0:%.*]] = {{s|z}}ext <vscale x 16 x i32> [[IDX1]] to <vscale x 16 x i64>
+; CHECK-NEXT:    [[VEC_ALLOC:%.*]] = getelementptr inbounds float, ptr [[FIXLEN_ALLOC]], <vscale x 16 x i64> [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 16 x float> @llvm.masked.gather.nxv16f32.nxv16p0(<vscale x 16 x ptr> [[VEC_ALLOC]], i32 4, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, {{(i32|i64)}} 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x float> {{(undef|poison)}})
+; CHECK-NEXT:    [[CALL:%.*]] = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load <vscale x 16 x float>, ptr addrspace(1) [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <vscale x 16 x float> [[TMP3]], [[TMP1]]
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]]
+; CHECK-NEXT:    store <vscale x 16 x float> [[TMP4]], ptr addrspace(1) [[ARRAYIDX3]], align 16
+; CHECK-NEXT:    ret void
+
+; CHECK-LABEL: @__vecz_nxv4_vector_broadcast_regression(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CALL:%.*]] = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 16 x i32>, ptr addrspace(1) [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[AND1_I_I_I1_I1:%.*]] = and <vscale x 16 x i32> [[TMP1]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 2139095040, {{i32|i64}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[CMP_I_I_I2_I2:%.*]] = icmp ne <vscale x 16 x i32> [[AND1_I_I_I1_I1]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 2139095040, {{i32|i64}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[AND2_I_I_I3_I3:%.*]] = and <vscale x 16 x i32> [[TMP1]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 8388607, {{i32|i64}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[CMP3_I_I_I4_I4:%.*]] = icmp eq <vscale x 16 x i32> [[AND2_I_I_I3_I3]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = or <vscale x 16 x i1> [[CMP_I_I_I2_I2]], [[CMP3_I_I_I4_I4]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <vscale x 16 x i32> [[TMP1]] to <vscale x 16 x float>
+; CHECK-NEXT:    [[TMP4:%.*]] = select <vscale x 16 x i1> [[TMP2]], <vscale x 16 x float> [[TMP3]], <vscale x 16 x float> shufflevector (<vscale x 16 x float> insertelement (<vscale x 16 x float> {{(undef|poison)}}, float 0x7FF0000020000000, {{i32|i64}} 0), <vscale x 16 x float> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]]
+; CHECK-NEXT:    store <vscale x 16 x float> [[TMP4]], ptr addrspace(1) [[ARRAYIDX3]], align 16
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: @__vecz_nxv4_vector_broadcast_insertpt(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[EXISTING_ALLOC:%.*]] = alloca <4 x i32>, align 16
+; CHECK-NEXT:    [[FIXLEN_ALLOC:%.*]] = alloca <4 x i32>, align 16
+; CHECK-NEXT:    [[FIXLEN_ALLOC1:%.*]] = alloca <4 x float>, align 16
+; CHECK-NEXT:    store <4 x float> [[ADDEND:%.*]], ptr [[FIXLEN_ALLOC1]], align 16
+; CHECK-NEXT:    [[IDX03:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
+; CHECK-NEXT:    [[IDX14:%.*]] = and <vscale x 16 x i32> [[IDX03]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 3, {{i32|i64}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP0:%.*]] = {{s|z}}ext <vscale x 16 x i32> [[IDX14]] to <vscale x 16 x i64>
+; CHECK-NEXT:    [[VEC_ALLOC5:%.*]] = getelementptr inbounds float, ptr [[FIXLEN_ALLOC1]], <vscale x 16 x i64> [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 16 x float> @llvm.masked.gather.nxv16f32.nxv16p0(<vscale x 16 x ptr> [[VEC_ALLOC5]], i32 4, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, {{i32|i64}} 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x float> {{(undef|poison)}})
+; CHECK-NEXT:    [[CALL:%.*]] = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr [[EXISTING_ALLOC]], align 16
+; CHECK-NEXT:    store i32 1, ptr [[EXISTING_ALLOC]], align 16
+; CHECK-NEXT:    [[V:%.*]] = load <4 x i32>, ptr [[EXISTING_ALLOC]], align 16
+; CHECK-NEXT:    store <4 x i32> [[V]], ptr [[FIXLEN_ALLOC]], align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = {{s|z}}ext <vscale x 16 x i32> [[IDX14]] to <vscale x 16 x i64>
+; CHECK-NEXT:    [[VEC_ALLOC:%.*]] = getelementptr inbounds i32, ptr [[FIXLEN_ALLOC]], <vscale x 16 x i64> [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 16 x i32> @llvm.masked.gather.nxv16i32.nxv16p0(<vscale x 16 x ptr> [[VEC_ALLOC]], i32 4, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, {{i32|i64}} 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x i32> {{(undef|poison)}})
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[OUT2:%.*]], i64 [[CALL]]
+; CHECK-NEXT:    store <vscale x 16 x i32> [[TMP3]], ptr addrspace(1) [[ARRAYIDX4]], align 16
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]]
+; CHECK-NEXT:    [[TMP6:%.*]] = load <vscale x 16 x float>, ptr addrspace(1) [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[V46:%.*]] = fadd <vscale x 16 x float> [[TMP6]], [[TMP1]]
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]]
+; CHECK-NEXT:    store <vscale x 16 x float> [[V46]], ptr addrspace(1) [[ARRAYIDX3]], align 16
+; CHECK-NEXT:    ret void
+;
+; CHECK-LABEL: @__vecz_nxv4_vector_mask_broadcast(
+; CHECK-NEXT:  entry:
+; CHECK:    [[FIXLEN_MASK_ALLOC:%.*]] = alloca <4 x i8>, align 4
+; CHECK:    [[IDX0:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
+; CHECK:    [[IDX1:%.*]] = and <vscale x 16 x i32> [[IDX0]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 3, {{i32|i64}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
+; CHECK:    [[SEXT:%.*]] = sext <4 x i1> [[INPUT:%.*]] to <4 x i8>
+; CHECK:    store <4 x i8> [[SEXT]], ptr [[FIXLEN_MASK_ALLOC]], align 4
+; CHECK:    [[TMP0:%.*]] = {{s|z}}ext <vscale x 16 x i32> [[IDX1]] to <vscale x 16 x i64>
+; CHECK:    [[VEC_ALLOC:%.*]] = getelementptr inbounds i8, ptr [[FIXLEN_MASK_ALLOC]], <vscale x 16 x i64> [[TMP0]]
+; CHECK:    [[TMP1:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[VEC_ALLOC]], i32 1, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, {{i32|i64}} 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x i8> {{(undef|poison)}})
+; CHECK:    [[BMASK:%.*]] = trunc <vscale x 16 x i8> [[TMP1]] to <vscale x 16 x i1>
+; CHECK:    {{.*}} = and <vscale x 16 x i1> {{.*}}, [[BMASK]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/builtins.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/builtins.ll
new file mode 100644
index 0000000000000..8a5985633bfd3
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/builtins.ll
@@ -0,0 +1,40 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k builtins -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @builtins(float* %aptr, float* %bptr, i32* %zptr) { ; scalar kernel under test: zptr[i] = isgreater(aptr[i], bptr[i])
+entry:
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidxa = getelementptr inbounds float, float* %aptr, i64 %idx
+  %arrayidxb = getelementptr inbounds float, float* %bptr, i64 %idx
+  %arrayidxz = getelementptr inbounds i32, i32* %zptr, i64 %idx
+  %a = load float, float* %arrayidxa, align 4
+  %b = load float, float* %arrayidxb, align 4
+  %cmp = call spir_func i32 @_Z9isgreaterff(float %a, float %b) ; builtin call; the expected output contains an fcmp ogt and a zext on scalable vectors instead
+  store i32 %cmp, i32* %arrayidxz, align 4
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func i32 @_Z9isgreaterff(float, float)
+; SSA names in the vectorized kernel are chosen by the vectorizer, so operands are matched by pattern rather than by name.
+; CHECK: void @__vecz_nxv4_builtins
+; CHECK:   = fcmp ogt <vscale x 4 x float> %{{.*}}, %{{.*}}
+; CHECK:   = zext <vscale x 4 x i1> %{{.*}} to <vscale x 4 x i32>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/cast.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/cast.ll
new file mode 100644
index 0000000000000..fdf25e13b438a
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/cast.ll
@@ -0,0 +1,35 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k cast -vecz-scalable -vecz-simd-width=8 -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @cast(i32* %aptr, float* %zptr) { ; scalar kernel under test: zptr[i] = (float)aptr[i]
+entry:
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+  %arrayidxz = getelementptr inbounds float, float* %zptr, i64 %idx
+  %a = load i32, i32* %arrayidxa, align 4
+  %c = sitofp i32 %a to float ; the expected output performs this conversion on <vscale x 8 x i32>
+  store float %c, float* %arrayidxz, align 4
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_nxv8_cast
+; CHECK: sitofp <vscale x 8 x i32> {{%[0-9]+}} to <vscale x 8 x float>
+declare spir_func i64 @_Z13get_global_idj(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store.ll
new file mode 100644
index 0000000000000..f12c2871782cd
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store.ll
@@ -0,0 +1,65 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; REQUIRES: llvm-13+
+; RUN: %veczc -k f -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) { ; %flag is not used in the body
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %add.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call
+  %.cast = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %add.ptr, i64 0, i64 0 ; address of element 0 of b[gid]
+  %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
+  call spir_func void @_Z7barrierj(i32 2)
+  store double 1.600000e+01, double addrspace(1)* %.cast, align 8 ; scalar store to element 0 of b[gid]; presumably the access that becomes the interleaved store builtin verified at the end of this file - TODO confirm
+  %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 ; reload of b[gid] after the scalar store
+  %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
+  %vecins7 = shufflevector <4 x double> %vecins5, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 7> ; net result: elements 0-1 come from %0, elements 2-3 from %1
+  %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call
+  %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+  %arrayidx8 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %d, i64 %call
+  %3 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx8, align 32
+  %arrayidx9 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %e, i64 %call
+  %4 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx9, align 32
+  %div = fdiv <4 x double> %3, %4
+  %5 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %vecins7, <4 x double> %2, <4 x double> %div) ; %vecins7 * c[gid] + d[gid] / e[gid]
+  %arrayidx10 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %a, i64 %call
+  %6 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx10, align 32
+  %sub = fsub <4 x double> %6, %5
+  store <4 x double> %sub, <4 x double> addrspace(1)* %arrayidx10, align 32 ; a[gid] is updated in place
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+declare spir_func void @_Z7barrierj(i32) #1
+
+; Function Attrs: nounwind readnone
+declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) #2
+
+; Test if the interleaved store is defined correctly
+; CHECK: define void @__vecz_b_interleaved_store8_4_u5nxv4du3ptrU3AS1(<vscale x 4 x double>{{( %0)?}}, ptr addrspace(1){{( %1)?}})
+; CHECK: entry:
+; CHECK: %BroadcastAddr.splatinsert = insertelement <vscale x 4 x ptr addrspace(1)> {{poison|undef}}, ptr addrspace(1) %1, {{i32|i64}} 0
+; CHECK: %BroadcastAddr.splat = shufflevector <vscale x 4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <vscale x 4 x ptr addrspace(1)> {{poison|undef}}, <vscale x 4 x i32> zeroinitializer
+; CHECK: %2 = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; CHECK: %3 = mul <vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 4, {{i32|i64}} 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer), %2
+; CHECK: %4 = getelementptr double, <vscale x 4 x ptr addrspace(1)> %BroadcastAddr.splat, <vscale x 4 x i64> %3
+; CHECK: call void @llvm.masked.scatter.nxv4f64.nxv4p1(<vscale x 4 x double> %0, <vscale x 4 x ptr addrspace(1)> %4, i32{{( immarg)?}} 8, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, {{i32|i64}} 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store_as_masked.ll
new file mode 100644
index 0000000000000..7941d226f9982
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store_as_masked.ll
@@ -0,0 +1,65 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; REQUIRES: llvm-13+
+; RUN: %veczc -k f -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) { ; %flag is not used in the body
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %add.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call
+  %.cast = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %add.ptr, i64 0, i64 0 ; address of element 0 of b[gid]
+  %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
+  call spir_func void @_Z7barrierj(i32 2)
+  store double 1.600000e+01, double addrspace(1)* %.cast, align 8 ; scalar store to element 0 of b[gid]; presumably the access that becomes the interleaved store builtin verified at the end of this file - TODO confirm
+  %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 ; reload of b[gid] after the scalar store
+  %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
+  %vecins7 = shufflevector <4 x double> %vecins5, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 7> ; net result: elements 0-1 come from %0, elements 2-3 from %1
+  %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call
+  %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+  %arrayidx8 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %d, i64 %call
+  %3 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx8, align 32
+  %arrayidx9 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %e, i64 %call
+  %4 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx9, align 32
+  %div = fdiv <4 x double> %3, %4
+  %5 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %vecins7, <4 x double> %2, <4 x double> %div) ; %vecins7 * c[gid] + d[gid] / e[gid]
+  %arrayidx10 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %a, i64 %call
+  %6 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx10, align 32
+  %sub = fsub <4 x double> %6, %5
+  store <4 x double> %sub, <4 x double> addrspace(1)* %arrayidx10, align 32 ; a[gid] is updated in place
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+declare spir_func void @_Z7barrierj(i32)
+
+declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>)
+
+; Test that the interleaved store builtin is defined correctly; poison/undef and an optional immarg are both accepted, as in define_interleaved_store.ll
+; CHECK: define void @__vecz_b_interleaved_store8_4_u5nxv4du3ptrU3AS1(<vscale x 4 x double> %0, ptr addrspace(1) %1) {
+; CHECK: entry:
+; CHECK:   %BroadcastAddr.splatinsert = insertelement <vscale x 4 x ptr addrspace(1)> {{poison|undef}}, ptr addrspace(1) %1, {{i32|i64}} 0
+; CHECK:   %BroadcastAddr.splat = shufflevector <vscale x 4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <vscale x 4 x ptr addrspace(1)> {{poison|undef}}, <vscale x 4 x i32> zeroinitializer
+; CHECK:   %2 = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; CHECK:   %3 = mul <vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 4, {{i32|i64}} 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer), %2
+; CHECK:   %4 = getelementptr double, <vscale x 4 x ptr addrspace(1)> %BroadcastAddr.splat, <vscale x 4 x i64> %3
+; CHECK:   call void @llvm.masked.scatter.nxv4f64.nxv4p1(<vscale x 4 x double> %0, <vscale x 4 x ptr addrspace(1)> %4, i32{{( immarg)?}} 8, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, {{i32|i64}} 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK:   ret void
+; CHECK: }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_load.ll
new file mode 100644
index 0000000000000..86d7f65cfce01
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_load.ll
@@ -0,0 +1,70 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; REQUIRES: llvm-13+
+; RUN: %veczc -k dont_mask_workitem_builtins -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @dont_mask_workitem_builtins(i32 addrspace(2)* %in, i32 addrspace(1)* %out) { ; Test kernel: branches on the local id; one side copies %in to %out, the other stores 42 to %out.
+entry:
+  %call = call spir_func i64 @_Z12get_local_idj(i32 0)
+  %conv = trunc i64 %call to i32
+  %cmp = icmp sgt i32 %conv, 0 ; signed test: truncated local id > 0
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  %call2 = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %conv3 = trunc i64 %call2 to i32
+  %idxprom = sext i32 %conv3 to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(2)* %in, i64 %idxprom
+  %0 = load i32, i32 addrspace(2)* %arrayidx, align 4 ; only load in the kernel; it reads addrspace(2) and sits on the conditional path
+  %idxprom4 = sext i32 %conv3 to i64
+  %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom4
+  store i32 %0, i32 addrspace(1)* %arrayidx5, align 4 ; writes the loaded value to %out at the same index
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %call8 = call spir_func i64 @_Z14get_local_sizej(i32 0)
+  %call9 = call spir_func i64 @_Z12get_group_idj(i32 0)
+  %mul = mul i64 %call9, %call8
+  %add = add i64 %mul, %call ; group id * local size + local id (reuses %call from entry)
+  %sext = shl i64 %add, 32
+  %idxprom11 = ashr exact i64 %sext, 32 ; shl 32 then ashr 32 sign-extends the low 32 bits of %add
+  %arrayidx12 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom11
+  store i32 42, i32 addrspace(1)* %arrayidx12, align 4 ; constant store on the other side of the branch
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  ret void
+}
+
+declare spir_func void @_Z7barrierj(i32)
+
+declare spir_func i64 @_Z12get_local_idj(i32)
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+declare spir_func i64 @_Z14get_local_sizej(i32)
+
+declare spir_func i64 @_Z12get_group_idj(i32)
+
+; Test if the masked load is defined correctly
+; CHECK: define <vscale x 4 x i32> @__vecz_b_masked_load4_u5nxv4ju3ptrU3AS2u5nxv4b(ptr addrspace(2){{( %0)?}}, <vscale x 4 x i1>{{( %1)?}})
+; CHECK: entry:
+; CHECK: %2 = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p2(ptr addrspace(2) %0, i32{{( immarg)?}} 4, <vscale x 4 x i1> %1, <vscale x 4 x i32> {{undef|poison}})
+; CHECK: ret <vscale x 4 x i32> %2
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_scatter_gather.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_scatter_gather.ll
new file mode 100644
index 0000000000000..d3fa01fd2781d
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_scatter_gather.ll
@@ -0,0 +1,90 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; REQUIRES: llvm-13+
+; RUN: %veczc -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @masked_scatter(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %b_index) { ; Test kernel: both sides of the branch store to %b at an index loaded from %b_index.
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %rem = urem i64 %call, 3
+  %cmp = icmp eq i64 %rem, 0 ; true when the global id is a multiple of 3
+  br i1 %cmp, label %if.else, label %if.then ; note the order: the true edge goes to if.else
+
+if.then:                                          ; preds = %entry
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %call
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 ; value to store: a[gid]
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %b_index, i64 %call
+  %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4 ; destination index: b_index[gid]
+  %idxprom = sext i32 %1 to i64
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %idxprom
+  store i32 %0, i32 addrspace(1)* %arrayidx2, align 4 ; b[b_index[gid]] = a[gid]; the address comes from loaded data
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %b_index, i64 %call
+  %2 = load i32, i32 addrspace(1)* %arrayidx3, align 4 ; destination index: b_index[gid]
+  %idxprom4 = sext i32 %2 to i64
+  %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %idxprom4
+  store i32 42, i32 addrspace(1)* %arrayidx5, align 4 ; b[b_index[gid]] = 42
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  ret void
+}
+
+define spir_kernel void @masked_gather(i32 addrspace(1)* %a, i32 addrspace(1)* %a_index, i32 addrspace(1)* %b) { ; Test kernel: one side of the branch loads from %a at an index loaded from %a_index.
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %rem = urem i64 %call, 3
+  %cmp = icmp eq i64 %rem, 0 ; true when the global id is a multiple of 3
+  br i1 %cmp, label %if.else, label %if.then ; note the order: the true edge goes to if.else
+
+if.then:                                          ; preds = %entry
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %a_index, i64 %call
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 ; source index: a_index[gid]
+  %idxprom = sext i32 %0 to i64
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %idxprom
+  %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4 ; a[a_index[gid]]; the address comes from loaded data
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %call
+  store i32 %1, i32 addrspace(1)* %arrayidx2, align 4 ; b[gid] = a[a_index[gid]]
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %call
+  store i32 42, i32 addrspace(1)* %arrayidx3, align 4 ; b[gid] = 42
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; Test if the masked scatter store is defined correctly
+; CHECK: define void @__vecz_b_masked_scatter_store4_u5nxv4ju14nxv4u3ptrU3AS1u5nxv4b(<vscale x 4 x i32>{{( %0)?}}, <vscale x 4 x ptr addrspace(1)>{{( %1)?}}, <vscale x 4 x i1>{{( %2)?}})
+; CHECK: entry:
+; CHECK: call void @llvm.masked.scatter.nxv4i32.nxv4p1(<vscale x 4 x i32> %0, <vscale x 4 x ptr addrspace(1)> %1, i32{{( immarg)?}} 4, <vscale x 4 x i1> %2)
+; CHECK: ret void
+
+; Test if the masked gather load is defined correctly
+; CHECK: define <vscale x 4 x i32> @__vecz_b_masked_gather_load4_u5nxv4ju14nxv4u3ptrU3AS1u5nxv4b(<vscale x 4 x ptr addrspace(1)>{{( %0)?}}, <vscale x 4 x i1>{{( %1)?}})
+; CHECK: entry:
+; CHECK: %2 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p1(<vscale x 4 x ptr addrspace(1)> %0, i32{{( immarg)?}} 4, <vscale x 4 x i1> %1, <vscale x 4 x i32> undef)
+; CHECK: ret <vscale x 4 x i32> %2
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans.ll
new file mode 100644
index 0000000000000..5c40ea5966351
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans.ll
@@ -0,0 +1,183 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; REQUIRES: llvm-13+
+; RUN: %veczc -k dummy -vecz-scalable -vecz-simd-width=4 -vecz-passes=define-builtins -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @dummy(i32 addrspace(2)* %in, i32 addrspace(1)* %out) { ; %in and %out are never read; the body only references scan builtins.
+  ; Dummy uses of the builtins, as we don't define any with zero uses. Every operand is zeroinitializer and every result is unused.
+  %a = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_add_u5nxv4j(<vscale x 4 x i32> zeroinitializer)
+  %b = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_add_u5nxv4j(<vscale x 4 x i32> zeroinitializer)
+  %c = call <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_add_u5nxv4f(<vscale x 4 x float> zeroinitializer)
+  %d = call <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_add_u5nxv4f(<vscale x 4 x float> zeroinitializer)
+  %e = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_u5nxv4j(<vscale x 4 x i32> zeroinitializer)
+  %f = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_smin_u5nxv4j(<vscale x 4 x i32> zeroinitializer)
+  %g = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smax_u5nxv4j(<vscale x 4 x i32> zeroinitializer) ; smax/umin/umax are referenced in inclusive form only
+  %h = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umin_u5nxv4j(<vscale x 4 x i32> zeroinitializer)
+  %i = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umax_u5nxv4j(<vscale x 4 x i32> zeroinitializer)
+  %j = call <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_min_u5nxv4f(<vscale x 4 x float> zeroinitializer)
+  %k = call <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_max_u5nxv4f(<vscale x 4 x float> zeroinitializer)
+  %l = call <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_min_u5nxv4f(<vscale x 4 x float> zeroinitializer)
+  %m = call <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_max_u5nxv4f(<vscale x 4 x float> zeroinitializer)
+  ret void
+}
+
+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_add_u5nxv4j(<vscale x 4 x i32>)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_add_u5nxv4j(<vscale x 4 x i32>{{.*}}) {
+; CHECK: entry:
+; CHECK:   %[[SHUFFLE_ALLOC:.+]] = alloca <vscale x 4 x i32>
+; CHECK:   %[[STEP:.+]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; CHECK:   %[[SCALE:.+]] = call i32 @llvm.vscale.i32()
+; CHECK:   %[[SIZE:.+]] = mul i32 %[[SCALE]], 4
+; CHECK:   br label %loop
+; CHECK: loop:
+; CHECK:   %[[IV:.+]] = phi i32 [ 1, %entry ], [ %[[N2:.+]], %loop ]
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ], [ %[[NEWVEC:.+]], %loop ]
+; CHECK:   %[[MASKPHI:.+]] = phi <vscale x 4 x i32> [ %[[STEP]], %entry ], [ %[[NEWMASK:.+]], %loop ]
+; CHECK:   %[[N_INS:.+]] = insertelement <vscale x 4 x i32> poison, i32 %[[IV]], {{i32|i64}} 0
+; CHECK:   %[[N_SPLAT:.+]] = shufflevector <vscale x 4 x i32> %[[N_INS]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK:   %[[MASK:.+]] = xor <vscale x 4 x i32> %[[MASKPHI]], %[[N_SPLAT]]
+
+;------- target-dependent dynamic shuffle code:
+; CHECK:   store <vscale x 4 x i32> %[[VEC]], {{(<vscale x 4 x i32>\*)|(ptr)}} %[[SHUFFLE_ALLOC]]
+;------- there will be a bitcast here if pointers are typed
+; CHECK:   %[[INDEX:.+]] = getelementptr inbounds i32, [[PTRTY:(i32\*)|ptr]] %{{.+}}, <vscale x 4 x i32> %[[MASK]]
+; CHECK:   %[[SHUFFLE:.+]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0{{(i32)?}}(<vscale x 4 x [[PTRTY]]> %[[INDEX]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, {{i32|i64}} 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> undef)
+
+; CHECK:   %[[ACCUM:.+]] = add <vscale x 4 x i32> %[[VEC]], %{{.+}}
+; CHECK:   %[[BIT:.+]] = and <vscale x 4 x i32> %[[MASKPHI]], %[[N_SPLAT]]
+; CHECK:   %[[WHICH:.+]] = icmp ne <vscale x 4 x i32> %[[BIT]], zeroinitializer
+; CHECK:   %[[NEWVEC]] = select <vscale x 4 x i1> %[[WHICH]], <vscale x 4 x i32> %[[ACCUM]], <vscale x 4 x i32> %[[VEC]]
+; CHECK:   %[[NEWMASK]] = or <vscale x 4 x i32> %[[MASK]], %[[N_SPLAT]]
+; CHECK:   %[[N2]] = shl nuw nsw i32 %[[IV]], 1
+; CHECK:   %[[CMP:.+]] = icmp ult i32 %[[N2]], %[[SIZE]]
+; CHECK:   br i1 %[[CMP]], label %loop, label %exit
+; CHECK: exit:
+; CHECK:   %[[RESULT:.+]] = phi <vscale x 4 x i32> [ %[[NEWVEC]], %loop ]
+; CHECK:   ret <vscale x 4 x i32> %[[RESULT]]
+; CHECK: }
+
+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_add_u5nxv4j(<vscale x 4 x i32>)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_add_u5nxv4j(<vscale x 4 x i32>{{.*}}) {
+; CHECK: entry:
+; CHECK:   %[[SHUFFLE_ALLOC:.+]] = alloca <vscale x 4 x i32>
+; CHECK:   %[[STEP:.+]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; CHECK:   %[[SCALE:.+]] = call i32 @llvm.vscale.i32()
+; CHECK:   %[[SIZE:.+]] = mul i32 %[[SCALE]], 4
+; CHECK:   br label %loop
+; CHECK: loop:
+; CHECK:   %[[IV:.+]] = phi i32 [ 1, %entry ], [ %[[N2:.+]], %loop ]
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ], [ %[[NEWVEC:.+]], %loop ]
+; CHECK:   %[[MASKPHI:.+]] = phi <vscale x 4 x i32> [ %[[STEP]], %entry ], [ %[[NEWMASK:.+]], %loop ]
+; CHECK:   %[[N_INS:.+]] = insertelement <vscale x 4 x i32> poison, i32 %[[IV]], {{i32|i64}} 0
+; CHECK:   %[[N_SPLAT:.+]] = shufflevector <vscale x 4 x i32> %[[N_INS]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK:   %[[MASK:.+]] = xor <vscale x 4 x i32> %[[MASKPHI]], %[[N_SPLAT]]
+
+;------- target-dependent dynamic shuffle code:
+; CHECK:   store <vscale x 4 x i32> %[[VEC]], {{(<vscale x 4 x i32>\*)|(ptr)}} %[[SHUFFLE_ALLOC]]
+;------- there will be a bitcast here if pointers are typed
+; CHECK:   %[[INDEX:.+]] = getelementptr inbounds i32, [[PTRTY:(i32\*)|ptr]] %{{.+}}, <vscale x 4 x i32> %[[MASK]]
+; CHECK:   %[[SHUFFLE:.+]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0{{(i32)?}}(<vscale x 4 x [[PTRTY]]> %[[INDEX]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, {{i32|i64}} 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> undef)
+
+; CHECK:   %[[ACCUM:.+]] = add <vscale x 4 x i32> %[[VEC]], %{{.+}}
+; CHECK:   %[[BIT:.+]] = and <vscale x 4 x i32> %[[MASKPHI]], %[[N_SPLAT]]
+; CHECK:   %[[WHICH:.+]] = icmp ne <vscale x 4 x i32> %[[BIT]], zeroinitializer
+; CHECK:   %[[NEWVEC]] = select <vscale x 4 x i1> %[[WHICH]], <vscale x 4 x i32> %[[ACCUM]], <vscale x 4 x i32> %[[VEC]]
+; CHECK:   %[[NEWMASK]] = or <vscale x 4 x i32> %[[MASK]], %[[N_SPLAT]]
+; CHECK:   %[[N2]] = shl nuw nsw i32 %[[IV]], 1
+; CHECK:   %[[CMP:.+]] = icmp ult i32 %[[N2]], %[[SIZE]]
+; CHECK:   br i1 %[[CMP]], label %loop, label %exit
+; CHECK: exit:
+; CHECK:   %[[SCAN:.+]] = phi <vscale x 4 x i32> [ %[[NEWVEC]], %loop ]
+
+;------- target-dependent slide-up code:
+; CHECK:   %[[SLIDE:.+]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> %[[SCAN]], i32 -1)
+; CHECK:   %[[RESULT:.+]] = insertelement <vscale x 4 x i32> %[[SLIDE]], i32 0, {{i32|i64}} 0
+
+; CHECK:   ret <vscale x 4 x i32> %[[RESULT]]
+; CHECK: }
+
+; We know the generated code is correct for one scan type,
+; now verify that all the others use the correct binary operations.
+
+declare <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_add_u5nxv4f(<vscale x 4 x float>)
+; CHECK-LABEL: define <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_add_u5nxv4f(<vscale x 4 x float>{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x float> [ %0, %entry ],
+; CHECK:   %{{.+}} = fadd <vscale x 4 x float> %[[VEC]], %{{.+}}
+
+declare <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_add_u5nxv4f(<vscale x 4 x float>)
+; CHECK-LABEL: define <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_add_u5nxv4f(<vscale x 4 x float>{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x float> [ %0, %entry ],
+; CHECK:   %{{.+}} = fadd <vscale x 4 x float> %[[VEC]], %{{.+}}
+
+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_u5nxv4j(<vscale x 4 x i32>)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_u5nxv4j(<vscale x 4 x i32>{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ],
+; CHECK:   %{{.+}} = call <vscale x 4 x i32> @llvm.smin.nxv4i32(<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %{{.+}})
+
+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_smin_u5nxv4j(<vscale x 4 x i32>)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_smin_u5nxv4j(<vscale x 4 x i32>{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ],
+; CHECK:   %{{.+}} = call <vscale x 4 x i32> @llvm.smin.nxv4i32(<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %{{.+}})
+
+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smax_u5nxv4j(<vscale x 4 x i32>)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smax_u5nxv4j(<vscale x 4 x i32>{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ],
+; CHECK:   %{{.+}} = call <vscale x 4 x i32> @llvm.smax.nxv4i32(<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %{{.+}})
+
+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umin_u5nxv4j(<vscale x 4 x i32>)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umin_u5nxv4j(<vscale x 4 x i32>{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ],
+; CHECK:   %{{.+}} = call <vscale x 4 x i32> @llvm.umin.nxv4i32(<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %{{.+}})
+
+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umax_u5nxv4j(<vscale x 4 x i32>)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umax_u5nxv4j(<vscale x 4 x i32>{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ],
+; CHECK:   %{{.+}} = call <vscale x 4 x i32> @llvm.umax.nxv4i32(<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %{{.+}})
+
+declare <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_min_u5nxv4f(<vscale x 4 x float>)
+; CHECK-LABEL: define <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_min_u5nxv4f(<vscale x 4 x float>{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x float> [ %0, %entry ],
+; CHECK:   %{{.+}} = call <vscale x 4 x float> @llvm.minnum.nxv4f32(<vscale x 4 x float> %[[VEC]], <vscale x 4 x float> %{{.+}})
+
+declare <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_max_u5nxv4f(<vscale x 4 x float>)
+; CHECK-LABEL: define <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_max_u5nxv4f(<vscale x 4 x float>{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x float> [ %0, %entry ],
+; CHECK:   %{{.+}} = call <vscale x 4 x float> @llvm.maxnum.nxv4f32(<vscale x 4 x float> %[[VEC]], <vscale x 4 x float> %{{.+}})
+
+declare <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_min_u5nxv4f(<vscale x 4 x float>)
+; CHECK-LABEL: define <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_min_u5nxv4f(<vscale x 4 x float>{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x float> [ %0, %entry ],
+; CHECK:   %{{.+}} = call <vscale x 4 x float> @llvm.minnum.nxv4f32(<vscale x 4 x float> %[[VEC]], <vscale x 4 x float> %{{.+}})
+
+declare <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_max_u5nxv4f(<vscale x 4 x float>)
+; CHECK-LABEL: define <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_max_u5nxv4f(<vscale x 4 x float>{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x float> [ %0, %entry ],
+; CHECK:   %{{.+}} = call <vscale x 4 x float> @llvm.maxnum.nxv4f32(<vscale x 4 x float> %[[VEC]], <vscale x 4 x float> %{{.+}})
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans_vp.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans_vp.ll
new file mode 100644
index 0000000000000..fae026f2ec3ac
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans_vp.ll
@@ -0,0 +1,187 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; REQUIRES: llvm-13+
+; RUN: %veczc -k dummy -vecz-scalable -vecz-simd-width=4 -vecz-passes=define-builtins -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @dummy(i32 addrspace(2)* %in, i32 addrspace(1)* %out) { ; %in and %out are never read; the body only references the _vp scan builtins.
+  ; Dummy uses of the builtins, as we don't define any with zero uses. Each call passes zeroinitializer plus an i32 0 second operand; results are unused.
+  %a = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4jj(<vscale x 4 x i32> zeroinitializer, i32 0)
+  %b = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_add_vp_u5nxv4jj(<vscale x 4 x i32> zeroinitializer, i32 0)
+  %c = call <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4fj(<vscale x 4 x float> zeroinitializer, i32 0)
+  %d = call <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_add_vp_u5nxv4fj(<vscale x 4 x float> zeroinitializer, i32 0)
+  %e = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_vp_u5nxv4jj(<vscale x 4 x i32> zeroinitializer, i32 0)
+  %f = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_smin_vp_u5nxv4jj(<vscale x 4 x i32> zeroinitializer, i32 0)
+  %g = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smax_vp_u5nxv4jj(<vscale x 4 x i32> zeroinitializer, i32 0) ; smax/umin/umax are referenced in inclusive form only
+  %h = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umin_vp_u5nxv4jj(<vscale x 4 x i32> zeroinitializer, i32 0)
+  %i = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umax_vp_u5nxv4jj(<vscale x 4 x i32> zeroinitializer, i32 0)
+  %j = call <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_min_vp_u5nxv4fj(<vscale x 4 x float> zeroinitializer, i32 0)
+  %k = call <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_max_vp_u5nxv4fj(<vscale x 4 x float> zeroinitializer, i32 0)
+  %l = call <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_min_vp_u5nxv4fj(<vscale x 4 x float> zeroinitializer, i32 0)
+  %m = call <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_max_vp_u5nxv4fj(<vscale x 4 x float> zeroinitializer, i32 0)
+  ret void
+}
+
+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4jj(<vscale x 4 x i32>, i32)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4jj(<vscale x 4 x i32>{{.*}}, i32{{.*}}) {
+; CHECK: entry:
+; CHECK:   %[[SHUFFLE_ALLOC:.+]] = alloca <vscale x 4 x i32>
+; CHECK:   %[[STEP:.+]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; CHECK:   br label %loop
+; CHECK: loop:
+; CHECK:   %[[IV:.+]] = phi i32 [ 1, %entry ], [ %[[N2:.+]], %loop ]
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ], [ %[[NEWVEC:.+]], %loop ]
+; CHECK:   %[[MASKPHI:.+]] = phi <vscale x 4 x i32> [ %[[STEP]], %entry ], [ %[[NEWMASK:.+]], %loop ]
+; CHECK:   %[[N_INS:.+]] = insertelement <vscale x 4 x i32> poison, i32 %[[IV]], {{i32|i64}} 0
+; CHECK:   %[[N_SPLAT:.+]] = shufflevector <vscale x 4 x i32> %[[N_INS]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK:   %[[MASK:.+]] = xor <vscale x 4 x i32> %[[MASKPHI]], %[[N_SPLAT]]
+
+;------- target-dependent dynamic shuffle code:
+; CHECK:   store <vscale x 4 x i32> %[[VEC]], {{(<vscale x 4 x i32>\*)|(ptr)}} %[[SHUFFLE_ALLOC]]
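+;------- there will be a bitcast here if pointers are typed. NOTE(review): VLSTEP is captured below but never referenced; the icmp pattern matches a literal %3 instead -- confirm this is intended.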
+; CHECK:   %[[INDEX:.+]] = getelementptr inbounds i32, [[PTRTY:(i32\*)|ptr]] %{{.+}}, <vscale x 4 x i32> %[[MASK]]
+; CHECK:   %[[VLSTEP:.+]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; CHECK:   %[[VLINS:.+]] = insertelement <vscale x 4 x i32> poison, i32 %1, {{i32|i64}} 0
+; CHECK:   %[[VLSPLAT:.+]] = shufflevector <vscale x 4 x i32> %[[VLINS]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK:   %[[VLMASK:.+]] = icmp ult <vscale x 4 x i32> %3, %[[VLSPLAT]]
+; CHECK:   %[[SHUFFLE:.+]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0{{(i32)?}}(<vscale x 4 x [[PTRTY]]> %[[INDEX]], i32 4, <vscale x 4 x i1> %[[VLMASK]], <vscale x 4 x i32> undef)
+
+; CHECK:   %[[ACCUM:.+]] = add <vscale x 4 x i32> %[[VEC]], %[[SHUFFLE]]
+; CHECK:   %[[BIT:.+]] = and <vscale x 4 x i32> %[[MASKPHI]], %[[N_SPLAT]]
+; CHECK:   %[[WHICH:.+]] = icmp ne <vscale x 4 x i32> %[[BIT]], zeroinitializer
+; CHECK:   %[[NEWVEC]] = select <vscale x 4 x i1> %[[WHICH]], <vscale x 4 x i32> %[[ACCUM]], <vscale x 4 x i32> %[[VEC]]
+; CHECK:   %[[NEWMASK]] = or <vscale x 4 x i32> %[[MASK]], %[[N_SPLAT]]
+; CHECK:   %[[N2]] = shl nuw nsw i32 %[[IV]], 1
+; CHECK:   %[[CMP:.+]] = icmp ult i32 %[[N2]], %1
+; CHECK:   br i1 %[[CMP]], label %loop, label %exit
+; CHECK: exit:
+; CHECK:   %[[RESULT:.+]] = phi <vscale x 4 x i32> [ %[[NEWVEC]], %loop ]
+; CHECK:   ret <vscale x 4 x i32> %[[RESULT]]
+; CHECK: }
+
+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_add_vp_u5nxv4jj(<vscale x 4 x i32>, i32)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_add_vp_u5nxv4jj(<vscale x 4 x i32>{{.*}}, i32{{.*}}) {
+; CHECK: entry:
+; CHECK:   %[[SHUFFLE_ALLOC:.+]] = alloca <vscale x 4 x i32>
+; CHECK:   %[[STEP:.+]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; CHECK:   br label %loop
+; CHECK: loop:
+; CHECK:   %[[IV:.+]] = phi i32 [ 1, %entry ], [ %[[N2:.+]], %loop ]
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ], [ %[[NEWVEC:.+]], %loop ]
+; CHECK:   %[[MASKPHI:.+]] = phi <vscale x 4 x i32> [ %[[STEP]], %entry ], [ %[[NEWMASK:.+]], %loop ]
+; CHECK:   %[[N_INS:.+]] = insertelement <vscale x 4 x i32> poison, i32 %[[IV]], {{i32|i64}} 0
+; CHECK:   %[[N_SPLAT:.+]] = shufflevector <vscale x 4 x i32> %[[N_INS]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK:   %[[MASK:.+]] = xor <vscale x 4 x i32> %[[MASKPHI]], %[[N_SPLAT]]
+
+;------- target-dependent dynamic shuffle code:
+; CHECK:   store <vscale x 4 x i32> %[[VEC]], {{(<vscale x 4 x i32>\*)|(ptr)}} %[[SHUFFLE_ALLOC]]
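+;------- there will be a bitcast here if pointers are typed. NOTE(review): VLSTEP is captured below but never referenced; the icmp pattern matches a literal %3 instead -- confirm this is intended.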
+; CHECK:   %[[INDEX:.+]] = getelementptr inbounds i32, [[PTRTY:(i32\*)|ptr]] %{{.+}}, <vscale x 4 x i32> %[[MASK]]
+; CHECK:   %[[VLSTEP:.+]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; CHECK:   %[[VLINS:.+]] = insertelement <vscale x 4 x i32> poison, i32 %1, {{i32|i64}} 0
+; CHECK:   %[[VLSPLAT:.+]] = shufflevector <vscale x 4 x i32> %[[VLINS]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK:   %[[VLMASK:.+]] = icmp ult <vscale x 4 x i32> %3, %[[VLSPLAT]]
+; CHECK:   %[[SHUFFLE:.+]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0{{(i32)?}}(<vscale x 4 x [[PTRTY]]> %[[INDEX]], i32 4, <vscale x 4 x i1> %[[VLMASK]], <vscale x 4 x i32> undef)
+
+; CHECK:   %[[ACCUM:.+]] = add <vscale x 4 x i32> %[[VEC]], %{{.+}}
+; CHECK:   %[[BIT:.+]] = and <vscale x 4 x i32> %[[MASKPHI]], %[[N_SPLAT]]
+; CHECK:   %[[WHICH:.+]] = icmp ne <vscale x 4 x i32> %[[BIT]], zeroinitializer
+; CHECK:   %[[NEWVEC]] = select <vscale x 4 x i1> %[[WHICH]], <vscale x 4 x i32> %[[ACCUM]], <vscale x 4 x i32> %[[VEC]]
+; CHECK:   %[[NEWMASK]] = or <vscale x 4 x i32> %[[MASK]], %[[N_SPLAT]]
+; CHECK:   %[[N2]] = shl nuw nsw i32 %[[IV]], 1
+; CHECK:   %[[CMP:.+]] = icmp ult i32 %[[N2]], %1
+; CHECK:   br i1 %[[CMP]], label %loop, label %exit
+; CHECK: exit:
+; CHECK:   %[[SCAN:.+]] = phi <vscale x 4 x i32> [ %[[NEWVEC]], %loop ]
+
+;------- target-dependent slide-up code:
+; CHECK:   %[[SLIDE:.+]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> %[[SCAN]], i32 -1)
+; CHECK:   %[[RESULT:.+]] = insertelement <vscale x 4 x i32> %[[SLIDE]], i32 0, {{i32|i64}} 0
+
+; CHECK:   ret <vscale x 4 x i32> %[[RESULT]]
+; CHECK: }
+
+; We know the generated code is correct for one scan type,
+; now verify that all the others use the correct binary operations.
+
+declare <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4fj(<vscale x 4 x float>, i32)
+; CHECK-LABEL: define <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4fj(<vscale x 4 x float>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x float> [ %0, %entry ],
+; CHECK:   %{{.+}} = fadd <vscale x 4 x float> %[[VEC]], %{{.+}}
+
+declare <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_add_vp_u5nxv4fj(<vscale x 4 x float>, i32)
+; CHECK-LABEL: define <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_add_vp_u5nxv4fj(<vscale x 4 x float>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x float> [ %0, %entry ],
+; CHECK:   %{{.+}} = fadd <vscale x 4 x float> %[[VEC]], %{{.+}}
+
+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_vp_u5nxv4jj(<vscale x 4 x i32>, i32)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_vp_u5nxv4jj(<vscale x 4 x i32>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ],
+; CHECK:   %{{.+}} = call <vscale x 4 x i32> @llvm.smin.nxv4i32(<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %{{.+}})
+
+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_smin_vp_u5nxv4jj(<vscale x 4 x i32>, i32)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_smin_vp_u5nxv4jj(<vscale x 4 x i32>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ],
+; CHECK:   %{{.+}} = call <vscale x 4 x i32> @llvm.smin.nxv4i32(<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %{{.+}})
+
+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smax_vp_u5nxv4jj(<vscale x 4 x i32>, i32)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smax_vp_u5nxv4jj(<vscale x 4 x i32>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ],
+; CHECK:   %{{.+}} = call <vscale x 4 x i32> @llvm.smax.nxv4i32(<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %{{.+}})
+
+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umin_vp_u5nxv4jj(<vscale x 4 x i32>, i32)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umin_vp_u5nxv4jj(<vscale x 4 x i32>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ],
+; CHECK:   %{{.+}} = call <vscale x 4 x i32> @llvm.umin.nxv4i32(<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %{{.+}})
+
+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umax_vp_u5nxv4jj(<vscale x 4 x i32>, i32)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umax_vp_u5nxv4jj(<vscale x 4 x i32>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ],
+; CHECK:   %{{.+}} = call <vscale x 4 x i32> @llvm.umax.nxv4i32(<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %{{.+}})
+
+declare <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_min_vp_u5nxv4fj(<vscale x 4 x float>, i32)
+; CHECK-LABEL: define <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_min_vp_u5nxv4fj(<vscale x 4 x float>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x float> [ %0, %entry ],
+; CHECK:   %{{.+}} = call <vscale x 4 x float> @llvm.minnum.nxv4f32(<vscale x 4 x float> %[[VEC]], <vscale x 4 x float> %{{.+}})
+
+declare <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_max_vp_u5nxv4fj(<vscale x 4 x float>, i32)
+; CHECK-LABEL: define <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_max_vp_u5nxv4fj(<vscale x 4 x float>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x float> [ %0, %entry ],
+; CHECK:   %{{.+}} = call <vscale x 4 x float> @llvm.maxnum.nxv4f32(<vscale x 4 x float> %[[VEC]], <vscale x 4 x float> %{{.+}})
+
+declare <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_min_vp_u5nxv4fj(<vscale x 4 x float>, i32)
+; CHECK-LABEL: define <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_min_vp_u5nxv4fj(<vscale x 4 x float>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x float> [ %0, %entry ],
+; CHECK:   %{{.+}} = call <vscale x 4 x float> @llvm.minnum.nxv4f32(<vscale x 4 x float> %[[VEC]], <vscale x 4 x float> %{{.+}})
+
+declare <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_max_vp_u5nxv4fj(<vscale x 4 x float>, i32)
+; CHECK-LABEL: define <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_max_vp_u5nxv4fj(<vscale x 4 x float>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x float> [ %0, %entry ],
+; CHECK:   %{{.+}} = call <vscale x 4 x float> @llvm.maxnum.nxv4f32(<vscale x 4 x float> %[[VEC]], <vscale x 4 x float> %{{.+}})
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/extract_element.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/extract_element.ll
new file mode 100644
index 0000000000000..67124dcc8fb8b
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/extract_element.ll
@@ -0,0 +1,145 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; REQUIRES: llvm-13+
+; RUN: %veczc -k extract_element -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s --check-prefix=EE
+; RUN: %veczc -k extract_element_uniform -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s --check-prefix=EE-UNI
+; RUN: %veczc -k extract_element_uniform_vec -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s --check-prefix=EE-UNI-VEC
+; RUN: %veczc -k extract_element_varying_indices -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s --check-prefix=EE-INDICES
+; RUN: %veczc -k extract_element_bool -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s --check-prefix=EE-BOOL
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+define spir_kernel void @extract_element(<4 x float> addrspace(1)* nocapture readonly %in, i32 %idx, float addrspace(1)* nocapture %out) {
+entry: ; extracts lane %idx (a kernel argument) from a vector loaded at the global id
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x float> addrspace(1)* ; source and destination types are identical
+  %1 = load <4 x float>, <4 x float> addrspace(1)* %0, align 16
+  %2 = extractelement <4 x float> %1, i32 %idx
+  %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call
+  store float %2, float addrspace(1)* %arrayidx3, align 4 ; one float written at the global id
+  ret void
+}
+
+define spir_kernel void @extract_element_uniform(<4 x float> %in, i32 %idx, float addrspace(1)* nocapture %out) {
+entry: ; both the vector %in and the index %idx are kernel arguments
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %0 = extractelement <4 x float> %in, i32 %idx
+  %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call ; only the store address depends on the global id
+  store float %0, float addrspace(1)* %arrayidx3, align 4
+  ret void
+}
+
+define spir_kernel void @extract_element_uniform_vec(<4 x float> %in, float addrspace(1)* nocapture %out) {
+entry: ; the vector %in is a kernel argument, the index is derived from the global id
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %i = urem i64 %call, 4 ; keeps the index within the four lanes of %in
+  %0 = extractelement <4 x float> %in, i64 %i
+  %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call
+  store float %0, float addrspace(1)* %arrayidx3, align 4
+  ret void
+}
+
+define spir_kernel void @extract_element_varying_indices(<4 x float> addrspace(1)* %in, i32 addrspace(1)* %idxs, float addrspace(1)* nocapture %out) {
+entry: ; both the vector and the index are loaded from memory at the global id
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidxidx = getelementptr inbounds i32, i32 addrspace(1)* %idxs, i64 %call
+  %idx = load i32, i32 addrspace(1)* %arrayidxidx
+  %i = urem i32 %idx, 4 ; keeps the loaded index within the four lanes
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx
+  %1 = extractelement <4 x float> %0, i32 %i
+  %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call
+  store float %1, float addrspace(1)* %arrayidx3, align 4
+  ret void
+}
+
+define spir_kernel void @extract_element_bool(<4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b, i32 %idx, i32 addrspace(1)* nocapture %out, <4 x i32> addrspace(1)* nocapture %out2) {
+entry: ; extracts one lane from a <4 x i1> comparison result (%idx is not used in the body)
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %arrayidxa = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %a, i64 %call
+  %arrayidxb = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %b, i64 %call
+  %0 = load <4 x i32>, <4 x i32> addrspace(1)* %arrayidxa, align 4
+  %1 = load <4 x i32>, <4 x i32> addrspace(1)* %arrayidxb, align 4
+  %2 = icmp slt <4 x i32> %0, %1 ; the <4 x i1> vector that is extracted from
+  %i = urem i64 %call, 4 ; lane index derived from the global id
+  %3 = extractelement <4 x i1> %2, i64 %i
+  %4 = sext i1 %3 to i32
+  %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %4, i32 addrspace(1)* %arrayidx3, align 4
+  %5 = sext <4 x i1> %2 to <4 x i32> ; the whole mask is also stored, sign-extended, to %out2
+  %arrayidx4 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out2, i64 %call
+  store <4 x i32> %5, <4 x i32> addrspace(1)* %arrayidx4, align 4
+  ret void
+}
+
+
+; EE-LABEL: @__vecz_nxv4_extract_element(
+; EE: [[ALLOC:%.*]] = alloca <vscale x 16 x float>, align 64
+; EE: store <vscale x 16 x float> {{.*}}, ptr [[ALLOC]], align 64
+; EE: [[IDX:%.*]] = sext i32 %idx to i64
+; EE: [[ADDR:%.*]] = getelementptr inbounds float, ptr [[ALLOC]], i64 [[IDX]]
+; EE: [[GATHER:%.*]] = call <vscale x 4 x float> @__vecz_b_interleaved_load4_4_u5nxv4fu3ptr(ptr nonnull [[ADDR]])
+
+; Both the vector and index are uniform, so check we're not unnecessarily packetizing
+
+; EE-UNI-LABEL: @__vecz_nxv4_extract_element_uniform(
+; EE-UNI: [[T0:%.*]] = extractelement <4 x float> %in, i32 %idx
+; EE-UNI: [[T1:%.*]] = insertelement <vscale x 4 x float> poison, float [[T0]], {{(i32|i64)}} 0
+; EE-UNI: [[T2:%.*]] = shufflevector <vscale x 4 x float> [[T1]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+; EE-UNI: store <vscale x 4 x float> [[T2]], ptr addrspace(1) {{%.*}}, align 4
+
+; The vector is uniform and the index is varying, so we must broadcast the vector
+; FIXME: Do we really need to broadcast? Can we mod the indices with the original vector length?
+
+; EE-UNI-VEC-LABEL: @__vecz_nxv4_extract_element_uniform_vec(
+; EE-UNI-VEC: [[T3:%.*]] = insertelement <vscale x 4 x i64> poison, i64 %call, {{(i32|i64)}} 0
+; EE-UNI-VEC: [[T4:%.*]] = shufflevector <vscale x 4 x i64> [[T3]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; EE-UNI-VEC: [[STEP:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; EE-UNI-VEC: [[T5:%.*]] = add <vscale x 4 x i64> [[T4]], [[STEP]]
+; EE-UNI-VEC: [[MOD:%.*]] = and <vscale x 4 x i64> [[T5]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> {{(undef|poison)}}, i64 3, {{(i32|i64)}} 0), <vscale x 4 x i64> {{(undef|poison)}}, <vscale x 4 x i32> zeroinitializer)
+; EE-UNI-VEC: [[T6:%.*]] = shl <vscale x 4 x i64> [[STEP]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> {{(undef|poison)}}, i64 2, {{(i32|i64)}} 0), <vscale x 4 x i64> {{(undef|poison)}}, <vscale x 4 x i32> zeroinitializer)
+
+; LLVM 16 deduces add/or equivalence and uses `or` instead.
+; EE-UNI-VEC: [[T7:%.*]] = {{add|or}} <vscale x 4 x i64> [[T6]], [[MOD]]
+
+; EE-UNI-VEC: [[T8:%.*]] = getelementptr inbounds float, ptr {{%.*}}, <vscale x 4 x i64> [[T7]]
+; EE-UNI-VEC: [[T9:%.*]] = call <vscale x 4 x float> @__vecz_b_gather_load4_u5nxv4fu9nxv4u3ptr(<vscale x 4 x ptr> [[T8]])
+; EE-UNI-VEC: store <vscale x 4 x float> [[T9]], ptr addrspace(1) {{%.*}}, align 4
+
+; EE-INDICES-LABEL: @__vecz_nxv4_extract_element_varying_indices(
+; EE-INDICES: [[ALLOC:%.*]] = alloca <vscale x 16 x float>, align 64
+; EE-INDICES: [[T0:%.*]] = getelementptr inbounds i32, ptr addrspace(1) %idxs, i64 %call
+; EE-INDICES: [[T2:%.*]] = load <vscale x 4 x i32>, ptr addrspace(1) [[T0]], align 4
+; EE-INDICES: [[T3:%.*]] = and <vscale x 4 x i32> [[T2]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> {{(undef|poison)}}, i32 3, {{i32|i64}} 0), <vscale x 4 x i32> {{(undef|poison)}}, <vscale x 4 x i32> zeroinitializer)
+; EE-INDICES: store <vscale x 16 x float> {{.*}}, ptr [[ALLOC]], align 64
+; EE-INDICES: [[STEP:%.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; EE-INDICES: [[T4:%.*]] = shl <vscale x 4 x i32> [[STEP]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> {{(undef|poison)}}, i32 2, {{i32|i64}} 0), <vscale x 4 x i32> {{(undef|poison)}}, <vscale x 4 x i32> zeroinitializer)
+; EE-INDICES: [[T5:%.*]] = {{add|or}} <vscale x 4 x i32> [[T4]], [[T3]]
+; EE-INDICES: [[IDX:%.*]] = sext <vscale x 4 x i32> [[T5]] to <vscale x 4 x i64>
+; EE-INDICES: [[ADDR:%.*]] = getelementptr inbounds float, ptr [[ALLOC]], <vscale x 4 x i64> [[IDX]]
+; EE-INDICES: [[GATHER:%.*]] = call <vscale x 4 x float> @__vecz_b_gather_load4_u5nxv4fu9nxv4u3ptr(<vscale x 4 x ptr> [[ADDR]])
+
+; Check we promote from i1 to i8 before doing our memops
+; EE-BOOL-LABEL: @__vecz_nxv4_extract_element_bool(
+; EE-BOOL: [[T0:%.*]] = sext <vscale x 16 x i1> {{%.*}} to <vscale x 16 x i8>
+; EE-BOOL: store <vscale x 16 x i8> {{.*}}
+; EE-BOOL: [[T1:%.*]] = call <vscale x 4 x i8> @__vecz_b_gather_load1_u5nxv4hu9nxv4u3ptr(<vscale x 4 x ptr> {{%.*}}
+; EE-BOOL: [[T2:%.*]] = trunc <vscale x 4 x i8> [[T1]] to <vscale x 4 x i1>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/fadd.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/fadd.ll
new file mode 100644
index 0000000000000..2ec5a36d63112
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/fadd.ll
@@ -0,0 +1,40 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k fadd -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @fadd(float* %aptr, float* %bptr, float* %zptr) {
+entry: ; adds the floats at %aptr and %bptr and stores the sum to %zptr, all indexed by the global id
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidxa = getelementptr inbounds float, float* %aptr, i64 %idx
+  %arrayidxb = getelementptr inbounds float, float* %bptr, i64 %idx
+  %arrayidxz = getelementptr inbounds float, float* %zptr, i64 %idx
+  %a = load float, float* %arrayidxa, align 4
+  %b = load float, float* %arrayidxb, align 4
+  %sum = fadd float %a, %b
+  store float %sum, float* %arrayidxz, align 4
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_nxv4_fadd
+; CHECK: load <vscale x 4 x float>, ptr
+; CHECK: load <vscale x 4 x float>, ptr
+; CHECK: fadd <vscale x 4 x float>
+; CHECK: store <vscale x 4 x float>
+declare spir_func i64 @_Z13get_global_idj(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/fail_builtins.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/fail_builtins.ll
new file mode 100644
index 0000000000000..4d74540461ed8
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/fail_builtins.ll
@@ -0,0 +1,37 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %not %veczc -k fail_builtins -vecz-scalable -vecz-simd-width=4 -S < %s 2>&1 | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @fail_builtins(float* %aptr, float* %zptr) {
+entry: ; passes a loaded float through an external call and stores the result
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidxa = getelementptr inbounds float, float* %aptr, i64 %idx
+  %arrayidxz = getelementptr inbounds float, float* %zptr, i64 %idx
+  %a = load float, float* %arrayidxa, align 4
+  %math = call spir_func float @_Z4tanff(float %a) ; callee is only declared in this module, no body is available
+  store float %math, float* %arrayidxz, align 4
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func float @_Z4tanff(float)
+
+; We can't scalarize this builtin call
+; CHECK: Error: Failed to vectorize function 'fail_builtins'
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/insert_element.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/insert_element.ll
new file mode 100644
index 0000000000000..56f259fde786a
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/insert_element.ll
@@ -0,0 +1,121 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; REQUIRES: llvm-13+
+; RUN: %veczc -k insert_element -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s --check-prefix=IE
+; RUN: %veczc -k insert_element_uniform -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s --check-prefix=IE-UNI
+; RUN: %veczc -k insert_element_varying_indices -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s --check-prefix=IE-INDICES
+; RUN: %veczc -k insert_element_bool -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s --check-prefix=IE-BOOL
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+define spir_kernel void @insert_element(<4 x float> addrspace(1)* nocapture readonly %in, float %val, i32 %idx, <4 x float> addrspace(1)* nocapture %out) {
+entry: ; inserts %val at lane %idx (both kernel arguments) into a vector loaded at the global id
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x float> addrspace(1)* ; source and destination types are identical
+  %1 = load <4 x float>, <4 x float> addrspace(1)* %0, align 16
+  %2 = insertelement <4 x float> %1, float %val, i32 %idx
+  %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call
+  store <4 x float> %2, <4 x float> addrspace(1)* %arrayidx3, align 4
+  ret void
+}
+
+define spir_kernel void @insert_element_uniform(<4 x float> %in, float %val, i32 %idx, <4 x float> addrspace(1)* nocapture %out) {
+entry: ; the vector, the inserted value and the index are all kernel arguments
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %0 = insertelement <4 x float> %in, float %val, i32 %idx
+  %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call ; only the store address depends on the global id
+  store <4 x float> %0, <4 x float> addrspace(1)* %arrayidx3, align 4
+  ret void
+}
+
+define spir_kernel void @insert_element_varying_indices(<4 x float> addrspace(1)* nocapture readonly %in, i32 addrspace(1)* %idxs, <4 x float> addrspace(1)* nocapture %out) {
+entry: ; the vector, the inserted value and the index all depend on the global id
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %arrayidxidx = getelementptr inbounds i32, i32 addrspace(1)* %idxs, i64 %call
+  %idx = load i32, i32 addrspace(1)* %arrayidxidx
+  %i = urem i32 %idx, 4 ; keeps the loaded index within the four lanes
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x float> addrspace(1)* ; source and destination types are identical
+  %1 = load <4 x float>, <4 x float> addrspace(1)* %0, align 16
+  %fidx = uitofp i64 %call to float ; the inserted value is the global id converted to float
+  %2 = insertelement <4 x float> %1, float %fidx, i32 %i
+  %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call
+  store <4 x float> %2, <4 x float> addrspace(1)* %arrayidx3, align 4
+  ret void
+}
+
+define spir_kernel void @insert_element_bool(<4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b, i32 %val, i32 %idx, <4 x i32> addrspace(1)* nocapture %out) {
+entry: ; inserts an i1 into a <4 x i1> comparison result (%idx is not used in the body)
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %arrayidxa = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %a, i64 %call
+  %arrayidxb = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %b, i64 %call
+  %0 = load <4 x i32>, <4 x i32> addrspace(1)* %arrayidxa, align 4
+  %1 = load <4 x i32>, <4 x i32> addrspace(1)* %arrayidxb, align 4
+  %2 = icmp slt <4 x i32> %0, %1 ; the <4 x i1> vector that is inserted into
+  %i = urem i64 %call, 4 ; lane index derived from the global id
+  %v = trunc i32 %val to i1 ; the inserted value is the low bit of %val
+  %3 = insertelement <4 x i1> %2, i1 %v, i64 %i
+  %4 = sext <4 x i1> %3 to <4 x i32>
+  %arrayidx4 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call
+  store <4 x i32> %4, <4 x i32> addrspace(1)* %arrayidx4, align 4
+  ret void
+}
+
+; IE-LABEL: @__vecz_nxv4_insert_element(
+; IE: [[ALLOC:%.*]] = alloca <vscale x 16 x float>, align 64
+; IE: [[VAL0:%.*]] = insertelement <vscale x 4 x float> poison, float %val, {{(i32|i64)}} 0
+; IE: [[VAL1:%.*]] = shufflevector <vscale x 4 x float> [[VAL0]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+; IE: store <vscale x 16 x float> {{.*}}, ptr [[ALLOC]], align 64
+; IE: [[IDX:%.*]] = sext i32 %idx to i64
+; IE: [[ADDR:%.*]] = getelementptr inbounds float, ptr [[ALLOC]], i64 [[IDX]]
+; IE: call void @__vecz_b_interleaved_store4_4_u5nxv4fu3ptr(<vscale x 4 x float> [[VAL1]], ptr nonnull [[ADDR]])
+; IE: = load <vscale x 16 x float>, ptr [[ALLOC]], align 64
+
+; Both the vector and index are uniform, so check we're not unnecessarily packetizing
+
+; IE-UNI-LABEL: @__vecz_nxv4_insert_element_uniform(
+; IE-UNI: {{%.*}} = insertelement <4 x float> %in, float %val, {{(i32|i64)}} %idx
+
+; IE-INDICES-LABEL: @__vecz_nxv4_insert_element_varying_indices(
+; IE-INDICES: [[ALLOC:%.*]] = alloca <vscale x 16 x float>, align 64
+; IE-INDICES: [[VAL:%.*]] = uitofp <vscale x 4 x i64> {{%.*}} to <vscale x 4 x float>
+; IE-INDICES: store <vscale x 16 x float> {{%.*}}, ptr [[ALLOC]], align 64
+; IE-INDICES: [[T1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; IE-INDICES: [[T2:%.*]] = shl <vscale x 4 x i32> [[T1]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> {{(undef|poison)}}, i32 2, {{(i32|i64)}} 0), <vscale x 4 x i32> {{(undef|poison)}}, <vscale x 4 x i32> zeroinitializer)
+
+; LLVM 16 deduces add/or equivalence and uses `or` instead.
+; IE-INDICES: [[T3:%.*]] = {{add|or}} <vscale x 4 x i32> [[T2]], {{%.*}}
+
+; IE-INDICES: [[T4:%.*]] = sext <vscale x 4 x i32> [[T3]] to <vscale x 4 x i64>
+; IE-INDICES: [[ADDR:%.*]] = getelementptr inbounds float, ptr %0, <vscale x 4 x i64> [[T4]]
+; IE-INDICES: call void @__vecz_b_scatter_store4_u5nxv4fu9nxv4u3ptr(<vscale x 4 x float> [[VAL]], <vscale x 4 x ptr> [[ADDR]])
+; IE-INDICES: = load <vscale x 16 x float>, ptr [[ALLOC]], align 64
+
+; Check we promote from i1 to i8 before doing our memops
+; IE-BOOL-LABEL: @__vecz_nxv4_insert_element_bool(
+; IE-BOOL: [[ALLOC:%.*]] = alloca <vscale x 16 x i8>, align 16
+; IE-BOOL-DAG: [[T0:%.*]] = sext <vscale x 4 x i1> {{%.*}} to <vscale x 4 x i8>
+; IE-BOOL-DAG: [[T1:%.*]] = sext <vscale x 16 x i1> {{%.*}} to <vscale x 16 x i8>
+; IE-BOOL: store <vscale x 16 x i8> [[T1]], ptr [[ALLOC]], align 16
+; IE-BOOL: call void @__vecz_b_scatter_store1_u5nxv4hu9nxv4u3ptr(<vscale x 4 x i8> [[T0]], <vscale x 4 x ptr> {{%.*}})
+; IE-BOOL: [[T2:%.*]] = load <vscale x 16 x i8>, ptr [[ALLOC]], align 16
+; IE-BOOL: [[T3:%.*]] = trunc <vscale x 16 x i8> [[T2]] to <vscale x 16 x i1>
+
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/interleaved_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/interleaved_load.ll
new file mode 100644
index 0000000000000..c9ebeb38f3ddc
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/interleaved_load.ll
@@ -0,0 +1,60 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; REQUIRES: llvm-13+
+; RUN: %veczc -k load_interleaved -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @load_interleaved(i32 addrspace(1)* nocapture readonly %input, i32 addrspace(1)* nocapture %output, i32 %stride) local_unnamed_addr {
+entry: ; copies one i32 at index (global id * %stride) and writes the constant 1 to the next two output slots
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %0 = trunc i64 %call to i32
+  %conv1 = mul i32 %0, %stride ; element index is the truncated global id times the %stride argument
+  %idxprom = sext i32 %conv1 to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %input, i64 %idxprom
+  %1 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %output, i64 %idxprom
+  store i32 %1, i32 addrspace(1)* %arrayidx3, align 4
+  %add = add nsw i32 %conv1, 1
+  %idxprom4 = sext i32 %add to i64
+  %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %output, i64 %idxprom4
+  store i32 1, i32 addrspace(1)* %arrayidx5, align 4 ; constant written at index + 1
+  %add6 = add nsw i32 %conv1, 2
+  %idxprom7 = sext i32 %add6 to i64
+  %arrayidx8 = getelementptr inbounds i32, i32 addrspace(1)* %output, i64 %idxprom7
+  store i32 1, i32 addrspace(1)* %arrayidx8, align 4 ; constant written at index + 2
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CHECK: define void @__vecz_b_interleaved_store4_V_u5nxv4ju3ptrU3AS1(<vscale x 4 x i32> [[ARG0:%.*]], ptr addrspace(1) [[ARG1:%.*]], i64 [[ARG2:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <vscale x 4 x ptr addrspace(1)> poison, ptr addrspace(1) [[ARG1]], {{i32|i64}} 0
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <vscale x 4 x ptr addrspace(1)> [[TMP0]], <vscale x 4 x ptr addrspace(1)> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[ARG2]], {{i32|i64}} 0
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <vscale x 4 x i64> [[TMP2]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; CHECK-NEXT: [[TMP5:%.*]] = mul <vscale x 4 x i64> [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, <vscale x 4 x ptr addrspace(1)> [[TMP1]], <vscale x 4 x i64> [[TMP5]]
+; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p1(<vscale x 4 x i32> [[ARG0]], <vscale x 4 x ptr addrspace(1)> [[TMP6]], i32 immarg 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, {{i32|i64}} 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)) #[[ATTRS:[0-9]+]]
+; CHECK-NEXT: ret void
+; CHECK-NEXT: }
+
+; CHECK: attributes #[[ATTRS]] = {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/intrinsics.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/intrinsics.ll
new file mode 100644
index 0000000000000..837f87b3eaf40
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/intrinsics.ll
@@ -0,0 +1,201 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; REQUIRES: llvm-13+
+; One veczc invocation per kernel; each output is matched against that kernel's own check prefix.
+; RUN: %veczc -k ctpop -vecz-scalable -vecz-simd-width=2 -S < %s | %filecheck %s --check-prefix CTPOP
+; RUN: %veczc -k ctlz -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s --check-prefix CTLZ
+; RUN: %veczc -k cttz -vecz-scalable -vecz-simd-width=8 -S < %s | %filecheck %s --check-prefix CTTZ
+; RUN: %veczc -k sadd_sat -vecz-scalable -vecz-simd-width=2 -S < %s | %filecheck %s --check-prefix SADD_SAT
+; RUN: %veczc -k uadd_sat -vecz-scalable -vecz-simd-width=2 -S < %s | %filecheck %s --check-prefix UADD_SAT
+; RUN: %veczc -k ssub_sat -vecz-scalable -vecz-simd-width=2 -S < %s | %filecheck %s --check-prefix SSUB_SAT
+; RUN: %veczc -k usub_sat -vecz-scalable -vecz-simd-width=2 -S < %s | %filecheck %s --check-prefix USUB_SAT
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @ctpop(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
+entry: ; applies llvm.ctpop to one i32 and one <2 x i8> loaded at the global id
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+  %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx
+  %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
+  %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx
+  %a = load i32, i32* %arrayidxa, align 4
+  %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2
+  %ctpopi32 = call i32 @llvm.ctpop.i32(i32 %a)
+  %ctpopv2i8 = call <2 x i8> @llvm.ctpop.v2i8(<2 x i8> %b)
+  store i32 %ctpopi32, i32* %arrayidxy, align 4 ; scalar result goes to %yptr
+  store <2 x i8> %ctpopv2i8, <2 x i8>* %arrayidxz, align 2 ; vector result goes to %zptr
+  ret void
+}
+
+define spir_kernel void @ctlz(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
+entry: ; applies llvm.ctlz to one i32 and one <2 x i8> loaded at the global id
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+  %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx
+  %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
+  %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx
+  %a = load i32, i32* %arrayidxa, align 4
+  %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2
+  %ctlzi32 = call i32 @llvm.ctlz.i32(i32 %a, i1 false) ; i1 false means a zero input gives a defined result (see LangRef)
+  %ctlzv2i8 = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> %b, i1 false)
+  store i32 %ctlzi32, i32* %arrayidxy, align 4 ; scalar result goes to %yptr
+  store <2 x i8> %ctlzv2i8, <2 x i8>* %arrayidxz, align 2 ; vector result goes to %zptr
+  ret void
+}
+
+define spir_kernel void @cttz(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
+entry: ; applies llvm.cttz to one i32 and one <2 x i8> loaded at the global id
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+  %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx
+  %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
+  %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx
+  %a = load i32, i32* %arrayidxa, align 4
+  %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2
+  %cttzi32 = call i32 @llvm.cttz.i32(i32 %a, i1 false) ; i1 false means a zero input gives a defined result (see LangRef)
+  %cttzv2i8 = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> %b, i1 false)
+  store i32 %cttzi32, i32* %arrayidxy, align 4 ; scalar result goes to %yptr
+  store <2 x i8> %cttzv2i8, <2 x i8>* %arrayidxz, align 2 ; vector result goes to %zptr
+  ret void
+}
+
+define spir_kernel void @sadd_sat(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
+entry: ; applies llvm.sadd.sat to an i32 pair and a <2 x i8> pair loaded at the global id
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+  %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
+  %a = load i32, i32* %arrayidxa, align 4
+  %y = load i32, i32* %arrayidxy, align 4 ; %yptr supplies the second operand and later receives the result
+  %v_i32 = call i32 @llvm.sadd.sat.i32(i32 %a, i32 %y)
+  %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx
+  %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx
+  %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2
+  %z = load <2 x i8>, <2 x i8>* %arrayidxz, align 2 ; %zptr supplies the second operand and later receives the result
+  %v_v2i8 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> %b, <2 x i8> %z)
+  store i32 %v_i32, i32* %arrayidxy, align 4
+  store <2 x i8> %v_v2i8, <2 x i8>* %arrayidxz, align 2
+  ret void
+}
+
+define spir_kernel void @uadd_sat(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
+entry: ; applies llvm.uadd.sat to an i32 pair and a <2 x i8> pair loaded at the global id
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+  %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
+  %a = load i32, i32* %arrayidxa, align 4
+  %y = load i32, i32* %arrayidxy, align 4 ; %yptr supplies the second operand and later receives the result
+  %v_i32 = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %y)
+  %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx
+  %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx
+  %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2
+  %z = load <2 x i8>, <2 x i8>* %arrayidxz, align 2 ; %zptr supplies the second operand and later receives the result
+  %v_v2i8 = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> %b, <2 x i8> %z)
+  store i32 %v_i32, i32* %arrayidxy, align 4
+  store <2 x i8> %v_v2i8, <2 x i8>* %arrayidxz, align 2
+  ret void
+}
+
+define spir_kernel void @ssub_sat(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
+entry: ; applies llvm.ssub.sat to an i32 pair and a <2 x i8> pair loaded at the global id
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+  %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
+  %a = load i32, i32* %arrayidxa, align 4
+  %y = load i32, i32* %arrayidxy, align 4 ; %yptr supplies the second operand and later receives the result
+  %v_i32 = call i32 @llvm.ssub.sat.i32(i32 %a, i32 %y)
+  %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx
+  %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx
+  %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2
+  %z = load <2 x i8>, <2 x i8>* %arrayidxz, align 2 ; %zptr supplies the second operand and later receives the result
+  %v_v2i8 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> %b, <2 x i8> %z)
+  store i32 %v_i32, i32* %arrayidxy, align 4
+  store <2 x i8> %v_v2i8, <2 x i8>* %arrayidxz, align 2
+  ret void
+}
+
+define spir_kernel void @usub_sat(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
+entry: ; applies llvm.usub.sat to an i32 pair and a <2 x i8> pair loaded at the global id
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+  %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
+  %a = load i32, i32* %arrayidxa, align 4
+  %y = load i32, i32* %arrayidxy, align 4 ; %yptr supplies the second operand and later receives the result
+  %v_i32 = call i32 @llvm.usub.sat.i32(i32 %a, i32 %y)
+  %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx
+  %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx
+  %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2
+  %z = load <2 x i8>, <2 x i8>* %arrayidxz, align 2 ; %zptr supplies the second operand and later receives the result
+  %v_v2i8 = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> %b, <2 x i8> %z)
+  store i32 %v_i32, i32* %arrayidxy, align 4
+  store <2 x i8> %v_v2i8, <2 x i8>* %arrayidxz, align 2
+  ret void
+}
+
+declare i32 @llvm.ctpop.i32(i32)
+declare <2 x i8> @llvm.ctpop.v2i8(<2 x i8>)
+
+declare i32 @llvm.ctlz.i32(i32, i1)
+declare <2 x i8> @llvm.ctlz.v2i8(<2 x i8>, i1)
+
+declare i32 @llvm.cttz.i32(i32, i1)
+declare <2 x i8> @llvm.cttz.v2i8(<2 x i8>, i1)
+
+declare i32 @llvm.sadd.sat.i32(i32, i32)
+declare <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8>, <2 x i8>)
+
+declare i32 @llvm.uadd.sat.i32(i32, i32)
+declare <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8>, <2 x i8>)
+
+declare i32 @llvm.ssub.sat.i32(i32, i32)
+declare <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8>, <2 x i8>)
+
+declare i32 @llvm.usub.sat.i32(i32, i32)
+declare <2 x i8> @llvm.usub.sat.v2i8(<2 x i8>, <2 x i8>)
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CTPOP: void @__vecz_nxv2_ctpop
+; CTPOP: = call <vscale x 2 x i32> @llvm.ctpop.nxv2i32(<vscale x 2 x i32> %{{.*}})
+; CTPOP: = call <vscale x 4 x i8> @llvm.ctpop.nxv4i8(<vscale x 4 x i8> %{{.*}})
+
+; CTLZ: void @__vecz_nxv4_ctlz
+; ctlz is widened for both the i32 and the <2 x i8> operands
+; CTLZ: = call <vscale x 4 x i32> @llvm.ctlz.nxv4i32(<vscale x 4 x i32> %{{.*}}, i1 false)
+; CTLZ: = call <vscale x 8 x i8> @llvm.ctlz.nxv8i8(<vscale x 8 x i8> %{{.*}}, i1 false)
+
+; CTTZ: void @__vecz_nxv8_cttz
+; cttz is likewise widened for both the i32 and the <2 x i8> operands
+; CTTZ: = call <vscale x 8 x i32> @llvm.cttz.nxv8i32(<vscale x 8 x i32> %{{.*}}, i1 false)
+; CTTZ: = call <vscale x 16 x i8> @llvm.cttz.nxv16i8(<vscale x 16 x i8> %{{.*}}, i1 false)
+
+; SADD_SAT: void @__vecz_nxv2_sadd_sat
+; SADD_SAT: = call <vscale x 2 x i32> @llvm.sadd.sat.nxv2i32(
+; SADD_SAT: = call <vscale x 4 x i8> @llvm.sadd.sat.nxv4i8(
+
+; UADD_SAT: void @__vecz_nxv2_uadd_sat
+; UADD_SAT: = call <vscale x 2 x i32> @llvm.uadd.sat.nxv2i32(
+; UADD_SAT: = call <vscale x 4 x i8> @llvm.uadd.sat.nxv4i8(
+
+; SSUB_SAT: void @__vecz_nxv2_ssub_sat
+; SSUB_SAT: = call <vscale x 2 x i32> @llvm.ssub.sat.nxv2i32(
+; SSUB_SAT: = call <vscale x 4 x i8> @llvm.ssub.sat.nxv4i8(
+
+; USUB_SAT: void @__vecz_nxv2_usub_sat
+; USUB_SAT: = call <vscale x 2 x i32> @llvm.usub.sat.nxv2i32(
+; USUB_SAT: = call <vscale x 4 x i8> @llvm.usub.sat.nxv4i8(
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/lit.local.cfg b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/lit.local.cfg
new file mode 100644
index 0000000000000..d04f11fb7d98a
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/lit.local.cfg
@@ -0,0 +1,18 @@
+# Copyright (C) Codeplay Software Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+# Exceptions; you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# Scalable vectorization is only supported on LLVM 12+, so the tests governed by this lit.local.cfg are marked unsupported on older LLVM versions.
+config.unsupported = config.llvm_version_major < 12
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/load_add_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/load_add_store.ll
new file mode 100644
index 0000000000000..dfbe70086294a
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/load_add_store.ll
@@ -0,0 +1,40 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k load_add_store -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @load_add_store(i32* %aptr, i32* %bptr, i32* %zptr) { ; per work-item: zptr[gid] = aptr[gid] + bptr[gid]
+entry:
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0); indexes all three buffers
+  %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+  %arrayidxb = getelementptr inbounds i32, i32* %bptr, i64 %idx
+  %arrayidxz = getelementptr inbounds i32, i32* %zptr, i64 %idx
+  %a = load i32, i32* %arrayidxa, align 4
+  %b = load i32, i32* %arrayidxb, align 4
+  %sum = add i32 %a, %b ; the checks after this kernel expect this as an add on <vscale x 4 x i32>
+  store i32 %sum, i32* %arrayidxz, align 4
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_nxv4_load_add_store
+; CHECK: [[lhs:%[0-9a-z]+]] = load <vscale x 4 x i32>, ptr
+; CHECK: [[rhs:%[0-9a-z]+]] = load <vscale x 4 x i32>, ptr
+; CHECK: [[sum:%[0-9a-z]+]] = add <vscale x 4 x i32> [[lhs]], [[rhs]]
+; CHECK: store <vscale x 4 x i32> [[sum]],
+declare spir_func i64 @_Z13get_global_idj(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/load_binops_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/load_binops_store.ll
new file mode 100644
index 0000000000000..403eda139fcc0
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/load_binops_store.ll
@@ -0,0 +1,52 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k load_binops_store -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Three loads indexed by the global id feed a chain of add, mul, ashr and udiv
+; whose result is stored back at the same index. Each load, each binary
+; operation and the store is expected to be widened to <vscale x 4 x i32>.
+define spir_kernel void @load_binops_store(i32* %aptr, i32* %bptr, i32* %cptr, i32* %zptr) {
+entry:
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+  %arrayidxb = getelementptr inbounds i32, i32* %bptr, i64 %idx
+  %arrayidxc = getelementptr inbounds i32, i32* %cptr, i64 %idx
+  %arrayidxz = getelementptr inbounds i32, i32* %zptr, i64 %idx
+  %a = load i32, i32* %arrayidxa, align 4
+  %b = load i32, i32* %arrayidxb, align 4
+  %c = load i32, i32* %arrayidxc, align 4
+  %sum = add i32 %a, %b
+  %mpy = mul i32 %sum, %c
+  %shf = ashr i32 %mpy, 3
+  %dvu = udiv i32 %shf, %sum
+  store i32 %dvu, i32* %arrayidxz, align 4
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_nxv4_load_binops_store
+; CHECK: load <vscale x 4 x i32>, ptr
+; CHECK: load <vscale x 4 x i32>, ptr
+; CHECK: load <vscale x 4 x i32>, ptr
+; CHECK: add <vscale x 4 x i32>
+; CHECK: mul <vscale x 4 x i32>
+; CHECK: ashr <vscale x 4 x i32>
+; CHECK: udiv <vscale x 4 x i32>
+; CHECK: store <vscale x 4 x i32>
+declare spir_func i64 @_Z13get_global_idj(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/metadata.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/metadata.ll
new file mode 100644
index 0000000000000..8a6c8a1ccc3cd
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/metadata.ll
@@ -0,0 +1,41 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test -vecz-scalable -vecz-simd-width=8 -S < %s | %filecheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
+target triple = "spir-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+define spir_kernel void @test(i32 addrspace(1)* %in) { ; small kernel; the checks after it match only the function signatures, attributes and vecz metadata
+entry:
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %load = load i32, i32 addrspace(1)* %in ; reads through the kernel argument itself, so the address does not depend on the global id
+  %slot = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idx
+  store i32 %load, i32 addrspace(1)* %slot ; writes the loaded value to in[gid]
+  ret void
+}
+
+; CHECK: define spir_kernel void @test(ptr addrspace(1) %in) !codeplay_ca_vecz.base !0
+; CHECK: define spir_kernel void @__vecz_nxv8_test(ptr addrspace(1) %in) #0 !codeplay_ca_vecz.derived !2
+
+; CHECK: attributes #0 = { "mux-base-fn-name"="__vecz_nxv8_test" }
+
+; CHECK: !0 = !{!1, ptr @__vecz_nxv8_test}
+
+; CHECK: !1 = !{i32 8, i32 1, i32 0, i32 0}
+; CHECK: !2 = !{!1, ptr @test}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/packetize_mask_varying.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/packetize_mask_varying.ll
new file mode 100644
index 0000000000000..48f01f28b3743
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/packetize_mask_varying.ll
@@ -0,0 +1,56 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; REQUIRES: llvm-13+
+; RUN: %veczc -k mask_varying -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; A kernel which should produce a uniform masked vector load where the mask is
+; a single varying splatted bit.
+define spir_kernel void @mask_varying(<4 x i32>* %aptr, <4 x i32>* %zptr) { ; copies *aptr to zptr[gid], but only for work-items with gid < 64
+entry:
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %mod_idx = urem i64 %idx, 2 ; not referenced again in this kernel
+  %arrayidxa = getelementptr inbounds <4 x i32>, <4 x i32>* %aptr, i64 %idx ; not referenced again; the load below reads %aptr directly
+  %ins = insertelement <4 x i1> undef, i1 true, i32 0 ; not referenced again in this kernel
+  %cmp = icmp slt i64 %idx, 64 ; derived from the global id, so the condition may differ between work-items
+  br i1 %cmp, label %if.then, label %if.end
+if.then: ; reached only when %cmp is true
+  %v = load <4 x i32>, <4 x i32>* %aptr ; address is the kernel argument, independent of the global id
+  %arrayidxz = getelementptr inbounds <4 x i32>, <4 x i32>* %zptr, i64 %idx
+  store <4 x i32> %v, <4 x i32>* %arrayidxz, align 16
+  br label %if.end
+if.end:
+  ret void
+; CHECK: define spir_kernel void @__vecz_nxv4_mask_varying
+; CHECK: [[idx0:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
+; CHECK: [[idx1:%.*]] = lshr <vscale x 16 x i32> [[idx0]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 2, {{(i32|i64)}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
+
+; Note that since we just did a lshr 2 on the input of the extend, it doesn't
+; make any difference whether it's a zext or sext, but LLVM 16 prefers zext.
+; CHECK: [[idx2:%.*]] = {{s|z}}ext <vscale x 16 x i32> [[idx1]] to <vscale x 16 x i64>
+
+; CHECK: [[t1:%.*]] = getelementptr inbounds i8, ptr {{.*}}, <vscale x 16 x i64> [[idx2]]
+; CHECK: [[t2:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[t1]],
+; CHECK: [[splat:%.*]] = trunc <vscale x 16 x i8> [[t2]] to <vscale x 16 x i1>
+; CHECK: call void @__vecz_b_masked_store16_u6nxv16ju3ptru6nxv16b(<vscale x 16 x i32> {{.*}}, ptr %arrayidxz, <vscale x 16 x i1> [[splat]])
+
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare <4 x i32> @__vecz_b_masked_load4_Dv4_jPDv4_jDv4_b(<4 x i32>*, <4 x i1>)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/scalable_auto.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/scalable_auto.ll
new file mode 100644
index 0000000000000..dc14d1322eb4b
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/scalable_auto.ll
@@ -0,0 +1,37 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k cast -vecz-scalable -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @cast(i32* %aptr, float* %zptr) { ; per work-item: zptr[gid] = (float)aptr[gid]
+entry:
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0); indexes both buffers
+  %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+  %arrayidxz = getelementptr inbounds float, float* %zptr, i64 %idx
+  %a = load i32, i32* %arrayidxa, align 4
+  %c = sitofp i32 %a to float ; the conversion the checks after this kernel expect in scalable-vector form
+  store float %c, float* %arrayidxz, align 4
+  ret void
+}
+
+; Check that passing -vecz-scalable with no width automatically chooses an
+; appropriate scalable vectorization factor.
+; CHECK: define spir_kernel void @__vecz_nxv[[VF:[0-9]+]]_cast
+; CHECK: sitofp <vscale x [[VF]] x i32> {{%[0-9]+}} to <vscale x [[VF]] x float>
+declare spir_func i64 @_Z13get_global_idj(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select.ll
new file mode 100644
index 0000000000000..bf64dd619998f
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select.ll
@@ -0,0 +1,67 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @select_scalar_scalar(i32* %aptr, i32* %bptr, i32* %zptr) { ; per work-item: zptr[gid] = (a < b) ? b : 4, scalar condition and scalar operands
+entry:
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+  %arrayidxb = getelementptr inbounds i32, i32* %bptr, i64 %idx
+  %arrayidxz = getelementptr inbounds i32, i32* %zptr, i64 %idx
+  %a = load i32, i32* %arrayidxa, align 4
+  %b = load i32, i32* %arrayidxb, align 4
+  %cmp = icmp slt i32 %a, %b
+  %sel = select i1 %cmp, i32 %b, i32 4 ; the false operand is the constant 4, matched as a splat constant in the checks
+  store i32 %sel, i32* %arrayidxz, align 4
+  ret void
+}
+
+define spir_kernel void @select_vector_vector(<2 x i32>* %aptr, <2 x i32>* %bptr, <2 x i32>* %cptr, <2 x i32>* %zptr) { ; per work-item: zptr[gid] = (a < b) ? c : <4, 4>, element-wise on <2 x i32>
+entry:
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidxa = getelementptr inbounds <2 x i32>, <2 x i32>* %aptr, i64 %idx
+  %arrayidxb = getelementptr inbounds <2 x i32>, <2 x i32>* %bptr, i64 %idx
+  %arrayidxc = getelementptr inbounds <2 x i32>, <2 x i32>* %cptr, i64 %idx
+  %arrayidxz = getelementptr inbounds <2 x i32>, <2 x i32>* %zptr, i64 %idx
+  %a = load <2 x i32>, <2 x i32>* %arrayidxa, align 4
+  %b = load <2 x i32>, <2 x i32>* %arrayidxb, align 4
+  %c = load <2 x i32>, <2 x i32>* %arrayidxc, align 4
+  %cmp = icmp slt <2 x i32> %a, %b ; vector compare yielding a <2 x i1> condition
+  %sel = select <2 x i1> %cmp, <2 x i32> %c, <2 x i32> <i32 4, i32 4> ; vector condition selecting between vector operands
+  store <2 x i32> %sel, <2 x i32>* %arrayidxz, align 4
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CHECK: define spir_kernel void @__vecz_nxv4_select_scalar_scalar
+; CHECK: [[lhs:%[0-9a-z]+]] = load <vscale x 4 x i32>, ptr
+; CHECK: [[rhs:%[0-9a-z]+]] = load <vscale x 4 x i32>, ptr
+; CHECK: [[cmp:%[0-9a-z]+]] = icmp slt <vscale x 4 x i32> [[lhs]], [[rhs]]
+; CHECK: [[sel:%[0-9a-z]+]] = select <vscale x 4 x i1> [[cmp]], <vscale x 4 x i32> [[rhs]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> {{(undef|poison)}}, i32 4, {{(i32|i64)}} 0), <vscale x 4 x i32> {{(undef|poison)}}, <vscale x 4 x i32> zeroinitializer)
+; CHECK: store <vscale x 4 x i32> [[sel]],
+
+; CHECK: define spir_kernel void @__vecz_nxv4_select_vector_vector
+; CHECK: [[x:%[0-9a-z]+]] = load <vscale x 8 x i32>, ptr
+; CHECK: [[y:%[0-9a-z]+]] = load <vscale x 8 x i32>, ptr
+; CHECK: [[z:%[0-9a-z]+]] = load <vscale x 8 x i32>, ptr
+; CHECK: [[cmp:%[0-9a-z]+]] = icmp slt <vscale x 8 x i32> [[x]], [[y]]
+; CHECK: [[sel:%[0-9a-z]+]] = select <vscale x 8 x i1> [[cmp]], <vscale x 8 x i32> [[z]], <vscale x 8 x i32> shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> {{(undef|poison)}}, i32 4, {{(i32|i64)}} 0), <vscale x 8 x i32> {{(undef|poison)}}, <vscale x 8 x i32> zeroinitializer)
+; CHECK: store <vscale x 8 x i32> [[sel]],
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select_scalar_vector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select_scalar_vector.ll
new file mode 100644
index 0000000000000..1f8fa02ef7860
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select_scalar_vector.ll
@@ -0,0 +1,57 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; REQUIRES: llvm-13+
+; RUN: %veczc -k select_scalar_vector -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+define spir_kernel void @select_scalar_vector(i32* %aptr, i32* %bptr, <2 x i32>* %cptr, <2 x i32>* %zptr) { ; per work-item: zptr[gid] = (a < b) ? c : <4, 4>, with a scalar condition and <2 x i32> operands
+entry:
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+  %arrayidxb = getelementptr inbounds i32, i32* %bptr, i64 %idx
+  %arrayidxc = getelementptr inbounds <2 x i32>, <2 x i32>* %cptr, i64 %idx
+  %arrayidxz = getelementptr inbounds <2 x i32>, <2 x i32>* %zptr, i64 %idx
+  %a = load i32, i32* %arrayidxa, align 4
+  %b = load i32, i32* %arrayidxb, align 4
+  %c = load <2 x i32>, <2 x i32>* %arrayidxc, align 4
+  %cmp = icmp slt i32 %a, %b ; a single i1 per work-item
+  %sel = select i1 %cmp, <2 x i32> %c, <2 x i32> <i32 4, i32 4> ; one i1 chooses between whole <2 x i32> values
+  store <2 x i32> %sel, <2 x i32>* %arrayidxz, align 4
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_nxv4_select_scalar_vector
+; CHECK: [[rhs:%.*]] = load <vscale x 8 x i32>, ptr
+; CHECK: [[cmp1:%.*]] = icmp slt <vscale x 4 x i32>
+; CHECK: [[sext:%.*]] = sext <vscale x 4 x i1> [[cmp1]] to <vscale x 4 x i8>
+; CHECK: store <vscale x 4 x i8> [[sext]], ptr [[alloc:%.*]], align 4
+; CHECK: [[idx0:%.*]] = call <vscale x 8 x i32> @llvm.experimental.stepvector.nxv8i32()
+; CHECK: [[idx1:%.*]] = lshr <vscale x 8 x i32> [[idx0]], shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> {{(undef|poison)}}, i32 1, {{(i32|i64)}} 0), <vscale x 8 x i32> {{(undef|poison)}}, <vscale x 8 x i32> zeroinitializer)
+
+; Note that since we just did a lshr 1 on the input of the extend, it doesn't
+; make any difference whether it's a zext or sext, but LLVM 16 prefers zext.
+; CHECK: [[sext2:%.*]] = {{s|z}}ext <vscale x 8 x i32> [[idx1]] to <vscale x 8 x i64>
+
+; CHECK: [[addrs:%.*]] = getelementptr inbounds i8, ptr [[alloc]], <vscale x 8 x i64> [[sext2]]
+; CHECK: [[gather:%.*]] = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0(<vscale x 8 x ptr> [[addrs]],
+; CHECK: [[cmp:%.*]] = trunc <vscale x 8 x i8> [[gather]] to <vscale x 8 x i1>
+; CHECK: [[sel:%.*]] = select <vscale x 8 x i1> [[cmp]], <vscale x 8 x i32> [[rhs]], <vscale x 8 x i32> shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> {{(undef|poison)}}, i32 4, {{(i32|i64)}} 0), <vscale x 8 x i32> {{(undef|poison)}}, <vscale x 8 x i32> zeroinitializer)
+; CHECK: store <vscale x 8 x i32> [[sel]],
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/shuffle.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/shuffle.ll
new file mode 100644
index 0000000000000..3c9b9a952d298
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/shuffle.ll
@@ -0,0 +1,62 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; REQUIRES: llvm-13+
+; RUN: %veczc -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @do_shuffle_splat(i32* %aptr, <4 x i32>* %bptr, <4 x i32>* %zptr) { ; per work-item: zptr[gid] = splat of aptr[gid] across <4 x i32>
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+  %arrayidxb = getelementptr inbounds <4 x i32>, <4 x i32>* %bptr, i64 %idx
+  %a = load i32, i32* %arrayidxa, align 4 ; loaded at an index derived from the global id
+  %b = load <4 x i32>, <4 x i32>* %arrayidxb, align 16 ; not referenced again in this kernel
+  %insert = insertelement <4 x i32> undef, i32 %a, i32 0
+  %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer ; all-zero mask copies element 0 into every lane
+  %arrayidxz = getelementptr inbounds <4 x i32>, <4 x i32>* %zptr, i64 %idx
+  store <4 x i32> %splat, <4 x i32>* %arrayidxz
+  ret void
+; CHECK: define spir_kernel void @__vecz_nxv4_do_shuffle_splat
+; CHECK: [[idx0:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
+; CHECK: [[idx1:%.*]] = lshr <vscale x 16 x i32> [[idx0]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 2, {{(i32|i64)}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
+
+; Note that since we just did a lshr 2 on the input of the extend, it doesn't
+; make any difference whether it's a zext or sext, but LLVM 16 prefers zext.
+; CHECK: [[idx2:%.*]] = {{s|z}}ext <vscale x 16 x i32> [[idx1]] to <vscale x 16 x i64>
+
+; CHECK: [[alloc:%.*]] = getelementptr inbounds i32, ptr %{{.*}}, <vscale x 16 x i64> [[idx2]]
+; CHECK: [[splat:%.*]] = call <vscale x 16 x i32> @llvm.masked.gather.nxv16i32.nxv16p0(<vscale x 16 x ptr> [[alloc]],
+; CHECK: store <vscale x 16 x i32> [[splat]], ptr
+}
+
+define spir_kernel void @do_shuffle_splat_uniform(i32 %a, <4 x i32>* %bptr, <4 x i32>* %zptr) { ; per work-item: zptr[gid] = splat of the kernel argument %a across <4 x i32>
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidxb = getelementptr inbounds <4 x i32>, <4 x i32>* %bptr, i64 %idx
+  %b = load <4 x i32>, <4 x i32>* %arrayidxb, align 16 ; not referenced again in this kernel
+  %insert = insertelement <4 x i32> undef, i32 %a, i32 0 ; %a is a kernel argument rather than a value loaded per work-item
+  %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer ; all-zero mask copies element 0 into every lane
+  %arrayidxz = getelementptr inbounds <4 x i32>, <4 x i32>* %zptr, i64 %idx
+  store <4 x i32> %splat, <4 x i32>* %arrayidxz
+  ret void
+; CHECK: define spir_kernel void @__vecz_nxv4_do_shuffle_splat_uniform
+; CHECK: [[ins:%.*]] = insertelement <vscale x 16 x i32> poison, i32 %a, {{(i32|i64)}} 0
+; CHECK: [[splat:%.*]] = shufflevector <vscale x 16 x i32> [[ins]], <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer
+; CHECK: store <vscale x 16 x i32> [[splat]], ptr
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll
new file mode 100644
index 0000000000000..93c216ef61c99
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll
@@ -0,0 +1,69 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; REQUIRES: llvm-13+
+; RUN: %veczc -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare spir_func i32 @_Z16get_sub_group_idv()
+declare spir_func i32 @_Z18get_sub_group_sizev()
+declare spir_func i32 @_Z22get_sub_group_local_idv()
+declare spir_func i32 @_Z19sub_group_broadcastij(i32, i32)
+
+define spir_kernel void @get_sub_group_size(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[get_sub_group_id()] = get_sub_group_size(); %in is not used
+  %call.i = tail call spir_func i32 @_Z16get_sub_group_idv()
+  %conv = zext i32 %call.i to i64
+  %call2 = tail call spir_func i32 @_Z18get_sub_group_sizev() ; the checks below expect this to become vscale shifted left by 2
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv
+  store i32 %call2, i32 addrspace(1)* %arrayidx, align 4
+  ret void
+; CHECK-LABEL: define spir_kernel void @__vecz_nxv4_get_sub_group_size(
+; CHECK: [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK: [[W:%.*]] = shl i32 [[VSCALE]], 2
+; CHECK: store i32 [[W]], ptr addrspace(1) {{.*}}
+}
+
+define spir_kernel void @get_sub_group_local_id(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[id] = id, where id = get_sub_group_local_id(); %in is not used
+  %call = tail call spir_func i32 @_Z22get_sub_group_local_idv() ; the checks below expect this to become a stepvector
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %call
+  store i32 %call, i32 addrspace(1)* %arrayidx, align 4
+  ret void
+; CHECK-LABEL: define spir_kernel void @__vecz_nxv4_get_sub_group_local_id(
+; CHECK: [[LID:%.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; CHECK: store <vscale x 4 x i32> [[LID]], ptr addrspace(1) %out
+}
+
+define spir_kernel void @sub_group_broadcast(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[id] = sub_group_broadcast(in[id], 0), where id = get_sub_group_local_id()
+  %call = tail call spir_func i32 @_Z22get_sub_group_local_idv()
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %call
+  %v = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %broadcast = call spir_func i32 @_Z19sub_group_broadcastij(i32 %v, i32 0) ; second argument 0; the checks below expect an extract of element 0 followed by a splat
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %call
+  store i32 %broadcast, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+; CHECK-LABEL: define spir_kernel void @__vecz_nxv4_sub_group_broadcast(
+; CHECK: [[LD:%.*]] = load <vscale x 4 x i32>, ptr addrspace(1) {{%.*}}, align 4
+; CHECK: [[EXT:%.*]] = extractelement <vscale x 4 x i32> [[LD]], {{(i32|i64)}} 0
+; CHECK: [[INS:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[EXT]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[INS]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK: store <vscale x 4 x i32> [[SPLAT]], ptr addrspace(1)
+}
+
+!opencl.ocl.version = !{!0}
+
+!0 = !{i32 3, i32 0}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans.ll
new file mode 100644
index 0000000000000..885253dd2a0e3
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans.ll
@@ -0,0 +1,154 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -vecz-scalable -vecz-simd-width=4 -S -vecz-passes=packetizer < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+declare spir_func i32 @_Z28sub_group_scan_inclusive_addi(i32)
+declare spir_func i64 @_Z28sub_group_scan_inclusive_addl(i64)
+declare spir_func float @_Z28sub_group_scan_inclusive_addf(float)
+
+declare spir_func i32 @_Z28sub_group_scan_inclusive_mini(i32)
+declare spir_func i32 @_Z28sub_group_scan_inclusive_minj(i32)
+declare spir_func i32 @_Z28sub_group_scan_inclusive_maxi(i32)
+declare spir_func i32 @_Z28sub_group_scan_inclusive_maxj(i32)
+declare spir_func float @_Z28sub_group_scan_inclusive_minf(float)
+declare spir_func float @_Z28sub_group_scan_inclusive_maxf(float)
+
+define spir_kernel void @reduce_scan_incl_add_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry: ; out[gid] = sub-group inclusive add scan of in[gid] (i32)
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_addi(i32 %0) ; scalar scan builtin; the directives below expect a __vecz_b_ vector call in its place
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
+  store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_add_i32(
+; CHECK: call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_add_u5nxv4j(<vscale x 4 x i32> %{{.*}})
+}
+
+define spir_kernel void @reduce_scan_incl_add_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry: ; out[gid] = sub-group inclusive add scan of in[gid] (i64)
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %arrayidx = getelementptr inbounds i64, ptr addrspace(1) %in, i64 %call
+  %0 = load i64, ptr addrspace(1) %arrayidx, align 4 ; NOTE(review): align 4 on an i64 access is legal but under-aligned - possibly copied from the i32 variant; confirm
+  %call1 = tail call spir_func i64 @_Z28sub_group_scan_inclusive_addl(i64 %0) ; scalar scan builtin; the directives below expect a __vecz_b_ vector call in its place
+  %arrayidx2 = getelementptr inbounds i64, ptr addrspace(1) %out, i64 %call
+  store i64 %call1, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_add_i64(
+; CHECK: call <vscale x 4 x i64> @__vecz_b_sub_group_scan_inclusive_add_u5nxv4m(<vscale x 4 x i64> %{{.*}})
+}
+
+define spir_kernel void @reduce_scan_incl_add_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry: ; out[gid] = sub-group inclusive add scan of in[gid] (float)
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %arrayidx = getelementptr inbounds float, ptr addrspace(1) %in, i64 %call
+  %0 = load float, ptr addrspace(1) %arrayidx, align 4
+  %call1 = tail call spir_func float @_Z28sub_group_scan_inclusive_addf(float %0) ; scalar scan builtin; the directives below expect a __vecz_b_ vector call in its place
+  %arrayidx2 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %call
+  store float %call1, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_add_f32(
+; CHECK: call <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_add_u5nxv4f(<vscale x 4 x float> %{{.*}})
+}
+
+define spir_kernel void @reduce_scan_incl_smin_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry: ; out[gid] = sub-group inclusive min scan of in[gid], signed overload (mangled 'i')
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_mini(i32 %0) ; the directives below expect the smin vector builtin in its place
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
+  store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_smin_i32(
+; CHECK: call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_u5nxv4i(<vscale x 4 x i32> %{{.*}})
+}
+
+define spir_kernel void @reduce_scan_incl_umin_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry: ; out[gid] = sub-group inclusive min scan of in[gid], unsigned overload (mangled 'j')
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_minj(i32 %0) ; the directives below expect the umin vector builtin in its place
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
+  store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_umin_i32(
+; CHECK: call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umin_u5nxv4j(<vscale x 4 x i32> %{{.*}})
+}
+
+define spir_kernel void @reduce_scan_incl_smax_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry: ; out[gid] = sub-group inclusive max scan of in[gid], signed overload (mangled 'i')
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_maxi(i32 %0) ; the directives below expect the smax vector builtin in its place
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
+  store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_smax_i32(
+; CHECK: call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smax_u5nxv4i(<vscale x 4 x i32> %{{.*}})
+}
+
+define spir_kernel void @reduce_scan_incl_umax_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry: ; out[gid] = sub-group inclusive max scan of in[gid], unsigned overload (mangled 'j')
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_maxj(i32 %0) ; the directives below expect the umax vector builtin in its place
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
+  store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_umax_i32(
+; CHECK: call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umax_u5nxv4j(<vscale x 4 x i32> %{{.*}})
+}
+
+define spir_kernel void @reduce_scan_incl_fmin_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry: ; out[gid] = sub-group inclusive min scan of in[gid] (float)
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %arrayidx = getelementptr inbounds float, ptr addrspace(1) %in, i64 %call
+  %0 = load float, ptr addrspace(1) %arrayidx, align 4
+  %call1 = tail call spir_func float @_Z28sub_group_scan_inclusive_minf(float %0) ; the directives below expect the min vector builtin in its place
+  %arrayidx2 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %call
+  store float %call1, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_fmin_f32(
+; CHECK: call <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_min_u5nxv4f(<vscale x 4 x float> %{{.*}})
+}
+
+define spir_kernel void @reduce_scan_incl_fmax_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry: ; out[gid] = sub-group inclusive max scan of in[gid] (float)
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %arrayidx = getelementptr inbounds float, ptr addrspace(1) %in, i64 %call
+  %0 = load float, ptr addrspace(1) %arrayidx, align 4
+  %call1 = tail call spir_func float @_Z28sub_group_scan_inclusive_maxf(float %0) ; the directives below expect the max vector builtin in its place
+  %arrayidx2 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %call
+  store float %call1, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_fmax_f32(
+; CHECK: call <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_max_u5nxv4f(<vscale x 4 x float> %{{.*}})
+}
+
+!opencl.ocl.version = !{!0}
+
+!0 = !{i32 3, i32 0}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_spv_khr_uniform_group_instructions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_spv_khr_uniform_group_instructions.ll
new file mode 100644
index 0000000000000..e9caa97be59e3
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_spv_khr_uniform_group_instructions.ll
@@ -0,0 +1,175 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -vecz-scalable -w 4 -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+declare spir_func i32 @_Z28sub_group_scan_inclusive_muli(i32)
+declare spir_func float @_Z28sub_group_scan_inclusive_mulf(float)
+
+declare spir_func i32 @_Z28sub_group_scan_exclusive_muli(i32)
+declare spir_func float @_Z28sub_group_scan_exclusive_mulf(float)
+
+declare spir_func i32 @_Z28sub_group_scan_inclusive_andi(i32)
+declare spir_func i32 @_Z27sub_group_scan_inclusive_ori(i32)
+declare spir_func i32 @_Z28sub_group_scan_inclusive_xori(i32)
+declare spir_func i1 @_Z36sub_group_scan_inclusive_logical_andb(i1)
+declare spir_func i1 @_Z35sub_group_scan_inclusive_logical_orb(i1)
+declare spir_func i1 @_Z36sub_group_scan_inclusive_logical_xorb(i1)
+
+; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_mul_i32(
+; CHECK: call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_mul_u5nxv4j(<vscale x 4 x i32> %{{.*}})
+define spir_kernel void @reduce_scan_incl_mul_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry: ; out[gid] = sub-group inclusive multiply scan of in[gid] (i32)
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_muli(i32 %0) ; scalar scan builtin; the directives above expect a __vecz_b_ vector call in its place
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
+  store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_nxv4_reduce_scan_excl_mul_i32(
+; CHECK: call <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_mul_u5nxv4j(<vscale x 4 x i32> %{{.*}})
+define spir_kernel void @reduce_scan_excl_mul_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry: ; out[gid] = sub-group exclusive multiply scan of in[gid] (i32)
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %call1 = tail call spir_func i32 @_Z28sub_group_scan_exclusive_muli(i32 %0) ; scalar scan builtin; the directives above expect a __vecz_b_ vector call in its place
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
+  store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_mul_f32(
+; CHECK: call <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_mul_u5nxv4f(<vscale x 4 x float> %{{.*}})
+define spir_kernel void @reduce_scan_incl_mul_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry: ; out[gid] = sub-group inclusive multiply scan of in[gid] (float)
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %arrayidx = getelementptr inbounds float, ptr addrspace(1) %in, i64 %call
+  %0 = load float, ptr addrspace(1) %arrayidx, align 4
+  %call1 = tail call spir_func float @_Z28sub_group_scan_inclusive_mulf(float %0) ; scalar scan builtin; the directives above expect a __vecz_b_ vector call in its place
+  %arrayidx2 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %call
+  store float %call1, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_nxv4_reduce_scan_excl_mul_f32(
+; CHECK: call <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_mul_u5nxv4f(<vscale x 4 x float> %{{.*}})
+define spir_kernel void @reduce_scan_excl_mul_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry: ; out[gid] = sub-group exclusive multiply scan of in[gid] (float)
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %arrayidx = getelementptr inbounds float, ptr addrspace(1) %in, i64 %call
+  %0 = load float, ptr addrspace(1) %arrayidx, align 4
+  %call1 = tail call spir_func float @_Z28sub_group_scan_exclusive_mulf(float %0) ; scalar scan builtin; the directives above expect a __vecz_b_ vector call in its place
+  %arrayidx2 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %call
+  store float %call1, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_and_i32(
+; CHECK: call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_and_u5nxv4j(<vscale x 4 x i32> %{{.*}})
+define spir_kernel void @reduce_scan_incl_and_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry: ; out[gid] = sub-group inclusive bitwise-and scan of in[gid] (i32)
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_andi(i32 %0) ; scalar scan builtin; the directives above expect a __vecz_b_ vector call in its place
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
+  store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_or_i32(
+; CHECK: call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_or_u5nxv4j(<vscale x 4 x i32> %{{.*}})
+define spir_kernel void @reduce_scan_incl_or_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry: ; out[gid] = sub-group inclusive bitwise-or scan of in[gid] (i32)
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %call1 = tail call spir_func i32 @_Z27sub_group_scan_inclusive_ori(i32 %0) ; scalar scan builtin; the directives above expect a __vecz_b_ vector call in its place
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
+  store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_xor_i32(
+; CHECK: call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_xor_u5nxv4j(<vscale x 4 x i32> %{{.*}})
+define spir_kernel void @reduce_scan_incl_xor_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry: ; out[gid] = sub-group inclusive bitwise-xor scan of in[gid] (i32)
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_xori(i32 %0) ; scalar scan builtin; the directives above expect a __vecz_b_ vector call in its place
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
+  store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_logical_and(
+; CHECK: call <vscale x 4 x i1> @__vecz_b_sub_group_scan_inclusive_and_u5nxv4b(<vscale x 4 x i1> %{{.*}})
+define spir_kernel void @reduce_scan_incl_logical_and(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry: ; out[gid] = sub-group inclusive logical-and scan of the low bit of in[gid]
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %1 = trunc i32 %0 to i1 ; narrow to the i1 operand the logical builtin takes
+  %call1 = tail call spir_func i1 @_Z36sub_group_scan_inclusive_logical_andb(i1 %1) ; the directives above expect an i1-vector __vecz_b_ call in its place
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
+  %2 = zext i1 %call1 to i32 ; widen the i1 result back to i32 for the store
+  store i32 %2, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_logical_or(
+; CHECK: call <vscale x 4 x i1> @__vecz_b_sub_group_scan_inclusive_or_u5nxv4b(<vscale x 4 x i1> %{{.*}})
+define spir_kernel void @reduce_scan_incl_logical_or(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry: ; out[gid] = sub-group inclusive logical-or scan of the low bit of in[gid]
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %1 = trunc i32 %0 to i1 ; narrow to the i1 operand the logical builtin takes
+  %call1 = tail call spir_func i1 @_Z35sub_group_scan_inclusive_logical_orb(i1 %1) ; the directives above expect an i1-vector __vecz_b_ call in its place
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
+  %2 = zext i1 %call1 to i32 ; widen the i1 result back to i32 for the store
+  store i32 %2, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_logical_xor(
+; CHECK: call <vscale x 4 x i1> @__vecz_b_sub_group_scan_inclusive_xor_u5nxv4b(<vscale x 4 x i1> %{{.*}})
+define spir_kernel void @reduce_scan_incl_logical_xor(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry: ; out[gid] = sub-group inclusive logical-xor scan of the low bit of in[gid]
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %1 = trunc i32 %0 to i1 ; narrow to the i1 operand the logical builtin takes
+  %call1 = tail call spir_func i1 @_Z36sub_group_scan_inclusive_logical_xorb(i1 %1) ; the directives above expect an i1-vector __vecz_b_ call in its place
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
+  %2 = zext i1 %call1 to i32 ; widen the i1 result back to i32 for the store
+  store i32 %2, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+!opencl.ocl.version = !{!0}
+
+!0 = !{i32 3, i32 0}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_spv_khr_uniform_group_instructions_vp.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_spv_khr_uniform_group_instructions_vp.ll
new file mode 100644
index 0000000000000..6c31bc229af68
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_spv_khr_uniform_group_instructions_vp.ll
@@ -0,0 +1,175 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -vecz-scalable -w 4 -S -vecz-choices=VectorPredication < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+declare spir_func i32 @_Z28sub_group_scan_inclusive_muli(i32)
+declare spir_func float @_Z28sub_group_scan_inclusive_mulf(float)
+
+declare spir_func i32 @_Z28sub_group_scan_exclusive_muli(i32)
+declare spir_func float @_Z28sub_group_scan_exclusive_mulf(float)
+
+declare spir_func i32 @_Z28sub_group_scan_inclusive_andi(i32)
+declare spir_func i32 @_Z27sub_group_scan_inclusive_ori(i32)
+declare spir_func i32 @_Z28sub_group_scan_inclusive_xori(i32)
+declare spir_func i1 @_Z36sub_group_scan_inclusive_logical_andb(i1)
+declare spir_func i1 @_Z35sub_group_scan_inclusive_logical_orb(i1)
+declare spir_func i1 @_Z36sub_group_scan_inclusive_logical_xorb(i1)
+
+; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_incl_mul_i32(
+; CHECK: call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_mul_vp_u5nxv4jj(<vscale x 4 x i32> %{{.*}}, i32 %{{.*}})
+define spir_kernel void @reduce_scan_incl_mul_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry: ; out[gid] = sub-group inclusive multiply scan of in[gid] (i32)
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_muli(i32 %0) ; the directives above expect the _vp_ vector builtin, which takes an extra i32 operand
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
+  store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_excl_mul_i32(
+; CHECK: call <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_mul_vp_u5nxv4jj(<vscale x 4 x i32> %{{.*}}, i32 %{{.*}})
+define spir_kernel void @reduce_scan_excl_mul_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry: ; out[gid] = sub-group exclusive multiply scan of in[gid] (i32)
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %call1 = tail call spir_func i32 @_Z28sub_group_scan_exclusive_muli(i32 %0) ; the directives above expect the _vp_ vector builtin, which takes an extra i32 operand
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
+  store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_incl_mul_f32(
+; CHECK: call <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_mul_vp_u5nxv4fj(<vscale x 4 x float> %{{.*}}, i32 %{{.*}})
+define spir_kernel void @reduce_scan_incl_mul_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry: ; out[gid] = sub-group inclusive multiply scan of in[gid] (float)
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %arrayidx = getelementptr inbounds float, ptr addrspace(1) %in, i64 %call
+  %0 = load float, ptr addrspace(1) %arrayidx, align 4
+  %call1 = tail call spir_func float @_Z28sub_group_scan_inclusive_mulf(float %0) ; the directives above expect the _vp_ vector builtin, which takes an extra i32 operand
+  %arrayidx2 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %call
+  store float %call1, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_excl_mul_f32(
+; CHECK: call <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_mul_vp_u5nxv4fj(<vscale x 4 x float> %{{.*}}, i32 %{{.*}})
+define spir_kernel void @reduce_scan_excl_mul_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry: ; out[gid] = sub-group exclusive multiply scan of in[gid] (float)
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %arrayidx = getelementptr inbounds float, ptr addrspace(1) %in, i64 %call
+  %0 = load float, ptr addrspace(1) %arrayidx, align 4
+  %call1 = tail call spir_func float @_Z28sub_group_scan_exclusive_mulf(float %0) ; the directives above expect the _vp_ vector builtin, which takes an extra i32 operand
+  %arrayidx2 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %call
+  store float %call1, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_incl_and_i32(
+; CHECK: call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_and_vp_u5nxv4jj(<vscale x 4 x i32> %{{.*}}, i32 %{{.*}})
+define spir_kernel void @reduce_scan_incl_and_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry: ; out[gid] = sub-group inclusive bitwise-and scan of in[gid] (i32)
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_andi(i32 %0) ; the directives above expect the _vp_ vector builtin, which takes an extra i32 operand
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
+  store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_incl_or_i32(
+; CHECK: call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_or_vp_u5nxv4jj(<vscale x 4 x i32> %{{.*}}, i32 %{{.*}})
+define spir_kernel void @reduce_scan_incl_or_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry: ; out[gid] = sub-group inclusive bitwise-or scan of in[gid] (i32)
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %call1 = tail call spir_func i32 @_Z27sub_group_scan_inclusive_ori(i32 %0) ; the directives above expect the _vp_ vector builtin, which takes an extra i32 operand
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
+  store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_incl_xor_i32(
+; CHECK: call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_xor_vp_u5nxv4jj(<vscale x 4 x i32> %{{.*}}, i32 %{{.*}})
+define spir_kernel void @reduce_scan_incl_xor_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry: ; out[gid] = sub-group inclusive bitwise-xor scan of in[gid] (i32)
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_xori(i32 %0) ; the directives above expect the _vp_ vector builtin, which takes an extra i32 operand
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
+  store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_incl_logical_and(
+; CHECK: call <vscale x 4 x i1> @__vecz_b_sub_group_scan_inclusive_and_vp_u5nxv4bj(<vscale x 4 x i1> %{{.*}}, i32 %{{.*}})
+define spir_kernel void @reduce_scan_incl_logical_and(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry: ; out[gid] = sub-group inclusive logical-and scan of the low bit of in[gid]
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %1 = trunc i32 %0 to i1 ; narrow to the i1 operand the logical builtin takes
+  %call1 = tail call spir_func i1 @_Z36sub_group_scan_inclusive_logical_andb(i1 %1) ; the directives above expect the i1-vector _vp_ builtin, which takes an extra i32 operand
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
+  %2 = zext i1 %call1 to i32 ; widen the i1 result back to i32 for the store
+  store i32 %2, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_incl_logical_or(
+; CHECK: call <vscale x 4 x i1> @__vecz_b_sub_group_scan_inclusive_or_vp_u5nxv4bj(<vscale x 4 x i1> %{{.*}}, i32 %{{.*}})
+define spir_kernel void @reduce_scan_incl_logical_or(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry: ; out[gid] = sub-group inclusive logical-or scan of the low bit of in[gid]
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %1 = trunc i32 %0 to i1 ; narrow to the i1 operand the logical builtin takes
+  %call1 = tail call spir_func i1 @_Z35sub_group_scan_inclusive_logical_orb(i1 %1) ; the directives above expect the i1-vector _vp_ builtin, which takes an extra i32 operand
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
+  %2 = zext i1 %call1 to i32 ; widen the i1 result back to i32 for the store
+  store i32 %2, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_incl_logical_xor(
+; CHECK: call <vscale x 4 x i1> @__vecz_b_sub_group_scan_inclusive_xor_vp_u5nxv4bj(<vscale x 4 x i1> %{{.*}}, i32 %{{.*}})
+define spir_kernel void @reduce_scan_incl_logical_xor(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry: ; out[gid] = sub-group inclusive logical-xor scan of the low bit of in[gid]
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %1 = trunc i32 %0 to i1 ; narrow to the i1 operand the logical builtin takes
+  %call1 = tail call spir_func i1 @_Z36sub_group_scan_inclusive_logical_xorb(i1 %1) ; the directives above expect the i1-vector _vp_ builtin, which takes an extra i32 operand
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
+  %2 = zext i1 %call1 to i32 ; widen the i1 result back to i32 for the store
+  store i32 %2, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+!opencl.ocl.version = !{!0}
+
+!0 = !{i32 3, i32 0}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_vp.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_vp.ll
new file mode 100644
index 0000000000000..f464a69f2596b
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_vp.ll
@@ -0,0 +1,154 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication -S -vecz-passes=packetizer < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+declare spir_func i32 @_Z28sub_group_scan_inclusive_addi(i32)
+declare spir_func i64 @_Z28sub_group_scan_inclusive_addl(i64)
+declare spir_func float @_Z28sub_group_scan_inclusive_addf(float)
+
+declare spir_func i32 @_Z28sub_group_scan_inclusive_mini(i32)
+declare spir_func i32 @_Z28sub_group_scan_inclusive_minj(i32)
+declare spir_func i32 @_Z28sub_group_scan_inclusive_maxi(i32)
+declare spir_func i32 @_Z28sub_group_scan_inclusive_maxj(i32)
+declare spir_func float @_Z28sub_group_scan_inclusive_minf(float)
+declare spir_func float @_Z28sub_group_scan_inclusive_maxf(float)
+
+define spir_kernel void @reduce_scan_incl_add_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry: ; out[gid] = sub-group inclusive add scan of in[gid] (i32)
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_addi(i32 %0) ; the directives below expect the _vp_ vector builtin, which takes an extra i32 operand
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
+  store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_incl_add_i32(
+; CHECK: call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4jj(<vscale x 4 x i32> %{{.*}}, i32 %{{.+}})
+}
+
+define spir_kernel void @reduce_scan_incl_add_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry: ; out[gid] = sub-group inclusive add scan of in[gid] (i64)
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %arrayidx = getelementptr inbounds i64, ptr addrspace(1) %in, i64 %call
+  %0 = load i64, ptr addrspace(1) %arrayidx, align 4 ; NOTE(review): align 4 on an i64 access is legal but under-aligned - possibly copied from the i32 variant; confirm
+  %call1 = tail call spir_func i64 @_Z28sub_group_scan_inclusive_addl(i64 %0) ; the directives below expect the _vp_ vector builtin, which takes an extra i32 operand
+  %arrayidx2 = getelementptr inbounds i64, ptr addrspace(1) %out, i64 %call
+  store i64 %call1, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_incl_add_i64(
+; CHECK: call <vscale x 4 x i64> @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4mj(<vscale x 4 x i64> %{{.*}}, i32 %{{.+}})
+}
+
+define spir_kernel void @reduce_scan_incl_add_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry: ; out[gid] = sub-group inclusive add scan of in[gid] (float)
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %arrayidx = getelementptr inbounds float, ptr addrspace(1) %in, i64 %call
+  %0 = load float, ptr addrspace(1) %arrayidx, align 4
+  %call1 = tail call spir_func float @_Z28sub_group_scan_inclusive_addf(float %0) ; the directives below expect the _vp_ vector builtin, which takes an extra i32 operand
+  %arrayidx2 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %call
+  store float %call1, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_incl_add_f32(
+; CHECK: call <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4fj(<vscale x 4 x float> %{{.*}}, i32 %{{.+}})
+}
+
+define spir_kernel void @reduce_scan_incl_smin_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry: ; out[gid] = sub-group inclusive min scan of in[gid], signed overload (mangled 'i')
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_mini(i32 %0) ; the directives below expect the smin _vp_ vector builtin, which takes an extra i32 operand
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
+  store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_incl_smin_i32(
+; CHECK: call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_vp_u5nxv4ij(<vscale x 4 x i32> %{{.*}}, i32 %{{.+}})
+}
+
+define spir_kernel void @reduce_scan_incl_umin_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry: ; out[gid] = sub-group inclusive min scan of in[gid], unsigned overload (mangled 'j')
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_minj(i32 %0) ; the directives below expect the umin _vp_ vector builtin, which takes an extra i32 operand
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
+  store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_incl_umin_i32(
+; CHECK: call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umin_vp_u5nxv4jj(<vscale x 4 x i32> %{{.*}}, i32 %{{.+}})
+}
+
+define spir_kernel void @reduce_scan_incl_smax_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry: ; out[gid] = sub-group inclusive max scan of in[gid], signed overload (mangled 'i')
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_maxi(i32 %0) ; the directives below expect the smax _vp_ vector builtin, which takes an extra i32 operand
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
+  store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_incl_smax_i32(
+; CHECK: call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smax_vp_u5nxv4ij(<vscale x 4 x i32> %{{.*}}, i32 %{{.+}})
+}
+
+define spir_kernel void @reduce_scan_incl_umax_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry: ; out[gid] = sub-group inclusive max scan of in[gid], unsigned overload (mangled 'j')
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_maxj(i32 %0) ; the directives below expect the umax _vp_ vector builtin, which takes an extra i32 operand
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
+  store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_incl_umax_i32(
+; CHECK: call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umax_vp_u5nxv4jj(<vscale x 4 x i32> %{{.*}}, i32 %{{.+}})
+}
+
+define spir_kernel void @reduce_scan_incl_fmin_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry: ; out[gid] = sub-group inclusive min scan of in[gid] (float)
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %arrayidx = getelementptr inbounds float, ptr addrspace(1) %in, i64 %call
+  %0 = load float, ptr addrspace(1) %arrayidx, align 4
+  %call1 = tail call spir_func float @_Z28sub_group_scan_inclusive_minf(float %0) ; the directives below expect the min _vp_ vector builtin, which takes an extra i32 operand
+  %arrayidx2 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %call
+  store float %call1, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_incl_fmin_f32(
+; CHECK: call <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_min_vp_u5nxv4fj(<vscale x 4 x float> %{{.*}}, i32 %{{.+}})
+}
+
+define spir_kernel void @reduce_scan_incl_fmax_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry: ; out[gid] = sub-group inclusive max scan of in[gid] (float)
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %arrayidx = getelementptr inbounds float, ptr addrspace(1) %in, i64 %call
+  %0 = load float, ptr addrspace(1) %arrayidx, align 4
+  %call1 = tail call spir_func float @_Z28sub_group_scan_inclusive_maxf(float %0) ; the directives below expect the max _vp_ vector builtin, which takes an extra i32 operand
+  %arrayidx2 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %call
+  store float %call1, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_incl_fmax_f32(
+; CHECK: call <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_max_vp_u5nxv4fj(<vscale x 4 x float> %{{.*}}, i32 %{{.+}})
+}
+
+!opencl.ocl.version = !{!0}
+
+!0 = !{i32 3, i32 0}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/vectors.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/vectors.ll
new file mode 100644
index 0000000000000..7196ed98fc9d2
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/vectors.ll
@@ -0,0 +1,41 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k load_add_store -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @load_add_store(<4 x i32>* %aptr, <4 x i32>* %bptr, <4 x i32>* %zptr) { ; zptr[idx] = aptr[idx] + bptr[idx], each element a <4 x i32>
+entry:
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0), the same index is used for all three pointers
+  %arrayidxa = getelementptr inbounds <4 x i32>, <4 x i32>* %aptr, i64 %idx
+  %arrayidxb = getelementptr inbounds <4 x i32>, <4 x i32>* %bptr, i64 %idx
+  %arrayidxz = getelementptr inbounds <4 x i32>, <4 x i32>* %zptr, i64 %idx
+  %a = load <4 x i32>, <4 x i32>* %arrayidxa, align 4
+  %b = load <4 x i32>, <4 x i32>* %arrayidxb, align 4
+  %sum = add <4 x i32> %a, %b ; the directives at the end of the file expect this as a <vscale x 16 x i32> add
+  store <4 x i32> %sum, <4 x i32>* %arrayidxz, align 4
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CHECK: define spir_kernel void @__vecz_nxv4_load_add_store
+; CHECK: [[lhs:%[0-9a-z]+]] = load <vscale x 16 x i32>, ptr
+; CHECK: [[rhs:%[0-9a-z]+]] = load <vscale x 16 x i32>, ptr
+; CHECK: [[sum:%[0-9a-z]+]] = add <vscale x 16 x i32> [[lhs]], [[rhs]]
+; CHECK: store <vscale x 16 x i32> [[sum]],
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/verification_fail_phi.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/verification_fail_phi.ll
new file mode 100644
index 0000000000000..a3cfb59861b55
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/verification_fail_phi.ll
@@ -0,0 +1,49 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; Check that we fail to vectorize but don't leave behind an invalid function.
+; REQUIRES: llvm-13+
+; RUN: %not %veczc -k regression_phis -vecz-scalable -w 1 -vecz-passes=packetizer,verify -S < %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+define spir_kernel void @regression_phis(i64 addrspace(1)* %xs, i64 addrspace(1)* %ys, i32 addrspace(1)* %out, i64 %lim) { ; atomically bumps out[retval % %1], where retval is xs[gid] or ys[gid]
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %arrayidx.x = getelementptr inbounds i64, i64 addrspace(1)* %xs, i64 %call
+  %x = load i64, i64 addrspace(1)* %arrayidx.x, align 4
+  %cond = icmp eq i64 %call, 0
+  br i1 %cond, label %if.then, label %exit ; only the work-item whose global id is 0 takes %if.then
+
+if.then:
+  %arrayidx.y = getelementptr inbounds i64, i64 addrspace(1)* %ys, i64 %call
+  %y = load i64, i64 addrspace(1)* %arrayidx.y, align 4
+  br label %exit
+
+exit:
+  ; We previously left behind an invalid PHI with too few operands, owing to us
+  ; bailing out while PHIs were still pending post-vectorization fixup.
+  %retval = phi i64 [ %x, %entry ], [ %y, %if.then ]
+  %0 = icmp eq i64 %lim, 0
+  %1 = select i1 %0, i64 1, i64 %lim ; substitutes 1 for a zero %lim so the urem below never divides by zero
+  %rem = urem i64 %retval, %1
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %rem
+  %2 = atomicrmw add i32 addrspace(1)* %arrayidx, i32 1 monotonic ; result %2 is unused
+  ret void
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/widen_vload.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/widen_vload.ll
new file mode 100644
index 0000000000000..f2748abcc01e2
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/widen_vload.ll
@@ -0,0 +1,35 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; REQUIRES: llvm-13+
+; RUN: %veczc -k widen_vload -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @widen_vload(<4 x i32>* %aptr, <4 x i32>* %zptr) { ; zptr[idx] = aptr[idx % 2], each element a <4 x i32>
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %mod_idx = urem i64 %idx, 2 ; load index is idx modulo 2, unlike the store index which is idx itself
+  %arrayidxa = getelementptr inbounds <4 x i32>, <4 x i32>* %aptr, i64 %mod_idx
+  %v = load <4 x i32>, <4 x i32>* %arrayidxa, align 16 ; the directives below expect this load as a gather builtin call returning <vscale x 16 x i32>
+  %arrayidxz = getelementptr inbounds <4 x i32>, <4 x i32>* %zptr, i64 %idx
+  store <4 x i32> %v, <4 x i32>* %arrayidxz, align 16
+  ret void
+; CHECK: define spir_kernel void @__vecz_nxv4_widen_vload(
+; CHECK: %v4 = call <vscale x 16 x i32> @__vecz_b_gather_load16_u6nxv16ju10nxv16u3ptr(<vscale x 16 x ptr> %{{.*}})
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/workitem_funcs.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/workitem_funcs.ll
new file mode 100644
index 0000000000000..3d7433948c454
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/workitem_funcs.ll
@@ -0,0 +1,41 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; REQUIRES: llvm-13+
+; RUN: %veczc -k store_ult -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s
+
+; Check that we can scalably-vectorize a call to get_global_id by using the
+; stepvector intrinsic
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @store_ult(i32* %out, i64* %N) { ; out[gid] = (gid < *N) ? 1 : 0
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #2 ; get_global_id(0); NOTE(review): no attribute group #2 is defined in this file
+  %0 = load i64, i64* %N, align 8
+  %cmp = icmp ult i64 %call, %0 ; unsigned comparison of the global id against the loaded bound
+  %conv = zext i1 %cmp to i32
+  %arrayidx = getelementptr inbounds i32, i32* %out, i64 %call
+  store i32 %conv, i32* %arrayidx, align 4
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CHECK: define spir_kernel void @__vecz_nxv4_store_ult
+; CHECK:   [[step:%[0-9.a-z]+]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; CHECK:   %{{.*}} = add <vscale x 4 x i64> %{{.*}}, [[step]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/boscc_reduction.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/boscc_reduction.ll
new file mode 100644
index 0000000000000..5d5438246b08c
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/boscc_reduction.ll
@@ -0,0 +1,51 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; REQUIRES: llvm-13+
+; RUN: %veczc -k foo -vecz-scalable -vecz-simd-width=2 -vecz-choices=VectorPredication -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+define spir_kernel void @foo(float addrspace(1)* nocapture readonly %a, i32 addrspace(1)* nocapture %out) { ; if (a[gid] == 0.0) out[gid] += 42
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #2 ; get_global_id(0); NOTE(review): no attribute group #2 is defined in this file
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 %call
+  %0 = load float, float addrspace(1)* %arrayidx, align 4
+  %cmp = fcmp oeq float %0, 0.000000e+00 ; branch condition depends on per-work-item data
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4
+  %add = add nsw i32 %1, 42
+  store i32 %add, i32 addrspace(1)* %arrayidx1, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_nxv2_vp_foo(ptr addrspace(1) nocapture readonly %a, ptr addrspace(1) nocapture %out)
+; CHECK:  [[CMP:%.*]] = fcmp oeq <vscale x 2 x float> %{{.*}}, zeroinitializer
+; CHECK:  [[INS:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[VL:%.*]], {{(i32|i64)}} 0
+; CHECK:  [[SPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[INS]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK:  [[IDX:%.*]] = call <vscale x 2 x i32> @llvm.experimental.stepvector.nxv2i32()
+; CHECK:  [[MASK:%.*]] = icmp ult <vscale x 2 x i32> [[IDX]], [[SPLAT]]
+; CHECK:  [[INP:%.*]] = select <vscale x 2 x i1> [[MASK]], <vscale x 2 x i1> [[CMP]], <vscale x 2 x i1> zeroinitializer
+; CHECK:  %{{.*}} = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[INP]])
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/choice.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/choice.ll
new file mode 100644
index 0000000000000..731cf2729cd17
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/choice.ll
@@ -0,0 +1,34 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; REQUIRES: llvm-13+
+; Just check that the VectorPredication choice is valid
+; RUN: %veczc -k foo -vecz-simd-width=2 -vecz-choices=VectorPredication -S < %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+define spir_kernel void @foo(float* %aptr, float* %zptr) { ; zptr[idx] = aptr[idx]; this file has no output directives, only the tool's exit status is tested
+entry:
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %arrayidxa = getelementptr inbounds float, float* %aptr, i64 %idx
+  %arrayidxz = getelementptr inbounds float, float* %zptr, i64 %idx
+  %a = load float, float* %arrayidxa, align 4
+  store float %a, float* %arrayidxz, align 4
+  ret void
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/compute_vector_length.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/compute_vector_length.ll
new file mode 100644
index 0000000000000..326fe8da639db
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/compute_vector_length.ll
@@ -0,0 +1,54 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; REQUIRES: llvm-13+
+; RUN: %veczc -k get_sub_group_size -vecz-simd-width=2 -vecz-choices=VectorPredication -S < %s | %filecheck %s --check-prefix CHECK-F2
+; RUN: %veczc -k get_sub_group_size -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | %filecheck %s --check-prefix CHECK-S4
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare spir_func i32 @_Z16get_sub_group_idv()
+declare spir_func i32 @_Z18get_sub_group_sizev()
+
+define spir_kernel void @get_sub_group_size(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[get_sub_group_id()] = get_sub_group_size(); %in is unused
+  %call.i = tail call spir_func i32 @_Z16get_sub_group_idv()
+  %conv = zext i32 %call.i to i64 ; widen the sub-group id for use as a 64-bit index
+  %call2 = tail call spir_func i32 @_Z18get_sub_group_sizev() ; the directives below expect a computed vector length to be stored instead
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv
+  store i32 %call2, i32 addrspace(1)* %arrayidx, align 4
+  ret void
+}
+
+; Makes sure the vector length is properly computed and substituted for get_sub_group_size()
+
+; CHECK-F2-LABEL: define spir_kernel void @__vecz_v2_vp_get_sub_group_size(
+; CHECK-F2: [[ID:%.*]] = call i64 @__mux_get_local_id(i32 0)
+; CHECK-F2: [[SZ:%.*]] = call i64 @__mux_get_local_size(i32 0)
+; CHECK-F2: [[WL:%.*]] = sub {{.*}} i64 [[SZ]], [[ID]]
+; CHECK-F2: [[VL0:%.*]] = call i64 @llvm.umin.i64(i64 [[WL]], i64 2)
+; CHECK-F2: [[VL1:%.*]] = trunc i64 [[VL0]] to i32
+; CHECK-F2: store i32 [[VL1]], ptr addrspace(1) {{.*}}
+
+; CHECK-S4-LABEL: define spir_kernel void @__vecz_nxv4_vp_get_sub_group_size(
+; CHECK-S4: [[ID:%.*]] = call i64 @__mux_get_local_id(i32 0)
+; CHECK-S4: [[SZ:%.*]] = call i64 @__mux_get_local_size(i32 0)
+; CHECK-S4: [[WL:%.*]] = sub {{.*}} i64 [[SZ]], [[ID]]
+; CHECK-S4: [[VF0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-S4: [[VF1:%.*]] = shl i64 [[VF0]], 2
+; CHECK-S4: [[VL0:%.*]] = call i64 @llvm.umin.i64(i64 [[WL]], i64 [[VF1]])
+; CHECK-S4: [[VL1:%.*]] = trunc i64 [[VL0]] to i32
+; CHECK-S4: store i32 [[VL1]], ptr addrspace(1) {{.*}}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_interleaved_load_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_interleaved_load_store.ll
new file mode 100644
index 0000000000000..480d0dccf4e7c
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_interleaved_load_store.ll
@@ -0,0 +1,80 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; REQUIRES: llvm-13+
+; RUN: %veczc -k f -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication:FullScalarization -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) { ; a[gid] -= fmuladd(v, c[gid], d[gid] / e[gid]), v built from two loads of b[gid]; %flag is unused
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %add.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call
+  %.cast = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %add.ptr, i64 0, i64 0 ; address of element 0 of b[gid]
+  %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 ; b[gid] before the scalar store
+  store double 1.600000e+01, double addrspace(1)* %.cast, align 8 ; overwrites element 0 of b[gid] with 16.0
+  %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 ; b[gid] after the scalar store
+  %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 undef> ; lanes 0-1 from %0, lane 2 from %1
+  %vecins7 = shufflevector <4 x double> %vecins5, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 7> ; lane 3 from %1
+  %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call
+  %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+  %arrayidx8 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %d, i64 %call
+  %3 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx8, align 32
+  %arrayidx9 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %e, i64 %call
+  %4 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx9, align 32
+  %div = fdiv <4 x double> %3, %4
+  %5 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %vecins7, <4 x double> %2, <4 x double> %div)
+  %arrayidx10 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %a, i64 %call
+  %6 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx10, align 32
+  %sub = fsub <4 x double> %6, %5
+  store <4 x double> %sub, <4 x double> addrspace(1)* %arrayidx10, align 32
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+declare spir_func void @_Z7barrierj(i32) #1
+
+; Function Attrs: nounwind readnone
+declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) #2
+
+; Test if the interleaved load is defined correctly
+; Vector-predicated interleaved loads are always masked
+; CHECK: define <vscale x 4 x double> @__vecz_b_masked_interleaved_load8_vp_4_u5nxv4du3ptrU3AS1u5nxv4bj(ptr addrspace(1){{( %0)?}}, <vscale x 4 x i1>{{( %1)?}}, i32{{( %2)?}}) {
+; CHECK: entry:
+; CHECK:   %BroadcastAddr.splatinsert = insertelement <vscale x 4 x ptr addrspace(1)> poison, ptr addrspace(1) %0, {{i32|i64}} 0
+; CHECK:   %BroadcastAddr.splat = shufflevector <vscale x 4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <vscale x 4 x ptr addrspace(1)> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK:   %3 = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; CHECK:   %4 = mul <vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 4, {{i32|i64}} 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer), %3
+; CHECK:   %5 = getelementptr double, <vscale x 4 x ptr addrspace(1)> %BroadcastAddr.splat, <vscale x 4 x i64> %4
+; CHECK:   %6 = call <vscale x 4 x double> @llvm.vp.gather.nxv4f64.nxv4p1(<vscale x 4 x ptr addrspace(1)> %5, <vscale x 4 x i1> %1, i32 %2)
+; CHECK:   ret <vscale x 4 x double> %6
+; CHECK: }
+
+
+; Test if the interleaved store is defined correctly
+; Vector-predicated interleaved stores are always masked
+; CHECK: define void @__vecz_b_masked_interleaved_store8_vp_4_u5nxv4du3ptrU3AS1u5nxv4bj(<vscale x 4 x double>{{( %0)?}}, ptr addrspace(1){{( %1)?}}, <vscale x 4 x i1>{{( %2)?}}, i32{{( %3)?}})
+; CHECK: entry:
+; CHECK:  %BroadcastAddr.splatinsert = insertelement <vscale x 4 x ptr addrspace(1)> poison, ptr addrspace(1) %1, {{i32|i64}} 0
+; CHECK:  %BroadcastAddr.splat = shufflevector <vscale x 4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <vscale x 4 x ptr addrspace(1)> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK:  %4 = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; CHECK:  %5 = mul <vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 4, {{i32|i64}} 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer), %4
+; CHECK:  %6 = getelementptr double, <vscale x 4 x ptr addrspace(1)> %BroadcastAddr.splat, <vscale x 4 x i64> %5
+; CHECK:  call void @llvm.vp.scatter.nxv4f64.nxv4p1(<vscale x 4 x double> %0, <vscale x 4 x ptr addrspace(1)> %6, <vscale x 4 x i1> %2, i32 %3)
+; CHECK:  ret void
+; CHECK: }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_load_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_load_store.ll
new file mode 100644
index 0000000000000..4a91a8f7ea122
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_load_store.ll
@@ -0,0 +1,76 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; REQUIRES: llvm-13+
+; RUN: %veczc -k dont_mask_workitem_builtins -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @dont_mask_workitem_builtins(i32 addrspace(2)* %in, i32 addrspace(1)* %out) { ; local id > 0: out[gid] = in[gid]; otherwise out[group_id * local_size + local_id] = 42
+entry:
+  %call = call spir_func i64 @_Z12get_local_idj(i32 0) ; get_local_id(0)
+  %conv = trunc i64 %call to i32
+  %cmp = icmp sgt i32 %conv, 0 ; signed test on the truncated local id
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  %call2 = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %conv3 = trunc i64 %call2 to i32
+  %idxprom = sext i32 %conv3 to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(2)* %in, i64 %idxprom
+  %0 = load i32, i32 addrspace(2)* %arrayidx, align 4
+  %idxprom4 = sext i32 %conv3 to i64
+  %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom4
+  store i32 %0, i32 addrspace(1)* %arrayidx5, align 4
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %call8 = call spir_func i64 @_Z14get_local_sizej(i32 0)
+  %call9 = call spir_func i64 @_Z12get_group_idj(i32 0)
+  %mul = mul i64 %call9, %call8
+  %add = add i64 %mul, %call
+  %sext = shl i64 %add, 32 ; together with the ashr below, sign-extends the low 32 bits of %add
+  %idxprom11 = ashr exact i64 %sext, 32
+  %arrayidx12 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom11
+  store i32 42, i32 addrspace(1)* %arrayidx12, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  ret void
+}
+
+declare spir_func i64 @_Z12get_local_idj(i32)
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+declare spir_func i64 @_Z14get_local_sizej(i32)
+
+declare spir_func i64 @_Z12get_group_idj(i32)
+
+; Test if the masked store is defined correctly
+; CHECK: define void @__vecz_b_masked_store4_vp_Dv4_ju3ptrU3AS1Dv4_bj(<4 x i32>{{( %0)?}}, ptr addrspace(1){{( %1)?}}, <4 x i1>{{( %2)?}}, i32{{( %3)?}}) {
+; CHECK: entry:
+; CHECK: call void @llvm.vp.store.v4i32.p1(<4 x i32> %0, ptr addrspace(1) %1, <4 x i1> %2, i32 %3)
+; CHECK: ret void
+
+; Test if the masked load is defined correctly
+; CHECK: define <4 x i32> @__vecz_b_masked_load4_vp_Dv4_ju3ptrU3AS2Dv4_bj(ptr addrspace(2){{( %0)?}}, <4 x i1>{{( %1)?}}, i32{{( %2)?}})
+; CHECK: entry:
+; CHECK: %3 = call <4 x i32> @llvm.vp.load.v4i32.p2(ptr addrspace(2) %0, <4 x i1> %1, i32 %2)
+; CHECK: ret <4 x i32> %3
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_scatter_gather.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_scatter_gather.ll
new file mode 100644
index 0000000000000..77e9b47c8b866
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_scatter_gather.ll
@@ -0,0 +1,90 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; REQUIRES: llvm-13+
+; RUN: %veczc -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @masked_scatter(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %b_index) { ; b[b_index[gid]] = (gid % 3 != 0) ? a[gid] : 42
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %rem = urem i64 %call, 3
+  %cmp = icmp eq i64 %rem, 0
+  br i1 %cmp, label %if.else, label %if.then ; note the order: a true condition (gid % 3 == 0) goes to %if.else
+
+if.then:                                          ; preds = %entry
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %call
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %b_index, i64 %call
+  %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4
+  %idxprom = sext i32 %1 to i64
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %idxprom
+  store i32 %0, i32 addrspace(1)* %arrayidx2, align 4 ; store address comes from a loaded index
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %b_index, i64 %call
+  %2 = load i32, i32 addrspace(1)* %arrayidx3, align 4
+  %idxprom4 = sext i32 %2 to i64
+  %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %idxprom4
+  store i32 42, i32 addrspace(1)* %arrayidx5, align 4 ; store address comes from a loaded index
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  ret void
+}
+
+; Test if the vector-predicated scatter store is defined correctly
+; CHECK: define void @__vecz_b_masked_scatter_store4_vp_u5nxv4ju14nxv4u3ptrU3AS1u5nxv4bj(<vscale x 4 x i32>{{( %0)?}}, <vscale x 4 x ptr addrspace(1)>{{( %1)?}}, <vscale x 4 x i1>{{( %2)?}}, i32{{( %3)?}})
+; CHECK: entry:
+; CHECK: call void @llvm.vp.scatter.nxv4i32.nxv4p1(<vscale x 4 x i32> %0, <vscale x 4 x ptr addrspace(1)> %1, <vscale x 4 x i1> %2, i32 %3)
+; CHECK: ret void
+
+define spir_kernel void @masked_gather(i32 addrspace(1)* %a, i32 addrspace(1)* %a_index, i32 addrspace(1)* %b) { ; b[gid] = (gid % 3 != 0) ? a[a_index[gid]] : 42
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %rem = urem i64 %call, 3
+  %cmp = icmp eq i64 %rem, 0
+  br i1 %cmp, label %if.else, label %if.then ; note the order: a true condition (gid % 3 == 0) goes to %if.else
+
+if.then:                                          ; preds = %entry
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %a_index, i64 %call
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %idxprom = sext i32 %0 to i64
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %idxprom
+  %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4 ; load address comes from a loaded index
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %call
+  store i32 %1, i32 addrspace(1)* %arrayidx2, align 4
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %call
+  store i32 42, i32 addrspace(1)* %arrayidx3, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; Test if the vector-predicated gather load is defined correctly
+; CHECK: define <vscale x 4 x i32> @__vecz_b_masked_gather_load4_vp_u5nxv4ju14nxv4u3ptrU3AS1u5nxv4bj(<vscale x 4 x ptr addrspace(1)>{{( %0)?}}, <vscale x 4 x i1>{{( %1)?}}, i32{{( %2)?}})
+; CHECK: entry:
+; CHECK: %3 = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p1(<vscale x 4 x ptr addrspace(1)> %0, <vscale x 4 x i1> %1, i32 %2)
+; CHECK: ret <vscale x 4 x i32> %3
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_subgroup_scans.ll
new file mode 100644
index 0000000000000..12924804cd560
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_subgroup_scans.ll
@@ -0,0 +1,184 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; REQUIRES: llvm-13+
+; RUN: %veczc -k dummy -vecz-simd-width=4 -vecz-passes=define-builtins -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @dummy(i32 addrspace(2)* %in, i32 addrspace(1)* %out) { ; %in and %out are never read or written; the body only references the scan builtins
+  ; Dummy uses of the builtins, as we don't define any with zero uses. Every call passes a zero vector and i32 0, and none of the results %a..%m is used.
+  %a = call <4 x i32> @__vecz_b_sub_group_scan_inclusive_add_vp_Dv4_jj(<4 x i32> zeroinitializer, i32 0)
+  %b = call <4 x i32> @__vecz_b_sub_group_scan_exclusive_add_vp_Dv4_jj(<4 x i32> zeroinitializer, i32 0)
+  %c = call <4 x float> @__vecz_b_sub_group_scan_inclusive_add_vp_Dv4_fj(<4 x float> zeroinitializer, i32 0)
+  %d = call <4 x float> @__vecz_b_sub_group_scan_exclusive_add_vp_Dv4_fj(<4 x float> zeroinitializer, i32 0)
+  %e = call <4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_vp_Dv4_jj(<4 x i32> zeroinitializer, i32 0)
+  %f = call <4 x i32> @__vecz_b_sub_group_scan_exclusive_smin_vp_Dv4_jj(<4 x i32> zeroinitializer, i32 0)
+  %g = call <4 x i32> @__vecz_b_sub_group_scan_inclusive_smax_vp_Dv4_jj(<4 x i32> zeroinitializer, i32 0)
+  %h = call <4 x i32> @__vecz_b_sub_group_scan_inclusive_umin_vp_Dv4_jj(<4 x i32> zeroinitializer, i32 0)
+  %i = call <4 x i32> @__vecz_b_sub_group_scan_inclusive_umax_vp_Dv4_jj(<4 x i32> zeroinitializer, i32 0)
+  %j = call <4 x float> @__vecz_b_sub_group_scan_inclusive_min_vp_Dv4_fj(<4 x float> zeroinitializer, i32 0)
+  %k = call <4 x float> @__vecz_b_sub_group_scan_inclusive_max_vp_Dv4_fj(<4 x float> zeroinitializer, i32 0)
+  %l = call <4 x float> @__vecz_b_sub_group_scan_exclusive_min_vp_Dv4_fj(<4 x float> zeroinitializer, i32 0)
+  %m = call <4 x float> @__vecz_b_sub_group_scan_exclusive_max_vp_Dv4_fj(<4 x float> zeroinitializer, i32 0)
+  ret void ; NOTE(review): no exclusive smax/umin/umax variants are referenced above - confirm that omission is intentional
+}
+
+declare <4 x i32> @__vecz_b_sub_group_scan_inclusive_add_vp_Dv4_jj(<4 x i32>, i32)
+; CHECK-LABEL: define <4 x i32> @__vecz_b_sub_group_scan_inclusive_add_vp_Dv4_jj(<4 x i32>{{.*}}, i32{{.*}}) {
+; CHECK: entry:
+; CHECK:   %[[SHUFFLE_ALLOC:.+]] = alloca <4 x i32>
+; CHECK:   br label %loop
+; CHECK: loop:
+; CHECK:   %[[IV:.+]] = phi i32 [ 1, %entry ], [ %[[N2:.+]], %loop ]
+; CHECK:   %[[VEC:.+]] = phi <4 x i32> [ %0, %entry ], [ %[[NEWVEC:.+]], %loop ]
+; CHECK:   %[[MASKPHI:.+]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %entry ], [ %[[NEWMASK:.+]], %loop ]
+; CHECK:   %[[N_INS:.+]] = insertelement <4 x i32> poison, i32 %[[IV]], {{i32|i64}} 0
+; CHECK:   %[[N_SPLAT:.+]] = shufflevector <4 x i32> %[[N_INS]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK:   %[[MASK:.+]] = xor <4 x i32> %[[MASKPHI]], %[[N_SPLAT]]
+
+;------- target-dependent dynamic shuffle code:
+; CHECK:   store <4 x i32> %[[VEC]], {{(<4 x i32>\*)|(ptr)}} %[[SHUFFLE_ALLOC]]
+;------- there will be a bitcast here if pointers are typed
+; CHECK:   %[[INDEX:.+]] = getelementptr inbounds i32, [[PTRTY:(i32\*)|ptr]] %{{.+}}, <4 x i32> %[[MASK]]
+; CHECK:   %[[VLINS:.+]] = insertelement <4 x i32> poison, i32 %1, {{i32|i64}} 0
+; CHECK:   %[[VLSPLAT:.+]] = shufflevector <4 x i32> %[[VLINS]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK:   %[[VLMASK:.+]] = icmp ult <4 x i32> <i32 0, i32 1, i32 2, i32 3>, %[[VLSPLAT]]
+; CHECK:   %[[SHUFFLE:.+]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0{{(i32)?}}(<4 x [[PTRTY]]> %[[INDEX]], i32 4, <4 x i1> %[[VLMASK]], <4 x i32> undef)
+
+; CHECK:   %[[ACCUM:.+]] = add <4 x i32> %[[VEC]], %{{.+}}
+; CHECK:   %[[BIT:.+]] = and <4 x i32> %[[MASKPHI]], %[[N_SPLAT]]
+; CHECK:   %[[WHICH:.+]] = icmp ne <4 x i32> %[[BIT]], zeroinitializer
+; CHECK:   %[[NEWVEC]] = select <4 x i1> %[[WHICH]], <4 x i32> %[[ACCUM]], <4 x i32> %[[VEC]]
+; CHECK:   %[[NEWMASK]] = or <4 x i32> %[[MASK]], %[[N_SPLAT]]
+; CHECK:   %[[N2]] = shl nuw nsw i32 %[[IV]], 1
+; CHECK:   %[[CMP:.+]] = icmp ult i32 %[[N2]], %1
+; CHECK:   br i1 %[[CMP]], label %loop, label %exit
+; CHECK: exit:
+; CHECK:   %[[RESULT:.+]] = phi <4 x i32> [ %[[NEWVEC]], %loop ]
+; CHECK:   ret <4 x i32> %[[RESULT]]
+; CHECK: }
+
+declare <4 x i32> @__vecz_b_sub_group_scan_exclusive_add_vp_Dv4_jj(<4 x i32>, i32)
+; CHECK-LABEL: define <4 x i32> @__vecz_b_sub_group_scan_exclusive_add_vp_Dv4_jj(<4 x i32>{{.*}}, i32{{.*}}) {
+; CHECK: entry:
+; CHECK:   %[[SHUFFLE_ALLOC:.+]] = alloca <4 x i32>
+; CHECK:   br label %loop
+; CHECK: loop:
+; CHECK:   %[[IV:.+]] = phi i32 [ 1, %entry ], [ %[[N2:.+]], %loop ]
+; CHECK:   %[[VEC:.+]] = phi <4 x i32> [ %0, %entry ], [ %[[NEWVEC:.+]], %loop ]
+; CHECK:   %[[MASKPHI:.+]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %entry ], [ %[[NEWMASK:.+]], %loop ]
+; CHECK:   %[[N_INS:.+]] = insertelement <4 x i32> poison, i32 %[[IV]], {{i32|i64}} 0
+; CHECK:   %[[N_SPLAT:.+]] = shufflevector <4 x i32> %[[N_INS]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK:   %[[MASK:.+]] = xor <4 x i32> %[[MASKPHI]], %[[N_SPLAT]]
+
+;------- target-dependent dynamic shuffle code:
+; CHECK:   store <4 x i32> %[[VEC]], {{(<4 x i32>\*)|(ptr)}} %[[SHUFFLE_ALLOC]]
+;------- there will be a bitcast here if pointers are typed
+; CHECK:   %[[INDEX:.+]] = getelementptr inbounds i32, [[PTRTY:(i32\*)|ptr]] %{{.+}}, <4 x i32> %[[MASK]]
+; CHECK:   %[[VLINS:.+]] = insertelement <4 x i32> poison, i32 %1, {{i32|i64}} 0
+; CHECK:   %[[VLSPLAT:.+]] = shufflevector <4 x i32> %[[VLINS]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK:   %[[VLMASK:.+]] = icmp ult <4 x i32> <i32 0, i32 1, i32 2, i32 3>, %[[VLSPLAT]]
+; CHECK:   %[[SHUFFLE:.+]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0{{(i32)?}}(<4 x [[PTRTY]]> %[[INDEX]], i32 4, <4 x i1> %[[VLMASK]], <4 x i32> undef)
+
+; CHECK:   %[[ACCUM:.+]] = add <4 x i32> %[[VEC]], %{{.+}}
+; CHECK:   %[[BIT:.+]] = and <4 x i32> %[[MASKPHI]], %[[N_SPLAT]]
+; CHECK:   %[[WHICH:.+]] = icmp ne <4 x i32> %[[BIT]], zeroinitializer
+; CHECK:   %[[NEWVEC]] = select <4 x i1> %[[WHICH]], <4 x i32> %[[ACCUM]], <4 x i32> %[[VEC]]
+; CHECK:   %[[NEWMASK]] = or <4 x i32> %[[MASK]], %[[N_SPLAT]]
+; CHECK:   %[[N2]] = shl nuw nsw i32 %[[IV]], 1
+; CHECK:   %[[CMP:.+]] = icmp ult i32 %[[N2]], %1
+; CHECK:   br i1 %[[CMP]], label %loop, label %exit
+; CHECK: exit:
+; CHECK:   %[[SCAN:.+]] = phi <4 x i32> [ %[[NEWVEC]], %loop ]
+
+;------- target-dependent slide-up goes here
+; CHECK:  %[[SLIDE:.+]] = shufflevector <4 x i32> %[[SCAN]], <4 x i32> undef, <4 x i32> <i32 {{[0-9]+}}, i32 0, i32 1, i32 2>
+; CHECK:  %[[RESULT:.+]] = insertelement <4 x i32> %[[SLIDE]], i32 0, {{i32|i64}} 0
+
+; CHECK:   ret <4 x i32> %[[RESULT]]
+; CHECK: }
+
+
+; We know the generated code is correct for one scan type,
+; now verify that all the others use the correct binary operations.
+
+declare <4 x float> @__vecz_b_sub_group_scan_inclusive_add_vp_Dv4_fj(<4 x float>, i32)
+; CHECK-LABEL: define <4 x float> @__vecz_b_sub_group_scan_inclusive_add_vp_Dv4_fj(<4 x float>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <4 x float> [ %0, %entry ],
+; CHECK:   %{{.+}} = fadd <4 x float> %[[VEC]], %{{.+}}
+
+declare <4 x float> @__vecz_b_sub_group_scan_exclusive_add_vp_Dv4_fj(<4 x float>, i32)
+; CHECK-LABEL: define <4 x float> @__vecz_b_sub_group_scan_exclusive_add_vp_Dv4_fj(<4 x float>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <4 x float> [ %0, %entry ],
+; CHECK:   %{{.+}} = fadd <4 x float> %[[VEC]], %{{.+}}
+
+declare <4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_vp_Dv4_jj(<4 x i32>, i32)
+; CHECK-LABEL: define <4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_vp_Dv4_jj(<4 x i32>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <4 x i32> [ %0, %entry ],
+; CHECK:   %{{.+}} = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %[[VEC]], <4 x i32> %{{.+}})
+
+declare <4 x i32> @__vecz_b_sub_group_scan_exclusive_smin_vp_Dv4_jj(<4 x i32>, i32)
+; CHECK-LABEL: define <4 x i32> @__vecz_b_sub_group_scan_exclusive_smin_vp_Dv4_jj(<4 x i32>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <4 x i32> [ %0, %entry ],
+; CHECK:   %{{.+}} = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %[[VEC]], <4 x i32> %{{.+}})
+
+declare <4 x i32> @__vecz_b_sub_group_scan_inclusive_smax_vp_Dv4_jj(<4 x i32>, i32)
+; CHECK-LABEL: define <4 x i32> @__vecz_b_sub_group_scan_inclusive_smax_vp_Dv4_jj(<4 x i32>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <4 x i32> [ %0, %entry ],
+; CHECK:   %{{.+}} = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %[[VEC]], <4 x i32> %{{.+}})
+
+declare <4 x i32> @__vecz_b_sub_group_scan_inclusive_umin_vp_Dv4_jj(<4 x i32>, i32)
+; CHECK-LABEL: define <4 x i32> @__vecz_b_sub_group_scan_inclusive_umin_vp_Dv4_jj(<4 x i32>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <4 x i32> [ %0, %entry ],
+; CHECK:   %{{.+}} = call <4 x i32> @llvm.umin.v4i32(<4 x i32> %[[VEC]], <4 x i32> %{{.+}})
+
+declare <4 x i32> @__vecz_b_sub_group_scan_inclusive_umax_vp_Dv4_jj(<4 x i32>, i32)
+; CHECK-LABEL: define <4 x i32> @__vecz_b_sub_group_scan_inclusive_umax_vp_Dv4_jj(<4 x i32>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <4 x i32> [ %0, %entry ],
+; CHECK:   %{{.+}} = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %[[VEC]], <4 x i32> %{{.+}})
+
+declare <4 x float> @__vecz_b_sub_group_scan_inclusive_min_vp_Dv4_fj(<4 x float>, i32)
+; CHECK-LABEL: define <4 x float> @__vecz_b_sub_group_scan_inclusive_min_vp_Dv4_fj(<4 x float>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <4 x float> [ %0, %entry ],
+; CHECK:   %{{.+}} = call <4 x float> @llvm.minnum.v4f32(<4 x float> %[[VEC]], <4 x float> %{{.+}})
+
+declare <4 x float> @__vecz_b_sub_group_scan_inclusive_max_vp_Dv4_fj(<4 x float>, i32)
+; CHECK-LABEL: define <4 x float> @__vecz_b_sub_group_scan_inclusive_max_vp_Dv4_fj(<4 x float>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <4 x float> [ %0, %entry ],
+; CHECK:   %{{.+}} = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %[[VEC]], <4 x float> %{{.+}})
+
+declare <4 x float> @__vecz_b_sub_group_scan_exclusive_min_vp_Dv4_fj(<4 x float>, i32)
+; CHECK-LABEL: define <4 x float> @__vecz_b_sub_group_scan_exclusive_min_vp_Dv4_fj(<4 x float>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <4 x float> [ %0, %entry ],
+; CHECK:   %{{.+}} = call <4 x float> @llvm.minnum.v4f32(<4 x float> %[[VEC]], <4 x float> %{{.+}})
+
+declare <4 x float> @__vecz_b_sub_group_scan_exclusive_max_vp_Dv4_fj(<4 x float>, i32)
+; CHECK-LABEL: define <4 x float> @__vecz_b_sub_group_scan_exclusive_max_vp_Dv4_fj(<4 x float>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK:   %[[VEC:.+]] = phi <4 x float> [ %0, %entry ],
+; CHECK:   %{{.+}} = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %[[VEC]], <4 x float> %{{.+}})
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/load_add_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/load_add_store.ll
new file mode 100644
index 0000000000000..5ec70e2f96116
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/load_add_store.ll
@@ -0,0 +1,105 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; REQUIRES: llvm-13+
+
+; RUN: %veczc -k load_add_store_i32 -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | %filecheck %s --check-prefix CHECK_4F
+; RUN: %veczc -k load_add_store_i32 -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | %filecheck %s --check-prefix CHECK_1S
+; RUN: %veczc -k load_add_store_v4i32 -vecz-simd-width=2 -vecz-choices=VectorPredication -S < %s | %filecheck %s --check-prefix CHECK_V4_2F
+; RUN: %veczc -k load_add_store_v4i32 -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | %filecheck %s --check-prefix CHECK_V4_1S
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+define spir_kernel void @load_add_store_i32(i32* %aptr, i32* %bptr, i32* %zptr) { ; kernel: zptr[id] = aptr[id] + bptr[id] on i32 elements
+entry:
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0); indexes all three buffers
+  %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+  %arrayidxb = getelementptr inbounds i32, i32* %bptr, i64 %idx
+  %arrayidxz = getelementptr inbounds i32, i32* %zptr, i64 %idx
+  %a = load i32, i32* %arrayidxa, align 4
+  %b = load i32, i32* %arrayidxb, align 4
+  %sum = add i32 %a, %b ; plain add, no nuw/nsw flags
+  store i32 %sum, i32* %arrayidxz, align 4
+  ret void
+}
+
+; CHECK_4F: define spir_kernel void @__vecz_v4_vp_load_add_store_i32(
+; CHECK_4F: [[LID:%.*]] = call i64 @__mux_get_local_id(i32 0)
+; CHECK_4F: [[LSIZE:%.*]] = call i64 @__mux_get_local_size(i32 0)
+; CHECK_4F: [[WREM:%.*]] = sub nuw nsw i64 [[LSIZE]], [[LID]]
+; CHECK_4F: [[T0:%.*]] = call i64 @llvm.umin.i64(i64 [[WREM]], i64 4)
+; CHECK_4F: [[VL:%.*]] = trunc i64 [[T0]] to i32
+; CHECK_4F: [[LHS:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p0(ptr {{%.*}}, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 [[VL]])
+; CHECK_4F: [[RHS:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p0(ptr {{%.*}}, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 [[VL]])
+; CHECK_4F: [[ADD:%.*]] = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> [[LHS]], <4 x i32> [[RHS]], <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 [[VL]])
+; CHECK_4F: call void @llvm.vp.store.v4i32.p0(<4 x i32> [[ADD]], ptr {{%.*}}, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 [[VL]])
+
+; CHECK_1S: define spir_kernel void @__vecz_nxv4_vp_load_add_store_i32(
+; CHECK_1S: [[LID:%.*]] = call i64 @__mux_get_local_id(i32 0)
+; CHECK_1S: [[LSIZE:%.*]] = call i64 @__mux_get_local_size(i32 0)
+; CHECK_1S: [[WREM:%.*]] = sub nuw nsw i64 [[LSIZE]], [[LID]]
+; CHECK_1S: [[T0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK_1S: [[T1:%.*]] = shl i64 [[T0]], 2
+; CHECK_1S: [[T2:%.*]] = call i64 @llvm.umin.i64(i64 [[WREM]], i64 [[T1]])
+; CHECK_1S: [[VL:%.*]] = trunc i64 [[T2]] to i32
+; CHECK_1S: [[LHS:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr {{%.*}}, [[TRUEMASK:<vscale x 4 x i1> shufflevector \(<vscale x 4 x i1> insertelement \(<vscale x 4 x i1> (undef|poison), i1 true, (i32|i64) 0\), <vscale x 4 x i1> (undef|poison), <vscale x 4 x i32> zeroinitializer\)]], i32 [[VL]])
+; CHECK_1S: [[RHS:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr {{%.*}}, [[TRUEMASK]], i32 [[VL]])
+; CHECK_1S: [[ADD:%.*]] = call <vscale x 4 x i32> @llvm.vp.add.nxv4i32(<vscale x 4 x i32> [[LHS]], <vscale x 4 x i32> [[RHS]], [[TRUEMASK]], i32 [[VL]])
+; CHECK_1S: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[ADD]], ptr {{%.*}}, [[TRUEMASK]], i32 [[VL]])
+
+define spir_kernel void @load_add_store_v4i32(<4 x i32>* %aptr, <4 x i32>* %bptr, <4 x i32>* %zptr) { ; kernel: zptr[id] = aptr[id] + bptr[id] where each element is a <4 x i32>
+entry:
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0); indexes all three buffers
+  %arrayidxa = getelementptr inbounds <4 x i32>, <4 x i32>* %aptr, i64 %idx
+  %arrayidxb = getelementptr inbounds <4 x i32>, <4 x i32>* %bptr, i64 %idx
+  %arrayidxz = getelementptr inbounds <4 x i32>, <4 x i32>* %zptr, i64 %idx
+  %a = load <4 x i32>, <4 x i32>* %arrayidxa, align 16
+  %b = load <4 x i32>, <4 x i32>* %arrayidxb, align 16
+  %sum = add <4 x i32> %a, %b ; element-wise add of the two 4-wide vectors
+  store <4 x i32> %sum, <4 x i32>* %arrayidxz, align 16
+  ret void
+}
+
+; CHECK_V4_2F: define spir_kernel void @__vecz_v2_vp_load_add_store_v4i32(
+; CHECK_V4_2F: [[LID:%.*]] = call i64 @__mux_get_local_id(i32 0)
+; CHECK_V4_2F: [[LSIZE:%.*]] = call i64 @__mux_get_local_size(i32 0)
+; CHECK_V4_2F: [[WREM:%.*]] = sub nuw nsw i64 [[LSIZE]], [[LID]]
+; CHECK_V4_2F: [[T0:%.*]] = call i64 @llvm.umin.i64(i64 [[WREM]], i64 2)
+; CHECK_V4_2F: [[VL:%.*]] = trunc i64 [[T0]] to i32
+; Each work-item processes 4 elements, so the VL is multiplied by 4 (a left shift by 2)
+; CHECK_V4_2F: [[SVL:%.*]] = shl nuw nsw i32 [[VL]], 2
+; CHECK_V4_2F: [[LHS:%.*]] = call <8 x i32> @llvm.vp.load.v8i32.p0(ptr {{%.*}}, <8 x i1> <i1 true, i1 true, i1 true, i1  true, i1 true, i1 true, i1 true, i1 true>, i32 [[SVL]])
+; CHECK_V4_2F: [[RHS:%.*]] = call <8 x i32> @llvm.vp.load.v8i32.p0(ptr {{%.*}}, <8 x i1> <i1 true, i1 true, i1 true, i1  true, i1 true, i1 true, i1 true, i1 true>, i32 [[SVL]])
+; CHECK_V4_2F: [[ADD:%.*]] = call <8 x i32> @llvm.vp.add.v8i32(<8 x i32> [[LHS]], <8 x i32> [[RHS]], <8 x i1> <i1 true, i1 true, i1 true, i1  true, i1 true, i1 true, i1 true, i1 true>, i32 [[SVL]])
+; CHECK_V4_2F: call void @llvm.vp.store.v8i32.p0(<8 x i32> [[ADD]], ptr {{%.*}}, <8 x i1> <i1 true, i1 true, i1 true, i1  true, i1 true, i1 true, i1 true, i1 true>, i32 [[SVL]])
+
+; CHECK_V4_1S: define spir_kernel void @__vecz_nxv4_vp_load_add_store_v4i32(
+; CHECK_V4_1S: [[LID:%.*]] = call i64 @__mux_get_local_id(i32 0)
+; CHECK_V4_1S: [[LSIZE:%.*]] = call i64 @__mux_get_local_size(i32 0)
+; CHECK_V4_1S: [[WREM:%.*]] = sub nuw nsw i64 [[LSIZE]], [[LID]]
+; CHECK_V4_1S: [[T0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK_V4_1S: [[T1:%.*]] = shl i64 [[T0]], 2
+; CHECK_V4_1S: [[T2:%.*]] = call i64 @llvm.umin.i64(i64 [[WREM]], i64 [[T1]])
+; CHECK_V4_1S: [[VL:%.*]] = trunc i64 [[T2]] to i32
+; Each work-item processes 4 elements, so the VL is multiplied by 4 (a left shift by 2)
+; CHECK_V4_1S: [[SVL:%.*]] = shl i32 [[VL]], 2
+; CHECK_V4_1S: [[LHS:%.*]] = call <vscale x 16 x i32> @llvm.vp.load.nxv16i32.p0(ptr {{%.*}}, [[TRUEMASK:<vscale x 16 x i1> shufflevector \(<vscale x 16 x i1> insertelement \(<vscale x 16 x i1> (undef|poison), i1 true, (i32|i64) 0\), <vscale x 16 x i1> (undef|poison), <vscale x 16 x i32> zeroinitializer\)]], i32 [[SVL]])
+; CHECK_V4_1S: [[RHS:%.*]] = call <vscale x 16 x i32> @llvm.vp.load.nxv16i32.p0(ptr {{%.*}}, [[TRUEMASK]], i32 [[SVL]])
+; CHECK_V4_1S: [[ADD:%.*]] = call <vscale x 16 x i32> @llvm.vp.add.nxv16i32(<vscale x 16 x i32> [[LHS]], <vscale x 16 x i32> [[RHS]], [[TRUEMASK]], i32 [[SVL]])
+; CHECK_V4_1S: call void @llvm.vp.store.nxv16i32.p0(<vscale x 16 x i32> [[ADD]], ptr {{%.*}}, [[TRUEMASK]], i32 [[SVL]])
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/packetize_mask_varying.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/packetize_mask_varying.ll
new file mode 100644
index 0000000000000..3a4b5e4087072
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/packetize_mask_varying.ll
@@ -0,0 +1,53 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; REQUIRES: llvm-13+
+; RUN: %veczc -k mask_varying -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; A kernel which should produce a uniform masked vector load where the mask is
+; a single varying splatted bit.
+define spir_kernel void @mask_varying(<4 x i32>* %aptr, <4 x i32>* %zptr) { ; kernel: if (id < 64) zptr[id] = *aptr
+entry:
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %mod_idx = urem i64 %idx, 2 ; no later instruction in this kernel uses this value
+  %arrayidxa = getelementptr inbounds <4 x i32>, <4 x i32>* %aptr, i64 %idx ; unused: the load below reads %aptr directly
+  %ins = insertelement <4 x i1> undef, i1 true, i32 0 ; no later instruction in this kernel uses this value
+  %cmp = icmp slt i64 %idx, 64 ; signed compare on the global id, so the result differs between work-items
+  br i1 %cmp, label %if.then, label %if.end
+if.then:
+  %v = load <4 x i32>, <4 x i32>* %aptr ; same address for every work-item; only reached when %cmp is true
+  %arrayidxz = getelementptr inbounds <4 x i32>, <4 x i32>* %zptr, i64 %idx
+  store <4 x i32> %v, <4 x i32>* %arrayidxz, align 16 ; per-work-item destination zptr[id]
+  br label %if.end
+if.end:
+  ret void
+; CHECK: define spir_kernel void @__vecz_nxv4_vp_mask_varying
+; CHECK: [[CMP:%.*]] = icmp slt <vscale x 4 x i64> %{{.*}},
+; CHECK: [[INS:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VL:%.*]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[INS]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK: [[IDX:%.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; CHECK: [[MASK:%.*]] = icmp ult <vscale x 4 x i32> [[IDX]], [[SPLAT]]
+; CHECK: [[INP:%.*]] = select <vscale x 4 x i1> [[MASK]], <vscale x 4 x i1> [[CMP]], <vscale x 4 x i1> zeroinitializer
+; CHECK: [[RED:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[INP]])
+; CHECK: [[REINS:%.*]] = insertelement <4 x i1> poison, i1 [[RED]], {{(i32|i64)}} 0
+; CHECK: [[RESPLAT:%.*]] = shufflevector <4 x i1> [[REINS]], <4 x i1> poison, <4 x i32> zeroinitializer
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare <4 x i32> @__vecz_b_masked_load4_Dv4_jPDv4_jDv4_b(<4 x i32>*, <4 x i1>)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/scatter_gather.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/scatter_gather.ll
new file mode 100644
index 0000000000000..90a1811e3ae30
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/scatter_gather.ll
@@ -0,0 +1,65 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; REQUIRES: llvm-13+
+; RUN: %veczc -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; With VP all gathers become masked ones.
+define spir_kernel void @unmasked_gather(i32 addrspace(1)* %a, i32 addrspace(1)* %b) { ; kernel: b[id] = a[id % 3], with no branches
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %rem = urem i64 %call, 3 ; load index id % 3
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %rem
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 ; load whose address is computed from id % 3
+  %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %call
+  store i32 %0, i32 addrspace(1)* %arrayidx3, align 4 ; store indexed directly by the id
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_nxv4_vp_unmasked_gather(
+; CHECK: [[v:%.*]] = call <vscale x 4 x i32> @__vecz_b_masked_gather_load4_vp_u5nxv4ju14nxv4u3ptrU3AS1u5nxv4bj(<vscale x 4 x ptr addrspace(1)> %{{.*}})
+; CHECK: call void @llvm.vp.store.nxv4i32.p1(<vscale x 4 x i32> [[v]],
+
+
+; With VP all scatters become masked ones.
+define spir_kernel void @unmasked_scatter(i32 addrspace(1)* %a, i32 addrspace(1)* %b) { ; kernel: b[id % 3] = a[id], with no branches
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %rem = urem i64 %call, 3 ; store index id % 3
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %call
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 ; load indexed directly by the id
+  %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %rem
+  store i32 %0, i32 addrspace(1)* %arrayidx3, align 4 ; store whose address is computed from id % 3
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_nxv4_vp_unmasked_scatter(
+; CHECK: [[v:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p1(
+; CHECK: call void @__vecz_b_masked_scatter_store4_vp_u5nxv4ju14nxv4u3ptrU3AS1u5nxv4bj(<vscale x 4 x i32> [[v]],
+
+; CHECK: define <vscale x 4 x i32> @__vecz_b_masked_gather_load4_vp_u5nxv4ju14nxv4u3ptrU3AS1u5nxv4bj(<vscale x 4 x ptr addrspace(1)> %0, <vscale x 4 x i1> %1, i32 %2) {
+; CHECK:   %3 = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p1(<vscale x 4 x ptr addrspace(1)> %0, <vscale x 4 x i1> %1, i32 %2)
+; CHECK:   ret <vscale x 4 x i32> %3
+
+; CHECK: define void @__vecz_b_masked_scatter_store4_vp_u5nxv4ju14nxv4u3ptrU3AS1u5nxv4bj(<vscale x 4 x i32> %0, <vscale x 4 x ptr addrspace(1)> %1, <vscale x 4 x i1> %2, i32 %3) {
+; CHECK: entry:
+; CHECK:   call void @llvm.vp.scatter.nxv4i32.nxv4p1(<vscale x 4 x i32> %0, <vscale x 4 x ptr addrspace(1)> %1, <vscale x 4 x i1> %2, i32 %3)
+; CHECK:   ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions.ll
new file mode 100644
index 0000000000000..0009fbbfcc929
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions.ll
@@ -0,0 +1,266 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -w 4 -vecz-choices=VectorPredication -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func i32 @_Z16get_sub_group_idv()
+
+declare spir_func i32 @_Z13sub_group_alli(i32)
+declare spir_func i32 @_Z13sub_group_anyi(i32)
+
+declare spir_func i32 @_Z20sub_group_reduce_addi(i32)
+declare spir_func i64 @_Z20sub_group_reduce_addl(i64)
+declare spir_func float @_Z20sub_group_reduce_addf(float)
+declare spir_func i32 @_Z20sub_group_reduce_mini(i32)
+declare spir_func i32 @_Z20sub_group_reduce_minj(i32)
+declare spir_func i32 @_Z20sub_group_reduce_maxi(i32)
+declare spir_func i32 @_Z20sub_group_reduce_maxj(i32)
+declare spir_func float @_Z20sub_group_reduce_minf(float)
+declare spir_func float @_Z20sub_group_reduce_maxf(float)
+
+define spir_kernel void @reduce_all_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; out[sub_group_id] = sub_group_all(in[global_id(0)])
+entry:
+  %gid = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %sgid = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %sgid.ext = zext i32 %sgid to i64
+  %src = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %gid
+  %val = load i32, ptr addrspace(1) %src, align 4
+  %res = tail call spir_func i32 @_Z13sub_group_alli(i32 %val)
+  %dst = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %sgid.ext
+  store i32 %res, ptr addrspace(1) %dst, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_vp_reduce_all_i32(
+; CHECK: [[T2:%.*]] = icmp ne <4 x i32> {{%.*}}, zeroinitializer
+; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{.*}}, {{(i32|i64)}} 0
+; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK: [[C:%.*]] = icmp ult <4 x i32> [[S]], <i32 1, i32 2, i32 3, i32 4>
+; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i1> [[T2]]
+; CHECK: [[T3:%.*]] = bitcast <4 x i1> [[I]] to i4
+; CHECK: [[R:%.*]] = icmp eq i4 [[T3]], -1
+; CHECK: [[EXT:%.*]] = sext i1 [[R]] to i32
+; CHECK: store i32 [[EXT]], ptr addrspace(1) {{%.*}}, align 4
+}
+
+define spir_kernel void @reduce_any_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; out[sub_group_id] = sub_group_any(in[global_id(0)])
+entry:
+  %gid = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %sgid = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %sgid.ext = zext i32 %sgid to i64
+  %src = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %gid
+  %val = load i32, ptr addrspace(1) %src, align 4
+  %res = tail call spir_func i32 @_Z13sub_group_anyi(i32 %val)
+  %dst = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %sgid.ext
+  store i32 %res, ptr addrspace(1) %dst, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_vp_reduce_any_i32(
+; CHECK: [[T2:%.*]] = icmp ne <4 x i32> {{%.*}}, zeroinitializer
+; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{.*}}, {{(i32|i64)}} 0
+; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i1> [[T2]], <4 x i1> zeroinitializer
+; CHECK: [[T3:%.*]] = bitcast <4 x i1> [[I]] to i4
+; CHECK: [[R:%.*]] = icmp ne i4 [[T3]], 0
+; CHECK: [[EXT:%.*]] = sext i1 [[R]] to i32
+; CHECK: store i32 [[EXT]], ptr addrspace(1) {{%.*}}, align 4
+}
+
+define spir_kernel void @reduce_add_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; out[sub_group_id] = sub_group_reduce_add(in[global_id(0)])
+entry:
+  %gid = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %sgid = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %sgid.ext = zext i32 %sgid to i64
+  %src = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %gid
+  %val = load i32, ptr addrspace(1) %src, align 4
+  %res = tail call spir_func i32 @_Z20sub_group_reduce_addi(i32 %val)
+  %dst = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %sgid.ext
+  store i32 %res, ptr addrspace(1) %dst, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_vp_reduce_add_i32(
+; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0
+; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i32> {{%.*}}, <4 x i32> zeroinitializer
+; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[I]])
+; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
+}
+
+define spir_kernel void @reduce_add_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; out[sub_group_id] = sub_group_reduce_add(in[global_id(0)]) on i64
+entry:
+  %gid = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %sgid = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %sgid.ext = zext i32 %sgid to i64
+  %src = getelementptr inbounds i64, ptr addrspace(1) %in, i64 %gid
+  %val = load i64, ptr addrspace(1) %src, align 4
+  %res = tail call spir_func i64 @_Z20sub_group_reduce_addl(i64 %val)
+  %dst = getelementptr inbounds i64, ptr addrspace(1) %out, i64 %sgid.ext
+  store i64 %res, ptr addrspace(1) %dst, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_vp_reduce_add_i64(
+; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0
+; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i64> {{%.*}}, <4 x i64> zeroinitializer
+; CHECK: [[R:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[I]])
+; CHECK: store i64 [[R]], ptr addrspace(1) {{%.*}}, align 4
+}
+
+define spir_kernel void @reduce_add_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; out[sub_group_id] = sub_group_reduce_add(in[global_id(0)]) on float
+entry:
+  %gid = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %sgid = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %sgid.ext = zext i32 %sgid to i64
+  %src = getelementptr inbounds float, ptr addrspace(1) %in, i64 %gid
+  %val = load float, ptr addrspace(1) %src, align 4
+  %res = tail call spir_func float @_Z20sub_group_reduce_addf(float %val)
+  %dst = getelementptr inbounds float, ptr addrspace(1) %out, i64 %sgid.ext
+  store float %res, ptr addrspace(1) %dst, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_vp_reduce_add_f32(
+; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0
+; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x float> {{%.*}}, <4 x float> <float -0.000000e+00, float -0.000000e+00,
+; CHECK: [[R:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[I]])
+; CHECK: store float [[R]], ptr addrspace(1) {{%.*}}, align 4
+}
+
+define spir_kernel void @reduce_smin_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; out[sub_group_id] = sub_group_reduce_min(in[global_id(0)]), signed
+entry:
+  %gid = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %sgid = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %sgid.ext = zext i32 %sgid to i64
+  %src = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %gid
+  %val = load i32, ptr addrspace(1) %src, align 4
+  %res = tail call spir_func i32 @_Z20sub_group_reduce_mini(i32 %val)
+  %dst = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %sgid.ext
+  store i32 %res, ptr addrspace(1) %dst, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_vp_reduce_smin_i32(
+; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0
+; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i32> {{%.*}}, <4 x i32> <i32 2147483647, i32 2147483647,
+; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[I]])
+; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
+}
+
+define spir_kernel void @reduce_umin_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; out[sub_group_id] = sub_group_reduce_min(in[global_id(0)]), unsigned
+entry:
+  %gid = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %sgid = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %sgid.ext = zext i32 %sgid to i64
+  %src = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %gid
+  %val = load i32, ptr addrspace(1) %src, align 4
+  %res = tail call spir_func i32 @_Z20sub_group_reduce_minj(i32 %val)
+  %dst = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %sgid.ext
+  store i32 %res, ptr addrspace(1) %dst, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_vp_reduce_umin_i32(
+; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0
+; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i32> {{%.*}}, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
+; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[I]])
+; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
+}
+
+define spir_kernel void @reduce_smax_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; out[sub_group_id] = sub_group_reduce_max(in[global_id(0)]), signed
+entry:
+  %gid = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %sgid = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %sgid.ext = zext i32 %sgid to i64
+  %src = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %gid
+  %val = load i32, ptr addrspace(1) %src, align 4
+  %res = tail call spir_func i32 @_Z20sub_group_reduce_maxi(i32 %val)
+  %dst = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %sgid.ext
+  store i32 %res, ptr addrspace(1) %dst, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_vp_reduce_smax_i32(
+; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0
+; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i32> {{%.*}}, <4 x i32> <i32 -2147483648, i32 -2147483648,
+; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[I]])
+; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
+}
+
+define spir_kernel void @reduce_umax_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; out[sub_group_id] = sub_group_reduce_max(in[global_id(0)]), unsigned
+entry:
+  %gid = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %sgid = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %sgid.ext = zext i32 %sgid to i64
+  %src = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %gid
+  %val = load i32, ptr addrspace(1) %src, align 4
+  %res = tail call spir_func i32 @_Z20sub_group_reduce_maxj(i32 %val)
+  %dst = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %sgid.ext
+  store i32 %res, ptr addrspace(1) %dst, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_vp_reduce_umax_i32(
+; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0
+; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i32> {{%.*}}, <4 x i32> zeroinitializer
+; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[I]])
+; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
+}
+
+define spir_kernel void @reduce_fmin_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; out[sub_group_id] = sub_group_reduce_min(in[global_id(0)]) on float
+entry:
+  %gid = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %sgid = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %sgid.ext = zext i32 %sgid to i64
+  %src = getelementptr inbounds float, ptr addrspace(1) %in, i64 %gid
+  %val = load float, ptr addrspace(1) %src, align 4
+  %res = tail call spir_func float @_Z20sub_group_reduce_minf(float %val)
+  %dst = getelementptr inbounds float, ptr addrspace(1) %out, i64 %sgid.ext
+  store float %res, ptr addrspace(1) %dst, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_vp_reduce_fmin_f32(
+; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0
+; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x float> {{%.*}}, <4 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000,
+; CHECK: [[R:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[I]])
+; CHECK: store float [[R]], ptr addrspace(1) {{%.*}}, align 4
+}
+
+define spir_kernel void @reduce_fmax_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; out[sub_group_id] = sub_group_reduce_max(in[global_id(0)]) on float
+entry:
+  %gid = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %sgid = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %sgid.ext = zext i32 %sgid to i64
+  %src = getelementptr inbounds float, ptr addrspace(1) %in, i64 %gid
+  %val = load float, ptr addrspace(1) %src, align 4
+  %res = tail call spir_func float @_Z20sub_group_reduce_maxf(float %val)
+  %dst = getelementptr inbounds float, ptr addrspace(1) %out, i64 %sgid.ext
+  store float %res, ptr addrspace(1) %dst, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_vp_reduce_fmax_f32(
+; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0
+; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x float> {{%.*}}, <4 x float> <float 0xFFF8000000000000, float 0xFFF8000000000000,
+; CHECK: [[R:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[I]])
+; CHECK: store float [[R]], ptr addrspace(1) {{%.*}}, align 4
+}
+
+!opencl.ocl.version = !{!0}
+
+!0 = !{i32 3, i32 0}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions_spv_khr_uniform_group_instructions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions_spv_khr_uniform_group_instructions.ll
new file mode 100644
index 0000000000000..b59835f9e4dc9
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions_spv_khr_uniform_group_instructions.ll
@@ -0,0 +1,218 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -w 4 -vecz-choices=VectorPredication -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func i32 @_Z16get_sub_group_idv()
+
+declare spir_func i32 @_Z20sub_group_reduce_muli(i32)
+declare spir_func i64 @_Z20sub_group_reduce_mull(i64)
+declare spir_func float @_Z20sub_group_reduce_mulf(float)
+
+declare spir_func i32 @_Z20sub_group_reduce_andj(i32)
+declare spir_func i32 @_Z19sub_group_reduce_ori(i32)
+declare spir_func i64 @_Z20sub_group_reduce_xorl(i64)
+
+declare spir_func i1 @_Z28sub_group_reduce_logical_andb(i1)
+declare spir_func i1 @_Z27sub_group_reduce_logical_orb(i1)
+declare spir_func i1 @_Z28sub_group_reduce_logical_xorb(i1)
+
+; CHECK-LABEL: @__vecz_v4_vp_reduce_mul_i32(
+; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0
+; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i32> {{%.*}}, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[I]])
+; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
+define spir_kernel void @reduce_mul_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; out[sub_group_id] = sub_group_reduce_mul(in[global_id(0)])
+entry:
+  %gid = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %sgid = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %sgid.ext = zext i32 %sgid to i64
+  %src = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %gid
+  %val = load i32, ptr addrspace(1) %src, align 4
+  %res = tail call spir_func i32 @_Z20sub_group_reduce_muli(i32 %val)
+  %dst = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %sgid.ext
+  store i32 %res, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_v4_vp_reduce_mul_i64(
+; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0
+; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i64> {{%.*}}, <4 x i64> <i64 1, i64 1, i64 1, i64 1>
+; CHECK: [[R:%.*]] = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> [[I]])
+; CHECK: store i64 [[R]], ptr addrspace(1) {{%.*}}, align 4
+define spir_kernel void @reduce_mul_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; out[sub_group_id] = sub_group_reduce_mul(in[global_id(0)]) on i64
+entry:
+  %gid = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %sgid = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %sgid.ext = zext i32 %sgid to i64
+  %src = getelementptr inbounds i64, ptr addrspace(1) %in, i64 %gid
+  %val = load i64, ptr addrspace(1) %src, align 4
+  %res = tail call spir_func i64 @_Z20sub_group_reduce_mull(i64 %val)
+  %dst = getelementptr inbounds i64, ptr addrspace(1) %out, i64 %sgid.ext
+  store i64 %res, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_v4_vp_reduce_mul_f32(
+; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0
+; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x float> {{%.*}}, <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; CHECK: [[R:%.*]] = call float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[I]])
+; CHECK: store float [[R]], ptr addrspace(1) {{%.*}}, align 4
+define spir_kernel void @reduce_mul_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; out[sub_group_id] = sub_group_reduce_mul(in[global_id(0)]) on float
+entry:
+  %gid = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %sgid = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %sgid.ext = zext i32 %sgid to i64
+  %src = getelementptr inbounds float, ptr addrspace(1) %in, i64 %gid
+  %val = load float, ptr addrspace(1) %src, align 4
+  %res = tail call spir_func float @_Z20sub_group_reduce_mulf(float %val)
+  %dst = getelementptr inbounds float, ptr addrspace(1) %out, i64 %sgid.ext
+  store float %res, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_v4_vp_reduce_and_i32(
+; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0
+; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i32> {{%.*}}, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
+; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[I]])
+; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
+define spir_kernel void @reduce_and_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; out[sub_group_id] = sub_group_reduce_and(in[global_id(0)])
+entry:
+  %gid = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %sgid = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %sgid.ext = zext i32 %sgid to i64
+  %src = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %gid
+  %val = load i32, ptr addrspace(1) %src, align 4
+  %res = tail call spir_func i32 @_Z20sub_group_reduce_andj(i32 %val)
+  %dst = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %sgid.ext
+  store i32 %res, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_v4_vp_reduce_or_i32(
+; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0
+; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i32> {{%.*}}, <4 x i32> zeroinitializer
+; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[I]])
+; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
+define spir_kernel void @reduce_or_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; out[sub_group_id] = sub_group_reduce_or(in[global_id(0)])
+entry:
+  %gid = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %sgid = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %sgid.ext = zext i32 %sgid to i64
+  %src = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %gid
+  %val = load i32, ptr addrspace(1) %src, align 4
+  %res = tail call spir_func i32 @_Z19sub_group_reduce_ori(i32 %val)
+  %dst = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %sgid.ext
+  store i32 %res, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_v4_vp_reduce_xor_i32(
+; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0
+; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i64> {{%.*}}, <4 x i64> zeroinitializer
+; CHECK: [[R:%.*]] = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> [[I]])
+; CHECK: store i64 [[R]], ptr addrspace(1) {{%.*}}, align 4
+define spir_kernel void @reduce_xor_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; NOTE: despite the _i32 suffix this kernel reduces i64 values
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %conv = zext i32 %call1 to i64
+  %arrayidx = getelementptr inbounds i64, ptr addrspace(1) %in, i64 %call ; i64 stride, matching the i64 load below
+  %0 = load i64, ptr addrspace(1) %arrayidx, align 4
+  %call2 = tail call spir_func i64 @_Z20sub_group_reduce_xorl(i64 %0)
+  %arrayidx3 = getelementptr inbounds i64, ptr addrspace(1) %out, i64 %conv
+  store i64 %call2, ptr addrspace(1) %arrayidx3, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_v4_vp_reduce_logical_and(
+; This doesn't generate a reduction intrinsic...
+; CHECK: [[T:%.*]] = icmp eq i4 {{%.*}}, -1
+; CHECK: [[R:%.*]] = zext i1 [[T]] to i32
+; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
+define spir_kernel void @reduce_logical_and(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; out[sub_group_id] = zext(sub_group_reduce_logical_and(trunc(in[global_id(0)])))
+entry:
+  %gid = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %sgid = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %sgid.ext = zext i32 %sgid to i64
+  %src = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %gid
+  %val = load i32, ptr addrspace(1) %src, align 4
+  %bit = trunc i32 %val to i1
+  %res = tail call spir_func i1 @_Z28sub_group_reduce_logical_andb(i1 %bit)
+  %dst = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %sgid.ext
+  %res.ext = zext i1 %res to i32
+  store i32 %res.ext, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_v4_vp_reduce_logical_or(
+; CHECK: [[T:%.*]] = icmp ne i4 {{%.*}}, 0
+; CHECK: [[R:%.*]] = zext i1 [[T]] to i32
+; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
+define spir_kernel void @reduce_logical_or(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; out[sub_group_id] = zext(sub_group_reduce_logical_or(trunc(in[global_id(0)])))
+entry:
+  %gid = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %sgid = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %sgid.ext = zext i32 %sgid to i64
+  %src = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %gid
+  %val = load i32, ptr addrspace(1) %src, align 4
+  %bit = trunc i32 %val to i1
+  %res = tail call spir_func i1 @_Z27sub_group_reduce_logical_orb(i1 %bit)
+  %dst = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %sgid.ext
+  %res.ext = zext i1 %res to i32
+  store i32 %res.ext, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_v4_vp_reduce_logical_xor(
+; CHECK: [[X:%.*]] = call i4 @llvm.ctpop.i4(i4 {{%.*}})
+; CHECK: [[T:%.*]] = and i4 [[X]], 1
+; CHECK: [[R:%.*]] = zext i4 [[T]] to i32
+; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
+define spir_kernel void @reduce_logical_xor(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; out[sub_group_id] = zext(sub_group_reduce_logical_xor(trunc(in[global_id(0)])))
+entry:
+  %gid = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %sgid = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %sgid.ext = zext i32 %sgid to i64
+  %src = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %gid
+  %val = load i32, ptr addrspace(1) %src, align 4
+  %bit = trunc i32 %val to i1
+  %res = tail call spir_func i1 @_Z28sub_group_reduce_logical_xorb(i1 %bit)
+  %dst = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %sgid.ext
+  %res.ext = zext i1 %res to i32
+  store i32 %res.ext, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+!opencl.ocl.version = !{!0}
+
+!0 = !{i32 3, i32 0}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_scans.ll
new file mode 100644
index 0000000000000..e8880f9d01e22
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_scans.ll
@@ -0,0 +1,157 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -w 4 -S -vecz-passes=packetizer -vecz-choices=VectorPredication < %s | %filecheck %s
+
+; Tests the use of the VectorPredication choice. However, note that this option
+; currently makes no difference on fixed length vectors.
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+declare spir_func i32 @_Z28sub_group_scan_inclusive_addi(i32)
+declare spir_func i64 @_Z28sub_group_scan_inclusive_addl(i64)
+declare spir_func float @_Z28sub_group_scan_inclusive_addf(float)
+
+declare spir_func i32 @_Z28sub_group_scan_inclusive_mini(i32)
+declare spir_func i32 @_Z28sub_group_scan_inclusive_minj(i32)
+declare spir_func i32 @_Z28sub_group_scan_inclusive_maxi(i32)
+declare spir_func i32 @_Z28sub_group_scan_inclusive_maxj(i32)
+declare spir_func float @_Z28sub_group_scan_inclusive_minf(float)
+declare spir_func float @_Z28sub_group_scan_inclusive_maxf(float)
+
+define spir_kernel void @reduce_scan_incl_add_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; out[gid] = sub_group_scan_inclusive_add(in[gid]), gid = get_global_id(0)
+entry:
+  %gid = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %src = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %gid
+  %val = load i32, ptr addrspace(1) %src, align 4
+  %scan = tail call spir_func i32 @_Z28sub_group_scan_inclusive_addi(i32 %val)
+  %dst = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %gid
+  store i32 %scan, ptr addrspace(1) %dst, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_incl_add_i32(
+; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_add_Dv4_j(<4 x i32> %{{.*}})
+}
+
+define spir_kernel void @reduce_scan_incl_add_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; out[gid] = sub_group_scan_inclusive_add(in[gid]) on i64, gid = get_global_id(0)
+entry:
+  %gid = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %src = getelementptr inbounds i64, ptr addrspace(1) %in, i64 %gid
+  %val = load i64, ptr addrspace(1) %src, align 4
+  %scan = tail call spir_func i64 @_Z28sub_group_scan_inclusive_addl(i64 %val)
+  %dst = getelementptr inbounds i64, ptr addrspace(1) %out, i64 %gid
+  store i64 %scan, ptr addrspace(1) %dst, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_incl_add_i64(
+; CHECK: call <4 x i64> @__vecz_b_sub_group_scan_inclusive_add_Dv4_m(<4 x i64> %{{.*}})
+}
+
+define spir_kernel void @reduce_scan_incl_add_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; out[gid] = sub_group_scan_inclusive_add(in[gid]) on float, gid = get_global_id(0)
+entry:
+  %gid = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %src = getelementptr inbounds float, ptr addrspace(1) %in, i64 %gid
+  %val = load float, ptr addrspace(1) %src, align 4
+  %scan = tail call spir_func float @_Z28sub_group_scan_inclusive_addf(float %val)
+  %dst = getelementptr inbounds float, ptr addrspace(1) %out, i64 %gid
+  store float %scan, ptr addrspace(1) %dst, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_incl_add_f32(
+; CHECK: call <4 x float> @__vecz_b_sub_group_scan_inclusive_add_Dv4_f(<4 x float> %{{.*}})
+}
+
+define spir_kernel void @reduce_scan_incl_smin_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; out[gid] = sub_group_scan_inclusive_min(in[gid]), signed, gid = get_global_id(0)
+entry:
+  %gid = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %src = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %gid
+  %val = load i32, ptr addrspace(1) %src, align 4
+  %scan = tail call spir_func i32 @_Z28sub_group_scan_inclusive_mini(i32 %val)
+  %dst = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %gid
+  store i32 %scan, ptr addrspace(1) %dst, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_incl_smin_i32(
+; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_Dv4_i(<4 x i32> %{{.*}})
+}
+
+define spir_kernel void @reduce_scan_incl_umin_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; out[gid] = sub_group_scan_inclusive_min(in[gid]), unsigned, gid = get_global_id(0)
+entry:
+  %gid = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %src = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %gid
+  %val = load i32, ptr addrspace(1) %src, align 4
+  %scan = tail call spir_func i32 @_Z28sub_group_scan_inclusive_minj(i32 %val)
+  %dst = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %gid
+  store i32 %scan, ptr addrspace(1) %dst, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_incl_umin_i32(
+; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_umin_Dv4_j(<4 x i32> %{{.*}})
+}
+
+define spir_kernel void @reduce_scan_incl_smax_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; out[gid] = sub_group_scan_inclusive_max(in[gid]), signed, gid = get_global_id(0)
+entry:
+  %gid = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %src = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %gid
+  %val = load i32, ptr addrspace(1) %src, align 4
+  %scan = tail call spir_func i32 @_Z28sub_group_scan_inclusive_maxi(i32 %val)
+  %dst = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %gid
+  store i32 %scan, ptr addrspace(1) %dst, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_incl_smax_i32(
+; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_smax_Dv4_i(<4 x i32> %{{.*}})
+}
+
+define spir_kernel void @reduce_scan_incl_umax_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; out[gid] = sub_group_scan_inclusive_max(in[gid]), unsigned, gid = get_global_id(0)
+entry:
+  %gid = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %src = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %gid
+  %val = load i32, ptr addrspace(1) %src, align 4
+  %scan = tail call spir_func i32 @_Z28sub_group_scan_inclusive_maxj(i32 %val)
+  %dst = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %gid
+  store i32 %scan, ptr addrspace(1) %dst, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_incl_umax_i32(
+; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_umax_Dv4_j(<4 x i32> %{{.*}})
+}
+
+define spir_kernel void @reduce_scan_incl_fmin_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; out[gid] = sub_group_scan_inclusive_min(in[gid]) on float, gid = get_global_id(0)
+entry:
+  %gid = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %src = getelementptr inbounds float, ptr addrspace(1) %in, i64 %gid
+  %val = load float, ptr addrspace(1) %src, align 4
+  %scan = tail call spir_func float @_Z28sub_group_scan_inclusive_minf(float %val)
+  %dst = getelementptr inbounds float, ptr addrspace(1) %out, i64 %gid
+  store float %scan, ptr addrspace(1) %dst, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_incl_fmin_f32(
+; CHECK: call <4 x float> @__vecz_b_sub_group_scan_inclusive_min_Dv4_f(<4 x float> %{{.*}})
+}
+
+define spir_kernel void @reduce_scan_incl_fmax_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; out[gid] = sub_group_scan_inclusive_max(in[gid]) on float, gid = get_global_id(0)
+entry:
+  %gid = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %src = getelementptr inbounds float, ptr addrspace(1) %in, i64 %gid
+  %val = load float, ptr addrspace(1) %src, align 4
+  %scan = tail call spir_func float @_Z28sub_group_scan_inclusive_maxf(float %val)
+  %dst = getelementptr inbounds float, ptr addrspace(1) %out, i64 %gid
+  store float %scan, ptr addrspace(1) %dst, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_incl_fmax_f32(
+; CHECK: call <4 x float> @__vecz_b_sub_group_scan_inclusive_max_Dv4_f(<4 x float> %{{.*}})
+}
+
+!opencl.ocl.version = !{!0}
+
+!0 = !{i32 3, i32 0}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_scans_spv_khr_uniform_group_instructions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_scans_spv_khr_uniform_group_instructions.ll
new file mode 100644
index 0000000000000..596e8aa572c59
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_scans_spv_khr_uniform_group_instructions.ll
@@ -0,0 +1,178 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -w 4 -S -vecz-passes=packetizer -vecz-choices=VectorPredication < %s | %filecheck %s
+
+; Tests the use of the VectorPredication choice. However, note that this option
+; currently makes no difference on fixed length vectors.
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+declare spir_func i32 @_Z28sub_group_scan_inclusive_muli(i32)
+declare spir_func float @_Z28sub_group_scan_inclusive_mulf(float)
+
+declare spir_func i32 @_Z28sub_group_scan_exclusive_muli(i32)
+declare spir_func float @_Z28sub_group_scan_exclusive_mulf(float)
+
+declare spir_func i32 @_Z28sub_group_scan_inclusive_andi(i32)
+declare spir_func i32 @_Z27sub_group_scan_inclusive_ori(i32)
+declare spir_func i32 @_Z28sub_group_scan_inclusive_xori(i32)
+declare spir_func i1 @_Z36sub_group_scan_inclusive_logical_andb(i1)
+declare spir_func i1 @_Z35sub_group_scan_inclusive_logical_orb(i1)
+declare spir_func i1 @_Z36sub_group_scan_inclusive_logical_xorb(i1)
+
+; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_incl_mul_i32(
+; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_mul_Dv4_j(<4 x i32> %{{.*}})
+define spir_kernel void @reduce_scan_incl_mul_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry:
+  %gid = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; index used for both buffers
+  %src = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %gid
+  %val = load i32, ptr addrspace(1) %src, align 4
+  %scan = tail call spir_func i32 @_Z28sub_group_scan_inclusive_muli(i32 %val) ; scalar scan call to be packetized
+  %dst = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %gid
+  store i32 %scan, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_excl_mul_i32(
+; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_exclusive_mul_Dv4_j(<4 x i32> %{{.*}})
+define spir_kernel void @reduce_scan_excl_mul_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry:
+  %gid = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; index used for both buffers
+  %src = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %gid
+  %val = load i32, ptr addrspace(1) %src, align 4
+  %scan = tail call spir_func i32 @_Z28sub_group_scan_exclusive_muli(i32 %val) ; scalar scan call to be packetized
+  %dst = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %gid
+  store i32 %scan, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_incl_mul_f32(
+; CHECK: call <4 x float> @__vecz_b_sub_group_scan_inclusive_mul_Dv4_f(<4 x float> %{{.*}})
+define spir_kernel void @reduce_scan_incl_mul_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry:
+  %gid = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; index used for both buffers
+  %src = getelementptr inbounds float, ptr addrspace(1) %in, i64 %gid
+  %val = load float, ptr addrspace(1) %src, align 4
+  %scan = tail call spir_func float @_Z28sub_group_scan_inclusive_mulf(float %val) ; scalar scan call to be packetized
+  %dst = getelementptr inbounds float, ptr addrspace(1) %out, i64 %gid
+  store float %scan, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_excl_mul_f32(
+; CHECK: call <4 x float> @__vecz_b_sub_group_scan_exclusive_mul_Dv4_f(<4 x float> %{{.*}})
+define spir_kernel void @reduce_scan_excl_mul_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry:
+  %gid = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; index used for both buffers
+  %src = getelementptr inbounds float, ptr addrspace(1) %in, i64 %gid
+  %val = load float, ptr addrspace(1) %src, align 4
+  %scan = tail call spir_func float @_Z28sub_group_scan_exclusive_mulf(float %val) ; scalar scan call to be packetized
+  %dst = getelementptr inbounds float, ptr addrspace(1) %out, i64 %gid
+  store float %scan, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_incl_and_i32(
+; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_and_Dv4_j(<4 x i32> %{{.*}})
+define spir_kernel void @reduce_scan_incl_and_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry:
+  %gid = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; index used for both buffers
+  %src = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %gid
+  %val = load i32, ptr addrspace(1) %src, align 4
+  %scan = tail call spir_func i32 @_Z28sub_group_scan_inclusive_andi(i32 %val) ; scalar scan call to be packetized
+  %dst = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %gid
+  store i32 %scan, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_incl_or_i32(
+; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_or_Dv4_j(<4 x i32> %{{.*}})
+define spir_kernel void @reduce_scan_incl_or_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry:
+  %gid = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; index used for both buffers
+  %src = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %gid
+  %val = load i32, ptr addrspace(1) %src, align 4
+  %scan = tail call spir_func i32 @_Z27sub_group_scan_inclusive_ori(i32 %val) ; scalar scan call to be packetized
+  %dst = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %gid
+  store i32 %scan, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_incl_xor_i32(
+; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_xor_Dv4_j(<4 x i32> %{{.*}})
+define spir_kernel void @reduce_scan_incl_xor_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry:
+  %gid = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; index used for both buffers
+  %src = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %gid
+  %val = load i32, ptr addrspace(1) %src, align 4
+  %scan = tail call spir_func i32 @_Z28sub_group_scan_inclusive_xori(i32 %val) ; scalar scan call to be packetized
+  %dst = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %gid
+  store i32 %scan, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_incl_logical_and(
+; CHECK: call <4 x i1> @__vecz_b_sub_group_scan_inclusive_and_Dv4_b(<4 x i1> %{{.*}})
+define spir_kernel void @reduce_scan_incl_logical_and(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry:
+  %gid = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; index used for both buffers
+  %src = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %gid
+  %word = load i32, ptr addrspace(1) %src, align 4
+  %bit = trunc i32 %word to i1 ; keep only the low bit as the boolean input
+  %scan = tail call spir_func i1 @_Z36sub_group_scan_inclusive_logical_andb(i1 %bit)
+  %dst = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %gid
+  %wide = zext i1 %scan to i32 ; widen the boolean result back to i32 for the store
+  store i32 %wide, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_incl_logical_or(
+; CHECK: call <4 x i1> @__vecz_b_sub_group_scan_inclusive_or_Dv4_b(<4 x i1> %{{.*}})
+define spir_kernel void @reduce_scan_incl_logical_or(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry:
+  %gid = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; index used for both buffers
+  %src = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %gid
+  %word = load i32, ptr addrspace(1) %src, align 4
+  %bit = trunc i32 %word to i1 ; keep only the low bit as the boolean input
+  %scan = tail call spir_func i1 @_Z35sub_group_scan_inclusive_logical_orb(i1 %bit)
+  %dst = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %gid
+  %wide = zext i1 %scan to i32 ; widen the boolean result back to i32 for the store
+  store i32 %wide, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_incl_logical_xor(
+; CHECK: call <4 x i1> @__vecz_b_sub_group_scan_inclusive_xor_Dv4_b(<4 x i1> %{{.*}})
+define spir_kernel void @reduce_scan_incl_logical_xor(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry:
+  %gid = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; index used for both buffers
+  %src = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %gid
+  %word = load i32, ptr addrspace(1) %src, align 4
+  %bit = trunc i32 %word to i1 ; keep only the low bit as the boolean input
+  %scan = tail call spir_func i1 @_Z36sub_group_scan_inclusive_logical_xorb(i1 %bit)
+  %dst = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %gid
+  %wide = zext i1 %scan to i32 ; widen the boolean result back to i32 for the store
+  store i32 %wide, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+!opencl.ocl.version = !{!0}
+
+!0 = !{i32 3, i32 0}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/udiv.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/udiv.ll
new file mode 100644
index 0000000000000..b35f912176ea9
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/udiv.ll
@@ -0,0 +1,50 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; REQUIRES: llvm-13+
+
+; RUN: %veczc -k udiv -vecz-scalable -vecz-simd-width=2 -vecz-choices=VectorPredication -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+define spir_kernel void @udiv(i32* %aptr, i32* %bptr, i32* %zptr) {
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0) ; one index shared by all three buffers
+  %a.addr = getelementptr inbounds i32, i32* %aptr, i64 %gid
+  %b.addr = getelementptr inbounds i32, i32* %bptr, i64 %gid
+  %z.addr = getelementptr inbounds i32, i32* %zptr, i64 %gid
+  %lhs = load i32, i32* %a.addr, align 4
+  %rhs = load i32, i32* %b.addr, align 4
+  %quot = udiv i32 %lhs, %rhs ; the operation under test
+  store i32 %quot, i32* %z.addr, align 4
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_nxv2_vp_udiv(
+; CHECK: [[LID:%.*]] = call i64 @__mux_get_local_id(i32 0)
+; CHECK: [[LSIZE:%.*]] = call i64 @__mux_get_local_size(i32 0)
+; CHECK: [[WREM:%.*]] = sub nuw nsw i64 [[LSIZE]], [[LID]]
+; CHECK: [[T0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK: [[T1:%.*]] = shl i64 [[T0]], 1
+; CHECK: [[T2:%.*]] = call i64 @llvm.umin.i64(i64 [[WREM]], i64 [[T1]])
+; CHECK: [[VL:%.*]] = trunc i64 [[T2]] to i32
+; CHECK: [[LHS:%.*]] = call <vscale x 2 x i32> @llvm.vp.load.nxv2i32.p0(ptr {{%.*}}, [[TRUEMASK:<vscale x 2 x i1> shufflevector \(<vscale x 2 x i1> insertelement \(<vscale x 2 x i1> (undef|poison), i1 true, (i32|i64) 0\), <vscale x 2 x i1> (undef|poison), <vscale x 2 x i32> zeroinitializer\)]], i32 [[VL]])
+; CHECK: [[RHS:%.*]] = call <vscale x 2 x i32> @llvm.vp.load.nxv2i32.p0(ptr {{%.*}}, [[TRUEMASK]], i32 [[VL]])
+; CHECK: [[DIV:%.*]] = call <vscale x 2 x i32> @llvm.vp.udiv.nxv2i32(<vscale x 2 x i32> [[LHS]], <vscale x 2 x i32> [[RHS]], [[TRUEMASK]], i32 [[VL]])
+; CHECK: call void @llvm.vp.store.nxv2i32.p0(<vscale x 2 x i32> [[DIV]], ptr {{%.*}}, [[TRUEMASK]], i32 [[VL]])
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load.ll
new file mode 100644
index 0000000000000..8e59750414057
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load.ll
@@ -0,0 +1,66 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k f -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) #0 {
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0) #3
+  %b.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %gid
+  %b.elt0 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b.ptr, i64 0, i64 0 ; element 0 of the vector at %b.ptr
+  %b.before = load <4 x double>, <4 x double> addrspace(1)* %b.ptr, align 32 ; read before the barrier
+  call spir_func void @_Z7barrierj(i32 2) #3
+  store double 1.600000e+01, double addrspace(1)* %b.elt0, align 8 ; overwrite element 0 only
+  %b.after = load <4 x double>, <4 x double> addrspace(1)* %b.ptr, align 32 ; same vector, re-read after the store
+  %mix.lane2 = shufflevector <4 x double> %b.before, <4 x double> %b.after, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
+  %mix = shufflevector <4 x double> %mix.lane2, <4 x double> %b.after, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+  %c.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %gid
+  %c.val = load <4 x double>, <4 x double> addrspace(1)* %c.ptr, align 32
+  %d.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %d, i64 %gid
+  %d.val = load <4 x double>, <4 x double> addrspace(1)* %d.ptr, align 32
+  %e.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %e, i64 %gid
+  %e.val = load <4 x double>, <4 x double> addrspace(1)* %e.ptr, align 32
+  %quot = fdiv <4 x double> %d.val, %e.val
+  %fma = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %mix, <4 x double> %c.val, <4 x double> %quot)
+  %a.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %a, i64 %gid
+  %a.val = load <4 x double>, <4 x double> addrspace(1)* %a.ptr, align 32
+  %diff = fsub <4 x double> %a.val, %fma
+  store <4 x double> %diff, <4 x double> addrspace(1)* %a.ptr, align 32 ; result is written back in place
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+declare spir_func void @_Z7barrierj(i32) #1
+
+; Function Attrs: nounwind readnone
+declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) #2
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone }
+attributes #3 = { nobuiltin nounwind }
+
+; Check that the interleaved load builtin is NOT defined
+; CHECK-NOT: define <4 x double> @__vecz_b_interleaved_load4_Dv4_du3ptrU3AS1(ptr addrspace(1){{( %0)?}})
+
+; Wide load instead
+; CHECK: load <16 x double>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load_as_masked.ll
new file mode 100644
index 0000000000000..8e59750414057
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load_as_masked.ll
@@ -0,0 +1,66 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k f -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) #0 {
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #3 ; index used for every buffer below
+  %add.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call
+  %.cast = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %add.ptr, i64 0, i64 0 ; element 0 of the vector at %add.ptr
+  %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 ; whole vector, read before the barrier
+  call spir_func void @_Z7barrierj(i32 2) #3
+  store double 1.600000e+01, double addrspace(1)* %.cast, align 8 ; overwrite element 0 only
+  %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 ; same vector, re-read after the store
+  %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 undef> ; lanes 0-1 from %0, lane 2 from %1
+  %vecins7 = shufflevector <4 x double> %vecins5, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 7> ; lane 3 from %1
+  %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call
+  %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+  %arrayidx8 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %d, i64 %call
+  %3 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx8, align 32
+  %arrayidx9 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %e, i64 %call
+  %4 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx9, align 32
+  %div = fdiv <4 x double> %3, %4
+  %5 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %vecins7, <4 x double> %2, <4 x double> %div) ; %vecins7 * %2 + %div
+  %arrayidx10 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %a, i64 %call
+  %6 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx10, align 32
+  %sub = fsub <4 x double> %6, %5
+  store <4 x double> %sub, <4 x double> addrspace(1)* %arrayidx10, align 32 ; result is written back in place
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+declare spir_func void @_Z7barrierj(i32) #1
+
+; Function Attrs: nounwind readnone
+declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) #2
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone }
+attributes #3 = { nobuiltin nounwind }
+
+; Check that the interleaved load builtin is NOT defined (NOTE(review): this file is identical to define_interleaved_load.ll - confirm intent)
+; CHECK-NOT: define <4 x double> @__vecz_b_interleaved_load4_Dv4_du3ptrU3AS1(ptr addrspace(1){{( %0)?}})
+
+; Wide load instead
+; CHECK: load <16 x double>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/delete_packetized_memop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/delete_packetized_memop.ll
new file mode 100644
index 0000000000000..a1a36ef885ae6
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/delete_packetized_memop.ll
@@ -0,0 +1,70 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k memop_loop_dep -vecz-simd-width=4 -vecz-passes=builtin-inlining,packetizer -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" ; uppercase 'S' is the stack-alignment specifier
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @memop_loop_dep(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i32 %i, i32 %e) {
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %iv = phi i32 [ %i, %entry ], [ %iv.next, %for.inc ]
+  %in.range = icmp slt i32 %iv, %e
+  br i1 %in.range, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %vec = call spir_func <4 x i32> @_Z6vload4mPKU3AS1i(i64 %gid, i32 addrspace(1)* %in)
+  call spir_func void @_Z7vstore4Dv4_imPU3AS1i(<4 x i32> %vec, i64 %gid, i32 addrspace(1)* %out)
+  %lane0 = extractelement <4 x i32> %vec, i64 0 ; loaded value feeds the branch condition below
+  %lane0.nz = icmp ne i32 %lane0, 0
+  %gid.zero = icmp eq i64 %gid, 0
+  %enter.while = and i1 %gid.zero, %lane0.nz
+  br i1 %enter.while, label %while.cond, label %for.inc
+
+while.cond:                                       ; preds = %while.cond, %for.body
+  %gid.zero2 = icmp eq i64 %gid, 0
+  br i1 %gid.zero2, label %for.inc, label %while.cond
+
+for.inc:                                          ; preds = %for.body, %while.cond
+  %iv.next = add nsw i32 %iv, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+declare spir_func <4 x i32> @_Z6vload4mPKU3AS1i(i64, i32 addrspace(1)*)
+
+declare spir_func void @_Z7vstore4Dv4_imPU3AS1i(<4 x i32>, i64, i32 addrspace(1)*)
+
+; CHECK: define spir_kernel void @__vecz_v4_memop_loop_dep
+
+; Vector widening results in a single load
+; CHECK: load <16 x i32>
+; CHECK-NOT: call {{.*}}i32 @__vecz_b_interleaved_load4_ju3ptrU3AS1
+
+; CHECK: ret void
+
+; Check that the declaration is absent as well
+; CHECK-NOT: @__vecz_b_interleaved_load4_ju3ptrU3AS1
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_constant_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_constant_index.ll
new file mode 100644
index 0000000000000..79adbc8155ed1
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_constant_index.ll
@@ -0,0 +1,40 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k extract_constant_index -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @extract_constant_index(<4 x i64> addrspace(1)* %in, i32 %x, i64 addrspace(1)* %out) #0 {
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2 ; index used for both buffers
+  %arrayidx = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %in, i64 %call
+  %0 = load <4 x i64>, <4 x i64> addrspace(1)* %arrayidx, align 4
+  %vecext = extractelement <4 x i64> %0, i32 0 ; constant lane index; %x is not used by this kernel
+  %arrayidx1 = getelementptr inbounds i64, i64 addrspace(1)* %out, i64 %call
+  store i64 %vecext, i64 addrspace(1)* %arrayidx1, align 1
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+; CHECK: define spir_kernel void @__vecz_v4_extract_constant_index
+; CHECK: %[[LD:.+]] = load <16 x i64>
+; CHECK: %[[EXT:.+]] = shufflevector <16 x i64> %[[LD]], <16 x i64> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK: store <4 x i64> %[[EXT]]
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index.ll
new file mode 100644
index 0000000000000..d44c3842eebe4
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index.ll
@@ -0,0 +1,55 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k extract_runtime_index -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+; Function Attrs: nounwind
+define spir_kernel void @extract_runtime_index(<4 x float> addrspace(1)* %in, i32 %x, float addrspace(1)* %out) #0 {
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %src = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %gid
+  %vec = load <4 x float>, <4 x float> addrspace(1)* %src, align 4
+  %elt = extractelement <4 x float> %vec, i32 %x ; lane index is the kernel argument %x
+  %dst = getelementptr inbounds float, float addrspace(1)* %out, i64 %gid
+  store float %elt, float addrspace(1)* %dst, align 4
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_extract_runtime_index
+; CHECK: %[[LD:.+]] = load <16 x float>, ptr addrspace(1) %
+
+; No splitting of the widened source vector
+; CHECK-NOT: shufflevector
+
+; Extract directly from the widened source and insert directly into result
+; CHECK: %[[EXT0:.+]] = extractelement <16 x float> %[[LD]], i32 %x
+; CHECK: %[[INS0:.+]] = insertelement <4 x float> undef, float %[[EXT0]], i32 0
+; CHECK: %[[IDX1:.+]] = add i32 %x, 4
+; CHECK: %[[EXT1:.+]] = extractelement <16 x float> %[[LD]], i32 %[[IDX1]]
+; CHECK: %[[INS1:.+]] = insertelement <4 x float> %[[INS0]], float %[[EXT1]], i32 1
+; CHECK: %[[IDX2:.+]] = add i32 %x, 8
+; CHECK: %[[EXT2:.+]] = extractelement <16 x float> %[[LD]], i32 %[[IDX2]]
+; CHECK: %[[INS2:.+]] = insertelement <4 x float> %[[INS1]], float %[[EXT2]], i32 2
+; CHECK: %[[IDX3:.+]] = add i32 %x, 12
+; CHECK: %[[EXT3:.+]] = extractelement <16 x float> %[[LD]], i32 %[[IDX3]]
+; CHECK: %[[INS3:.+]] = insertelement <4 x float> %[[INS2]], float %[[EXT3]], i32 3
+; CHECK: store <4 x float> %[[INS3]], ptr addrspace(1) %{{.+}}
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index2.ll
new file mode 100644
index 0000000000000..bb369a4298bbe
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index2.ll
@@ -0,0 +1,56 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k extract_runtime_index -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+; Function Attrs: nounwind
+define spir_kernel void @extract_runtime_index(i32 addrspace(1)* %in, <4 x i8> %x, i8 addrspace(1)* %out) #0 {
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %idx.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %gid
+  %idx = load i32, i32 addrspace(1)* %idx.ptr, align 4 ; lane index is loaded from memory
+  %elt = extractelement <4 x i8> %x, i32 %idx ; source vector is the kernel argument %x
+  %dst = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 %gid
+  store i8 %elt, i8 addrspace(1)* %dst, align 1
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_extract_runtime_index
+; CHECK: %[[LD:.+]] = load <4 x i32>, ptr addrspace(1) %
+
+; No splitting of the widened source vector
+; CHECK-NOT: shufflevector
+
+; Extract directly from the uniform source with vectorized indices and insert directly into result
+; CHECK: %[[IND0:.+]] = extractelement <4 x i32> %[[LD]], i32 0
+; CHECK: %[[EXT0:.+]] = extractelement <4 x i8> %x, i32 %[[IND0]]
+; CHECK: %[[INS0:.+]] = insertelement <4 x i8> undef, i8 %[[EXT0]], i32 0
+; CHECK: %[[IND1:.+]] = extractelement <4 x i32> %[[LD]], i32 1
+; CHECK: %[[EXT1:.+]] = extractelement <4 x i8> %x, i32 %[[IND1]]
+; CHECK: %[[INS1:.+]] = insertelement <4 x i8> %[[INS0]], i8 %[[EXT1]], i32 1
+; CHECK: %[[IND2:.+]] = extractelement <4 x i32> %[[LD]], i32 2
+; CHECK: %[[EXT2:.+]] = extractelement <4 x i8> %x, i32 %[[IND2]]
+; CHECK: %[[INS2:.+]] = insertelement <4 x i8> %[[INS1]], i8 %[[EXT2]], i32 2
+; CHECK: %[[IND3:.+]] = extractelement <4 x i32> %[[LD]], i32 3
+; CHECK: %[[EXT3:.+]] = extractelement <4 x i8> %x, i32 %[[IND3]]
+; CHECK: %[[INS3:.+]] = insertelement <4 x i8> %[[INS2]], i8 %[[EXT3]], i32 3
+; CHECK: store <4 x i8> %[[INS3]], ptr addrspace(1) %{{.+}}, align 1
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index3.ll
new file mode 100644
index 0000000000000..6533fdda3e2fe
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index3.ll
@@ -0,0 +1,62 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k extract_runtime_index -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+; Function Attrs: nounwind
+define spir_kernel void @extract_runtime_index(<4 x float> addrspace(1)* %in, i32 addrspace(1)* %x, float addrspace(1)* %out) #0 {
+entry: ; out[id] = in[id][x[id]]: an extractelement whose vector and index are both loaded per id
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2 ; presumably OpenCL get_global_id(0); only declared in this file
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call ; &in[id]
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %x, i64 %call ; &x[id]
+  %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 4 ; source vector for the extract
+  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4 ; element index, not a compile-time constant
+  %vecext = extractelement <4 x float> %0, i32 %1 ; the instruction under test
+  %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call ; &out[id]
+  store float %vecext, float addrspace(1)* %arrayidx1, align 4
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_extract_runtime_index
+; CHECK: %[[SRC:.+]] = load <16 x float>, ptr addrspace(1) %
+; CHECK: %[[IDX:.+]] = load <4 x i32>, ptr addrspace(1) %
+
+; No splitting of the widened source vector
+; CHECK-NOT: shufflevector
+
+; Offset the indices
+; CHECK: %[[ADD:.+]] = add <4 x i32> %[[IDX]], <i32 0, i32 4, i32 8, i32 12>
+
+; Extract directly from the widened source with vectorized indices and insert directly into result
+; CHECK: %[[IND0:.+]] = extractelement <4 x i32> %[[ADD]], i32 0
+; CHECK: %[[EXT0:.+]] = extractelement <16 x float> %[[SRC]], i32 %[[IND0]]
+; CHECK: %[[INS0:.+]] = insertelement <4 x float> undef, float %[[EXT0]], i32 0
+; CHECK: %[[IND1:.+]] = extractelement <4 x i32> %[[ADD]], i32 1
+; CHECK: %[[EXT1:.+]] = extractelement <16 x float> %[[SRC]], i32 %[[IND1]]
+; CHECK: %[[INS1:.+]] = insertelement <4 x float> %[[INS0]], float %[[EXT1]], i32 1
+; CHECK: %[[IND2:.+]] = extractelement <4 x i32> %[[ADD]], i32 2
+; CHECK: %[[EXT2:.+]] = extractelement <16 x float> %[[SRC]], i32 %[[IND2]]
+; CHECK: %[[INS2:.+]] = insertelement <4 x float> %[[INS1]], float %[[EXT2]], i32 2
+; CHECK: %[[IND3:.+]] = extractelement <4 x i32> %[[ADD]], i32 3
+; CHECK: %[[EXT3:.+]] = extractelement <16 x float> %[[SRC]], i32 %[[IND3]]
+; CHECK: %[[INS3:.+]] = insertelement <4 x float> %[[INS2]], float %[[EXT3]], i32 3
+; CHECK: store <4 x float> %[[INS3]], ptr addrspace(1) %{{.+}}
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_constant_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_constant_index.ll
new file mode 100644
index 0000000000000..3c330b6f91053
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_constant_index.ll
@@ -0,0 +1,57 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k constant_index -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+define spir_kernel void @constant_index(<4 x i32>* %in, i32* %inval, <4 x i32>* %out) {
+entry: ; out[id] = in[id] with element 2 replaced by inval[id]
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; presumably OpenCL get_global_id(0); only declared in this file
+  %arrayidx = getelementptr inbounds <4 x i32>, <4 x i32>* %in, i64 %call ; &in[id]
+  %0 = load <4 x i32>, <4 x i32>* %arrayidx ; vector to insert into
+  %arrayidx2 = getelementptr inbounds i32, i32* %inval, i64 %call ; &inval[id]
+  %ldval = load i32, i32* %arrayidx2 ; scalar value to insert (loaded, not constant)
+  %arrayidx3 = getelementptr inbounds <4 x i32>, <4 x i32>* %out, i64 %call ; &out[id]
+  %vecins = insertelement <4 x i32> %0, i32 %ldval, i32 2 ; constant index 2, loaded value
+  store <4 x i32> %vecins, <4 x i32>* %arrayidx3
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_constant_index
+
+; A single wide load
+; CHECK: %[[INTO:.+]] = load <16 x i32>, ptr %
+
+; The vectorized element load:
+; CHECK: %[[ELTS:.+]] = load <4 x i32>, ptr %
+
+; No interleaved loads
+; CHECK-NOT: call <4 x i32> @__vecz_b_interleaved_load4_Dv4_ju3ptr
+
+; Insert elements turned into shufflevectors
+; CHECK: %[[WIDE:.+]] = shufflevector <4 x i32> %[[ELTS]], <4 x i32> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
+; CHECK: %[[INS:.+]] = shufflevector <16 x i32> %[[WIDE]], <16 x i32> %[[INTO]], <16 x i32> <i32 16, i32 17, i32 2, i32 19, i32 20, i32 21, i32 6, i32 23, i32 24, i32 25, i32 10, i32 27, i32 28, i32 29, i32 14, i32 31>
+
+; No more shuffles..
+; CHECK-NOT: shufflevector
+
+; We should have one widened store
+; CHECK: store <16 x i32> %[[INS]]
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_constant_index_constant_value.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_constant_index_constant_value.ll
new file mode 100644
index 0000000000000..b64f0d1a8d1bf
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_constant_index_constant_value.ll
@@ -0,0 +1,54 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k constant_index -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+define spir_kernel void @constant_index(<4 x i32>* %in, <4 x i32>* %out) {
+entry: ; out[id] = in[id] with element 2 replaced by the constant 42
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; presumably OpenCL get_global_id(0); only declared in this file
+  %arrayidx = getelementptr inbounds <4 x i32>, <4 x i32>* %in, i64 %call ; &in[id]
+  %0 = load <4 x i32>, <4 x i32>* %arrayidx ; vector to insert into
+  %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32>* %out, i64 %call ; &out[id]
+  %vecins = insertelement <4 x i32> %0, i32 42, i32 2 ; constant index and constant value
+  store <4 x i32> %vecins, <4 x i32>* %arrayidx2
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_constant_index
+
+; A single wide load
+; CHECK: %[[INTO:.+]] = load <16 x i32>, ptr %
+
+; No interleaved loads
+; CHECK-NOT: call <4 x i32> @__vecz_b_interleaved_load4_Dv4_ju3ptr
+
+; Insert constant elements into the widened vector:
+; CHECK: %[[INS0:.+]] = insertelement <16 x i32> %[[INTO]], i32 42, i32 2
+; CHECK: %[[INS1:.+]] = insertelement <16 x i32> %[[INS0]], i32 42, i32 6
+; CHECK: %[[INS2:.+]] = insertelement <16 x i32> %[[INS1]], i32 42, i32 10
+; CHECK: %[[INS3:.+]] = insertelement <16 x i32> %[[INS2]], i32 42, i32 14
+
+; No shuffles..
+; CHECK-NOT: shufflevector
+
+; We should have one widened store
+; CHECK: store <16 x i32> %[[INS3]]
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_runtime_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_runtime_index.ll
new file mode 100644
index 0000000000000..c9c4cdab72f5a
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_runtime_index.ll
@@ -0,0 +1,60 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k runtime_index -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+define spir_kernel void @runtime_index(<4 x i32>* %in, <4 x i32>* %out, i32* %index) {
+entry: ; out[id] = in[id] with element index[id] replaced by the constant 42
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; presumably OpenCL get_global_id(0); only declared in this file
+  %arrayidx = getelementptr inbounds <4 x i32>, <4 x i32>* %in, i64 %call ; &in[id]
+  %0 = load <4 x i32>, <4 x i32>* %arrayidx ; vector to insert into
+  %arrayidx1 = getelementptr inbounds <4 x i32>, <4 x i32>* %out, i64 %call ; &out[id]
+  store <4 x i32> %0, <4 x i32>* %arrayidx1 ; first store of the unmodified vector; the same address is stored to again below
+  %arrayidx2 = getelementptr inbounds i32, i32* %index, i64 %call ; &index[id]
+  %1 = load i32, i32* %arrayidx2 ; insertion index, not a compile-time constant
+  %arrayidx3 = getelementptr inbounds <4 x i32>, <4 x i32>* %out, i64 %call ; &out[id] again (same operands as %arrayidx1)
+  %vecins = insertelement <4 x i32> %0, i32 42, i32 %1 ; constant value, loaded index
+  store <4 x i32> %vecins, <4 x i32>* %arrayidx3
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_runtime_index
+
+; CHECK: %[[INTO:.+]] = load <16 x i32>, ptr %arrayidx, align 4
+; CHECK: %[[LD:.+]] = load <4 x i32>, ptr
+; CHECK: %[[ADD:.+]] = add <4 x i32> %[[LD]], <i32 0, i32 4, i32 8, i32 12>
+
+; The inserts got widened: each lane inserts 42 into the <16 x i32> vector at its offset index
+; CHECK: %[[ELT0:.+]] = extractelement <4 x i32> %[[ADD]], i32 0
+; CHECK: %[[INS0:.+]] = insertelement <16 x i32> %[[INTO]], i32 42, i32 %[[ELT0]]
+; CHECK: %[[ELT1:.+]] = extractelement <4 x i32> %[[ADD]], i32 1
+; CHECK: %[[INS1:.+]] = insertelement <16 x i32> %[[INS0]], i32 42, i32 %[[ELT1]]
+; CHECK: %[[ELT2:.+]] = extractelement <4 x i32> %[[ADD]], i32 2
+; CHECK: %[[INS2:.+]] = insertelement <16 x i32> %[[INS1]], i32 42, i32 %[[ELT2]]
+; CHECK: %[[ELT3:.+]] = extractelement <4 x i32> %[[ADD]], i32 3
+; CHECK: %[[INS3:.+]] = insertelement <16 x i32> %[[INS2]], i32 42, i32 %[[ELT3]]
+
+; No shuffles..
+; CHECK-NOT: shufflevector
+
+; One widened store directly storing the result
+; CHECK: store <16 x i32> %[[INS3]]
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/interleaved_safety.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/interleaved_safety.ll
new file mode 100644
index 0000000000000..44255f5e0f923
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/interleaved_safety.ll
@@ -0,0 +1,98 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k f -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) #0 {
+entry: ; a[id] -= fmuladd(v, c[id], d[id] / e[id]), v mixing b[id] as loaded before and after a barrier plus a scalar store; %flag is never used
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #3 ; presumably OpenCL get_global_id(0); only declared in this file
+  %add.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call ; &b[id]
+  %.cast = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %add.ptr, i64 0, i64 0 ; address of element 0 of b[id]
+  %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 ; b[id] before the barrier and the scalar store
+  call spir_func void @_Z7barrierj(i32 2) #3 ; presumably OpenCL barrier(); separates the two loads of b[id]
+  store double 1.600000e+01, double addrspace(1)* %.cast, align 8 ; overwrite element 0 of b[id] with 16.0
+  %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 ; b[id] reloaded after the scalar store
+  %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 undef> ; lanes 0-1 from %0, lane 2 from %1, lane 3 undefined
+  %vecins7 = shufflevector <4 x double> %vecins5, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 7> ; keep lanes 0-2, take lane 3 from %1
+  %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call ; &c[id]
+  %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+  %arrayidx8 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %d, i64 %call ; &d[id]
+  %3 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx8, align 32
+  %arrayidx9 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %e, i64 %call ; &e[id]
+  %4 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx9, align 32
+  %div = fdiv <4 x double> %3, %4 ; d[id] / e[id]
+  %5 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %vecins7, <4 x double> %2, <4 x double> %div) ; %vecins7 * c[id] + %div
+  %arrayidx10 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %a, i64 %call ; &a[id]
+  %6 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx10, align 32
+  %sub = fsub <4 x double> %6, %5 ; a[id] - %5
+  store <4 x double> %sub, <4 x double> addrspace(1)* %arrayidx10, align 32 ; result written back to a[id]
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+declare spir_func void @_Z7barrierj(i32) #1
+
+; Function Attrs: nounwind readnone
+declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) #2
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone }
+attributes #3 = { nobuiltin nounwind }
+
+!opencl.kernels = !{!0}
+!llvm.ident = !{!6}
+
+!0 = !{void (<4 x double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x double> addrspace(1)*, i8 addrspace(1)*)* @f, !1, !2, !3, !4, !5}
+!1 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1, i32 1, i32 1, i32 1}
+!2 = !{!"kernel_arg_access_qual", !"none", !"none", !"none", !"none", !"none", !"none"}
+!3 = !{!"kernel_arg_type", !"double4*", !"double4*", !"double4*", !"double4*", !"double4*", !"char*"}
+!4 = !{!"kernel_arg_base_type", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"char*"}
+!5 = !{!"kernel_arg_type_qual", !"", !"", !"", !"", !"", !""}
+!6 = !{!"clang version 3.8.1 "}
+
+; Function start
+; CHECK: define spir_kernel void @__vecz_v4_f
+; CHECK: call spir_func i64 @_Z13get_global_idj(i32 0)
+
+; There should be exactly six vector loads and one store in the code
+; CHECK: load <16 x double>
+
+; And in between them there should be a barrier call
+; CHECK: call spir_func void @_Z7barrierj
+; CHECK: call void @__vecz_b_interleaved_store8_4_Dv4_du3ptrU3AS1(<4 x double> <double 1.600000e+01, double 1.600000e+01, double 1.600000e+01, double 1.600000e+01>
+; CHECK: load <16 x double>
+; CHECK: load <16 x double>
+; CHECK: load <16 x double>
+; CHECK: load <16 x double>
+
+; The fmuladd intrinsic will be widened..
+; CHECK: call <16 x double> @llvm.fmuladd.v16f64
+; CHECK: load <16 x double>
+; CHECK: store <16 x double>
+
+; After the widened store there shouldn't be any interleaved loads or stores left
+; CHECK-NOT: call <4 x double> @__vecz_b_interleaved_load
+; CHECK-NOT: call void @__vecz_b_interleaved_store
+
+; Function end
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isfiniteDv4_d.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isfiniteDv4_d.ll
new file mode 100644
index 0000000000000..10d7722650ccf
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isfiniteDv4_d.ll
@@ -0,0 +1,41 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_isfiniteDv4_d -vecz-simd-width=4 -vecz-passes=builtin-inlining,packetizer -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double>)
+
+define spir_kernel void @test_isfiniteDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) {
+entry: ; out[id] = result of the <4 x double> relational builtin applied to in[id]
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; presumably OpenCL get_global_id(0); only declared in this file
+  %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call ; &in[id]
+  %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+  %call1 = call spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double> %0) ; presumably OpenCL isfinite(double4), judging by the mangled name
+  %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call ; &out[id]
+  store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_test_isfiniteDv4_d
+; CHECK: call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: and <16 x i64>
+; CHECK: icmp slt <16 x i64>
+; CHECK: sext <16 x i1>
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isfiniteDv4_f.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isfiniteDv4_f.ll
new file mode 100644
index 0000000000000..f48cfd82fba5a
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isfiniteDv4_f.ll
@@ -0,0 +1,40 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_isfiniteDv4_f -vecz-simd-width=4 -vecz-passes=builtin-inlining,packetizer -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float>)
+
+define spir_kernel void @test_isfiniteDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
+entry: ; out[id] = result of the <4 x float> relational builtin applied to in[id]
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; presumably OpenCL get_global_id(0); only declared in this file
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call ; &in[id]
+  %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
+  %call1 = call spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float> %0) ; presumably OpenCL isfinite(float4), judging by the mangled name
+  %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call ; &out[id]
+  store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_test_isfiniteDv4_f
+; CHECK: and <16 x i32>
+; CHECK: icmp slt <16 x i32>
+; CHECK: sext <16 x i1>
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isinfDv4_d.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isinfDv4_d.ll
new file mode 100644
index 0000000000000..cf57f8567354b
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isinfDv4_d.ll
@@ -0,0 +1,40 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_isinfDv4_d -vecz-simd-width=4 -vecz-passes=builtin-inlining,packetizer -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double>)
+
+define spir_kernel void @test_isinfDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) {
+entry: ; out[id] = result of the <4 x double> relational builtin applied to in[id]
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; presumably OpenCL get_global_id(0); only declared in this file
+  %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call ; &in[id]
+  %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+  %call1 = call spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double> %0) ; presumably OpenCL isinf(double4), judging by the mangled name
+  %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call ; &out[id]
+  store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_test_isinfDv4_d
+; CHECK: and <16 x i64>
+; CHECK: icmp eq <16 x i64>
+; CHECK: sext <16 x i1>
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isinfDv4_f.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isinfDv4_f.ll
new file mode 100644
index 0000000000000..32c46f96ce1fb
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isinfDv4_f.ll
@@ -0,0 +1,40 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_isinfDv4_f -vecz-simd-width=4 -vecz-passes=builtin-inlining,packetizer -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float>)
+
+define spir_kernel void @test_isinfDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
+entry: ; out[id] = result of the <4 x float> relational builtin applied to in[id]
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; presumably OpenCL get_global_id(0); only declared in this file
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call ; &in[id]
+  %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
+  %call1 = call spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float> %0) ; presumably OpenCL isinf(float4), judging by the mangled name
+  %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call ; &out[id]
+  store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_test_isinfDv4_f
+; CHECK: and <16 x i32>
+; CHECK: icmp eq <16 x i32>
+; CHECK: sext <16 x i1>
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnanDv4_d.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnanDv4_d.ll
new file mode 100644
index 0000000000000..51f68d42f68de
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnanDv4_d.ll
@@ -0,0 +1,43 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_isnanDv4_d -vecz-simd-width=4 -vecz-passes=builtin-inlining,packetizer -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double>)
+
+define spir_kernel void @test_isnanDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) {
+entry: ; out[id] = result of the <4 x double> relational builtin applied to in[id]
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; presumably OpenCL get_global_id(0); only declared in this file
+  %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call ; &in[id]
+  %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+  %call1 = call spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double> %0) ; presumably OpenCL isnan(double4), judging by the mangled name
+  %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call ; &out[id]
+  store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_test_isnanDv4_d
+; CHECK: and <16 x i64>
+; CHECK: icmp eq <16 x i64>
+; CHECK: and <16 x i64>
+; CHECK: icmp sgt <16 x i64>
+; CHECK: and <16 x i1>
+; CHECK: sext <16 x i1>
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnanDv4_f.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnanDv4_f.ll
new file mode 100644
index 0000000000000..f39158268fcb9
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnanDv4_f.ll
@@ -0,0 +1,43 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_isnanDv4_f -vecz-simd-width=4 -vecz-passes=builtin-inlining,packetizer -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float>)
+
+define spir_kernel void @test_isnanDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
+entry: ; out[id] = result of the <4 x float> relational builtin applied to in[id]
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; presumably OpenCL get_global_id(0); only declared in this file
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call ; &in[id]
+  %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
+  %call1 = call spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float> %0) ; presumably OpenCL isnan(float4), judging by the mangled name
+  %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call ; &out[id]
+  store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_test_isnanDv4_f
+; CHECK: and <16 x i32>
+; CHECK: icmp eq <16 x i32>
+; CHECK: and <16 x i32>
+; CHECK: icmp sgt <16 x i32>
+; CHECK: and <16 x i1>
+; CHECK: sext <16 x i1>
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnormalDv4_d.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnormalDv4_d.ll
new file mode 100644
index 0000000000000..95dbae10c76b6
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnormalDv4_d.ll
@@ -0,0 +1,42 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_isnormalDv4_d -vecz-simd-width=4 -vecz-passes=builtin-inlining,packetizer -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double>)
+
+define spir_kernel void @test_isnormalDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { ; kernel under test: out[id] = isnormal(in[id]) on <4 x double>
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0) per the mangled name; indexes both buffers
+  %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
+  %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+  %call1 = call spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double> %0) ; builtin is only declared here; the test command runs builtin-inlining before the packetizer
+  %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call
+  store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_test_isnormalDv4_d
+; CHECK: and <16 x i64>
+; CHECK: icmp slt <16 x i64>
+; CHECK: icmp sgt <16 x i64>
+; CHECK: and <16 x i1>
+; CHECK: sext <16 x i1>
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnormalDv4_f.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnormalDv4_f.ll
new file mode 100644
index 0000000000000..76fedc4097831
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnormalDv4_f.ll
@@ -0,0 +1,42 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_isnormalDv4_f -vecz-simd-width=4 -vecz-passes=builtin-inlining,packetizer -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float>)
+
+define spir_kernel void @test_isnormalDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { ; kernel under test: out[id] = isnormal(in[id]) on <4 x float>
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0) per the mangled name; indexes both buffers
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
+  %call1 = call spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float> %0) ; builtin is only declared here; the test command runs builtin-inlining before the packetizer
+  %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call
+  store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_test_isnormalDv4_f
+; CHECK: and <16 x i32>
+; CHECK: icmp slt <16 x i32>
+; CHECK: icmp sgt <16 x i32>
+; CHECK: and <16 x i1>
+; CHECK: sext <16 x i1>
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/scalar_vector_user.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/scalar_vector_user.ll
new file mode 100644
index 0000000000000..8374890eb5309
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/scalar_vector_user.ll
@@ -0,0 +1,79 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k scalar_vector_user -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+
+; ModuleID = 'Unknown buffer'
+source_filename = "Unknown buffer"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" ; native integer widths 8/16/32/64, as in the sibling tests
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind readnone
+declare spir_func i64 @_Z12get_local_idj(i32) #0
+
+; Function Attrs: nounwind readnone
+declare spir_func <4 x float> @_Z3madDv4_fS_S_(<4 x float>, <4 x float>, <4 x float>) #0
+
+declare spir_func void @_Z7vstore4Dv4_fmPU3AS1f(<4 x float>, i64, float addrspace(1)*)
+
+declare spir_func <4 x float> @_Z6vload4mPU3AS3Kf(i64, float addrspace(1)*)
+; Function Attrs: inlinehint norecurse nounwind readnone
+declare spir_func float @_Z3madfff(float, float, float) local_unnamed_addr #2
+
+define spir_kernel void @scalar_vector_user(float addrspace(1)* %inout, i64 %n) { ; kernel under test: %madv4 has both a scalar user and a vector user
+entry:
+  %lid = tail call spir_func i64 @_Z12get_local_idj(i32 0) #0
+  %inout.address = getelementptr inbounds float, float addrspace(1)* %inout, i64 %lid
+  br label %loop
+
+loop:                                              ; preds = %entry, %loop
+  %madv4.prev = phi <4 x float> [ zeroinitializer, %entry ], [ %madv4, %loop ] ; loop-carried vector accumulator, fed back from %madv4
+  %i = phi i64 [ 0, %entry ], [ %i.inc, %loop ]
+  %i.inc = add nuw nsw i64 %i, 1
+  %cmp = icmp slt i64 %i.inc, %n
+  %inout.vload = tail call spir_func <4 x float> @_Z6vload4mPU3AS3Kf(i64 0, float addrspace(1)* %inout.address) ; NOTE(review): mangled name encodes AS3 but the pointer is addrspace(1) - confirm intended
+  %inout.vec0 = shufflevector <4 x float> %inout.vload, <4 x float> undef, <4 x i32> zeroinitializer ; all-zero mask: splat of lane 0 of the loaded vector
+  %madv4 = tail call spir_func <4 x float> @_Z3madDv4_fS_S_(<4 x float> %inout.vload, <4 x float> %inout.vec0, <4 x float> %madv4.prev) #0
+  br i1 %cmp, label %loop, label %end
+
+end:                                             ; preds = %loop
+  %mad.vec0 = extractelement <4 x float> %madv4, i32 0 ; scalar user of %madv4
+  store float %mad.vec0, float addrspace(1)* %inout.address, align 4
+  tail call spir_func void @_Z7vstore4Dv4_fmPU3AS1f(<4 x float> %madv4, i64 0, float addrspace(1)* %inout.address) ; vector user of the same %madv4
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { noduplicate }
+attributes #2 = { inlinehint norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+; The purpose of this test is to make sure we correctly scalarize an instruction
+; used by both a scalar and vector instruction. We would previously try to
+; scalarize its users twice thus resulting in invalid IR.
+
+; CHECK: define spir_kernel void @__vecz_v4_scalar_vector_user
+; CHECK: loop:
+; CHECK: %madv4.prev{{.*}} = phi <16 x float> [ zeroinitializer, %entry ], [ %[[CONCAT:.+]], %loop ]{{$}}
+
+; Make sure the above PHI incomings are unique by looking for their definitions.
+; One day we might be able to super-vectorize this call, but for now we instantiate and concatenate it.
+; CHECK: %madv4[[S0:[0-9]+]] =
+; CHECK: %madv4[[S1:[0-9]+]] =
+; CHECK: %madv4[[S2:[0-9]+]] =
+; CHECK: %madv4[[S3:[0-9]+]] =
+; CHECK: %[[C0:.+]] = shufflevector <4 x float> %madv4[[S0]], <4 x float> %madv4[[S1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK: %[[C1:.+]] = shufflevector <4 x float> %madv4[[S2]], <4 x float> %madv4[[S3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK: %[[CONCAT]] = shufflevector <8 x float> %[[C0]], <8 x float> %[[C1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/vector_copy.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/vector_copy.ll
new file mode 100644
index 0000000000000..8ee50ab195be2
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/vector_copy.ll
@@ -0,0 +1,47 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k vector_copy -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @vector_copy(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { ; kernel under test: out[id] = in[id]; NOTE(review): groups #0/#1/#2 are never defined in this file - confirm intended
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %arrayidx = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in, i64 %call
+  %0 = load <4 x i32>, <4 x i32> addrspace(1)* %arrayidx, align 16 ; the one vector load the checks expect to be widened
+  %arrayidx1 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call
+  store <4 x i32> %0, <4 x i32> addrspace(1)* %arrayidx1, align 16 ; the one vector store, fed directly by the load
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+; It makes sure the vector load and store are preserved right through to packetization
+; and then widened, instead of being scalarized across work-items first
+; and then getting de-interleaved by the Interleaved Group Combine Pass.
+; We expect a single vector load feeding directly into a single vector store.
+
+; CHECK: define spir_kernel void @__vecz_v4_vector_copy
+; CHECK: load <16 x i32>
+; CHECK-NOT: load
+; CHECK-NOT: %deinterleave{{[0-9]*}} = shufflevector
+; CHECK-NOT: %interleave{{[0-9]*}} = shufflevector
+; CHECK: store <16 x i32>
+; CHECK-NOT: store
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/vector_phi_varying.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/vector_phi_varying.ll
new file mode 100644
index 0000000000000..4a51971786778
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/vector_phi_varying.ll
@@ -0,0 +1,91 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k vector_loop -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @vector_loop(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { ; kernel under test: loop driven by a varying <4 x i32> phi
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call.trunc = trunc i64 %call to i32
+  %call.splatinsert = insertelement <4 x i32> undef, i32 %call.trunc, i32 0
+  %call.splat = shufflevector <4 x i32> %call.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer ; splat of the truncated global id; used as the loop step
+  %cmp = icmp eq i64 %call, 0
+  br i1 %cmp, label %for.end, label %for.cond
+
+for.cond:                                         ; preds = %entry, %for.body
+  %storemerge = phi <4 x i32> [ %inc, %for.body ], [ zeroinitializer, %entry ] ; vector phi under test: starts at zero, advanced by %inc
+  %call1 = call spir_func i64 @_Z15get_global_sizej(i32 0)
+  %conv = trunc i64 %call1 to i32
+  %splat.splatinsert = insertelement <4 x i32> undef, i32 %conv, i32 0
+  %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %cmp2 = icmp slt <4 x i32> %storemerge, %splat.splat
+  %0 = extractelement <4 x i1> %cmp2, i64 0 ; only lane 0 of the compare decides the branch
+  br i1 %0, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = extractelement <4 x i32> %storemerge, i64 0
+  %idxprom = sext i32 %1 to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom
+  %2 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %3 = extractelement <4 x i32> %storemerge, i64 0
+  %idxprom3 = sext i32 %3 to i64
+  %arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom3
+  store i32 %2, i32 addrspace(1)* %arrayidx4, align 4 ; copy in -> out at the index held in lane 0
+  %4 = extractelement <4 x i32> %storemerge, i64 1
+  %idxprom5 = sext i32 %4 to i64
+  %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom5
+  %5 = load i32, i32 addrspace(1)* %arrayidx6, align 4
+  %6 = extractelement <4 x i32> %storemerge, i64 1
+  %idxprom7 = sext i32 %6 to i64
+  %arrayidx8 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom7
+  store i32 %5, i32 addrspace(1)* %arrayidx8, align 4 ; copy in -> out at the index held in lane 1
+  %7 = extractelement <4 x i32> %storemerge, i64 2
+  %idxprom9 = sext i32 %7 to i64
+  %arrayidx10 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom9
+  %8 = load i32, i32 addrspace(1)* %arrayidx10, align 4
+  %9 = extractelement <4 x i32> %storemerge, i64 2
+  %idxprom11 = sext i32 %9 to i64
+  %arrayidx12 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom11
+  store i32 %8, i32 addrspace(1)* %arrayidx12, align 4 ; copy in -> out at the index held in lane 2
+  %10 = extractelement <4 x i32> %storemerge, i64 3
+  %idxprom13 = sext i32 %10 to i64
+  %arrayidx14 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom13
+  %11 = load i32, i32 addrspace(1)* %arrayidx14, align 4
+  %12 = extractelement <4 x i32> %storemerge, i64 3
+  %idxprom15 = sext i32 %12 to i64
+  %arrayidx16 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom15
+  store i32 %11, i32 addrspace(1)* %arrayidx16, align 4 ; copy in -> out at the index held in lane 3
+  %inc = add <4 x i32> %storemerge, %call.splat ; next value of the phi; the checks expect this add widened to <16 x i32>
+  br label %for.cond
+
+for.end:                                          ; preds = %entry, %for.cond
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func i64 @_Z15get_global_sizej(i32)
+
+; This test checks that a varying <4 x i32> phi is widened by the packetizer
+; into a single <16 x i32> phi, rather than being scalarized into 4 i32 phis
+; CHECK: define spir_kernel void @__vecz_v4_vector_loop
+; CHECK: %[[STOREMERGE1:.+]] = phi <16 x i32> [ %[[INC2:.+]], %for.body ], [ zeroinitializer, %entry ]
+; CHECK: %[[INC2]] = add <16 x i32> %[[STOREMERGE1]], [[CALL:.+]]
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_abs.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_abs.ll
new file mode 100644
index 0000000000000..a361649b5f151
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_abs.ll
@@ -0,0 +1,68 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; REQUIRES: llvm-12+
+; RUN: %veczc -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+declare i32 @llvm.abs.i32(i32, i1)
+declare <2 x i32> @llvm.abs.v2i32(<2 x i32>, i1)
+
+define spir_kernel void @absff(i32* %pa, i32* %pb) { ; scalar case: pb[id] = abs(pa[id])
+entry:
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %a = getelementptr i32, i32* %pa, i64 %idx
+  %b = getelementptr i32, i32* %pb, i64 %idx
+  %la = load i32, i32* %a, align 16
+  %res = call i32 @llvm.abs.i32(i32 %la, i1 true) ; default calling convention, matching the intrinsic's declaration
+  store i32 %res, i32* %b, align 16
+  ret void
+}
+
+define spir_kernel void @absvf(<2 x i32>* %pa, <2 x i32>* %pb) { ; vector case: pb[id] = abs(pa[id]) on <2 x i32>
+entry:
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %a = getelementptr <2 x i32>, <2 x i32>* %pa, i64 %idx
+  %b = getelementptr <2 x i32>, <2 x i32>* %pb, i64 %idx
+  %la = load <2 x i32>, <2 x i32>* %a, align 16
+  %res = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %la, i1 true) ; default calling convention, matching the intrinsic's declaration
+  store <2 x i32> %res, <2 x i32>* %b, align 16
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_absff(ptr %pa, ptr %pb)
+; CHECK: entry:
+; CHECK: %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: %a = getelementptr i32, ptr %pa, i64 %idx
+; CHECK: %b = getelementptr i32, ptr %pb, i64 %idx
+; CHECK: %[[T0:.*]] = load <4 x i32>, ptr %a, align 4
+; CHECK: %[[RES1:.+]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %[[T0]], i1 true)
+; CHECK: store <4 x i32> %[[RES1]], ptr %b, align 4
+; CHECK: ret void
+
+; CHECK: define spir_kernel void @__vecz_v4_absvf(ptr %pa, ptr %pb)
+; CHECK: entry:
+; CHECK: %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: %a = getelementptr <2 x i32>, ptr %pa, i64 %idx
+; CHECK: %b = getelementptr <2 x i32>, ptr %pb, i64 %idx
+; CHECK: %[[T0:.*]] = load <8 x i32>, ptr %a, align 4
+; CHECK: %[[RES2:.+]] = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %[[T0]], i1 true)
+; CHECK: store <8 x i32> %[[RES2]], ptr %b, align 8
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_binops.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_binops.ll
new file mode 100644
index 0000000000000..54770ac3795e8
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_binops.ll
@@ -0,0 +1,57 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k widen_binops -vecz-passes=packetizer -vecz-simd-width=8 -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+define spir_kernel void @widen_binops(<4 x i32>* %pa, <4 x i32>* %pb, <4 x i64>* %pd) { ; kernel under test: pd[id] = zext(pa[id]) + zext(pb[id]), element-wise
+entry:
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %a = getelementptr <4 x i32>, <4 x i32>* %pa, i64 %idx
+  %b = getelementptr <4 x i32>, <4 x i32>* %pb, i64 %idx
+  %d = getelementptr <4 x i64>, <4 x i64>* %pd, i64 %idx
+  %la = load <4 x i32>, <4 x i32>* %a, align 16
+  %lb = load <4 x i32>, <4 x i32>* %b, align 16
+  %xa = zext <4 x i32> %la to <4 x i64>
+  %xb = zext <4 x i32> %lb to <4 x i64>
+  %add = add nuw nsw <4 x i64> %xa, %xb ; the checks below expect the nuw/nsw flags on the widened adds too
+  store <4 x i64> %add, <4 x i64>* %d, align 16
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v8_widen_binops(ptr %pa, ptr %pb, ptr %pd)
+; CHECK: entry:
+
+; It checks that the zexts and add of <4 x i32> get widened by a factor of 8,
+; to produce PAIRs of 16-element vectors (<16 x i32> loads, <16 x i64> results).
+; CHECK: %[[LDA0:.+]] = load <16 x i32>, ptr %{{.+}}, align 4
+; CHECK: %[[LDA1:.+]] = load <16 x i32>, ptr %{{.+}}, align 4
+; CHECK: %[[LDB0:.+]] = load <16 x i32>, ptr %{{.+}}, align 4
+; CHECK: %[[LDB1:.+]] = load <16 x i32>, ptr %{{.+}}, align 4
+; CHECK: %[[XA0:.+]] = zext <16 x i32> %[[LDA0]] to <16 x i64>
+; CHECK: %[[XA1:.+]] = zext <16 x i32> %[[LDA1]] to <16 x i64>
+; CHECK: %[[XB0:.+]] = zext <16 x i32> %[[LDB0]] to <16 x i64>
+; CHECK: %[[XB1:.+]] = zext <16 x i32> %[[LDB1]] to <16 x i64>
+; CHECK: %[[ADD0:.+]] = add nuw nsw <16 x i64> %[[XA0]], %[[XB0]]
+; CHECK: %[[ADD1:.+]] = add nuw nsw <16 x i64> %[[XA1]], %[[XB1]]
+; CHECK: store <16 x i64> %[[ADD0]], ptr %{{.+}}
+; CHECK: store <16 x i64> %[[ADD1]], ptr %{{.+}}
+
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_copysign.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_copysign.ll
new file mode 100644
index 0000000000000..c6bad1ac63687
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_copysign.ll
@@ -0,0 +1,76 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; REQUIRES: llvm-12+
+; RUN: %veczc -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+declare float @llvm.copysign.f32(float, float)
+declare <2 x float> @llvm.copysign.v2f32(<2 x float>, <2 x float>)
+
+define spir_kernel void @copysignff(float* %pa, float* %pb, float* %pc) { ; scalar case: pc[id] = copysign(pa[id], pb[id])
+entry:
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %a = getelementptr float, float* %pa, i64 %idx
+  %b = getelementptr float, float* %pb, i64 %idx
+  %c = getelementptr float, float* %pc, i64 %idx
+  %la = load float, float* %a, align 16
+  %lb = load float, float* %b, align 16
+  %res = call float @llvm.copysign.f32(float %la, float %lb) ; scalar intrinsic; the checks below expect the v4f32 form after vectorization
+  store float %res, float* %c, align 16
+  ret void
+}
+
+define spir_kernel void @copysignvf(<2 x float>* %pa, <2 x float>* %pb, <2 x float>* %pc) { ; vector case: pc[id] = copysign(pa[id], pb[id]) on <2 x float>
+entry:
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %a = getelementptr <2 x float>, <2 x float>* %pa, i64 %idx
+  %b = getelementptr <2 x float>, <2 x float>* %pb, i64 %idx
+  %c = getelementptr <2 x float>, <2 x float>* %pc, i64 %idx
+  %la = load <2 x float>, <2 x float>* %a, align 16
+  %lb = load <2 x float>, <2 x float>* %b, align 16
+  %res = call <2 x float> @llvm.copysign.v2f32(<2 x float> %la, <2 x float> %lb) ; 2-wide intrinsic; the checks below expect the v8f32 form after vectorization
+  store <2 x float> %res, <2 x float>* %c, align 16
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_copysignff(ptr %pa, ptr %pb, ptr %pc)
+; CHECK: entry:
+; CHECK: %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: %a = getelementptr float, ptr %pa, i64 %idx
+; CHECK: %b = getelementptr float, ptr %pb, i64 %idx
+; CHECK: %c = getelementptr float, ptr %pc, i64 %idx
+; CHECK: [[T0:%.*]] = load <4 x float>, ptr %a, align 4
+; CHECK: [[T1:%.*]] = load <4 x float>, ptr %b, align 4
+; CHECK: %res1 = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[T0]], <4 x float> [[T1]])
+; CHECK: store <4 x float> %res1, ptr %c, align 4
+; CHECK: ret void
+
+; CHECK: define spir_kernel void @__vecz_v4_copysignvf(ptr %pa, ptr %pb, ptr %pc)
+; CHECK: entry:
+; CHECK: %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: %a = getelementptr <2 x float>, ptr %pa, i64 %idx
+; CHECK: %b = getelementptr <2 x float>, ptr %pb, i64 %idx
+; CHECK: %c = getelementptr <2 x float>, ptr %pc, i64 %idx
+; CHECK: [[T0:%.*]] = load <8 x float>, ptr %a, align 4
+; CHECK: [[T1:%.*]] = load <8 x float>, ptr %b, align 4
+; CHECK: %res1 = call <8 x float> @llvm.copysign.v8f32(<8 x float> [[T0]], <8 x float> [[T1]])
+; CHECK: store <8 x float> %res1, ptr %c, align 8
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fma.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fma.ll
new file mode 100644
index 0000000000000..174696e1a6d8f
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fma.ll
@@ -0,0 +1,57 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_calls -vecz-passes=packetizer -vecz-simd-width=8 -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+define spir_kernel void @test_calls(<4 x float>* %pa, <4 x float>* %pb, <4 x float>* %pc, <4 x float>* %pd) { ; kernel under test: pd[id] = fma(pa[id], pb[id], pc[id]) on <4 x float>
+entry:
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %a = getelementptr <4 x float>, <4 x float>* %pa, i64 %idx
+  %b = getelementptr <4 x float>, <4 x float>* %pb, i64 %idx
+  %c = getelementptr <4 x float>, <4 x float>* %pc, i64 %idx
+  %d = getelementptr <4 x float>, <4 x float>* %pd, i64 %idx
+  %la = load <4 x float>, <4 x float>* %a, align 16
+  %lb = load <4 x float>, <4 x float>* %b, align 16
+  %lc = load <4 x float>, <4 x float>* %c, align 16
+  %fma = call <4 x float> @llvm.fma.v4f32(<4 x float> %la, <4 x float> %lb, <4 x float> %lc) ; intrinsic is declared after this function; the checks expect two v16f32 calls
+  store <4 x float> %fma, <4 x float>* %d, align 16
+  ret void
+}
+
+declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
+
+; CHECK: define spir_kernel void @__vecz_v8_test_calls(ptr %pa, ptr %pb, ptr %pc, ptr %pd)
+; CHECK: entry:
+
+; It checks that the fma intrinsic of <4 x float> gets widened by a factor of 8,
+; to produce a PAIR of <16 x float>s.
+; CHECK: %[[LDA0:.+]] = load <16 x float>, ptr %{{.+}}, align 4
+; CHECK: %[[LDA1:.+]] = load <16 x float>, ptr %{{.+}}, align 4
+; CHECK: %[[LDB0:.+]] = load <16 x float>, ptr %{{.+}}, align 4
+; CHECK: %[[LDB1:.+]] = load <16 x float>, ptr %{{.+}}, align 4
+; CHECK: %[[LDC0:.+]] = load <16 x float>, ptr %{{.+}}, align 4
+; CHECK: %[[LDC1:.+]] = load <16 x float>, ptr %{{.+}}, align 4
+; CHECK: %[[FMA0:.+]] = call <16 x float> @llvm.fma.v16f32(<16 x float> %[[LDA0]], <16 x float> %[[LDB0]], <16 x float> %[[LDC0]])
+; CHECK: %[[FMA1:.+]] = call <16 x float> @llvm.fma.v16f32(<16 x float> %[[LDA1]], <16 x float> %[[LDB1]], <16 x float> %[[LDC1]])
+; CHECK: store <16 x float> %[[FMA0]], ptr %{{.+}}, align 16
+; CHECK: store <16 x float> %[[FMA1]], ptr %{{.+}}, align 16
+
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmin_vector_scalar.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmin_vector_scalar.ll
new file mode 100644
index 0000000000000..956e9855caa8e
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmin_vector_scalar.ll
@@ -0,0 +1,64 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k fmin_vector_scalar -vecz-simd-width=4 -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; Function Attrs: nounwind readnone
+declare spir_func <4 x float> @_Z4fminDv4_ff(<4 x float>, float)
+
+; Note that we have to declare the scalar version, because when we vectorize
+; an already-vector builtin, we have to scalarize it first. This is the case
+; even for Vector Widening, where we don't actually create a call to the
+; scalar version, but we retrieve the wide version via the scalar version,
+; so the declaration still needs to exist.
+
+; Function Attrs: inlinehint nounwind readnone
+declare spir_func float @_Z4fminff(float, float)
+
+; Function Attrs: inlinehint nounwind readnone
+declare spir_func <16 x float> @_Z4fminDv16_fS_(<16 x float>, <16 x float>)
+; Kernel under test: fmin of a <4 x float> loaded from %pa with a scalar float loaded from %pb.
+define spir_kernel void @fmin_vector_scalar(<4 x float>* %pa, float* %pb, <4 x float>* %pd) {
+entry:
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %a = getelementptr <4 x float>, <4 x float>* %pa, i64 %idx
+  %b = getelementptr float, float* %pb, i64 %idx
+  %d = getelementptr <4 x float>, <4 x float>* %pd, i64 %idx
+  %la = load <4 x float>, <4 x float>* %a, align 16
+  %lb = load float, float* %b, align 4
+  %res = tail call spir_func <4 x float> @_Z4fminDv4_ff(<4 x float> %la, float %lb) ; vector/scalar fmin overload
+  store <4 x float> %res, <4 x float>* %d, align 16
+  ret void
+}
+
+
+; CHECK: define spir_kernel void @__vecz_v4_fmin_vector_scalar(ptr %pa, ptr %pb, ptr %pd)
+; CHECK: entry:
+
+; It checks that the fmin builtin gets widened by a factor of 4, while its
+; scalar operand is sub-splatted to the required <16 x float>.
+; CHECK: %[[LDA:.+]] = load <16 x float>, ptr %{{.+}}
+; CHECK: %[[LDB:.+]] = load <4 x float>, ptr %{{.+}}
+; CHECK: %[[SPL:.+]] = shufflevector <4 x float> %[[LDB]], <4 x float> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
+; CHECK: %[[RES:.+]] = call <16 x float> @llvm.minnum.v16f32(<16 x float> %[[LDA]], <16 x float> %[[SPL]])
+; CHECK: store <16 x float> %[[RES]], ptr %{{.+}}
+
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd.ll
new file mode 100644
index 0000000000000..f154caf9e51b6
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd.ll
@@ -0,0 +1,57 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_calls -vecz-passes=packetizer -vecz-simd-width=8 -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+; Kernel under test: d[i] = fmuladd(a[i], b[i], c[i]) on <4 x float>, with i = get_global_id(0).
+define spir_kernel void @test_calls(<4 x float>* %pa, <4 x float>* %pb, <4 x float>* %pc, <4 x float>* %pd) {
+entry:
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %a = getelementptr <4 x float>, <4 x float>* %pa, i64 %idx
+  %b = getelementptr <4 x float>, <4 x float>* %pb, i64 %idx
+  %c = getelementptr <4 x float>, <4 x float>* %pc, i64 %idx
+  %d = getelementptr <4 x float>, <4 x float>* %pd, i64 %idx
+  %la = load <4 x float>, <4 x float>* %a, align 16
+  %lb = load <4 x float>, <4 x float>* %b, align 16
+  %lc = load <4 x float>, <4 x float>* %c, align 16
+  %fma = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %la, <4 x float> %lb, <4 x float> %lc)
+  store <4 x float> %fma, <4 x float>* %d, align 16
+  ret void
+}
+
+declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>)
+
+; CHECK: define spir_kernel void @__vecz_v8_test_calls(ptr %pa, ptr %pb, ptr %pc, ptr %pd)
+; CHECK: entry:
+
+; It checks that the fmuladd intrinsic of <4 x float> gets widened by a factor of 8,
+; to produce a PAIR of <16 x float>s.
+; CHECK: %[[LDA0:.+]] = load <16 x float>, ptr %{{.+}}, align 4
+; CHECK: %[[LDA1:.+]] = load <16 x float>, ptr %{{.+}}, align 4
+; CHECK: %[[LDB0:.+]] = load <16 x float>, ptr %{{.+}}, align 4
+; CHECK: %[[LDB1:.+]] = load <16 x float>, ptr %{{.+}}, align 4
+; CHECK: %[[LDC0:.+]] = load <16 x float>, ptr %{{.+}}, align 4
+; CHECK: %[[LDC1:.+]] = load <16 x float>, ptr %{{.+}}, align 4
+; CHECK: %[[FMA0:.+]] = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %[[LDA0]], <16 x float> %[[LDB0]], <16 x float> %[[LDC0]])
+; CHECK: %[[FMA1:.+]] = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %[[LDA1]], <16 x float> %[[LDB1]], <16 x float> %[[LDC1]])
+; CHECK: store <16 x float> %[[FMA0]], ptr %{{.+}}, align 16
+; CHECK: store <16 x float> %[[FMA1]], ptr %{{.+}}, align 16
+
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd2.ll
new file mode 100644
index 0000000000000..4f70e6f57350a
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd2.ll
@@ -0,0 +1,91 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_calls -vecz-passes=packetizer -vecz-simd-width=8 -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+; Kernel under test: fmuladd on <4 x float> where the element index is twice get_global_id(0).
+define spir_kernel void @test_calls(<4 x float>* %pa, <4 x float>* %pb, <4 x float>* %pc, <4 x float>* %pd) {
+entry:
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %idx2 = shl i64 %idx, 1 ; 2 * idx, so consecutive work-items address every other <4 x float>
+  %a = getelementptr <4 x float>, <4 x float>* %pa, i64 %idx2
+  %b = getelementptr <4 x float>, <4 x float>* %pb, i64 %idx2
+  %c = getelementptr <4 x float>, <4 x float>* %pc, i64 %idx2
+  %d = getelementptr <4 x float>, <4 x float>* %pd, i64 %idx2
+  %la = load <4 x float>, <4 x float>* %a, align 16
+  %lb = load <4 x float>, <4 x float>* %b, align 16
+  %lc = load <4 x float>, <4 x float>* %c, align 16
+  %fma = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %la, <4 x float> %lb, <4 x float> %lc)
+  store <4 x float> %fma, <4 x float>* %d, align 16
+  ret void
+}
+
+declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>)
+
+; CHECK: define spir_kernel void @__vecz_v8_test_calls(ptr %pa, ptr %pb, ptr %pc, ptr %pd)
+; CHECK: entry:
+
+; It checks that the fmuladd intrinsic of <4 x float> gets widened by a factor of 8,
+; to produce a PAIR of <16 x float>s.
+
+; It concatenates the 8 x <4 x float> inputs into 2 x <16 x float> values
+; CHECK: %[[CA0:.+]] = shufflevector <4 x float> %{{.+}}, <4 x float> %{{.+}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK: %[[CA1:.+]] = shufflevector <4 x float> %{{.+}}, <4 x float> %{{.+}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK: %[[CA2:.+]] = shufflevector <4 x float> %{{.+}}, <4 x float> %{{.+}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK: %[[CA3:.+]] = shufflevector <4 x float> %{{.+}}, <4 x float> %{{.+}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK: %[[SA0:.+]] = shufflevector <8 x float> %[[CA0]], <8 x float> %[[CA1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK: %[[SA1:.+]] = shufflevector <8 x float> %[[CA2]], <8 x float> %[[CA3]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+
+; CHECK: %[[CB0:.+]] = shufflevector <4 x float> %{{.+}}, <4 x float> %{{.+}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK: %[[CB1:.+]] = shufflevector <4 x float> %{{.+}}, <4 x float> %{{.+}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK: %[[CB2:.+]] = shufflevector <4 x float> %{{.+}}, <4 x float> %{{.+}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK: %[[CB3:.+]] = shufflevector <4 x float> %{{.+}}, <4 x float> %{{.+}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK: %[[SB0:.+]] = shufflevector <8 x float> %[[CB0]], <8 x float> %[[CB1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK: %[[SB1:.+]] = shufflevector <8 x float> %[[CB2]], <8 x float> %[[CB3]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+
+; CHECK: %[[CC0:.+]] = shufflevector <4 x float> %{{.+}}, <4 x float> %{{.+}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK: %[[CC1:.+]] = shufflevector <4 x float> %{{.+}}, <4 x float> %{{.+}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK: %[[CC2:.+]] = shufflevector <4 x float> %{{.+}}, <4 x float> %{{.+}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK: %[[CC3:.+]] = shufflevector <4 x float> %{{.+}}, <4 x float> %{{.+}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK: %[[SC0:.+]] = shufflevector <8 x float> %[[CC0]], <8 x float> %[[CC1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK: %[[SC1:.+]] = shufflevector <8 x float> %[[CC2]], <8 x float> %[[CC3]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+
+; CHECK: %[[FMA0:.+]] = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %[[SA0]], <16 x float> %[[SB0]], <16 x float> %[[SC0]])
+; CHECK: %[[FMA1:.+]] = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %[[SA1]], <16 x float> %[[SB1]], <16 x float> %[[SC1]])
+
+; It splits the 2 x <16 x float> results into 8 <4 x float> values
+; CHECK: %[[RES0:.+]] = shufflevector <16 x float> %[[FMA0]], <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK: %[[RES1:.+]] = shufflevector <16 x float> %[[FMA0]], <16 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK: %[[RES2:.+]] = shufflevector <16 x float> %[[FMA0]], <16 x float> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK: %[[RES3:.+]] = shufflevector <16 x float> %[[FMA0]], <16 x float> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; CHECK: %[[RES4:.+]] = shufflevector <16 x float> %[[FMA1]], <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK: %[[RES5:.+]] = shufflevector <16 x float> %[[FMA1]], <16 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK: %[[RES6:.+]] = shufflevector <16 x float> %[[FMA1]], <16 x float> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK: %[[RES7:.+]] = shufflevector <16 x float> %[[FMA1]], <16 x float> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; CHECK: store <4 x float> %[[RES0]], ptr %{{.+}}, align 16
+; CHECK: store <4 x float> %[[RES1]], ptr %{{.+}}, align 16
+; CHECK: store <4 x float> %[[RES2]], ptr %{{.+}}, align 16
+; CHECK: store <4 x float> %[[RES3]], ptr %{{.+}}, align 16
+; CHECK: store <4 x float> %[[RES4]], ptr %{{.+}}, align 16
+; CHECK: store <4 x float> %[[RES5]], ptr %{{.+}}, align 16
+; CHECK: store <4 x float> %[[RES6]], ptr %{{.+}}, align 16
+; CHECK: store <4 x float> %[[RES7]], ptr %{{.+}}, align 16
+
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd_phi.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd_phi.ll
new file mode 100644
index 0000000000000..b45474767d66e
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd_phi.ll
@@ -0,0 +1,74 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_calls -vecz-passes=packetizer -vecz-simd-width=8 -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+; Kernel under test: a 10-iteration loop accumulating fmuladd through a <4 x float> phi.
+define spir_kernel void @test_calls(<4 x float>* %pa, <4 x float>* %pb, <4 x float>* %pc, <4 x float>* %pd) {
+entry:
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %a = getelementptr <4 x float>, <4 x float>* %pa, i64 %idx
+  %b = getelementptr <4 x float>, <4 x float>* %pb, i64 %idx
+  %c = getelementptr <4 x float>, <4 x float>* %pc, i64 %idx
+  %d = getelementptr <4 x float>, <4 x float>* %pd, i64 %idx
+  %la = load <4 x float>, <4 x float>* %a, align 16
+  %lb = load <4 x float>, <4 x float>* %b, align 16
+  %lc = load <4 x float>, <4 x float>* %c, align 16
+  br label %loop
+
+loop:
+  %n = phi i32 [ %dec, %loop ], [ 10, %entry ] ; trip counter, starts at 10 and counts down
+  %acc = phi <4 x float> [ %fma, %loop ], [ %la, %entry ] ; accumulator, seeded with the value loaded from %a
+  %fma = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %acc, <4 x float> %lb, <4 x float> %lc)
+  %dec = sub i32 %n, 1
+  %cmp = icmp ne i32 %dec, 0
+  br i1 %cmp, label %loop, label %end
+
+end:
+  store <4 x float> %fma, <4 x float>* %d, align 16
+  ret void
+}
+
+declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>)
+
+; CHECK: define spir_kernel void @__vecz_v8_test_calls(ptr %pa, ptr %pb, ptr %pc, ptr %pd)
+; CHECK: entry:
+
+; It checks that the fmuladd intrinsic of <4 x float> gets widened by a factor of 8,
+; to produce a PAIR of <16 x float>s.
+; CHECK: %[[LDA0:.+]] = load <16 x float>, ptr %{{.+}}, align 4
+; CHECK: %[[LDA1:.+]] = load <16 x float>, ptr %{{.+}}, align 4
+; CHECK: %[[LDB0:.+]] = load <16 x float>, ptr %{{.+}}, align 4
+; CHECK: %[[LDB1:.+]] = load <16 x float>, ptr %{{.+}}, align 4
+; CHECK: %[[LDC0:.+]] = load <16 x float>, ptr %{{.+}}, align 4
+; CHECK: %[[LDC1:.+]] = load <16 x float>, ptr %{{.+}}, align 4
+
+; CHECK: loop:
+; CHECK: %[[ACC0:.+]] = phi <16 x float> [ %[[FMA0:.+]], %loop ], [ %[[LDA0]], %entry ]
+; CHECK: %[[ACC1:.+]] = phi <16 x float> [ %[[FMA1:.+]], %loop ], [ %[[LDA1]], %entry ]
+
+; CHECK: %[[FMA0]] = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %[[ACC0]], <16 x float> %[[LDB0]], <16 x float> %[[LDC0]])
+; CHECK: %[[FMA1]] = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %[[ACC1]], <16 x float> %[[LDB1]], <16 x float> %[[LDC1]])
+
+; CHECK: end:
+; CHECK: store <16 x float> %[[FMA0]], ptr %{{.+}}, align 16
+; CHECK: store <16 x float> %[[FMA1]], ptr %{{.+}}, align 16
+
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshl.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshl.ll
new file mode 100644
index 0000000000000..559517fb8a8c1
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshl.ll
@@ -0,0 +1,48 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_calls -vecz-passes=packetizer -vecz-simd-width=16 -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+; Kernel under test: d[i] = fshl(a[i], b[i], 4) on i8, with i = get_global_id(0).
+define spir_kernel void @test_calls(i8* %pa, i8* %pb, i8* %pd) {
+entry:
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %a = getelementptr i8, i8* %pa, i64 %idx
+  %b = getelementptr i8, i8* %pb, i64 %idx
+  %d = getelementptr i8, i8* %pd, i64 %idx
+  %la = load i8, i8* %a, align 16
+  %lb = load i8, i8* %b, align 16
+  %res = tail call i8 @llvm.fshl.i8(i8 %la, i8 %lb, i8 4) ; funnel shift left by a constant amount of 4
+  store i8 %res, i8* %d, align 16
+  ret void
+}
+
+declare i8 @llvm.fshl.i8(i8, i8, i8)
+
+; CHECK: define spir_kernel void @__vecz_v16_test_calls(ptr %pa, ptr %pb, ptr %pd)
+; CHECK: entry:
+
+; It checks that the fshl intrinsic of i8 gets widened by a factor of 16
+; CHECK: %[[LDA:.+]] = load <16 x i8>, ptr %{{.+}}
+; CHECK: %[[LDB:.+]] = load <16 x i8>, ptr %{{.+}}
+; CHECK: %[[RES:.+]] = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %[[LDA]], <16 x i8> %[[LDB]], <16 x i8> <{{(i8 4, )+i8 4}}>)
+; CHECK: store <16 x i8> %[[RES]], ptr %{{.+}}
+
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshr.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshr.ll
new file mode 100644
index 0000000000000..334b68988fdd2
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshr.ll
@@ -0,0 +1,48 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_calls -vecz-passes=packetizer -vecz-simd-width=16 -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+; Kernel under test: d[i] = fshr(a[i], b[i], 2) on i8, with i = get_global_id(0).
+define spir_kernel void @test_calls(i8* %pa, i8* %pb, i8* %pd) {
+entry:
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %a = getelementptr i8, i8* %pa, i64 %idx
+  %b = getelementptr i8, i8* %pb, i64 %idx
+  %d = getelementptr i8, i8* %pd, i64 %idx
+  %la = load i8, i8* %a, align 16
+  %lb = load i8, i8* %b, align 16
+  %res = tail call i8 @llvm.fshr.i8(i8 %la, i8 %lb, i8 2) ; funnel shift right by a constant amount of 2
+  store i8 %res, i8* %d, align 16
+  ret void
+}
+
+declare i8 @llvm.fshr.i8(i8, i8, i8)
+
+; CHECK: define spir_kernel void @__vecz_v16_test_calls(ptr %pa, ptr %pb, ptr %pd)
+; CHECK: entry:
+
+; It checks that the fshr intrinsic of i8 gets widened by a factor of 16
+; CHECK: %[[LDA:.+]] = load <16 x i8>, ptr %{{.+}}
+; CHECK: %[[LDB:.+]] = load <16 x i8>, ptr %{{.+}}
+; CHECK: %[[RES:.+]] = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %[[LDA]], <16 x i8> %[[LDB]], <16 x i8> <{{(i8 2, )+i8 2}}>)
+; CHECK: store <16 x i8> %[[RES]], ptr %{{.+}}
+
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_shufflevector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_shufflevector.ll
new file mode 100644
index 0000000000000..950704e8c4b7e
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_shufflevector.ll
@@ -0,0 +1,43 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k widen_shufflevector -vecz-simd-width=2 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+; Kernel under test. NOTE(review): attribute groups #0, #1 and #2 are referenced but not defined in this file.
+; Function Attrs: nounwind
+define spir_kernel void @widen_shufflevector(<2 x float> addrspace(1)* %a, <2 x float> addrspace(1)* %b, <4 x float> addrspace(1)* %out) #0 {
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2 ; get_global_id(0)
+  %arrayidxa = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i64 %call
+  %arrayidxb = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %b, i64 %call
+  %la = load <2 x float>, <2 x float> addrspace(1)* %arrayidxa, align 4
+  %lb = load <2 x float>, <2 x float> addrspace(1)* %arrayidxb, align 4
+  %shuffle = shufflevector <2 x float> %la, <2 x float> %lb, <4 x i32> <i32 0, i32 3, i32 1, i32 2> ; selects la[0], lb[1], la[1], lb[0]
+  %arrayidx1 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call
+  store <4 x float> %shuffle, <4 x float> addrspace(1)* %arrayidx1, align 1
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v2_widen_shufflevector
+; CHECK: %[[LDA:.+]] = load <4 x float>, ptr addrspace(1) %
+; CHECK: %[[LDB:.+]] = load <4 x float>, ptr addrspace(1) %
+; CHECK: %[[SHF:.+]] = shufflevector <4 x float> %[[LDA]], <4 x float> %[[LDB]], <8 x i32> <i32 0, i32 5, i32 1, i32 4, i32 2, i32 7, i32 3, i32 6>
+; CHECK: store <8 x float> %[[SHF]], ptr addrspace(1) %
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_sqrt.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_sqrt.ll
new file mode 100644
index 0000000000000..7bd56cb87287c
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_sqrt.ll
@@ -0,0 +1,53 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_sqrt -vecz-simd-width=4 -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func float @_Z4sqrtf(float)
+declare spir_func <2 x float> @_Z4sqrtDv2_f(<2 x float>)
+declare spir_func <4 x float> @_Z4sqrtDv4_f(<4 x float>)
+declare spir_func <8 x float> @_Z4sqrtDv8_f(<8 x float>)
+declare spir_func <16 x float> @_Z4sqrtDv16_f(<16 x float>)
+; Kernel under test: calls both the <2 x float> and the <4 x float> sqrt builtins in one kernel.
+define spir_kernel void @test_sqrt(<2 x float> addrspace(1)* %in2, <2 x float> addrspace(1)* %out2,
+                                   <4 x float> addrspace(1)* %in4, <4 x float> addrspace(1)* %out4) {
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %arrayin2 = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %in2, i64 %gid
+  %arrayin4 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in4, i64 %gid
+  %arrayout2 = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %out2, i64 %gid
+  %arrayout4 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out4, i64 %gid
+  %ld2 = load <2 x float>, <2 x float> addrspace(1)* %arrayin2, align 16
+  %ld4 = load <4 x float>, <4 x float> addrspace(1)* %arrayin4, align 16
+  %sqrt2 = call spir_func <2 x float> @_Z4sqrtDv2_f(<2 x float> %ld2) ; 2-element sqrt builtin
+  %sqrt4 = call spir_func <4 x float> @_Z4sqrtDv4_f(<4 x float> %ld4) ; 4-element sqrt builtin
+  store <2 x float> %sqrt2, <2 x float> addrspace(1)* %arrayout2, align 16
+  store <4 x float> %sqrt4, <4 x float> addrspace(1)* %arrayout4, align 16
+  ret void
+}
+
+; The purpose of this test is to check that the vector context is able to
+; supply the packetizer with two versions of the builtin vectorized to two
+; different widths.
+;
+; CHECK: define spir_kernel void @__vecz_v4_test_sqrt
+; CHECK: call spir_func <8 x float> @_Z4sqrtDv8_f(<8 x float> %{{.*}})
+; CHECK: call spir_func <16 x float> @_Z4sqrtDv16_f(<16 x float> %{{.*}})
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/alloca_alias.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/alloca_alias.ll
new file mode 100644
index 0000000000000..3153b90afd567
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/alloca_alias.ll
@@ -0,0 +1,69 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+%struct.testStruct = type { <3 x i32> }
+; Kernel under test: writes element 1 of a private array, then reads element %index, which may be the same element.
+define spir_kernel void @alloca_alias(i32 addrspace(1)* %out, i32 %index) {
+entry:
+  %myStructs = alloca [2 x %struct.testStruct], align 16 ; private array of two structs
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %0 = bitcast [2 x %struct.testStruct]* %myStructs to i8*
+  call void @llvm.lifetime.start.p0i8(i64 32, i8* nonnull %0)
+  %1 = trunc i64 %call to i32
+  %conv = add nuw nsw i32 %1, 2
+  %2 = insertelement <4 x i32> undef, i32 %conv, i64 0
+  %conv2 = add nuw nsw i32 %1, 3
+  %3 = insertelement <4 x i32> %2, i32 %conv2, i64 1
+  %4 = insertelement <4 x i32> %3, i32 %1, i64 2 ; %4 = <id + 2, id + 3, id, undef>
+  %i = getelementptr inbounds [2 x %struct.testStruct], [2 x %struct.testStruct]* %myStructs, i64 0, i64 1, i32 0
+  %storetmp8 = bitcast <3 x i32>* %i to <4 x i32>*
+  store <4 x i32> %4, <4 x i32>* %storetmp8, align 16 ; store to array element 1 through a <4 x i32> view
+  %idxprom = sext i32 %index to i64
+  %i9 = getelementptr inbounds [2 x %struct.testStruct], [2 x %struct.testStruct]* %myStructs, i64 0, i64 %idxprom, i32 0
+  %castToVec410 = bitcast <3 x i32>* %i9 to <4 x i32>*
+  %loadVec411 = load <4 x i32>, <4 x i32>* %castToVec410, align 16 ; load from the element chosen by the %index argument
+  %extractVec12 = shufflevector <4 x i32> %loadVec411, <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
+  %5 = mul i64 %call, 3 ; three i32 results are written per work-item
+  %vstore_base = getelementptr i32, i32 addrspace(1)* %out, i64 %5
+  %vstore_extract = extractelement <3 x i32> %extractVec12, i32 0
+  %6 = getelementptr i32, i32 addrspace(1)* %vstore_base, i32 0
+  store i32 %vstore_extract, i32 addrspace(1)* %6, align 4
+  %vstore_extract1 = extractelement <3 x i32> %extractVec12, i32 1
+  %7 = getelementptr i32, i32 addrspace(1)* %vstore_base, i32 1
+  store i32 %vstore_extract1, i32 addrspace(1)* %7, align 4
+  %vstore_extract2 = extractelement <3 x i32> %extractVec12, i32 2
+  %8 = getelementptr i32, i32 addrspace(1)* %vstore_base, i32 2
+  store i32 %vstore_extract2, i32 addrspace(1)* %8, align 4
+  call void @llvm.lifetime.end.p0i8(i64 32, i8* nonnull %0)
+  ret void
+}
+
+declare void @llvm.lifetime.start.p0i8(i64 immarg, i8*)
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+declare spir_func void @_Z7vstore3Dv3_imPU3AS1i(<3 x i32>, i64, i32 addrspace(1)*)
+
+declare void @llvm.lifetime.end.p0i8(i64 immarg, i8*)
+; Expect the alloca to be widened to one copy per lane, with no further alloca after it.
+; CHECK: spir_kernel void @__vecz_v4_alloca_alias
+; CHECK: alloca [4 x [2 x %struct.testStruct{{.*}}]]
+; CHECK-NOT: = alloca {{.*}}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/arm_neon_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/arm_neon_store.ll
new file mode 100644
index 0000000000000..085479c8e79c8
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/arm_neon_store.ll
@@ -0,0 +1,65 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; REQUIRES: arm
+
+; RUN: %veczc -k short3_char3_codegen -vecz-simd-width=4 -S < %s | %filecheck %s
+
+; ModuleID = 'Unknown buffer'
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv7-unknown-linux-gnueabihf"
+
+; Kernel under test (attrs: nounwind): vload3 of <3 x i8> -> convert_short3 -> vstore3 of <3 x i16>, indexed by global id 0.
+define spir_kernel void @short3_char3_codegen(i8 addrspace(1)* %src, i16 addrspace(1)* %dest) #0 !kernel_arg_addr_space !2 !kernel_arg_access_qual !3 !kernel_arg_type !4 !kernel_arg_base_type !4 !kernel_arg_type_qual !5 {
+entry:
+  %call = call spir_func i32 @_Z13get_global_idj(i32 0) #3 ; index shared by the load and the store
+  %call1 = call spir_func <3 x i8> @_Z6vload3jPU3AS1Kc(i32 %call, i8 addrspace(1)* %src) #3 ; vload3 from %src
+  %call3 = call spir_func <3 x i16> @_Z14convert_short3Dv3_c(<3 x i8> %call1) #3 ; <3 x i8> -> <3 x i16>
+  call spir_func void @_Z7vstore3Dv3_sjPU3AS1s(<3 x i16> %call3, i32 %call, i16 addrspace(1)* %dest) #3 ; the test expects a NEON vst3 intrinsic call in the output
+  ret void
+}
+
+declare spir_func i32 @_Z13get_global_idj(i32) #1
+
+declare spir_func <3 x i8> @_Z6vload3jPU3AS1Kc(i32, i8 addrspace(1)*) #1
+
+declare spir_func <3 x i16> @_Z14convert_short3Dv3_c(<3 x i8>) #1
+
+declare spir_func void @_Z7vstore3Dv3_sjPU3AS1s(<3 x i16>, i32, i16 addrspace(1)*) #1
+
+; Function Attrs: inlinehint nounwind
+declare spir_func signext i16 @_Z13convert_shortc(i8 signext) #2
+
+; Function Attrs: inlinehint nounwind
+declare spir_func <16 x i16> @_Z15convert_short16Dv16_c(<16 x i8>) #2
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { inlinehint nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nobuiltin nounwind }
+
+!opencl.spir.version = !{!0, !0, !0, !0, !0}
+!opencl.ocl.version = !{!1, !1, !1, !1, !1}
+
+!0 = !{i32 2, i32 0}
+!1 = !{i32 1, i32 2}
+!2 = !{i32 1, i32 1}
+!3 = !{!"none", !"none"}
+!4 = !{!"char*", !"short*"}
+!5 = !{!"", !""}
+
+; Assert call to neon intrinsic exists
+; CHECK: call void @llvm.arm.neon.vst3.p1.v4i16
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/async_workgroup_copy_uniform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/async_workgroup_copy_uniform.ll
new file mode 100644
index 0000000000000..84ab91fd86cca
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/async_workgroup_copy_uniform.ll
@@ -0,0 +1,60 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test -vecz-simd-width=4 -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+%opencl.event_t = type opaque
+
+; Kernel under test: one async_work_group_copy call followed by one wait_group_events call; %1 (the global id) is never used.
+define spir_kernel void @test(i32 addrspace(1)* %input, i32 addrspace(3)* %output, i32 addrspace(1)* %elements) {
+  %ev = alloca %opencl.event_t*, align 8
+  %1 = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %2 = call spir_func i64 @_Z12get_group_idj(i32 0)
+  %3 = call spir_func i64 @_Z14get_local_sizej(i32 0)
+  %4 = mul i64 %3, %2 ; local size * group id, used as the element offset into %input
+  %5 = getelementptr inbounds i32, i32 addrspace(1)* %input, i64 %4
+  %6 = mul i64 %3, %2 ; same product, recomputed for %output
+  %7 = getelementptr inbounds i32, i32 addrspace(3)* %output, i64 %6
+  %8 = getelementptr inbounds i32, i32 addrspace(1)* %elements, i64 %2
+  %9 = load i32, i32 addrspace(1)* %8, align 4 ; third copy argument comes from %elements[group id]
+  %10 = sext i32 %9 to i64
+  %11 = load %opencl.event_t*, %opencl.event_t** %ev, align 8 ; nothing has been stored to %ev before this load
+  %12 = call spir_func %opencl.event_t* @_Z21async_work_group_copyPU3AS1iPKU3AS3im9ocl_event(i32 addrspace(1)* %5, i32 addrspace(3)* %7, i64 %10, %opencl.event_t* %11)
+  %13 = trunc i64 %3 to i32
+  call spir_func void @_Z17wait_group_eventsiP9ocl_event(i32 %13, %opencl.event_t** nonnull %ev)
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func i64 @_Z12get_group_idj(i32)
+declare spir_func i64 @_Z14get_local_sizej(i32)
+declare spir_func %opencl.event_t* @_Z21async_work_group_copyPU3AS1iPKU3AS3im9ocl_event(i32 addrspace(1)*, i32 addrspace(3)*, i64, %opencl.event_t*)
+declare spir_func void @_Z17wait_group_eventsiP9ocl_event(i32, %opencl.event_t**)
+
+; CHECK: define spir_kernel void @__vecz_v4_test
+
+; Check if we have one and exactly one call to async_work_group_copy
+; CHECK: call spir_func ptr @_Z21async_work_group_copyPU3AS1iPKU3AS3im9ocl_event
+; CHECK-NOT: async_work_group_copy
+
+; Check if we have one and exactly one call to wait_group_events
+; CHECK: call spir_func void @_Z17wait_group_eventsiP9ocl_event
+; CHECK-NOT: wait_group_events
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomic_cmpxchg.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomic_cmpxchg.ll
new file mode 100644
index 0000000000000..1dc3694249d2f
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomic_cmpxchg.ll
@@ -0,0 +1,83 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k atomic_rmw -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @atomic_cmpxchg_builtin(i32 addrspace(1)* %counter, i32 addrspace(1)* %out) { ; cmpxchg retry loop on %counter; this file's veczc invocation names only atomic_rmw
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %conv = trunc i64 %call to i32
+  br label %do.body
+
+do.body:                                          ; preds = %do.body, %entry
+  %sub = add nsw i32 %conv, -1
+  %0 = cmpxchg i32 addrspace(1)* %counter, i32 %sub, i32 %conv seq_cst acquire ; compare against gid-1, swap in gid
+  %1 = extractvalue { i32, i1 } %0, 0 ; field 0 of the cmpxchg result: the value loaded from %counter
+  %sub2 = add nsw i32 %conv, -1
+  %cmp = icmp eq i32 %1, %sub2
+  br i1 %cmp, label %do.end, label %do.body ; leave the loop once the loaded value equals gid-1
+
+do.end:                                           ; preds = %do.body
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store volatile i32 %1, i32 addrspace(1)* %arrayidx, align 4 ; out[gid] = last loaded counter value
+  ret void
+}
+
+define spir_kernel void @atomic_atomicrmw_builtin(i32 addrspace(1)* %counter, i32 addrspace(1)* %out) { ; atomicrmw nand retry loop on %counter; this file's veczc invocation names only atomic_rmw
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %conv = trunc i64 %call to i32
+  br label %do.body
+
+do.body:                                          ; preds = %do.body, %entry
+  %sub = add nsw i32 %conv, -1
+  %0 = atomicrmw nand i32 addrspace(1)* %counter, i32 %sub  acq_rel ; %0 is the value held before the nand with gid-1
+  %sub2 = add nsw i32 %conv, -1
+  %cmp = icmp eq i32 %0, %sub2
+  br i1 %cmp, label %do.end, label %do.body ; leave the loop once the previous value equals gid-1
+
+do.end:                                           ; preds = %do.body
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store volatile i32 %0, i32 addrspace(1)* %arrayidx, align 4 ; out[gid] = last previous counter value
+  ret void
+}
+
+define spir_kernel void @atomic_rmw(i32 addrspace(1)* %counter2, i32 addrspace(1)* %out) { ; straight-line kernel named by this file's -k option
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %conv = trunc i64 %call to i32
+  %0 = atomicrmw add i32 addrspace(1)* %counter2, i32 1 seq_cst ; adds 1 to %counter2; %0 is the previous value
+  %idxprom = sext i32 %0 to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %conv, i32 addrspace(1)* %arrayidx ; out[previous counter value] = gid
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; We no longer support instantiating atomic instructions in diverged blocks,
+; since they require masking. FileCheck does not support comments, so the CHECKs
+; were removed or reversed below. NOTE(review): -k selects only atomic_rmw, so the CHECK-NOT looks vacuous - confirm.
+; CHECK-NOT: define spir_kernel void @__vecz_v4_atomic_cmpxchg_builtin
+; cmpxchg i32 addrspace(1)* %counter, i32 %{{.+}}, i32 %{{.+}} seq_cst acquire
+; cmpxchg i32 addrspace(1)* %counter, i32 %{{.+}}, i32 %{{.+}} seq_cst acquire
+; cmpxchg i32 addrspace(1)* %counter, i32 %{{.+}}, i32 %{{.+}} seq_cst acquire
+; cmpxchg i32 addrspace(1)* %counter, i32 %{{.+}}, i32 %{{.+}} seq_cst acquire
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomicrmw.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomicrmw.ll
new file mode 100644
index 0000000000000..bb8d22596f31e
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomicrmw.ll
@@ -0,0 +1,83 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k atomic_rmw -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @atomic_cmpxchg_builtin(i32 addrspace(1)* %counter, i32 addrspace(1)* %out) { ; cmpxchg retry loop on %counter; not the kernel named by this file's -k option
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %conv = trunc i64 %call to i32
+  br label %do.body
+
+do.body:                                          ; preds = %do.body, %entry
+  %sub = add nsw i32 %conv, -1
+  %0 = cmpxchg i32 addrspace(1)* %counter, i32 %sub, i32 %conv seq_cst acquire ; compare against gid-1, swap in gid
+  %1 = extractvalue { i32, i1 } %0, 0 ; field 0 of the cmpxchg result: the value loaded from %counter
+  %sub2 = add nsw i32 %conv, -1
+  %cmp = icmp eq i32 %1, %sub2
+  br i1 %cmp, label %do.end, label %do.body ; exit when the loaded value equals gid-1, otherwise retry
+
+do.end:                                           ; preds = %do.body
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store volatile i32 %1, i32 addrspace(1)* %arrayidx, align 4 ; out[gid] = last loaded counter value
+  ret void
+}
+
+define spir_kernel void @atomic_atomicrmw_builtin(i32 addrspace(1)* %counter, i32 addrspace(1)* %out) { ; atomicrmw nand retry loop; the file's final check expects no __vecz_v4_ version of this kernel
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %conv = trunc i64 %call to i32
+  br label %do.body
+
+do.body:                                          ; preds = %do.body, %entry
+  %sub = add nsw i32 %conv, -1
+  %0 = atomicrmw nand i32 addrspace(1)* %counter, i32 %sub  acq_rel ; %0 is the value held before the nand with gid-1
+  %sub2 = add nsw i32 %conv, -1
+  %cmp = icmp eq i32 %0, %sub2
+  br i1 %cmp, label %do.end, label %do.body ; exit when the previous value equals gid-1, otherwise retry
+
+do.end:                                           ; preds = %do.body
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store volatile i32 %0, i32 addrspace(1)* %arrayidx, align 4 ; out[gid] = last previous counter value
+  ret void
+}
+
+define spir_kernel void @atomic_rmw(i32 addrspace(1)* %counter2, i32 addrspace(1)* %out) { ; straight-line kernel named by this file's -k option
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %conv = trunc i64 %call to i32
+  %0 = atomicrmw add i32 addrspace(1)* %counter2, i32 1 seq_cst ; adds 1 to %counter2; %0 is the previous value
+  %idxprom = sext i32 %0 to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %conv, i32 addrspace(1)* %arrayidx ; out[previous counter value] = gid
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; We no longer support instantiating atomic instructions in diverged blocks,
+; since they require masking. FileCheck does not support comments, so the CHECKs
+; were removed or reversed below. NOTE(review): -k selects only atomic_rmw, so the CHECK-NOT looks vacuous - confirm.
+; CHECK-NOT: define spir_kernel void @__vecz_v4_atomic_atomicrmw_builtin
+; atomicrmw nand i32 addrspace(1)* %counter, i32 %{{.+}} acq_rel
+; atomicrmw nand i32 addrspace(1)* %counter, i32 %{{.+}} acq_rel
+; atomicrmw nand i32 addrspace(1)* %counter, i32 %{{.+}} acq_rel
+; atomicrmw nand i32 addrspace(1)* %counter, i32 %{{.+}} acq_rel
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomicrmw_uniform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomicrmw_uniform.ll
new file mode 100644
index 0000000000000..260f96dfc6897
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomicrmw_uniform.ll
@@ -0,0 +1,81 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k atomic_rmw -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @atomic_cmpxchg_builtin(i32 addrspace(1)* %counter, i32 addrspace(1)* %out) { ; cmpxchg retry loop on %counter; not the kernel named by this file's -k option
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %conv = trunc i64 %call to i32
+  br label %do.body
+
+do.body:                                          ; preds = %do.body, %entry
+  %sub = add nsw i32 %conv, -1
+  %0 = cmpxchg i32 addrspace(1)* %counter, i32 %sub, i32 %conv seq_cst acquire ; compare against gid-1, swap in gid
+  %1 = extractvalue { i32, i1 } %0, 0 ; field 0 of the cmpxchg result: the value loaded from %counter
+  %sub2 = add nsw i32 %conv, -1
+  %cmp = icmp eq i32 %1, %sub2
+  br i1 %cmp, label %do.end, label %do.body ; exit when the loaded value equals gid-1, otherwise retry
+
+do.end:                                           ; preds = %do.body
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store volatile i32 %1, i32 addrspace(1)* %arrayidx, align 4 ; out[gid] = last loaded counter value
+  ret void
+}
+
+define spir_kernel void @atomic_atomicrmw_builtin(i32 addrspace(1)* %counter, i32 addrspace(1)* %out) { ; atomicrmw nand retry loop on %counter; not the kernel named by this file's -k option
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %conv = trunc i64 %call to i32
+  br label %do.body
+
+do.body:                                          ; preds = %do.body, %entry
+  %sub = add nsw i32 %conv, -1
+  %0 = atomicrmw nand i32 addrspace(1)* %counter, i32 %sub  acq_rel ; %0 is the value held before the nand with gid-1
+  %sub2 = add nsw i32 %conv, -1
+  %cmp = icmp eq i32 %0, %sub2
+  br i1 %cmp, label %do.end, label %do.body ; exit when the previous value equals gid-1, otherwise retry
+
+do.end:                                           ; preds = %do.body
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store volatile i32 %0, i32 addrspace(1)* %arrayidx, align 4 ; out[gid] = last previous counter value
+  ret void
+}
+
+define spir_kernel void @atomic_rmw(i32 addrspace(1)* %counter2, i32 addrspace(1)* %out) { ; kernel named by -k; the test expects four copies of the atomicrmw in the width-4 output
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %conv = trunc i64 %call to i32
+  %0 = atomicrmw add i32 addrspace(1)* %counter2, i32 1 seq_cst ; pointer and operand do not depend on the global id; %0 is the previous value
+  %idxprom = sext i32 %0 to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %conv, i32 addrspace(1)* %arrayidx ; out[previous counter value] = gid
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CHECK: define spir_kernel void @__vecz_v4_atomic_rmw
+; CHECK: atomicrmw add ptr addrspace(1) %counter2, i32 1 seq_cst
+; CHECK: atomicrmw add ptr addrspace(1) %counter2, i32 1 seq_cst
+; CHECK: atomicrmw add ptr addrspace(1) %counter2, i32 1 seq_cst
+; CHECK: atomicrmw add ptr addrspace(1) %counter2, i32 1 seq_cst
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/basic_mem2reg.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/basic_mem2reg.ll
new file mode 100644
index 0000000000000..a8377695cc81c
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/basic_mem2reg.ll
@@ -0,0 +1,64 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test -vecz-passes="function(mem2reg),vecz-mem2reg" -vecz-simd-width=4 -vecz-handle-declaration-only-calls -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test(i32 %a, i32 %b, i32* %c, float %rf) { ; three stack slots; per the expected output only %e keeps its alloca
+entry:
+  %d = alloca i32
+  %e = alloca i32
+  %f = alloca float
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %sum = add i32 %a, %b
+  store i32 %sum, i32* %d, align 4
+  store i32 %sum, i32* %e, align 4
+  %call = call spir_func i32 @foo(i32* %e) ; %e's address is passed to a declaration-only callee
+  %d.load = load i32, i32* %d, align 4 ; expected output stores %sum directly instead of this load
+  %e.load = load i32, i32* %e, align 4
+  %c0 = getelementptr i32, i32* %c, i64 %gid
+  store i32 %d.load, i32* %c0, align 4
+  %c1 = getelementptr i32, i32* %c0, i64 1
+  store i32 %e.load, i32* %c1, align 4
+  store float %rf, float* %f
+  %ri = bitcast float* %f to i32* ; float slot read back as i32; expected output is a value bitcast of %rf
+  %ri.load = load i32, i32* %ri, align 4
+  %c2 = getelementptr i32, i32* %c1, i64 2
+  store i32 %ri.load, i32* %c2, align 4
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func i32 @foo(i32*)
+
+; CHECK: define spir_kernel void @__vecz_v4_test(i32 %a, i32 %b, ptr %c, float %rf)
+; CHECK: entry:
+; CHECK: %e = alloca i32
+; CHECK: %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: %sum = add i32 %a, %b
+; CHECK: store i32 %sum, ptr %e
+; CHECK: %call = call spir_func i32 @foo(ptr{{.*}} %e)
+; CHECK: %e.load = load i32, ptr %e
+; CHECK: %c0 = getelementptr i32, ptr %c, i64 %gid
+; CHECK: store i32 %sum, ptr %c0
+; CHECK: %c1 = getelementptr i32, ptr %c0, i64 1
+; CHECK: store i32 %e.load, ptr %c1
+; CHECK: %0 = bitcast float %rf to i32
+; CHECK: %c2 = getelementptr i32, ptr %c1, i64 2
+; CHECK: store i32 %0, ptr %c2, align 4
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/bitcast_function.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/bitcast_function.ll
new file mode 100644
index 0000000000000..3657705a49308
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/bitcast_function.ll
@@ -0,0 +1,79 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test -vecz-simd-width=4 -vecz-passes=cfg-convert,packetizer -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Kernel under test: branches on gid % 16 == 1 and, on the taken side, calls @foo through a constant-expression bitcast of its function type.
+define spir_kernel void @test(i32* %in, i32* %out) {
+entry:
+  %in.addr = alloca i32*, align 8
+  %out.addr = alloca i32*, align 8
+  %gid = alloca i64, align 8
+  store i32* %in, i32** %in.addr, align 8
+  store i32* %out, i32** %out.addr, align 8
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  store i64 %call, i64* %gid, align 8
+  %0 = load i64, i64* %gid, align 8
+  %rem = urem i64 %0, 16
+  %cmp = icmp eq i64 %rem, 1
+  br i1 %cmp, label %if.then, label %if.else ; condition is computed from the global id
+
+if.then:                                          ; preds = %entry
+  %1 = load i64, i64* %gid, align 8
+  %2 = load i32*, i32** %in.addr, align 8
+  %arrayidx = getelementptr inbounds i32, i32* %2, i64 %1
+  %3 = load i32, i32* %arrayidx, align 4
+  %4 = load i64, i64* %gid, align 8
+  %5 = load i32*, i32** %in.addr, align 8
+  %arrayidx1 = getelementptr inbounds i32, i32* %5, i64 %4
+  %call2 = call spir_func i32 bitcast (i32 (i32, i32 addrspace(1)*)* @foo to i32 (i32, i32*)*)(i32 %3, i32* %arrayidx1) ; callee is @foo cast from an addrspace(1) pointer parameter to a plain pointer parameter; expected output calls __vecz_b_masked_foo
+  %6 = load i64, i64* %gid, align 8
+  %7 = load i32*, i32** %out.addr, align 8
+  %arrayidx3 = getelementptr inbounds i32, i32* %7, i64 %6
+  store i32 %call2, i32* %arrayidx3, align 4 ; out[gid] = foo(in[gid], &in[gid])
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %8 = load i64, i64* %gid, align 8
+  %9 = load i32*, i32** %in.addr, align 8
+  %arrayidx4 = getelementptr inbounds i32, i32* %9, i64 %8
+  %10 = load i32, i32* %arrayidx4, align 4
+  %11 = load i64, i64* %gid, align 8
+  %12 = load i32*, i32** %out.addr, align 8
+  %arrayidx5 = getelementptr inbounds i32, i32* %12, i64 %11
+  store i32 %10, i32* %arrayidx5, align 4 ; out[gid] = in[gid]
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func i32 @foo(i32, i32 addrspace(1)*)
+
+; CHECK: define spir_kernel void @__vecz_v4_test(
+; CHECK: call spir_func i32 @__vecz_b_masked_foo(
+; CHECK: call spir_func i32 @__vecz_b_masked_foo(
+; CHECK: call spir_func i32 @__vecz_b_masked_foo(
+; CHECK: call spir_func i32 @__vecz_b_masked_foo(
+; CHECK: ret void
+
+; CHECK: define private spir_func i32 @__vecz_b_masked_foo(i32{{( %0)?}}, ptr{{( %1)?}}, i1{{( %2)?}}
+; CHECK: call spir_func i32 @foo(i32 %0, ptr %1)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/branch_splitting_and.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/branch_splitting_and.ll
new file mode 100644
index 0000000000000..53dfd1dda9e79
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/branch_splitting_and.ll
@@ -0,0 +1,70 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k split_branch -vecz-simd-width=4 -vecz-passes=uniform-reassoc -S < %s | %filecheck %s
+
+; ModuleID = 'Unknown buffer'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Kernel under test (listed attrs: convergent nounwind; groups #0 and #2 are referenced but not defined in this file): branch on an AND of two compares.
+define spir_kernel void @split_branch(i32 addrspace(1)* noalias %a, i32 addrspace(1)* noalias %b, i32 addrspace(1)* noalias %d) #0 {
+entry:
+  %x = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %y = call spir_func i64 @_Z13get_global_idj(i32 1) #2
+  %a_gep = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %x
+  %b_gep = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %y
+  %varying = load i32, i32 addrspace(1)* %a_gep ; indexed by global id dimension 0
+  %uniform = load i32, i32 addrspace(1)* %b_gep ; indexed by global id dimension 1 - presumably uniform because vectorization is over dimension 0; confirm
+  %cmp_v = icmp sgt i32 %varying, 0
+  %cmp_u = icmp sgt i32 %uniform, 0
+  %and_vu = and i1 %cmp_v, %cmp_u ; expected output removes this and branches on %cmp_u, then on %cmp_v
+  br i1 %and_vu, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  %inc = add i32 %uniform, 1
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  %result = phi i32 [ %inc, %if.then ], [ %varying, %entry ] ; expected output gains a third incoming edge from entry.cond_split
+  %d_gep = getelementptr inbounds i32, i32 addrspace(1)* %d, i64 %x
+  store i32 %result, i32 addrspace(1)* %d_gep
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; This test checks that a conditional branch based on an AND of both
+; a uniform and a varying value gets split into two separate branches
+; CHECK: define spir_kernel void @__vecz_v4_split_branch
+
+; CHECK: %cmp_v = icmp sgt i32 %varying, 0
+; CHECK: %cmp_u = icmp sgt i32 %uniform, 0
+
+; ensure the original binary operator got deleted
+; CHECK-NOT: and i1
+; CHECK: br i1 %cmp_u, label %entry.cond_split, label %if.end
+
+; CHECK: entry.cond_split:
+; CHECK: br i1 %cmp_v, label %if.then, label %if.end
+
+; CHECK: if.then:
+; CHECK: %inc = add i32 %uniform, 1
+; CHECK: br label %if.end
+
+; CHECK: if.end:
+; CHECK: %[[RESULT:.+]] = phi i32 [ %inc, %if.then ], [ %varying, %entry.cond_split ], [ %varying, %entry ]
+; CHECK: store i32 %[[RESULT]], ptr addrspace(1) %{{.+}}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/branch_splitting_or.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/branch_splitting_or.ll
new file mode 100644
index 0000000000000..e94aedc791c82
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/branch_splitting_or.ll
@@ -0,0 +1,70 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k split_branch -vecz-simd-width=4 -vecz-passes=uniform-reassoc -S < %s | %filecheck %s
+
+; ModuleID = 'Unknown buffer'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Kernel under test (listed attrs: convergent nounwind; groups #0 and #2 are referenced but not defined in this file): branch on an OR of two compares.
+define spir_kernel void @split_branch(i32 addrspace(1)* noalias %a, i32 addrspace(1)* noalias %b, i32 addrspace(1)* noalias %d) #0 {
+entry:
+  %x = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %y = call spir_func i64 @_Z13get_global_idj(i32 1) #2
+  %a_gep = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %x
+  %b_gep = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %y
+  %varying = load i32, i32 addrspace(1)* %a_gep ; indexed by global id dimension 0
+  %uniform = load i32, i32 addrspace(1)* %b_gep ; indexed by global id dimension 1 - presumably uniform because vectorization is over dimension 0; confirm
+  %cmp_v = icmp sgt i32 %varying, 0
+  %cmp_u = icmp sgt i32 %uniform, 0
+  %or_vu = or i1 %cmp_v, %cmp_u ; expected output removes this and branches on %cmp_u, then on %cmp_v
+  br i1 %or_vu, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  %inc = add i32 %uniform, 1
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  %result = phi i32 [ %inc, %if.then ], [ %varying, %entry ] ; expected output takes %varying from entry.cond_split instead of entry
+  %d_gep = getelementptr inbounds i32, i32 addrspace(1)* %d, i64 %x
+  store i32 %result, i32 addrspace(1)* %d_gep
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; This test checks that a conditional branch based on an OR of both
+; a uniform and a varying value gets split into two separate branches
+; CHECK: define spir_kernel void @__vecz_v4_split_branch
+
+; CHECK: %cmp_v = icmp sgt i32 %varying, 0
+; CHECK: %cmp_u = icmp sgt i32 %uniform, 0
+
+; ensure the original binary operator got deleted
+; CHECK-NOT: or i1
+; CHECK: br i1 %cmp_u, label %if.then, label %entry.cond_split
+
+; CHECK: entry.cond_split:
+; CHECK: br i1 %cmp_v, label %if.then, label %if.end
+
+; CHECK: if.then:
+; CHECK: %inc = add i32 %uniform, 1
+; CHECK: br label %if.end
+
+; CHECK: if.end:
+; CHECK: %[[RESULT:.+]] = phi i32 [ %inc, %if.then ], [ %varying, %entry.cond_split ]
+; CHECK: store i32 %[[RESULT]], ptr addrspace(1) %{{.+}}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_addsat.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_addsat.ll
new file mode 100644
index 0000000000000..dd5957ff3ef58
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_addsat.ll
@@ -0,0 +1,118 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -vecz-passes=builtin-inlining -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @saddsatc(i8 addrspace(1)* %lhs, i8 addrspace(1)* %rhs) {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds i8, i8 addrspace(1)* %lhs, i64 %call
+  %0 = load i8, i8 addrspace(1)* %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, i8 addrspace(1)* %rhs, i64 %call
+  %1 = load i8, i8 addrspace(1)* %arrayidx1, align 1
+  %call2 = tail call spir_func i8 @_Z7add_satcc(i8 %0, i8 %1)
+  store i8 %call2, i8 addrspace(1)* %arrayidx1, align 1
+  ret void
+}
+
+define spir_kernel void @uaddsatc(i8 addrspace(1)* %lhs, i8 addrspace(1)* %rhs) {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds i8, i8 addrspace(1)* %lhs, i64 %call
+  %0 = load i8, i8 addrspace(1)* %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, i8 addrspace(1)* %rhs, i64 %call
+  %1 = load i8, i8 addrspace(1)* %arrayidx1, align 1
+  %call2 = tail call spir_func i8 @_Z7add_sathh(i8 %0, i8 %1)
+  store i8 %call2, i8 addrspace(1)* %arrayidx1, align 1
+  ret void
+}
+
+define spir_kernel void @saddsati(i32 addrspace(1)* %lhs, i32 addrspace(1)* %rhs) {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %lhs, i64 %call
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %rhs, i64 %call
+  %1 = load i32, i32 addrspace(1)* %arrayidx1, align 1
+  %call2 = tail call spir_func i32 @_Z7add_satii(i32 %0, i32 %1)
+  store i32 %call2, i32 addrspace(1)* %arrayidx1, align 1
+  ret void
+}
+
+define spir_kernel void @uaddsati(i32 addrspace(1)* %lhs, i32 addrspace(1)* %rhs) {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %lhs, i64 %call
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %rhs, i64 %call
+  %1 = load i32, i32 addrspace(1)* %arrayidx1, align 1
+  %call2 = tail call spir_func i32 @_Z7add_satjj(i32 %0, i32 %1)
+  store i32 %call2, i32 addrspace(1)* %arrayidx1, align 1
+  ret void
+}
+
+define spir_kernel void @saddsati4(<4 x i32> addrspace(1)* %lhs, <4 x i32> addrspace(1)* %rhs) {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %lhs, i64 %call
+  %0 = load <4 x i32>, <4 x i32> addrspace(1)* %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %rhs, i64 %call
+  %1 = load <4 x i32>, <4 x i32> addrspace(1)* %arrayidx1, align 1
+  %call2 = tail call spir_func <4 x i32> @_Z7add_satDv4_iS_(<4 x i32> %0, <4 x i32> %1)
+  store <4 x i32> %call2, <4 x i32> addrspace(1)* %arrayidx1, align 1
+  ret void
+}
+
+define spir_kernel void @uaddsati4(<4 x i32> addrspace(1)* %lhs, <4 x i32> addrspace(1)* %rhs) {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %lhs, i64 %call
+  %0 = load <4 x i32>, <4 x i32> addrspace(1)* %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %rhs, i64 %call
+  %1 = load <4 x i32>, <4 x i32> addrspace(1)* %arrayidx1, align 1
+  %call2 = tail call spir_func <4 x i32> @_Z7add_satDv4_jS_(<4 x i32> %0, <4 x i32> %1)
+  store <4 x i32> %call2, <4 x i32> addrspace(1)* %arrayidx1, align 1
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func i8 @_Z7add_satcc(i8, i8)
+declare spir_func i8 @_Z7add_sathh(i8, i8)
+declare spir_func i32 @_Z7add_satii(i32, i32)
+declare spir_func i32 @_Z7add_satjj(i32, i32)
+declare spir_func <4 x i32> @_Z7add_satDv4_iS_(<4 x i32>, <4 x i32>)
+declare spir_func <4 x i32> @_Z7add_satDv4_jS_(<4 x i32>, <4 x i32>)
+
+; CHECK: define spir_kernel void @__vecz_v4_saddsatc(
+; CHECK: = call i8 @llvm.sadd.sat.i8(i8 %{{.*}}, i8 %{{.*}})
+
+; CHECK: define spir_kernel void @__vecz_v4_uaddsatc(
+; CHECK: = call i8 @llvm.uadd.sat.i8(i8 %{{.*}}, i8 %{{.*}})
+
+; CHECK: define spir_kernel void @__vecz_v4_saddsati(
+; CHECK: = call i32 @llvm.sadd.sat.i32(i32 %{{.*}}, i32 %{{.*}})
+
+; CHECK: define spir_kernel void @__vecz_v4_uaddsati(
+; CHECK: = call i32 @llvm.uadd.sat.i32(i32 %{{.*}}, i32 %{{.*}})
+
+; CHECK: define spir_kernel void @__vecz_v4_saddsati4(
+; CHECK: = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+
+; CHECK: define spir_kernel void @__vecz_v4_uaddsati4(
+; CHECK: = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_clamp.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_clamp.ll
new file mode 100644
index 0000000000000..87f798a6b4905
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_clamp.ll
@@ -0,0 +1,41 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k clampkernel -vecz-passes=builtin-inlining -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @clampkernel(float %a, float* %c) {
+entry:
+  %clmp = call spir_func float @_Z5clampfff(float %a, float 0.0, float 1.0)
+  store float %clmp, float* %c, align 4
+  ret void
+}
+
+define spir_func float @_Z5clampfff(float %x, float %y, float %z) {
+entry:
+  %call.i.i = tail call spir_func float @_Z13__abacus_fmaxff(float %x, float %y)
+  %call1.i.i = tail call spir_func float @_Z13__abacus_fminff(float %call.i.i, float %z)
+  ret float %call1.i.i
+; CHECK-LABEL: float @_Z5clampfff(
+; CHECK: [[TMP:%.*]] = call float @llvm.maxnum.f32(float %x, float %y)
+; CHECK:             = call float @llvm.minnum.f32(float [[TMP]], float %z)
+}
+
+declare spir_func float @_Z13__abacus_fminff(float, float)
+declare spir_func float @_Z13__abacus_fmaxff(float, float)
+
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_fmax.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_fmax.ll
new file mode 100644
index 0000000000000..2ca6820a71665
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_fmax.ll
@@ -0,0 +1,53 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -vecz-passes=builtin-inlining -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @fmaxff(float %a, float %b, float* %c) {
+entry:
+  %max = call spir_func float @_Z4fmaxff(float %a, float %b)
+  store float %max, float* %c, align 4
+  ret void
+}
+
+define spir_kernel void @fmaxvf(<2 x float> %a, float %b, <2 x float>* %c) {
+entry:
+  %max = call spir_func <2 x float> @_Z4fmaxDv2_ff(<2 x float> %a, float %b)
+  store <2 x float> %max, <2 x float>* %c, align 4
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+declare spir_func float @_Z4fmaxff(float, float)
+declare spir_func <2 x float> @_Z4fmaxDv2_ff(<2 x float>, float)
+
+; CHECK: define spir_kernel void @__vecz_v4_fmaxff(float %a, float %b, ptr %c)
+; CHECK: entry:
+; CHECK: %0 = call float @llvm.maxnum.f32(float %a, float %b)
+; CHECK: store float %0, ptr %c, align 4
+; CHECK: ret void
+
+; CHECK: define spir_kernel void @__vecz_v4_fmaxvf(<2 x float> %a, float %b, ptr %c)
+; CHECK: entry:
+; CHECK: %.splatinsert = insertelement <2 x float> {{.*}}, float %b, {{(i32|i64)}} 0
+; CHECK: %.splat = shufflevector <2 x float> %.splatinsert, <2 x float> {{.*}}, <2 x i32> zeroinitializer
+; CHECK: %0 = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %a, <2 x float> %.splat)
+; CHECK: store <2 x float> %0, ptr %c, align 4
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_fmin.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_fmin.ll
new file mode 100644
index 0000000000000..4227ce3161b68
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_fmin.ll
@@ -0,0 +1,53 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -vecz-passes=builtin-inlining -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @fminff(float %a, float %b, float* %c) {
+entry:
+  %min = call spir_func float @_Z4fminff(float %a, float %b)
+  store float %min, float* %c, align 4
+  ret void
+}
+
+define spir_kernel void @fminvf(<2 x float> %a, float %b, <2 x float>* %c) {
+entry:
+  %min = call spir_func <2 x float> @_Z4fminDv2_ff(<2 x float> %a, float %b)
+  store <2 x float> %min, <2 x float>* %c, align 4
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+declare spir_func float @_Z4fminff(float, float)
+declare spir_func <2 x float> @_Z4fminDv2_ff(<2 x float>, float)
+
+; CHECK: define spir_kernel void @__vecz_v4_fminff(float %a, float %b, ptr %c)
+; CHECK: entry:
+; CHECK: %0 = call float @llvm.minnum.f32(float %a, float %b)
+; CHECK: store float %0, ptr %c, align 4
+; CHECK: ret void
+
+; CHECK: define spir_kernel void @__vecz_v4_fminvf(<2 x float> %a, float %b, ptr %c)
+; CHECK: entry:
+; CHECK: %.splatinsert = insertelement <2 x float> {{.*}}, float %b, {{(i32|i64)}} 0
+; CHECK: %.splat = shufflevector <2 x float> %.splatinsert, <2 x float> {{.*}}, <2 x i32> zeroinitializer
+; CHECK: %0 = call <2 x float> @llvm.minnum.v2f32(<2 x float> %a, <2 x float> %.splat)
+; CHECK: store <2 x float> %0, ptr %c, align 4
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_mem.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_mem.ll
new file mode 100644
index 0000000000000..1aaec11f02648
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_mem.ll
@@ -0,0 +1,77 @@
+
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -vecz-passes=builtin-inlining,verify -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; FIXME: CA-4331 - we can't inline non-i8 memcpy/memset
+
+define spir_kernel void @test_memset_i16(i64* %z) {
+  %dst = bitcast i64* %z to i16*
+  call void @llvm.memset.p0i16.i64(i16* %dst, i8 42, i64 18, i32 8, i1 false)
+  ret void
+}
+
+
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_test_memset_i16(ptr %z)
+; CHECK: [[D1:%.*]] = getelementptr inbounds i8, ptr %dst, i64 0
+; CHECK: store i64 3038287259199220266, ptr [[D1]], align 8
+
+; CHECK: [[D2:%.*]] = getelementptr inbounds i8, ptr %dst, i64 8
+; CHECK: store i64 3038287259199220266, ptr [[D2]], align 8
+
+; CHECK: [[D3:%.*]] = getelementptr inbounds i8, ptr %dst, i64 16
+; CHECK: store i8 42, ptr [[D3]], align 1
+
+; CHECK: [[D4:%.*]] = getelementptr inbounds i8, ptr %dst, i64 17
+; CHECK: store i8 42, ptr [[D4]], align 1
+; CHECK: }
+
+define spir_kernel void @test_memcpy_i16(i64* %a, i64* %z) {
+  %src = bitcast i64* %a to i16*
+  %dst = bitcast i64* %z to i16*
+  call void @llvm.memcpy.p0i16.p0i16.i64(i16* %dst, i16* %src, i64 18, i32 8, i1 false)
+  ret void
+}
+
+
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_test_memcpy_i16(ptr %a, ptr %z)
+; CHECK: [[S1:%.*]] = getelementptr inbounds i8, ptr %src, i64 0
+; CHECK: [[D1:%.*]] = getelementptr inbounds i8, ptr %dst, i64 0
+; CHECK: [[SRC1:%.*]] = load i64, ptr [[S1]], align 8
+; CHECK: store i64 [[SRC1]], ptr [[D1]], align 8
+
+; CHECK: [[S2:%.*]] = getelementptr inbounds i8, ptr %src, i64 8
+; CHECK: [[D2:%.*]] = getelementptr inbounds i8, ptr %dst, i64 8
+; CHECK: [[SRC2:%.*]] = load i64, ptr [[S2]], align 8
+; CHECK: store i64 [[SRC2]], ptr [[D2]], align 8
+
+; CHECK: [[S3:%.*]] = getelementptr inbounds i8, ptr %src, i64 16
+; CHECK: [[D3:%.*]] = getelementptr inbounds i8, ptr %dst, i64 16
+; CHECK: [[SRC3:%.*]] = load i8, ptr [[S3]], align 1
+; CHECK: store i8 [[SRC3]], ptr [[D3]], align 1
+
+; CHECK: [[S4:%.*]] = getelementptr inbounds i8, ptr %src, i64 17
+; CHECK: [[D4:%.*]] = getelementptr inbounds i8, ptr %dst, i64 17
+; CHECK: [[SRC4:%.*]] = load i8, ptr [[S4]], align 1
+; CHECK: store i8 [[SRC4]], ptr [[D4]], align 1
+; CHECK: }
+
+declare void @llvm.memset.p0i16.i64(i16*, i8, i64, i32, i1)
+declare void @llvm.memcpy.p0i16.p0i16.i64(i16*, i16*, i64, i32, i1)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_negative.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_negative.ll
new file mode 100644
index 0000000000000..b3e369f899603
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_negative.ll
@@ -0,0 +1,53 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_rhadd -vecz-passes=builtin-inlining -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test_normalize(float %a, float %b, i32* %c) {
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %norm = call spir_func float @_Z9normalizef(float %a)
+  %normi = fptosi float %norm to i32
+  %c0 = getelementptr i32, i32* %c, i64 %gid
+  store i32 %normi, i32* %c0, align 4
+  ret void
+}
+
+define spir_kernel void @test_rhadd(i32 %a, i32 %b, i32* %c) {
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %add = call spir_func i32 @_Z5rhaddjj(i32 %a, i32 %b)
+  %c0 = getelementptr i32, i32* %c, i64 %gid
+  store i32 %add, i32* %c0, align 4
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func float @_Z9normalizef(float)
+declare spir_func i32 @_Z5rhaddjj(i32, i32)
+
+; CHECK-NOT: define spir_kernel void @__vecz_v4_test_normalize(float %a, float %b, ptr %c)
+
+; CHECK: define spir_kernel void @__vecz_v4_test_rhadd(i32 %a, i32 %b, ptr %c)
+; CHECK: entry:
+; CHECK: %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: %add = call spir_func i32 @_Z5rhaddjj(i32 %a, i32 %b)
+; CHECK: %c0 = getelementptr i32, ptr %c, i64 %gid
+; CHECK: store i32 %add, ptr %c0, align 4
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_positive.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_positive.ll
new file mode 100644
index 0000000000000..67c2f054c1c77
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_positive.ll
@@ -0,0 +1,67 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test -vecz-passes=builtin-inlining -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test(float %a, float %b, i32* %c) {
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %cmp = call spir_func i32 @_Z9isgreaterff(float %a, float %b)
+  %c0 = getelementptr i32, i32* %c, i64 %gid
+  store i32 %cmp, i32* %c0, align 4
+  %cmp1 = call spir_func i32 @_Z6islessff(float %a, float %b)
+  %c1 = getelementptr i32, i32* %c0, i32 1
+  store i32 %cmp1, i32* %c1, align 4
+  %cmp2 = call spir_func i32 @_Z7isequalff(float %a, float %b)
+  %c2 = getelementptr i32, i32* %c0, i32 2
+  store i32 %cmp2, i32* %c2, align 4
+  %cmp3 = call spir_func i32 @opt_Z7isequalff(float %a, float %b)
+  %c3 = getelementptr i32, i32* %c0, i32 3
+  store i32 %cmp3, i32* %c3, align 4
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func i32 @_Z9isgreaterff(float, float)
+declare spir_func i32 @_Z6islessff(float, float)
+declare spir_func i32 @_Z7isequalff(float, float)
+
+; Test that a non-builtin function is inlined.
+define spir_func i32 @opt_Z7isequalff(float, float) {
+  ret i32 zeroinitializer
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_test(float %a, float %b, ptr %c)
+; CHECK: entry:
+; CHECK: %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: %relational = fcmp ogt float %a, %b
+; CHECK: %relational[[R1:[0-9]+]] = zext i1 %relational to i32
+; CHECK: %c0 = getelementptr i32, ptr %c, i64 %gid
+; CHECK: store i32 %relational[[R1]], ptr %c0, align 4
+; CHECK: %relational[[R2:[0-9]+]] = fcmp olt float %a, %b
+; CHECK: %relational[[R3:[0-9]+]] = zext i1 %relational[[R2]] to i32
+; CHECK: %c1 = getelementptr i32, ptr %c0, {{(i32|i64)}} 1
+; CHECK: store i32 %relational[[R3]], ptr %c1, align 4
+; CHECK: %relational[[R4:[0-9]+]] = fcmp oeq float %a, %b
+; CHECK: %relational[[R5:[0-9]+]] = zext i1 %relational[[R4]] to i32
+; CHECK: %c2 = getelementptr i32, ptr %c0, {{(i32|i64)}} 2
+; CHECK: store i32 %relational[[R5]], ptr %c2, align 4
+; CHECK: %c3 = getelementptr i32, ptr %c0, {{(i32|i64)}} 3
+; CHECK: store i32 0, ptr %c3, align 4
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_pointer_return.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_pointer_return.ll
new file mode 100644
index 0000000000000..30f9c4d6fa472
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_pointer_return.ll
@@ -0,0 +1,66 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; RUN: %veczc -vecz-simd-width=4 -S < %s | %filecheck %s
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+declare spir_func float @_Z5fractfPf(float, float*)
+declare spir_func <2 x float> @_Z5fractDv2_fPS_(<2 x float>, <2 x float>*)
+declare spir_func <4 x float> @_Z5fractDv4_fPS_(<4 x float>, <4 x float>*)
+declare spir_func <8 x float> @_Z5fractDv8_fPS_(<8 x float>, <8 x float>*)
+
+; FIXME: Both of these are instantiating when we have vector equivalents: see
+; CA-4046.
+
+define spir_kernel void @fract_v1(float* %xptr, float* %outptr, float* %ioutptr) {
+  %iouta = alloca float
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx.x = getelementptr inbounds float, float* %xptr, i64 %idx
+  %x = load float, float* %arrayidx.x, align 4
+  %out = call spir_func float @_Z5fractfPf(float %x, float* %iouta)
+  %arrayidx.out = getelementptr inbounds float, float* %outptr, i64 %idx
+  %arrayidx.iout = getelementptr inbounds float, float* %ioutptr, i64 %idx
+  store float %out, float* %arrayidx.out, align 4
+  %iout = load float, float* %iouta, align 4
+  store float %iout, float* %arrayidx.iout, align 4
+  ret void
+; CHECK: call spir_func float @_Z5fractfPf(float {{%.*}}, {{(ptr|float\*)}} nonnull {{%.*}})
+; CHECK: call spir_func float @_Z5fractfPf(float {{%.*}}, {{(ptr|float\*)}} nonnull {{%.*}})
+; CHECK: call spir_func float @_Z5fractfPf(float {{%.*}}, {{(ptr|float\*)}} nonnull {{%.*}})
+; CHECK: call spir_func float @_Z5fractfPf(float {{%.*}}, {{(ptr|float\*)}} nonnull {{%.*}})
+}
+
+define spir_kernel void @fract_v2(<2 x float>* %xptr, <2 x float>* %outptr, <2 x float>* %ioutptr) {
+  %iouta = alloca <2 x float>
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx.x = getelementptr inbounds <2 x float>, <2 x float>* %xptr, i64 %idx
+  %x = load <2 x float>, <2 x float>* %arrayidx.x, align 8
+  %out = call spir_func <2 x float> @_Z5fractDv2_fPS_(<2 x float> %x, <2 x float>* %iouta)
+  %arrayidx.out = getelementptr inbounds <2 x float>, <2 x float>* %outptr, i64 %idx
+  %arrayidx.iout = getelementptr inbounds <2 x float>, <2 x float>* %ioutptr, i64 %idx
+  store <2 x float> %out, <2 x float>* %arrayidx.out, align 8
+  %iout = load <2 x float>, <2 x float>* %iouta, align 8
+  store <2 x float> %iout, <2 x float>* %arrayidx.iout, align 8
+  ret void
+; CHECK: call spir_func <2 x float> @_Z5fractDv2_fPS_(<2 x float> {{%.*}}, {{(ptr|<2 x float>\*)}} nonnull {{%.*}})
+; CHECK: call spir_func <2 x float> @_Z5fractDv2_fPS_(<2 x float> {{%.*}}, {{(ptr|<2 x float>\*)}} nonnull {{%.*}})
+; CHECK: call spir_func <2 x float> @_Z5fractDv2_fPS_(<2 x float> {{%.*}}, {{(ptr|<2 x float>\*)}} nonnull {{%.*}})
+; CHECK: call spir_func <2 x float> @_Z5fractDv2_fPS_(<2 x float> {{%.*}}, {{(ptr|<2 x float>\*)}} nonnull {{%.*}})
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_cantduplicate.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_cantduplicate.ll
new file mode 100644
index 0000000000000..bad5df9fa5bf5
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_cantduplicate.ll
@@ -0,0 +1,128 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k cantduplicate -vecz-passes=packetizer -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Kernels
+
+define spir_kernel void @instrinsic(float* %in1, float* %in2, float* %in3, float* %out) {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds float, float* %in1, i64 %call
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds float, float* %in2, i64 %call
+  %1 = load float, float* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds float, float* %in3, i64 %call
+  %2 = load float, float* %arrayidx2, align 4
+  %3 = tail call float @llvm.fmuladd.f32(float %0, float %1, float %2)
+  %arrayidx3 = getelementptr inbounds float, float* %out, i64 %call
+  store float %3, float* %arrayidx3, align 4
+  ret void
+}
+
+define spir_kernel void @builtin(i32* %in, i32* %out) {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
+  %0 = load i32, i32* %arrayidx, align 4
+  %call1 = tail call spir_func i32 @_Z3absi(i32 %0)
+  %arrayidx2 = getelementptr inbounds i32, i32* %out, i64 %call
+  store i32 %call1, i32* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @user_defined(i32* %in, i32* %out) {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
+  %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
+  call spir_func void @defined(i32* %add.ptr, i32* %add.ptr1)
+  ret void
+}
+
+define spir_kernel void @user_undefined(i32* %in, i32* %out) {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
+  %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
+  call spir_func void @undefined(i32* %add.ptr, i32* %add.ptr1)
+  ret void
+}
+
+define spir_kernel void @cantinline(i32* %in, i32* %out) {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
+  %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
+  call spir_func void @dontinline(i32* %add.ptr, i32* %add.ptr1)
+  ret void
+}
+
+define spir_kernel void @cantduplicate(i32* %in, i32* %out) {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
+  %0 = load i32, i32* %arrayidx, align 4
+  %call1 = tail call spir_func i32 @_Z3clzi(i32 %0) #1
+  %arrayidx2 = getelementptr inbounds i32, i32* %out, i64 %call
+  store i32 %call1, i32* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @optnone(i32* %in, i32* %out) #2 {
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %out, i64 %call
+  store i32 %0, i32* %arrayidx1, align 4
+  ret void
+}
+
+; Declaration only functions
+
+declare float @llvm.fmuladd.f32(float, float, float)
+declare spir_func i32 @_Z3absi(i32)
+declare spir_func i32 @_Z3clzi(i32) #1
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func void @undefined(i32*, i32*)
+
+; Functions with definitions
+
+define spir_func void @defined(i32* %in, i32* %out) {
+entry:
+  %0 = load i32, i32* %in, align 4
+  store i32 %0, i32* %out, align 4
+  ret void
+}
+
+define spir_func void @dontinline(i32* %in, i32* %out) #0 {
+entry:
+  %0 = load i32, i32* %in, align 4
+  store i32 %0, i32* %out, align 4
+  ret void
+}
+
+; Attributes
+
+attributes #0 = { noinline }
+attributes #1 = { noduplicate }
+attributes #2 = { optnone noinline }
+
+; XFAIL: *
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_cantinline.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_cantinline.ll
new file mode 100644
index 0000000000000..1264348d42e68
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_cantinline.ll
@@ -0,0 +1,128 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k cantinline -vecz-passes=packetizer -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Kernels
+
+define spir_kernel void @instrinsic(float* %in1, float* %in2, float* %in3, float* %out) { ; kernel: out[gid] = fmuladd(in1[gid], in2[gid], in3[gid])
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: get_global_id(0), used as the element index below
+  %arrayidx = getelementptr inbounds float, float* %in1, i64 %call
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds float, float* %in2, i64 %call
+  %1 = load float, float* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds float, float* %in3, i64 %call
+  %2 = load float, float* %arrayidx2, align 4
+  %3 = tail call float @llvm.fmuladd.f32(float %0, float %1, float %2) ; call to an LLVM intrinsic rather than a user or builtin function
+  %arrayidx3 = getelementptr inbounds float, float* %out, i64 %call
+  store float %3, float* %arrayidx3, align 4
+  ret void
+}
+
+define spir_kernel void @builtin(i32* %in, i32* %out) { ; kernel: out[gid] = abs(in[gid])
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: get_global_id(0), used as the element index below
+  %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
+  %0 = load i32, i32* %arrayidx, align 4
+  %call1 = tail call spir_func i32 @_Z3absi(i32 %0) ; _Z3absi is mangled abs(int); it is only declared in this module
+  %arrayidx2 = getelementptr inbounds i32, i32* %out, i64 %call
+  store i32 %call1, i32* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @user_defined(i32* %in, i32* %out) { ; kernel: passes &in[gid] and &out[gid] to @defined
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: get_global_id(0)
+  %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
+  %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
+  call spir_func void @defined(i32* %add.ptr, i32* %add.ptr1) ; callee has a body in this module and no attribute group
+  ret void
+}
+
+define spir_kernel void @user_undefined(i32* %in, i32* %out) { ; kernel: passes &in[gid] and &out[gid] to @undefined
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: get_global_id(0)
+  %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
+  %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
+  call spir_func void @undefined(i32* %add.ptr, i32* %add.ptr1) ; callee is only declared in this module; no body is available
+  ret void
+}
+
+define spir_kernel void @cantinline(i32* %in, i32* %out) { ; kernel selected by -k on this file's RUN line: passes &in[gid] and &out[gid] to @dontinline
+entry: ; NOTE(review): this file is marked XFAIL for all configurations and has no FileCheck directives, so it does not pin how vectorization fails - confirm intent
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: get_global_id(0)
+  %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
+  %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
+  call spir_func void @dontinline(i32* %add.ptr, i32* %add.ptr1) ; callee is defined in this module with attribute group #0 (noinline)
+  ret void
+}
+
+define spir_kernel void @cantduplicate(i32* %in, i32* %out) { ; kernel: out[gid] = clz(in[gid]), with the clz call site marked noduplicate
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: get_global_id(0), used as the element index below
+  %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
+  %0 = load i32, i32* %arrayidx, align 4
+  %call1 = tail call spir_func i32 @_Z3clzi(i32 %0) #1 ; _Z3clzi is mangled clz(int); #1 is the noduplicate attribute group
+  %arrayidx2 = getelementptr inbounds i32, i32* %out, i64 %call
+  store i32 %call1, i32* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @optnone(i32* %in, i32* %out) #2 { ; kernel: out[gid] = in[gid]; #2 is the { optnone noinline } attribute group
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: get_global_id(0); plain call, not marked tail
+  %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %out, i64 %call
+  store i32 %0, i32* %arrayidx1, align 4
+  ret void
+}
+
+; Declaration-only functions
+
+declare float @llvm.fmuladd.f32(float, float, float)
+declare spir_func i32 @_Z3absi(i32)
+declare spir_func i32 @_Z3clzi(i32) #1
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func void @undefined(i32*, i32*)
+
+; Functions with definitions
+
+define spir_func void @defined(i32* %in, i32* %out) { ; helper with a body in this module: *out = *in
+entry:
+  %0 = load i32, i32* %in, align 4
+  store i32 %0, i32* %out, align 4
+  ret void
+}
+
+define spir_func void @dontinline(i32* %in, i32* %out) #0 { ; helper called by @cantinline: *out = *in; #0 is the noinline attribute group
+entry:
+  %0 = load i32, i32* %in, align 4
+  store i32 %0, i32* %out, align 4
+  ret void
+}
+
+; Attributes
+
+attributes #0 = { noinline }
+attributes #1 = { noduplicate }
+attributes #2 = { optnone noinline }
+
+; XFAIL: *
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_optnone.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_optnone.ll
new file mode 100644
index 0000000000000..6ed1ae339e101
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_optnone.ll
@@ -0,0 +1,128 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k optnone -vecz-passes=packetizer -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Kernels
+
+define spir_kernel void @instrinsic(float* %in1, float* %in2, float* %in3, float* %out) { ; kernel: out[gid] = fmuladd(in1[gid], in2[gid], in3[gid])
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: get_global_id(0), used as the element index below
+  %arrayidx = getelementptr inbounds float, float* %in1, i64 %call
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds float, float* %in2, i64 %call
+  %1 = load float, float* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds float, float* %in3, i64 %call
+  %2 = load float, float* %arrayidx2, align 4
+  %3 = tail call float @llvm.fmuladd.f32(float %0, float %1, float %2) ; call to an LLVM intrinsic rather than a user or builtin function
+  %arrayidx3 = getelementptr inbounds float, float* %out, i64 %call
+  store float %3, float* %arrayidx3, align 4
+  ret void
+}
+
+define spir_kernel void @builtin(i32* %in, i32* %out) { ; kernel: out[gid] = abs(in[gid])
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: get_global_id(0), used as the element index below
+  %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
+  %0 = load i32, i32* %arrayidx, align 4
+  %call1 = tail call spir_func i32 @_Z3absi(i32 %0) ; _Z3absi is mangled abs(int); it is only declared in this module
+  %arrayidx2 = getelementptr inbounds i32, i32* %out, i64 %call
+  store i32 %call1, i32* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @user_defined(i32* %in, i32* %out) { ; kernel: passes &in[gid] and &out[gid] to @defined
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: get_global_id(0)
+  %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
+  %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
+  call spir_func void @defined(i32* %add.ptr, i32* %add.ptr1) ; callee has a body in this module and no attribute group
+  ret void
+}
+
+define spir_kernel void @user_undefined(i32* %in, i32* %out) { ; kernel: passes &in[gid] and &out[gid] to @undefined
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: get_global_id(0)
+  %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
+  %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
+  call spir_func void @undefined(i32* %add.ptr, i32* %add.ptr1) ; callee is only declared in this module; no body is available
+  ret void
+}
+
+define spir_kernel void @cantinline(i32* %in, i32* %out) { ; kernel: passes &in[gid] and &out[gid] to @dontinline
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: get_global_id(0)
+  %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
+  %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
+  call spir_func void @dontinline(i32* %add.ptr, i32* %add.ptr1) ; callee is defined in this module with attribute group #0 (noinline)
+  ret void
+}
+
+define spir_kernel void @cantduplicate(i32* %in, i32* %out) { ; kernel: out[gid] = clz(in[gid]), with the clz call site marked noduplicate
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: get_global_id(0), used as the element index below
+  %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
+  %0 = load i32, i32* %arrayidx, align 4
+  %call1 = tail call spir_func i32 @_Z3clzi(i32 %0) #1 ; _Z3clzi is mangled clz(int); #1 is the noduplicate attribute group
+  %arrayidx2 = getelementptr inbounds i32, i32* %out, i64 %call
+  store i32 %call1, i32* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @optnone(i32* %in, i32* %out) #2 { ; kernel selected by -k on this file's RUN line: out[gid] = in[gid]; #2 is the { optnone noinline } attribute group
+entry: ; NOTE(review): this file is marked XFAIL for all configurations and has no FileCheck directives, so it does not pin how vectorization fails - confirm intent
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: get_global_id(0); plain call, not marked tail
+  %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %out, i64 %call
+  store i32 %0, i32* %arrayidx1, align 4
+  ret void
+}
+
+; Declaration-only functions
+
+declare float @llvm.fmuladd.f32(float, float, float)
+declare spir_func i32 @_Z3absi(i32)
+declare spir_func i32 @_Z3clzi(i32) #1
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func void @undefined(i32*, i32*)
+
+; Functions with definitions
+
+define spir_func void @defined(i32* %in, i32* %out) { ; helper with a body in this module: *out = *in
+entry:
+  %0 = load i32, i32* %in, align 4
+  store i32 %0, i32* %out, align 4
+  ret void
+}
+
+define spir_func void @dontinline(i32* %in, i32* %out) #0 { ; helper: *out = *in; #0 is the noinline attribute group
+entry:
+  %0 = load i32, i32* %in, align 4
+  store i32 %0, i32* %out, align 4
+  ret void
+}
+
+; Attributes
+
+attributes #0 = { noinline }
+attributes #1 = { noduplicate }
+attributes #2 = { optnone noinline }
+
+; XFAIL: *
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_user_undefined.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_user_undefined.ll
new file mode 100644
index 0000000000000..a359ccaa5c093
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_user_undefined.ll
@@ -0,0 +1,128 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k user_undefined -vecz-passes=packetizer -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Kernels
+
+define spir_kernel void @instrinsic(float* %in1, float* %in2, float* %in3, float* %out) { ; kernel: out[gid] = fmuladd(in1[gid], in2[gid], in3[gid])
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: get_global_id(0), used as the element index below
+  %arrayidx = getelementptr inbounds float, float* %in1, i64 %call
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds float, float* %in2, i64 %call
+  %1 = load float, float* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds float, float* %in3, i64 %call
+  %2 = load float, float* %arrayidx2, align 4
+  %3 = tail call float @llvm.fmuladd.f32(float %0, float %1, float %2) ; call to an LLVM intrinsic rather than a user or builtin function
+  %arrayidx3 = getelementptr inbounds float, float* %out, i64 %call
+  store float %3, float* %arrayidx3, align 4
+  ret void
+}
+
+define spir_kernel void @builtin(i32* %in, i32* %out) { ; kernel: out[gid] = abs(in[gid])
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: get_global_id(0), used as the element index below
+  %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
+  %0 = load i32, i32* %arrayidx, align 4
+  %call1 = tail call spir_func i32 @_Z3absi(i32 %0) ; _Z3absi is mangled abs(int); it is only declared in this module
+  %arrayidx2 = getelementptr inbounds i32, i32* %out, i64 %call
+  store i32 %call1, i32* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @user_defined(i32* %in, i32* %out) { ; kernel: passes &in[gid] and &out[gid] to @defined
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: get_global_id(0)
+  %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
+  %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
+  call spir_func void @defined(i32* %add.ptr, i32* %add.ptr1) ; callee has a body in this module and no attribute group
+  ret void
+}
+
+define spir_kernel void @user_undefined(i32* %in, i32* %out) { ; kernel selected by -k on this file's RUN line: passes &in[gid] and &out[gid] to @undefined
+entry: ; NOTE(review): this file is marked XFAIL for all configurations and has no FileCheck directives, so it does not pin how vectorization fails - confirm intent
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: get_global_id(0)
+  %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
+  %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
+  call spir_func void @undefined(i32* %add.ptr, i32* %add.ptr1) ; callee is only declared in this module; no body is available
+  ret void
+}
+
+define spir_kernel void @cantinline(i32* %in, i32* %out) { ; kernel: passes &in[gid] and &out[gid] to @dontinline
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: get_global_id(0)
+  %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
+  %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
+  call spir_func void @dontinline(i32* %add.ptr, i32* %add.ptr1) ; callee is defined in this module with attribute group #0 (noinline)
+  ret void
+}
+
+define spir_kernel void @cantduplicate(i32* %in, i32* %out) { ; kernel: out[gid] = clz(in[gid]), with the clz call site marked noduplicate
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: get_global_id(0), used as the element index below
+  %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
+  %0 = load i32, i32* %arrayidx, align 4
+  %call1 = tail call spir_func i32 @_Z3clzi(i32 %0) #1 ; _Z3clzi is mangled clz(int); #1 is the noduplicate attribute group
+  %arrayidx2 = getelementptr inbounds i32, i32* %out, i64 %call
+  store i32 %call1, i32* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @optnone(i32* %in, i32* %out) #2 { ; kernel: out[gid] = in[gid]; #2 is the { optnone noinline } attribute group
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: get_global_id(0); plain call, not marked tail
+  %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %out, i64 %call
+  store i32 %0, i32* %arrayidx1, align 4
+  ret void
+}
+
+; Declaration-only functions
+
+declare float @llvm.fmuladd.f32(float, float, float)
+declare spir_func i32 @_Z3absi(i32)
+declare spir_func i32 @_Z3clzi(i32) #1
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func void @undefined(i32*, i32*)
+
+; Functions with definitions
+
+define spir_func void @defined(i32* %in, i32* %out) { ; helper with a body in this module: *out = *in
+entry:
+  %0 = load i32, i32* %in, align 4
+  store i32 %0, i32* %out, align 4
+  ret void
+}
+
+define spir_func void @dontinline(i32* %in, i32* %out) #0 { ; helper: *out = *in; #0 is the noinline attribute group
+entry:
+  %0 = load i32, i32* %in, align 4
+  store i32 %0, i32* %out, align 4
+  ret void
+}
+
+; Attributes
+
+attributes #0 = { noinline }
+attributes #1 = { noduplicate }
+attributes #2 = { optnone noinline }
+
+; XFAIL: *
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_success_builtin.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_success_builtin.ll
new file mode 100644
index 0000000000000..3298c21625852
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_success_builtin.ll
@@ -0,0 +1,129 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k builtin -vecz-passes=packetizer -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Kernels
+
+define spir_kernel void @instrinsic(float* %in1, float* %in2, float* %in3, float* %out) { ; kernel: out[gid] = fmuladd(in1[gid], in2[gid], in3[gid])
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: get_global_id(0), used as the element index below
+  %arrayidx = getelementptr inbounds float, float* %in1, i64 %call
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds float, float* %in2, i64 %call
+  %1 = load float, float* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds float, float* %in3, i64 %call
+  %2 = load float, float* %arrayidx2, align 4
+  %3 = tail call float @llvm.fmuladd.f32(float %0, float %1, float %2) ; call to an LLVM intrinsic rather than a user or builtin function
+  %arrayidx3 = getelementptr inbounds float, float* %out, i64 %call
+  store float %3, float* %arrayidx3, align 4
+  ret void
+}
+
+define spir_kernel void @builtin(i32* %in, i32* %out) { ; kernel selected by -k on this file's RUN line; the check at the end of the file expects @__vecz_v4_builtin: out[gid] = abs(in[gid])
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: get_global_id(0), used as the element index below
+  %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
+  %0 = load i32, i32* %arrayidx, align 4
+  %call1 = tail call spir_func i32 @_Z3absi(i32 %0) ; _Z3absi is mangled abs(int); it is only declared in this module
+  %arrayidx2 = getelementptr inbounds i32, i32* %out, i64 %call
+  store i32 %call1, i32* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @user_defined(i32* %in, i32* %out) { ; kernel: passes &in[gid] and &out[gid] to @defined
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: get_global_id(0)
+  %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
+  %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
+  call spir_func void @defined(i32* %add.ptr, i32* %add.ptr1) ; callee has a body in this module and no attribute group
+  ret void
+}
+
+define spir_kernel void @user_undefined(i32* %in, i32* %out) { ; kernel: passes &in[gid] and &out[gid] to @undefined
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: get_global_id(0)
+  %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
+  %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
+  call spir_func void @undefined(i32* %add.ptr, i32* %add.ptr1) ; callee is only declared in this module; no body is available
+  ret void
+}
+
+define spir_kernel void @cantinline(i32* %in, i32* %out) { ; kernel: passes &in[gid] and &out[gid] to @dontinline
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: get_global_id(0)
+  %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
+  %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
+  call spir_func void @dontinline(i32* %add.ptr, i32* %add.ptr1) ; callee is defined in this module with attribute group #0 (noinline)
+  ret void
+}
+
+define spir_kernel void @cantduplicate(i32* %in, i32* %out) { ; kernel: out[gid] = clz(in[gid]), with the clz call site marked noduplicate
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: get_global_id(0), used as the element index below
+  %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
+  %0 = load i32, i32* %arrayidx, align 4
+  %call1 = tail call spir_func i32 @_Z3clzi(i32 %0) #1 ; _Z3clzi is mangled clz(int); #1 is the noduplicate attribute group
+  %arrayidx2 = getelementptr inbounds i32, i32* %out, i64 %call
+  store i32 %call1, i32* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @optnone(i32* %in, i32* %out) #2 { ; kernel: out[gid] = in[gid]; #2 is the { optnone noinline } attribute group
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: get_global_id(0); plain call, not marked tail
+  %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %out, i64 %call
+  store i32 %0, i32* %arrayidx1, align 4
+  ret void
+}
+
+; Declaration-only functions
+
+declare float @llvm.fmuladd.f32(float, float, float)
+declare spir_func i32 @_Z3absi(i32)
+declare spir_func i32 @_Z3clzi(i32) #1
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func void @undefined(i32*, i32*)
+
+; Functions with definitions
+
+define spir_func void @defined(i32* %in, i32* %out) { ; helper with a body in this module: *out = *in
+entry:
+  %0 = load i32, i32* %in, align 4
+  store i32 %0, i32* %out, align 4
+  ret void
+}
+
+define spir_func void @dontinline(i32* %in, i32* %out) #0 { ; helper: *out = *in; #0 is the noinline attribute group
+entry:
+  %0 = load i32, i32* %in, align 4
+  store i32 %0, i32* %out, align 4
+  ret void
+}
+
+; Attributes
+
+attributes #0 = { noinline }
+attributes #1 = { noduplicate }
+attributes #2 = { optnone noinline }
+
+; We should be able to handle builtins
+; CHECK: define spir_kernel void @__vecz_v4_builtin
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_success_instrinsic.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_success_instrinsic.ll
new file mode 100644
index 0000000000000..464752bd72179
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_success_instrinsic.ll
@@ -0,0 +1,129 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k instrinsic -vecz-passes=packetizer -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Kernels
+
+define spir_kernel void @instrinsic(float* %in1, float* %in2, float* %in3, float* %out) { ; kernel selected by -k on this file's RUN line; the check at the end of the file expects @__vecz_v4_instrinsic: out[gid] = fmuladd(in1[gid], in2[gid], in3[gid])
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: get_global_id(0), used as the element index below
+  %arrayidx = getelementptr inbounds float, float* %in1, i64 %call
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds float, float* %in2, i64 %call
+  %1 = load float, float* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds float, float* %in3, i64 %call
+  %2 = load float, float* %arrayidx2, align 4
+  %3 = tail call float @llvm.fmuladd.f32(float %0, float %1, float %2) ; call to an LLVM intrinsic rather than a user or builtin function
+  %arrayidx3 = getelementptr inbounds float, float* %out, i64 %call
+  store float %3, float* %arrayidx3, align 4
+  ret void
+}
+
+define spir_kernel void @builtin(i32* %in, i32* %out) { ; kernel: out[gid] = abs(in[gid])
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: get_global_id(0), used as the element index below
+  %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
+  %0 = load i32, i32* %arrayidx, align 4
+  %call1 = tail call spir_func i32 @_Z3absi(i32 %0) ; _Z3absi is mangled abs(int); it is only declared in this module
+  %arrayidx2 = getelementptr inbounds i32, i32* %out, i64 %call
+  store i32 %call1, i32* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @user_defined(i32* %in, i32* %out) { ; kernel: passes &in[gid] and &out[gid] to @defined
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: get_global_id(0)
+  %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
+  %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
+  call spir_func void @defined(i32* %add.ptr, i32* %add.ptr1) ; callee has a body in this module and no attribute group
+  ret void
+}
+
+define spir_kernel void @user_undefined(i32* %in, i32* %out) { ; kernel: passes &in[gid] and &out[gid] to @undefined
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: get_global_id(0)
+  %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
+  %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
+  call spir_func void @undefined(i32* %add.ptr, i32* %add.ptr1) ; callee is only declared in this module; no body is available
+  ret void
+}
+
+define spir_kernel void @cantinline(i32* %in, i32* %out) { ; kernel: passes &in[gid] and &out[gid] to @dontinline
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: get_global_id(0)
+  %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
+  %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
+  call spir_func void @dontinline(i32* %add.ptr, i32* %add.ptr1) ; callee is defined in this module with attribute group #0 (noinline)
+  ret void
+}
+
+define spir_kernel void @cantduplicate(i32* %in, i32* %out) { ; kernel: out[gid] = clz(in[gid]), with the clz call site marked noduplicate
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: get_global_id(0), used as the element index below
+  %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
+  %0 = load i32, i32* %arrayidx, align 4
+  %call1 = tail call spir_func i32 @_Z3clzi(i32 %0) #1 ; _Z3clzi is mangled clz(int); #1 is the noduplicate attribute group
+  %arrayidx2 = getelementptr inbounds i32, i32* %out, i64 %call
+  store i32 %call1, i32* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @optnone(i32* %in, i32* %out) #2 { ; kernel: out[gid] = in[gid]; #2 is the { optnone noinline } attribute group
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: get_global_id(0); plain call, not marked tail
+  %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %out, i64 %call
+  store i32 %0, i32* %arrayidx1, align 4
+  ret void
+}
+
+; Declaration-only functions
+
+declare float @llvm.fmuladd.f32(float, float, float)
+declare spir_func i32 @_Z3absi(i32)
+declare spir_func i32 @_Z3clzi(i32) #1
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func void @undefined(i32*, i32*)
+
+; Functions with definitions
+
+define spir_func void @defined(i32* %in, i32* %out) { ; helper with a body in this module: *out = *in
+entry:
+  %0 = load i32, i32* %in, align 4
+  store i32 %0, i32* %out, align 4
+  ret void
+}
+
+define spir_func void @dontinline(i32* %in, i32* %out) #0 { ; helper: *out = *in; #0 is the noinline attribute group
+entry:
+  %0 = load i32, i32* %in, align 4
+  store i32 %0, i32* %out, align 4
+  ret void
+}
+
+; Attributes
+
+attributes #0 = { noinline }
+attributes #1 = { noduplicate }
+attributes #2 = { optnone noinline }
+
+; We should be able to handle intrinsics
+; CHECK: define spir_kernel void @__vecz_v4_instrinsic
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_success_user_defined.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_success_user_defined.ll
new file mode 100644
index 0000000000000..5ee1bfd24c9be
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_success_user_defined.ll
@@ -0,0 +1,129 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k user_defined -vecz-passes=packetizer -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Kernels
+
+define spir_kernel void @instrinsic(float* %in1, float* %in2, float* %in3, float* %out) { ; kernel: out[gid] = fmuladd(in1[gid], in2[gid], in3[gid])
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: get_global_id(0), used as the element index below
+  %arrayidx = getelementptr inbounds float, float* %in1, i64 %call
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds float, float* %in2, i64 %call
+  %1 = load float, float* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds float, float* %in3, i64 %call
+  %2 = load float, float* %arrayidx2, align 4
+  %3 = tail call float @llvm.fmuladd.f32(float %0, float %1, float %2) ; call to an LLVM intrinsic rather than a user or builtin function
+  %arrayidx3 = getelementptr inbounds float, float* %out, i64 %call
+  store float %3, float* %arrayidx3, align 4
+  ret void
+}
+
+define spir_kernel void @builtin(i32* %in, i32* %out) { ; kernel: out[gid] = abs(in[gid])
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: get_global_id(0), used as the element index below
+  %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
+  %0 = load i32, i32* %arrayidx, align 4
+  %call1 = tail call spir_func i32 @_Z3absi(i32 %0) ; _Z3absi is mangled abs(int); it is only declared in this module
+  %arrayidx2 = getelementptr inbounds i32, i32* %out, i64 %call
+  store i32 %call1, i32* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @user_defined(i32* %in, i32* %out) { ; kernel selected by -k on this file's RUN line; the check at the end of the file expects @__vecz_v4_user_defined: passes &in[gid] and &out[gid] to @defined
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: get_global_id(0)
+  %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
+  %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
+  call spir_func void @defined(i32* %add.ptr, i32* %add.ptr1) ; callee has a body in this module and no attribute group
+  ret void
+}
+
+define spir_kernel void @user_undefined(i32* %in, i32* %out) { ; kernel: passes &in[gid] and &out[gid] to @undefined
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: get_global_id(0)
+  %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
+  %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
+  call spir_func void @undefined(i32* %add.ptr, i32* %add.ptr1) ; callee is only declared in this module; no body is available
+  ret void
+}
+
+define spir_kernel void @cantinline(i32* %in, i32* %out) { ; kernel: passes &in[gid] and &out[gid] to @dontinline
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: get_global_id(0)
+  %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
+  %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
+  call spir_func void @dontinline(i32* %add.ptr, i32* %add.ptr1) ; callee is defined in this module with attribute group #0 (noinline)
+  ret void
+}
+
+define spir_kernel void @cantduplicate(i32* %in, i32* %out) { ; kernel: out[gid] = clz(in[gid]), with the clz call site marked noduplicate
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: get_global_id(0), used as the element index below
+  %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
+  %0 = load i32, i32* %arrayidx, align 4
+  %call1 = tail call spir_func i32 @_Z3clzi(i32 %0) #1 ; _Z3clzi is mangled clz(int); #1 is the noduplicate attribute group
+  %arrayidx2 = getelementptr inbounds i32, i32* %out, i64 %call
+  store i32 %call1, i32* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @optnone(i32* %in, i32* %out) #2 { ; kernel: out[gid] = in[gid]; #2 is the { optnone noinline } attribute group
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: get_global_id(0); plain call, not marked tail
+  %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %out, i64 %call
+  store i32 %0, i32* %arrayidx1, align 4
+  ret void
+}
+
+; Declaration-only functions
+
+declare float @llvm.fmuladd.f32(float, float, float)
+declare spir_func i32 @_Z3absi(i32)
+declare spir_func i32 @_Z3clzi(i32) #1
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func void @undefined(i32*, i32*)
+
+; Functions with definitions
+
+define spir_func void @defined(i32* %in, i32* %out) { ; helper called by @user_defined, with a body in this module: *out = *in
+entry:
+  %0 = load i32, i32* %in, align 4
+  store i32 %0, i32* %out, align 4
+  ret void
+}
+
+define spir_func void @dontinline(i32* %in, i32* %out) #0 { ; helper: *out = *in; #0 is the noinline attribute group
+entry:
+  %0 = load i32, i32* %in, align 4
+  store i32 %0, i32* %out, align 4
+  ret void
+}
+
+; Attributes
+
+attributes #0 = { noinline }
+attributes #1 = { noduplicate }
+attributes #2 = { optnone noinline }
+
+; We should be able to handle user functions for which we have a definition
+; CHECK: define spir_kernel void @__vecz_v4_user_defined
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address.ll
new file mode 100644
index 0000000000000..bd8280cb08250
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address.ll
@@ -0,0 +1,58 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test -w 4 -S < %s | %filecheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test(i32 addrspace(1)* %out) #0 { ; kernel selected by -k on this file's RUN line: every work-item stores its truncated gid to the fixed element out[3]
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0) #1 ; gid: get_global_id(0); #1 is { nounwind readnone }
+  %conv = trunc i64 %gid to i32
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3 ; constant index 3: the address does not depend on gid
+  store i32 %conv, i32 addrspace(1)* %arrayidx, align 4 ; the checks at the end of the file expect this store to stay a scalar i32 store in @__vecz_v4_test
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+
+!opencl.kernels = !{!0}
+!opencl.spir.version = !{!7}
+!opencl.ocl.version = !{!7}
+!opencl.used.extensions = !{!8}
+!opencl.used.optional.core.features = !{!8}
+!opencl.compiler.options = !{!8}
+
+!0 = !{void (i32 addrspace(1)*)* @test, !1, !2, !3, !4, !5, !6}
+!1 = !{!"kernel_arg_addr_space", i32 1}
+!2 = !{!"kernel_arg_access_qual", !"none"}
+!3 = !{!"kernel_arg_type", !"int*"}
+!4 = !{!"kernel_arg_base_type", !"int*"}
+!5 = !{!"kernel_arg_type_qual", !""}
+!6 = !{!"kernel_arg_name", !"out"}
+!7 = !{i32 1, i32 2}
+!8 = !{}
+
+; CHECK: define spir_kernel void @__vecz_v4_test
+; CHECK-NEXT: entry:
+; CHECK-NEXT: %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK-NEXT: %conv = trunc i64 %gid to i32
+; CHECK-NEXT: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %out, i64 3
+; CHECK-NEXT: store i32 %conv, ptr addrspace(1) %arrayidx, align 4
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address_with_uniform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address_with_uniform.ll
new file mode 100644
index 0000000000000..418514047886c
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address_with_uniform.ll
@@ -0,0 +1,41 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test -w 4 -S < %s | %filecheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
+target triple = "spir-unknown-unknown"
+
+declare spir_func i32 @get_global_id(i32);
+
+define spir_kernel void @test(i32 addrspace(1)* %out, i32 addrspace(1)* addrspace(1)* %out2) { ; kernel under test: one store to a constant address, one store of that address to a gid-indexed slot
+entry:
+  %gid = call i32 @get_global_id(i32 0)
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 3 ; constant address &out[3]; also used as the stored value further down
+  store i32 %gid, i32 addrspace(1)* %arrayidx, align 4 ; the expected output keeps this as a scalar i32 store
+
+  %arrayidx2 = getelementptr inbounds i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %out2, i32 %gid ; indexed by %gid, so this address differs per work-item
+  store i32 addrspace(1)* %arrayidx, i32 addrspace(1)* addrspace(1)* %arrayidx2, align 4 ; out2[gid] = &out[3]; the expected output is a <4 x ptr addrspace(1)> store
+
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_test
+; CHECK-NEXT: entry:
+; CHECK-NEXT: %gid = call i32 @get_global_id(i32 0)
+; CHECK-NEXT: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %out, i32 3
+; CHECK: store i32 %gid, ptr addrspace(1) %arrayidx, align 4
+; CHECK: store <4 x ptr addrspace(1)> %{{.+}}, ptr addrspace(1) %{{.+}}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/contiguous_allocas.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/contiguous_allocas.ll
new file mode 100644
index 0000000000000..d0ebb3630d3e4
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/contiguous_allocas.ll
@@ -0,0 +1,74 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test -vecz-simd-width=4 -vecz-auto -vecz-choices=FullScalarization -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+@entry_test_alloca.lm = internal unnamed_addr addrspace(3) constant [16 x <2 x float>] undef, align 8
+
+define spir_kernel void @test(<2 x float> addrspace(1)* nocapture readonly %in, <2 x float> addrspace(1)* nocapture %out, i32 %offset) local_unnamed_addr { ; kernel under test: two allocas accessed with volatile loads/stores inside a nested loop
+entry:
+  %a.sroa.0 = alloca <2 x float>, align 8 ; the expected output widens this to alloca <8 x float>, align 16
+  %b.sroa.2 = alloca <2 x float>, align 8 ; likewise expected as alloca <8 x float>, align 16, directly after the first alloca
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call1 = tail call spir_func i64 @_Z12get_local_idj(i32 0)
+  %a.sroa.0.0..sroa_cast = bitcast <2 x float>* %a.sroa.0 to i8* ; result has no uses in this function
+  %b.sroa.2.0..sroa_cast = bitcast <2 x float>* %b.sroa.2 to i8* ; result has no uses in this function
+  %arrayidx2 = getelementptr inbounds [16 x <2 x float>], [16 x <2 x float>] addrspace(3)* @entry_test_alloca.lm, i64 0, i64 %call1
+  %0 = load <2 x float>, <2 x float> addrspace(3)* %arrayidx2, align 8 ; lm[local_id]
+  %conv = sext i32 %offset to i64
+  %add = add i64 %call1, %conv
+  %arrayidx4 = getelementptr inbounds [16 x <2 x float>], [16 x <2 x float>] addrspace(3)* @entry_test_alloca.lm, i64 0, i64 %add
+  %1 = load <2 x float>, <2 x float> addrspace(3)* %arrayidx4, align 8 ; lm[local_id + offset]
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup10
+  %mul.le.le = fmul <2 x float> %a.sroa.0.0.a.sroa.0.0.a.sroa.0.0., %b.sroa.2.0.b.sroa.2.0.b.sroa.2.8. ; product of the two values loaded in for.body11
+  %arrayidx17 = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %out, i64 %call
+  store <2 x float> %mul.le.le, <2 x float> addrspace(1)* %arrayidx17, align 8 ; out[global_id] = product
+  ret void
+
+for.body:                                         ; preds = %for.cond.cleanup10, %entry
+  %i.038 = phi i32 [ 0, %entry ], [ %inc15, %for.cond.cleanup10 ] ; outer loop counter
+  store volatile <2 x float> %0, <2 x float>* %a.sroa.0, align 8 ; volatile store into the first alloca
+  store volatile <2 x float> %1, <2 x float>* %b.sroa.2, align 8 ; volatile store into the second alloca
+  br label %for.body11
+
+for.cond.cleanup10:                               ; preds = %for.body11
+  %inc15 = add nuw nsw i32 %i.038, 1
+  %cmp = icmp ult i32 %inc15, 16 ; outer loop continues while the incremented counter is below 16
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.body11:                                       ; preds = %for.body11, %for.body
+  %i6.037 = phi i32 [ 0, %for.body ], [ %inc, %for.body11 ] ; inner loop counter
+  %a.sroa.0.0.a.sroa.0.0.a.sroa.0.0. = load volatile <2 x float>, <2 x float>* %a.sroa.0, align 8 ; volatile reload from the first alloca
+  %b.sroa.2.0.b.sroa.2.0.b.sroa.2.8. = load volatile <2 x float>, <2 x float>* %b.sroa.2, align 8 ; volatile reload from the second alloca
+  %inc = add nuw nsw i32 %i6.037, 1
+  %cmp8 = icmp ult i32 %inc, 16 ; inner loop continues while the incremented counter is below 16
+  br i1 %cmp8, label %for.body11, label %for.cond.cleanup10
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32) local_unnamed_addr
+declare spir_func i64 @_Z12get_local_idj(i32) local_unnamed_addr
+
+; Check that all the allocas come before anything else
+; CHECK: define spir_kernel void @__vecz_v4_test(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: %a.sroa.{{[0-9]+}} = alloca <8 x float>, align 16
+; CHECK-NEXT: %b.sroa.{{[0-9]+}} = alloca <8 x float>, align 16
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_nested_loops.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_nested_loops.ll
new file mode 100644
index 0000000000000..e53f3088929b6
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_nested_loops.ll
@@ -0,0 +1,208 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_nested_loops -vecz-passes=cfg-convert -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test_uniform_if(i32 %a, i32* %b) { ; branch condition uses only the argument %a; no expectations in this file reference this kernel
+entry:
+  %cmp = icmp eq i32 %a, 1 ; no work-item ID is involved in the condition
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  %idxprom = sext i32 %a to i64
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+  store i32 11, i32* %arrayidx, align 4 ; b[a] = 11
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %arrayidx1 = getelementptr inbounds i32, i32* %b, i64 42
+  store i32 13, i32* %arrayidx1, align 4 ; b[42] = 13
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  ret void
+}
+
+define spir_kernel void @test_varying_if(i32 %a, i32* %b) { ; branch condition compares %a with get_global_id(0); no expectations in this file reference this kernel
+entry:
+  %conv = sext i32 %a to i64
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %cmp = icmp eq i64 %conv, %call ; condition depends on the global ID
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  %idxprom = sext i32 %a to i64
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+  store i32 11, i32* %arrayidx, align 4 ; b[a] = 11
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 42
+  store i32 13, i32* %arrayidx2, align 4 ; b[42] = 13
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  ret void
+}
+
+define spir_kernel void @test_uniform_loop(i32 %a, i32* %b)  { ; loop bounds are constants; only the store address uses the global ID; no expectations in this file reference this kernel
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %conv = trunc i64 %call to i32
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %storemerge = phi i32 [ 0, %entry ], [ %inc, %for.body ] ; induction variable i, starting at 0
+  %cmp = icmp slt i32 %storemerge, 16
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %add = add nsw i32 %storemerge, %a
+  %add2 = add nsw i32 %storemerge, %conv
+  %idxprom = sext i32 %add2 to i64
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+  store i32 %add, i32* %arrayidx, align 4 ; b[i + gid] = i + a
+  %inc = add nsw i32 %storemerge, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+define spir_kernel void @test_varying_loop(i32 %a, i32* %b) { ; loop start value depends on get_global_id(0); no expectations in this file reference this kernel
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %conv = trunc i64 %call to i32
+  %sub = sub nsw i32 16, %conv ; induction variable starts at 16 - gid
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %storemerge = phi i32 [ %sub, %entry ], [ %inc, %for.body ] ; induction variable i
+  %cmp = icmp slt i32 %storemerge, 16
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %add = add nsw i32 %storemerge, %a
+  %add2 = add nsw i32 %storemerge, %conv
+  %idxprom = sext i32 %add2 to i64
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+  store i32 %add, i32* %arrayidx, align 4 ; b[i + gid] = i + a
+  %inc = add nsw i32 %storemerge, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+define spir_kernel void @test_nested_loops(i32* %a, i32* %b)  { ; the kernel named by -k in this file's run line; both loops start from values derived from get_global_id(0)
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %conv = trunc i64 %call to i32
+  %sub = sub nsw i32 16, %conv ; outer induction variable starts at 16 - gid
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc12, %entry
+  %storemerge = phi i32 [ %sub, %entry ], [ %inc13, %for.inc12 ] ; outer induction variable i
+  %cmp = icmp slt i32 %storemerge, 16
+  br i1 %cmp, label %for.body, label %for.end14
+
+for.body:                                         ; preds = %for.cond
+  %sub2 = sub nsw i32 24, %conv ; inner induction variable starts at 24 - gid
+  br label %for.cond3
+
+for.cond3:                                        ; preds = %for.body6, %for.body
+  %storemerge1 = phi i32 [ %sub2, %for.body ], [ %inc, %for.body6 ] ; inner induction variable j
+  %cmp4 = icmp slt i32 %storemerge, 24 ; NOTE: compares the outer variable %storemerge, not %storemerge1; the expected output matches on exactly this
+  br i1 %cmp4, label %for.body6, label %for.inc12
+
+for.body6:                                        ; preds = %for.cond3
+  %add = add nsw i32 %storemerge1, %conv
+  %idxprom = sext i32 %add to i64
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %idxprom
+  %0 = load i32, i32* %arrayidx, align 4 ; a[j + gid]
+  %add7 = add i32 %storemerge1, %storemerge
+  %add8 = add i32 %add7, %0 ; j + i + a[j + gid]
+  %add9 = add nsw i32 %storemerge, %conv
+  %idxprom10 = sext i32 %add9 to i64
+  %arrayidx11 = getelementptr inbounds i32, i32* %b, i64 %idxprom10
+  store i32 %add8, i32* %arrayidx11, align 4 ; b[i + gid] = j + i + a[j + gid]
+  %inc = add nsw i32 %storemerge1, 1
+  br label %for.cond3
+
+for.inc12:                                        ; preds = %for.cond3
+  %inc13 = add nsw i32 %storemerge, 1
+  br label %for.cond
+
+for.end14:                                        ; preds = %for.cond
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; A nested loop, in the form of
+;
+;  int gid = get_global_id(0);
+;  for (int i = 16 - gid; i < 16; ++i) {
+;    for (int j = 24 - gid; i < 24; ++j) {  // sic: the IR's %cmp4 compares i (%storemerge), not j
+;      b[i + gid] = a[j + gid] + i + j;
+;    }
+;  }
+;
+; The important bit is that both of the loops have their iterations dependent on
+; the global ID
+; CHECK: define spir_kernel void @__vecz_v4_test_nested_loops(ptr %a, ptr %b)
+; CHECK: entry:
+; CHECK: br label %[[FORCOND:.+]]
+
+; CHECK: [[FORCOND]]:
+; CHECK: %[[ENTRYMASK_FORCOND:.+]] = phi i1 [ true, %entry ], [ %[[FORINC12EXITMASK3:.+]], %[[FORINC12:.+]] ]
+; CHECK: %[[EXITMASK1:.+]] = phi i1 [ false, %entry ], [ %[[LOOPEXITMASK2:.+]], %[[FORINC12]] ]
+; CHECK: %[[CMP:.+]] = icmp slt i32 %[[STOREMERGE:.+]], 16
+; CHECK: %[[EDGEMASK_FORBODY:.+]] = and i1 %[[ENTRYMASK_FORCOND]], %[[CMP]]
+; CHECK: %[[NOT_CMP:.+]] = xor i1 %[[CMP]], true
+; CHECK: %[[EDGEMASK_FOREND14:.+]] = and i1 %[[ENTRYMASK_FORCOND]], %[[NOT_CMP]]
+; CHECK: %[[LOOPEXITMASK2]] = or i1 %[[EXITMASK1]], %[[EDGEMASK_FOREND14]]
+; CHECK: br label %[[FORBODY:.+]]
+
+; CHECK: [[FORBODY]]:
+; CHECK: br label %[[FORCOND3:.+]]
+
+; CHECK: [[FORCOND3]]:
+; CHECK: %[[ENTRYMASK_FORCOND3:.+]] = phi i1 [ %[[EDGEMASK_FORBODY:.+]], %[[FORBODY]] ], [ %[[FORBODY6EXITMASK:.+]], %[[FORBODY6:.+]] ]
+; CHECK: %[[PREVEXITMASK:.+]] = phi i1 [ false, %[[FORBODY]] ], [ %[[FORINC12LOOPEXITMASKUPDATE:.+]], %[[FORBODY6]] ]
+; CHECK: %[[CMP4:.+]] = icmp slt i32 %[[STOREMERGE]], 24
+; CHECK: %[[EDGEMASK_FORBODY6:.+]] = and i1 %[[ENTRYMASK_FORCOND3]], %[[CMP4]]
+; CHECK: %[[NOT_CMP4:.+]] = xor i1 %[[CMP4]], true
+; CHECK: %[[EDGEMASK_FORINC12:.+]] = and i1 %[[ENTRYMASK_FORCOND3]], %[[NOT_CMP4]]
+; CHECK: %[[FORINC12LOOPEXITMASKUPDATE]] = or i1 %[[PREVEXITMASK]], %[[EDGEMASK_FORINC12]]
+; CHECK: br label %[[FORBODY6:.+]]
+
+; CHECK: [[FORBODY6]]:
+; CHECK: %[[MGL:.+]] = call i32 @__vecz_b_masked_load4_ju3ptrb(ptr %{{.+}}, i1 %[[EDGEMASK_FORBODY6]])
+; CHECK: %[[ADD8:.+]] = add i32 %{{.+}}, %[[MGL]]
+; CHECK: call void @__vecz_b_masked_store4_ju3ptrb(i32 %[[ADD8]], ptr %{{.+}}, i1 %[[EDGEMASK_FORBODY6]])
+; CHECK: %[[FORBODY6EXITMASK_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[FORBODY6EXITMASK]])
+; CHECK: br i1 %[[FORBODY6EXITMASK_ANY]], label %[[FORCOND3:.+]], label %[[FORINC12:.+]]
+
+; CHECK: [[FORINC12]]:
+; CHECK: %[[FORINC12LOOPEXITMASKUPDATE_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[FORINC12LOOPEXITMASKUPDATE]])
+; CHECK: br i1 %[[FORINC12LOOPEXITMASKUPDATE_ANY]], label %[[FORCOND:.+]], label %[[FOREND14:.+]]
+
+; CHECK: [[FOREND14]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_order_y.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_order_y.ll
new file mode 100644
index 0000000000000..863cb33395418
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_order_y.ll
@@ -0,0 +1,208 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_nested_loops -vecz-passes=cfg-convert -vecz-simd-width=4 -d 1 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test_uniform_if(i32 %a, i32* %b) { ; branch condition uses only the argument %a; no expectations in this file reference this kernel
+entry:
+  %cmp = icmp eq i32 %a, 1 ; no work-item ID is involved in the condition
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  %idxprom = sext i32 %a to i64
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+  store i32 11, i32* %arrayidx, align 4 ; b[a] = 11
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %arrayidx1 = getelementptr inbounds i32, i32* %b, i64 42
+  store i32 13, i32* %arrayidx1, align 4 ; b[42] = 13
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  ret void
+}
+
+define spir_kernel void @test_varying_if(i32 %a, i32* %b) { ; branch condition compares %a with get_global_id(1); no expectations in this file reference this kernel
+entry:
+  %conv = sext i32 %a to i64
+  %call = call spir_func i64 @_Z13get_global_idj(i32 1)
+  %cmp = icmp eq i64 %conv, %call ; condition depends on the global ID in dimension 1
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  %idxprom = sext i32 %a to i64
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+  store i32 11, i32* %arrayidx, align 4 ; b[a] = 11
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 42
+  store i32 13, i32* %arrayidx2, align 4 ; b[42] = 13
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  ret void
+}
+
+define spir_kernel void @test_uniform_loop(i32 %a, i32* %b)  { ; loop bounds are constants; only the store address uses get_global_id(1); no expectations in this file reference this kernel
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 1)
+  %conv = trunc i64 %call to i32
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %storemerge = phi i32 [ 0, %entry ], [ %inc, %for.body ] ; induction variable i, starting at 0
+  %cmp = icmp slt i32 %storemerge, 16
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %add = add nsw i32 %storemerge, %a
+  %add2 = add nsw i32 %storemerge, %conv
+  %idxprom = sext i32 %add2 to i64
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+  store i32 %add, i32* %arrayidx, align 4 ; b[i + gid] = i + a
+  %inc = add nsw i32 %storemerge, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+define spir_kernel void @test_varying_loop(i32 %a, i32* %b) { ; loop start value depends on get_global_id(1); no expectations in this file reference this kernel
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 1)
+  %conv = trunc i64 %call to i32
+  %sub = sub nsw i32 16, %conv ; induction variable starts at 16 - gid
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %storemerge = phi i32 [ %sub, %entry ], [ %inc, %for.body ] ; induction variable i
+  %cmp = icmp slt i32 %storemerge, 16
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %add = add nsw i32 %storemerge, %a
+  %add2 = add nsw i32 %storemerge, %conv
+  %idxprom = sext i32 %add2 to i64
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+  store i32 %add, i32* %arrayidx, align 4 ; b[i + gid] = i + a
+  %inc = add nsw i32 %storemerge, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+define spir_kernel void @test_nested_loops(i32* %a, i32* %b)  { ; the kernel named by -k in this file's run line; both loops start from values derived from get_global_id(1)
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 1)
+  %conv = trunc i64 %call to i32
+  %sub = sub nsw i32 16, %conv ; outer induction variable starts at 16 - gid
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc12, %entry
+  %storemerge = phi i32 [ %sub, %entry ], [ %inc13, %for.inc12 ] ; outer induction variable i
+  %cmp = icmp slt i32 %storemerge, 16
+  br i1 %cmp, label %for.body, label %for.end14
+
+for.body:                                         ; preds = %for.cond
+  %sub2 = sub nsw i32 24, %conv ; inner induction variable starts at 24 - gid
+  br label %for.cond3
+
+for.cond3:                                        ; preds = %for.body6, %for.body
+  %storemerge1 = phi i32 [ %sub2, %for.body ], [ %inc, %for.body6 ] ; inner induction variable j
+  %cmp4 = icmp slt i32 %storemerge, 24 ; NOTE: compares the outer variable %storemerge, not %storemerge1; the expected output matches on exactly this
+  br i1 %cmp4, label %for.body6, label %for.inc12
+
+for.body6:                                        ; preds = %for.cond3
+  %add = add nsw i32 %storemerge1, %conv
+  %idxprom = sext i32 %add to i64
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %idxprom
+  %0 = load i32, i32* %arrayidx, align 4 ; a[j + gid]
+  %add7 = add i32 %storemerge1, %storemerge
+  %add8 = add i32 %add7, %0 ; j + i + a[j + gid]
+  %add9 = add nsw i32 %storemerge, %conv
+  %idxprom10 = sext i32 %add9 to i64
+  %arrayidx11 = getelementptr inbounds i32, i32* %b, i64 %idxprom10
+  store i32 %add8, i32* %arrayidx11, align 4 ; b[i + gid] = j + i + a[j + gid]
+  %inc = add nsw i32 %storemerge1, 1
+  br label %for.cond3
+
+for.inc12:                                        ; preds = %for.cond3
+  %inc13 = add nsw i32 %storemerge, 1
+  br label %for.cond
+
+for.end14:                                        ; preds = %for.cond
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; A nested loop, in the form of
+;
+;  int gid = get_global_id(1);
+;  for (int i = 16 - gid; i < 16; ++i) {
+;    for (int j = 24 - gid; i < 24; ++j) {  // sic: the IR's %cmp4 compares i (%storemerge), not j
+;      b[i + gid] = a[j + gid] + i + j;
+;    }
+;  }
+;
+; The important bit is that both of the loops have their iterations dependent on
+; the global ID
+; CHECK: define spir_kernel void @__vecz_v4_test_nested_loops(ptr %a, ptr %b)
+; CHECK: entry:
+; CHECK: br label %[[FORCOND:.+]]
+
+; CHECK: [[FORCOND]]:
+; CHECK: %[[ENTRYMASK_FORCOND:.+]] = phi i1 [ true, %entry ], [ %[[FORINC12EXITMASK3:.+]], %[[FORINC12:.+]] ]
+; CHECK: %[[EXITMASK1:.+]] = phi i1 [ false, %entry ], [ %[[LOOPEXITMASK2:.+]], %[[FORINC12]] ]
+; CHECK: %[[CMP:.+]] = icmp slt i32 %[[STOREMERGE:.+]], 16
+; CHECK: %[[EDGEMASK_FORBODY:.+]] = and i1 %[[ENTRYMASK_FORCOND]], %[[CMP]]
+; CHECK: %[[NOT_CMP:.+]] = xor i1 %[[CMP]], true
+; CHECK: %[[EDGEMASK_FOREND14:.+]] = and i1 %[[ENTRYMASK_FORCOND]], %[[NOT_CMP]]
+; CHECK: %[[LOOPEXITMASK2]] = or i1 %[[EXITMASK1]], %[[EDGEMASK_FOREND14]]
+; CHECK: br label %[[FORBODY:.+]]
+
+; CHECK: [[FORBODY]]:
+; CHECK: br label %[[FORCOND3:.+]]
+
+; CHECK: [[FORCOND3]]:
+; CHECK: %[[ENTRYMASK_FORCOND3:.+]] = phi i1 [ %[[EDGEMASK_FORBODY:.+]], %[[FORBODY]] ], [ %[[FORBODY6EXITMASK:.+]], %[[FORBODY6:.+]] ]
+; CHECK: %[[PREVEXITMASK:.+]] = phi i1 [ false, %[[FORBODY]] ], [ %[[FORINC12LOOPEXITMASKUPDATE:.+]], %[[FORBODY6]] ]
+; CHECK: %[[CMP4:.+]] = icmp slt i32 %[[STOREMERGE]], 24
+; CHECK: %[[EDGEMASK_FORBODY6:.+]] = and i1 %[[ENTRYMASK_FORCOND3]], %[[CMP4]]
+; CHECK: %[[NOT_CMP4:.+]] = xor i1 %[[CMP4]], true
+; CHECK: %[[EDGEMASK_FORINC12:.+]] = and i1 %[[ENTRYMASK_FORCOND3]], %[[NOT_CMP4]]
+; CHECK: %[[FORINC12LOOPEXITMASKUPDATE]] = or i1 %[[PREVEXITMASK]], %[[EDGEMASK_FORINC12]]
+; CHECK: br label %[[FORBODY6:.+]]
+
+; CHECK: [[FORBODY6]]:
+; CHECK: %[[MGL:.+]] = call i32 @__vecz_b_masked_load4_ju3ptrb(ptr %{{.+}}, i1 %[[EDGEMASK_FORBODY6]])
+; CHECK: %[[ADD8:.+]] = add i32 %{{.+}}, %[[MGL]]
+; CHECK: call void @__vecz_b_masked_store4_ju3ptrb(i32 %[[ADD8]], ptr %{{.+}}, i1 %[[EDGEMASK_FORBODY6]])
+; CHECK: %[[FORBODY6EXITMASK_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[FORBODY6EXITMASK]])
+; CHECK: br i1 %[[FORBODY6EXITMASK_ANY]], label %[[FORCOND3:.+]], label %[[FORINC12:.+]]
+
+; CHECK: [[FORINC12]]:
+; CHECK: %[[FORINC12LOOPEXITMASKUPDATE_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[FORINC12LOOPEXITMASKUPDATE]])
+; CHECK: br i1 %[[FORINC12LOOPEXITMASKUPDATE_ANY]], label %[[FORCOND:.+]], label %[[FOREND14:.+]]
+
+; CHECK: [[FOREND14]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_order_z.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_order_z.ll
new file mode 100644
index 0000000000000..2545484f95565
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_order_z.ll
@@ -0,0 +1,208 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_nested_loops -vecz-passes=cfg-convert -vecz-simd-width=4 -d 2 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test_uniform_if(i32 %a, i32* %b) { ; branch condition uses only the argument %a; no expectations in this file reference this kernel
+entry:
+  %cmp = icmp eq i32 %a, 1 ; no work-item ID is involved in the condition
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  %idxprom = sext i32 %a to i64
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+  store i32 11, i32* %arrayidx, align 4 ; b[a] = 11
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %arrayidx1 = getelementptr inbounds i32, i32* %b, i64 42
+  store i32 13, i32* %arrayidx1, align 4 ; b[42] = 13
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  ret void
+}
+
+define spir_kernel void @test_varying_if(i32 %a, i32* %b) { ; branch condition compares %a with get_global_id(2); no expectations in this file reference this kernel
+entry:
+  %conv = sext i32 %a to i64
+  %call = call spir_func i64 @_Z13get_global_idj(i32 2)
+  %cmp = icmp eq i64 %conv, %call ; condition depends on the global ID in dimension 2
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  %idxprom = sext i32 %a to i64
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+  store i32 11, i32* %arrayidx, align 4 ; b[a] = 11
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 42
+  store i32 13, i32* %arrayidx2, align 4 ; b[42] = 13
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  ret void
+}
+
+define spir_kernel void @test_uniform_loop(i32 %a, i32* %b)  { ; loop bounds are constants; only the store address uses get_global_id(2); no expectations in this file reference this kernel
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 2)
+  %conv = trunc i64 %call to i32
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %storemerge = phi i32 [ 0, %entry ], [ %inc, %for.body ] ; induction variable i, starting at 0
+  %cmp = icmp slt i32 %storemerge, 16
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %add = add nsw i32 %storemerge, %a
+  %add2 = add nsw i32 %storemerge, %conv
+  %idxprom = sext i32 %add2 to i64
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+  store i32 %add, i32* %arrayidx, align 4 ; b[i + gid] = i + a
+  %inc = add nsw i32 %storemerge, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+define spir_kernel void @test_varying_loop(i32 %a, i32* %b) { ; loop start value depends on get_global_id(2); no expectations in this file reference this kernel
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 2)
+  %conv = trunc i64 %call to i32
+  %sub = sub nsw i32 16, %conv ; induction variable starts at 16 - gid
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %storemerge = phi i32 [ %sub, %entry ], [ %inc, %for.body ] ; induction variable i
+  %cmp = icmp slt i32 %storemerge, 16
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %add = add nsw i32 %storemerge, %a
+  %add2 = add nsw i32 %storemerge, %conv
+  %idxprom = sext i32 %add2 to i64
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+  store i32 %add, i32* %arrayidx, align 4 ; b[i + gid] = i + a
+  %inc = add nsw i32 %storemerge, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+define spir_kernel void @test_nested_loops(i32* %a, i32* %b)  { ; the kernel named by -k in this file's run line; both loops start from values derived from get_global_id(2)
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 2)
+  %conv = trunc i64 %call to i32
+  %sub = sub nsw i32 16, %conv ; outer induction variable starts at 16 - gid
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc12, %entry
+  %storemerge = phi i32 [ %sub, %entry ], [ %inc13, %for.inc12 ] ; outer induction variable i
+  %cmp = icmp slt i32 %storemerge, 16
+  br i1 %cmp, label %for.body, label %for.end14
+
+for.body:                                         ; preds = %for.cond
+  %sub2 = sub nsw i32 24, %conv ; inner induction variable starts at 24 - gid
+  br label %for.cond3
+
+for.cond3:                                        ; preds = %for.body6, %for.body
+  %storemerge1 = phi i32 [ %sub2, %for.body ], [ %inc, %for.body6 ] ; inner induction variable j
+  %cmp4 = icmp slt i32 %storemerge, 24 ; NOTE: compares the outer variable %storemerge, not %storemerge1; the expected output matches on exactly this
+  br i1 %cmp4, label %for.body6, label %for.inc12
+
+for.body6:                                        ; preds = %for.cond3
+  %add = add nsw i32 %storemerge1, %conv
+  %idxprom = sext i32 %add to i64
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %idxprom
+  %0 = load i32, i32* %arrayidx, align 4 ; a[j + gid]
+  %add7 = add i32 %storemerge1, %storemerge
+  %add8 = add i32 %add7, %0 ; j + i + a[j + gid]
+  %add9 = add nsw i32 %storemerge, %conv
+  %idxprom10 = sext i32 %add9 to i64
+  %arrayidx11 = getelementptr inbounds i32, i32* %b, i64 %idxprom10
+  store i32 %add8, i32* %arrayidx11, align 4 ; b[i + gid] = j + i + a[j + gid]
+  %inc = add nsw i32 %storemerge1, 1
+  br label %for.cond3
+
+for.inc12:                                        ; preds = %for.cond3
+  %inc13 = add nsw i32 %storemerge, 1
+  br label %for.cond
+
+for.end14:                                        ; preds = %for.cond
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; A nested loop, in the form of
+;
+;  int gid = get_global_id(2);
+;  for (int i = 16 - gid; i < 16; ++i) {
+;    for (int j = 24 - gid; i < 24; ++j) {  // sic: the IR's %cmp4 compares i (%storemerge), not j
+;      b[i + gid] = a[j + gid] + i + j;
+;    }
+;  }
+;
+; The important bit is that both of the loops have their iterations dependent on
+; the global ID
+; CHECK: define spir_kernel void @__vecz_v4_test_nested_loops(ptr %a, ptr %b)
+; CHECK: entry:
+; CHECK: br label %[[FORCOND:.+]]
+
+; CHECK: [[FORCOND]]:
+; CHECK: %[[ENTRYMASK_FORCOND:.+]] = phi i1 [ true, %entry ], [ %[[FORINC12EXITMASK3:.+]], %[[FORINC12:.+]] ]
+; CHECK: %[[EXITMASK1:.+]] = phi i1 [ false, %entry ], [ %[[LOOPEXITMASK2:.+]], %[[FORINC12]] ]
+; CHECK: %[[CMP:.+]] = icmp slt i32 %[[STOREMERGE:.+]], 16
+; CHECK: %[[EDGEMASK_FORBODY:.+]] = and i1 %[[ENTRYMASK_FORCOND]], %[[CMP]]
+; CHECK: %[[NOT_CMP:.+]] = xor i1 %[[CMP]], true
+; CHECK: %[[EDGEMASK_FOREND14:.+]] = and i1 %[[ENTRYMASK_FORCOND]], %[[NOT_CMP]]
+; CHECK: %[[LOOPEXITMASK2]] = or i1 %[[EXITMASK1]], %[[EDGEMASK_FOREND14]]
+; CHECK: br label %[[FORBODY:.+]]
+
+; CHECK: [[FORBODY]]:
+; CHECK: br label %[[FORCOND3:.+]]
+
+; CHECK: [[FORCOND3]]:
+; CHECK: %[[ENTRYMASK_FORCOND3:.+]] = phi i1 [ %[[EDGEMASK_FORBODY:.+]], %[[FORBODY]] ], [ %[[FORBODY6EXITMASK:.+]], %[[FORBODY6:.+]] ]
+; CHECK: %[[PREVEXITMASK:.+]] = phi i1 [ false, %[[FORBODY]] ], [ %[[FORINC12LOOPEXITMASKUPDATE:.+]], %[[FORBODY6]] ]
+; CHECK: %[[CMP4:.+]] = icmp slt i32 %[[STOREMERGE]], 24
+; CHECK: %[[EDGEMASK_FORBODY6:.+]] = and i1 %[[ENTRYMASK_FORCOND3]], %[[CMP4]]
+; CHECK: %[[NOT_CMP4:.+]] = xor i1 %[[CMP4]], true
+; CHECK: %[[EDGEMASK_FORINC12:.+]] = and i1 %[[ENTRYMASK_FORCOND3]], %[[NOT_CMP4]]
+; CHECK: %[[FORINC12LOOPEXITMASKUPDATE]] = or i1 %[[PREVEXITMASK]], %[[EDGEMASK_FORINC12]]
+; CHECK: br label %[[FORBODY6:.+]]
+
+; CHECK: [[FORBODY6]]:
+; CHECK: %[[MGL:.+]] = call i32 @__vecz_b_masked_load4_ju3ptrb(ptr %{{.+}}, i1 %[[EDGEMASK_FORBODY6]])
+; CHECK: %[[ADD8:.+]] = add i32 %{{.+}}, %[[MGL]]
+; CHECK: call void @__vecz_b_masked_store4_ju3ptrb(i32 %[[ADD8]], ptr %{{.+}}, i1 %[[EDGEMASK_FORBODY6]])
+; CHECK: %[[FORBODY6EXITMASK_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[FORBODY6EXITMASK]])
+; CHECK: br i1 %[[FORBODY6EXITMASK_ANY]], label %[[FORCOND3:.+]], label %[[FORINC12:.+]]
+
+; CHECK: [[FORINC12]]:
+; CHECK: %[[FORINC12LOOPEXITMASKUPDATE_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[FORINC12LOOPEXITMASKUPDATE]])
+; CHECK: br i1 %[[FORINC12LOOPEXITMASKUPDATE_ANY]], label %[[FORCOND:.+]], label %[[FOREND14:.+]]
+
+; CHECK: [[FOREND14]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_ptrs.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_ptrs.ll
new file mode 100644
index 0000000000000..bc01f0df0f2d7
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_ptrs.ll
@@ -0,0 +1,73 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -vecz-passes=cfg-convert,define-builtins -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+define spir_kernel void @test_varying_if_ptr(i32 %a, i32** %b, i32* %on_true, i32* %on_false) { ; stores pointer values on both sides of a global-ID-dependent branch
+entry:
+  %conv = sext i32 %a to i64
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %cmp = icmp eq i64 %conv, %call ; branch condition depends on the global ID
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+  %idxprom = sext i32 %a to i64
+  %arrayidx = getelementptr inbounds i32*, i32** %b, i64 %idxprom
+  store i32* %on_true, i32** %arrayidx, align 4 ; the stored value is itself a pointer
+  br label %if.end
+
+if.else:
+  %arrayidx2 = getelementptr inbounds i32*, i32** %b, i64 42
+  store i32* %on_false, i32** %arrayidx2, align 4 ; the stored value is itself a pointer
+  br label %if.end
+
+if.end:
+  ret void ; the expectation lines that follow are IR comments placed before the closing brace
+; CHECK:     define void @__vecz_b_masked_store4_u3ptru3ptrb(ptr [[A:%.*]], ptr [[B:%.*]], i1 [[MASK:%.*]]) {
+; CHECK:       br i1 [[MASK]], label %[[IF:.*]], label %[[EXIT:.*]]
+; CHECK:     [[IF]]:
+; CHECK-NEXT:  store ptr [[A]], ptr [[B]], align 4
+; CHECK-NEXT:  br label %[[EXIT]]
+; CHECK:     [[EXIT]]:
+; CHECK-NEXT:  ret void
+}
+
+define spir_kernel void @test_varying_if_ptrptr(i32 %a, i32*** %b, i32** %on_true, i32** %on_false) { ; same shape as test_varying_if_ptr with one more pointer level; no expectation lines follow it in this file
+entry:
+  %conv = sext i32 %a to i64
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %cmp = icmp eq i64 %conv, %call ; branch condition depends on the global ID
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+  %idxprom = sext i32 %a to i64
+  %arrayidx = getelementptr inbounds i32**, i32*** %b, i64 %idxprom
+  store i32** %on_true, i32*** %arrayidx, align 4 ; the stored value is a pointer to a pointer
+  br label %if.end
+
+if.else:
+  %arrayidx2 = getelementptr inbounds i32**, i32*** %b, i64 42
+  store i32** %on_false, i32*** %arrayidx2, align 4 ; the stored value is a pointer to a pointer
+  br label %if.end
+
+if.end:
+  ret void
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_uniform_if.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_uniform_if.ll
new file mode 100644
index 0000000000000..02432d3e7d090
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_uniform_if.ll
@@ -0,0 +1,168 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_uniform_if -vecz-passes=cfg-convert -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test_uniform_if(i32 %a, i32* %b) { ; the kernel named by -k on this file's RUN line
+entry:
+  %cmp = icmp eq i32 %a, 1 ; branch condition is computed from kernel argument %a only
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  %idxprom = sext i32 %a to i64
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+  store i32 11, i32* %arrayidx, align 4 ; expected to remain a plain store in the checked output
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %arrayidx1 = getelementptr inbounds i32, i32* %b, i64 42
+  store i32 13, i32* %arrayidx1, align 4 ; expected to remain a plain store in the checked output
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  ret void
+}
+
+define spir_kernel void @test_varying_if(i32 %a, i32* %b) { ; not the kernel named by -k on this file's RUN line
+entry:
+  %conv = sext i32 %a to i64
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %cmp = icmp eq i64 %conv, %call ; branch condition depends on the global ID
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  %idxprom = sext i32 %a to i64
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+  store i32 11, i32* %arrayidx, align 4
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 42
+  store i32 13, i32* %arrayidx2, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  ret void
+}
+
+define spir_kernel void @test_uniform_loop(i32 %a, i32* %b)  { ; not the kernel named by -k on this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %conv = trunc i64 %call to i32
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %storemerge = phi i32 [ 0, %entry ], [ %inc, %for.body ] ; counter starts at the constant 0
+  %cmp = icmp slt i32 %storemerge, 16 ; exit test uses only the counter and a constant
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %add = add nsw i32 %storemerge, %a
+  %add2 = add nsw i32 %storemerge, %conv ; the store index does depend on the global ID
+  %idxprom = sext i32 %add2 to i64
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+  store i32 %add, i32* %arrayidx, align 4
+  %inc = add nsw i32 %storemerge, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+define spir_kernel void @test_varying_loop(i32 %a, i32* %b) { ; not the kernel named by -k on this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %conv = trunc i64 %call to i32
+  %sub = sub nsw i32 16, %conv ; loop start value depends on the global ID
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %storemerge = phi i32 [ %sub, %entry ], [ %inc, %for.body ]
+  %cmp = icmp slt i32 %storemerge, 16 ; exit test therefore depends on the global ID too
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %add = add nsw i32 %storemerge, %a
+  %add2 = add nsw i32 %storemerge, %conv
+  %idxprom = sext i32 %add2 to i64
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+  store i32 %add, i32* %arrayidx, align 4
+  %inc = add nsw i32 %storemerge, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+define spir_kernel void @test_nested_loops(i32* %a, i32* %b)  { ; not the kernel named by -k on this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %conv = trunc i64 %call to i32
+  %sub = sub nsw i32 16, %conv ; outer start value depends on the global ID
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc12, %entry
+  %storemerge = phi i32 [ %sub, %entry ], [ %inc13, %for.inc12 ]
+  %cmp = icmp slt i32 %storemerge, 16
+  br i1 %cmp, label %for.body, label %for.end14
+
+for.body:                                         ; preds = %for.cond
+  %sub2 = sub nsw i32 24, %conv ; inner start value depends on the global ID
+  br label %for.cond3
+
+for.cond3:                                        ; preds = %for.body6, %for.body
+  %storemerge1 = phi i32 [ %sub2, %for.body ], [ %inc, %for.body6 ]
+  %cmp4 = icmp slt i32 %storemerge, 24 ; NOTE(review): tests the outer counter %storemerge, not %storemerge1 - confirm this is intended
+  br i1 %cmp4, label %for.body6, label %for.inc12
+
+for.body6:                                        ; preds = %for.cond3
+  %add = add nsw i32 %storemerge1, %conv
+  %idxprom = sext i32 %add to i64
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %idxprom
+  %0 = load i32, i32* %arrayidx, align 4
+  %add7 = add i32 %storemerge1, %storemerge
+  %add8 = add i32 %add7, %0
+  %add9 = add nsw i32 %storemerge, %conv
+  %idxprom10 = sext i32 %add9 to i64
+  %arrayidx11 = getelementptr inbounds i32, i32* %b, i64 %idxprom10
+  store i32 %add8, i32* %arrayidx11, align 4
+  %inc = add nsw i32 %storemerge1, 1
+  br label %for.cond3
+
+for.inc12:                                        ; preds = %for.cond3
+  %inc13 = add nsw i32 %storemerge, 1
+  br label %for.cond
+
+for.end14:                                        ; preds = %for.cond
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; This tests a uniform if statement that shouldn't be touched by the CFC pass
+; CHECK: define spir_kernel void @__vecz_v4_test_uniform_if(i32 %a, ptr %b)
+; CHECK: br i1 %cmp, label %if.then, label %if.else
+
+; CHECK: if.then:
+; CHECK: store i32 11, ptr %arrayidx, align 4
+
+; CHECK: if.else:
+; CHECK: store i32 13, ptr %arrayidx1, align 4
+
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_uniform_loop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_uniform_loop.ll
new file mode 100644
index 0000000000000..88ac83ac97480
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_uniform_loop.ll
@@ -0,0 +1,176 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_uniform_loop -vecz-passes=cfg-convert -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test_uniform_if(i32 %a, i32* %b) { ; not the kernel named by -k on this file's RUN line
+entry:
+  %cmp = icmp eq i32 %a, 1 ; branch condition is computed from kernel argument %a only
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  %idxprom = sext i32 %a to i64
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+  store i32 11, i32* %arrayidx, align 4
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %arrayidx1 = getelementptr inbounds i32, i32* %b, i64 42
+  store i32 13, i32* %arrayidx1, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  ret void
+}
+
+define spir_kernel void @test_varying_if(i32 %a, i32* %b) { ; not the kernel named by -k on this file's RUN line
+entry:
+  %conv = sext i32 %a to i64
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %cmp = icmp eq i64 %conv, %call ; branch condition depends on the global ID
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  %idxprom = sext i32 %a to i64
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+  store i32 11, i32* %arrayidx, align 4
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 42
+  store i32 13, i32* %arrayidx2, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  ret void
+}
+
+define spir_kernel void @test_uniform_loop(i32 %a, i32* %b)  { ; the kernel named by -k on this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %conv = trunc i64 %call to i32
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %storemerge = phi i32 [ 0, %entry ], [ %inc, %for.body ] ; counter starts at the constant 0
+  %cmp = icmp slt i32 %storemerge, 16 ; exit test uses only the counter and a constant
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %add = add nsw i32 %storemerge, %a
+  %add2 = add nsw i32 %storemerge, %conv ; the store index does depend on the global ID
+  %idxprom = sext i32 %add2 to i64
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+  store i32 %add, i32* %arrayidx, align 4 ; expected to remain a plain store in the checked output
+  %inc = add nsw i32 %storemerge, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+define spir_kernel void @test_varying_loop(i32 %a, i32* %b) { ; not the kernel named by -k on this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %conv = trunc i64 %call to i32
+  %sub = sub nsw i32 16, %conv ; loop start value depends on the global ID
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %storemerge = phi i32 [ %sub, %entry ], [ %inc, %for.body ]
+  %cmp = icmp slt i32 %storemerge, 16 ; exit test therefore depends on the global ID too
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %add = add nsw i32 %storemerge, %a
+  %add2 = add nsw i32 %storemerge, %conv
+  %idxprom = sext i32 %add2 to i64
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+  store i32 %add, i32* %arrayidx, align 4
+  %inc = add nsw i32 %storemerge, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+define spir_kernel void @test_nested_loops(i32* %a, i32* %b)  { ; not the kernel named by -k on this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %conv = trunc i64 %call to i32
+  %sub = sub nsw i32 16, %conv ; outer start value depends on the global ID
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc12, %entry
+  %storemerge = phi i32 [ %sub, %entry ], [ %inc13, %for.inc12 ]
+  %cmp = icmp slt i32 %storemerge, 16
+  br i1 %cmp, label %for.body, label %for.end14
+
+for.body:                                         ; preds = %for.cond
+  %sub2 = sub nsw i32 24, %conv ; inner start value depends on the global ID
+  br label %for.cond3
+
+for.cond3:                                        ; preds = %for.body6, %for.body
+  %storemerge1 = phi i32 [ %sub2, %for.body ], [ %inc, %for.body6 ]
+  %cmp4 = icmp slt i32 %storemerge, 24 ; NOTE(review): tests the outer counter %storemerge, not %storemerge1 - confirm this is intended
+  br i1 %cmp4, label %for.body6, label %for.inc12
+
+for.body6:                                        ; preds = %for.cond3
+  %add = add nsw i32 %storemerge1, %conv
+  %idxprom = sext i32 %add to i64
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %idxprom
+  %0 = load i32, i32* %arrayidx, align 4
+  %add7 = add i32 %storemerge1, %storemerge
+  %add8 = add i32 %add7, %0
+  %add9 = add nsw i32 %storemerge, %conv
+  %idxprom10 = sext i32 %add9 to i64
+  %arrayidx11 = getelementptr inbounds i32, i32* %b, i64 %idxprom10
+  store i32 %add8, i32* %arrayidx11, align 4
+  %inc = add nsw i32 %storemerge1, 1
+  br label %for.cond3
+
+for.inc12:                                        ; preds = %for.cond3
+  %inc13 = add nsw i32 %storemerge, 1
+  br label %for.cond
+
+for.end14:                                        ; preds = %for.cond
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; This tests for a uniform loop that should remain untouched by the CFC pass
+; CHECK: define spir_kernel void @__vecz_v4_test_uniform_loop(i32 %a, ptr %b)
+; CHECK: br label %for.cond
+
+; CHECK: for.cond:
+; CHECK: %storemerge = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+; CHECK: %cmp = icmp slt i32 %storemerge, 16
+; CHECK: br i1 %cmp, label %for.body, label %for.end
+
+; CHECK: for.body:
+; CHECK: %add = add nsw i32 %storemerge, %a
+; CHECK: %idxprom = sext i32 %add2 to i64
+; CHECK: %arrayidx = getelementptr inbounds i32, ptr %b, i64 %idxprom
+; CHECK: store i32 %add, ptr %arrayidx, align 4
+; CHECK: %inc = add nsw i32 %storemerge, 1
+; CHECK: br label %for.cond
+
+; CHECK: for.end:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_varying_if.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_varying_if.ll
new file mode 100644
index 0000000000000..566048e4009d9
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_varying_if.ll
@@ -0,0 +1,166 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_varying_if -vecz-passes=cfg-convert -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test_uniform_if(i32 %a, i32* %b) { ; not the kernel named by -k on this file's RUN line
+entry:
+  %cmp = icmp eq i32 %a, 1 ; branch condition is computed from kernel argument %a only
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  %idxprom = sext i32 %a to i64
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+  store i32 11, i32* %arrayidx, align 4
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %arrayidx1 = getelementptr inbounds i32, i32* %b, i64 42
+  store i32 13, i32* %arrayidx1, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  ret void
+}
+
+define spir_kernel void @test_varying_if(i32 %a, i32* %b) { ; the kernel named by -k on this file's RUN line
+entry:
+  %conv = sext i32 %a to i64
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %cmp = icmp eq i64 %conv, %call ; branch condition depends on the global ID
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  %idxprom = sext i32 %a to i64
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+  store i32 11, i32* %arrayidx, align 4 ; expected to become a masked store predicated on %cmp
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 42
+  store i32 13, i32* %arrayidx2, align 4 ; expected to become a masked store predicated on %cmp.not
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  ret void
+}
+
+define spir_kernel void @test_uniform_loop(i32 %a, i32* %b)  { ; not the kernel named by -k on this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %conv = trunc i64 %call to i32
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %storemerge = phi i32 [ 0, %entry ], [ %inc, %for.body ] ; counter starts at the constant 0
+  %cmp = icmp slt i32 %storemerge, 16 ; exit test uses only the counter and a constant
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %add = add nsw i32 %storemerge, %a
+  %add2 = add nsw i32 %storemerge, %conv ; the store index does depend on the global ID
+  %idxprom = sext i32 %add2 to i64
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+  store i32 %add, i32* %arrayidx, align 4
+  %inc = add nsw i32 %storemerge, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+define spir_kernel void @test_varying_loop(i32 %a, i32* %b) { ; not the kernel named by -k on this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %conv = trunc i64 %call to i32
+  %sub = sub nsw i32 16, %conv ; loop start value depends on the global ID
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %storemerge = phi i32 [ %sub, %entry ], [ %inc, %for.body ]
+  %cmp = icmp slt i32 %storemerge, 16 ; exit test therefore depends on the global ID too
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %add = add nsw i32 %storemerge, %a
+  %add2 = add nsw i32 %storemerge, %conv
+  %idxprom = sext i32 %add2 to i64
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+  store i32 %add, i32* %arrayidx, align 4
+  %inc = add nsw i32 %storemerge, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+define spir_kernel void @test_nested_loops(i32* %a, i32* %b)  { ; not the kernel named by -k on this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %conv = trunc i64 %call to i32
+  %sub = sub nsw i32 16, %conv ; outer start value depends on the global ID
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc12, %entry
+  %storemerge = phi i32 [ %sub, %entry ], [ %inc13, %for.inc12 ]
+  %cmp = icmp slt i32 %storemerge, 16
+  br i1 %cmp, label %for.body, label %for.end14
+
+for.body:                                         ; preds = %for.cond
+  %sub2 = sub nsw i32 24, %conv ; inner start value depends on the global ID
+  br label %for.cond3
+
+for.cond3:                                        ; preds = %for.body6, %for.body
+  %storemerge1 = phi i32 [ %sub2, %for.body ], [ %inc, %for.body6 ]
+  %cmp4 = icmp slt i32 %storemerge, 24 ; NOTE(review): tests the outer counter %storemerge, not %storemerge1 - confirm this is intended
+  br i1 %cmp4, label %for.body6, label %for.inc12
+
+for.body6:                                        ; preds = %for.cond3
+  %add = add nsw i32 %storemerge1, %conv
+  %idxprom = sext i32 %add to i64
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %idxprom
+  %0 = load i32, i32* %arrayidx, align 4
+  %add7 = add i32 %storemerge1, %storemerge
+  %add8 = add i32 %add7, %0
+  %add9 = add nsw i32 %storemerge, %conv
+  %idxprom10 = sext i32 %add9 to i64
+  %arrayidx11 = getelementptr inbounds i32, i32* %b, i64 %idxprom10
+  store i32 %add8, i32* %arrayidx11, align 4
+  %inc = add nsw i32 %storemerge1, 1
+  br label %for.cond3
+
+for.inc12:                                        ; preds = %for.cond3
+  %inc13 = add nsw i32 %storemerge, 1
+  br label %for.cond
+
+for.end14:                                        ; preds = %for.cond
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; Check for a varying if that needs masked operations
+; CHECK: define spir_kernel void @__vecz_v4_test_varying_if(i32 %a, ptr %b)
+; CHECK: %cmp = icmp eq i64 %conv, %call
+; CHECK: %cmp.not = xor i1 %cmp, true
+; CHECK: call void @__vecz_b_masked_store4_ju3ptrb(i32 11, ptr %arrayidx, i1 %cmp)
+; CHECK: call void @__vecz_b_masked_store4_ju3ptrb(i32 13, ptr %arrayidx2, i1 %cmp.not)
+
+; Note that the entry mask would be removed by any DCE pass
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_varying_loop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_varying_loop.ll
new file mode 100644
index 0000000000000..f1784ab18d68b
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_varying_loop.ll
@@ -0,0 +1,185 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_varying_loop -vecz-passes=cfg-convert -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test_uniform_if(i32 %a, i32* %b) { ; not the kernel named by -k on this file's RUN line
+entry:
+  %cmp = icmp eq i32 %a, 1 ; branch condition is computed from kernel argument %a only
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  %idxprom = sext i32 %a to i64
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+  store i32 11, i32* %arrayidx, align 4
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %arrayidx1 = getelementptr inbounds i32, i32* %b, i64 42
+  store i32 13, i32* %arrayidx1, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  ret void
+}
+
+define spir_kernel void @test_varying_if(i32 %a, i32* %b) { ; not the kernel named by -k on this file's RUN line
+entry:
+  %conv = sext i32 %a to i64
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %cmp = icmp eq i64 %conv, %call ; branch condition depends on the global ID
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  %idxprom = sext i32 %a to i64
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+  store i32 11, i32* %arrayidx, align 4
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 42
+  store i32 13, i32* %arrayidx2, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  ret void
+}
+
+define spir_kernel void @test_uniform_loop(i32 %a, i32* %b)  { ; not the kernel named by -k on this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %conv = trunc i64 %call to i32
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %storemerge = phi i32 [ 0, %entry ], [ %inc, %for.body ] ; counter starts at the constant 0
+  %cmp = icmp slt i32 %storemerge, 16 ; exit test uses only the counter and a constant
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %add = add nsw i32 %storemerge, %a
+  %add2 = add nsw i32 %storemerge, %conv ; the store index does depend on the global ID
+  %idxprom = sext i32 %add2 to i64
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+  store i32 %add, i32* %arrayidx, align 4
+  %inc = add nsw i32 %storemerge, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+define spir_kernel void @test_varying_loop(i32 %a, i32* %b) { ; the kernel named by -k on this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %conv = trunc i64 %call to i32
+  %sub = sub nsw i32 16, %conv ; loop start value depends on the global ID
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %storemerge = phi i32 [ %sub, %entry ], [ %inc, %for.body ]
+  %cmp = icmp slt i32 %storemerge, 16 ; exit test therefore depends on the global ID too
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %add = add nsw i32 %storemerge, %a
+  %add2 = add nsw i32 %storemerge, %conv
+  %idxprom = sext i32 %add2 to i64
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+  store i32 %add, i32* %arrayidx, align 4 ; expected to become a masked store predicated on %for.body.exit_mask
+  %inc = add nsw i32 %storemerge, 1
+  br label %for.cond ; checked output instead branches on a __vecz_b_divergence_any call over the exit mask
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+define spir_kernel void @test_nested_loops(i32* %a, i32* %b)  { ; not the kernel named by -k on this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %conv = trunc i64 %call to i32
+  %sub = sub nsw i32 16, %conv ; outer start value depends on the global ID
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc12, %entry
+  %storemerge = phi i32 [ %sub, %entry ], [ %inc13, %for.inc12 ]
+  %cmp = icmp slt i32 %storemerge, 16
+  br i1 %cmp, label %for.body, label %for.end14
+
+for.body:                                         ; preds = %for.cond
+  %sub2 = sub nsw i32 24, %conv ; inner start value depends on the global ID
+  br label %for.cond3
+
+for.cond3:                                        ; preds = %for.body6, %for.body
+  %storemerge1 = phi i32 [ %sub2, %for.body ], [ %inc, %for.body6 ]
+  %cmp4 = icmp slt i32 %storemerge, 24 ; NOTE(review): tests the outer counter %storemerge, not %storemerge1 - confirm this is intended
+  br i1 %cmp4, label %for.body6, label %for.inc12
+
+for.body6:                                        ; preds = %for.cond3
+  %add = add nsw i32 %storemerge1, %conv
+  %idxprom = sext i32 %add to i64
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %idxprom
+  %0 = load i32, i32* %arrayidx, align 4
+  %add7 = add i32 %storemerge1, %storemerge
+  %add8 = add i32 %add7, %0
+  %add9 = add nsw i32 %storemerge, %conv
+  %idxprom10 = sext i32 %add9 to i64
+  %arrayidx11 = getelementptr inbounds i32, i32* %b, i64 %idxprom10
+  store i32 %add8, i32* %arrayidx11, align 4
+  %inc = add nsw i32 %storemerge1, 1
+  br label %for.cond3
+
+for.inc12:                                        ; preds = %for.cond3
+  %inc13 = add nsw i32 %storemerge, 1
+  br label %for.cond
+
+for.end14:                                        ; preds = %for.cond
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; The loop's start condition depends on the global ID
+; Note that the mask names are hardcoded in vecz; if they change they need to be
+; changed here as well. We do need them though, to make sure that we are
+; checking for the correct stuff. Since we don't have any duplicate names, they
+; should all be deterministic.
+; CHECK: define spir_kernel void @__vecz_v4_test_varying_loop(i32 %a, ptr %b)
+; CHECK: br label %for.cond
+
+; CHECK: for.cond:
+; CHECK: %for.cond.entry_mask = phi i1 [ true, %entry ], [ %for.body.exit_mask, %for.body ]
+; CHECK: %for.end.loop_exit_mask = phi i1 [ false, %entry ], [ %for.end.loop_exit_mask.update, %for.body ]
+; CHECK: %cmp = icmp slt i32 %storemerge, 16
+; CHECK: %for.body.exit_mask = and i1 %for.cond.entry_mask, %cmp
+; CHECK: %cmp.not = xor i1 %cmp, true
+; CHECK: %for.end.exit_mask = and i1 %for.cond.entry_mask, %cmp.not
+; CHECK: %for.end.loop_exit_mask.update = or i1 %for.end.loop_exit_mask, %for.end.exit_mask
+; CHECK: br label %for.body
+
+; CHECK: for.body:
+; CHECK: call void @__vecz_b_masked_store4_ju3ptrb(i32 %add, ptr %arrayidx, i1 %for.body.exit_mask)
+; CHECK: %[[EXIT_MASK_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %for.body.exit_mask)
+; CHECK: br i1 %[[EXIT_MASK_ANY]], label %for.cond, label %for.cond.pure_exit
+
+; CHECK: for.cond.pure_exit:
+; CHECK: br label %for.end
+
+; CHECK: for.end:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert3.ll
new file mode 100644
index 0000000000000..e413827450cf5
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert3.ll
@@ -0,0 +1,65 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k convert3 -vecz-simd-width=2 -vecz-choices=FullScalarization -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+source_filename = "kernel.opencl"
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @convert3(i64 addrspace(1)* %src, float addrspace(1)* %dest) local_unnamed_addr {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call1 = tail call spir_func <3 x i64> @_Z6vload3mPU3AS1Kl(i64 %call, i64 addrspace(1)* %src)
+  %call2 = tail call spir_func <3 x float> @_Z14convert_float3Dv3_l(<3 x i64> %call1)
+  tail call spir_func void @_Z7vstore3Dv3_fmPU3AS1f(<3 x float> %call2, i64 %call, float addrspace(1)* %dest)
+  ret void
+}
+
+; Function Attrs: convergent nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) local_unnamed_addr
+
+; Function Attrs: convergent nounwind
+declare spir_func void @_Z7vstore3Dv3_fmPU3AS1f(<3 x float>, i64, float addrspace(1)*) local_unnamed_addr
+
+; Function Attrs: convergent nounwind readnone
+declare spir_func <3 x float> @_Z14convert_float3Dv3_l(<3 x i64>) local_unnamed_addr
+
+; Function Attrs: convergent nounwind
+declare spir_func <3 x i64> @_Z6vload3mPU3AS1Kl(i64, i64 addrspace(1)*) local_unnamed_addr
+
+; Note that we have to declare the scalar version, because when we vectorize
+; an already-vector builtin, we have to scalarize it first. The scalar call
+; exists during the intermediate stage between scalarization and packetization,
+; and so has to exist in the module.
+
+; Function Attrs: convergent nounwind readnone
+declare spir_func float @_Z13convert_floatl(i64) local_unnamed_addr
+
+; Function Attrs: convergent nounwind readnone
+declare spir_func <2 x float> @_Z14convert_float2Dv2_l(<2 x i64>) local_unnamed_addr
+
+; With SIMD width 2, should have 3 x convert_float2.
+
+; CHECK: define spir_kernel void @__vecz_v2_convert3
+; CHECK: call <2 x i64> @__vecz_b_interleaved_load8_3
+; CHECK: call spir_func <2 x float> @_Z14convert_float2Dv2_l
+; CHECK: call spir_func <2 x float> @_Z14convert_float2Dv2_l
+; CHECK: call spir_func <2 x float> @_Z14convert_float2Dv2_l
+; CHECK-NOT: call spir_func <2 x float> @_Z14convert_float2Dv2_l
+; CHECK: call void @__vecz_b_interleaved_store4_3_Dv2_fu3ptrU3AS1(<2 x float>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert4.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert4.ll
new file mode 100644
index 0000000000000..8458f7e6a7e83
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert4.ll
@@ -0,0 +1,61 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k convert4 -vecz-simd-width=2 -vecz-choices=FullScalarization -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+source_filename = "kernel.opencl"
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nofree nounwind
+define spir_kernel void @convert4(<4 x i64> addrspace(1)* nocapture readonly %in, <4 x float> addrspace(1)* nocapture %out) local_unnamed_addr {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %in, i64 %call
+  %0 = load <4 x i64>, <4 x i64> addrspace(1)* %arrayidx, align 32
+  %call1 = tail call spir_func <4 x float> @_Z14convert_float4Dv4_l(<4 x i64> %0)
+  %arrayidx2 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call
+  store <4 x float> %call1, <4 x float> addrspace(1)* %arrayidx2, align 16
+  ret void
+}
+
+; Function Attrs: convergent nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) local_unnamed_addr
+
+; Function Attrs: convergent nounwind readnone
+declare spir_func <4 x float> @_Z14convert_float4Dv4_l(<4 x i64>) local_unnamed_addr
+
+; Note that we have to declare the scalar version, because when we vectorize
+; an already-vector builtin, we have to scalarize it first. The scalar call
+; exists during the intermediate stage between scalarization and packetization,
+; and so has to exist in the module.
+
+; Function Attrs: convergent nounwind readnone
+declare spir_func float @_Z13convert_floatl(i64) local_unnamed_addr
+
+; Function Attrs: convergent nounwind readnone
+declare spir_func <2 x float> @_Z14convert_float2Dv2_l(<2 x i64>) local_unnamed_addr
+
+; With SIMD width 2, should have 4 x convert_float2.
+
+; CHECK: call <2 x i64> @__vecz_b_interleaved_load8_4
+; CHECK: call spir_func <2 x float> @_Z14convert_float2Dv2_l
+; CHECK: call spir_func <2 x float> @_Z14convert_float2Dv2_l
+; CHECK: call spir_func <2 x float> @_Z14convert_float2Dv2_l
+; CHECK: call spir_func <2 x float> @_Z14convert_float2Dv2_l
+; CHECK-NOT: call spir_func <2 x float> @_Z14convert_float2Dv2_l
+; CHECK: call void @__vecz_b_interleaved_store4_4_Dv2_fu3ptrU3AS1(<2 x float>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert_contiguity.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert_contiguity.ll
new file mode 100644
index 0000000000000..2315cb6d2d9b8
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert_contiguity.ll
@@ -0,0 +1,47 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k convert_contiguity -w 4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @convert_contiguity(float addrspace(1)* %m_ptr) {
+  %1 = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %2 = call spir_func i32 @_Z12convert_uintm(i64 %1)
+  %3 = icmp slt i32 %2, 100
+  %4 = select i1 %3, float 1.000000e+00, float 0.000000e+00
+  %5 = call spir_func i64 @_Z12convert_longi(i32 %2)
+  %6 = getelementptr inbounds float, float addrspace(1)* %m_ptr, i64 %5
+  store float %4, float addrspace(1)* %6, align 4
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare spir_func i32 @_Z12convert_uintm(i64)
+
+; Function Attrs: nounwind readnone
+declare spir_func i64 @_Z12convert_longi(i32)
+
+; Function Attrs: nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; Check that the store address is identified as contiguous through the
+; OpenCL convert builtin functions
+
+; CHECK: void @__vecz_v4_convert_contiguity
+; CHECK: store <4 x float>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load.ll
new file mode 100644
index 0000000000000..b2132ed438cbc
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load.ll
@@ -0,0 +1,43 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test(i64 %a, i64 %b, i64* %c) {
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %cond = icmp eq i64 %a, %gid
+  %c0 = getelementptr i64, i64* %c, i64 %gid
+  store i64 %b, i64* %c0, align 4
+  %c1 = getelementptr i64, i64* %c, i64 0
+  store i64 0, i64* %c1, align 4
+  %c2 = select i1 %cond, i64* %c0, i64* %c1
+  %c3 = getelementptr i64, i64* %c2, i64 %gid
+  %c3.load = load i64, i64* %c3, align 4
+  %c4 = getelementptr i64, i64* %c3, i64 %gid
+  store i64 %c3.load, i64* %c4, align 4
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; Test if the gather load is defined correctly
+; CHECK: define <4 x i64> @__vecz_b_gather_load4_Dv4_mDv4_u3ptr(<4 x ptr>{{( %0)?}}) {
+; CHECK: %[[V1:[0-9]+]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> %0, i32{{( immarg)?}} 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>,
+; CHECK: ret <4 x i64> %[[V1]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load_as_masked.ll
new file mode 100644
index 0000000000000..ca88290e09a4e
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load_as_masked.ll
@@ -0,0 +1,43 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test(i64 %a, i64 %b, i64* %c) {
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %cond = icmp eq i64 %a, %gid
+  %c0 = getelementptr i64, i64* %c, i64 %gid
+  store i64 %b, i64* %c0, align 4
+  %c1 = getelementptr i64, i64* %c, i64 0
+  store i64 0, i64* %c1, align 4
+  %c2 = select i1 %cond, i64* %c0, i64* %c1
+  %c3 = getelementptr i64, i64* %c2, i64 %gid
+  %c3.load = load i64, i64* %c3, align 4
+  %c4 = getelementptr i64, i64* %c3, i64 %gid
+  store i64 %c3.load, i64* %c4, align 4
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; Test if the gather load is defined correctly
+; CHECK: define <4 x i64> @__vecz_b_gather_load4_Dv4_mDv4_u3ptr(<4 x ptr>{{( %0)?}}) {
+; CHECK: call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> %0, i32{{( immarg)?}} 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i64> undef)
+; CHECK: ret <4 x i64>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load.ll
new file mode 100644
index 0000000000000..a82f222e1ef07
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load.ll
@@ -0,0 +1,62 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) {
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #3
+  %add.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call
+  %.cast = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %add.ptr, i64 0, i64 0
+  %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
+  call spir_func void @_Z7barrierj(i32 2) #3
+  store double 1.600000e+01, double addrspace(1)* %.cast, align 8
+  %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
+  %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
+  %vecins7 = shufflevector <4 x double> %vecins5, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+  %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call
+  %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+  %arrayidx8 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %d, i64 %call
+  %3 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx8, align 32
+  %arrayidx9 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %e, i64 %call
+  %4 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx9, align 32
+  %div = fdiv <4 x double> %3, %4
+  %5 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %vecins7, <4 x double> %2, <4 x double> %div)
+  %arrayidx10 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %a, i64 %call
+  %6 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx10, align 32
+  %sub = fsub <4 x double> %6, %5
+  store <4 x double> %sub, <4 x double> addrspace(1)* %arrayidx10, align 32
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+declare spir_func void @_Z7barrierj(i32)
+
+; Function Attrs: nounwind readnone
+declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>)
+
+; Test if the interleaved load is defined correctly
+; CHECK: define <4 x double> @__vecz_b_interleaved_load8_4_Dv4_du3ptrU3AS1(ptr addrspace(1){{( %0)?}})
+; CHECK: %BroadcastAddr.splatinsert = insertelement <4 x ptr addrspace(1)> {{poison|undef}}, ptr addrspace(1) %0, {{i32|i64}} 0
+; CHECK: %BroadcastAddr.splat = shufflevector <4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <4 x ptr addrspace(1)> {{poison|undef}}, <4 x i32> zeroinitializer
+; CHECK: %[[TMP1:.*]] = getelementptr double, <4 x ptr addrspace(1)> %BroadcastAddr.splat, <4 x i64> <i64 0, i64 4, i64 8, i64 12>
+; CHECK: %[[TMP2:.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p1(<4 x ptr addrspace(1)> %[[TMP1]], i32{{( immarg)?}} 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x double> undef)
+; CHECK: ret <4 x double> %[[TMP2]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load_as_masked.ll
new file mode 100644
index 0000000000000..6f61e5f2b2d76
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load_as_masked.ll
@@ -0,0 +1,79 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k f -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) #0 {
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #3
+  %add.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call
+  %.cast = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %add.ptr, i64 0, i64 0
+  %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
+  call spir_func void @_Z7barrierj(i32 2) #3
+  store double 1.600000e+01, double addrspace(1)* %.cast, align 8
+  %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
+  %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
+  %vecins7 = shufflevector <4 x double> %vecins5, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+  %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call
+  %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+  %arrayidx8 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %d, i64 %call
+  %3 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx8, align 32
+  %arrayidx9 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %e, i64 %call
+  %4 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx9, align 32
+  %div = fdiv <4 x double> %3, %4
+  %5 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %vecins7, <4 x double> %2, <4 x double> %div)
+  %arrayidx10 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %a, i64 %call
+  %6 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx10, align 32
+  %sub = fsub <4 x double> %6, %5
+  store <4 x double> %sub, <4 x double> addrspace(1)* %arrayidx10, align 32
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+declare spir_func void @_Z7barrierj(i32) #1
+
+; Function Attrs: nounwind readnone
+declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) #2
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone }
+attributes #3 = { nobuiltin nounwind }
+
+!opencl.kernels = !{!0}
+!llvm.ident = !{!6}
+
+!0 = !{void (<4 x double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x double> addrspace(1)*, i8 addrspace(1)*)* @f, !1, !2, !3, !4, !5}
+!1 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1, i32 1, i32 1, i32 1}
+!2 = !{!"kernel_arg_access_qual", !"none", !"none", !"none", !"none", !"none", !"none"}
+!3 = !{!"kernel_arg_type", !"double4*", !"double4*", !"double4*", !"double4*", !"double4*", !"char*"}
+!4 = !{!"kernel_arg_base_type", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"char*"}
+!5 = !{!"kernel_arg_type_qual", !"", !"", !"", !"", !"", !""}
+!6 = !{!"clang version 3.8.1 "}
+
+; Test if the interleaved load is defined correctly
+; CHECK: define <4 x double> @__vecz_b_interleaved_load8_4_Dv4_du3ptrU3AS1(ptr addrspace(1){{( %0)?}})
+; CHECK: %BroadcastAddr.splatinsert = insertelement <4 x ptr addrspace(1)> {{poison|undef}}, ptr addrspace(1) %0, {{i32|i64}} 0
+; CHECK: %BroadcastAddr.splat = shufflevector <4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <4 x ptr addrspace(1)> {{poison|undef}}, <4 x i32> zeroinitializer
+; CHECK: %1 = getelementptr double, <4 x ptr addrspace(1)> %BroadcastAddr.splat, <4 x i64> <i64 0, i64 4, i64 8, i64 12>
+; CHECK: %2 = call <4 x double> @llvm.masked.gather.v4f64.v4p1(<4 x ptr addrspace(1)> %1, i32{{( immarg)?}} 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x double> undef)
+; CHECK: ret <4 x double> %2
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store.ll
new file mode 100644
index 0000000000000..b637c20eba7c5
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store.ll
@@ -0,0 +1,63 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) {
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %add.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call
+  %.cast = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %add.ptr, i64 0, i64 0
+  %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
+  call spir_func void @_Z7barrierj(i32 2)
+  store double 1.600000e+01, double addrspace(1)* %.cast, align 8
+  %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
+  %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
+  %vecins7 = shufflevector <4 x double> %vecins5, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+  %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call
+  %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+  %arrayidx8 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %d, i64 %call
+  %3 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx8, align 32
+  %arrayidx9 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %e, i64 %call
+  %4 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx9, align 32
+  %div = fdiv <4 x double> %3, %4
+  %5 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %vecins7, <4 x double> %2, <4 x double> %div)
+  %arrayidx10 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %a, i64 %call
+  %6 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx10, align 32
+  %sub = fsub <4 x double> %6, %5
+  store <4 x double> %sub, <4 x double> addrspace(1)* %arrayidx10, align 32
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+declare spir_func void @_Z7barrierj(i32)
+
+; Function Attrs: nounwind readnone
+declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>)
+
+; Test if the interleaved store is defined correctly
+; CHECK: define void @__vecz_b_interleaved_store8_4_Dv4_du3ptrU3AS1(<4 x double>{{( %0)?}}, ptr addrspace(1){{( %1)?}})
+; CHECK: entry:
+; CHECK: %BroadcastAddr.splatinsert = insertelement <4 x ptr addrspace(1)> {{poison|undef}}, ptr addrspace(1) %1, {{i32|i64}} 0
+; CHECK: %BroadcastAddr.splat = shufflevector <4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <4 x ptr addrspace(1)> {{poison|undef}}, <4 x i32> zeroinitializer
+; CHECK: %2 = getelementptr double, <4 x ptr addrspace(1)> %BroadcastAddr.splat, <4 x i64> <i64 0, i64 4, i64 8, i64 12>
+; CHECK: call void @llvm.masked.scatter.v4f64.v4p1(<4 x double> %0, <4 x ptr addrspace(1)> %2, i32{{( immarg)?}} 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store_as_masked.ll
new file mode 100644
index 0000000000000..6501a01180f61
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store_as_masked.ll
@@ -0,0 +1,80 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k f -vecz-simd-width=4 -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) #0 {
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #3
+  %add.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call
+  %.cast = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %add.ptr, i64 0, i64 0
+  %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
+  call spir_func void @_Z7barrierj(i32 2) #3
+  store double 1.600000e+01, double addrspace(1)* %.cast, align 8
+  %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
+  %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
+  %vecins7 = shufflevector <4 x double> %vecins5, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+  %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call
+  %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+  %arrayidx8 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %d, i64 %call
+  %3 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx8, align 32
+  %arrayidx9 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %e, i64 %call
+  %4 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx9, align 32
+  %div = fdiv <4 x double> %3, %4
+  %5 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %vecins7, <4 x double> %2, <4 x double> %div)
+  %arrayidx10 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %a, i64 %call
+  %6 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx10, align 32
+  %sub = fsub <4 x double> %6, %5
+  store <4 x double> %sub, <4 x double> addrspace(1)* %arrayidx10, align 32
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+declare spir_func void @_Z7barrierj(i32) #1
+
+; Function Attrs: nounwind readnone
+declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) #2
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone }
+attributes #3 = { nobuiltin nounwind }
+
+!opencl.kernels = !{!0}
+!llvm.ident = !{!6}
+
+!0 = !{void (<4 x double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x double> addrspace(1)*, i8 addrspace(1)*)* @f, !1, !2, !3, !4, !5}
+!1 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1, i32 1, i32 1, i32 1}
+!2 = !{!"kernel_arg_access_qual", !"none", !"none", !"none", !"none", !"none", !"none"}
+!3 = !{!"kernel_arg_type", !"double4*", !"double4*", !"double4*", !"double4*", !"double4*", !"char*"}
+!4 = !{!"kernel_arg_base_type", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"char*"}
+!5 = !{!"kernel_arg_type_qual", !"", !"", !"", !"", !"", !""}
+!6 = !{!"clang version 3.8.1 "}
+
+; Test if the interleaved store is defined correctly
+; CHECK: define void @__vecz_b_interleaved_store8_4_Dv4_du3ptrU3AS1(<4 x double>{{( %0)?}}, ptr addrspace(1){{( %1)?}})
+; CHECK: entry:
+; CHECK: %BroadcastAddr.splatinsert = insertelement <4 x ptr addrspace(1)> {{poison|undef}}, ptr addrspace(1) %1, {{i32|i64}} 0
+; CHECK:  %BroadcastAddr.splat = shufflevector <4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <4 x ptr addrspace(1)> {{poison|undef}}, <4 x i32> zeroinitializer
+; CHECK:  %2 = getelementptr double, <4 x ptr addrspace(1)> %BroadcastAddr.splat, <4 x i64> <i64 0, i64 4, i64 8, i64 12>
+; CHECK:  call void @llvm.masked.scatter.v4f64.v4p1(<4 x double> %0, <4 x ptr addrspace(1)> %2, i32{{( immarg)?}} 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_internal_builtins.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_internal_builtins.ll
new file mode 100644
index 0000000000000..c5e622f735c71
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_internal_builtins.ll
@@ -0,0 +1,32 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k dummy -vecz-simd-width=4 -vecz-passes=define-builtins -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @dummy(i32 addrspace(2)* %in, i32 addrspace(1)* %out) { ; %out is never referenced in the body
+  %b = bitcast i32 addrspace(2)* %in to <4 x i32> addrspace(2)* ; reinterpret %in as a pointer to <4 x i32>
+  %v = call <4 x i32> @__vecz_b_masked_load4_Dv4_jPU3AS2Dv4_jDv4_b(<4 x i32> addrspace(2)* %b, <4 x i1> zeroinitializer) ; all-false mask; %v is unused. NOTE(review): presumably this call exists only to give the builtin a use -- confirm
+  ret void
+}
+
+declare <4 x i32> @__vecz_b_masked_load4_Dv4_jPU3AS2Dv4_jDv4_b(<4 x i32> addrspace(2)*, <4 x i1>) ; the expected definition follows; immarg on the alignment operand is accepted but not required
+; CHECK-LABEL: define <4 x i32> @__vecz_b_masked_load4_Dv4_jPU3AS2Dv4_jDv4_b(ptr addrspace(2){{.*}}, <4 x i1>{{.*}}) {
+; CHECK:   %2 = call <4 x i32> @llvm.masked.load.v4i32.p2(ptr addrspace(2) %0, i32{{( immarg)?}} 4, <4 x i1> %1, <4 x i32> {{undef|poison}})
+; CHECK:   ret <4 x i32> %2
+; CHECK: }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_gather_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_gather_load.ll
new file mode 100644
index 0000000000000..cac0e72fb210b
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_gather_load.ll
@@ -0,0 +1,83 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k masked_gather -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @masked_scatter(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %b_index) { ; b[b_index[id]] = (id % 3 == 0) ? 42 : a[id]; not the kernel named by -k in this file
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; id in dimension 0
+  %rem = urem i64 %call, 3
+  %cmp = icmp eq i64 %rem, 0
+  br i1 %cmp, label %if.else, label %if.then ; ids divisible by 3 go to if.else, all others to if.then
+
+if.then:                                          ; preds = %entry
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %call
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 ; a[id]
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %b_index, i64 %call
+  %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4 ; b_index[id]
+  %idxprom = sext i32 %1 to i64
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %idxprom
+  store i32 %0, i32 addrspace(1)* %arrayidx2, align 4 ; store whose address comes from a loaded index
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %b_index, i64 %call
+  %2 = load i32, i32 addrspace(1)* %arrayidx3, align 4 ; b_index[id]
+  %idxprom4 = sext i32 %2 to i64
+  %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %idxprom4
+  store i32 42, i32 addrspace(1)* %arrayidx5, align 4 ; constant stored through the loaded index
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  ret void
+}
+
+define spir_kernel void @masked_gather(i32 addrspace(1)* %a, i32 addrspace(1)* %a_index, i32 addrspace(1)* %b) { ; b[id] = (id % 3 == 0) ? 42 : a[a_index[id]]
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; id in dimension 0
+  %rem = urem i64 %call, 3
+  %cmp = icmp eq i64 %rem, 0
+  br i1 %cmp, label %if.else, label %if.then ; ids divisible by 3 go to if.else, all others to if.then
+
+if.then:                                          ; preds = %entry
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %a_index, i64 %call
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 ; a_index[id]
+  %idxprom = sext i32 %0 to i64
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %idxprom
+  %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4 ; load whose address comes from a loaded index, only on this branch
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %call
+  store i32 %1, i32 addrspace(1)* %arrayidx2, align 4 ; b[id]
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %call
+  store i32 42, i32 addrspace(1)* %arrayidx3, align 4 ; b[id] = 42
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; Test if the masked gather load is defined correctly (the pass-through operand may print as undef or poison)
+; CHECK: define <4 x i32> @__vecz_b_masked_gather_load4_Dv4_jDv4_u3ptrU3AS1Dv4_b(<4 x ptr addrspace(1)>{{( %0)?}}, <4 x i1>{{( %1)?}})
+; CHECK: entry:
+; CHECK: %2 = call <4 x i32> @llvm.masked.gather.v4i32.v4p1(<4 x ptr addrspace(1)> %0, i32{{( immarg)?}} 4, <4 x i1> %1, <4 x i32> {{undef|poison}})
+; CHECK: ret <4 x i32> %2
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_load.ll
new file mode 100644
index 0000000000000..abb16c603e52a
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_load.ll
@@ -0,0 +1,90 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k dont_mask_workitem_builtins -vecz-simd-width=4 -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @dont_mask_workitem_builtins(i32 addrspace(2)* %in, i32 addrspace(1)* %out) #0 {
+entry:
+  %call = call spir_func i64 @_Z12get_local_idj(i32 0) #5
+  %conv = trunc i64 %call to i32
+  %cmp = icmp sgt i32 %conv, 0
+  br i1 %cmp, label %if.then, label %if.else ; branch on the local id (as i32) being > 0
+
+if.then:                                          ; preds = %entry
+  %call2 = call spir_func i64 @_Z13get_global_idj(i32 0) #5 ; work-item builtin called on one side of the branch only
+  %conv3 = trunc i64 %call2 to i32
+  %idxprom = sext i32 %conv3 to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(2)* %in, i64 %idxprom
+  %0 = load i32, i32 addrspace(2)* %arrayidx, align 4 ; in[gid], executed only on this branch
+  %idxprom4 = sext i32 %conv3 to i64
+  %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom4
+  store i32 %0, i32 addrspace(1)* %arrayidx5, align 4 ; out[gid] = in[gid]
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %call8 = call spir_func i64 @_Z14get_local_sizej(i32 0) #5
+  %call9 = call spir_func i64 @_Z12get_group_idj(i32 0) #5
+  %mul = mul i64 %call9, %call8
+  %add = add i64 %mul, %call ; group_id * local_size + local_id
+  %sext = shl i64 %add, 32
+  %idxprom11 = ashr exact i64 %sext, 32 ; shl/ashr pair sign-extends the low 32 bits of %add
+  %arrayidx12 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom11
+  store i32 42, i32 addrspace(1)* %arrayidx12, align 4 ; constant store on the other branch
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  ret void
+}
+
+declare spir_func i64 @_Z12get_local_idj(i32) #1
+
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+declare spir_func i64 @_Z14get_local_sizej(i32) #1
+
+declare spir_func i64 @_Z12get_group_idj(i32) #1
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { noinline }
+attributes #3 = { argmemonly nounwind }
+attributes #4 = { argmemonly nounwind readonly }
+attributes #5 = { nobuiltin nounwind }
+attributes #6 = { nounwind }
+
+!opencl.kernels = !{!0}
+!llvm.ident = !{!6}
+
+!0 = !{void (i32 addrspace(2)*, i32 addrspace(1)*)* @dont_mask_workitem_builtins, !1, !2, !3, !4, !5}
+!1 = !{!"kernel_arg_addr_space", i32 2, i32 1}
+!2 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!3 = !{!"kernel_arg_type", !"int*", !"int*"}
+!4 = !{!"kernel_arg_base_type", !"int*", !"int*"}
+!5 = !{!"kernel_arg_type_qual", !"const", !""}
+!6 = !{!"clang version 3.8.1 "}
+
+
+
+; Test if the masked load is defined correctly
+; CHECK: define <4 x i32> @__vecz_b_masked_load4_Dv4_ju3ptrU3AS2Dv4_b(ptr addrspace(2){{( %0)?}}, <4 x i1>{{( %1)?}})
+; CHECK: entry:
+; CHECK: %2 = call <4 x i32> @llvm.masked.load.v4i32.p2(ptr addrspace(2) %0, i32{{( immarg)?}} 4, <4 x i1> %1, <4 x i32> {{undef|poison}})
+; CHECK: ret <4 x i32> %2
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_scatter_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_scatter_store.ll
new file mode 100644
index 0000000000000..8d5834321522c
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_scatter_store.ll
@@ -0,0 +1,85 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k masked_scatter -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @masked_scatter(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %b_index) { ; b[b_index[id]] = (id % 3 == 0) ? 42 : a[id]
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; id in dimension 0
+  %rem = urem i64 %call, 3
+  %cmp = icmp eq i64 %rem, 0
+  br i1 %cmp, label %if.else, label %if.then ; ids divisible by 3 go to if.else, all others to if.then
+
+if.then:                                          ; preds = %entry
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %call
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 ; a[id]
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %b_index, i64 %call
+  %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4 ; b_index[id]
+  %idxprom = sext i32 %1 to i64
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %idxprom
+  store i32 %0, i32 addrspace(1)* %arrayidx2, align 4 ; store whose address comes from a loaded index
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %b_index, i64 %call
+  %2 = load i32, i32 addrspace(1)* %arrayidx3, align 4 ; b_index[id]
+  %idxprom4 = sext i32 %2 to i64
+  %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %idxprom4
+  store i32 42, i32 addrspace(1)* %arrayidx5, align 4 ; constant stored through the loaded index
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  ret void
+}
+
+define spir_kernel void @masked_gather(i32 addrspace(1)* %a, i32 addrspace(1)* %a_index, i32 addrspace(1)* %b) { ; b[id] = (id % 3 == 0) ? 42 : a[a_index[id]]; not the kernel named by -k in this file
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; id in dimension 0
+  %rem = urem i64 %call, 3
+  %cmp = icmp eq i64 %rem, 0
+  br i1 %cmp, label %if.else, label %if.then ; ids divisible by 3 go to if.else, all others to if.then
+
+if.then:                                          ; preds = %entry
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %a_index, i64 %call
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 ; a_index[id]
+  %idxprom = sext i32 %0 to i64
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %idxprom
+  %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4 ; load whose address comes from a loaded index, only on this branch
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %call
+  store i32 %1, i32 addrspace(1)* %arrayidx2, align 4 ; b[id]
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %call
+  store i32 42, i32 addrspace(1)* %arrayidx3, align 4 ; b[id] = 42
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; Test if the masked scatter store is defined correctly
+; CHECK: define void @__vecz_b_masked_scatter_store4_Dv4_jDv4_u3ptrU3AS1Dv4_b(<4 x i32>{{( %0)?}}, <4 x ptr addrspace(1)>{{( %1)?}}, <4 x i1>{{( %2)?}})
+; CHECK: entry:
+; CHECK: call void @llvm.masked.scatter.v4i32.v4p1(<4 x i32> %0, <4 x ptr addrspace(1)> %1, i32{{( immarg)?}} 4, <4 x i1> %2) #[[ATTRS:[0-9]+]]
+; CHECK: ret void
+
+; CHECK: attributes #[[ATTRS]] = {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_store.ll
new file mode 100644
index 0000000000000..db4e8461f5aa3
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_store.ll
@@ -0,0 +1,90 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k dont_mask_workitem_builtins -vecz-simd-width=4 -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @dont_mask_workitem_builtins(i32 addrspace(2)* %in, i32 addrspace(1)* %out) #0 {
+entry:
+  %call = call spir_func i64 @_Z12get_local_idj(i32 0) #5
+  %conv = trunc i64 %call to i32
+  %cmp = icmp sgt i32 %conv, 0
+  br i1 %cmp, label %if.then, label %if.else ; branch on the local id (as i32) being > 0
+
+if.then:                                          ; preds = %entry
+  %call2 = call spir_func i64 @_Z13get_global_idj(i32 0) #5 ; work-item builtin called on one side of the branch only
+  %conv3 = trunc i64 %call2 to i32
+  %idxprom = sext i32 %conv3 to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(2)* %in, i64 %idxprom
+  %0 = load i32, i32 addrspace(2)* %arrayidx, align 4 ; in[gid]
+  %idxprom4 = sext i32 %conv3 to i64
+  %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom4
+  store i32 %0, i32 addrspace(1)* %arrayidx5, align 4 ; out[gid] = in[gid], executed only on this branch
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %call8 = call spir_func i64 @_Z14get_local_sizej(i32 0) #5
+  %call9 = call spir_func i64 @_Z12get_group_idj(i32 0) #5
+  %mul = mul i64 %call9, %call8
+  %add = add i64 %mul, %call ; group_id * local_size + local_id
+  %sext = shl i64 %add, 32
+  %idxprom11 = ashr exact i64 %sext, 32 ; shl/ashr pair sign-extends the low 32 bits of %add
+  %arrayidx12 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom11
+  store i32 42, i32 addrspace(1)* %arrayidx12, align 4 ; constant store, executed only on this branch
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  ret void
+}
+
+declare spir_func i64 @_Z12get_local_idj(i32) #1
+
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+declare spir_func i64 @_Z14get_local_sizej(i32) #1
+
+declare spir_func i64 @_Z12get_group_idj(i32) #1
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { noinline }
+attributes #3 = { argmemonly nounwind }
+attributes #4 = { argmemonly nounwind readonly }
+attributes #5 = { nobuiltin nounwind }
+attributes #6 = { nounwind }
+
+!opencl.kernels = !{!0}
+!llvm.ident = !{!6}
+
+!0 = !{void (i32 addrspace(2)*, i32 addrspace(1)*)* @dont_mask_workitem_builtins, !1, !2, !3, !4, !5}
+!1 = !{!"kernel_arg_addr_space", i32 2, i32 1}
+!2 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!3 = !{!"kernel_arg_type", !"int*", !"int*"}
+!4 = !{!"kernel_arg_base_type", !"int*", !"int*"}
+!5 = !{!"kernel_arg_type_qual", !"const", !""}
+!6 = !{!"clang version 3.8.1 "}
+
+
+
+; Test if the masked store is defined correctly (immarg on the alignment operand is accepted but not required)
+; CHECK: define void @__vecz_b_masked_store4_Dv4_ju3ptrU3AS1Dv4_b(<4 x i32>{{( %0)?}}, ptr addrspace(1){{( %1)?}}, <4 x i1>{{( %2)?}})
+; CHECK: entry:
+; CHECK: call void @llvm.masked.store.v4i32.p1(<4 x i32> %0, ptr addrspace(1) %1, i32{{( immarg)?}} 4, <4 x i1> %2)
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store.ll
new file mode 100644
index 0000000000000..3404f4dc87b07
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store.ll
@@ -0,0 +1,44 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test(i64 %a, i64 %b, i64* %c) {
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %cond = icmp eq i64 %a, %gid ; true when the id equals %a
+  %c0 = getelementptr i64, i64* %c, i64 %gid
+  store i64 %b, i64* %c0, align 4 ; c[gid] = b (i64 access with align 4)
+  %c1 = getelementptr i64, i64* %c, i64 0
+  store i64 0, i64* %c1, align 4 ; c[0] = 0, the same address for every id
+  %c2 = select i1 %cond, i64* %c0, i64* %c1 ; base pointer selected by an id-dependent condition
+  %c3 = getelementptr i64, i64* %c2, i64 %gid
+  %c3.load = load i64, i64* %c3, align 4
+  %c4 = getelementptr i64, i64* %c3, i64 %gid ; %c2 advanced by 2 * gid elements
+  store i64 %c3.load, i64* %c4, align 4 ; NOTE(review): presumably the store that becomes the scatter builtin -- confirm
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; Test if the scatter store is defined correctly (an all-true mask is expected on the scatter intrinsic)
+; CHECK: define void @__vecz_b_scatter_store4_Dv4_mDv4_u3ptr(<4 x i64>{{( %0)?}}, <4 x ptr>{{( %1)?}}) {
+; CHECK: entry:
+; CHECK: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> %0, <4 x ptr> %1, i32{{( immarg)?}} 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store_as_masked.ll
new file mode 100644
index 0000000000000..a6ddefb418556
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store_as_masked.ll
@@ -0,0 +1,44 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test(i64 %a, i64 %b, i64* %c) {
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %cond = icmp eq i64 %a, %gid ; true when the id equals %a
+  %c0 = getelementptr i64, i64* %c, i64 %gid
+  store i64 %b, i64* %c0, align 4 ; c[gid] = b (i64 access with align 4)
+  %c1 = getelementptr i64, i64* %c, i64 0
+  store i64 0, i64* %c1, align 4 ; c[0] = 0, the same address for every id
+  %c2 = select i1 %cond, i64* %c0, i64* %c1 ; base pointer selected by an id-dependent condition
+  %c3 = getelementptr i64, i64* %c2, i64 %gid
+  %c3.load = load i64, i64* %c3, align 4
+  %c4 = getelementptr i64, i64* %c3, i64 %gid ; %c2 advanced by 2 * gid elements
+  store i64 %c3.load, i64* %c4, align 4 ; NOTE(review): presumably the store that becomes the scatter builtin -- confirm
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; Test if the scatter store is defined correctly
+; CHECK: define void @__vecz_b_scatter_store4_Dv4_mDv4_u3ptr(<4 x i64>{{( %0)?}}, <4 x ptr>{{( %1)?}}) {
+; CHECK: entry:
+; CHECK: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> %0, <4 x ptr> %1, i32{{( immarg)?}} 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_subgroup_scans.ll
new file mode 100644
index 0000000000000..0c44c62e34d6d
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_subgroup_scans.ll
@@ -0,0 +1,49 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k dummy -vecz-simd-width=4 -vecz-passes=define-builtins -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @dummy(i32 addrspace(2)* %in, i32 addrspace(1)* %out) { ; neither %in nor %out is referenced in the body
+  ; Dummy uses of the builtins, as we don't define any with zero uses.
+  %a = call <4 x i32> @__vecz_b_sub_group_scan_inclusive_add_Dv4_j(<4 x i32> zeroinitializer) ; all-zero input; %a is unused
+  %b = call <4 x i32> @__vecz_b_sub_group_scan_exclusive_add_Dv4_j(<4 x i32> zeroinitializer) ; all-zero input; %b is unused
+  ret void
+}
+
+declare <4 x i32> @__vecz_b_sub_group_scan_inclusive_add_Dv4_j(<4 x i32>)
+; CHECK-LABEL: define <4 x i32> @__vecz_b_sub_group_scan_inclusive_add_Dv4_j(<4 x i32> %0) {
+; CHECK: entry:
+; CHECK:  %[[SHUF1:.+]] = shufflevector <4 x i32> %0, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>, <4 x i32> <i32 4, i32 0, i32 4, i32 2>
+; CHECK:  %[[ADD1:.+]] = add <4 x i32> %0, %[[SHUF1]]
+; CHECK:  %[[SHUF2:.+]] = shufflevector <4 x i32> %[[ADD1]], <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>, <4 x i32> <i32 4, i32 4, i32 1, i32 1>
+; CHECK:  %[[RESULT:.+]] = add <4 x i32> %[[ADD1]], %[[SHUF2]]
+; CHECK:  ret <4 x i32> %[[RESULT]]
+; CHECK: }
+
+declare <4 x i32> @__vecz_b_sub_group_scan_exclusive_add_Dv4_j(<4 x i32>)
+; CHECK-LABEL: define <4 x i32> @__vecz_b_sub_group_scan_exclusive_add_Dv4_j(<4 x i32> %0) {
+; CHECK: entry:
+; CHECK:  %[[SHUF1:.+]] = shufflevector <4 x i32> %0, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>, <4 x i32> <i32 4, i32 0, i32 4, i32 2>
+; CHECK:  %[[ADD1:.+]] = add <4 x i32> %0, %[[SHUF1]]
+; CHECK:  %[[SHUF2:.+]] = shufflevector <4 x i32> %[[ADD1]], <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>, <4 x i32> <i32 4, i32 4, i32 1, i32 1>
+; CHECK:  %[[ADD2:.+]] = add <4 x i32> %[[ADD1]], %[[SHUF2]]
+; CHECK:  %[[ROTATE:.+]] = shufflevector <4 x i32> %[[ADD2]], <4 x i32> undef, <4 x i32> <i32 {{.+}}, i32 0, i32 1, i32 2>
+; CHECK:  %[[RESULT:.+]] = insertelement <4 x i32> %[[ROTATE]], i32 0, i64 0
+; CHECK:  ret <4 x i32> %[[RESULT]]
+; CHECK: }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/delete_packetized_memop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/delete_packetized_memop.ll
new file mode 100644
index 0000000000000..3089422ea70e6
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/delete_packetized_memop.ll
@@ -0,0 +1,77 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k memop_loop_dep -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-s128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @memop_loop_dep(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i32 %i, i32 %e) {
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %i.addr.0 = phi i32 [ %i, %entry ], [ %inc, %for.inc ] ; loop counter, starts at %i
+  %cmp = icmp slt i32 %i.addr.0, %e
+  br i1 %cmp, label %for.body, label %for.end ; iterate while counter < %e
+
+for.body:                                         ; preds = %for.cond
+  %call1 = call spir_func <4 x i32> @_Z6vload4mPKU3AS1i(i64 %call, i32 addrspace(1)* %in) ; vector load from %in at offset id
+  call spir_func void @_Z7vstore4Dv4_imPU3AS1i(<4 x i32> %call1, i64 %call, i32 addrspace(1)* %out) ; the loaded vector is stored to %out at the same offset
+  %0 = extractelement <4 x i32> %call1, i64 0
+  %tobool = icmp ne i32 %0, 0
+  %tobool2 = icmp eq i64 %call, 0
+  %or.cond = and i1 %tobool2, %tobool ; id == 0 and lane 0 of the loaded vector != 0
+  br i1 %or.cond, label %while.cond, label %for.inc ; control flow depends on the loaded value
+
+while.cond:                                       ; preds = %while.cond, %for.body
+  %tobool3 = icmp eq i64 %call, 0 ; same comparison as %tobool2
+  br i1 %tobool3, label %for.inc, label %while.cond ; self-loop unless id == 0
+
+for.inc:                                          ; preds = %for.body, %while.cond
+  %inc = add nsw i32 %i.addr.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+declare spir_func <4 x i32> @_Z6vload4mPKU3AS1i(i64, i32 addrspace(1)*)
+
+declare spir_func void @_Z7vstore4Dv4_imPU3AS1i(<4 x i32>, i64, i32 addrspace(1)*)
+
+; CHECK: define spir_kernel void @__vecz_v4_memop_loop_dep
+
+; Check that we have the packetized, and only the packetized, version of the
+; memop. Vecz should assert if this test fails, as we will not define the
+; interleaved op with a width of 1.
+; Interleaved Group Combine gets rid of all the interleaved loads created by
+; the re-vectorization process.
+; CHECK: load <4 x i32>
+; CHECK: load <4 x i32>
+; CHECK: load <4 x i32>
+; CHECK: load <4 x i32>
+; CHECK-NOT: call {{.*}}i32 @__vecz_b_interleaved_load4_ju3ptrU3AS1
+
+; CHECK: ret void
+
+; Check if the declaration is missing as well
+; CHECK-NOT: @__vecz_b_interleaved_load4_ju3ptrU3AS1
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_loop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_loop.ll
new file mode 100644
index 0000000000000..f24dd98bd7f5e
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_loop.ll
@@ -0,0 +1,49 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test -w 4 -S < %s | %filecheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
+target triple = "spir-unknown-unknown"
+
+declare spir_func i32 @get_local_id(i32);
+declare spir_func i32 @get_local_size(i32);
+
+define spir_kernel void @test(i32 addrspace(1)* %in) {
+entry:
+  %id = call i32 @get_local_id(i32 0)
+  %size = call i32 @get_local_size(i32 0) ; %size has no further use in this function
+  br label %loop
+
+loop:
+  %index = phi i32 [0, %entry], [%inc, %loop] ; counter, starts at 0
+  %load = load i32, i32 addrspace(1)* %in ; always reads the first element of %in
+  %slot = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %index
+  store i32 %load, i32 addrspace(1)* %slot ; in[index] = in[0]
+  %inc = add i32 %index, 1
+  %cmp = icmp ne i32 %inc, %id
+  br i1 %cmp, label %loop, label %merge ; exit once the counter reaches %id, so the trip count depends on the local id
+
+merge:
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_test
+; CHECK: loop:
+; CHECK: %[[BITCAST:[0-9]+]] = bitcast <4 x i1> %loop.entry_mask{{[0-9]*}} to i4
+; CHECK: %[[MASK:[^ ]+]] = icmp ne i4 %[[BITCAST]], 0
+; CHECK: %[[LOAD:.+]] = call i32 @__vecz_b_masked_load4_ju3ptrU3AS1b(ptr addrspace(1) %in, i1 %[[MASK]])
+; CHECK: call void @__vecz_b_masked_store4_ju3ptrU3AS1b(i32 %[[LOAD]], ptr addrspace(1) %{{.+}}, i1 %[[MASK]])
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_nested_loop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_nested_loop.ll
new file mode 100644
index 0000000000000..59ad68fefd9ab
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_nested_loop.ll
@@ -0,0 +1,64 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test -w 4 -S < %s | %filecheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
+target triple = "spir-unknown-unknown"
+
+declare spir_func i32 @get_local_id(i32);
+declare spir_func i32 @get_local_size(i32);
+
+define spir_kernel void @test(i32 addrspace(1)* %in) { ; two nested loops: outer bounded by %size, inner bounded by %id
+entry:
+  %id = call i32 @get_local_id(i32 0) ; local id in dimension 0; bound of the inner loop
+  %size = call i32 @get_local_size(i32 0) ; local size in dimension 0; bound of the outer loop
+  br label %loop
+
+loop: ; outer loop header
+  %index = phi i32 [0, %entry], [%inc, %nested_merge] ; outer counter, starts at 0
+  br label %koop
+
+koop: ; inner loop, branches back to itself
+  %kndex = phi i32 [%index, %loop], [%knc, %koop] ; inner counter, starts at the current outer counter
+  %load = load i32, i32 addrspace(1)* %in ; always reads through the base pointer %in
+  %slot = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %index ; indexed by the outer counter %index, not by %kndex
+  store i32 %load, i32 addrspace(1)* %slot
+  %knc = add i32 %kndex, 1
+  %kmp = icmp ne i32 %knc, %id ; inner loop exits once the incremented counter equals %id
+  br i1 %kmp, label %koop, label %nested_merge
+
+nested_merge: ; inner loop exit, also the outer loop latch
+  %old = atomicrmw add i32 addrspace(1)* %in, i32 42 acq_rel ; atomic add on %in; the returned old value is unused
+  %inc = add i32 %index, 1
+  %cmp = icmp ne i32 %inc, %size ; outer loop exits once the incremented counter equals %size
+  br i1 %cmp, label %loop, label %merge
+
+merge:
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_test
+; CHECK: koop:
+; CHECK: %[[BITCAST:[0-9]+]] = bitcast <4 x i1> %koop.entry_mask{{[0-9]*}} to i4
+; CHECK: %[[MASK:[^ ]+]] = icmp ne i4 %[[BITCAST]], 0
+; CHECK: %[[LOAD:.+]] = call i32 @__vecz_b_masked_load4_ju3ptrU3AS1b(ptr addrspace(1) %in, i1 %[[MASK]])
+; CHECK: call void @__vecz_b_masked_store4_ju3ptrU3AS1b(i32 %[[LOAD]], ptr addrspace(1) %{{.+}}, i1 %[[MASK]])
+; CHECK: nested_merge:
+; CHECK: atomicrmw add ptr addrspace(1) %in, i32 42 acq_rel
+; CHECK: atomicrmw add ptr addrspace(1) %in, i32 42 acq_rel
+; CHECK: atomicrmw add ptr addrspace(1) %in, i32 42 acq_rel
+; CHECK: atomicrmw add ptr addrspace(1) %in, i32 42 acq_rel
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/early-cse-mul-swap.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/early-cse-mul-swap.ll
new file mode 100644
index 0000000000000..e3b9583af073f
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/early-cse-mul-swap.ll
@@ -0,0 +1,78 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k multiple_dimensions_0 -vecz-simd-width 4 -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+source_filename = "kernel.opencl"
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+; Function Attrs: convergent nounwind readonly
+declare spir_func i64 @_Z15get_global_sizej(i32) #1
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @multiple_dimensions_0(i32 addrspace(1)* %output) #2 { ; stores a linearised 3-D index at output[index]
+entry:
+  %call.i = call spir_func i64 @_Z13get_global_idj(i32 0) #3 ; global id, dimension 0
+  %call1.i = call spir_func i64 @_Z15get_global_sizej(i32 1) #3 ; global size, dimension 1
+  %mul.i = mul i64 %call1.i, %call.i ; size(1) * id(0)
+  %call2.i = call spir_func i64 @_Z15get_global_sizej(i32 2) #3 ; global size, dimension 2
+  %mul3.i = mul i64 %mul.i, %call2.i ; size(1) * id(0) * size(2)
+  %call4.i = call spir_func i64 @_Z13get_global_idj(i32 1) #3 ; global id, dimension 1
+  %mul6.i = mul i64 %call2.i, %call4.i ; size(2) * id(1)
+  %add.i = add i64 %mul6.i, %mul3.i
+  %call7.i = call spir_func i64 @_Z13get_global_idj(i32 2) #3 ; global id, dimension 2
+  %add8.i = add i64 %add.i, %call7.i ; final index: id(0)*size(1)*size(2) + id(1)*size(2) + id(2)
+  %conv = trunc i64 %add8.i to i32 ; the value stored is the index itself, truncated to 32 bits
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %output, i64 %add8.i
+  store i32 %conv, i32 addrspace(1)* %arrayidx, align 4
+  ret void
+}
+
+attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { convergent nobuiltin nounwind readonly }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!llvm.ident = !{!2}
+!opencl.kernels = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{!"clang version 8.0.0 (https://github.com/llvm-mirror/clang.git bfbe338a893dde6ba65b2bed6ffea1652a592819) (https://github.com/llvm-mirror/llvm.git a99d6d2122ca2f208e1c4bcaf02ff5930f244f34)"}
+!3 = !{void (i32 addrspace(1)*)* @multiple_dimensions_0, !4, !5, !6, !7, !8, !9}
+!4 = !{!"kernel_arg_addr_space", i32 1}
+!5 = !{!"kernel_arg_access_qual", !"none"}
+!6 = !{!"kernel_arg_type", !"int*"}
+!7 = !{!"kernel_arg_base_type", !"int*"}
+!8 = !{!"kernel_arg_type_qual", !""}
+!9 = !{!"kernel_arg_name", !"output"}
+
+; Function start
+; CHECK: define spir_kernel void @__vecz_v4_multiple_dimensions_0
+
+; make sure the stride calculation uses the correct operand of the multiply
+; CHECK: %[[CALL1:.+]] = call spir_func i64 @_Z15get_global_sizej(i32 1)
+; CHECK: %[[CALL2:.+]] = call spir_func i64 @_Z15get_global_sizej(i32 2)
+; CHECK: %[[NEWMUL:.+]] = mul i64 %[[CALL1]], %[[CALL2]]
+; CHECK: call void @__vecz_b_interleaved_store4_V_Dv4_ju3ptrU3AS1({{.+}} %[[NEWMUL]])
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_memintrinsics.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_memintrinsics.ll
new file mode 100644
index 0000000000000..5ea43e4582a75
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_memintrinsics.ll
@@ -0,0 +1,195 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k entry -vecz-passes="builtin-inlining,function(instcombine,early-cse),cfg-convert,packetizer" -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Laid out, this struct is 80 bytes
+%struct.S2 = type { i16, [7 x i32], i32, <16 x i8>, [4 x i32] }
+
+; Function Attrs: norecurse nounwind
+define spir_kernel void @entry(i64 addrspace(1)* %result, %struct.S2* %result2) { ; two memsets and two memcpys over 80-byte structs
+entry:
+  %gid = call i64 @_Z12get_local_idj(i32 0) ; only used to index %result in the end block
+  %sa = alloca %struct.S2, align 16 ; first private struct: destination of the first memset
+  %sb = alloca %struct.S2, align 16 ; second private struct: destination of the second memset
+  %sa_i8 = bitcast %struct.S2* %sa to i8*
+  %sb_i8 = bitcast %struct.S2* %sb to i8*
+  %sb_i8as = addrspacecast i8* %sb_i8 to i8 addrspace(1)* ; %sb is accessed through address space 1 from here on
+  %rsi = ptrtoint i64 addrspace(1)* %result to i64
+  %rsit = trunc i64 %rsi to i8 ; low byte of the %result address: a fill value not known at compile time
+  call void @llvm.memset.p0i8.i64(i8* %sa_i8, i8 %rsit, i64 80, i32 16, i1 false) ; first memset: 80 bytes of %sa filled with %rsit
+  call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %sb_i8as, i8 0, i64 80, i32 16, i1 false) ; second memset: 80 bytes of %sb filled with constant 0
+  %lr = addrspacecast %struct.S2* %result2 to %struct.S2 addrspace(1)*
+  %lri = bitcast %struct.S2 addrspace(1)* %lr to i64 addrspace(1)*
+  %cond = icmp eq i64 addrspace(1)* %result, %lri ; compares the two kernel arguments to decide whether to run the middle block
+  br i1 %cond, label %middle, label %end
+
+middle:
+  call void @llvm.memcpy.p1i8.p0i8.i64(i8 addrspace(1)* %sb_i8as, i8* %sa_i8, i64 80, i32 16, i1 false) ; first memcpy: %sa -> %sb, 80 bytes
+  br label %end
+
+end:
+  %g_343 = getelementptr inbounds %struct.S2, %struct.S2* %sa, i64 0, i32 0 ; first field (i16) of %sa
+  %g_343_load = load i16, i16* %g_343
+  %g_343_zext = zext i16 %g_343_load to i64
+  %resp = getelementptr i64, i64 addrspace(1)* %result, i64 %gid
+  store i64 %g_343_zext, i64 addrspace(1)* %resp, align 8 ; result[gid] = zero-extended first field of %sa
+  %result2_i8 = bitcast %struct.S2* %result2 to i8*
+  call void @llvm.memcpy.p0i8.p1i8.i64(i8* %result2_i8, i8 addrspace(1)* %sb_i8as, i64 80, i32 16, i1 false) ; second memcpy: %sb -> %result2, 80 bytes
+  ret void
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
+declare void @llvm.memset.p1i8.i64(i8 addrspace(1)* nocapture, i8, i64, i32, i1)
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memcpy.p1i8.p0i8.i64(i8 addrspace(1)* nocapture, i8* nocapture readonly, i64, i32, i1)
+declare void @llvm.memcpy.p0i8.p1i8.i64(i8* nocapture, i8 addrspace(1)* nocapture readonly, i64, i32, i1)
+
+declare i64 @_Z12get_local_idj(i32)
+
+; Sanity checks: Make sure the non-vecz entry function is still in place and
+; contains memset and memcpy. This is done in order to prevent future bafflement
+; in case some pass optimizes them out.
+; CHECK: define spir_kernel void @entry
+; CHECK: entry:
+; CHECK: call void @llvm.memset
+; CHECK: call void @llvm.memset
+; CHECK: middle:
+; CHECK: call void @llvm.memcpy
+; CHECK: end:
+; CHECK: call void @llvm.memcpy
+
+; And now for the actual checks
+
+; Check if the kernel was vectorized
+; CHECK: define spir_kernel void @__vecz_v{{[0-9]+}}_entry
+; CHECK: %[[SB_I8AS:.*]] = addrspacecast ptr %sb to ptr addrspace(1)
+
+; Check if the memset and memcpy calls have been removed
+; CHECK-NOT: call void @llvm.memset
+; CHECK-NOT: call void @llvm.memcpy
+
+; Check if the calculation of the stored value for the first memset is in place
+; CHECK: %ms64val
+
+; Check if the generated loads and stores are in place
+; Check the stores for the first memset
+; CHECK: store i64 %ms64val, ptr %sa, align 16
+; CHECK: %[[V14:[0-9]+]] = getelementptr inbounds i8, ptr %sa, i64 8
+; CHECK: store i64 %ms64val, ptr %[[V14]], align 8
+; CHECK: %[[V15:[0-9]+]] = getelementptr inbounds i8, ptr %sa, i64 16
+; CHECK: store i64 %ms64val, ptr %[[V15]], align {{(8|16)}}
+; CHECK: %[[V16:[0-9]+]] = getelementptr inbounds i8, ptr %sa, i64 24
+; CHECK: store i64 %ms64val, ptr %[[V16]], align 8
+; CHECK: %[[V17:[0-9]+]] = getelementptr inbounds i8, ptr %sa, i64 32
+; CHECK: store i64 %ms64val, ptr %[[V17]], align 16
+; CHECK: %[[V18:[0-9]+]] = getelementptr inbounds i8, ptr %sa, i64 40
+; CHECK: store i64 %ms64val, ptr %[[V18]], align 8
+; CHECK: %[[V19:[0-9]+]] = getelementptr inbounds i8, ptr %sa, i64 48
+; CHECK: store i64 %ms64val, ptr %[[V19]], align 16
+; CHECK: %[[V20:[0-9]+]] = getelementptr inbounds i8, ptr %sa, i64 56
+; CHECK-EQ14: %[[V20:[0-9]+]] = getelementptr inbounds %struct.S2, %struct.S2* %sa, i64 0, i32 3, i64 8
+; CHECK: %[[V21:[0-9]+]] = getelementptr inbounds i8, ptr %sa, i64 64
+; CHECK: %[[V22:[0-9]+]] = getelementptr inbounds i8, ptr %sa, i64 72
+
+; Check the stores for the second memset
+; CHECK: store i64 0, ptr addrspace(1) %[[SB_I8AS]], align 16
+; CHECK: %[[V24:[0-9]+]] = getelementptr inbounds i8, ptr addrspace(1) %[[SB_I8AS]], i64 8
+; CHECK: store i64 0, ptr addrspace(1) %[[V24]], align 8
+; CHECK: %[[V26:[0-9]+]] = getelementptr inbounds i8, ptr addrspace(1) %[[SB_I8AS]], i64 16
+; CHECK: store i64 0, ptr addrspace(1) %[[V26]], align 8
+; CHECK: %[[V28:[0-9]+]] = getelementptr inbounds i8, ptr addrspace(1) %[[SB_I8AS]], i64 24
+; CHECK: store i64 0, ptr addrspace(1) %[[V28]], align 8
+; CHECK: %[[V30:[0-9]+]] = getelementptr inbounds i8, ptr addrspace(1) %[[SB_I8AS]], i64 32
+; CHECK: store i64 0, ptr addrspace(1) %[[V30]], align 8
+; CHECK: %[[V32:[0-9]+]] = getelementptr inbounds i8, ptr addrspace(1) %[[SB_I8AS]], i64 40
+; CHECK: store i64 0, ptr addrspace(1) %[[V32]], align 8
+; CHECK: %[[V33:[0-9]+]] = getelementptr inbounds i8, ptr addrspace(1) %[[SB_I8AS]], i64 48
+; CHECK: store i64 0, ptr addrspace(1) %[[V33]], align 8
+; CHECK: %[[V35T:[0-9]+]] = getelementptr inbounds i8, ptr addrspace(1) %[[SB_I8AS]], i64 56
+; CHECK-EQ14: %[[V35T:[0-9]+]] = getelementptr inbounds %struct.S2, %struct.S2* %sb, i64 0, i32 3, i64 8
+; CHECK-EQ14: %[[V35:[0-9]+]] = bitcast i8* %[[V35T]] to i64*
+; CHECK-EQ14: %[[SB_I8AS18:.+]] = addrspacecast i64* %[[V35]] to i64 addrspace(1)*
+; CHECK: store i64 0, ptr addrspace(1) %[[V35T]], align 8
+; CHECK: %[[V36:[0-9]+]] = getelementptr inbounds i8, ptr addrspace(1) %[[SB_I8AS]], i64 64
+; CHECK: store i64 0, ptr addrspace(1) %[[V36]], align 8
+; CHECK: %[[V38:[0-9]+]] = getelementptr inbounds i8, ptr addrspace(1) %[[SB_I8AS]], i64 72
+; CHECK: store i64 0, ptr addrspace(1) %[[V38]], align 8
+
+
+; Check the loads and stores for the first memcpy
+; CHECK:middle:                                           ; preds = %entry
+; CHECK: %[[SA_I822:.+]] = load i64, ptr %sa, align 16
+; CHECK: store i64 %[[SA_I822]], ptr addrspace(1) %[[SB_I8AS]], align 16
+; CHECK: %[[SA_I824:.+]] = load i64, ptr %[[V14]], align 8
+; CHECK: store i64 %[[SA_I824]], ptr addrspace(1) %[[V24]], align 8
+; CHECK: %[[SA_I826:.+]] = load i64, ptr %[[V15]], align {{(8|16)}}
+; CHECK: store i64 %[[SA_I826]], ptr addrspace(1) %[[V26]], align 8
+; CHECK: %[[SA_I828:.+]] = load i64, ptr %[[V16]], align 8
+; CHECK: store i64 %[[SA_I828]], ptr addrspace(1) %[[V28]], align 8
+; CHECK: %[[SA_I830:.+]] = load i64, ptr %[[V17]], align 16
+; CHECK: store i64 %[[SA_I830]], ptr addrspace(1) %[[V30]], align 8
+; CHECK: %[[SA_I832:.+]] = load i64, ptr %[[V18]], align 8
+; CHECK: store i64 %[[SA_I832]], ptr addrspace(1) %[[V32]], align 8
+; CHECK: %[[SA_I834:.+]] = load i64, ptr %[[V19]], align 16
+; CHECK: store i64 %[[SA_I834]], ptr addrspace(1) %[[V33]], align 8
+; CHECK: %[[SA_I836:.+]] = load i64, ptr %[[V20]], align 8
+; CHECK: store i64 %[[SA_I836]], ptr addrspace(1) %[[V35T]], align 8
+; CHECK: %[[SA_I838:.+]] = load i64, ptr %[[V21]], align 16
+; CHECK: store i64 %[[SA_I838]], ptr addrspace(1) %[[V36]], align 8
+; CHECK: %[[SA_I840:.+]] = load i64, ptr %[[V22]], align 8
+; CHECK: store i64 %[[SA_I840]], ptr addrspace(1) %[[V38]], align 8
+
+; Check the loads and stores for the second memcpy
+; CHECK:end:                                              ; preds = %middle, %entry
+; CHECK: %[[SB_I8AS42:.+]] = load i64, ptr addrspace(1) %[[SB_I8AS]], align 16
+; CHECK: store i64 %[[SB_I8AS42]], ptr %result2, align 16
+; CHECK: %[[V42:[0-9]+]] = getelementptr inbounds i8, ptr %result2, i64 8
+; CHECK: %[[SB_I8AS44:.+]] = load i64, ptr addrspace(1) %[[V24]], align 8
+; CHECK: store i64 %[[SB_I8AS44]], ptr %[[V42]], align 8
+; CHECK: %[[V43:[0-9]+]] = getelementptr inbounds i8, ptr %result2, i64 16
+; CHECK: %[[SB_I8AS46:.+]] = load i64, ptr addrspace(1) %[[V26]], align 8
+; CHECK: store i64 %[[SB_I8AS46]], ptr %[[V43]], align 8
+; CHECK: %[[V44:[0-9]+]] = getelementptr inbounds i8, ptr %result2, i64 24
+; CHECK: %[[SB_I8AS48:.+]] = load i64, ptr addrspace(1) %[[V28]], align 8
+; CHECK: store i64 %[[SB_I8AS48]], ptr %[[V44]], align 8
+; CHECK: %[[V45:[0-9]+]] = getelementptr inbounds i8, ptr %result2, i64 32
+; CHECK: %[[SB_I8AS50:.+]] = load i64, ptr addrspace(1) %[[V30]], align 8
+; CHECK: store i64 %[[SB_I8AS50]], ptr %[[V45]], align 8
+; CHECK: %[[V46:[0-9]+]] = getelementptr inbounds i8, ptr %result2, i64 40
+; CHECK: %[[SB_I8AS52:.+]] = load i64, ptr addrspace(1) %[[V32]], align 8
+; CHECK: store i64 %[[SB_I8AS52]], ptr %[[V46]], align 8
+; CHECK: %[[V47:[0-9]+]] = getelementptr inbounds i8, ptr %result2, i64 48
+; CHECK: %[[SB_I8AS54:.+]] = load i64, ptr addrspace(1) %[[V33]], align 8
+; CHECK: store i64 %[[SB_I8AS54]], ptr %[[V47]], align 8
+; CHECK: %[[V48:[0-9]+]] = getelementptr inbounds i8, ptr %result2, i64 56
+; CHECK-EQ14: %[[V48:[0-9]+]] = getelementptr inbounds %struct.S2, %struct.S2* %result2, i64 0, i32 3, i64 8
+; CHECK: %[[SB_I8AS56:.+]] = load i64, ptr addrspace(1) %[[V35T]], align 8
+; CHECK: store i64 %[[SB_I8AS56]], ptr %[[V48]], align 8
+; CHECK: %[[V49:[0-9]+]] = getelementptr inbounds i8, ptr %result2, i64 64
+; CHECK: %[[SB_I8AS58:.+]] = load i64, ptr addrspace(1) %[[V36]], align 8
+; CHECK: store i64 %[[SB_I8AS58]], ptr %[[V49]], align 8
+; CHECK: %[[V50:[0-9]+]] = getelementptr inbounds i8, ptr %result2, i64 72
+; CHECK: %[[SB_I8AS60:.+]] = load i64, ptr addrspace(1) %[[V38]], align 8
+; CHECK: store i64 %[[SB_I8AS60]], ptr %[[V50]], align 8
+
+; End of function
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_no_unaligned_memintrinsics.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_no_unaligned_memintrinsics.ll
new file mode 100644
index 0000000000000..cb617c0f7517b
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_no_unaligned_memintrinsics.ll
@@ -0,0 +1,91 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k entry -vecz-passes=cfg-convert,packetizer -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Laid out, this struct is 80 bytes
+%struct.S2 = type { i16, [7 x i32], i32, <16 x i8>, [4 x i32] }
+
+; Function Attrs: norecurse nounwind
+define spir_kernel void @entry(i64 addrspace(1)* %result, %struct.S2* %result2) { ; same shape as the emit_memintrinsics kernel, but the intrinsics pass 4 as their i32 operand
+entry:
+  %gid = call i64 @_Z12get_local_idj(i32 0) ; only used to index %result in the end block
+  %sa = alloca %struct.S2, align 16 ; first private struct: destination of the first memset
+  %sb = alloca %struct.S2, align 16 ; second private struct: destination of the second memset
+  %sa_i8 = bitcast %struct.S2* %sa to i8*
+  %sb_i8 = bitcast %struct.S2* %sb to i8*
+  %sb_i8as = addrspacecast i8* %sb_i8 to i8 addrspace(1)* ; %sb is accessed through address space 1 from here on
+  %rsi = ptrtoint i64 addrspace(1)* %result to i64
+  %rsit = trunc i64 %rsi to i8 ; low byte of the %result address: a fill value not known at compile time
+  call void @llvm.memset.p0i8.i64(i8*  %sa_i8, i8 %rsit, i64 80, i32 4, i1 false) ; first memset: 80 bytes of %sa filled with %rsit
+  call void @llvm.memset.p1i8.i64(i8 addrspace(1)*  %sb_i8as, i8 0, i64 80, i32 4, i1 false) ; second memset: 80 bytes of %sb filled with constant 0
+  %lr = addrspacecast %struct.S2* %result2 to %struct.S2 addrspace(1)*
+  %lri = bitcast %struct.S2 addrspace(1)* %lr to i64 addrspace(1)*
+  %cond = icmp eq i64 addrspace(1)* %result, %lri ; compares the two kernel arguments to decide whether to run the middle block
+  br i1 %cond, label %middle, label %end
+
+middle:
+  call void @llvm.memcpy.p1i8.p0i8.i64(i8 addrspace(1)*  %sb_i8as, i8* %sa_i8, i64 80, i32 4, i1 false) ; memcpy %sa -> %sb, 80 bytes
+  br label %end
+
+end:
+  %g_343 = getelementptr inbounds %struct.S2, %struct.S2* %sa, i64 0, i32 0 ; first field (i16) of %sa
+  %g_343_load = load i16, i16* %g_343
+  %g_343_zext = zext i16 %g_343_load to i64
+  %resp = getelementptr i64, i64 addrspace(1)* %result, i64 %gid
+  store i64 %g_343_zext, i64 addrspace(1)* %resp, align 8 ; result[gid] = zero-extended first field of %sa
+  %result2_i8 = bitcast %struct.S2* %result2 to i8*
+  call void @llvm.memcpy.p0i8.p1i8.i64(i8* %result2_i8, i8 addrspace(1)* %sb_i8as, i64 80, i32 4, i1 false) ; memcpy %sb -> %result2, 80 bytes
+  call void @llvm.memcpy.p0i8.p1i8.i64(i8* %result2_i8, i8 addrspace(1)* %sb_i8as, i64 80, i32 4, i1 false)  ; identical to the call on the previous line
+  ret void
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
+declare void @llvm.memset.p1i8.i64(i8 addrspace(1)* nocapture, i8, i64, i32, i1)
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memcpy.p1i8.p0i8.i64(i8 addrspace(1)* nocapture, i8* nocapture readonly, i64, i32, i1)
+declare void @llvm.memcpy.p0i8.p1i8.i64(i8* nocapture, i8 addrspace(1)* nocapture readonly, i64, i32, i1)
+
+declare i64 @_Z12get_local_idj(i32)
+
+; Sanity checks: Make sure the non-vecz entry function is still in place and
+; contains memset and memcpy. This is done in order to prevent future bafflement
+; in case some pass optimizes them out.
+; CHECK: define spir_kernel void @entry
+; CHECK: entry:
+; CHECK: call void @llvm.memset
+; CHECK: call void @llvm.memset
+; CHECK: middle:
+; CHECK: call void @llvm.memcpy
+; CHECK: end:
+; CHECK: call void @llvm.memcpy
+
+; And now for the actual checks
+
+; Check if the kernel was vectorized
+; CHECK: define spir_kernel void @__vecz_v{{[0-9]+}}_entry
+
+; Check if the memset and memcpy calls are still there
+; CHECK: call void @llvm.memset
+; CHECK: call void @llvm.memcpy
+
+; End of function
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/expect_assume.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/expect_assume.ll
new file mode 100644
index 0000000000000..4d0117479d970
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/expect_assume.ll
@@ -0,0 +1,87 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+declare void @llvm.assume(i1)
+declare i32 @llvm.expect.i32(i32, i32)
+
+; CHECK: define spir_kernel void @__vecz_v4_assume(
+; CHECK: [[A:%.*]] = load <4 x i32>, ptr %arrayidxa, align 4
+; CHECK: [[B:%.*]] = load <4 x i32>, ptr %arrayidxb, align 4
+; CHECK: [[SUM:%.*]] = add <4 x i32> [[A]], [[B]]
+; CHECK: [[CMP:%.*]] = icmp sgt <4 x i32> [[SUM]], zeroinitializer
+; CHECK: [[E0:%.*]] = extractelement <4 x i1> [[CMP]], i64 0
+; CHECK: [[E1:%.*]] = extractelement <4 x i1> [[CMP]], i64 1
+; CHECK: [[E2:%.*]] = extractelement <4 x i1> [[CMP]], i64 2
+; CHECK: [[E3:%.*]] = extractelement <4 x i1> [[CMP]], i64 3
+; CHECK: call void @llvm.assume(i1 [[E0]])
+; CHECK: call void @llvm.assume(i1 [[E1]])
+; CHECK: call void @llvm.assume(i1 [[E2]])
+; CHECK: call void @llvm.assume(i1 [[E3]])
+; CHECK: store <4 x i32> [[SUM]], ptr %arrayidxz, align 4
+define spir_kernel void @assume(ptr %aptr, ptr %bptr, ptr %zptr) { ; z[i] = a[i] + b[i], with the sum assumed to be positive
+entry:
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0) ; global id in dimension 0 indexes all three buffers
+  %arrayidxa = getelementptr inbounds i32, ptr %aptr, i64 %idx
+  %arrayidxb = getelementptr inbounds i32, ptr %bptr, i64 %idx
+  %arrayidxz = getelementptr inbounds i32, ptr %zptr, i64 %idx
+  %a = load i32, ptr %arrayidxa, align 4
+  %b = load i32, ptr %arrayidxb, align 4
+  %sum = add i32 %a, %b
+  %cond = icmp sgt i32 %sum, 0
+  call void @llvm.assume(i1 %cond) ; the patterns above expect one scalar assume call per lane of the 4-wide compare
+  store i32 %sum, ptr %arrayidxz, align 4
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_expect(
+; CHECK: [[A:%.*]] = load <4 x i32>, ptr %arrayidxa, align 4
+; CHECK: [[B:%.*]] = load <4 x i32>, ptr %arrayidxb, align 4
+; CHECK: [[SUM:%.*]] = add <4 x i32> [[A]], [[B]]
+; CHECK: [[E0:%.*]] = extractelement <4 x i32> [[SUM]], i64 0
+; CHECK: [[E1:%.*]] = extractelement <4 x i32> [[SUM]], i64 1
+; CHECK: [[E2:%.*]] = extractelement <4 x i32> [[SUM]], i64 2
+; CHECK: [[E3:%.*]] = extractelement <4 x i32> [[SUM]], i64 3
+; CHECK: [[EX0:%.*]] = call i32 @llvm.expect.i32(i32 [[E0]], i32 42)
+; CHECK: [[EX1:%.*]] = call i32 @llvm.expect.i32(i32 [[E1]], i32 42)
+; CHECK: [[EX2:%.*]] = call i32 @llvm.expect.i32(i32 [[E2]], i32 42)
+; CHECK: [[EX3:%.*]] = call i32 @llvm.expect.i32(i32 [[E3]], i32 42)
+; CHECK: [[C0:%.*]] = insertelement <4 x i32> undef, i32 [[EX0]], i64 0
+; CHECK: [[C1:%.*]] = insertelement <4 x i32> [[C0]], i32 [[EX1]], i64 1
+; CHECK: [[C2:%.*]] = insertelement <4 x i32> [[C1]], i32 [[EX2]], i64 2
+; CHECK: [[C3:%.*]] = insertelement <4 x i32> [[C2]], i32 [[EX3]], i64 3
+; CHECK: store <4 x i32> [[C3]], ptr %arrayidxz, align 4
+
+define spir_kernel void @expect(ptr %aptr, ptr %bptr, ptr %zptr) { ; z[i] = expect(a[i] + b[i], 42)
+entry:
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0) ; global id in dimension 0 indexes all three buffers
+  %arrayidxa = getelementptr inbounds i32, ptr %aptr, i64 %idx
+  %arrayidxb = getelementptr inbounds i32, ptr %bptr, i64 %idx
+  %arrayidxz = getelementptr inbounds i32, ptr %zptr, i64 %idx
+  %a = load i32, ptr %arrayidxa, align 4
+  %b = load i32, ptr %arrayidxb, align 4
+  %sum = add i32 %a, %b
+  %cond = icmp sgt i32 %sum, 0 ; unused in this kernel; kept so the input stays unchanged
+  %v = call i32 @llvm.expect.i32(i32 %sum, i32 42) ; the patterns above expect one scalar call per lane, re-packed with insertelement
+  store i32 %v, ptr %arrayidxz, align 4
+  ret void
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/extractelement_constant_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/extractelement_constant_index.ll
new file mode 100644
index 0000000000000..ca5e39f7c2efa
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/extractelement_constant_index.ll
@@ -0,0 +1,40 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k extract_constant_index -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @extract_constant_index(<4 x float> addrspace(1)* %in, i32 %x, float addrspace(1)* %out) #0 { ; out[i] = in[i][0]; %x is unused
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2 ; global id in dimension 0 indexes both buffers
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 4
+  %vecext = extractelement <4 x float> %0, i32 0 ; element 0, a compile-time constant index
+  %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call
+  store float %vecext, float addrspace(1)* %arrayidx1, align 4
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+; CHECK: define spir_kernel void @__vecz_v4_extract_constant_index
+; CHECK: call <4 x float> @__vecz_b_interleaved_load4_4_Dv4
+; CHECK: getelementptr inbounds float
+; CHECK: store <4 x float>
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/extractelement_runtime_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/extractelement_runtime_index.ll
new file mode 100644
index 0000000000000..fdd570bc0f47a
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/extractelement_runtime_index.ll
@@ -0,0 +1,50 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k extract_runtime_index -vecz-simd-width=4 -vecz-passes=scalarize -vecz-choices=FullScalarization -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+; Function Attrs: nounwind
+define spir_kernel void @extract_runtime_index(<4 x float> addrspace(1)* %in, i32 %x, float addrspace(1)* %out) #0 { ; out[i] = in[i][x]
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2 ; global id in dimension 0 indexes both buffers
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 4
+  %vecext = extractelement <4 x float> %0, i32 %x ; element index is the kernel argument %x, not a constant
+  %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call
+  store float %vecext, float addrspace(1)* %arrayidx1, align 4
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_extract_runtime_index
+; CHECK: load float, {{(ptr|float)}}
+; CHECK: load float, {{(ptr|float)}}
+; CHECK: load float, {{(ptr|float)}}
+; CHECK: load float, {{(ptr|float)}}
+; CHECK: icmp eq i32 0, %x
+; CHECK: select i1
+; CHECK: icmp eq i32 1, %x
+; CHECK: select i1
+; CHECK: icmp eq i32 2, %x
+; CHECK: select i1
+; CHECK: icmp eq i32 3, %x
+; CHECK: select i1
+; CHECK: store float
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_duplication.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_duplication.ll
new file mode 100644
index 0000000000000..02d2fa8c3c4fa
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_duplication.ll
@@ -0,0 +1,75 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+%struct.testStruct = type { [2 x i32] }
+; Kernel under test: entry and both branches each re-derive element pointers into %myStruct from scratch.
+define spir_kernel void @gep_duplication(ptr addrspace(1) align 4 %out) {
+entry:
+  %out.addr = alloca ptr addrspace(1), align 8
+  %global_id = alloca i32, align 4
+  %myStruct = alloca %struct.testStruct, align 4
+  store ptr addrspace(1) %out, ptr %out.addr, align 8
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %conv = trunc i64 %call to i32
+  store i32 %conv, ptr %global_id, align 4
+  %x = getelementptr inbounds %struct.testStruct, ptr %myStruct, i32 0, i32 0
+  %arrayidx = getelementptr inbounds [2 x i32], ptr %x, i64 0, i64 0
+  store i32 0, ptr %arrayidx, align 4 ; array element 0 := 0
+  %x1 = getelementptr inbounds %struct.testStruct, ptr %myStruct, i32 0, i32 0
+  %arrayidx2 = getelementptr inbounds [2 x i32], ptr %x1, i64 0, i64 1
+  store i32 1, ptr %arrayidx2, align 4 ; array element 1 := 1
+  %0 = load i32, ptr %global_id, align 4
+  %and = and i32 %0, 1
+  %tobool = icmp ne i32 %and, 0 ; true when the low bit of the global id is set
+  br i1 %tobool, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  %x3 = getelementptr inbounds %struct.testStruct, ptr %myStruct, i32 0, i32 0
+  %arrayidx4 = getelementptr inbounds [2 x i32], ptr %x3, i64 0, i64 0
+  %1 = load i32, ptr %arrayidx4, align 4 ; reads array element 0
+  %2 = load ptr addrspace(1), ptr %out.addr, align 8
+  %3 = load i32, ptr %global_id, align 4
+  %idxprom = sext i32 %3 to i64
+  %arrayidx5 = getelementptr inbounds i32, ptr addrspace(1) %2, i64 %idxprom
+  store i32 %1, ptr addrspace(1) %arrayidx5, align 4 ; out[global_id] = array element 0
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %x6 = getelementptr inbounds %struct.testStruct, ptr %myStruct, i32 0, i32 0
+  %arrayidx7 = getelementptr inbounds [2 x i32], ptr %x6, i64 0, i64 1
+  %4 = load i32, ptr %arrayidx7, align 4 ; reads array element 1
+  %5 = load ptr addrspace(1), ptr %out.addr, align 8
+  %6 = load i32, ptr %global_id, align 4
+  %idxprom8 = sext i32 %6 to i64
+  %arrayidx9 = getelementptr inbounds i32, ptr addrspace(1) %5, i64 %idxprom8
+  store i32 %4, ptr addrspace(1) %arrayidx9, align 4 ; out[global_id] = array element 1
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CHECK: spir_kernel void @__vecz_v{{[0-9]+}}_gep_duplication
+; CHECK: entry:
+; CHECK: getelementptr inbounds [2 x i32], ptr %myStruct, i{{32|64}} 0, i{{32|64}} 1
+; CHECK-NOT: getelementptr {{.*}}%myStruct
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_elim_opaque.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_elim_opaque.ll
new file mode 100644
index 0000000000000..aef3ff002cff3
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_elim_opaque.ll
@@ -0,0 +1,54 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; REQUIRES: llvm-15+
+; RUN: %veczc -k test -vecz-simd-width=4 -vecz-passes=gep-elim -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+%struct.mystruct = type { [2 x i32], ptr }
+
+; Kernel under test: %foo is addressed three ways - directly, as a [2 x i32], and as a %struct.mystruct.
+define spir_kernel void @test(ptr addrspace(1) nocapture writeonly align 4 %output) {
+entry:
+  %foo = alloca [4 x %struct.mystruct], align 4
+  %call = tail call spir_func i32 @_Z13get_global_idj(i32 0) ; NOTE(review): called as returning i32, but declared at the end of this file as returning i64 - confirm this is intended
+  store i32 20, ptr %foo, align 4
+  %arrayidx4 = getelementptr inbounds [2 x i32], ptr %foo, i32 0, i32 1 ; source element type is [2 x i32]
+  store i32 22, ptr %arrayidx4, align 4
+  %y31 = getelementptr inbounds %struct.mystruct, ptr %foo, i32 0, i32 1 ; source element type is %struct.mystruct; selects field 1, the ptr member
+  store ptr %foo, ptr %y31, align 4 ; the ptr member now holds the address of %foo
+  %mul = shl nuw nsw i32 %call, 2
+  store i32 1, ptr %foo, align 4
+  %0 = load ptr, ptr %y31, align 4
+  %1 = load i32, ptr %0, align 4 ; loads through the pointer just reloaded from %y31
+  %add98 = add nsw i32 %mul, %1
+  %arrayidx117 = getelementptr inbounds i32, ptr addrspace(1) %output, i32 %mul
+  store i32 %add98, ptr addrspace(1) %arrayidx117, align 4
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CHECK: define spir_kernel void @__vecz_v4_test(
+
+; Make sure all three GEPs are retained
+; CHECK: %arrayidx4 = getelementptr inbounds [2 x i32], ptr %foo, i32 0, i32 1
+; CHECK: %y31 = getelementptr inbounds %struct.mystruct, ptr %foo, i32 0, i32 1
+; CHECK: %arrayidx117 = getelementptr inbounds i32, ptr addrspace(1) %output, i32 %mul
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/indirect_call.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/indirect_call.ll
new file mode 100644
index 0000000000000..d20bb3e42429b
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/indirect_call.ll
@@ -0,0 +1,30 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+; Kernel under test: makes an indirect call through its first argument, passing the second.
+define spir_kernel void @test(ptr, i32) {
+entry:
+  call void %0 (i32 %1)
+  ret void
+}
+
+; This is really just a check that veczc did not crash on the indirect call
+; CHECK: define spir_kernel void @test(
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/inlined_function_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/inlined_function_debug_info.ll
new file mode 100644
index 0000000000000..4dee3cd867207
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/inlined_function_debug_info.ll
@@ -0,0 +1,141 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; Check VECZ debug info for inlined DILocation metadata nodes
+
+; RUN: %veczc -k functions_one -vecz-passes=builtin-inlining -vecz-simd-width=4 -S < %s | %filecheck %s
+
+; ModuleID = '/tmp/inlined_function.ll'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Helper marked alwaysinline (attribute group #0): returns %x * %y.
+define spir_func i32 @k_one(i32 %x, i32 %y) #0 !dbg !4 {
+entry:
+  call void @llvm.dbg.value(metadata i32 %x, i64 0, metadata !9, metadata !38), !dbg !39 ; !9 is the local variable "x"
+  call void @llvm.dbg.value(metadata i32 %y, i64 0, metadata !10, metadata !38), !dbg !39 ; !10 is the local variable "y"
+  %mul = mul nsw i32 %x, %y, !dbg !40
+  ret i32 %mul, !dbg !40
+}
+
+; Function Attrs: nounwind readnone (declared, but no call to it appears in this module)
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+; Kernel under test (nounwind): out1i[gid] = in1i[gid] * in2i[gid]; k_one's body is already inlined here (no call to k_one remains).
+define spir_kernel void @functions_one(i32 addrspace(1)* %in1i, i32 addrspace(1)* %in2i, float addrspace(1)* %in1f, float addrspace(1)* %in2f, i32 addrspace(1)* %out1i, float addrspace(1)* %out1f) #2 !dbg !11 {
+entry:
+  call void @llvm.dbg.value(metadata i32 addrspace(1)* %in1i, i64 0, metadata !18, metadata !38), !dbg !41
+  call void @llvm.dbg.value(metadata i32 addrspace(1)* %in2i, i64 0, metadata !19, metadata !38), !dbg !41
+  call void @llvm.dbg.value(metadata float addrspace(1)* %in1f, i64 0, metadata !20, metadata !38), !dbg !41 ; %in1f is referenced only by this debug intrinsic
+  call void @llvm.dbg.value(metadata float addrspace(1)* %in2f, i64 0, metadata !21, metadata !38), !dbg !41 ; %in2f is referenced only by this debug intrinsic
+  call void @llvm.dbg.value(metadata i32 addrspace(1)* %out1i, i64 0, metadata !22, metadata !38), !dbg !41
+  call void @llvm.dbg.value(metadata float addrspace(1)* %out1f, i64 0, metadata !23, metadata !38), !dbg !41 ; %out1f is referenced only by this debug intrinsic
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #4, !dbg !42
+  call void @llvm.dbg.value(metadata i64 %call, i64 0, metadata !24, metadata !38), !dbg !42
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in1i, i64 %call, !dbg !43
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4, !dbg !43
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %in2i, i64 %call, !dbg !43
+  %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4, !dbg !43
+  call void @llvm.dbg.value(metadata i32 %0, i64 0, metadata !9, metadata !38), !dbg !44 ; k_one's "x" (!9), at a location carrying inlinedAt: !45
+  call void @llvm.dbg.value(metadata i32 %1, i64 0, metadata !10, metadata !38), !dbg !44 ; k_one's "y" (!10), same inlined location
+  %mul.i = mul nsw i32 %0, %1, !dbg !46 ; the multiply from k_one; !46 is scoped to k_one with inlinedAt: !45
+  %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out1i, i64 %call, !dbg !43
+  store i32 %mul.i, i32 addrspace(1)* %arrayidx3, align 4, !dbg !43
+  ret void, !dbg !47
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32) #3
+
+; Function Attrs: nounwind readnone. Four-operand form with an i64 second operand; the expected output at the end of this file uses three operands.
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
+
+attributes #0 = { alwaysinline }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { nobuiltin }
+
+!llvm.dbg.cu = !{!0}
+!opencl.kernels = !{!29}
+!llvm.module.flags = !{!36}
+!llvm.ident = !{!37}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.8.1 ", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2)
+!1 = !DIFile(filename: "kernel.opencl", directory: "Aorta/vecz_build")
+!2 = !{}
+!3 = !{!4, !11}
+!4 = distinct !DISubprogram(name: "k_one", scope: !1, file: !1, line: 1, type: !5, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !8)
+!5 = !DISubroutineType(types: !6)
+!6 = !{!7, !7, !7}
+!7 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+!8 = !{!9, !10}
+!9 = !DILocalVariable(name: "x", arg: 1, scope: !4, file: !1, line: 1, type: !7)
+!10 = !DILocalVariable(name: "y", arg: 2, scope: !4, file: !1, line: 1, type: !7)
+!11 = distinct !DISubprogram(name: "functions_one", scope: !1, file: !1, line: 6, type: !12, isLocal: false, isDefinition: true, scopeLine: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !17)
+!12 = !DISubroutineType(types: !13)
+!13 = !{null, !14, !14, !15, !15, !14, !15}
+!14 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !7, size: 64, align: 64)
+!15 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !16, size: 64, align: 64)
+!16 = !DIBasicType(name: "float", size: 32, align: 32, encoding: DW_ATE_float)
+!17 = !{!18, !19, !20, !21, !22, !23, !24}
+!18 = !DILocalVariable(name: "in1i", arg: 1, scope: !11, file: !1, line: 6, type: !14)
+!19 = !DILocalVariable(name: "in2i", arg: 2, scope: !11, file: !1, line: 6, type: !14)
+!20 = !DILocalVariable(name: "in1f", arg: 3, scope: !11, file: !1, line: 6, type: !15)
+!21 = !DILocalVariable(name: "in2f", arg: 4, scope: !11, file: !1, line: 6, type: !15)
+!22 = !DILocalVariable(name: "out1i", arg: 5, scope: !11, file: !1, line: 6, type: !14)
+!23 = !DILocalVariable(name: "out1f", arg: 6, scope: !11, file: !1, line: 6, type: !15)
+!24 = !DILocalVariable(name: "tid", scope: !11, file: !1, line: 7, type: !25)
+!25 = !DIDerivedType(tag: DW_TAG_typedef, name: "size_t", file: !26, line: 33, baseType: !27)
+!26 = !DIFile(filename: "Aorta/OCL/modules/builtins/include/builtins/builtins.h", directory: "Aorta/vecz_build")
+!27 = !DIDerivedType(tag: DW_TAG_typedef, name: "ulong", file: !26, line: 31, baseType: !28)
+!28 = !DIBasicType(name: "long unsigned int", size: 64, align: 64, encoding: DW_ATE_unsigned)
+!29 = !{void (i32 addrspace(1)*, i32 addrspace(1)*, float addrspace(1)*, float addrspace(1)*, i32 addrspace(1)*, float addrspace(1)*)* @functions_one, !30, !31, !32, !33, !34, !35}
+!30 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1, i32 1, i32 1, i32 1}
+!31 = !{!"kernel_arg_access_qual", !"none", !"none", !"none", !"none", !"none", !"none"}
+!32 = !{!"kernel_arg_type", !"int*", !"int*", !"float*", !"float*", !"int*", !"float*"}
+!33 = !{!"kernel_arg_base_type", !"int*", !"int*", !"float*", !"float*", !"int*", !"float*"}
+!34 = !{!"kernel_arg_type_qual", !"", !"", !"", !"", !"", !""}
+!35 = !{!"reqd_work_group_size", i32 32, i32 1, i32 1}
+!36 = !{i32 2, !"Debug Info Version", i32 3}
+!37 = !{!"clang version 3.8.1 "}
+!38 = !DIExpression()
+!39 = !DILocation(line: 1, scope: !4)
+!40 = !DILocation(line: 2, scope: !4)
+!41 = !DILocation(line: 6, scope: !11)
+!42 = !DILocation(line: 7, scope: !11)
+!43 = !DILocation(line: 8, scope: !11)
+!44 = !DILocation(line: 1, scope: !4, inlinedAt: !45)
+!45 = distinct !DILocation(line: 8, scope: !11)
+!46 = !DILocation(line: 2, scope: !4, inlinedAt: !45)
+!47 = !DILocation(line: 9, scope: !11)
+
+; CHECK: spir_func i32 @k_one
+; CHECK-SAME: !dbg [[HELPER_DI:![0-9]+]]
+
+; CHECK: define spir_kernel void @__vecz_v4_functions_one
+; CHECK-SAME: !dbg [[KERN_DI:![0-9]+]]
+
+; CHECK: %[[LOAD1:[0-9]+]] = load i32, ptr addrspace(1) %{{.*}}, align 4
+; CHECK: %[[LOAD2:[0-9]+]] = load i32, ptr addrspace(1) %{{.*}}, align 4
+; CHECK: call void @llvm.dbg.value(metadata i32 %[[LOAD1]], metadata !{{[0-9]+}}, metadata !DIExpression()), !dbg [[DI_LOC1:![0-9]+]]
+; CHECK: call void @llvm.dbg.value(metadata i32 %[[LOAD2]], metadata !{{[0-9]+}}, metadata !DIExpression()), !dbg [[DI_LOC1]]
+; CHECK: %{{.*}} = mul nsw i32 %[[LOAD1]], %[[LOAD2]], !dbg [[DI_LOC2:![0-9]+]]
+
+; CHECK: [[HELPER_SUBPROGRAM:![0-9]+]] = distinct !DISubprogram(name: "k_one",
+
+; CHECK: [[DI_LOC1]] = !DILocation(line: 1, scope: [[HELPER_SUBPROGRAM]], inlinedAt: [[DI_INLINED_AT:![0-9]+]])
+; CHECK: [[DI_INLINED_AT]] = distinct !DILocation(line: 8,
+; CHECK: [[DI_LOC2]] = !DILocation(line: 2, scope: [[HELPER_SUBPROGRAM]], inlinedAt: [[DI_INLINED_AT]])
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insert_element_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insert_element_debug_info.ll
new file mode 100644
index 0000000000000..5dfe0f4e551c4
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insert_element_debug_info.ll
@@ -0,0 +1,142 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; Regression test for debug info bug related to creating llvm.dbg.value
+; intrinsics across all lanes even when scalarization masks disable some
+; of the lanes. This occurs when we scalarize insertelement instructions.
+
+; RUN: %veczc -k unaligned_load -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Kernel under test (nounwind): vload3 from in + 3*tid, spilled to the alloca %tmp, then lanes 0..2 are stored to out[3*tid + 0..2].
+define spir_kernel void @unaligned_load(i32 addrspace(1)* %in, i32 addrspace(1)* %offsets, i32 addrspace(1)* %out) #0 !dbg !7 {
+entry:
+  %in.addr = alloca i32 addrspace(1)*, align 8
+  %offsets.addr = alloca i32 addrspace(1)*, align 8
+  %out.addr = alloca i32 addrspace(1)*, align 8
+  %tid = alloca i32, align 4
+  %tmp = alloca <3 x i32>, align 16
+  store i32 addrspace(1)* %in, i32 addrspace(1)** %in.addr, align 8
+  call void @llvm.dbg.declare(metadata i32 addrspace(1)** %in.addr, metadata !11, metadata !29), !dbg !30
+  store i32 addrspace(1)* %offsets, i32 addrspace(1)** %offsets.addr, align 8 ; %offsets.addr is never loaded in this function
+  call void @llvm.dbg.declare(metadata i32 addrspace(1)** %offsets.addr, metadata !12, metadata !29), !dbg !30
+  store i32 addrspace(1)* %out, i32 addrspace(1)** %out.addr, align 8
+  call void @llvm.dbg.declare(metadata i32 addrspace(1)** %out.addr, metadata !13, metadata !29), !dbg !30
+  call void @llvm.dbg.declare(metadata i32* %tid, metadata !14, metadata !29), !dbg !31
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #3, !dbg !31
+  %conv = trunc i64 %call to i32, !dbg !31
+  store i32 %conv, i32* %tid, align 4, !dbg !31
+  call void @llvm.dbg.declare(metadata <3 x i32>* %tmp, metadata !15, metadata !29), !dbg !32 ; !15 is the local variable "tmp"
+  %0 = load i32 addrspace(1)*, i32 addrspace(1)** %in.addr, align 8, !dbg !32
+  %1 = load i32, i32* %tid, align 4, !dbg !32
+  %mul = mul nsw i32 3, %1, !dbg !32
+  %idx.ext = sext i32 %mul to i64, !dbg !32
+  %add.ptr = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 %idx.ext, !dbg !32
+  %call1 = call spir_func <3 x i32> @_Z6vload3mPKU3AS1i(i64 0, i32 addrspace(1)* %add.ptr) #3, !dbg !32
+  %extractVec = shufflevector <3 x i32> %call1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>, !dbg !32 ; widens to <4 x i32>; lane 3 is undef
+  %storetmp = bitcast <3 x i32>* %tmp to <4 x i32>*, !dbg !32
+  store <4 x i32> %extractVec, <4 x i32>* %storetmp, align 16, !dbg !32 ; %tmp is written as a full <4 x i32>
+  %2 = load <3 x i32>, <3 x i32>* %tmp, align 16, !dbg !33
+  %3 = extractelement <3 x i32> %2, i64 0, !dbg !33
+  %4 = load i32, i32* %tid, align 4, !dbg !33
+  %mul2 = mul nsw i32 3, %4, !dbg !33
+  %idxprom = sext i32 %mul2 to i64, !dbg !33
+  %5 = load i32 addrspace(1)*, i32 addrspace(1)** %out.addr, align 8, !dbg !33
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %5, i64 %idxprom, !dbg !33
+  store i32 %3, i32 addrspace(1)* %arrayidx, align 4, !dbg !33 ; out[3*tid] = lane 0
+  %6 = load <3 x i32>, <3 x i32>* %tmp, align 16, !dbg !34
+  %7 = extractelement <3 x i32> %6, i64 1, !dbg !34
+  %8 = load i32, i32* %tid, align 4, !dbg !34
+  %mul3 = mul nsw i32 3, %8, !dbg !34
+  %add = add nsw i32 %mul3, 1, !dbg !34
+  %idxprom4 = sext i32 %add to i64, !dbg !34
+  %9 = load i32 addrspace(1)*, i32 addrspace(1)** %out.addr, align 8, !dbg !34
+  %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %9, i64 %idxprom4, !dbg !34
+  store i32 %7, i32 addrspace(1)* %arrayidx5, align 4, !dbg !34 ; out[3*tid + 1] = lane 1
+  %10 = load <3 x i32>, <3 x i32>* %tmp, align 16, !dbg !35
+  %11 = extractelement <3 x i32> %10, i64 2, !dbg !35
+  %12 = load i32, i32* %tid, align 4, !dbg !35
+  %mul6 = mul nsw i32 3, %12, !dbg !35
+  %add7 = add nsw i32 %mul6, 2, !dbg !35
+  %idxprom8 = sext i32 %add7 to i64, !dbg !35
+  %13 = load i32 addrspace(1)*, i32 addrspace(1)** %out.addr, align 8, !dbg !35
+  %arrayidx9 = getelementptr inbounds i32, i32 addrspace(1)* %13, i64 %idxprom8, !dbg !35
+  store i32 %11, i32 addrspace(1)* %arrayidx9, align 4, !dbg !35 ; out[3*tid + 2] = lane 2
+  ret void, !dbg !36
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+declare spir_func i64 @_Z13get_global_idj(i32) #2
+
+declare spir_func <3 x i32> @_Z6vload3mPKU3AS1i(i64, i32 addrspace(1)*) #2
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nobuiltin }
+
+!llvm.dbg.cu = !{!0}
+!opencl.kernels = !{!21}
+!llvm.module.flags = !{!27}
+!llvm.ident = !{!28}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.8.1 ", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2, retainedTypes: !3)
+!1 = !DIFile(filename: "kernel.opencl", directory: "/home/Aorta/vecz_build")
+!2 = !{}
+!3 = !{!4}
+!4 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !5, size: 64, align: 64)
+!5 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+!6 = !{!7}
+!7 = distinct !DISubprogram(name: "unaligned_load", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !10)
+!8 = !DISubroutineType(types: !9)
+!9 = !{null, !4, !4, !4}
+!10 = !{!11, !12, !13, !14, !15}
+!11 = !DILocalVariable(name: "in", arg: 1, scope: !7, file: !1, line: 1, type: !4)
+!12 = !DILocalVariable(name: "offsets", arg: 2, scope: !7, file: !1, line: 1, type: !4)
+!13 = !DILocalVariable(name: "out", arg: 3, scope: !7, file: !1, line: 1, type: !4)
+!14 = !DILocalVariable(name: "tid", scope: !7, file: !1, line: 2, type: !5)
+!15 = !DILocalVariable(name: "tmp", scope: !7, file: !1, line: 3, type: !16)
+!16 = !DIDerivedType(tag: DW_TAG_typedef, name: "int3", file: !17, line: 64, baseType: !18)
+!17 = !DIFile(filename: "/home//Aorta/OCL/modules/builtins/include/builtins/builtins.h", directory: "/home/Aorta/vecz_build")
+!18 = !DICompositeType(tag: DW_TAG_array_type, baseType: !5, size: 128, align: 128, flags: DIFlagVector, elements: !19)
+!19 = !{!20}
+!20 = !DISubrange(count: 3)
+!21 = !{void (i32 addrspace(1)*, i32 addrspace(1)*, i32 addrspace(1)*)* @unaligned_load, !22, !23, !24, !25, !26}
+!22 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1}
+!23 = !{!"kernel_arg_access_qual", !"none", !"none", !"none"}
+!24 = !{!"kernel_arg_type", !"int*", !"int*", !"int*"}
+!25 = !{!"kernel_arg_base_type", !"int*", !"int*", !"int*"}
+!26 = !{!"kernel_arg_type_qual", !"", !"", !""}
+!27 = !{i32 2, !"Debug Info Version", i32 3}
+!28 = !{!"clang version 3.8.1 "}
+!29 = !DIExpression()
+!30 = !DILocation(line: 1, scope: !7)
+!31 = !DILocation(line: 2, scope: !7)
+!32 = !DILocation(line: 3, scope: !7)
+!33 = !DILocation(line: 4, scope: !7)
+!34 = !DILocation(line: 5, scope: !7)
+!35 = !DILocation(line: 6, scope: !7)
+!36 = !DILocation(line: 7, scope: !7)
+
+; CHECK: define spir_kernel void @__vecz_v4_unaligned_load
+; CHECK: %tmp = alloca <16 x i32>, align 16
+; CHECK: %[[TMP_LD:.+]] = call <4 x i32> @__vecz_b_interleaved_load4_4_Dv4_ju3ptr(ptr nonnull %tmp)
+; CHECK: call void @llvm.dbg.value(metadata <4 x i32> %[[TMP_LD]], metadata !{{[0-9]+}}, metadata !DIExpression()), !dbg !{{[0-9]+}}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insertelement_constant_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insertelement_constant_index.ll
new file mode 100644
index 0000000000000..827a2debad03e
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insertelement_constant_index.ll
@@ -0,0 +1,48 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k constant_index -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+; Kernel under test: loads in[gid], overwrites lane 2 (a constant index) with 42, stores to out[gid].
+define spir_kernel void @constant_index(ptr %in, ptr %out) {
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds <4 x i32>, ptr %in, i64 %call
+  %0 = load <4 x i32>, ptr %arrayidx
+  %arrayidx2 = getelementptr inbounds <4 x i32>, ptr %out, i64 %call
+  %vecins = insertelement <4 x i32> %0, i32 42, i32 2
+  store <4 x i32> %vecins, ptr %arrayidx2
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_constant_index
+
+; We should only have 3 loads since one of the elements will be replaced
+; CHECK: call <4 x i32> @__vecz_b_interleaved_load4_4_Dv4_ju3ptr
+; CHECK: call <4 x i32> @__vecz_b_interleaved_load4_4_Dv4_ju3ptr
+; CHECK: call <4 x i32> @__vecz_b_interleaved_load4_4_Dv4_ju3ptr
+; CHECK-NOT: call <4 x i32> @__vecz_b_interleaved_load4_4_Dv4_ju3ptr
+
+; We should have four stores, one of which would use the constant given
+; CHECK: store <4 x i32>
+; CHECK: store <4 x i32>
+; CHECK: store <4 x i32>
+; CHECK: store <4 x i32>
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insertelement_runtime_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insertelement_runtime_index.ll
new file mode 100644
index 0000000000000..ca1b9b4ce1ad4
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insertelement_runtime_index.ll
@@ -0,0 +1,56 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k runtime_index -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+; Kernel under test: stores in[gid] to out[gid], then stores it again with lane index[gid] replaced by 42.
+define spir_kernel void @runtime_index(ptr %in, ptr %out, ptr %index) {
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds <4 x i32>, ptr %in, i64 %call
+  %0 = load <4 x i32>, ptr %arrayidx
+  %arrayidx1 = getelementptr inbounds <4 x i32>, ptr %out, i64 %call
+  store <4 x i32> %0, ptr %arrayidx1
+  %arrayidx2 = getelementptr inbounds i32, ptr %index, i64 %call
+  %1 = load i32, ptr %arrayidx2
+  %arrayidx3 = getelementptr inbounds <4 x i32>, ptr %out, i64 %call
+  %vecins = insertelement <4 x i32> %0, i32 42, i32 %1
+  store <4 x i32> %vecins, ptr %arrayidx3
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_runtime_index
+
+; Four icmps and selects
+; CHECK: icmp eq <4 x i32> %{{.+}}, zeroinitializer
+; CHECK: select <4 x i1> %{{.+}}, <4 x i32> <i32 42, i32 42, i32 42, i32 42>
+; CHECK: icmp eq <4 x i32> %{{.+}}, <i32 1, i32 1, i32 1, i32 1>
+; CHECK: select <4 x i1> %{{.+}}, <4 x i32> <i32 42, i32 42, i32 42, i32 42>
+; CHECK: icmp eq <4 x i32> %{{.+}}, <i32 2, i32 2, i32 2, i32 2>
+; CHECK: select <4 x i1> %{{.+}}, <4 x i32> <i32 42, i32 42, i32 42, i32 42>
+; CHECK: icmp eq <4 x i32> %{{.+}}, <i32 3, i32 3, i32 3, i32 3>
+; CHECK: select <4 x i1> %{{.+}}, <4 x i32> <i32 42, i32 42, i32 42, i32 42>
+
+; Four stores
+; CHECK: store <4 x i32>
+; CHECK: store <4 x i32>
+; CHECK: store <4 x i32>
+; CHECK: store <4 x i32>
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/instantiate_constants.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/instantiate_constants.ll
new file mode 100644
index 0000000000000..3e7c52d758602
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/instantiate_constants.ll
@@ -0,0 +1,95 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test -vecz-simd-width 4 -S < %s | %filecheck %s
+
+; ModuleID = 'Unknown buffer'
+source_filename = "kernel.opencl"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @test(half addrspace(1)* nocapture readonly %p, float addrspace(1)* nocapture %f) local_unnamed_addr #0 {
+entry:
+  %data = alloca [1 x i16], align 2
+  %0 = bitcast [1 x i16]* %data to i8*
+  %arraydecay = getelementptr inbounds [1 x i16], [1 x i16]* %data, i64 0, i64 0
+  %1 = bitcast [1 x i16]* %data to half*
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #5
+  %arrayidx7 = getelementptr inbounds half, half addrspace(1)* %p, i64 %call
+  %arrayidx = bitcast half addrspace(1)* %arrayidx7 to i16 addrspace(1)*
+  %2 = load i16, i16 addrspace(1)* %arrayidx, align 2, !tbaa !9
+  store i16 %2, i16* %arraydecay, align 2, !tbaa !9
+  %call2 = call spir_func float @_Z11vloada_halfmPKDh(i64 0, half* nonnull %1) #6
+  %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %f, i64 %call
+  store float %call2, float addrspace(1)* %arrayidx3, align 4, !tbaa !13
+  ret void
+}
+
+; Function Attrs: convergent nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) local_unnamed_addr #2
+
+; Function Attrs: convergent nounwind
+declare spir_func float @_Z11vloada_halfmPKDh(i64, half*) local_unnamed_addr #3
+
+attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { argmemonly nounwind }
+attributes #2 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { nounwind }
+attributes #5 = { convergent nobuiltin nounwind readonly }
+attributes #6 = { convergent nobuiltin nounwind }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.kernels = !{!2}
+!host.build_options = !{!8}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{void (half addrspace(1)*, float addrspace(1)*)* @test, !3, !4, !5, !6, !7}
+!3 = !{!"kernel_arg_addr_space", i32 1, i32 1}
+!4 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!5 = !{!"kernel_arg_type", !"half*", !"float*"}
+!6 = !{!"kernel_arg_base_type", !"half*", !"float*"}
+!7 = !{!"kernel_arg_type_qual", !"const", !""}
+!8 = !{!""}
+!9 = !{!10, !10, i64 0}
+!10 = !{!"short", !11, i64 0}
+!11 = !{!"omnipotent char", !12, i64 0}
+!12 = !{!"Simple C/C++ TBAA"}
+!13 = !{!14, !14, i64 0}
+!14 = !{!"float", !11, i64 0}
+
+; This test checks that an instantiated call with a constant operand gets
+; that operand instantiated (packet-broadcast) correctly instead of causing the
+; instantiation of the call to fail, thereby causing the packetization of the
+; store to fail.
+; CHECK: define spir_kernel void @__vecz_v4_test
+
+; CHECK: %[[C0:.+]] = call spir_func float @_Z11vloada_halfmPKDh(i64 0, ptr nonnull %{{.+}})
+; CHECK: %[[C1:.+]] = call spir_func float @_Z11vloada_halfmPKDh(i64 0, ptr nonnull %{{.+}})
+; CHECK: %[[C2:.+]] = call spir_func float @_Z11vloada_halfmPKDh(i64 0, ptr nonnull %{{.+}})
+; CHECK: %[[C3:.+]] = call spir_func float @_Z11vloada_halfmPKDh(i64 0, ptr nonnull %{{.+}})
+; CHECK: %[[G0:.+]] = insertelement <4 x float> undef, float %[[C0]], {{(i32|i64)}} 0
+; CHECK: %[[G1:.+]] = insertelement <4 x float> %[[G0]], float %[[C1]], {{(i32|i64)}} 1
+; CHECK: %[[G2:.+]] = insertelement <4 x float> %[[G1]], float %[[C2]], {{(i32|i64)}} 2
+; CHECK: %[[G3:.+]] = insertelement <4 x float> %[[G2]], float %[[C3]], {{(i32|i64)}} 3
+; CHECK: store <4 x float> %[[G3]], ptr addrspace(1) %{{.+}}
+; CHECK-NOT: store float
+
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_defuse_instantiated.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_defuse_instantiated.ll
new file mode 100644
index 0000000000000..bd2595f5c40dd
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_defuse_instantiated.ll
@@ -0,0 +1,90 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k printf_kernel -vecz-simd-width=4 -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+@.str = private unnamed_addr addrspace(2) constant [8 x i8] c"blah %d\00", align 1
+@.strf = private unnamed_addr addrspace(2) constant [7 x i8] c"%#16A\0A\00", align 1
+
+; Function Attrs: nounwind
+define spir_kernel void @printf_kernel(i32 addrspace(1)* %in, i32 addrspace(1)* %stridesX, i32 addrspace(1)* %dst, i32 %width, i32 %height) #0 {
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #3
+  %cmp = icmp eq i32 %width, 13
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %call1 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([8 x i8], [8
+ x i8] addrspace(2)* @.str, i64 0, i64 0), i32 %0) #3
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void
+}
+
+define spir_kernel void @test_float(float* %in) {
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds float, float* %in, i64 %call
+  %0 = load float, float* %arrayidx, align 4
+  %mul = fmul float %0, %0
+  %conv = fpext float %mul to double
+  %call8 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([7 x i8], [7 x i8] addrspace(2)* @.strf, i64 0, i64 0), double %conv)
+  ret void
+}
+
+
+
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+declare extern_weak spir_func i32 @printf(i8 addrspace(2)*, ...) #1
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nobuiltin nounwind }
+
+!opencl.kernels = !{!0}
+!llvm.ident = !{!6}
+
+!0 = !{void (i32 addrspace(1)*, i32 addrspace(1)*, i32 addrspace(1)*, i32, i32)* @printf_kernel, !1, !2, !3, !4, !5}
+!1 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1, i32 0, i32 0}
+!2 = !{!"kernel_arg_access_qual", !"none", !"none", !"none", !"none", !"none"}
+!3 = !{!"kernel_arg_type", !"int*", !"int*", !"int*", !"int", !"int"}
+!4 = !{!"kernel_arg_base_type", !"int*", !"int*", !"int*", !"int", !"int"}
+!5 = !{!"kernel_arg_type_qual", !"", !"", !"", !"", !""}
+!6 = !{!"clang version 3.8.0 "}
+
+; CHECK: entry:
+; CHECK: if.then:
+; CHECK  extractelement
+; CHECK-NEXT  extractelement
+; CHECK-NEXT    %4 = call spir_func i32 @__vecz_b_masked_printf_PU3AS2hjb(i8 addrspace(2)* getelementptr inbounds ([8 x i8], [8
+; CHECK  extractelement
+; CHECK-NEXT  extractelement
+; CHECK-NEXT    %4 = call spir_func i32 @__vecz_b_masked_printf_PU3AS2hjb(i8 addrspace(2)* getelementptr inbounds ([8 x i8], [8
+; CHECK  extractelement
+; CHECK-NEXT  extractelement
+; CHECK-NEXT    %4 = call spir_func i32 @__vecz_b_masked_printf_PU3AS2hjb(i8 addrspace(2)* getelementptr inbounds ([8 x i8], [8
+; CHECK  extractelement
+; CHECK-NEXT  extractelement
+; CHECK-NEXT    %4 = call spir_func i32 @__vecz_b_masked_printf_PU3AS2hjb(i8 addrspace(2)* getelementptr inbounds ([8 x i8], [8
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_load16.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_load16.ll
new file mode 100644
index 0000000000000..637a8ee2a7b47
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_load16.ll
@@ -0,0 +1,87 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k load16 -vecz-simd-width 4 -S < %s | %filecheck %s
+
+; ModuleID = 'Unknown buffer'
+source_filename = "kernel.opencl"
+target datalayout = "e-m:e-p:32:32-f64:64-i64:64-v128:64-v64:64-v32:32-v16:16-n8:16:32-S64"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @load16(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %stride) #0 !shave_original_kernel !10 {
+entry:
+  %call = call spir_func i32 @_Z13get_global_idj(i32 0) #2
+  %call1 = call spir_func i32 @_Z13get_global_idj(i32 1) #2
+  %mul = mul nsw i32 %call1, %stride
+  %add = add nsw i32 %mul, %call
+  %mul2 = shl nsw i32 %add, 1
+  %arrayidx = getelementptr inbounds i8, i8 addrspace(1)* %in, i32 %mul2
+  %0 = load i8, i8 addrspace(1)* %arrayidx, align 1
+  %mul3 = mul nsw i32 %call1, %stride
+  %add4 = add nsw i32 %mul3, %call
+  %mul5 = shl nsw i32 %add4, 1
+  %add6 = add i32 %mul5, 3
+  %arrayidx7 = getelementptr inbounds i8, i8 addrspace(1)* %in, i32 %add6
+  %1 = load i8, i8 addrspace(1)* %arrayidx7, align 1
+  %add9 = add i8 %1, %0
+  %mul11 = mul nsw i32 %call1, %stride
+  %add12 = add nsw i32 %mul11, %call
+  %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i32 %add12
+  store i8 %add9, i8 addrspace(1)* %arrayidx13, align 1
+  ret void
+}
+
+; Function Attrs: convergent nounwind readonly
+declare spir_func i32 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { convergent nobuiltin nounwind readonly }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!llvm.ident = !{!2}
+!opencl.kernels = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{!"clang version 7.0.0 (tags/RELEASE_700/final) (based on LLVM 7.0.0)"}
+!3 = !{void (i8 addrspace(1)*, i8 addrspace(1)*, i32)* @load16, !4, !5, !6, !7, !8, !9}
+!4 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 0}
+!5 = !{!"kernel_arg_access_qual", !"none", !"none", !"none"}
+!6 = !{!"kernel_arg_type", !"uchar*", !"uchar*", !"int"}
+!7 = !{!"kernel_arg_base_type", !"uchar*", !"uchar*", !"int"}
+!8 = !{!"kernel_arg_type_qual", !"", !"", !""}
+!9 = !{!"kernel_arg_name", !"out", !"in", !"stride"}
+!10 = !{!"load16"}
+
+; Function start
+; CHECK: define spir_kernel void @__vecz_v4_load16
+
+; There should be exactly 2 interleaved loads in the code
+; CHECK: call <4 x i8> @__vecz_b_interleaved_load1_2_Dv4_hu3ptrU3AS1
+; CHECK: call <4 x i8> @__vecz_b_interleaved_load1_2_Dv4_hu3ptrU3AS1
+
+; There shouldn't be any more interleaved loads or stores left
+; CHECK-NOT: call <4 x i8> @__vecz_b_interleaved_load
+
+; There definitely shouldn't be any gather loads
+; CHECK-NOT: call <4 x i8> @__vecz_b_gather_load
+
+; Function end
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_load_ooo.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_load_ooo.ll
new file mode 100644
index 0000000000000..602d4ebfd60b0
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_load_ooo.ll
@@ -0,0 +1,58 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc --vecz-passes=interleave-combine-loads -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; This test checks that we can optimize interleaved accesses out of order.
+
+define dso_local spir_kernel void @interleaved_load_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %stride) {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %conv = trunc i64 %call to i32
+  %call1 = tail call spir_func i64 @_Z13get_global_idj(i32 1)
+  %conv2 = trunc i64 %call1 to i32
+  %mul = mul nsw i32 %conv2, %stride
+  %add = add nsw i32 %conv, %mul
+  %mul3 = shl nsw i32 %add, 1
+  %add4 = or i32 %mul3, 1
+  %idxprom = sext i32 %add4 to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom
+  %0 = call <4 x i32> @__vecz_b_interleaved_load4_2_Dv4_jPU3AS1j(i32 addrspace(1)* %arrayidx)
+  %idxprom8 = sext i32 %mul3 to i64
+  %arrayidx9 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom8
+  %1 = call <4 x i32> @__vecz_b_interleaved_load4_2_Dv4_jPU3AS1j(i32 addrspace(1)* %arrayidx9)
+  %sub1 = sub nsw <4 x i32> %0, %1
+  %idxprom12 = sext i32 %add to i64
+  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom12
+  %2 = bitcast i32 addrspace(1)* %arrayidx13 to <4 x i32> addrspace(1)*
+  store <4 x i32> %sub1, <4 x i32> addrspace(1)* %2, align 4
+  ret void
+}
+
+; CHECK: __vecz_v4_interleaved_load_4(
+; CHECK:  [[TMP1:%.*]] = load <4 x i32>, ptr addrspace(1) [[PTR:%.*]], align 4
+; CHECK:  [[TMP2:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i32 4
+; CHECK:  [[TMP4:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP2]], align 4
+; CHECK:  %deinterleave = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK:  %deinterleave1 = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP4]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK:  %sub1 = sub nsw <4 x i32> %deinterleave1, %deinterleave
+
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare <4 x i32> @__vecz_b_interleaved_load4_2_Dv4_jPU3AS1j(i32 addrspace(1)*)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_safety.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_safety.ll
new file mode 100644
index 0000000000000..ae8d6eb8e8ed8
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_safety.ll
@@ -0,0 +1,95 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k f -vecz-simd-width 4 -vecz-choices=FullScalarization -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) #0 {
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #3
+  %add.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call
+  %.cast = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %add.ptr, i64 0, i64 0
+  %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
+  call spir_func void @_Z7barrierj(i32 2) #3
+  store double 1.600000e+01, double addrspace(1)* %.cast, align 8
+  %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
+  %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
+  %vecins7 = shufflevector <4 x double> %vecins5, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+  %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call
+  %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+  %arrayidx8 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %d, i64 %call
+  %3 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx8, align 32
+  %arrayidx9 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %e, i64 %call
+  %4 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx9, align 32
+  %div = fdiv <4 x double> %3, %4
+  %5 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %vecins7, <4 x double> %2, <4 x double> %div)
+  %arrayidx10 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %a, i64 %call
+  %6 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx10, align 32
+  %sub = fsub <4 x double> %6, %5
+  store <4 x double> %sub, <4 x double> addrspace(1)* %arrayidx10, align 32
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+declare spir_func void @_Z7barrierj(i32) #1
+
+; Function Attrs: nounwind readnone
+declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) #2
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone }
+attributes #3 = { nobuiltin nounwind }
+
+!opencl.kernels = !{!0}
+!llvm.ident = !{!6}
+
+!0 = !{void (<4 x double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x double> addrspace(1)*, i8 addrspace(1)*)* @f, !1, !2, !3, !4, !5}
+!1 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1, i32 1, i32 1, i32 1}
+!2 = !{!"kernel_arg_access_qual", !"none", !"none", !"none", !"none", !"none", !"none"}
+!3 = !{!"kernel_arg_type", !"double4*", !"double4*", !"double4*", !"double4*", !"double4*", !"char*"}
+!4 = !{!"kernel_arg_base_type", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"char*"}
+!5 = !{!"kernel_arg_type_qual", !"", !"", !"", !"", !"", !""}
+!6 = !{!"clang version 3.8.1 "}
+
+; Function start
+; CHECK: define spir_kernel void @__vecz_v4_f
+; CHECK: call spir_func i64 @_Z13get_global_idj(i32 0)
+
+; There should be exactly 4 interleaved loads and one store in the code
+; CHECK: call <4 x double> @__vecz_b_interleaved_load8_4_Dv4_du3ptrU3AS1
+; CHECK: call <4 x double> @__vecz_b_interleaved_load8_4_Dv4_du3ptrU3AS1
+
+; And in between them there should be a barrier call
+; CHECK: call spir_func void @_Z7barrierj
+; CHECK: call void @__vecz_b_interleaved_store8_4_Dv4_du3ptrU3AS1(<4 x double> <double 1.600000e+01, double 1.600000e+01, double 1.600000e+01, double 1.600000e+01>
+; CHECK: call <4 x double> @__vecz_b_interleaved_load8_4_Dv4_du3ptrU3AS1
+; CHECK: call <4 x double> @__vecz_b_interleaved_load8_4_Dv4_du3ptrU3AS1
+
+; There shouldn't be any more interleaved loads or stores left
+; CHECK-NOT: call <4 x double> @__vecz_b_interleaved_load4_Dv4_du3ptrU3AS1
+; CHECK-NOT: call void @__vecz_b_interleaved_store8_4_Dv4_du3ptrU3AS1(<4 x double> <double 1.600000e+01, double 1.600000e+01, double 1.600000e+01, double 1.600000e+01>
+
+; There should be some shufflevector instructions after the simplification
+; CHECK: shufflevector
+
+; Function end
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics-scalarize.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics-scalarize.ll
new file mode 100644
index 0000000000000..f272670b419a3
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics-scalarize.ll
@@ -0,0 +1,207 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k ctpop -vecz-simd-width=2 -vecz-choices=FullScalarization -S < %s | %filecheck %s --check-prefix CTPOP
+; RUN: %veczc -k ctlz -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | %filecheck %s --check-prefix CTLZ
+; RUN: %veczc -k cttz -vecz-simd-width=8 -vecz-choices=FullScalarization -S < %s | %filecheck %s --check-prefix CTTZ
+; RUN: %veczc -k sadd_sat -vecz-simd-width=2 -vecz-choices=FullScalarization -S < %s | %filecheck %s --check-prefix SADD_SAT
+; RUN: %veczc -k uadd_sat -vecz-simd-width=2 -vecz-choices=FullScalarization -S < %s | %filecheck %s --check-prefix UADD_SAT
+; RUN: %veczc -k ssub_sat -vecz-simd-width=2 -vecz-choices=FullScalarization -S < %s | %filecheck %s --check-prefix SSUB_SAT
+; RUN: %veczc -k usub_sat -vecz-simd-width=2 -vecz-choices=FullScalarization -S < %s | %filecheck %s --check-prefix USUB_SAT
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; This test checks that the scalar intrinsics get vectorized,
+; and the vector intrinsics get scalarized and then re-vectorized.
+
+define spir_kernel void @ctpop(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
+entry:
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+  %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx
+  %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
+  %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx
+  %a = load i32, i32* %arrayidxa, align 4
+  %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2
+  %ctpopi32 = call i32 @llvm.ctpop.i32(i32 %a)
+  %ctpopv2i8 = call <2 x i8> @llvm.ctpop.v2i8(<2 x i8> %b)
+  store i32 %ctpopi32, i32* %arrayidxy, align 4
+  store <2 x i8> %ctpopv2i8, <2 x i8>* %arrayidxz, align 2
+  ret void
+}
+
+define spir_kernel void @ctlz(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
+entry:
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+  %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx
+  %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
+  %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx
+  %a = load i32, i32* %arrayidxa, align 4
+  %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2
+  %ctlzi32 = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
+  %ctlzv2i8 = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> %b, i1 false)
+  store i32 %ctlzi32, i32* %arrayidxy, align 4
+  store <2 x i8> %ctlzv2i8, <2 x i8>* %arrayidxz, align 2
+  ret void
+}
+
+define spir_kernel void @cttz(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
+entry:
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+  %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx
+  %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
+  %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx
+  %a = load i32, i32* %arrayidxa, align 4
+  %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2
+  %cttzi32 = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+  %cttzv2i8 = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> %b, i1 false)
+  store i32 %cttzi32, i32* %arrayidxy, align 4
+  store <2 x i8> %cttzv2i8, <2 x i8>* %arrayidxz, align 2
+  ret void
+}
+
+define spir_kernel void @sadd_sat(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
+entry:
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+  %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
+  %a = load i32, i32* %arrayidxa, align 4
+  %y = load i32, i32* %arrayidxy, align 4
+  %v_i32 = call i32 @llvm.sadd.sat.i32(i32 %a, i32 %y)
+  %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx
+  %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx
+  %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2
+  %z = load <2 x i8>, <2 x i8>* %arrayidxz, align 2
+  %v_v2i8 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> %b, <2 x i8> %z)
+  store i32 %v_i32, i32* %arrayidxy, align 4
+  store <2 x i8> %v_v2i8, <2 x i8>* %arrayidxz, align 2
+  ret void
+}
+
+define spir_kernel void @uadd_sat(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
+entry:
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+  %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
+  %a = load i32, i32* %arrayidxa, align 4
+  %y = load i32, i32* %arrayidxy, align 4
+  %v_i32 = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %y)
+  %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx
+  %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx
+  %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2
+  %z = load <2 x i8>, <2 x i8>* %arrayidxz, align 2
+  %v_v2i8 = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> %b, <2 x i8> %z)
+  store i32 %v_i32, i32* %arrayidxy, align 4
+  store <2 x i8> %v_v2i8, <2 x i8>* %arrayidxz, align 2
+  ret void
+}
+
+define spir_kernel void @ssub_sat(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
+entry:
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+  %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
+  %a = load i32, i32* %arrayidxa, align 4
+  %y = load i32, i32* %arrayidxy, align 4
+  %v_i32 = call i32 @llvm.ssub.sat.i32(i32 %a, i32 %y)
+  %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx
+  %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx
+  %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2
+  %z = load <2 x i8>, <2 x i8>* %arrayidxz, align 2
+  %v_v2i8 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> %b, <2 x i8> %z)
+  store i32 %v_i32, i32* %arrayidxy, align 4
+  store <2 x i8> %v_v2i8, <2 x i8>* %arrayidxz, align 2
+  ret void
+}
+
+define spir_kernel void @usub_sat(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
+entry:
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+  %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
+  %a = load i32, i32* %arrayidxa, align 4
+  %y = load i32, i32* %arrayidxy, align 4
+  %v_i32 = call i32 @llvm.usub.sat.i32(i32 %a, i32 %y)
+  %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx
+  %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx
+  %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2
+  %z = load <2 x i8>, <2 x i8>* %arrayidxz, align 2
+  %v_v2i8 = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> %b, <2 x i8> %z)
+  store i32 %v_i32, i32* %arrayidxy, align 4
+  store <2 x i8> %v_v2i8, <2 x i8>* %arrayidxz, align 2
+  ret void
+}
+
+declare i32 @llvm.ctpop.i32(i32)
+declare <2 x i8> @llvm.ctpop.v2i8(<2 x i8>)
+
+declare i32 @llvm.ctlz.i32(i32, i1)
+declare <2 x i8> @llvm.ctlz.v2i8(<2 x i8>, i1)
+
+declare i32 @llvm.cttz.i32(i32, i1)
+declare <2 x i8> @llvm.cttz.v2i8(<2 x i8>, i1)
+
+declare i32 @llvm.sadd.sat.i32(i32, i32)
+declare <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8>, <2 x i8>)
+
+declare i32 @llvm.uadd.sat.i32(i32, i32)
+declare <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8>, <2 x i8>)
+
+declare i32 @llvm.ssub.sat.i32(i32, i32)
+declare <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8>, <2 x i8>)
+
+declare i32 @llvm.usub.sat.i32(i32, i32)
+declare <2 x i8> @llvm.usub.sat.v2i8(<2 x i8>, <2 x i8>)
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CTPOP: void @__vecz_v2_ctpop
+; CTPOP: = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %{{.*}})
+; CTPOP: = call <2 x i8> @llvm.ctpop.v2i8(<2 x i8> %{{.*}})
+; CTPOP: = call <2 x i8> @llvm.ctpop.v2i8(<2 x i8> %{{.*}})
+
+; CTLZ: void @__vecz_v4_ctlz
+; CTLZ: = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %{{.*}}, i1 false)
+; CTLZ: = call <4 x i8> @llvm.ctlz.v4i8(<4 x i8> %{{.*}}, i1 false)
+; CTLZ: = call <4 x i8> @llvm.ctlz.v4i8(<4 x i8> %{{.*}}, i1 false)
+
+; CTTZ: void @__vecz_v8_cttz
+; CTTZ: = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %{{.*}}, i1 false)
+; CTTZ: = call <8 x i8> @llvm.cttz.v8i8(<8 x i8> %{{.*}}, i1 false)
+; CTTZ: = call <8 x i8> @llvm.cttz.v8i8(<8 x i8> %{{.*}}, i1 false)
+
+; SADD_SAT: void @__vecz_v2_sadd_sat
+; SADD_SAT: = call <2 x i32> @llvm.sadd.sat.v2i32(
+; SADD_SAT: = call <2 x i8> @llvm.sadd.sat.v2i8(
+; SADD_SAT: = call <2 x i8> @llvm.sadd.sat.v2i8(
+
+; UADD_SAT: void @__vecz_v2_uadd_sat
+; UADD_SAT: = call <2 x i32> @llvm.uadd.sat.v2i32(
+; UADD_SAT: = call <2 x i8> @llvm.uadd.sat.v2i8(
+; UADD_SAT: = call <2 x i8> @llvm.uadd.sat.v2i8(
+
+; SSUB_SAT: void @__vecz_v2_ssub_sat
+; SSUB_SAT: = call <2 x i32> @llvm.ssub.sat.v2i32(
+; SSUB_SAT: = call <2 x i8> @llvm.ssub.sat.v2i8(
+; SSUB_SAT: = call <2 x i8> @llvm.ssub.sat.v2i8(
+
+; USUB_SAT: void @__vecz_v2_usub_sat
+; USUB_SAT: = call <2 x i32> @llvm.usub.sat.v2i32(
+; USUB_SAT: = call <2 x i8> @llvm.usub.sat.v2i8(
+; USUB_SAT: = call <2 x i8> @llvm.usub.sat.v2i8(
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics.ll
new file mode 100644
index 0000000000000..3970e9902af39
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics.ll
@@ -0,0 +1,200 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k ctpop -vecz-simd-width=2 -S < %s | %filecheck %s --check-prefix CTPOP
+; RUN: %veczc -k ctlz -vecz-simd-width=4 -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s --check-prefix CTLZ
+; RUN: %veczc -k cttz -vecz-simd-width=8 -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s --check-prefix CTTZ
+; RUN: %veczc -k sadd_sat -vecz-simd-width=2 -S < %s | %filecheck %s --check-prefix SADD_SAT
+; RUN: %veczc -k uadd_sat -vecz-simd-width=2 -S < %s | %filecheck %s --check-prefix UADD_SAT
+; RUN: %veczc -k ssub_sat -vecz-simd-width=2 -S < %s | %filecheck %s --check-prefix SSUB_SAT
+; RUN: %veczc -k usub_sat -vecz-simd-width=2 -S < %s | %filecheck %s --check-prefix USUB_SAT
+
+; This test checks that the scalar intrinsics get vectorized,
+; and the vector intrinsics get widened.
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @ctpop(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
+entry:  ; per work-item: yptr[idx] = ctpop(aptr[idx]) and zptr[idx] = ctpop(bptr[idx])
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+  %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx
+  %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
+  %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx
+  %a = load i32, i32* %arrayidxa, align 4
+  %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2
+  %ctpopi32 = call i32 @llvm.ctpop.i32(i32 %a)  ; scalar form: the checks expect <2 x i32> at SIMD width 2
+  %ctpopv2i8 = call <2 x i8> @llvm.ctpop.v2i8(<2 x i8> %b)  ; vector form: the checks expect it widened to <4 x i8>
+  store i32 %ctpopi32, i32* %arrayidxy, align 4
+  store <2 x i8> %ctpopv2i8, <2 x i8>* %arrayidxz, align 2
+  ret void
+}
+
+define spir_kernel void @ctlz(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
+entry:  ; per work-item: yptr[idx] = ctlz(aptr[idx]) and zptr[idx] = ctlz(bptr[idx])
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+  %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx
+  %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
+  %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx
+  %a = load i32, i32* %arrayidxa, align 4
+  %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2
+  %ctlzi32 = call i32 @llvm.ctlz.i32(i32 %a, i1 false)  ; scalar form: the checks expect <4 x i32> at SIMD width 4, flag kept as i1 false
+  %ctlzv2i8 = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> %b, i1 false)  ; vector form: the checks expect it widened to <8 x i8>
+  store i32 %ctlzi32, i32* %arrayidxy, align 4
+  store <2 x i8> %ctlzv2i8, <2 x i8>* %arrayidxz, align 2
+  ret void
+}
+
+define spir_kernel void @cttz(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
+entry:  ; per work-item: yptr[idx] = cttz(aptr[idx]) and zptr[idx] = cttz(bptr[idx])
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+  %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx
+  %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
+  %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx
+  %a = load i32, i32* %arrayidxa, align 4
+  %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2
+  %cttzi32 = call i32 @llvm.cttz.i32(i32 %a, i1 false)  ; scalar form: the checks expect <8 x i32> at SIMD width 8, flag kept as i1 false
+  %cttzv2i8 = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> %b, i1 false)  ; vector form: the checks expect it widened to <16 x i8>
+  store i32 %cttzi32, i32* %arrayidxy, align 4
+  store <2 x i8> %cttzv2i8, <2 x i8>* %arrayidxz, align 2
+  ret void
+}
+
+define spir_kernel void @sadd_sat(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
+entry:  ; per work-item, in place: yptr[idx] = sadd.sat(aptr[idx], yptr[idx]); zptr[idx] = sadd.sat(bptr[idx], zptr[idx])
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+  %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
+  %a = load i32, i32* %arrayidxa, align 4
+  %y = load i32, i32* %arrayidxy, align 4
+  %v_i32 = call i32 @llvm.sadd.sat.i32(i32 %a, i32 %y)  ; scalar form: the checks expect <2 x i32> at SIMD width 2
+  %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx
+  %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx
+  %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2
+  %z = load <2 x i8>, <2 x i8>* %arrayidxz, align 2
+  %v_v2i8 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> %b, <2 x i8> %z)  ; vector form: the checks expect it widened to <4 x i8>
+  store i32 %v_i32, i32* %arrayidxy, align 4
+  store <2 x i8> %v_v2i8, <2 x i8>* %arrayidxz, align 2
+  ret void
+}
+
+define spir_kernel void @uadd_sat(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
+entry:  ; per work-item, in place: yptr[idx] = uadd.sat(aptr[idx], yptr[idx]); zptr[idx] = uadd.sat(bptr[idx], zptr[idx])
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+  %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
+  %a = load i32, i32* %arrayidxa, align 4
+  %y = load i32, i32* %arrayidxy, align 4
+  %v_i32 = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %y)  ; scalar form: the checks expect <2 x i32> at SIMD width 2
+  %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx
+  %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx
+  %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2
+  %z = load <2 x i8>, <2 x i8>* %arrayidxz, align 2
+  %v_v2i8 = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> %b, <2 x i8> %z)  ; vector form: the checks expect it widened to <4 x i8>
+  store i32 %v_i32, i32* %arrayidxy, align 4
+  store <2 x i8> %v_v2i8, <2 x i8>* %arrayidxz, align 2
+  ret void
+}
+
+define spir_kernel void @ssub_sat(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
+entry:  ; per work-item, in place: yptr[idx] = ssub.sat(aptr[idx], yptr[idx]); zptr[idx] = ssub.sat(bptr[idx], zptr[idx])
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+  %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
+  %a = load i32, i32* %arrayidxa, align 4
+  %y = load i32, i32* %arrayidxy, align 4
+  %v_i32 = call i32 @llvm.ssub.sat.i32(i32 %a, i32 %y)  ; scalar form: the checks expect <2 x i32> at SIMD width 2
+  %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx
+  %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx
+  %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2
+  %z = load <2 x i8>, <2 x i8>* %arrayidxz, align 2
+  %v_v2i8 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> %b, <2 x i8> %z)  ; vector form: the checks expect it widened to <4 x i8>
+  store i32 %v_i32, i32* %arrayidxy, align 4
+  store <2 x i8> %v_v2i8, <2 x i8>* %arrayidxz, align 2
+  ret void
+}
+
+define spir_kernel void @usub_sat(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
+entry:  ; per work-item, in place: yptr[idx] = usub.sat(aptr[idx], yptr[idx]); zptr[idx] = usub.sat(bptr[idx], zptr[idx])
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+  %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
+  %a = load i32, i32* %arrayidxa, align 4
+  %y = load i32, i32* %arrayidxy, align 4
+  %v_i32 = call i32 @llvm.usub.sat.i32(i32 %a, i32 %y)  ; scalar form: the checks expect <2 x i32> at SIMD width 2
+  %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx
+  %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx
+  %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2
+  %z = load <2 x i8>, <2 x i8>* %arrayidxz, align 2
+  %v_v2i8 = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> %b, <2 x i8> %z)  ; vector form: the checks expect it widened to <4 x i8>
+  store i32 %v_i32, i32* %arrayidxy, align 4
+  store <2 x i8> %v_v2i8, <2 x i8>* %arrayidxz, align 2
+  ret void
+}
+
+declare i32 @llvm.ctpop.i32(i32)
+declare <2 x i8> @llvm.ctpop.v2i8(<2 x i8>)
+
+declare i32 @llvm.ctlz.i32(i32, i1)
+declare <2 x i8> @llvm.ctlz.v2i8(<2 x i8>, i1)
+
+declare i32 @llvm.cttz.i32(i32, i1)
+declare <2 x i8> @llvm.cttz.v2i8(<2 x i8>, i1)
+
+declare i32 @llvm.sadd.sat.i32(i32, i32)
+declare <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8>, <2 x i8>)
+
+declare i32 @llvm.uadd.sat.i32(i32, i32)
+declare <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8>, <2 x i8>)
+
+declare i32 @llvm.ssub.sat.i32(i32, i32)
+declare <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8>, <2 x i8>)
+
+declare i32 @llvm.usub.sat.i32(i32, i32)
+declare <2 x i8> @llvm.usub.sat.v2i8(<2 x i8>, <2 x i8>)
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CTPOP: void @__vecz_v2_ctpop
+; CTPOP: = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %{{.*}})
+; CTPOP: = call <4 x i8> @llvm.ctpop.v4i8(<4 x i8> %{{.*}})
+
+; CTLZ: void @__vecz_v4_ctlz
+; CTLZ: = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %{{.*}}, i1 false)
+; CTLZ: = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %{{.*}}, i1 false)
+
+; CTTZ: void @__vecz_v8_cttz
+; CTTZ: = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %{{.*}}, i1 false)
+; CTTZ: = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %{{.*}}, i1 false)
+
+; SADD_SAT: void @__vecz_v2_sadd_sat
+; SADD_SAT: = call <2 x i32> @llvm.sadd.sat.v2i32(
+; SADD_SAT: = call <4 x i8> @llvm.sadd.sat.v4i8(
+
+; UADD_SAT: void @__vecz_v2_uadd_sat
+; UADD_SAT: = call <2 x i32> @llvm.uadd.sat.v2i32(
+; UADD_SAT: = call <4 x i8> @llvm.uadd.sat.v4i8(
+
+; SSUB_SAT: void @__vecz_v2_ssub_sat
+; SSUB_SAT: = call <2 x i32> @llvm.ssub.sat.v2i32(
+; SSUB_SAT: = call <4 x i8> @llvm.ssub.sat.v4i8(
+
+; USUB_SAT: void @__vecz_v2_usub_sat
+; USUB_SAT: = call <2 x i32> @llvm.usub.sat.v2i32(
+; USUB_SAT: = call <4 x i8> @llvm.usub.sat.v4i8(
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/irreducible_loop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/irreducible_loop.ll
new file mode 100644
index 0000000000000..9c1d42295ddaf
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/irreducible_loop.ll
@@ -0,0 +1,68 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %pp-llvm-ver -o %t < %s --llvm-ver %LLVMVER
+; RUN: %veczc -k irreducible_loop -S < %s | %filecheck %t
+
+; ModuleID = 'Unknown buffer'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @irreducible_loop(i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #0 {
+entry:  ; kernel whose do.body/label cycle can be entered at either block (two entry points); %src is unused
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %call
+  %ld = load i32, i32 addrspace(1)* %arrayidx4, align 4
+  %cmp = icmp sgt i32 %ld, -1
+  br i1 %cmp, label %label, label %do.body  ; non-negative dst[id] jumps into the middle of the cycle
+
+do.body:                                          ; preds = %entry, %label
+  %id.0 = phi i64 [ %conv10, %label ], [ %call, %entry ]
+  br label %label
+
+label:                                            ; preds = %entry, %do.body
+  %id.1 = phi i64 [ %id.0, %do.body ], [ %call, %entry ]
+  %conv10 = add i64 %id.1, 1
+  %cmp11 = icmp slt i64 %conv10, 16
+  br i1 %cmp11, label %do.body, label %do.end  ; keep cycling while the incremented id is below 16
+
+do.end:                                           ; preds = %label
+  ret void
+}
+
+; Function Attrs: convergent nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CHECK: define spir_kernel void @__vecz_v4_irreducible_loop
+; CHECK: entry:
+; CHECK:   br label %irr.guard.outer
+
+; CHECK: irr.guard.outer:                                  ; preds = %irr.guard.pure_exit, %entry
+; CHECK:   br label %irr.guard
+
+; LLVM 16 re-orders the Basic Blocks, without any change to the CFG.
+; CHECK-LE15: irr.guard.pure_exit:                              ; preds = %irr.guard
+; CHECK-LE15:   br i1 %{{.+}}, label %do.end, label %irr.guard.outer
+
+; CHECK: do.end:                                           ; preds = %irr.guard.pure_exit
+; CHECK:   ret void
+
+; CHECK: irr.guard:                                        ; preds = %irr.guard, %irr.guard.outer
+; CHECK:   br i1 %{{.+}}, label %irr.guard.pure_exit, label %irr.guard
+
+; CHECK-GT15: irr.guard.pure_exit:                              ; preds = %irr.guard
+; CHECK-GT15:   br i1 %{{.+}}, label %do.end, label %irr.guard.outer
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/loop_call_instantiation.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/loop_call_instantiation.ll
new file mode 100644
index 0000000000000..b4c4943b6f101
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/loop_call_instantiation.ll
@@ -0,0 +1,63 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test -vecz-choices=InstantiateCallsInLoops -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+@.str = private unnamed_addr addrspace(2) constant [23 x i8] c"Hello from %d with %d\0A\00", align 1
+@.str.1 = private unnamed_addr addrspace(2) constant [14 x i8] c"Hello from %d\00", align 1
+
+define spir_kernel void @test(i32 addrspace(1)* %in) {
+entry:  ; two variadic printf calls; the checks expect each one instantiated in its own 4-iteration loop
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %call1 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([23 x i8], [23 x i8] addrspace(2)* @.str, i64 0, i64 0), i64 %call, i32 %0)
+  %call2 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([14 x i8], [14 x i8] addrspace(2)* @.str.1, i64 0, i64 0), i64 %call)
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare extern_weak spir_func i32 @printf(i8 addrspace(2)*, ...)
+
+; CHECK: define spir_kernel void @__vecz_v4_test(ptr addrspace(1) %in)
+
+; CHECK: [[LOOPHEADER1:instloop.header.*]]:
+; CHECK: %[[INSTANCE1:instance.*]] = phi i32 [ 0, {{.+}} ], [ %[[V7:[0-9]+]], %[[LOOPBODY1:instloop.body.*]] ]
+; CHECK: %[[V3:[0-9]+]] = icmp ult i32 %[[INSTANCE1]], 4
+; CHECK: br i1 %[[V3]], label %[[LOOPBODY1]], label {{.+}}
+
+; CHECK: [[LOOPBODY1]]:
+; CHECK: %[[V4:[0-9]+]] = extractelement <4 x i64> %0, i32 %[[INSTANCE1]]
+; CHECK: %[[V5:[0-9]+]] = extractelement <4 x i32> %{{.+}}, i32 %[[INSTANCE1]]
+; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @{{.+}}, i64 %[[V4]], i32 %[[V5]])
+; CHECK: %[[V7]] = add i32 %[[INSTANCE1]], 1
+; CHECK: br label %[[LOOPHEADER1]]
+
+; CHECK: [[LOOPHEADER2:instloop.header.*]]:
+; CHECK: %[[INSTANCE3:.+]] = phi i32 [ %[[V11:[0-9]+]], %[[LOOPBODY2:instloop.body.*]] ], [ 0, {{.+}} ]
+; CHECK: %[[V8:[0-9]+]] = icmp ult i32 %[[INSTANCE3]], 4
+; CHECK: br i1 %[[V8]], label %[[LOOPBODY2]], label {{.+}}
+
+; CHECK: [[LOOPBODY2]]:
+; CHECK: %[[V9:[0-9]+]] = extractelement <4 x i64> %0, i32 %[[INSTANCE3]]
+; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @{{.+}}, i64 %[[V9]])
+; CHECK: %[[V11]] = add i32 %[[INSTANCE3]], 1
+; CHECK: br label %[[LOOPHEADER2]]
+
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_calls_max_builtin.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_calls_max_builtin.ll
new file mode 100644
index 0000000000000..bacafd7367582
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_calls_max_builtin.ll
@@ -0,0 +1,81 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Check if the call to max in the if block has been replaced with its vector
+; equivalent
+; CHECK: call spir_func <[[WIDTH:[0-9]+]] x i32> @_Z3maxDv[[WIDTH]]_iS_(<[[WIDTH]] x i32> {{.+}}, <[[WIDTH]] x i32> {{.+}})
+; CHECK: call spir_func <[[WIDTH]] x i32> @_Z3maxDv[[WIDTH]]_iS_(<[[WIDTH]] x i32> {{.+}}, <[[WIDTH]] x i32> {{.+}})
+
+; There shouldn't be any masked versions of max
+; CHECK-NOT: masked_Z3max
+
+define spir_kernel void @entry(ptr addrspace(1) %input, ptr addrspace(1) %output) {
+entry:  ; calls max once unconditionally here and once more under a work-item-dependent branch (if.then)
+  %call = tail call spir_func i64 @_Z12get_local_idj(i32 0)
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %input, i64 %call
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %output, i64 %call
+  %1 = load i32, ptr addrspace(1) %arrayidx2, align 4
+  %add = add nsw i32 %0, 1
+  %add3 = add nsw i32 %1, 1
+  %call4 = tail call spir_func i32 @_Z3maxii(i32 %add, i32 %add3)  ; unconditional max call
+  %add.i = shl nsw i32 %call4, 1
+  %idxprom.i = sext i32 %add.i to i64
+  %arrayidx.i = getelementptr inbounds i32, ptr addrspace(1) %output, i64 %idxprom.i
+  store i32 %add.i, ptr addrspace(1) %arrayidx.i, align 4
+  %2 = load i32, ptr addrspace(1) %arrayidx2, align 4
+  %3 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %4 = icmp eq i32 %2, -2147483648  ; dividend is INT_MIN
+  %5 = icmp eq i32 %3, -1
+  %6 = and i1 %4, %5
+  %7 = icmp eq i32 %3, 0
+  %8 = or i1 %7, %6
+  %9 = select i1 %8, i32 1, i32 %3  ; replace the divisor with 1 when it is 0, or when it is -1 and the dividend is INT_MIN
+  %10 = icmp eq i32 %9, -1
+  %11 = and i1 %4, %10
+  %12 = select i1 %11, i32 1, i32 %9
+  %rem = srem i32 %2, %12
+  %tobool.not = icmp eq i32 %rem, 0
+  br i1 %tobool.not, label %if.end, label %if.then
+
+if.then:
+  %call9 = tail call spir_func i32 @_Z3maxii(i32 %0, i32 %1)  ; conditional max call; the checks require it to be vectorized without a masked wrapper
+  %add.i27 = shl nsw i32 %call9, 1
+  %idxprom.i28 = sext i32 %add.i27 to i64
+  %arrayidx.i29 = getelementptr inbounds i32, ptr addrspace(1) %input, i64 %idxprom.i28
+  store i32 %add.i27, ptr addrspace(1) %arrayidx.i29, align 4
+  br label %if.end
+
+if.end:
+  %idxprom.i31.pre-phi = phi i64 [ %idxprom.i28, %if.then ], [ %idxprom.i, %entry ]
+  %add.i30.pre-phi = phi i32 [ %add.i27, %if.then ], [ %add.i, %entry ]
+  %r.0 = phi i32 [ %call9, %if.then ], [ %call4, %entry ]
+  %arrayidx.i32 = getelementptr inbounds i32, ptr addrspace(1) %output, i64 %idxprom.i31.pre-phi
+  store i32 %add.i30.pre-phi, ptr addrspace(1) %arrayidx.i32, align 4
+  store i32 %r.0, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+declare spir_func i64 @_Z12get_local_idj(i32)
+
+declare spir_func i32 @_Z3maxii(i32, i32)
+
+declare spir_func <4 x i32> @_Z3maxDv4_iS_(<4 x i32>, <4 x i32>)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved.ll
new file mode 100644
index 0000000000000..076fee77621f1
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved.ll
@@ -0,0 +1,74 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_fn -vecz-simd-width=4 -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @test_fn(i32 addrspace(1)* %results) #0 {
+entry:  ; work-items with 3 > tid store 5 to results[2 * tid + 2], i.e. a conditional store with element stride 2
+  %results.addr = alloca i32 addrspace(1)*, align 8
+  %tid = alloca i32, align 4
+  store i32 addrspace(1)* %results, i32 addrspace(1)** %results.addr, align 8
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %conv = trunc i64 %call to i32
+  store i32 %conv, i32* %tid, align 4
+  %0 = load i32, i32* %tid, align 4
+  %cmp = icmp sgt i32 3, %0
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  %1 = load i32, i32* %tid, align 4
+  %mul = mul nsw i32 2, %1
+  %add = add nsw i32 %mul, 2
+  %idxprom = sext i32 %add to i64
+  %2 = load i32 addrspace(1)*, i32 addrspace(1)** %results.addr, align 8
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %2, i64 %idxprom
+  store i32 5, i32 addrspace(1)* %arrayidx, align 4  ; the store the checks expect to go through the masked interleaved store builtin
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nobuiltin }
+
+!opencl.kernels = !{!0}
+!llvm.ident = !{!6}
+
+!0 = !{void (i32 addrspace(1)*)* @test_fn, !1, !2, !3, !4, !5}
+!1 = !{!"kernel_arg_addr_space", i32 1}
+!2 = !{!"kernel_arg_access_qual", !"none"}
+!3 = !{!"kernel_arg_type", !"int*"}
+!4 = !{!"kernel_arg_base_type", !"int*"}
+!5 = !{!"kernel_arg_type_qual", !""}
+!6 = !{!"clang version 3.8.0 "}
+
+; The masked interleaved store builtin must splat its base address, offset the lanes by 0/2/4/6 and emit a masked scatter.
+; CHECK: define void @__vecz_b_masked_interleaved_store4_2_Dv4_ju3ptrU3AS1Dv4_b(<4 x i32>{{( %0)?}}, ptr addrspace(1){{( %1)?}}, <4 x i1>{{( %2)?}}) {
+; CHECK: entry:
+; CHECK: %BroadcastAddr.splatinsert = insertelement <4 x ptr addrspace(1)> {{poison|undef}}, ptr addrspace(1) %1, {{i32|i64}} 0
+; CHECK: %BroadcastAddr.splat = shufflevector <4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <4 x ptr addrspace(1)> {{poison|undef}}, <4 x i32> zeroinitializer
+; CHECK: %3 = getelementptr i32, <4 x ptr addrspace(1)> %BroadcastAddr.splat, <4 x i64> <i64 0, i64 2, i64 4, i64 6>
+; CHECK: call void @llvm.masked.scatter.v4i32.v4p1(<4 x i32> %0, <4 x ptr addrspace(1)> %3, i32{{( immarg)?}} 4, <4 x i1> %2) #
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_as_scatter.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_as_scatter.ll
new file mode 100644
index 0000000000000..ffb8f57bf91b9
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_as_scatter.ll
@@ -0,0 +1,75 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_fn -vecz-simd-width=4 -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @test_fn(i32 addrspace(1)* %results) #0 {
+entry:  ; work-items with 3 > tid store 5 to results[2 * tid + 2], i.e. a conditional store with element stride 2
+  %results.addr = alloca i32 addrspace(1)*, align 8
+  %tid = alloca i32, align 4
+  store i32 addrspace(1)* %results, i32 addrspace(1)** %results.addr, align 8
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %conv = trunc i64 %call to i32
+  store i32 %conv, i32* %tid, align 4
+  %0 = load i32, i32* %tid, align 4
+  %cmp = icmp sgt i32 3, %0
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  %1 = load i32, i32* %tid, align 4
+  %mul = mul nsw i32 2, %1
+  %add = add nsw i32 %mul, 2
+  %idxprom = sext i32 %add to i64
+  %2 = load i32 addrspace(1)*, i32 addrspace(1)** %results.addr, align 8
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %2, i64 %idxprom
+  store i32 5, i32 addrspace(1)* %arrayidx, align 4  ; the store the checks expect to go through the masked interleaved store builtin
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nobuiltin }
+
+!opencl.kernels = !{!0}
+!llvm.ident = !{!6}
+
+!0 = !{void (i32 addrspace(1)*)* @test_fn, !1, !2, !3, !4, !5}
+!1 = !{!"kernel_arg_addr_space", i32 1}
+!2 = !{!"kernel_arg_access_qual", !"none"}
+!3 = !{!"kernel_arg_type", !"int*"}
+!4 = !{!"kernel_arg_base_type", !"int*"}
+!5 = !{!"kernel_arg_type_qual", !""}
+!6 = !{!"clang version 3.8.0 "}
+
+
+; CHECK: define void @__vecz_b_masked_interleaved_store4_2_Dv4_ju3ptrU3AS1Dv4_b(<4 x i32>{{( %0)?}}, ptr addrspace(1){{( %1)?}}, <4 x i1>{{( %2)?}}) {
+
+; Check for the address splat
+; CHECK: %[[BROADCASTADDRSPLATINSERT:.+]] = insertelement <4 x ptr addrspace(1)> {{poison|undef}}, ptr addrspace(1) %{{.+}}, {{i32|i64}} 0
+; CHECK: %[[BROADCASTADDRSPLAT:.+]] = shufflevector <4 x ptr addrspace(1)> %[[BROADCASTADDRSPLATINSERT]], <4 x ptr addrspace(1)> {{poison|undef}}, <4 x i32> zeroinitializer
+; CHECK: getelementptr i32, <4 x ptr addrspace(1)> %[[BROADCASTADDRSPLAT]], <4 x i64> <i64 0, i64 2, i64 4, i64 6>
+
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group.ll
new file mode 100644
index 0000000000000..3c47caa892836
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group.ll
@@ -0,0 +1,99 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k mask -vecz-simd-width=16 -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+source_filename = "kernel.opencl"
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @mask(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+entry:  ; reads in[2*id] and in[2*id+1], then stores one of them to the opposite slot of out depending on the sign of their sum
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call.tr = trunc i64 %call to i32
+  %conv = shl i32 %call.tr, 1
+  %idx.ext = sext i32 %conv to i64
+  %add.ptr = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 %idx.ext
+  %0 = load i8, i8 addrspace(1)* %add.ptr, align 1
+  %arrayidx1 = getelementptr inbounds i8, i8 addrspace(1)* %add.ptr, i64 1
+  %1 = load i8, i8 addrspace(1)* %arrayidx1, align 1
+  %add.ptr3 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 %idx.ext
+  %conv4 = sext i8 %0 to i32
+  %conv5 = sext i8 %1 to i32
+  %add = add nsw i32 %conv5, %conv4
+  %cmp = icmp slt i32 %add, 0
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  %arrayidx7 = getelementptr inbounds i8, i8 addrspace(1)* %add.ptr3, i64 1
+  store i8 %0, i8 addrspace(1)* %arrayidx7, align 1  ; negative sum: out[2*id+1] = in[2*id]
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  store i8 %1, i8 addrspace(1)* %add.ptr3, align 1  ; otherwise: out[2*id] = in[2*id+1]
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  ret void
+}
+
+; Function Attrs: convergent nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { convergent nobuiltin nounwind readonly }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!llvm.ident = !{!2}
+!opencl.kernels = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{!"clang version 8.0.0 (https://github.com/llvm-mirror/clang.git bfbe338a893dde6ba65b2bed6ffea1652a592819) (https://github.com/llvm-mirror/llvm.git a99d6d2122ca2f208e1c4bcaf02ff5930f244f34)"}
+!3 = !{void (i8 addrspace(1)*, i8 addrspace(1)*)* @mask, !4, !5, !6, !7, !8, !9}
+!4 = !{!"kernel_arg_addr_space", i32 1, i32 1}
+!5 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!6 = !{!"kernel_arg_type", !"char*", !"char*"}
+!7 = !{!"kernel_arg_base_type", !"char*", !"char*"}
+!8 = !{!"kernel_arg_type_qual", !"", !""}
+!9 = !{!"kernel_arg_name", !"out", !"in"}
+
+; This test makes sure we combine a group of masked interleaved stores
+; into a single masked interleaved store using interleave operations.
+; We expect the interleaved stores to come out unaltered.
+
+; CHECK: entry:
+
+; The data to store gets interleaved:
+; CHECK: %interleave{{.*}} = shufflevector <16 x i8>
+; CHECK: %interleave{{.*}} = shufflevector <16 x i8>
+
+; The masks get interleaved:
+; CHECK: %interleave{{.*}} = shufflevector <16 x i1>
+; CHECK: %interleave{{.*}} = shufflevector <16 x i1>
+
+; The stores are masked stores:
+; CHECK: call void @llvm.masked.store.v16i8.p1(<16 x i8>
+; CHECK: call void @llvm.masked.store.v16i8.p1(<16 x i8>
+
+; Definitely no unmasked stores between the masked stores and the return:
+; CHECK-NOT: store <16 x i8>
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group2.ll
new file mode 100644
index 0000000000000..c2837cfa07eb6
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group2.ll
@@ -0,0 +1,118 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k mask -vecz-simd-width=16 -S -vecz-choices=TargetIndependentPacketization < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+source_filename = "kernel.opencl"
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @mask(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(1)* %doit) #0 {
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call.tr = trunc i64 %call to i32
+  %conv = shl i32 %call.tr, 1
+  %idx.ext = sext i32 %conv to i64
+  %doit.ptr = getelementptr inbounds i8, i8 addrspace(1)* %doit, i64 %idx.ext
+  %ldbool = load i8, i8 addrspace(1)* %doit.ptr, align 1
+  %skip = icmp slt i8 %ldbool, 0
+  br i1 %skip, label %if.end, label %yes
+
+yes:                                              ; preds = %entry
+  %add.ptr = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 %idx.ext
+  %0 = load i8, i8 addrspace(1)* %add.ptr, align 1
+  %arrayidx1 = getelementptr inbounds i8, i8 addrspace(1)* %add.ptr, i64 1
+  %1 = load i8, i8 addrspace(1)* %arrayidx1, align 1
+  %add.ptr3 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 %idx.ext
+  %conv4 = sext i8 %0 to i32
+  %conv5 = sext i8 %1 to i32
+  %add = add nsw i32 %conv5, %conv4
+  %cmp = icmp slt i32 %add, 0
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %yes
+  %arrayidx7 = getelementptr inbounds i8, i8 addrspace(1)* %add.ptr3, i64 1
+  store i8 %0, i8 addrspace(1)* %arrayidx7, align 1
+  br label %if.end
+
+if.else:                                          ; preds = %yes
+  store i8 %1, i8 addrspace(1)* %add.ptr3, align 1
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then, %entry
+  ret void
+}
+
+; Function Attrs: convergent nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { convergent nobuiltin nounwind readonly }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!llvm.ident = !{!2}
+!opencl.kernels = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{!"clang version 8.0.0 (https://github.com/llvm-mirror/clang.git bfbe338a893dde6ba65b2bed6ffea1652a592819) (https://github.com/llvm-mirror/llvm.git a99d6d2122ca2f208e1c4bcaf02ff5930f244f34)"}
+!3 = !{void (i8 addrspace(1)*, i8 addrspace(1)*, i8 addrspace(1)*)* @mask, !4, !5, !6, !7, !8, !9}
+!4 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1}
+!5 = !{!"kernel_arg_access_qual", !"none", !"none", !"none"}
+!6 = !{!"kernel_arg_type", !"char*", !"char*", !"char*"}
+!7 = !{!"kernel_arg_base_type", !"char*", !"char*", !"char*"}
+!8 = !{!"kernel_arg_type_qual", !"", !"", !""}
+!9 = !{!"kernel_arg_name", !"out", !"in", !"doit"}
+
+; This test makes sure we combine a group of masked interleaved stores
+; into a single masked interleaved store using interleave operations.
+; We expect the interleaved stores to come out unaltered.
+
+; CHECK: entry:
+; CHECK: yes:
+
+; The masks get interleaved:
+; CHECK: %interleave{{.*}} = shufflevector <16 x i1>
+; CHECK: %interleave{{.*}} = shufflevector <16 x i1>
+
+; The loads are masked loads:
+; CHECK: call <16 x i8> @llvm.masked.load.v16i8.p1(ptr
+; CHECK: call <16 x i8> @llvm.masked.load.v16i8.p1(ptr
+
+; The loaded data gets deinterleaved:
+; CHECK: %deinterleave{{.*}} = shufflevector <16 x i8>
+; CHECK: %deinterleave{{.*}} = shufflevector <16 x i8>
+
+; The data to store gets interleaved:
+; CHECK: %interleave{{.*}} = shufflevector <16 x i8>
+; CHECK: %interleave{{.*}} = shufflevector <16 x i8>
+
+; The masks get interleaved:
+; CHECK: %interleave{{.*}} = shufflevector <16 x i1>
+; CHECK: %interleave{{.*}} = shufflevector <16 x i1>
+
+; The stores are masked stores:
+; CHECK: call void @llvm.masked.store.v16i8.p1(<16 x i8>
+; CHECK: call void @llvm.masked.store.v16i8.p1(<16 x i8>
+
+; Definitely no unmasked stores:
+; CHECK-NOT: store <16 x i8>
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masking_exit_blocks.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masking_exit_blocks.ll
new file mode 100644
index 0000000000000..7f6674c8b0a51
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masking_exit_blocks.ll
@@ -0,0 +1,70 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test -vecz-simd-width=4 -vecz-passes=cfg-convert,packetizer -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+@.str = private unnamed_addr addrspace(2) constant [18 x i8] c"Doing stuff, yay!\00", align 1
+
+define spir_kernel void @test(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %add = add i64 %call, 1
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %add
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  br label %entry.1
+
+entry.1:                                          ; preds = %entry
+  %add1 = add i64 %call, 1
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %add1
+  store i32 %0, i32 addrspace(1)* %arrayidx2, align 4
+  %cmp = icmp eq i64 %call, 0
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry.1
+  %call3 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([18 x i8], [18 x i8] addrspace(2)* @.str, i64 0, i64 0))
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry.1
+  %arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
+  %1 = load i32, i32 addrspace(1)* %arrayidx4, align 4
+  %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  br label %if.end1
+
+if.end1:                                          ; preds = %if.end
+  store i32 %1, i32 addrspace(1)* %arrayidx5, align 4
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+declare extern_weak spir_func i32 @printf(i8 addrspace(2)*, ...)
+
+; CHECK: define spir_kernel void @__vecz_v4_test
+
+; Check if the divergent block is masked correctly
+; CHECK: @__vecz_b_masked_printf_u3ptrU3AS2b
+; CHECK: @__vecz_b_masked_printf_u3ptrU3AS2b
+; CHECK: @__vecz_b_masked_printf_u3ptrU3AS2b
+; CHECK: @__vecz_b_masked_printf_u3ptrU3AS2b
+
+; Check if the exit block is not masked
+; CHECK: load <4 x i32>
+; CHECK: store <4 x i32>
+
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/mem2reg_alloca_pointer1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/mem2reg_alloca_pointer1.ll
new file mode 100644
index 0000000000000..dedfdbb1ce9c5
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/mem2reg_alloca_pointer1.ll
@@ -0,0 +1,72 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k entry -vecz-passes="function(mem2reg),vecz-mem2reg" -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+%struct.S2 = type { i16, [7 x i32], i32, <16 x i8>, [4 x i32] }
+
+; Function Attrs: norecurse nounwind
+define spir_kernel void @entry(%struct.S2** %result) #0 {
+entry:
+  %c_640 = alloca %struct.S2, align 16
+  %p_639 = alloca %struct.S2*, align 8
+  store %struct.S2* %c_640, %struct.S2** %p_639, align 8
+  %0 = load %struct.S2*, %struct.S2** %p_639, align 8
+  store %struct.S2* %0, %struct.S2** %result, align 8
+  ret void
+}
+
+define spir_func void @func_10(%struct.S2* %p_484, i64** %ret) {
+entry:
+  %l_462 = alloca i64, align 8
+  %l_461 = alloca i64*, align 8
+  %.cast = ptrtoint %struct.S2* %p_484 to i64
+  store i64 %.cast, i64* %l_462, align 8
+  store i64* %l_462, i64** %l_461, align 8
+  store i64* %l_462, i64** %ret, align 8
+  ret void
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #1
+
+attributes #0 = { norecurse nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { argmemonly nounwind }
+
+!llvm.ident = !{!0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0}
+!opencl.kernels = !{!1}
+
+!0 = !{!"clang version 3.8.1 "}
+!1 = !{void (%struct.S2**)* @entry, !2, !3, !4, !5, !6}
+!2 = !{!"kernel_arg_addr_space", i32 1}
+!3 = !{!"kernel_arg_access_qual", !"none"}
+!4 = !{!"kernel_arg_type", !"ulong*"}
+!5 = !{!"kernel_arg_base_type", !"ulong*"}
+!6 = !{!"kernel_arg_type_qual", !""}
+
+; CHECK: @__vecz_v4_entry
+
+; Check if the alloca with no value (c_640) is still here
+; CHECK: %c_640 = alloca %struct.S2, align 16
+
+; Check if the alloca with value (p_639) and the store to it have been promoted
+; CHECK-NOT: %p_639 = alloca
+; CHECK-NOT: store {{.*}} %p_639
+; CHECK: ret
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/mem2reg_alloca_pointer2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/mem2reg_alloca_pointer2.ll
new file mode 100644
index 0000000000000..9ceebfdc7592f
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/mem2reg_alloca_pointer2.ll
@@ -0,0 +1,72 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k func_10 -vecz-passes="function(mem2reg),vecz-mem2reg" -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+%struct.S2 = type { i16, [7 x i32], i32, <16 x i8>, [4 x i32] }
+
+; Function Attrs: norecurse nounwind
+define spir_kernel void @entry(%struct.S2** %result) #0 {
+entry:
+  %c_640 = alloca %struct.S2, align 16
+  %p_639 = alloca %struct.S2*, align 8
+  store %struct.S2* %c_640, %struct.S2** %p_639, align 8
+  %0 = load %struct.S2*, %struct.S2** %p_639, align 8
+  store %struct.S2* %0, %struct.S2** %result, align 8
+  ret void
+}
+
+define spir_func void @func_10(%struct.S2* %p_484, i64** %ret) {
+entry:
+  %l_462 = alloca i64, align 8
+  %l_461 = alloca i64*, align 8
+  %.cast = ptrtoint %struct.S2* %p_484 to i64
+  store i64 %.cast, i64* %l_462, align 8
+  store i64* %l_462, i64** %l_461, align 8
+  store i64* %l_462, i64** %ret, align 8
+  ret void
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #1
+
+attributes #0 = { norecurse nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { argmemonly nounwind }
+
+!llvm.ident = !{!0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0}
+!opencl.kernels = !{!1}
+
+!0 = !{!"clang version 3.8.1 "}
+!1 = !{void (%struct.S2**)* @entry, !2, !3, !4, !5, !6}
+!2 = !{!"kernel_arg_addr_space", i32 1}
+!3 = !{!"kernel_arg_access_qual", !"none"}
+!4 = !{!"kernel_arg_type", !"ulong*"}
+!5 = !{!"kernel_arg_base_type", !"ulong*"}
+!6 = !{!"kernel_arg_type_qual", !""}
+
+; Check if the alloca used for its pointer is still here
+; CHECK: @__vecz_v4_func_10
+; CHECK: %l_462 = alloca i64, align 8
+
+; Check that the other alloca(s) have been promoted
+; CHECK-NOT: alloca
+
+; Check if the store using the alloca is still here
+; CHECK:  store i64 %.cast, ptr %l_462, align 8
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride.ll
new file mode 100644
index 0000000000000..02af40372812d
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride.ll
@@ -0,0 +1,34 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %0 = load i32, i32 addrspace(1)* %src, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %call
+  store i32 %0, i32 addrspace(1)* %arrayidx1, align 4
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CHECK: spir_kernel void @test
+; CHECK: store <
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride10.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride10.ll
new file mode 100644
index 0000000000000..5dd370297701e
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride10.ll
@@ -0,0 +1,38 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %conv = trunc i64 %call to i32
+  %0 = load i32, i32 addrspace(1)* %src, align 4
+  %add = add nsw i32 %conv, %n
+  %mul = mul nsw i32 %add, %conv
+  %idxprom = sext i32 %mul to i64
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %idxprom
+  store i32 %0, i32 addrspace(1)* %arrayidx1, align 4
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CHECK: spir_kernel void @test
+; CHECK: _scatter_
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride11.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride11.ll
new file mode 100644
index 0000000000000..fc21fa5f9cd8b
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride11.ll
@@ -0,0 +1,38 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %conv = trunc i64 %call to i32
+  %0 = load i32, i32 addrspace(1)* %src, align 4
+  %add = add nsw i32 %conv, %n
+  %mul = mul nsw i32 %add, 9
+  %idxprom = sext i32 %mul to i64
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %idxprom
+  store i32 %0, i32 addrspace(1)* %arrayidx1, align 4
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CHECK: spir_kernel void @test
+; CHECK: _interleaved_
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride12.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride12.ll
new file mode 100644
index 0000000000000..b34df1dc298c4
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride12.ll
@@ -0,0 +1,38 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %conv = trunc i64 %call to i32
+  %0 = load i32, i32 addrspace(1)* %src, align 4
+  %add = add nsw i32 %conv, %n
+  %mul = mul nsw i32 %add, %n
+  %idxprom = sext i32 %mul to i64
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %idxprom
+  store i32 %0, i32 addrspace(1)* %arrayidx1, align 4
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CHECK: spir_kernel void @test
+; CHECK: _interleaved_
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride13.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride13.ll
new file mode 100644
index 0000000000000..5dd370297701e
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride13.ll
@@ -0,0 +1,38 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %conv = trunc i64 %call to i32
+  %0 = load i32, i32 addrspace(1)* %src, align 4
+  %add = add nsw i32 %conv, %n
+  %mul = mul nsw i32 %add, %conv
+  %idxprom = sext i32 %mul to i64
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %idxprom
+  store i32 %0, i32 addrspace(1)* %arrayidx1, align 4
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CHECK: spir_kernel void @test
+; CHECK: _scatter_
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride14.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride14.ll
new file mode 100644
index 0000000000000..10b377545fa87
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride14.ll
@@ -0,0 +1,35 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %0 = load i32, i32 addrspace(1)* %src, align 4
+  %mul = mul nuw nsw i64 %call, 18
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %mul
+  store i32 %0, i32 addrspace(1)* %arrayidx1, align 4
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CHECK: spir_kernel void @test
+; CHECK: _interleaved_
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride15.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride15.ll
new file mode 100644
index 0000000000000..0cdf65e4cc162
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride15.ll
@@ -0,0 +1,38 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %conv = trunc i64 %call to i32
+  %0 = load i32, i32 addrspace(1)* %src, align 4
+  %add = shl i32 %n, 1
+  %mul = mul i32 %add, %conv
+  %idxprom = sext i32 %mul to i64
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %idxprom
+  store i32 %0, i32 addrspace(1)* %arrayidx1, align 4
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CHECK: spir_kernel void @test
+; CHECK: _interleaved_
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride16.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride16.ll
new file mode 100644
index 0000000000000..b8dc5a489cf72
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride16.ll
@@ -0,0 +1,36 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %0 = load i32, i32 addrspace(1)* %src, align 4
+  %add = shl nuw nsw i64 %call, 1
+  %mul = mul nuw nsw i64 %add, %call
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %mul
+  store i32 %0, i32 addrspace(1)* %arrayidx1, align 4
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CHECK: spir_kernel void @test
+; CHECK: _scatter_
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride17.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride17.ll
new file mode 100644
index 0000000000000..d6eb78cf728f1
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride17.ll
@@ -0,0 +1,40 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) {  ; dst[gid1 * n + gid0] = src[0]
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)  ; gid0: global id in dimension 0
+  %conv = trunc i64 %call to i32
+  %call1 = tail call spir_func i64 @_Z13get_global_idj(i32 1)  ; gid1: global id in dimension 1
+  %conv2 = trunc i64 %call1 to i32
+  %0 = load i32, i32 addrspace(1)* %src, align 4  ; load address does not depend on either id
+  %mul = mul nsw i32 %conv2, %n  ; gid1 * n
+  %add = add nsw i32 %mul, %conv  ; gid1 * n + gid0: coefficient of gid0 is 1
+  %idxprom = sext i32 %add to i64
+  %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %idxprom
+  store i32 %0, i32 addrspace(1)* %arrayidx3, align 4  ; this file expects `store <` (a vector store) in the output
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CHECK: spir_kernel void @test
+; CHECK: store <
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride18.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride18.ll
new file mode 100644
index 0000000000000..b56bd47fbad2d
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride18.ll
@@ -0,0 +1,37 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 addrspace(1)* readnone %r) {  ; dst[gid] = src[(gid + 255) & 255]; %r is unused
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)  ; gid: global id in dimension 0
+  %conv = add nuw nsw i64 %call, 255  ; gid + 255
+  %idxprom = and i64 %conv, 255  ; keep only the low 8 bits, so the index wraps
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %src, i64 %idxprom
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4  ; this file expects `_gather_` in the output
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %call  ; store index is gid itself
+  store i32 %0, i32 addrspace(1)* %arrayidx1, align 4
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CHECK: spir_kernel void @test
+; CHECK: _gather_
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride2.ll
new file mode 100644
index 0000000000000..6e963f28dc809
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride2.ll
@@ -0,0 +1,35 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) {  ; dst[gid + 9] = src[0]
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)  ; gid: global id in dimension 0
+  %0 = load i32, i32 addrspace(1)* %src, align 4  ; load address does not depend on gid
+  %add = add nuw nsw i64 %call, 9  ; gid + 9: constant offset, coefficient of gid is 1
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %add
+  store i32 %0, i32 addrspace(1)* %arrayidx1, align 4  ; this file expects `store <` (a vector store) in the output
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CHECK: spir_kernel void @test
+; CHECK: store <
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride3.ll
new file mode 100644
index 0000000000000..fb3be14950ec3
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride3.ll
@@ -0,0 +1,35 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) {  ; dst[gid * 9] = src[0]
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)  ; gid: global id in dimension 0
+  %0 = load i32, i32 addrspace(1)* %src, align 4  ; load address does not depend on gid
+  %mul = mul nuw nsw i64 %call, 9  ; gid * 9: constant multiplier on gid
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %mul
+  store i32 %0, i32 addrspace(1)* %arrayidx1, align 4  ; this file expects `_interleaved_` in the output
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CHECK: spir_kernel void @test
+; CHECK: _interleaved_
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride4.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride4.ll
new file mode 100644
index 0000000000000..05e5392273595
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride4.ll
@@ -0,0 +1,37 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) {  ; dst[gid + n] = src[0]
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)  ; gid: global id in dimension 0
+  %conv = trunc i64 %call to i32
+  %0 = load i32, i32 addrspace(1)* %src, align 4  ; load address does not depend on gid
+  %add = add nsw i32 %conv, %n  ; gid + n: offset is the kernel argument, coefficient of gid is 1
+  %idxprom = sext i32 %add to i64
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %idxprom
+  store i32 %0, i32 addrspace(1)* %arrayidx1, align 4  ; this file expects `store <` (a vector store) in the output
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CHECK: spir_kernel void @test
+; CHECK: store <
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride5.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride5.ll
new file mode 100644
index 0000000000000..d20739cc1d26d
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride5.ll
@@ -0,0 +1,35 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) {  ; dst[gid * 5] = src[0]
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)  ; gid: global id in dimension 0
+  %0 = load i32, i32 addrspace(1)* %src, align 4  ; load address does not depend on gid
+  %mul = mul nuw nsw i64 %call, 5  ; gid * 5: constant multiplier on gid
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %mul
+  store i32 %0, i32 addrspace(1)* %arrayidx1, align 4  ; this file expects `_interleaved_` in the output
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CHECK: spir_kernel void @test
+; CHECK: _interleaved_
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride6.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride6.ll
new file mode 100644
index 0000000000000..f757f558d89bc
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride6.ll
@@ -0,0 +1,35 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) {  ; dst[gid << 1] = src[0]
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)  ; gid: global id in dimension 0
+  %0 = load i32, i32 addrspace(1)* %src, align 4  ; load address does not depend on gid
+  %add = shl nuw nsw i64 %call, 1  ; gid << 1, i.e. 2 * gid (the value is named %add but is a shift)
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %add
+  store i32 %0, i32 addrspace(1)* %arrayidx1, align 4  ; this file expects `_interleaved_` in the output
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CHECK: spir_kernel void @test
+; CHECK: _interleaved_
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride7.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride7.ll
new file mode 100644
index 0000000000000..8dcc3272b4c2c
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride7.ll
@@ -0,0 +1,35 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) {  ; dst[gid * gid] = src[0]
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)  ; gid: global id in dimension 0
+  %0 = load i32, i32 addrspace(1)* %src, align 4  ; load address does not depend on gid
+  %mul = mul nuw nsw i64 %call, %call  ; gid * gid, i.e. quadratic in gid
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %mul
+  store i32 %0, i32 addrspace(1)* %arrayidx1, align 4  ; this file expects `_scatter_` in the output
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CHECK: spir_kernel void @test
+; CHECK: _scatter_
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride8.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride8.ll
new file mode 100644
index 0000000000000..61627754b359f
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride8.ll
@@ -0,0 +1,36 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) {  ; dst[gid * 9 + 81] = src[0]
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)  ; gid: global id in dimension 0
+  %0 = load i32, i32 addrspace(1)* %src, align 4  ; load address does not depend on gid
+  %1 = mul nuw nsw i64 %call, 9  ; gid * 9
+  %mul = add nuw nsw i64 %1, 81  ; gid * 9 + 81 (the value is named %mul but is an add)
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %mul
+  store i32 %0, i32 addrspace(1)* %arrayidx1, align 4  ; this file expects `_interleaved_` in the output
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CHECK: spir_kernel void @test
+; CHECK: _interleaved_
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride9.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride9.ll
new file mode 100644
index 0000000000000..b689ad4bad742
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride9.ll
@@ -0,0 +1,38 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) {  ; dst[(gid + 9) * n] = src[0]
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)  ; gid: global id in dimension 0
+  %conv = trunc i64 %call to i32
+  %0 = load i32, i32 addrspace(1)* %src, align 4  ; load address does not depend on gid
+  %add = add nuw nsw i32 %conv, 9  ; gid + 9
+  %mul = mul nsw i32 %add, %n  ; (gid + 9) * n: multiplier on gid is the kernel argument n
+  %idxprom = sext i32 %mul to i64
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %idxprom
+  store i32 %0, i32 addrspace(1)* %arrayidx1, align 4  ; this file expects `_interleaved_` in the output
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CHECK: spir_kernel void @test
+; CHECK: _interleaved_
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_exit_blocks.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_exit_blocks.ll
new file mode 100644
index 0000000000000..8aa9aefa122a9
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_exit_blocks.ll
@@ -0,0 +1,63 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k multiple_exit_blocks -vecz-passes="function(simplifycfg,dce),mergereturn,cfg-convert" -S < %s | %filecheck %s
+
+; ModuleID = 'Unknown buffer'
+source_filename = "Unknown buffer"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Work-item id builtins (declared here without any attribute group)
+declare spir_func i64 @_Z12get_local_idj(i32)
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+define spir_kernel void @multiple_exit_blocks(i64 %n) {  ; empty-bodied branches on gid < n and lid < n, all ending in if.end
+entry:
+  %gid = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %lid = tail call spir_func i64 @_Z12get_local_idj(i32 0)
+  %cmp1 = icmp slt i64 %lid, %n  ; lid < n (signed)
+  %cmp2 = icmp slt i64 %gid, %n  ; gid < n (signed)
+  br i1 %cmp2, label %if.then, label %if.end
+
+if.then:                                             ; preds = %entry
+  %cmp3 = and i1 %cmp1, %cmp2  ; lid < n and gid < n
+  br i1 %cmp3, label %if.then2, label %if.else2
+
+if.then2:                                             ; preds = %if.then
+  br label %if.else2
+
+if.else2:                                             ; preds = %if.then, %if.then2
+  br i1 %cmp1, label %if.then3, label %if.end
+
+if.then3:                                             ; preds = %if.else2
+  br label %if.end
+
+if.end:                                             ; preds = %entry, %if.else2, %if.then3
+  ret void  ; the only return instruction in this input function
+}
+
+; The purpose of this test is to make sure we do not have a kernel that has more
+; than one exit block after the preparation pass has run.
+
+; CHECK: define spir_kernel void @__vecz_v4_multiple_exit_blocks
+
+; We don't want to generate any ROSCC branches:
+; CHECK-NOT: entry.ROSCC:
+
+; Only one return statement:
+; CHECK: ret void
+; CHECK-NOT: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_kernels_inlining.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_kernels_inlining.ll
new file mode 100644
index 0000000000000..492d3b90deb46
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_kernels_inlining.ll
@@ -0,0 +1,51 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k foo3 -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @foo1(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {  ; out[gid] = in[gid]
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)  ; gid: global id in dimension 0
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call  ; same index as the load
+  store i32 %0, i32 addrspace(1)* %arrayidx1, align 4
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+define spir_kernel void @foo2(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {  ; kernel that only calls kernel @foo1
+entry:
+  call spir_kernel void @foo1(i32 addrspace(1)* %in, i32 addrspace(1)* %out)  ; both arguments forwarded unchanged
+  ret void
+}
+
+define spir_kernel void @foo3(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {  ; kernel selected with -k on the veczc command line
+entry:
+  call spir_kernel void @foo2(i32 addrspace(1)* %in, i32 addrspace(1)* %out)  ; reaches @foo1 through @foo2
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_foo3(ptr addrspace(1) %in, ptr addrspace(1) %out)
+; CHECK-NOT: call spir_kernel
+; CHECK: call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: load <4 x i32>, ptr addrspace(1) %{{.+}}, align 4
+; CHECK: store <4 x i32> %{{.+}}, ptr addrspace(1) %{{.+}}, align 4
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorization_flags.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorization_flags.ll
new file mode 100644
index 0000000000000..6ed03228c87ad
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorization_flags.ll
@@ -0,0 +1,44 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; Check that some basic properties of the veczc command line interface for
+; multiple vectorizations work in various configurations. The kernel outputs
+; here are not interesting, only their names.
+; REQUIRES: llvm-12+
+; RUN: %veczc -w 8 -k foo:4,8,16.2@32s -k bar:,64s -S < %s | %filecheck %s
+
+; CHECK-DAG: define spir_kernel void @foo
+; CHECK-DAG: define spir_kernel void @bar
+; CHECK-DAG: define spir_kernel void @__vecz_v4_foo
+; CHECK-DAG: define spir_kernel void @__vecz_v8_foo
+; CHECK-DAG: define spir_kernel void @__vecz_nxv16_foo
+; CHECK-DAG: define spir_kernel void @__vecz_v8_bar
+; CHECK-DAG: define spir_kernel void @__vecz_nxv64_bar
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @foo(i32 addrspace(1)* %in1, i32 addrspace(1)* %in2, i32 addrspace(1)* %out) {  ; empty kernel: arguments are unused
+entry:
+  ret void
+}
+
+define spir_kernel void @bar(i32 addrspace(1)* %in1, i32 addrspace(1)* %in2, i32 addrspace(1)* %out) {  ; empty kernel: arguments are unused
+entry:
+  ret void
+}
+
+
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations.ll
new file mode 100644
index 0000000000000..6682062cc8ce8
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations.ll
@@ -0,0 +1,133 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; Check that veczc can vectorize a kernel multiple times in one go, with a
+; correct mapping between the vectorized versions of the kernels and their
+; scalar base
+; RUN: %veczc -k add:4,8,16 -S < %s | %filecheck %s
+
+; CHECK: define spir_kernel void @add(ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %out) {{.*}} !codeplay_ca_vecz.base ![[BASE_1:[0-9]+]] !codeplay_ca_vecz.base ![[BASE_2:[0-9]+]] !codeplay_ca_vecz.base ![[BASE_3:[0-9]+]] {
+; CHECK: define spir_kernel void @__vecz_v[[DERIVED_1_VF:[0-9]+]]_add(ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %out) {{.*}} !codeplay_ca_vecz.derived ![[DERIVED_1:[0-9]+]] {
+; CHECK: define spir_kernel void @__vecz_v[[DERIVED_2_VF:[0-9]+]]_add(ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %out) {{.*}} !codeplay_ca_vecz.derived ![[DERIVED_2:[0-9]+]] {
+; CHECK: define spir_kernel void @__vecz_v[[DERIVED_3_VF:[0-9]+]]_add(ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %out) {{.*}} !codeplay_ca_vecz.derived ![[DERIVED_3:[0-9]+]] {
+
+; CHECK: ![[BASE_1]] = !{![[VFMD_1:[0-9]+]], {{.*}} @__vecz_v[[DERIVED_1_VF]]_add
+; CHECK: ![[VFMD_1]] = !{i32 [[DERIVED_1_VF]], i32 0, i32 0, i32 0}
+; CHECK: ![[BASE_2]] = !{![[VFMD_2:[0-9]+]], {{.*}} @__vecz_v[[DERIVED_2_VF]]_add
+; CHECK: ![[VFMD_2]] = !{i32 [[DERIVED_2_VF]], i32 0, i32 0, i32 0}
+; CHECK: ![[BASE_3]] = !{![[VFMD_3:[0-9]+]], {{.*}} @__vecz_v[[DERIVED_3_VF]]_add
+; CHECK: ![[VFMD_3]] = !{i32 [[DERIVED_3_VF]], i32 0, i32 0, i32 0}
+
+; CHECK: ![[DERIVED_1]] = !{![[VFMD_1]], {{.*}} @add
+; CHECK: ![[DERIVED_2]] = !{![[VFMD_2]], {{.*}} @add
+; CHECK: ![[DERIVED_3]] = !{![[VFMD_3]], {{.*}} @add
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @add(i32 addrspace(1)* %in1, i32 addrspace(1)* %in2, i32 addrspace(1)* %out) #0 !dbg !4 {  ; out[tid] = in1[tid] + in2[tid], routed through stack slots, with debug info
+entry:
+  %in1.addr = alloca i32 addrspace(1)*, align 8  ; stack slots for the three pointer arguments
+  %in2.addr = alloca i32 addrspace(1)*, align 8
+  %out.addr = alloca i32 addrspace(1)*, align 8
+  %tid = alloca i64, align 8  ; stack slots for the locals tid, a and b
+  %a = alloca i32, align 4
+  %b = alloca i32, align 4
+  store i32 addrspace(1)* %in1, i32 addrspace(1)** %in1.addr, align 8
+  call void @llvm.dbg.declare(metadata i32 addrspace(1)** %in1.addr, metadata !11, metadata !29), !dbg !30
+  store i32 addrspace(1)* %in2, i32 addrspace(1)** %in2.addr, align 8
+  call void @llvm.dbg.declare(metadata i32 addrspace(1)** %in2.addr, metadata !12, metadata !29), !dbg !30
+  store i32 addrspace(1)* %out, i32 addrspace(1)** %out.addr, align 8
+  call void @llvm.dbg.declare(metadata i32 addrspace(1)** %out.addr, metadata !13, metadata !29), !dbg !30
+  call void @llvm.dbg.declare(metadata i64* %tid, metadata !14, metadata !29), !dbg !31
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #3, !dbg !31  ; global id in dimension 0
+  store i64 %call, i64* %tid, align 8, !dbg !31  ; tid = global id
+  call void @llvm.dbg.declare(metadata i32* %a, metadata !19, metadata !29), !dbg !32
+  %0 = load i64, i64* %tid, align 8, !dbg !32
+  %1 = load i32 addrspace(1)*, i32 addrspace(1)** %in1.addr, align 8, !dbg !32
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %1, i64 %0, !dbg !32
+  %2 = load i32, i32 addrspace(1)* %arrayidx, align 4, !dbg !32
+  store i32 %2, i32* %a, align 4, !dbg !32  ; a = in1[tid]
+  call void @llvm.dbg.declare(metadata i32* %b, metadata !20, metadata !29), !dbg !33
+  %3 = load i64, i64* %tid, align 8, !dbg !33
+  %4 = load i32 addrspace(1)*, i32 addrspace(1)** %in2.addr, align 8, !dbg !33
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %4, i64 %3, !dbg !33
+  %5 = load i32, i32 addrspace(1)* %arrayidx1, align 4, !dbg !33
+  store i32 %5, i32* %b, align 4, !dbg !33  ; b = in2[tid]
+  %6 = load i32, i32* %a, align 4, !dbg !34
+  %7 = load i32, i32* %b, align 4, !dbg !34
+  %add = add nsw i32 %6, %7, !dbg !34  ; a + b
+  %8 = load i64, i64* %tid, align 8, !dbg !34
+  %9 = load i32 addrspace(1)*, i32 addrspace(1)** %out.addr, align 8, !dbg !34
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %9, i64 %8, !dbg !34
+  store i32 %add, i32 addrspace(1)* %arrayidx2, align 4, !dbg !34  ; out[tid] = a + b
+  ret void, !dbg !35
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+declare spir_func i64 @_Z13get_global_idj(i32) #2
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nobuiltin }
+
+!llvm.dbg.cu = !{!0}
+!opencl.kernels = !{!21}
+!llvm.module.flags = !{!27}
+!llvm.ident = !{!28}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.8.0 ", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2)
+!1 = !DIFile(filename: "<stdin>", directory: "/tmp")
+!2 = !{}
+!3 = !{!4}
+!4 = distinct !DISubprogram(name: "add", scope: !5, file: !5, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !10)
+!5 = !DIFile(filename: "kernel.opencl", directory: "/tmp")
+!6 = !DISubroutineType(types: !7)
+!7 = !{null, !8, !8, !8}
+!8 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !9, size: 64, align: 64)
+!9 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+!10 = !{!11, !12, !13, !14, !19, !20}
+!11 = !DILocalVariable(name: "in1", arg: 1, scope: !4, file: !5, line: 1, type: !8)
+!12 = !DILocalVariable(name: "in2", arg: 2, scope: !4, file: !5, line: 1, type: !8)
+!13 = !DILocalVariable(name: "out", arg: 3, scope: !4, file: !5, line: 1, type: !8)
+!14 = !DILocalVariable(name: "tid", scope: !4, file: !5, line: 3, type: !15)
+!15 = !DIDerivedType(tag: DW_TAG_typedef, name: "size_t", file: !16, line: 33, baseType: !17)
+!16 = !DIFile(filename: "/Aorta/OCL/modules/builtins/include/builtins/builtins.h", directory: "/tmp")
+!17 = !DIDerivedType(tag: DW_TAG_typedef, name: "ulong", file: !16, line: 31, baseType: !18)
+!18 = !DIBasicType(name: "long unsigned int", size: 64, align: 64, encoding: DW_ATE_unsigned)
+!19 = !DILocalVariable(name: "a", scope: !4, file: !5, line: 5, type: !9)
+!20 = !DILocalVariable(name: "b", scope: !4, file: !5, line: 6, type: !9)
+!21 = !{void (i32 addrspace(1)*, i32 addrspace(1)*, i32 addrspace(1)*)* @add, !22, !23, !24, !25, !26}
+!22 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1}
+!23 = !{!"kernel_arg_access_qual", !"none", !"none", !"none"}
+!24 = !{!"kernel_arg_type", !"int*", !"int*", !"int*"}
+!25 = !{!"kernel_arg_base_type", !"int*", !"int*", !"int*"}
+!26 = !{!"kernel_arg_type_qual", !"", !"", !""}
+!27 = !{i32 2, !"Debug Info Version", i32 3}
+!28 = !{!"clang version 3.8.0 "}
+!29 = !DIExpression()
+!30 = !DILocation(line: 1, scope: !4)
+!31 = !DILocation(line: 3, scope: !4)
+!32 = !DILocation(line: 5, scope: !4)
+!33 = !DILocation(line: 6, scope: !4)
+!34 = !DILocation(line: 7, scope: !4)
+!35 = !DILocation(line: 8, scope: !4)
+
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations_nested.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations_nested.ll
new file mode 100644
index 0000000000000..1124314e5fe1d
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations_nested.ll
@@ -0,0 +1,50 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; Check that veczc can vectorize a kernel then vectorize the vectorized kernel,
+; with base mappings from 1->2 and 2->3 and derived mappings back from 2->1 and
+; 3->2.
+; RUN: %veczc -k add:2 -S < %s > %t2
+; RUN: %veczc -k __vecz_v2_add:4 -S < %t2 | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @add(i32 addrspace(1)* %in1, i32 addrspace(1)* %in2, i32 addrspace(1)* %out) { ; out[tid] = in1[tid] + in2[tid]
+entry:
+  %tid = call spir_func i64 @_Z13get_global_idj(i32 0) #3 ; mangled get_global_id(0); indexes all three buffers
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid
+  %i1 = load i32, i32 addrspace(1)* %arrayidx, align 16
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %in2, i64 %tid
+  %i2 = load i32, i32 addrspace(1)* %arrayidx1, align 16
+  %add = add nsw i32 %i1, %i2
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid
+  store i32 %add, i32 addrspace(1)* %arrayidx2, align 16 ; single store of the sum, at the same index as both loads
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32) #2
+
+; CHECK: define spir_kernel void @add(ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %out){{.*}} !codeplay_ca_vecz.base ![[BASE_1:[0-9]+]]
+; CHECK: define spir_kernel void @__vecz_v2_add(ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %out){{.*}} !codeplay_ca_vecz.base ![[BASE_2:[0-9]+]] !codeplay_ca_vecz.derived ![[DERIVED_1:[0-9]+]] {
+; CHECK: define spir_kernel void @__vecz_v4___vecz_v2_add(ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %out){{.*}} !codeplay_ca_vecz.derived ![[DERIVED_2:[0-9]+]] {
+
+; CHECK: ![[BASE_1]] = !{![[VMD_1:[0-9]+]], {{.*}} @__vecz_v2_add}
+; CHECK: ![[VMD_1]] = !{i32 2, i32 0, i32 0, i32 0}
+; CHECK: ![[BASE_2]] = !{![[VMD_2:[0-9]+]], {{.*}} @__vecz_v4___vecz_v2_add}
+; CHECK: ![[VMD_2]] = !{i32 4, i32 0, i32 0, i32 0}
+; CHECK: ![[DERIVED_1]] = !{![[VMD_1]], {{.*}} @add}
+; CHECK: ![[DERIVED_2]] = !{![[VMD_2]], {{.*}} @__vecz_v2_add}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations_vp.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations_vp.ll
new file mode 100644
index 0000000000000..04bc0b4489c5c
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations_vp.ll
@@ -0,0 +1,39 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; Check that veczc can vectorize a kernel multiple times in one go, with an
+; equal width but with one enabling vector predication.
+; RUN: %veczc -k add:1s,1sp -S < %s | %filecheck %s
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CHECK: define spir_kernel void @add(
+define spir_kernel void @add(ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %out) { ; out[idx] = in1[idx] + in2[idx]
+entry:
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0) ; mangled get_global_id(0); indexes all three buffers
+  %arrayidx.in1 = getelementptr inbounds i32, ptr addrspace(1) %in1, i64 %idx
+  %arrayidx.in2 = getelementptr inbounds i32, ptr addrspace(1) %in2, i64 %idx ; must index %in2 so that both inputs are actually read
+  %in1.v = load i32, ptr addrspace(1) %arrayidx.in1, align 4
+  %in2.v = load i32, ptr addrspace(1) %arrayidx.in2, align 4
+  %add.v = add i32 %in1.v, %in2.v
+  %arrayidx.out = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %idx
+  store i32 %add.v, ptr addrspace(1) %arrayidx.out ; no explicit alignment is given on this store
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_nxv1_add
+
+; CHECK: define spir_kernel void @__vecz_nxv1_vp_add
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_instantiate_memop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_instantiate_memop.ll
new file mode 100644
index 0000000000000..ecd3e0ca49140
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_instantiate_memop.ll
@@ -0,0 +1,66 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k priv -vecz-simd-width=4 -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @priv(i32 addrspace(3)* %a) #0 { ; for (i = 0; i < gid; ++i) a[i] = gid, with gid truncated to i32
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2 ; mangled get_global_id(0)
+  %conv = trunc i64 %call to i32 ; used both as the loop bound and as the stored value
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %storemerge = phi i32 [ 0, %entry ], [ %inc, %for.body ] ; loop counter, starts at 0
+  %cmp = icmp ult i32 %storemerge, %conv
+  br i1 %cmp, label %for.body, label %for.end ; trip count depends on the global id
+
+for.body:                                         ; preds = %for.cond
+  %idxprom = zext i32 %storemerge to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %a, i64 %idxprom ; address is formed from the loop counter only
+  store i32 %conv, i32 addrspace(3)* %arrayidx, align 4 ; the only memory operation in the kernel
+  %inc = add i32 %storemerge, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nobuiltin }
+
+!opencl.kernels = !{!0}
+!llvm.ident = !{!6}
+
+!0 = !{void (i32 addrspace(3)*)* @priv, !1, !2, !3, !4, !5}
+!1 = !{!"kernel_arg_addr_space", i32 3}
+!2 = !{!"kernel_arg_access_qual", !"none"}
+!3 = !{!"kernel_arg_type", !"int*"}
+!4 = !{!"kernel_arg_base_type", !"int*"}
+!5 = !{!"kernel_arg_type_qual", !""}
+!6 = !{!"clang version 3.8.0 "}
+
+
+; Test that the store in the loop becomes a call to the masked scatter store builtin
+; CHECK: call void @__vecz_b_masked_scatter_store4_Dv4_jDv4_u3ptrU3AS3Dv4_b
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_over_scalarization.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_over_scalarization.ll
new file mode 100644
index 0000000000000..f8adf1480dc93
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_over_scalarization.ll
@@ -0,0 +1,68 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k memop_loop_dep -vecz-passes=builtin-inlining,scalarize -vecz-choices=FullScalarization -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-s128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @memop_loop_dep(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i32 %i, i32 %e) { ; loop from %i up to %e; each iteration copies one <4 x i32> from %in to %out
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; mangled get_global_id(0)
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %i.addr.0 = phi i32 [ %i, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp slt i32 %i.addr.0, %e
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %call1 = call spir_func <4 x i32> @_Z6vload4mPKU3AS1i(i64 %call, i32 addrspace(1)* %in) ; mangled vload4; only declared in this file
+  call spir_func void @_Z7vstore4Dv4_imPU3AS1i(<4 x i32> %call1, i64 %call, i32 addrspace(1)* %out) ; mangled vstore4 of the vector just loaded
+  %0 = extractelement <4 x i32> %call1, i64 0 ; lane 0 of the loaded vector also feeds the branch condition below
+  %tobool = icmp ne i32 %0, 0
+  %tobool2 = icmp eq i64 %call, 0
+  %or.cond = and i1 %tobool2, %tobool
+  br i1 %or.cond, label %while.cond, label %for.inc
+
+while.cond:                                       ; preds = %while.cond, %for.body
+  %tobool3 = icmp eq i64 %call, 0
+  br i1 %tobool3, label %for.inc, label %while.cond ; branches back to itself while the global id is non-zero
+
+for.inc:                                          ; preds = %for.body, %while.cond
+  %inc = add nsw i32 %i.addr.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+declare spir_func <4 x i32> @_Z6vload4mPKU3AS1i(i64, i32 addrspace(1)*)
+
+declare spir_func void @_Z7vstore4Dv4_imPU3AS1i(<4 x i32>, i64, i32 addrspace(1)*)
+
+; CHECK: define spir_kernel void @__vecz_v4_memop_loop_dep
+
+; Make sure Scalarization only results in four loads, NOT FIVE
+; CHECK: load i32
+; CHECK: load i32
+; CHECK: load i32
+; CHECK: load i32
+; CHECK-NOT: load i32
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_redundant_bitcasts.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_redundant_bitcasts.ll
new file mode 100644
index 0000000000000..b7c3fd0218abc
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_redundant_bitcasts.ll
@@ -0,0 +1,93 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k memop_loop_dep -vecz-passes=scalarize -vecz-choices=FullScalarization -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-s128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+define spir_kernel void @memop_loop_dep(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i32 %i, i32 %e) { ; loop copying one <4 x i32> per iteration from %in to %out through bitcast pointers
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; mangled get_global_id(0)
+  %cmp1 = icmp slt i32 %i, %e
+  br i1 %cmp1, label %for.body.lr.ph, label %for.end ; skip the loop entirely when %i >= %e
+
+for.body.lr.ph:                                   ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.inc, %for.body.lr.ph
+  %i.addr.02 = phi i32 [ %i, %for.body.lr.ph ], [ %inc, %for.inc ]
+  %0 = shl i64 %call, 2 ; global id * 4, used as an i32 element offset
+  %vload_base = getelementptr i32, i32 addrspace(1)* %in, i64 %0
+  %vload_ptr = bitcast i32 addrspace(1)* %vload_base to <4 x i32> addrspace(1)* ; scalar-element pointer cast to a vector pointer for the load
+  %vload = load <4 x i32>, <4 x i32> addrspace(1)* %vload_ptr, align 16
+  %1 = shl i64 %call, 2
+  %vstore_base = getelementptr i32, i32 addrspace(1)* %out, i64 %1
+  %vstore_ptr = bitcast i32 addrspace(1)* %vstore_base to <4 x i32> addrspace(1)* ; same cast pattern for the store
+  store <4 x i32> %vload, <4 x i32> addrspace(1)* %vstore_ptr, align 16
+  %2 = extractelement <4 x i32> %vload, i64 0 ; lane 0 of the loaded vector feeds the branch condition below
+  %tobool = icmp ne i32 %2, 0
+  %tobool2 = icmp eq i64 %call, 0
+  %or.cond = and i1 %tobool2, %tobool
+  br i1 %or.cond, label %while.cond.preheader, label %for.inc
+
+while.cond.preheader:                             ; preds = %for.body
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.cond, %while.cond.preheader
+  %tobool3 = icmp eq i64 %call, 0
+  br i1 %tobool3, label %for.inc.loopexit, label %while.cond ; branches back to itself while the global id is non-zero
+
+for.inc.loopexit:                                 ; preds = %while.cond
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.inc.loopexit, %for.body
+  %inc = add nsw i32 %i.addr.02, 1
+  %exitcond = icmp ne i32 %inc, %e
+  br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge:                       ; preds = %for.inc
+  br label %for.end
+
+for.end:                                          ; preds = %for.cond.for.end_crit_edge, %entry
+  ret void
+}
+
+; CA-1431 when we scalarize the vector load, the pointer bitcast back to the
+; scalar type is not needed, since the original pointer was the same scalar
+; type and can be used directly.
+
+; CHECK: define spir_kernel void @__vecz_v4_memop_loop_dep
+
+; Make sure Scalarization doesn't create any redundant bitcasts
+; CHECK-NOT: bitcast
+; CHECK: getelementptr i32, ptr addrspace(1) %{{.+}}, i32 0
+; CHECK-NOT: bitcast
+; CHECK: load i32
+; CHECK-NOT: bitcast
+
+; Make sure there is no duplicate GEP that gets the 0-indexed element from the vector
+; CHECK-NOT: getelementptr i32, ptr addrspace(1) %{{.+}}, i32 0
+; CHECK-NOT: bitcast
+; CHECK: load i32
+; CHECK-NOT: bitcast
+; CHECK: load i32
+; CHECK-NOT: bitcast
+; CHECK: load i32
+; CHECK-NOT: bitcast
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_vecz1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_vecz1.ll
new file mode 100644
index 0000000000000..d33a7fbaf01fe
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_vecz1.ll
@@ -0,0 +1,43 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -S < %s -vecz-auto | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @no_vecz1(i32 addrspace(1)* %out, i32 %n) { ; only the work-item with global id 0 stores max(%n, 0) to *%out
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; mangled get_global_id(0)
+  %cmp = icmp eq i64 %call, 0
+  br i1 %cmp, label %for.cond.preheader, label %if.end ; every other work-item goes straight to the return
+
+for.cond.preheader:                               ; preds = %entry
+  %cmp19 = icmp sgt i32 %n, 0
+  %spec.select = select i1 %cmp19, i32 %n, i32 0 ; %n when positive, otherwise 0
+  store i32 %spec.select, i32 addrspace(1)* %out, align 4 ; neither the address nor the value is computed from the global id
+  br label %if.end
+
+if.end:                                           ; preds = %for.cond.preheader, %entry
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CHECK-NOT: insertelement
+; CHECK-NOT: shufflevector
+; CHECK-NOT: extractelement
+; CHECK-NOT: define void @__vecz_b_masked_store
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_vecz2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_vecz2.ll
new file mode 100644
index 0000000000000..a40a7fcd54261
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_vecz2.ll
@@ -0,0 +1,57 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -S < %s -vecz-auto | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @no_vecz2(i32 addrspace(1)* %out, i32 %n, i32 addrspace(1)* %m) { ; only global id 0 stores; the stored value is computed from %n and *%m
+entry:
+  %0 = load i32, i32 addrspace(1)* %m, align 4 ; loaded by every work-item, before the global-id branch
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; mangled get_global_id(0)
+  %cmp = icmp eq i64 %call, 0
+  br i1 %cmp, label %for.cond.preheader, label %if.end
+
+for.cond.preheader:                               ; preds = %entry
+  %cmp167 = icmp sgt i32 %n, 0
+  br i1 %cmp167, label %for.body29.lr.ph, label %for.cond.cleanup28 ; when %n <= 0 the stored value is 0
+
+for.body29.lr.ph:                                 ; preds = %for.cond.preheader
+  %add = add i32 %0, 1
+  %factor = shl i32 %0, 2
+  %1 = shl i32 %n, 2
+  %2 = add i32 %1, -4
+  %reass.mul = mul i32 %2, %add
+  %3 = add i32 %factor, 4
+  %4 = add i32 %3, %reass.mul ; (4 * %0 + 4) + (4 * %n - 4) * (%0 + 1)
+  br label %for.cond.cleanup28
+
+for.cond.cleanup28:                               ; preds = %for.body29.lr.ph, %for.cond.preheader
+  %ret.3.lcssa = phi i32 [ %4, %for.body29.lr.ph ], [ 0, %for.cond.preheader ]
+  store i32 %ret.3.lcssa, i32 addrspace(1)* %out, align 4 ; neither the address nor the value is computed from the global id
+  br label %if.end
+
+if.end:                                           ; preds = %for.cond.cleanup28, %entry
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CHECK: spir_kernel void @{{(__vecz_v16_)?}}no_vecz2
+; CHECK-NOT: extractelement
+; CHECK-NOT: define void @__vecz_b_masked_store
+; CHECK: store i32
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/offset_info_analysis.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/offset_info_analysis.ll
new file mode 100644
index 0000000000000..f51b667c8a5ef
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/offset_info_analysis.ll
@@ -0,0 +1,55 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k offset_info_analysis -vecz-passes=packetizer -vecz-simd-width=4 -S < %s | %filecheck %s
+
+; ModuleID = 'Unknown buffer'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @offset_info_analysis(i8 addrspace(1)* noalias %in, i8 addrspace(1)* noalias %out, i32 %width) #0 { ; out[y * width + x] = in[x + ~width + y * width], x and y being global ids 0 and 1
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2 ; mangled get_global_id(0)
+  %conv = trunc i64 %call to i32
+  %call1 = call spir_func i64 @_Z13get_global_idj(i32 1) #2 ; mangled get_global_id(1)
+  %conv2 = trunc i64 %call1 to i32
+  %mul = mul nsw i32 %conv2, %width
+  %0 = xor i32 %width, -1 ; bitwise NOT of %width; this xor is the operand the test is about
+  %add = add i32 %conv, %0
+  %add5 = add i32 %add, %mul
+  %idxprom = sext i32 %add5 to i64
+  %arrayidx = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 %idxprom
+  %1 = load i8, i8 addrspace(1)* %arrayidx, align 1 ; load index contains the xor term
+  %mul10 = mul nsw i32 %conv2, %width
+  %add11 = add nsw i32 %mul10, %conv
+  %idxprom15 = sext i32 %add11 to i64
+  %arrayidx16 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 %idxprom15
+  store i8 %1, i8 addrspace(1)* %arrayidx16, align 1 ; store index has no xor term
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; This test checks that a 'xor' used as a binop operand is analyzed correctly
+; and masked properly.
+; CHECK: define spir_kernel void @__vecz_v4_offset_info_analysis
+; CHECK: load <4 x i8>, ptr addrspace(1)
+; CHECK-NOT: call <4 x i8> @__vecz_b_gather_load_Dv4_hDv4_u3ptrU3AS1
+; CHECK: ret void
+
+; Check the gather load definition is not generated.
+; CHECK-NOT: declare <4 x i8> @__vecz_b_gather_load_Dv4_hDv4
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfiniteDv4_d.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfiniteDv4_d.ll
new file mode 100644
index 0000000000000..5791487af388f
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfiniteDv4_d.ll
@@ -0,0 +1,50 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_isfiniteDv4_d -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double>)
+
+define spir_kernel void @test_isfiniteDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { ; out[gid] = isfinite(in[gid]) on <4 x double> elements
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; mangled get_global_id(0)
+  %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
+  %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+  %call1 = call spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double> %0) ; mangled isfinite on a 4-wide double vector; result is <4 x i64>
+  %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call
+  store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_test_isfiniteDv4_d
+; CHECK: call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: and <4 x i64>
+; CHECK: and <4 x i64>
+; CHECK: and <4 x i64>
+; CHECK: and <4 x i64>
+; CHECK: icmp ne <4 x i64>
+; CHECK: icmp ne <4 x i64>
+; CHECK: icmp ne <4 x i64>
+; CHECK: icmp ne <4 x i64>
+; CHECK: sext <4 x i1>
+; CHECK: sext <4 x i1>
+; CHECK: sext <4 x i1>
+; CHECK: sext <4 x i1>
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfiniteDv4_f.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfiniteDv4_f.ll
new file mode 100644
index 0000000000000..350f8596122f6
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfiniteDv4_f.ll
@@ -0,0 +1,49 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_isfiniteDv4_f -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float>)
+
+define spir_kernel void @test_isfiniteDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { ; out[gid] = isfinite(in[gid]) on <4 x float> elements
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; mangled get_global_id(0)
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
+  %call1 = call spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float> %0) ; mangled isfinite on a 4-wide float vector; result is <4 x i32>
+  %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call
+  store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_test_isfiniteDv4_f
+; CHECK: and <4 x i32>
+; CHECK: and <4 x i32>
+; CHECK: and <4 x i32>
+; CHECK: and <4 x i32>
+; CHECK: icmp ne <4 x i32>
+; CHECK: icmp ne <4 x i32>
+; CHECK: icmp ne <4 x i32>
+; CHECK: icmp ne <4 x i32>
+; CHECK: sext <4 x i1>
+; CHECK: sext <4 x i1>
+; CHECK: sext <4 x i1>
+; CHECK: sext <4 x i1>
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfinited.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfinited.ll
new file mode 100644
index 0000000000000..9ef093ee2e04b
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfinited.ll
@@ -0,0 +1,268 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_isfinited -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func i32 @_Z5isinfd(double)
+declare spir_func i32 @_Z5isinff(float)
+declare spir_func i32 @_Z5isnand(double)
+declare spir_func i32 @_Z5isnanf(float)
+declare spir_func i32 @_Z7signbitd(double)
+declare spir_func i32 @_Z7signbitf(float)
+declare spir_func i32 @_Z8isfinited(double)
+declare spir_func i32 @_Z8isfinitef(float)
+declare spir_func i32 @_Z8isnormald(double)
+declare spir_func i32 @_Z8isnormalf(float)
+declare spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float>)
+declare spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float>)
+declare spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float>)
+declare spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float>)
+declare spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float>)
+declare spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double>)
+declare spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double>)
+declare spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double>)
+declare spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double>)
+declare spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double>)
+
+define spir_kernel void @test_isfinitef(float addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[gid] = isfinite(in[gid]), float -> i32; not the kernel named by -k in this file
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0), used as the element index
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
+  %0 = load float, float addrspace(1)* %arrayidx, align 4
+  %call1 = call spir_func i32 @_Z8isfinitef(float %0) ; mangled isfinite(float)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_isfinited(double addrspace(1)* %in, i32 addrspace(1)* %out) { ; kernel named by -k above: out[gid] = isfinite(in[gid]), double -> i32
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0), used as the element index
+  %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
+  %0 = load double, double addrspace(1)* %arrayidx, align 8
+  %call1 = call spir_func i32 @_Z8isfinited(double %0) ; mangled isfinite(double)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_isfiniteDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { ; out[gid] = isfinite(in[gid]), <4 x float> -> <4 x i32>; not the kernel named by -k in this file
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0), used as the element index
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
+  %call1 = call spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float> %0) ; mangled isfinite on a 4-element float vector
+  %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call
+  store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16
+  ret void
+}
+
+define spir_kernel void @test_isfiniteDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { ; out[gid] = isfinite(in[gid]), <4 x double> -> <4 x i64>; not the kernel named by -k in this file
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0), used as the element index
+  %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
+  %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+  %call1 = call spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double> %0) ; mangled isfinite on a 4-element double vector
+  %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call
+  store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32
+  ret void
+}
+
+define spir_kernel void @test_isinff(float addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[gid] = isinf(in[gid]), float -> i32; not the kernel named by -k in this file
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0), used as the element index
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
+  %0 = load float, float addrspace(1)* %arrayidx, align 4
+  %call1 = call spir_func i32 @_Z5isinff(float %0) ; mangled isinf(float)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_isinfd(double addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[gid] = isinf(in[gid]), double -> i32; not the kernel named by -k in this file
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0), used as the element index
+  %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
+  %0 = load double, double addrspace(1)* %arrayidx, align 8
+  %call1 = call spir_func i32 @_Z5isinfd(double %0) ; mangled isinf(double)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_isinfDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { ; out[gid] = isinf(in[gid]), <4 x float> -> <4 x i32>; not the kernel named by -k in this file
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0), used as the element index
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
+  %call1 = call spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float> %0) ; mangled isinf on a 4-element float vector
+  %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call
+  store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16
+  ret void
+}
+
+define spir_kernel void @test_isinfDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { ; out[gid] = isinf(in[gid]), <4 x double> -> <4 x i64>; not the kernel named by -k in this file
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0), used as the element index
+  %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
+  %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+  %call1 = call spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double> %0) ; mangled isinf on a 4-element double vector
+  %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call
+  store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32
+  ret void
+}
+
+define spir_kernel void @test_isnormalf(float addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[gid] = isnormal(in[gid]), float -> i32; not the kernel named by -k in this file
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0), used as the element index
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
+  %0 = load float, float addrspace(1)* %arrayidx, align 4
+  %call1 = call spir_func i32 @_Z8isnormalf(float %0) ; mangled isnormal(float)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_isnormald(double addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[gid] = isnormal(in[gid]), double -> i32; not the kernel named by -k in this file
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0), used as the element index
+  %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
+  %0 = load double, double addrspace(1)* %arrayidx, align 8
+  %call1 = call spir_func i32 @_Z8isnormald(double %0) ; mangled isnormal(double)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_isnormalDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { ; out[gid] = isnormal(in[gid]), <4 x float> -> <4 x i32>; not the kernel named by -k in this file
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0), used as the element index
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
+  %call1 = call spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float> %0) ; mangled isnormal on a 4-element float vector
+  %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call
+  store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16
+  ret void
+}
+
+define spir_kernel void @test_isnormalDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { ; out[gid] = isnormal(in[gid]), <4 x double> -> <4 x i64>; not the kernel named by -k in this file
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0), used as the element index
+  %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
+  %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+  %call1 = call spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double> %0) ; mangled isnormal on a 4-element double vector
+  %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call
+  store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32
+  ret void
+}
+
+define spir_kernel void @test_isnanf(float addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[gid] = isnan(in[gid]), float -> i32; not the kernel named by -k in this file
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0), used as the element index
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
+  %0 = load float, float addrspace(1)* %arrayidx, align 4
+  %call1 = call spir_func i32 @_Z5isnanf(float %0) ; mangled isnan(float)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_isnand(double addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[gid] = isnan(in[gid]), double -> i32; not the kernel named by -k in this file
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0), used as the element index
+  %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
+  %0 = load double, double addrspace(1)* %arrayidx, align 8
+  %call1 = call spir_func i32 @_Z5isnand(double %0) ; mangled isnan(double)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_isnanDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { ; out[gid] = isnan(in[gid]), <4 x float> -> <4 x i32>; not the kernel named by -k in this file
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0), used as the element index
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
+  %call1 = call spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float> %0) ; mangled isnan on a 4-element float vector
+  %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call
+  store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16
+  ret void
+}
+
+define spir_kernel void @test_isnanDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { ; out[gid] = isnan(in[gid]), <4 x double> -> <4 x i64>; not the kernel named by -k in this file
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0), used as the element index
+  %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
+  %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+  %call1 = call spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double> %0) ; mangled isnan on a 4-element double vector
+  %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call
+  store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32
+  ret void
+}
+
+define spir_kernel void @test_signbitf(float addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[gid] = signbit(in[gid]), float -> i32; not the kernel named by -k in this file
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0), used as the element index
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
+  %0 = load float, float addrspace(1)* %arrayidx, align 4
+  %call1 = call spir_func i32 @_Z7signbitf(float %0) ; mangled signbit(float)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_signbitd(double addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[gid] = signbit(in[gid]), double -> i32; not the kernel named by -k in this file
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0), used as the element index
+  %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
+  %0 = load double, double addrspace(1)* %arrayidx, align 8
+  %call1 = call spir_func i32 @_Z7signbitd(double %0) ; mangled signbit(double)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_signbitDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { ; out[gid] = signbit(in[gid]), <4 x float> -> <4 x i32>; not the kernel named by -k in this file
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0), used as the element index
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
+  %call1 = call spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float> %0) ; mangled signbit on a 4-element float vector
+  %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call
+  store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16
+  ret void
+}
+
+define spir_kernel void @test_signbitDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { ; out[gid] = signbit(in[gid]), <4 x double> -> <4 x i64>; not the kernel named by -k in this file
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0), used as the element index
+  %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
+  %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+  %call1 = call spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double> %0) ; mangled signbit on a 4-element double vector
+  %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call
+  store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_test_isfinited
+; CHECK: and <4 x i64>
+; CHECK: icmp ne <4 x i64>
+; CHECK: zext <4 x i1>
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfinitef.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfinitef.ll
new file mode 100644
index 0000000000000..79a1412459b28
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfinitef.ll
@@ -0,0 +1,268 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_isfinitef -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func i32 @_Z5isinfd(double)
+declare spir_func i32 @_Z5isinff(float)
+declare spir_func i32 @_Z5isnand(double)
+declare spir_func i32 @_Z5isnanf(float)
+declare spir_func i32 @_Z7signbitd(double)
+declare spir_func i32 @_Z7signbitf(float)
+declare spir_func i32 @_Z8isfinited(double)
+declare spir_func i32 @_Z8isfinitef(float)
+declare spir_func i32 @_Z8isnormald(double)
+declare spir_func i32 @_Z8isnormalf(float)
+declare spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float>)
+declare spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float>)
+declare spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float>)
+declare spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float>)
+declare spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float>)
+declare spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double>)
+declare spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double>)
+declare spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double>)
+declare spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double>)
+declare spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double>)
+
+define spir_kernel void @test_isfinitef(float addrspace(1)* %in, i32 addrspace(1)* %out) { ; kernel named by -k above: out[gid] = isfinite(in[gid]), float -> i32
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0), used as the element index
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
+  %0 = load float, float addrspace(1)* %arrayidx, align 4
+  %call1 = call spir_func i32 @_Z8isfinitef(float %0) ; mangled isfinite(float)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_isfinited(double addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[gid] = isfinite(in[gid]), double -> i32; not the kernel named by -k in this file
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0), used as the element index
+  %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
+  %0 = load double, double addrspace(1)* %arrayidx, align 8
+  %call1 = call spir_func i32 @_Z8isfinited(double %0) ; mangled isfinite(double)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_isfiniteDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { ; out[gid] = isfinite(in[gid]), <4 x float> -> <4 x i32>; not the kernel named by -k in this file
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0), used as the element index
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
+  %call1 = call spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float> %0) ; mangled isfinite on a 4-element float vector
+  %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call
+  store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16
+  ret void
+}
+
+define spir_kernel void @test_isfiniteDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { ; out[gid] = isfinite(in[gid]), <4 x double> -> <4 x i64>; not the kernel named by -k in this file
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0), used as the element index
+  %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
+  %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+  %call1 = call spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double> %0) ; mangled isfinite on a 4-element double vector
+  %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call
+  store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32
+  ret void
+}
+
+define spir_kernel void @test_isinff(float addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[gid] = isinf(in[gid]), float -> i32; not the kernel named by -k in this file
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0), used as the element index
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
+  %0 = load float, float addrspace(1)* %arrayidx, align 4
+  %call1 = call spir_func i32 @_Z5isinff(float %0) ; mangled isinf(float)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_isinfd(double addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[gid] = isinf(in[gid]), double -> i32; not the kernel named by -k in this file
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0), used as the element index
+  %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
+  %0 = load double, double addrspace(1)* %arrayidx, align 8
+  %call1 = call spir_func i32 @_Z5isinfd(double %0) ; mangled isinf(double)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_isinfDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { ; out[gid] = isinf(in[gid]), <4 x float> -> <4 x i32>; not the kernel named by -k in this file
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0), used as the element index
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
+  %call1 = call spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float> %0) ; mangled isinf on a 4-element float vector
+  %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call
+  store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16
+  ret void
+}
+
+define spir_kernel void @test_isinfDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { ; out[gid] = isinf(in[gid]), <4 x double> -> <4 x i64>; not the kernel named by -k in this file
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0), used as the element index
+  %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
+  %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+  %call1 = call spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double> %0) ; mangled isinf on a 4-element double vector
+  %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call
+  store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32
+  ret void
+}
+
+define spir_kernel void @test_isnormalf(float addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[gid] = isnormal(in[gid]), float -> i32; not the kernel named by -k in this file
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0), used as the element index
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
+  %0 = load float, float addrspace(1)* %arrayidx, align 4
+  %call1 = call spir_func i32 @_Z8isnormalf(float %0) ; mangled isnormal(float)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_isnormald(double addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[gid] = isnormal(in[gid]), double -> i32; not the kernel named by -k in this file
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0), used as the element index
+  %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
+  %0 = load double, double addrspace(1)* %arrayidx, align 8
+  %call1 = call spir_func i32 @_Z8isnormald(double %0) ; mangled isnormal(double)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_isnormalDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { ; out[gid] = isnormal(in[gid]), <4 x float> -> <4 x i32>; not the kernel named by -k in this file
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0), used as the element index
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
+  %call1 = call spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float> %0) ; mangled isnormal on a 4-element float vector
+  %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call
+  store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16
+  ret void
+}
+
+define spir_kernel void @test_isnormalDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { ; out[gid] = isnormal(in[gid]), <4 x double> -> <4 x i64>; not the kernel named by -k in this file
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0), used as the element index
+  %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
+  %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+  %call1 = call spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double> %0) ; mangled isnormal on a 4-element double vector
+  %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call
+  store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32
+  ret void
+}
+
+define spir_kernel void @test_isnanf(float addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[gid] = isnan(in[gid]), float -> i32; not the kernel named by -k in this file
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0), used as the element index
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
+  %0 = load float, float addrspace(1)* %arrayidx, align 4
+  %call1 = call spir_func i32 @_Z5isnanf(float %0) ; mangled isnan(float)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_isnand(double addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[gid] = isnan(in[gid]), double -> i32; not the kernel named by -k in this file
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0), used as the element index
+  %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
+  %0 = load double, double addrspace(1)* %arrayidx, align 8
+  %call1 = call spir_func i32 @_Z5isnand(double %0) ; mangled isnan(double)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_isnanDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { ; out[gid] = isnan(in[gid]), <4 x float> -> <4 x i32>; not the kernel named by -k in this file
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0), used as the element index
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
+  %call1 = call spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float> %0) ; mangled isnan on a 4-element float vector
+  %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call
+  store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16
+  ret void
+}
+
+define spir_kernel void @test_isnanDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { ; out[gid] = isnan(in[gid]), <4 x double> -> <4 x i64>; not the kernel named by -k in this file
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0), used as the element index
+  %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
+  %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+  %call1 = call spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double> %0) ; mangled isnan on a 4-element double vector
+  %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call
+  store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32
+  ret void
+}
+
+define spir_kernel void @test_signbitf(float addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[gid] = signbit(in[gid]), float -> i32; not the kernel named by -k in this file
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0), used as the element index
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
+  %0 = load float, float addrspace(1)* %arrayidx, align 4
+  %call1 = call spir_func i32 @_Z7signbitf(float %0) ; mangled signbit(float)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_signbitd(double addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[gid] = signbit(in[gid]), double -> i32; not the kernel named by -k in this file
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0), used as the element index
+  %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
+  %0 = load double, double addrspace(1)* %arrayidx, align 8
+  %call1 = call spir_func i32 @_Z7signbitd(double %0) ; mangled signbit(double)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_signbitDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { ; out[gid] = signbit(in[gid]), <4 x float> -> <4 x i32>; not the kernel named by -k in this file
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0), used as the element index
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
+  %call1 = call spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float> %0) ; mangled signbit on a 4-element float vector
+  %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call
+  store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16
+  ret void
+}
+
+define spir_kernel void @test_signbitDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { ; out[gid] = signbit(in[gid]), <4 x double> -> <4 x i64>; not the kernel named by -k in this file
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0), used as the element index
+  %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
+  %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+  %call1 = call spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double> %0) ; mangled signbit on a 4-element double vector
+  %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call
+  store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_test_isfinitef
+; CHECK: and <4 x i32>
+; CHECK: icmp ne <4 x i32>
+; CHECK: zext <4 x i1>
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfDv4_d.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfDv4_d.ll
new file mode 100644
index 0000000000000..18fe8b3bb99d6
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfDv4_d.ll
@@ -0,0 +1,49 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_isinfDv4_d -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double>)
+
+define spir_kernel void @test_isinfDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { ; kernel named by -k above: out[gid] = isinf(in[gid]), <4 x double> -> <4 x i64>
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0), used as the element index
+  %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
+  %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+  %call1 = call spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double> %0) ; mangled isinf on a 4-element double vector
+  %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call
+  store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_test_isinfDv4_d
+; CHECK: and <4 x i64>
+; CHECK: and <4 x i64>
+; CHECK: and <4 x i64>
+; CHECK: and <4 x i64>
+; CHECK: icmp eq <4 x i64>
+; CHECK: icmp eq <4 x i64>
+; CHECK: icmp eq <4 x i64>
+; CHECK: icmp eq <4 x i64>
+; CHECK: sext <4 x i1>
+; CHECK: sext <4 x i1>
+; CHECK: sext <4 x i1>
+; CHECK: sext <4 x i1>
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfDv4_f.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfDv4_f.ll
new file mode 100644
index 0000000000000..443e5c45933cc
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfDv4_f.ll
@@ -0,0 +1,49 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_isinfDv4_f -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float>)
+
+define spir_kernel void @test_isinfDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { ; kernel named by -k above: out[gid] = isinf(in[gid]), <4 x float> -> <4 x i32>
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0), used as the element index
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
+  %call1 = call spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float> %0) ; mangled isinf on a 4-element float vector
+  %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call
+  store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_test_isinfDv4_f
+; CHECK: and <4 x i32>
+; CHECK: and <4 x i32>
+; CHECK: and <4 x i32>
+; CHECK: and <4 x i32>
+; CHECK: icmp eq <4 x i32>
+; CHECK: icmp eq <4 x i32>
+; CHECK: icmp eq <4 x i32>
+; CHECK: icmp eq <4 x i32>
+; CHECK: sext <4 x i1>
+; CHECK: sext <4 x i1>
+; CHECK: sext <4 x i1>
+; CHECK: sext <4 x i1>
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfd.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfd.ll
new file mode 100644
index 0000000000000..b45071ffd9f69
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfd.ll
@@ -0,0 +1,268 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_isinfd -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32) ; Itanium-mangled get_global_id(unsigned int)
+declare spir_func i32 @_Z5isinfd(double) ; isinf(double)
+declare spir_func i32 @_Z5isinff(float) ; isinf(float)
+declare spir_func i32 @_Z5isnand(double) ; isnan(double)
+declare spir_func i32 @_Z5isnanf(float) ; isnan(float)
+declare spir_func i32 @_Z7signbitd(double) ; signbit(double)
+declare spir_func i32 @_Z7signbitf(float) ; signbit(float)
+declare spir_func i32 @_Z8isfinited(double) ; isfinite(double)
+declare spir_func i32 @_Z8isfinitef(float) ; isfinite(float)
+declare spir_func i32 @_Z8isnormald(double) ; isnormal(double)
+declare spir_func i32 @_Z8isnormalf(float) ; isnormal(float)
+declare spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float>) ; isinf(4 x float)
+declare spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float>) ; isnan(4 x float)
+declare spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float>) ; signbit(4 x float)
+declare spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float>) ; isfinite(4 x float)
+declare spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float>) ; isnormal(4 x float)
+declare spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double>) ; isinf(4 x double)
+declare spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double>) ; isnan(4 x double)
+declare spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double>) ; signbit(4 x double)
+declare spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double>) ; isfinite(4 x double)
+declare spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double>) ; isnormal(4 x double)
+
+define spir_kernel void @test_isfinitef(float addrspace(1)* %in, i32 addrspace(1)* %out) { ; not the kernel named by this file's -k option
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call ; &in[%call]
+  %0 = load float, float addrspace(1)* %arrayidx, align 4
+  %call1 = call spir_func i32 @_Z8isfinitef(float %0) ; isfinite(float)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call ; &out[%call]
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_isfinited(double addrspace(1)* %in, i32 addrspace(1)* %out) { ; not the kernel named by this file's -k option
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call ; &in[%call]
+  %0 = load double, double addrspace(1)* %arrayidx, align 8
+  %call1 = call spir_func i32 @_Z8isfinited(double %0) ; isfinite(double)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call ; &out[%call]
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_isfiniteDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { ; not the kernel named by this file's -k option
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call ; &in[%call]
+  %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
+  %call1 = call spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float> %0) ; isfinite(4 x float)
+  %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call ; &out[%call]
+  store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16
+  ret void
+}
+
+define spir_kernel void @test_isfiniteDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { ; not the kernel named by this file's -k option
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call ; &in[%call]
+  %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+  %call1 = call spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double> %0) ; isfinite(4 x double)
+  %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call ; &out[%call]
+  store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32
+  ret void
+}
+
+define spir_kernel void @test_isinff(float addrspace(1)* %in, i32 addrspace(1)* %out) { ; not the kernel named by this file's -k option
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call ; &in[%call]
+  %0 = load float, float addrspace(1)* %arrayidx, align 4
+  %call1 = call spir_func i32 @_Z5isinff(float %0) ; isinf(float)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call ; &out[%call]
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_isinfd(double addrspace(1)* %in, i32 addrspace(1)* %out) { ; kernel named by the -k option on this file's run line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call ; &in[%call]
+  %0 = load double, double addrspace(1)* %arrayidx, align 8
+  %call1 = call spir_func i32 @_Z5isinfd(double %0) ; isinf(double)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call ; &out[%call]
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_isinfDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { ; not the kernel named by this file's -k option
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call ; &in[%call]
+  %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
+  %call1 = call spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float> %0) ; isinf(4 x float)
+  %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call ; &out[%call]
+  store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16
+  ret void
+}
+
+define spir_kernel void @test_isinfDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { ; not the kernel named by this file's -k option
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call ; &in[%call]
+  %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+  %call1 = call spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double> %0) ; isinf(4 x double)
+  %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call ; &out[%call]
+  store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32
+  ret void
+}
+
+define spir_kernel void @test_isnormalf(float addrspace(1)* %in, i32 addrspace(1)* %out) { ; not the kernel named by this file's -k option
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call ; &in[%call]
+  %0 = load float, float addrspace(1)* %arrayidx, align 4
+  %call1 = call spir_func i32 @_Z8isnormalf(float %0) ; isnormal(float)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call ; &out[%call]
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_isnormald(double addrspace(1)* %in, i32 addrspace(1)* %out) { ; not the kernel named by this file's -k option
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call ; &in[%call]
+  %0 = load double, double addrspace(1)* %arrayidx, align 8
+  %call1 = call spir_func i32 @_Z8isnormald(double %0) ; isnormal(double)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call ; &out[%call]
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_isnormalDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { ; not the kernel named by this file's -k option
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call ; &in[%call]
+  %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
+  %call1 = call spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float> %0) ; isnormal(4 x float)
+  %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call ; &out[%call]
+  store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16
+  ret void
+}
+
+define spir_kernel void @test_isnormalDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { ; not the kernel named by this file's -k option
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call ; &in[%call]
+  %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+  %call1 = call spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double> %0) ; isnormal(4 x double)
+  %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call ; &out[%call]
+  store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32
+  ret void
+}
+
+define spir_kernel void @test_isnanf(float addrspace(1)* %in, i32 addrspace(1)* %out) { ; not the kernel named by this file's -k option
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call ; &in[%call]
+  %0 = load float, float addrspace(1)* %arrayidx, align 4
+  %call1 = call spir_func i32 @_Z5isnanf(float %0) ; isnan(float)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call ; &out[%call]
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_isnand(double addrspace(1)* %in, i32 addrspace(1)* %out) { ; not the kernel named by this file's -k option
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call ; &in[%call]
+  %0 = load double, double addrspace(1)* %arrayidx, align 8
+  %call1 = call spir_func i32 @_Z5isnand(double %0) ; isnan(double)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call ; &out[%call]
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_isnanDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { ; not the kernel named by this file's -k option
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call ; &in[%call]
+  %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
+  %call1 = call spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float> %0) ; isnan(4 x float)
+  %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call ; &out[%call]
+  store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16
+  ret void
+}
+
+define spir_kernel void @test_isnanDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { ; not the kernel named by this file's -k option
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call ; &in[%call]
+  %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+  %call1 = call spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double> %0) ; isnan(4 x double)
+  %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call ; &out[%call]
+  store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32
+  ret void
+}
+
+define spir_kernel void @test_signbitf(float addrspace(1)* %in, i32 addrspace(1)* %out) { ; not the kernel named by this file's -k option
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call ; &in[%call]
+  %0 = load float, float addrspace(1)* %arrayidx, align 4
+  %call1 = call spir_func i32 @_Z7signbitf(float %0) ; signbit(float)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call ; &out[%call]
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_signbitd(double addrspace(1)* %in, i32 addrspace(1)* %out) { ; not the kernel named by this file's -k option
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call ; &in[%call]
+  %0 = load double, double addrspace(1)* %arrayidx, align 8
+  %call1 = call spir_func i32 @_Z7signbitd(double %0) ; signbit(double)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call ; &out[%call]
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_signbitDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { ; not the kernel named by this file's -k option
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call ; &in[%call]
+  %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
+  %call1 = call spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float> %0) ; signbit(4 x float)
+  %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call ; &out[%call]
+  store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16
+  ret void
+}
+
+define spir_kernel void @test_signbitDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { ; not the kernel named by this file's -k option
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call ; &in[%call]
+  %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+  %call1 = call spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double> %0) ; signbit(4 x double)
+  %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call ; &out[%call]
+  store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_test_isinfd
+; CHECK: and <4 x i64>
+; CHECK: icmp eq <4 x i64>
+; CHECK: zext <4 x i1>
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinff.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinff.ll
new file mode 100644
index 0000000000000..1101efd59f06f
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinff.ll
@@ -0,0 +1,268 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_isinff -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32) ; Itanium-mangled get_global_id(unsigned int)
+declare spir_func i32 @_Z5isinfd(double) ; isinf(double)
+declare spir_func i32 @_Z5isinff(float) ; isinf(float)
+declare spir_func i32 @_Z5isnand(double) ; isnan(double)
+declare spir_func i32 @_Z5isnanf(float) ; isnan(float)
+declare spir_func i32 @_Z7signbitd(double) ; signbit(double)
+declare spir_func i32 @_Z7signbitf(float) ; signbit(float)
+declare spir_func i32 @_Z8isfinited(double) ; isfinite(double)
+declare spir_func i32 @_Z8isfinitef(float) ; isfinite(float)
+declare spir_func i32 @_Z8isnormald(double) ; isnormal(double)
+declare spir_func i32 @_Z8isnormalf(float) ; isnormal(float)
+declare spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float>) ; isinf(4 x float)
+declare spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float>) ; isnan(4 x float)
+declare spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float>) ; signbit(4 x float)
+declare spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float>) ; isfinite(4 x float)
+declare spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float>) ; isnormal(4 x float)
+declare spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double>) ; isinf(4 x double)
+declare spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double>) ; isnan(4 x double)
+declare spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double>) ; signbit(4 x double)
+declare spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double>) ; isfinite(4 x double)
+declare spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double>) ; isnormal(4 x double)
+
+define spir_kernel void @test_isfinitef(float addrspace(1)* %in, i32 addrspace(1)* %out) { ; not the kernel named by this file's -k option
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call ; &in[%call]
+  %0 = load float, float addrspace(1)* %arrayidx, align 4
+  %call1 = call spir_func i32 @_Z8isfinitef(float %0) ; isfinite(float)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call ; &out[%call]
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_isfinited(double addrspace(1)* %in, i32 addrspace(1)* %out) { ; not the kernel named by this file's -k option
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call ; &in[%call]
+  %0 = load double, double addrspace(1)* %arrayidx, align 8
+  %call1 = call spir_func i32 @_Z8isfinited(double %0) ; isfinite(double)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call ; &out[%call]
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_isfiniteDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { ; not the kernel named by this file's -k option
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call ; &in[%call]
+  %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
+  %call1 = call spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float> %0) ; isfinite(4 x float)
+  %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call ; &out[%call]
+  store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16
+  ret void
+}
+
+define spir_kernel void @test_isfiniteDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { ; not the kernel named by this file's -k option
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call ; &in[%call]
+  %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+  %call1 = call spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double> %0) ; isfinite(4 x double)
+  %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call ; &out[%call]
+  store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32
+  ret void
+}
+
+define spir_kernel void @test_isinff(float addrspace(1)* %in, i32 addrspace(1)* %out) { ; kernel named by the -k option on this file's run line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call ; &in[%call]
+  %0 = load float, float addrspace(1)* %arrayidx, align 4
+  %call1 = call spir_func i32 @_Z5isinff(float %0) ; isinf(float)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call ; &out[%call]
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_isinfd(double addrspace(1)* %in, i32 addrspace(1)* %out) { ; not the kernel named by this file's -k option
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call ; &in[%call]
+  %0 = load double, double addrspace(1)* %arrayidx, align 8
+  %call1 = call spir_func i32 @_Z5isinfd(double %0) ; isinf(double)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call ; &out[%call]
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_isinfDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { ; not the kernel named by this file's -k option
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call ; &in[%call]
+  %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
+  %call1 = call spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float> %0) ; isinf(4 x float)
+  %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call ; &out[%call]
+  store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16
+  ret void
+}
+
+define spir_kernel void @test_isinfDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { ; not the kernel named by this file's -k option
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call ; &in[%call]
+  %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+  %call1 = call spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double> %0) ; isinf(4 x double)
+  %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call ; &out[%call]
+  store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32
+  ret void
+}
+
+define spir_kernel void @test_isnormalf(float addrspace(1)* %in, i32 addrspace(1)* %out) { ; not the kernel named by this file's -k option
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call ; &in[%call]
+  %0 = load float, float addrspace(1)* %arrayidx, align 4
+  %call1 = call spir_func i32 @_Z8isnormalf(float %0) ; isnormal(float)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call ; &out[%call]
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_isnormald(double addrspace(1)* %in, i32 addrspace(1)* %out) { ; not the kernel named by this file's -k option
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call ; &in[%call]
+  %0 = load double, double addrspace(1)* %arrayidx, align 8
+  %call1 = call spir_func i32 @_Z8isnormald(double %0) ; isnormal(double)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call ; &out[%call]
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_isnormalDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { ; not the kernel named by this file's -k option
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call ; &in[%call]
+  %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
+  %call1 = call spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float> %0) ; isnormal(4 x float)
+  %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call ; &out[%call]
+  store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16
+  ret void
+}
+
+define spir_kernel void @test_isnormalDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { ; not the kernel named by this file's -k option
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call ; &in[%call]
+  %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+  %call1 = call spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double> %0) ; isnormal(4 x double)
+  %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call ; &out[%call]
+  store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32
+  ret void
+}
+
+define spir_kernel void @test_isnanf(float addrspace(1)* %in, i32 addrspace(1)* %out) { ; not the kernel named by this file's -k option
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call ; &in[%call]
+  %0 = load float, float addrspace(1)* %arrayidx, align 4
+  %call1 = call spir_func i32 @_Z5isnanf(float %0) ; isnan(float)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call ; &out[%call]
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_isnand(double addrspace(1)* %in, i32 addrspace(1)* %out) { ; not the kernel named by this file's -k option
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call ; &in[%call]
+  %0 = load double, double addrspace(1)* %arrayidx, align 8
+  %call1 = call spir_func i32 @_Z5isnand(double %0) ; isnan(double)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call ; &out[%call]
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_isnanDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { ; not the kernel named by this file's -k option
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call ; &in[%call]
+  %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
+  %call1 = call spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float> %0) ; isnan(4 x float)
+  %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call ; &out[%call]
+  store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16
+  ret void
+}
+
+define spir_kernel void @test_isnanDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { ; not the kernel named by this file's -k option
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call ; &in[%call]
+  %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+  %call1 = call spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double> %0) ; isnan(4 x double)
+  %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call ; &out[%call]
+  store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32
+  ret void
+}
+
+define spir_kernel void @test_signbitf(float addrspace(1)* %in, i32 addrspace(1)* %out) { ; not the kernel named by this file's -k option
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call ; &in[%call]
+  %0 = load float, float addrspace(1)* %arrayidx, align 4
+  %call1 = call spir_func i32 @_Z7signbitf(float %0) ; signbit(float)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call ; &out[%call]
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_signbitd(double addrspace(1)* %in, i32 addrspace(1)* %out) { ; not the kernel named by this file's -k option
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call ; &in[%call]
+  %0 = load double, double addrspace(1)* %arrayidx, align 8
+  %call1 = call spir_func i32 @_Z7signbitd(double %0) ; signbit(double)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call ; &out[%call]
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_signbitDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { ; not the kernel named by this file's -k option
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call ; &in[%call]
+  %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
+  %call1 = call spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float> %0) ; signbit(4 x float)
+  %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call ; &out[%call]
+  store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16
+  ret void
+}
+
+define spir_kernel void @test_signbitDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { ; not the kernel named by this file's -k option
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call ; &in[%call]
+  %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+  %call1 = call spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double> %0) ; signbit(4 x double)
+  %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call ; &out[%call]
+  store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_test_isinff
+; CHECK: and <4 x i32>
+; CHECK: icmp eq <4 x i32>
+; CHECK: zext <4 x i1>
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanDv4_d.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanDv4_d.ll
new file mode 100644
index 0000000000000..e9188f2764866
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanDv4_d.ll
@@ -0,0 +1,61 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_isnanDv4_d -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32) ; Itanium-mangled get_global_id(unsigned int)
+declare spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double>) ; Itanium-mangled isnan(4 x double)
+
+define spir_kernel void @test_isnanDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { ; kernel named by the -k option on this file's run line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call ; &in[%call]
+  %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+  %call1 = call spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double> %0) ; isnan(4 x double)
+  %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call ; &out[%call]
+  store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_test_isnanDv4_d
+; CHECK: and <4 x i64>
+; CHECK: and <4 x i64>
+; CHECK: and <4 x i64>
+; CHECK: and <4 x i64>
+; CHECK: icmp eq <4 x i64>
+; CHECK: icmp eq <4 x i64>
+; CHECK: icmp eq <4 x i64>
+; CHECK: icmp eq <4 x i64>
+; CHECK: and <4 x i64>
+; CHECK: and <4 x i64>
+; CHECK: and <4 x i64>
+; CHECK: and <4 x i64>
+; CHECK: icmp ne <4 x i64>
+; CHECK: icmp ne <4 x i64>
+; CHECK: icmp ne <4 x i64>
+; CHECK: icmp ne <4 x i64>
+; CHECK: and <4 x i1>
+; CHECK: and <4 x i1>
+; CHECK: and <4 x i1>
+; CHECK: and <4 x i1>
+; CHECK: sext <4 x i1>
+; CHECK: sext <4 x i1>
+; CHECK: sext <4 x i1>
+; CHECK: sext <4 x i1>
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanDv4_f.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanDv4_f.ll
new file mode 100644
index 0000000000000..d3bcc7334a3f4
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanDv4_f.ll
@@ -0,0 +1,61 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_isnanDv4_f -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32) ; Itanium-mangled get_global_id(unsigned int)
+declare spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float>) ; Itanium-mangled isnan(4 x float)
+
+define spir_kernel void @test_isnanDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { ; kernel named by the -k option on this file's run line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; get_global_id(0)
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call ; &in[%call]
+  %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
+  %call1 = call spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float> %0) ; isnan(4 x float)
+  %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call ; &out[%call]
+  store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_test_isnanDv4_f
+; CHECK: and <4 x i32>
+; CHECK: and <4 x i32>
+; CHECK: and <4 x i32>
+; CHECK: and <4 x i32>
+; CHECK: icmp eq <4 x i32>
+; CHECK: icmp eq <4 x i32>
+; CHECK: icmp eq <4 x i32>
+; CHECK: icmp eq <4 x i32>
+; CHECK: and <4 x i32>
+; CHECK: and <4 x i32>
+; CHECK: and <4 x i32>
+; CHECK: and <4 x i32>
+; CHECK: icmp ne <4 x i32>
+; CHECK: icmp ne <4 x i32>
+; CHECK: icmp ne <4 x i32>
+; CHECK: icmp ne <4 x i32>
+; CHECK: and <4 x i1>
+; CHECK: and <4 x i1>
+; CHECK: and <4 x i1>
+; CHECK: and <4 x i1>
+; CHECK: sext <4 x i1>
+; CHECK: sext <4 x i1>
+; CHECK: sext <4 x i1>
+; CHECK: sext <4 x i1>
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnand.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnand.ll
new file mode 100644
index 0000000000000..8a6105451b12f
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnand.ll
@@ -0,0 +1,271 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_isnand -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown" ; 64-bit SPIR triple
+
+declare spir_func i64 @_Z13get_global_idj(i32) ; Itanium-mangled get_global_id(unsigned int)
+declare spir_func i32 @_Z5isinfd(double) ; isinf(double)
+declare spir_func i32 @_Z5isinff(float) ; isinf(float)
+declare spir_func i32 @_Z5isnand(double) ; isnan(double)
+declare spir_func i32 @_Z5isnanf(float) ; isnan(float)
+declare spir_func i32 @_Z7signbitd(double) ; signbit(double)
+declare spir_func i32 @_Z7signbitf(float) ; signbit(float)
+declare spir_func i32 @_Z8isfinited(double) ; isfinite(double)
+declare spir_func i32 @_Z8isfinitef(float) ; isfinite(float)
+declare spir_func i32 @_Z8isnormald(double) ; isnormal(double)
+declare spir_func i32 @_Z8isnormalf(float) ; isnormal(float)
+declare spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float>) ; isinf(<4 x float>)
+declare spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float>) ; isnan(<4 x float>)
+declare spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float>) ; signbit(<4 x float>)
+declare spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float>) ; isfinite(<4 x float>)
+declare spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float>) ; isnormal(<4 x float>)
+declare spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double>) ; isinf(<4 x double>)
+declare spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double>) ; isnan(<4 x double>)
+declare spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double>) ; signbit(<4 x double>)
+declare spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double>) ; isfinite(<4 x double>)
+declare spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double>) ; isnormal(<4 x double>)
+
+define spir_kernel void @test_isfinitef(float addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[id] = isfinite(in[id]), id = get_global_id(0)
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %src = getelementptr inbounds float, float addrspace(1)* %in, i64 %gid
+  %val = load float, float addrspace(1)* %src, align 4
+  %res = call spir_func i32 @_Z8isfinitef(float %val)
+  %dst = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %gid
+  store i32 %res, i32 addrspace(1)* %dst, align 4
+  ret void
+}
+
+define spir_kernel void @test_isfinited(double addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[id] = isfinite(in[id]), id = get_global_id(0)
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %src = getelementptr inbounds double, double addrspace(1)* %in, i64 %gid
+  %val = load double, double addrspace(1)* %src, align 8
+  %res = call spir_func i32 @_Z8isfinited(double %val)
+  %dst = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %gid
+  store i32 %res, i32 addrspace(1)* %dst, align 4
+  ret void
+}
+
+define spir_kernel void @test_isfiniteDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { ; out[id] = isfinite(in[id]), id = get_global_id(0)
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %src = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %gid
+  %val = load <4 x float>, <4 x float> addrspace(1)* %src, align 16
+  %res = call spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float> %val)
+  %dst = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %gid
+  store <4 x i32> %res, <4 x i32> addrspace(1)* %dst, align 16
+  ret void
+}
+
+define spir_kernel void @test_isfiniteDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { ; out[id] = isfinite(in[id]), id = get_global_id(0)
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %src = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %gid
+  %val = load <4 x double>, <4 x double> addrspace(1)* %src, align 32
+  %res = call spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double> %val)
+  %dst = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %gid
+  store <4 x i64> %res, <4 x i64> addrspace(1)* %dst, align 32
+  ret void
+}
+
+define spir_kernel void @test_isinff(float addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[id] = isinf(in[id]), id = get_global_id(0)
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %src = getelementptr inbounds float, float addrspace(1)* %in, i64 %gid
+  %val = load float, float addrspace(1)* %src, align 4
+  %res = call spir_func i32 @_Z5isinff(float %val)
+  %dst = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %gid
+  store i32 %res, i32 addrspace(1)* %dst, align 4
+  ret void
+}
+
+define spir_kernel void @test_isinfd(double addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[id] = isinf(in[id]), id = get_global_id(0)
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %src = getelementptr inbounds double, double addrspace(1)* %in, i64 %gid
+  %val = load double, double addrspace(1)* %src, align 8
+  %res = call spir_func i32 @_Z5isinfd(double %val)
+  %dst = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %gid
+  store i32 %res, i32 addrspace(1)* %dst, align 4
+  ret void
+}
+
+define spir_kernel void @test_isinfDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { ; out[id] = isinf(in[id]), id = get_global_id(0)
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %src = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %gid
+  %val = load <4 x float>, <4 x float> addrspace(1)* %src, align 16
+  %res = call spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float> %val)
+  %dst = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %gid
+  store <4 x i32> %res, <4 x i32> addrspace(1)* %dst, align 16
+  ret void
+}
+
+define spir_kernel void @test_isinfDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { ; out[id] = isinf(in[id]), id = get_global_id(0)
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %src = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %gid
+  %val = load <4 x double>, <4 x double> addrspace(1)* %src, align 32
+  %res = call spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double> %val)
+  %dst = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %gid
+  store <4 x i64> %res, <4 x i64> addrspace(1)* %dst, align 32
+  ret void
+}
+
+define spir_kernel void @test_isnormalf(float addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[id] = isnormal(in[id]), id = get_global_id(0)
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %src = getelementptr inbounds float, float addrspace(1)* %in, i64 %gid
+  %val = load float, float addrspace(1)* %src, align 4
+  %res = call spir_func i32 @_Z8isnormalf(float %val)
+  %dst = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %gid
+  store i32 %res, i32 addrspace(1)* %dst, align 4
+  ret void
+}
+
+define spir_kernel void @test_isnormald(double addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[id] = isnormal(in[id]), id = get_global_id(0)
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %src = getelementptr inbounds double, double addrspace(1)* %in, i64 %gid
+  %val = load double, double addrspace(1)* %src, align 8
+  %res = call spir_func i32 @_Z8isnormald(double %val)
+  %dst = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %gid
+  store i32 %res, i32 addrspace(1)* %dst, align 4
+  ret void
+}
+
+define spir_kernel void @test_isnormalDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { ; out[id] = isnormal(in[id]), id = get_global_id(0)
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %src = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %gid
+  %val = load <4 x float>, <4 x float> addrspace(1)* %src, align 16
+  %res = call spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float> %val)
+  %dst = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %gid
+  store <4 x i32> %res, <4 x i32> addrspace(1)* %dst, align 16
+  ret void
+}
+
+define spir_kernel void @test_isnormalDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { ; out[id] = isnormal(in[id]), id = get_global_id(0)
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %src = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %gid
+  %val = load <4 x double>, <4 x double> addrspace(1)* %src, align 32
+  %res = call spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double> %val)
+  %dst = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %gid
+  store <4 x i64> %res, <4 x i64> addrspace(1)* %dst, align 32
+  ret void
+}
+
+define spir_kernel void @test_isnanf(float addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[id] = isnan(in[id]), id = get_global_id(0)
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %src = getelementptr inbounds float, float addrspace(1)* %in, i64 %gid
+  %val = load float, float addrspace(1)* %src, align 4
+  %res = call spir_func i32 @_Z5isnanf(float %val)
+  %dst = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %gid
+  store i32 %res, i32 addrspace(1)* %dst, align 4
+  ret void
+}
+
+define spir_kernel void @test_isnand(double addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[id] = isnan(in[id]), id = get_global_id(0); the kernel selected by -k in this file
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %src = getelementptr inbounds double, double addrspace(1)* %in, i64 %gid
+  %val = load double, double addrspace(1)* %src, align 8
+  %res = call spir_func i32 @_Z5isnand(double %val)
+  %dst = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %gid
+  store i32 %res, i32 addrspace(1)* %dst, align 4
+  ret void
+}
+
+define spir_kernel void @test_isnanDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { ; out[id] = isnan(in[id]), id = get_global_id(0)
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %src = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %gid
+  %val = load <4 x float>, <4 x float> addrspace(1)* %src, align 16
+  %res = call spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float> %val)
+  %dst = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %gid
+  store <4 x i32> %res, <4 x i32> addrspace(1)* %dst, align 16
+  ret void
+}
+
+define spir_kernel void @test_isnanDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { ; out[id] = isnan(in[id]), id = get_global_id(0)
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %src = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %gid
+  %val = load <4 x double>, <4 x double> addrspace(1)* %src, align 32
+  %res = call spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double> %val)
+  %dst = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %gid
+  store <4 x i64> %res, <4 x i64> addrspace(1)* %dst, align 32
+  ret void
+}
+
+define spir_kernel void @test_signbitf(float addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[id] = signbit(in[id]), id = get_global_id(0)
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %src = getelementptr inbounds float, float addrspace(1)* %in, i64 %gid
+  %val = load float, float addrspace(1)* %src, align 4
+  %res = call spir_func i32 @_Z7signbitf(float %val)
+  %dst = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %gid
+  store i32 %res, i32 addrspace(1)* %dst, align 4
+  ret void
+}
+
+define spir_kernel void @test_signbitd(double addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[id] = signbit(in[id]), id = get_global_id(0)
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %src = getelementptr inbounds double, double addrspace(1)* %in, i64 %gid
+  %val = load double, double addrspace(1)* %src, align 8
+  %res = call spir_func i32 @_Z7signbitd(double %val)
+  %dst = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %gid
+  store i32 %res, i32 addrspace(1)* %dst, align 4
+  ret void
+}
+
+define spir_kernel void @test_signbitDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { ; out[id] = signbit(in[id]), id = get_global_id(0)
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %src = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %gid
+  %val = load <4 x float>, <4 x float> addrspace(1)* %src, align 16
+  %res = call spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float> %val)
+  %dst = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %gid
+  store <4 x i32> %res, <4 x i32> addrspace(1)* %dst, align 16
+  ret void
+}
+
+define spir_kernel void @test_signbitDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { ; out[id] = signbit(in[id]), id = get_global_id(0)
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %src = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %gid
+  %val = load <4 x double>, <4 x double> addrspace(1)* %src, align 32
+  %res = call spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double> %val)
+  %dst = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %gid
+  store <4 x i64> %res, <4 x i64> addrspace(1)* %dst, align 32
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_test_isnand
+; CHECK: and <4 x i64>
+; CHECK: icmp eq <4 x i64>
+; CHECK: and <4 x i64>
+; CHECK: icmp ne <4 x i64>
+; CHECK: and <4 x i1>
+; CHECK: zext <4 x i1>
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanf.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanf.ll
new file mode 100644
index 0000000000000..c4ab772d1ee00
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanf.ll
@@ -0,0 +1,271 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_isnanf -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown" ; 64-bit SPIR triple
+
+declare spir_func i64 @_Z13get_global_idj(i32) ; Itanium-mangled get_global_id(unsigned int)
+declare spir_func i32 @_Z5isinfd(double) ; isinf(double)
+declare spir_func i32 @_Z5isinff(float) ; isinf(float)
+declare spir_func i32 @_Z5isnand(double) ; isnan(double)
+declare spir_func i32 @_Z5isnanf(float) ; isnan(float)
+declare spir_func i32 @_Z7signbitd(double) ; signbit(double)
+declare spir_func i32 @_Z7signbitf(float) ; signbit(float)
+declare spir_func i32 @_Z8isfinited(double) ; isfinite(double)
+declare spir_func i32 @_Z8isfinitef(float) ; isfinite(float)
+declare spir_func i32 @_Z8isnormald(double) ; isnormal(double)
+declare spir_func i32 @_Z8isnormalf(float) ; isnormal(float)
+declare spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float>) ; isinf(<4 x float>)
+declare spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float>) ; isnan(<4 x float>)
+declare spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float>) ; signbit(<4 x float>)
+declare spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float>) ; isfinite(<4 x float>)
+declare spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float>) ; isnormal(<4 x float>)
+declare spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double>) ; isinf(<4 x double>)
+declare spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double>) ; isnan(<4 x double>)
+declare spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double>) ; signbit(<4 x double>)
+declare spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double>) ; isfinite(<4 x double>)
+declare spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double>) ; isnormal(<4 x double>)
+
+define spir_kernel void @test_isfinitef(float addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[id] = isfinite(in[id]), id = get_global_id(0)
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %src = getelementptr inbounds float, float addrspace(1)* %in, i64 %gid
+  %val = load float, float addrspace(1)* %src, align 4
+  %res = call spir_func i32 @_Z8isfinitef(float %val)
+  %dst = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %gid
+  store i32 %res, i32 addrspace(1)* %dst, align 4
+  ret void
+}
+
+define spir_kernel void @test_isfinited(double addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[id] = isfinite(in[id]), id = get_global_id(0)
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %src = getelementptr inbounds double, double addrspace(1)* %in, i64 %gid
+  %val = load double, double addrspace(1)* %src, align 8
+  %res = call spir_func i32 @_Z8isfinited(double %val)
+  %dst = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %gid
+  store i32 %res, i32 addrspace(1)* %dst, align 4
+  ret void
+}
+
+define spir_kernel void @test_isfiniteDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { ; out[id] = isfinite(in[id]), id = get_global_id(0)
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %src = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %gid
+  %val = load <4 x float>, <4 x float> addrspace(1)* %src, align 16
+  %res = call spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float> %val)
+  %dst = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %gid
+  store <4 x i32> %res, <4 x i32> addrspace(1)* %dst, align 16
+  ret void
+}
+
+define spir_kernel void @test_isfiniteDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { ; out[id] = isfinite(in[id]), id = get_global_id(0)
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %src = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %gid
+  %val = load <4 x double>, <4 x double> addrspace(1)* %src, align 32
+  %res = call spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double> %val)
+  %dst = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %gid
+  store <4 x i64> %res, <4 x i64> addrspace(1)* %dst, align 32
+  ret void
+}
+
+define spir_kernel void @test_isinff(float addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[id] = isinf(in[id]), id = get_global_id(0)
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %src = getelementptr inbounds float, float addrspace(1)* %in, i64 %gid
+  %val = load float, float addrspace(1)* %src, align 4
+  %res = call spir_func i32 @_Z5isinff(float %val)
+  %dst = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %gid
+  store i32 %res, i32 addrspace(1)* %dst, align 4
+  ret void
+}
+
+define spir_kernel void @test_isinfd(double addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[id] = isinf(in[id]), id = get_global_id(0)
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %src = getelementptr inbounds double, double addrspace(1)* %in, i64 %gid
+  %val = load double, double addrspace(1)* %src, align 8
+  %res = call spir_func i32 @_Z5isinfd(double %val)
+  %dst = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %gid
+  store i32 %res, i32 addrspace(1)* %dst, align 4
+  ret void
+}
+
+define spir_kernel void @test_isinfDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { ; out[id] = isinf(in[id]), id = get_global_id(0)
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %src = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %gid
+  %val = load <4 x float>, <4 x float> addrspace(1)* %src, align 16
+  %res = call spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float> %val)
+  %dst = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %gid
+  store <4 x i32> %res, <4 x i32> addrspace(1)* %dst, align 16
+  ret void
+}
+
+define spir_kernel void @test_isinfDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { ; out[id] = isinf(in[id]), id = get_global_id(0)
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %src = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %gid
+  %val = load <4 x double>, <4 x double> addrspace(1)* %src, align 32
+  %res = call spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double> %val)
+  %dst = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %gid
+  store <4 x i64> %res, <4 x i64> addrspace(1)* %dst, align 32
+  ret void
+}
+
+define spir_kernel void @test_isnormalf(float addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[id] = isnormal(in[id]), id = get_global_id(0)
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %src = getelementptr inbounds float, float addrspace(1)* %in, i64 %gid
+  %val = load float, float addrspace(1)* %src, align 4
+  %res = call spir_func i32 @_Z8isnormalf(float %val)
+  %dst = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %gid
+  store i32 %res, i32 addrspace(1)* %dst, align 4
+  ret void
+}
+
+define spir_kernel void @test_isnormald(double addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[id] = isnormal(in[id]), id = get_global_id(0)
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %src = getelementptr inbounds double, double addrspace(1)* %in, i64 %gid
+  %val = load double, double addrspace(1)* %src, align 8
+  %res = call spir_func i32 @_Z8isnormald(double %val)
+  %dst = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %gid
+  store i32 %res, i32 addrspace(1)* %dst, align 4
+  ret void
+}
+
+define spir_kernel void @test_isnormalDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { ; out[id] = isnormal(in[id]), id = get_global_id(0)
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %src = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %gid
+  %val = load <4 x float>, <4 x float> addrspace(1)* %src, align 16
+  %res = call spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float> %val)
+  %dst = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %gid
+  store <4 x i32> %res, <4 x i32> addrspace(1)* %dst, align 16
+  ret void
+}
+
+define spir_kernel void @test_isnormalDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { ; out[id] = isnormal(in[id]), id = get_global_id(0)
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %src = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %gid
+  %val = load <4 x double>, <4 x double> addrspace(1)* %src, align 32
+  %res = call spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double> %val)
+  %dst = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %gid
+  store <4 x i64> %res, <4 x i64> addrspace(1)* %dst, align 32
+  ret void
+}
+
+define spir_kernel void @test_isnanf(float addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[id] = isnan(in[id]), id = get_global_id(0); the kernel selected by -k in this file
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %src = getelementptr inbounds float, float addrspace(1)* %in, i64 %gid
+  %val = load float, float addrspace(1)* %src, align 4
+  %res = call spir_func i32 @_Z5isnanf(float %val)
+  %dst = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %gid
+  store i32 %res, i32 addrspace(1)* %dst, align 4
+  ret void
+}
+
+define spir_kernel void @test_isnand(double addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[id] = isnan(in[id]), id = get_global_id(0)
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %src = getelementptr inbounds double, double addrspace(1)* %in, i64 %gid
+  %val = load double, double addrspace(1)* %src, align 8
+  %res = call spir_func i32 @_Z5isnand(double %val)
+  %dst = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %gid
+  store i32 %res, i32 addrspace(1)* %dst, align 4
+  ret void
+}
+
+define spir_kernel void @test_isnanDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { ; out[id] = isnan(in[id]), id = get_global_id(0)
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %src = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %gid
+  %val = load <4 x float>, <4 x float> addrspace(1)* %src, align 16
+  %res = call spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float> %val)
+  %dst = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %gid
+  store <4 x i32> %res, <4 x i32> addrspace(1)* %dst, align 16
+  ret void
+}
+
+define spir_kernel void @test_isnanDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { ; out[id] = isnan(in[id]), id = get_global_id(0)
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %src = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %gid
+  %val = load <4 x double>, <4 x double> addrspace(1)* %src, align 32
+  %res = call spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double> %val)
+  %dst = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %gid
+  store <4 x i64> %res, <4 x i64> addrspace(1)* %dst, align 32
+  ret void
+}
+
+define spir_kernel void @test_signbitf(float addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[id] = signbit(in[id]), id = get_global_id(0)
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %src = getelementptr inbounds float, float addrspace(1)* %in, i64 %gid
+  %val = load float, float addrspace(1)* %src, align 4
+  %res = call spir_func i32 @_Z7signbitf(float %val)
+  %dst = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %gid
+  store i32 %res, i32 addrspace(1)* %dst, align 4
+  ret void
+}
+
+define spir_kernel void @test_signbitd(double addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[id] = signbit(in[id]), id = get_global_id(0)
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %src = getelementptr inbounds double, double addrspace(1)* %in, i64 %gid
+  %val = load double, double addrspace(1)* %src, align 8
+  %res = call spir_func i32 @_Z7signbitd(double %val)
+  %dst = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %gid
+  store i32 %res, i32 addrspace(1)* %dst, align 4
+  ret void
+}
+
+define spir_kernel void @test_signbitDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { ; out[id] = signbit(in[id]), id = get_global_id(0)
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %src = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %gid
+  %val = load <4 x float>, <4 x float> addrspace(1)* %src, align 16
+  %res = call spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float> %val)
+  %dst = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %gid
+  store <4 x i32> %res, <4 x i32> addrspace(1)* %dst, align 16
+  ret void
+}
+
+define spir_kernel void @test_signbitDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { ; out[id] = signbit(in[id]), id = get_global_id(0)
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %src = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %gid
+  %val = load <4 x double>, <4 x double> addrspace(1)* %src, align 32
+  %res = call spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double> %val)
+  %dst = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %gid
+  store <4 x i64> %res, <4 x i64> addrspace(1)* %dst, align 32
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_test_isnanf
+; CHECK: and <4 x i32>
+; CHECK: icmp eq <4 x i32>
+; CHECK: and <4 x i32>
+; CHECK: icmp ne <4 x i32>
+; CHECK: and <4 x i1>
+; CHECK: zext <4 x i1>
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalDv4_d.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalDv4_d.ll
new file mode 100644
index 0000000000000..935a7ac56a58b
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalDv4_d.ll
@@ -0,0 +1,53 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_isnormalDv4_d -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown" ; 64-bit SPIR triple
+
+declare spir_func i64 @_Z13get_global_idj(i32) ; Itanium-mangled get_global_id(unsigned int)
+declare spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double>) ; Itanium-mangled isnormal(<4 x double>); the builtin exercised by this test
+
+define spir_kernel void @test_isnormalDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { ; out[id] = isnormal(in[id]), id = get_global_id(0)
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %src = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %gid
+  %val = load <4 x double>, <4 x double> addrspace(1)* %src, align 32
+  %res = call spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double> %val)
+  %dst = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %gid
+  store <4 x i64> %res, <4 x i64> addrspace(1)* %dst, align 32
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_test_isnormalDv4_d
+; CHECK: and <4 x i64>
+; CHECK: and <4 x i64>
+; CHECK: and <4 x i64>
+; CHECK: and <4 x i64>
+; CHECK: add nsw <4 x i64>
+; CHECK: add nsw <4 x i64>
+; CHECK: add nsw <4 x i64>
+; CHECK: add nsw <4 x i64>
+; CHECK: icmp ult <4 x i64>
+; CHECK: icmp ult <4 x i64>
+; CHECK: icmp ult <4 x i64>
+; CHECK: icmp ult <4 x i64>
+; CHECK: sext <4 x i1>
+; CHECK: sext <4 x i1>
+; CHECK: sext <4 x i1>
+; CHECK: sext <4 x i1>
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalDv4_f.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalDv4_f.ll
new file mode 100644
index 0000000000000..492e9f78dcdd3
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalDv4_f.ll
@@ -0,0 +1,53 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_isnormalDv4_f -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown" ; 64-bit SPIR triple
+
+declare spir_func i64 @_Z13get_global_idj(i32) ; Itanium-mangled get_global_id(unsigned int)
+declare spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float>) ; Itanium-mangled isnormal(<4 x float>); the builtin exercised by this test
+
+define spir_kernel void @test_isnormalDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { ; out[id] = isnormal(in[id]), id = get_global_id(0)
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %src = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %gid
+  %val = load <4 x float>, <4 x float> addrspace(1)* %src, align 16
+  %res = call spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float> %val)
+  %dst = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %gid
+  store <4 x i32> %res, <4 x i32> addrspace(1)* %dst, align 16
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_test_isnormalDv4_f
+; CHECK: and <4 x i32>
+; CHECK: and <4 x i32>
+; CHECK: and <4 x i32>
+; CHECK: and <4 x i32>
+; CHECK: add nsw <4 x i32>
+; CHECK: add nsw <4 x i32>
+; CHECK: add nsw <4 x i32>
+; CHECK: add nsw <4 x i32>
+; CHECK: icmp ult <4 x i32>
+; CHECK: icmp ult <4 x i32>
+; CHECK: icmp ult <4 x i32>
+; CHECK: icmp ult <4 x i32>
+; CHECK: sext <4 x i1>
+; CHECK: sext <4 x i1>
+; CHECK: sext <4 x i1>
+; CHECK: sext <4 x i1>
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormald.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormald.ll
new file mode 100644
index 0000000000000..fb4b861638f6b
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormald.ll
@@ -0,0 +1,269 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_isnormald -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func i32 @_Z5isinfd(double)
+declare spir_func i32 @_Z5isinff(float)
+declare spir_func i32 @_Z5isnand(double)
+declare spir_func i32 @_Z5isnanf(float)
+declare spir_func i32 @_Z7signbitd(double)
+declare spir_func i32 @_Z7signbitf(float)
+declare spir_func i32 @_Z8isfinited(double)
+declare spir_func i32 @_Z8isfinitef(float)
+declare spir_func i32 @_Z8isnormald(double)
+declare spir_func i32 @_Z8isnormalf(float)
+declare spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float>)
+declare spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float>)
+declare spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float>)
+declare spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float>)
+declare spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float>)
+declare spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double>)
+declare spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double>)
+declare spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double>)
+declare spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double>)
+declare spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double>)
+
+define spir_kernel void @test_isfinitef(float addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[gid] = isfinite(float in[gid]) as i32, gid = get_global_id(0); not named by -k in this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
+  %0 = load float, float addrspace(1)* %arrayidx, align 4
+  %call1 = call spir_func i32 @_Z8isfinitef(float %0) ; Itanium-mangled isfinite(float)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_isfinited(double addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[gid] = isfinite(double in[gid]) as i32, gid = get_global_id(0); not named by -k in this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
+  %0 = load double, double addrspace(1)* %arrayidx, align 8
+  %call1 = call spir_func i32 @_Z8isfinited(double %0) ; Itanium-mangled isfinite(double)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_isfiniteDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { ; out[gid] = isfinite(<4 x float> in[gid]) as <4 x i32>, gid = get_global_id(0); not named by -k in this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
+  %call1 = call spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float> %0) ; Itanium-mangled isfinite(<4 x float>)
+  %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call
+  store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16
+  ret void
+}
+
+define spir_kernel void @test_isfiniteDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { ; out[gid] = isfinite(<4 x double> in[gid]) as <4 x i64>, gid = get_global_id(0); not named by -k in this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
+  %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+  %call1 = call spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double> %0) ; Itanium-mangled isfinite(<4 x double>)
+  %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call
+  store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32
+  ret void
+}
+
+define spir_kernel void @test_isinff(float addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[gid] = isinf(float in[gid]) as i32, gid = get_global_id(0); not named by -k in this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
+  %0 = load float, float addrspace(1)* %arrayidx, align 4
+  %call1 = call spir_func i32 @_Z5isinff(float %0) ; Itanium-mangled isinf(float)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_isinfd(double addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[gid] = isinf(double in[gid]) as i32, gid = get_global_id(0); not named by -k in this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
+  %0 = load double, double addrspace(1)* %arrayidx, align 8
+  %call1 = call spir_func i32 @_Z5isinfd(double %0) ; Itanium-mangled isinf(double)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_isinfDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { ; out[gid] = isinf(<4 x float> in[gid]) as <4 x i32>, gid = get_global_id(0); not named by -k in this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
+  %call1 = call spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float> %0) ; Itanium-mangled isinf(<4 x float>)
+  %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call
+  store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16
+  ret void
+}
+
+define spir_kernel void @test_isinfDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { ; out[gid] = isinf(<4 x double> in[gid]) as <4 x i64>, gid = get_global_id(0); not named by -k in this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
+  %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+  %call1 = call spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double> %0) ; Itanium-mangled isinf(<4 x double>)
+  %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call
+  store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32
+  ret void
+}
+
+define spir_kernel void @test_isnormalf(float addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[gid] = isnormal(float in[gid]) as i32, gid = get_global_id(0); not named by -k in this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
+  %0 = load float, float addrspace(1)* %arrayidx, align 4
+  %call1 = call spir_func i32 @_Z8isnormalf(float %0) ; Itanium-mangled isnormal(float)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_isnormald(double addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[gid] = isnormal(double in[gid]) as i32, gid = get_global_id(0); kernel named by -k in this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
+  %0 = load double, double addrspace(1)* %arrayidx, align 8
+  %call1 = call spir_func i32 @_Z8isnormald(double %0) ; Itanium-mangled isnormal(double)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_isnormalDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { ; out[gid] = isnormal(<4 x float> in[gid]) as <4 x i32>, gid = get_global_id(0); not named by -k in this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
+  %call1 = call spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float> %0) ; Itanium-mangled isnormal(<4 x float>)
+  %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call
+  store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16
+  ret void
+}
+
+define spir_kernel void @test_isnormalDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { ; out[gid] = isnormal(<4 x double> in[gid]) as <4 x i64>, gid = get_global_id(0); not named by -k in this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
+  %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+  %call1 = call spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double> %0) ; Itanium-mangled isnormal(<4 x double>)
+  %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call
+  store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32
+  ret void
+}
+
+define spir_kernel void @test_isnanf(float addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[gid] = isnan(float in[gid]) as i32, gid = get_global_id(0); not named by -k in this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
+  %0 = load float, float addrspace(1)* %arrayidx, align 4
+  %call1 = call spir_func i32 @_Z5isnanf(float %0) ; Itanium-mangled isnan(float)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_isnand(double addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[gid] = isnan(double in[gid]) as i32, gid = get_global_id(0); not named by -k in this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
+  %0 = load double, double addrspace(1)* %arrayidx, align 8
+  %call1 = call spir_func i32 @_Z5isnand(double %0) ; Itanium-mangled isnan(double)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_isnanDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { ; out[gid] = isnan(<4 x float> in[gid]) as <4 x i32>, gid = get_global_id(0); not named by -k in this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
+  %call1 = call spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float> %0) ; Itanium-mangled isnan(<4 x float>)
+  %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call
+  store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16
+  ret void
+}
+
+define spir_kernel void @test_isnanDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { ; out[gid] = isnan(<4 x double> in[gid]) as <4 x i64>, gid = get_global_id(0); not named by -k in this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
+  %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+  %call1 = call spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double> %0) ; Itanium-mangled isnan(<4 x double>)
+  %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call
+  store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32
+  ret void
+}
+
+define spir_kernel void @test_signbitf(float addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[gid] = signbit(float in[gid]) as i32, gid = get_global_id(0); not named by -k in this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
+  %0 = load float, float addrspace(1)* %arrayidx, align 4
+  %call1 = call spir_func i32 @_Z7signbitf(float %0) ; Itanium-mangled signbit(float)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_signbitd(double addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[gid] = signbit(double in[gid]) as i32, gid = get_global_id(0); not named by -k in this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
+  %0 = load double, double addrspace(1)* %arrayidx, align 8
+  %call1 = call spir_func i32 @_Z7signbitd(double %0) ; Itanium-mangled signbit(double)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_signbitDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { ; out[gid] = signbit(<4 x float> in[gid]) as <4 x i32>, gid = get_global_id(0); not named by -k in this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
+  %call1 = call spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float> %0) ; Itanium-mangled signbit(<4 x float>)
+  %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call
+  store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16
+  ret void
+}
+
+define spir_kernel void @test_signbitDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { ; out[gid] = signbit(<4 x double> in[gid]) as <4 x i64>, gid = get_global_id(0); not named by -k in this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
+  %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+  %call1 = call spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double> %0) ; Itanium-mangled signbit(<4 x double>)
+  %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call
+  store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_test_isnormald
+; CHECK: and <4 x i64>
+; CHECK: add nsw <4 x i64>
+; CHECK: icmp ult <4 x i64>
+; CHECK: zext <4 x i1>
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalf.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalf.ll
new file mode 100644
index 0000000000000..d2d55fc36d237
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalf.ll
@@ -0,0 +1,269 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_isnormalf -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func i32 @_Z5isinfd(double)
+declare spir_func i32 @_Z5isinff(float)
+declare spir_func i32 @_Z5isnand(double)
+declare spir_func i32 @_Z5isnanf(float)
+declare spir_func i32 @_Z7signbitd(double)
+declare spir_func i32 @_Z7signbitf(float)
+declare spir_func i32 @_Z8isfinited(double)
+declare spir_func i32 @_Z8isfinitef(float)
+declare spir_func i32 @_Z8isnormald(double)
+declare spir_func i32 @_Z8isnormalf(float)
+declare spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float>)
+declare spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float>)
+declare spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float>)
+declare spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float>)
+declare spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float>)
+declare spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double>)
+declare spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double>)
+declare spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double>)
+declare spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double>)
+declare spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double>)
+
+define spir_kernel void @test_isfinitef(float addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[gid] = isfinite(float in[gid]) as i32, gid = get_global_id(0); not named by -k in this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
+  %0 = load float, float addrspace(1)* %arrayidx, align 4
+  %call1 = call spir_func i32 @_Z8isfinitef(float %0) ; Itanium-mangled isfinite(float)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_isfinited(double addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[gid] = isfinite(double in[gid]) as i32, gid = get_global_id(0); not named by -k in this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
+  %0 = load double, double addrspace(1)* %arrayidx, align 8
+  %call1 = call spir_func i32 @_Z8isfinited(double %0) ; Itanium-mangled isfinite(double)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_isfiniteDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { ; out[gid] = isfinite(<4 x float> in[gid]) as <4 x i32>, gid = get_global_id(0); not named by -k in this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
+  %call1 = call spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float> %0) ; Itanium-mangled isfinite(<4 x float>)
+  %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call
+  store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16
+  ret void
+}
+
+define spir_kernel void @test_isfiniteDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { ; out[gid] = isfinite(<4 x double> in[gid]) as <4 x i64>, gid = get_global_id(0); not named by -k in this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
+  %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+  %call1 = call spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double> %0) ; Itanium-mangled isfinite(<4 x double>)
+  %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call
+  store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32
+  ret void
+}
+
+define spir_kernel void @test_isinff(float addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[gid] = isinf(float in[gid]) as i32, gid = get_global_id(0); not named by -k in this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
+  %0 = load float, float addrspace(1)* %arrayidx, align 4
+  %call1 = call spir_func i32 @_Z5isinff(float %0) ; Itanium-mangled isinf(float)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_isinfd(double addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[gid] = isinf(double in[gid]) as i32, gid = get_global_id(0); not named by -k in this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
+  %0 = load double, double addrspace(1)* %arrayidx, align 8
+  %call1 = call spir_func i32 @_Z5isinfd(double %0) ; Itanium-mangled isinf(double)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_isinfDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { ; out[gid] = isinf(<4 x float> in[gid]) as <4 x i32>, gid = get_global_id(0); not named by -k in this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
+  %call1 = call spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float> %0) ; Itanium-mangled isinf(<4 x float>)
+  %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call
+  store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16
+  ret void
+}
+
+define spir_kernel void @test_isinfDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { ; out[gid] = isinf(<4 x double> in[gid]) as <4 x i64>, gid = get_global_id(0); not named by -k in this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
+  %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+  %call1 = call spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double> %0) ; Itanium-mangled isinf(<4 x double>)
+  %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call
+  store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32
+  ret void
+}
+
+define spir_kernel void @test_isnormalf(float addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[gid] = isnormal(float in[gid]) as i32, gid = get_global_id(0); kernel named by -k in this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
+  %0 = load float, float addrspace(1)* %arrayidx, align 4
+  %call1 = call spir_func i32 @_Z8isnormalf(float %0) ; Itanium-mangled isnormal(float)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_isnormald(double addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[gid] = isnormal(double in[gid]) as i32, gid = get_global_id(0); not named by -k in this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
+  %0 = load double, double addrspace(1)* %arrayidx, align 8
+  %call1 = call spir_func i32 @_Z8isnormald(double %0) ; Itanium-mangled isnormal(double)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_isnormalDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { ; out[gid] = isnormal(<4 x float> in[gid]) as <4 x i32>, gid = get_global_id(0); not named by -k in this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
+  %call1 = call spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float> %0) ; Itanium-mangled isnormal(<4 x float>)
+  %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call
+  store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16
+  ret void
+}
+
+define spir_kernel void @test_isnormalDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { ; out[gid] = isnormal(<4 x double> in[gid]) as <4 x i64>, gid = get_global_id(0); not named by -k in this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
+  %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+  %call1 = call spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double> %0) ; Itanium-mangled isnormal(<4 x double>)
+  %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call
+  store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32
+  ret void
+}
+
+define spir_kernel void @test_isnanf(float addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[gid] = isnan(float in[gid]) as i32, gid = get_global_id(0); not named by -k in this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
+  %0 = load float, float addrspace(1)* %arrayidx, align 4
+  %call1 = call spir_func i32 @_Z5isnanf(float %0) ; Itanium-mangled isnan(float)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_isnand(double addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[gid] = isnan(double in[gid]) as i32, gid = get_global_id(0); not named by -k in this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
+  %0 = load double, double addrspace(1)* %arrayidx, align 8
+  %call1 = call spir_func i32 @_Z5isnand(double %0) ; Itanium-mangled isnan(double)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_isnanDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { ; out[gid] = isnan(<4 x float> in[gid]) as <4 x i32>, gid = get_global_id(0); not named by -k in this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
+  %call1 = call spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float> %0) ; Itanium-mangled isnan(<4 x float>)
+  %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call
+  store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16
+  ret void
+}
+
+define spir_kernel void @test_isnanDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { ; out[gid] = isnan(<4 x double> in[gid]) as <4 x i64>, gid = get_global_id(0); not named by -k in this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
+  %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+  %call1 = call spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double> %0) ; Itanium-mangled isnan(<4 x double>)
+  %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call
+  store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32
+  ret void
+}
+
+define spir_kernel void @test_signbitf(float addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[gid] = signbit(float in[gid]) as i32, gid = get_global_id(0); not named by -k in this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
+  %0 = load float, float addrspace(1)* %arrayidx, align 4
+  %call1 = call spir_func i32 @_Z7signbitf(float %0) ; Itanium-mangled signbit(float)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_signbitd(double addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[gid] = signbit(double in[gid]) as i32, gid = get_global_id(0); not named by -k in this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
+  %0 = load double, double addrspace(1)* %arrayidx, align 8
+  %call1 = call spir_func i32 @_Z7signbitd(double %0) ; Itanium-mangled signbit(double)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+define spir_kernel void @test_signbitDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { ; out[gid] = signbit(<4 x float> in[gid]) as <4 x i32>, gid = get_global_id(0); not named by -k in this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
+  %call1 = call spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float> %0) ; Itanium-mangled signbit(<4 x float>)
+  %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call
+  store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16
+  ret void
+}
+
+define spir_kernel void @test_signbitDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { ; out[gid] = signbit(<4 x double> in[gid]) as <4 x i64>, gid = get_global_id(0); not named by -k in this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
+  %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+  %call1 = call spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double> %0) ; Itanium-mangled signbit(<4 x double>)
+  %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call
+  store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_test_isnormalf
+; CHECK: and <4 x i32>
+; CHECK: add nsw <4 x i32>
+; CHECK: icmp ult <4 x i32>
+; CHECK: zext <4 x i1>
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/opencl_metadata1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/opencl_metadata1.ll
new file mode 100644
index 0000000000000..eb9c8fcfa878f
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/opencl_metadata1.ll
@@ -0,0 +1,77 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test(i32 addrspace(2)* %in, i32 addrspace(1)* %out, i8 addrspace(2)* %text, double %f) { ; out[gid] = in[gid] (i32), gid = get_global_id(0); %text and %f are never read in the body; kernel named by -k in this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(2)* %in, i64 %call
+  %0 = load i32, i32 addrspace(2)* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %0, i32 addrspace(1)* %arrayidx1, align 4
+  ret void
+}
+
+define spir_kernel void @second_test(i32 %a, i32 %b) { ; body only calls get_global_id(0) and never uses the result; %a and %b are never read; not named by -k in this file's RUN line
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+!opencl.kernels = !{!0, !6}
+!opencl.kernel_wg_size_info = !{!12}
+!llvm.ident = !{!13}
+
+!0 = !{void (i32 addrspace(2)*, i32 addrspace(1)*, i8 addrspace(2)*, double)* @test, !1, !2, !3, !4, !5}
+!1 = !{!"kernel_arg_addr_space", i32 2, i32 1, i32 2, i32 0}
+!2 = !{!"kernel_arg_access_qual", !"none", !"none", !"none", !"none"}
+!3 = !{!"kernel_arg_type", !"int*", !"int*", !"char*", !"double"}
+!4 = !{!"kernel_arg_base_type", !"int*", !"int*", !"char*", !"double"}
+!5 = !{!"kernel_arg_type_qual", !"const", !"", !"const", !""}
+!6 = !{void (i32, i32)* @second_test, !7, !8, !9, !10, !11}
+!7 = !{!"kernel_arg_addr_space", i32 0, i32 0}
+!8 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!9 = !{!"kernel_arg_type", !"int", !"int"}
+!10 = !{!"kernel_arg_base_type", !"int", !"int"}
+!11 = !{!"kernel_arg_type_qual", !"", !""}
+!12 = !{void (i32 addrspace(2)*, i32 addrspace(1)*, i8 addrspace(2)*, double)* @test, i32 16, i32 1, i32 1, i1 true}
+!13 = !{!"clang version 3.8.1 "}
+
+; Sanity checking
+;CHECK-DAG: define spir_kernel void @test(ptr addrspace(2) %in, ptr addrspace(1) %out, ptr addrspace(2) %text, double %f)
+;CHECK-DAG: define spir_kernel void @__vecz_v4_test(ptr addrspace(2) %in, ptr addrspace(1) %out, ptr addrspace(2) %text, double %f)
+
+; Check if we have the metadata for the kernels
+; CHECK: !opencl.kernels = !{![[MD0:[0-9]+]], ![[MD6:[0-9]+]], ![[MD12:[0-9]+]]}
+; CHECK: !opencl.kernel_wg_size_info = !{![[MD13:[0-9]+]], ![[MD14:[0-9]+]]}
+; CHECK: !llvm.ident = !{![[MD15:[0-9]+]]}
+
+; Check the actual metadata
+; CHECK: ![[MD0]] = !{ptr @test, ![[MD1:[0-9]+]], ![[MD2:[0-9]+]], ![[MD3:[0-9]+]], ![[MD4:[0-9]+]], ![[MD5:[0-9]+]]}
+; CHECK: ![[MD1]] = !{!"kernel_arg_addr_space", i32 2, i32 1, i32 2, i32 0}
+; CHECK: ![[MD2]] = !{!"kernel_arg_access_qual", !"none", !"none", !"none", !"none"}
+; CHECK: ![[MD3]] = !{!"kernel_arg_type", !"int*", !"int*", !"char*", !"double"}
+; CHECK: ![[MD4]] = !{!"kernel_arg_base_type", !"int*", !"int*", !"char*", !"double"}
+; CHECK: ![[MD5]] = !{!"kernel_arg_type_qual", !"const", !"", !"const", !""}
+; CHECK: ![[MD12]] = !{ptr @__vecz_v4_test, ![[MD1]], ![[MD2]], ![[MD3]], ![[MD4]], ![[MD5]]}
+; CHECK: ![[MD13]] = !{ptr @test, i32 16, i32 1, i32 1, i1 true}
+; CHECK: ![[MD14]] = !{ptr @__vecz_v4_test, i32 16, i32 1, i32 1, i1 true}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/opencl_metadata2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/opencl_metadata2.ll
new file mode 100644
index 0000000000000..1da14c05fff68
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/opencl_metadata2.ll
@@ -0,0 +1,76 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k second_test -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test(i32 addrspace(2)* %in, i32 addrspace(1)* %out, i8 addrspace(2)* %text, double %f) { ; not the kernel named by the RUN line (-k second_test); %text and %f are unused
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; mangled get_global_id(0)
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(2)* %in, i64 %call
+  %0 = load i32, i32 addrspace(2)* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %0, i32 addrspace(1)* %arrayidx1, align 4 ; out[id] = in[id]
+  ret void
+}
+
+define spir_kernel void @second_test(i32 %a, i32 %b) { ; kernel named by the RUN line (-k second_test); neither argument is used
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; result is never used
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+!opencl.kernels = !{!0, !6}
+!opencl.kernel_wg_size_info = !{!12}
+!llvm.ident = !{!13}
+
+!0 = !{void (i32 addrspace(2)*, i32 addrspace(1)*, i8 addrspace(2)*, double)* @test, !1, !2, !3, !4, !5}
+!1 = !{!"kernel_arg_addr_space", i32 2, i32 1, i32 2, i32 0}
+!2 = !{!"kernel_arg_access_qual", !"none", !"none", !"none", !"none"}
+!3 = !{!"kernel_arg_type", !"int*", !"int*", !"char*", !"double"}
+!4 = !{!"kernel_arg_base_type", !"int*", !"int*", !"char*", !"double"}
+!5 = !{!"kernel_arg_type_qual", !"const", !"", !"const", !""}
+!6 = !{void (i32, i32)* @second_test, !7, !8, !9, !10, !11}
+!7 = !{!"kernel_arg_addr_space", i32 0, i32 0}
+!8 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!9 = !{!"kernel_arg_type", !"int", !"int"}
+!10 = !{!"kernel_arg_base_type", !"int", !"int"}
+!11 = !{!"kernel_arg_type_qual", !"", !""}
+!12 = !{void (i32 addrspace(2)*, i32 addrspace(1)*, i8 addrspace(2)*, double)* @test, i32 16, i32 1, i32 1, i1 true}
+!13 = !{!"clang version 3.8.1 "}
+
+; Sanity checking
+; CHECK: define spir_kernel void @second_test(i32 %a, i32 %b)
+; CHECK: define spir_kernel void @__vecz_v4_second_test(i32 %a, i32 %b)
+
+; Check if we have the metadata for the kernels
+; CHECK: !opencl.kernels = !{![[MD0:[0-9]+]], ![[MD6:[0-9]+]], ![[MD12:[0-9]+]]}
+; CHECK: !opencl.kernel_wg_size_info = !{![[MD13:[0-9]+]]}
+; CHECK: !llvm.ident = !{![[MD14:[0-9]+]]}
+
+; Check the actual metadata
+; CHECK: ![[MD6]] = !{ptr @second_test, ![[MD7:[0-9]+]], ![[MD8:[0-9]+]], ![[MD9:[0-9]+]], ![[MD10:[0-9]+]], ![[MD11:[0-9]+]]}
+; CHECK: ![[MD7]] = !{!"kernel_arg_addr_space", i32 0, i32 0}
+; CHECK: ![[MD8]] = !{!"kernel_arg_access_qual", !"none", !"none"}
+; CHECK: ![[MD9]] = !{!"kernel_arg_type", !"int", !"int"}
+; CHECK: ![[MD10]] = !{!"kernel_arg_base_type", !"int", !"int"}
+; CHECK: ![[MD11]] = !{!"kernel_arg_type_qual", !"", !""}
+; CHECK: ![[MD12]] = !{ptr @__vecz_v4_second_test, ![[MD7]], ![[MD8]], ![[MD9]], ![[MD10]], ![[MD11]]}
+; CHECK: ![[MD13]] = !{ptr @test, i32 16, i32 1, i32 1, i1 true}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/overaligned_allocas.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/overaligned_allocas.ll
new file mode 100644
index 0000000000000..aaf38c1333647
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/overaligned_allocas.ll
@@ -0,0 +1,80 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test -vecz-simd-width=4 -vecz-auto -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+@entry_test_alloca.lm = internal unnamed_addr addrspace(3) constant [16 x <2 x float>] undef, align 8
+
+define spir_kernel void @test(<2 x float> addrspace(1)* nocapture readonly %in, <2 x float> addrspace(1)* nocapture %out, i32 %offset) local_unnamed_addr { ; kernel named by the RUN line (-k test); %in is never read
+entry:
+  %a.sroa.0 = alloca <2 x float>, align 16 ; align 16, while every access to it below uses align 8
+  %b.sroa.2 = alloca <2 x float>, align 16 ; align 16, while every access to it below uses align 8
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call1 = tail call spir_func i64 @_Z12get_local_idj(i32 0)
+  %a.sroa.0.0..sroa_cast = bitcast <2 x float>* %a.sroa.0 to i8* ; unused
+  %b.sroa.2.0..sroa_cast = bitcast <2 x float>* %b.sroa.2 to i8* ; unused
+  %arrayidx2 = getelementptr inbounds [16 x <2 x float>], [16 x <2 x float>] addrspace(3)* @entry_test_alloca.lm, i64 0, i64 %call1
+  %0 = load <2 x float>, <2 x float> addrspace(3)* %arrayidx2, align 8 ; element at index %call1
+  %conv = sext i32 %offset to i64
+  %add = add i64 %call1, %conv
+  %arrayidx4 = getelementptr inbounds [16 x <2 x float>], [16 x <2 x float>] addrspace(3)* @entry_test_alloca.lm, i64 0, i64 %add
+  %1 = load <2 x float>, <2 x float> addrspace(3)* %arrayidx4, align 8 ; element at index %call1 + %offset
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup10
+  %mul.le.le = fmul <2 x float> %a.sroa.0.0.a.sroa.0.0.a.sroa.0.0., %b.sroa.2.0.b.sroa.2.0.b.sroa.2.8. ; operands are the volatile loads defined in for.body11
+  %arrayidx17 = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %out, i64 %call
+  store <2 x float> %mul.le.le, <2 x float> addrspace(1)* %arrayidx17, align 8 ; out[%call] = product
+  ret void
+
+for.body:                                         ; preds = %for.cond.cleanup10, %entry
+  %i.038 = phi i32 [ 0, %entry ], [ %inc15, %for.cond.cleanup10 ] ; outer counter, 0..15
+  store volatile <2 x float> %0, <2 x float>* %a.sroa.0, align 8
+  store volatile <2 x float> %1, <2 x float>* %b.sroa.2, align 8
+  br label %for.body11
+
+for.cond.cleanup10:                               ; preds = %for.body11
+  %inc15 = add nuw nsw i32 %i.038, 1
+  %cmp = icmp ult i32 %inc15, 16
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.body11:                                       ; preds = %for.body11, %for.body
+  %i6.037 = phi i32 [ 0, %for.body ], [ %inc, %for.body11 ] ; inner counter, 0..15
+  %a.sroa.0.0.a.sroa.0.0.a.sroa.0.0. = load volatile <2 x float>, <2 x float>* %a.sroa.0, align 8
+  %b.sroa.2.0.b.sroa.2.0.b.sroa.2.8. = load volatile <2 x float>, <2 x float>* %b.sroa.2, align 8
+  %inc = add nuw nsw i32 %i6.037, 1
+  %cmp8 = icmp ult i32 %inc, 16
+  br i1 %cmp8, label %for.body11, label %for.cond.cleanup10
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32) local_unnamed_addr
+declare spir_func i64 @_Z12get_local_idj(i32) local_unnamed_addr
+
+; Check that all the allocas come before anything else
+; CHECK: define spir_kernel void @__vecz_v4_test(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: %a.sroa.{{[0-9]+}} = alloca <2 x float>, align 16
+; CHECK-NEXT: %a.sroa.{{[0-9]+}} = alloca <2 x float>, align 16
+; CHECK-NEXT: %a.sroa.{{[0-9]+}} = alloca <2 x float>, align 16
+; CHECK-NEXT: %a.sroa.{{[0-9]+}} = alloca <2 x float>, align 16
+; CHECK-NEXT: %b.sroa.{{[0-9]+}} = alloca <2 x float>, align 16
+; CHECK-NEXT: %b.sroa.{{[0-9]+}} = alloca <2 x float>, align 16
+; CHECK-NEXT: %b.sroa.{{[0-9]+}} = alloca <2 x float>, align 16
+; CHECK-NEXT: %b.sroa.{{[0-9]+}} = alloca <2 x float>, align 16
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_branch.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_branch.ll
new file mode 100644
index 0000000000000..b105a9b6adf77
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_branch.ll
@@ -0,0 +1,66 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_branch -vecz-passes=cfg-convert,packetizer -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test_branch(i32 %a, i32* %b) { ; kernel named by the RUN line (-k test_branch)
+entry:
+  %conv = sext i32 %a to i64
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; mangled get_global_id(0)
+  %cmp = icmp eq i64 %conv, %call ; branch condition compares the argument with the global id
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+  %idxprom = sext i32 %a to i64 ; same computation as %conv
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+  store i32 11, i32* %arrayidx, align 4 ; b[a] = 11
+  br label %if.end
+
+if.else:
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 42
+  store i32 13, i32* %arrayidx2, align 4 ; b[42] = 13
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; This test checks if the branch conditions and the branch BBs are vectorized
+; and masked properly
+; CHECK: define spir_kernel void @__vecz_v4_test_branch(i32 %a, ptr %b)
+; CHECK: %conv = sext i32 %a to i64
+; CHECK: %[[A_SPLATINSERT:.+]] = insertelement <4 x i64> {{poison|undef}}, i64 %conv, {{i32|i64}} 0
+; CHECK: %[[A_SPLAT:.+]] = shufflevector <4 x i64> %[[A_SPLATINSERT]], <4 x i64> {{poison|undef}}, <4 x i32> zeroinitializer
+; CHECK: %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: %[[GID_SPLATINSERT:.+]] = insertelement <4 x i64> {{poison|undef}}, i64 %call, {{i32|i64}} 0
+; CHECK: %[[GID_SPLAT:.+]] = shufflevector <4 x i64> %[[GID_SPLATINSERT]], <4 x i64> {{poison|undef}}, <4 x i32> zeroinitializer
+; CHECK: %[[GID:.+]] = add <4 x i64> %[[GID_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK: %[[CMP3:.+]] = icmp eq <4 x i64> %[[A_SPLAT]], %[[GID]]
+; CHECK: %[[NOT_CMP4:.+]] = xor <4 x i1> %[[CMP3]], <i1 true, i1 true, i1 true, i1 true>
+
+; CHECK: %[[IDX:.+]] = sext i32 %a to i64
+; CHECK: %[[GEP1:.+]] = getelementptr inbounds i32, ptr %b, i64 %[[IDX]]
+; CHECK: call void @__vecz_b_masked_store4_ju3ptrb(i32 11, ptr %[[GEP1]], i1 %{{any_of_mask[0-9]*}})
+
+; CHECK: %[[GEP2:.+]] = getelementptr inbounds i32, ptr %b, i64 42
+; CHECK: call void @__vecz_b_masked_store4_ju3ptrb(i32 13, ptr %[[GEP2]], i1 %{{any_of_mask[0-9]*}})
+
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_debug_info.ll
new file mode 100644
index 0000000000000..0cecd48c243c0
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_debug_info.ll
@@ -0,0 +1,167 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; Check that debug info is preserved in the vectorized kernel.
+; Specifically that the packetization pass creates vector types
+; in the DI for the variables.
+; RUN: %veczc -k add -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @add(i32 addrspace(1)* %in1, i32 addrspace(1)* %in2, i32 addrspace(1)* %out) #0 !dbg !4 { ; kernel named by the RUN line (-k add); !4 is its DISubprogram
+entry:
+  %in1.addr = alloca i32 addrspace(1)*, align 8
+  %in2.addr = alloca i32 addrspace(1)*, align 8
+  %out.addr = alloca i32 addrspace(1)*, align 8
+  %tid = alloca i64, align 8
+  %a = alloca i32, align 4
+  %b = alloca i32, align 4
+  store i32 addrspace(1)* %in1, i32 addrspace(1)** %in1.addr, align 8 ; each argument is spilled to its own stack slot
+  call void @llvm.dbg.declare(metadata i32 addrspace(1)** %in1.addr, metadata !11, metadata !29), !dbg !30 ; !11 is the DILocalVariable "in1"
+  store i32 addrspace(1)* %in2, i32 addrspace(1)** %in2.addr, align 8
+  call void @llvm.dbg.declare(metadata i32 addrspace(1)** %in2.addr, metadata !12, metadata !29), !dbg !30 ; !12 is "in2"
+  store i32 addrspace(1)* %out, i32 addrspace(1)** %out.addr, align 8
+  call void @llvm.dbg.declare(metadata i32 addrspace(1)** %out.addr, metadata !13, metadata !29), !dbg !30 ; !13 is "out"
+  call void @llvm.dbg.declare(metadata i64* %tid, metadata !14, metadata !29), !dbg !31 ; !14 is "tid"
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #3, !dbg !31
+  store i64 %call, i64* %tid, align 8, !dbg !31 ; tid = get_global_id(0)
+  call void @llvm.dbg.declare(metadata i32* %a, metadata !19, metadata !29), !dbg !32 ; !19 is "a"
+  %0 = load i64, i64* %tid, align 8, !dbg !32
+  %1 = load i32 addrspace(1)*, i32 addrspace(1)** %in1.addr, align 8, !dbg !32
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %1, i64 %0, !dbg !32
+  %2 = load i32, i32 addrspace(1)* %arrayidx, align 4, !dbg !32
+  store i32 %2, i32* %a, align 4, !dbg !32 ; a = in1[tid]
+  call void @llvm.dbg.declare(metadata i32* %b, metadata !20, metadata !29), !dbg !33 ; !20 is "b"
+  %3 = load i64, i64* %tid, align 8, !dbg !33
+  %4 = load i32 addrspace(1)*, i32 addrspace(1)** %in2.addr, align 8, !dbg !33
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %4, i64 %3, !dbg !33
+  %5 = load i32, i32 addrspace(1)* %arrayidx1, align 4, !dbg !33
+  store i32 %5, i32* %b, align 4, !dbg !33 ; b = in2[tid]
+  %6 = load i32, i32* %a, align 4, !dbg !34
+  %7 = load i32, i32* %b, align 4, !dbg !34
+  %add = add nsw i32 %6, %7, !dbg !34
+  %8 = load i64, i64* %tid, align 8, !dbg !34
+  %9 = load i32 addrspace(1)*, i32 addrspace(1)** %out.addr, align 8, !dbg !34
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %9, i64 %8, !dbg !34
+  store i32 %add, i32 addrspace(1)* %arrayidx2, align 4, !dbg !34 ; out[tid] = a + b
+  ret void, !dbg !35
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+declare spir_func i64 @_Z13get_global_idj(i32) #2
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nobuiltin }
+
+!llvm.dbg.cu = !{!0}
+!opencl.kernels = !{!21}
+!llvm.module.flags = !{!27}
+!llvm.ident = !{!28}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.8.0 ", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2)
+!1 = !DIFile(filename: "<stdin>", directory: "/tmp")
+!2 = !{}
+!3 = !{!4}
+!4 = distinct !DISubprogram(name: "add", scope: !5, file: !5, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !10)
+!5 = !DIFile(filename: "kernel.opencl", directory: "/tmp")
+!6 = !DISubroutineType(types: !7)
+!7 = !{null, !8, !8, !8}
+!8 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !9, size: 64, align: 64)
+!9 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+!10 = !{!11, !12, !13, !14, !19, !20}
+!11 = !DILocalVariable(name: "in1", arg: 1, scope: !4, file: !5, line: 1, type: !8)
+!12 = !DILocalVariable(name: "in2", arg: 2, scope: !4, file: !5, line: 1, type: !8)
+!13 = !DILocalVariable(name: "out", arg: 3, scope: !4, file: !5, line: 1, type: !8)
+!14 = !DILocalVariable(name: "tid", scope: !4, file: !5, line: 3, type: !15)
+!15 = !DIDerivedType(tag: DW_TAG_typedef, name: "size_t", file: !16, line: 33, baseType: !17)
+!16 = !DIFile(filename: "/Aorta/OCL/modules/builtins/include/builtins/builtins.h", directory: "/tmp")
+!17 = !DIDerivedType(tag: DW_TAG_typedef, name: "ulong", file: !16, line: 31, baseType: !18)
+!18 = !DIBasicType(name: "long unsigned int", size: 64, align: 64, encoding: DW_ATE_unsigned)
+!19 = !DILocalVariable(name: "a", scope: !4, file: !5, line: 5, type: !9)
+!20 = !DILocalVariable(name: "b", scope: !4, file: !5, line: 6, type: !9)
+!21 = !{void (i32 addrspace(1)*, i32 addrspace(1)*, i32 addrspace(1)*)* @add, !22, !23, !24, !25, !26}
+!22 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1}
+!23 = !{!"kernel_arg_access_qual", !"none", !"none", !"none"}
+!24 = !{!"kernel_arg_type", !"int*", !"int*", !"int*"}
+!25 = !{!"kernel_arg_base_type", !"int*", !"int*", !"int*"}
+!26 = !{!"kernel_arg_type_qual", !"", !"", !""}
+!27 = !{i32 2, !"Debug Info Version", i32 3}
+!28 = !{!"clang version 3.8.0 "}
+!29 = !DIExpression()
+!30 = !DILocation(line: 1, scope: !4)
+!31 = !DILocation(line: 3, scope: !4)
+!32 = !DILocation(line: 5, scope: !4)
+!33 = !DILocation(line: 6, scope: !4)
+!34 = !DILocation(line: 7, scope: !4)
+!35 = !DILocation(line: 8, scope: !4)
+
+; Vectorized kernel function
+; CHECK: @__vecz_v[[WIDTH:[0-9]+]]_add({{.*}} !dbg [[VECZ_SUBPROG:![0-9]+]]
+
+; Check that intrinsics for user variable locations are still present
+; CHECK: call void @llvm.dbg.value(metadata ptr addrspace(1) %in1, metadata [[DI_IN1:![0-9]+]], metadata [[EXPR:!DIExpression()]]
+; CHECK-SAME: !dbg [[PARAM_LOC:![0-9]+]]
+
+; CHECK: call void @llvm.dbg.value(metadata ptr addrspace(1) %in2, metadata [[DI_IN2:![0-9]+]], metadata [[EXPR]]
+; CHECK-SAME: !dbg [[PARAM_LOC]]
+
+; CHECK: call void @llvm.dbg.value(metadata ptr addrspace(1) %out, metadata [[DI_OUT:![0-9]+]], metadata [[EXPR]]
+; CHECK-SAME: !dbg [[PARAM_LOC]]
+
+; CHECK: call void @llvm.dbg.value(metadata i64 %call, metadata [[DI_TID:![0-9]+]], metadata [[EXPR]]
+; CHECK-SAME: !dbg [[TID_LOC:![0-9]+]]
+
+; CHECK: call void @llvm.dbg.value(metadata {{.*}}, metadata [[DI_A:![0-9]+]], metadata [[EXPR]]
+; CHECK-SAME:!dbg [[A_LOC:![0-9]+]]
+
+; CHECK: call void @llvm.dbg.value(metadata {{.*}}, metadata [[DI_B:![0-9]+]], metadata [[EXPR]]
+; CHECK-SAME:!dbg [[B_LOC:![0-9]+]]
+
+; Debug info metadata entries
+; CHECK:[[PTR_TYPE:![0-9]+]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: [[DI_BASE:![0-9]+]], size: 64, align: 64)
+; CHECK:[[DI_BASE]] = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+
+; CHECK: [[VECZ_SUBPROG]] = distinct !DISubprogram(name: "add",
+; CHECK-SAME: retainedNodes: [[VECZ_VARS:![0-9]+]]
+
+; CHECK: [[VECZ_VARS]] = !{[[DI_IN1]], [[DI_IN2]], [[DI_OUT]], [[DI_TID]], [[DI_A]], [[DI_B]]}
+; CHECK: [[DI_IN1]] = !DILocalVariable(name: "in1", arg: 1, scope: [[VECZ_SUBPROG]],
+; CHECK-SAME:line: 1, type: [[PTR_TYPE]]
+; CHECK: [[DI_IN2]] = !DILocalVariable(name: "in2", arg: 2, scope: [[VECZ_SUBPROG]],
+; CHECK-SAME:line: 1, type: [[PTR_TYPE]]
+; CHECK: [[DI_OUT]] = !DILocalVariable(name: "out", arg: 3, scope: [[VECZ_SUBPROG]],
+; CHECK-SAME: line: 1, type: [[PTR_TYPE]]
+
+; CHECK: [[DI_TID]] = !DILocalVariable(name: "tid", scope: [[VECZ_SUBPROG]]
+; CHECK: [[DI_A]] = !DILocalVariable(name: "a", scope: [[VECZ_SUBPROG]],
+; CHECK-SAME:line: 5, type: [[VECTOR_TYPE:![0-9]+]])
+
+; Vectorized debug info type created in the packetization pass
+; CHECK: [[VECTOR_TYPE]] = !DICompositeType(tag: DW_TAG_array_type, baseType: [[DI_BASE]], size: {{[0-9]+}}, align: {{[0-9]+}}
+; CHECK-SAME:flags: DIFlagVector, elements: ![[DI_ELEMS:[0-9]+]])
+; CHECK:[[DI_ELEMS]] = !{[[DI_SUBRANGE:![0-9]+]]}
+; LLVM 11 adds a lowerBound argument to DISubrange, hence the optional group in the check below
+; CHECK: [[DI_SUBRANGE]] = !DISubrange(count: [[WIDTH]]{{(, lowerBound: [0-9])?}})
+
+; CHECK: [[DI_B]] = !DILocalVariable(name: "b", scope: [[VECZ_SUBPROG]],
+; CHECK-SAME: line: 6, type: [[VECTOR_TYPE]])
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_nonvarying.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_nonvarying.ll
new file mode 100644
index 0000000000000..c51911da17ce4
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_nonvarying.ll
@@ -0,0 +1,93 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_nonvarying_loadstore -vecz-passes=packetizer -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test_branch(i32 %a, i32* %b) { ; not the function named by the RUN line (-k test_nonvarying_loadstore)
+entry:
+  %conv = sext i32 %a to i64
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; mangled get_global_id(0)
+  %cmp = icmp eq i64 %conv, %call ; branch condition compares the argument with the global id
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+  %idxprom = sext i32 %a to i64 ; same computation as %conv
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+  store i32 11, i32* %arrayidx, align 4 ; b[a] = 11
+  br label %if.end
+
+if.else:
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 42
+  store i32 13, i32* %arrayidx2, align 4 ; b[42] = 13
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define spir_kernel void @test_uniform_branch(i32 %a, i32* %b) { ; not the function named by the RUN line (-k test_nonvarying_loadstore)
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; mangled get_global_id(0)
+  %cmp = icmp eq i32 %a, 42 ; branch condition depends only on the argument %a
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+  %idxprom = sext i32 %a to i64
+  %idxadd = add i64 %idxprom, %call
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxadd
+  store i32 11, i32* %arrayidx, align 4 ; b[a + id] = 11
+  br label %if.end
+
+if.else:
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %call
+  store i32 13, i32* %arrayidx2, align 4 ; b[id] = 13
+  br label %if.end
+
+if.end:
+  %ptr = phi i32* [ %arrayidx, %if.then ], [ %arrayidx2, %if.else ] ; address computed in whichever arm ran
+  %ptrplus = getelementptr inbounds i32, i32* %ptr, i64 %call
+  store i32 17, i32* %ptrplus, align 4 ; ptr[id] = 17
+  ret void
+}
+
+define spir_func void @test_nonvarying_loadstore(i32* %a, i32* %b, i32* %c) { ; function named by the RUN line (-k test_nonvarying_loadstore); declared spir_func, not spir_kernel
+  %index = call spir_func i64 @_Z13get_global_idj(i32 0) ; mangled get_global_id(0)
+  %a.i = getelementptr i32, i32* %a, i64 %index
+  %b.i = getelementptr i32, i32* %b, i64 %index
+  %c.i = getelementptr i32, i32* %c, i64 %index
+  %a.load = load i32, i32* %a.i, align 4
+  %b.load = load i32, i32* %b.i, align 4
+  %add = add i32 %a.load, %b.load
+  store i32 %add, i32* %c.i ; c[index] = a[index] + b[index]; no explicit alignment on this store
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; This test checks if a simple kernel is vectorized without any masks
+; CHECK: define spir_func void @__vecz_v4_test_nonvarying_loadstore(ptr %a, ptr %b, ptr %c)
+; CHECK: %index = call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: %a.i = getelementptr i32, ptr %a, i64 %index
+; CHECK: %b.i = getelementptr i32, ptr %b, i64 %index
+; CHECK: %c.i = getelementptr i32, ptr %c, i64 %index
+; CHECK: %[[LAV:.+]] = load <4 x i32>, ptr %a.i{{(, align 4)?}}
+; CHECK: %[[LBV:.+]] = load <4 x i32>, ptr %b.i{{(, align 4)?}}
+; CHECK: %[[ADD1:.+]] = add <4 x i32> %[[LAV]], %[[LBV]]
+; CHECK: store <4 x i32> %[[ADD1]], ptr %c.i{{(, align 4)?}}
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_uniform_branch.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_uniform_branch.ll
new file mode 100644
index 0000000000000..a062879f1c413
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_uniform_branch.ll
@@ -0,0 +1,105 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_uniform_branch -vecz-passes=packetizer -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test_branch(i32 %a, i32* %b) { ; not the kernel named by the RUN line (-k test_uniform_branch)
+entry:
+  %conv = sext i32 %a to i64
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; mangled get_global_id(0)
+  %cmp = icmp eq i64 %conv, %call ; branch condition compares the argument with the global id
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+  %idxprom = sext i32 %a to i64 ; same computation as %conv
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+  store i32 11, i32* %arrayidx, align 4 ; b[a] = 11
+  br label %if.end
+
+if.else:
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 42
+  store i32 13, i32* %arrayidx2, align 4 ; b[42] = 13
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define spir_kernel void @test_uniform_branch(i32 %a, i32* %b) { ; kernel named by the RUN line (-k test_uniform_branch)
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; mangled get_global_id(0)
+  %cmp = icmp eq i32 %a, 42 ; branch condition depends only on the argument %a
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+  %idxprom = sext i32 %a to i64
+  %idxadd = add i64 %idxprom, %call
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxadd
+  store i32 11, i32* %arrayidx, align 4 ; b[a + id] = 11
+  br label %if.end
+
+if.else:
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %call
+  store i32 13, i32* %arrayidx2, align 4 ; b[id] = 13
+  br label %if.end
+
+if.end:
+  %ptr = phi i32* [ %arrayidx, %if.then ], [ %arrayidx2, %if.else ] ; address computed in whichever arm ran
+  %ptrplus = getelementptr inbounds i32, i32* %ptr, i64 %call
+  store i32 17, i32* %ptrplus, align 4 ; ptr[id] = 17
+  ret void
+}
+
+define spir_func void @test_nonvarying_loadstore(i32* %a, i32* %b, i32* %c) { ; not the kernel named by the RUN line (-k test_uniform_branch); declared spir_func, not spir_kernel
+  %index = call spir_func i64 @_Z13get_global_idj(i32 0) ; mangled get_global_id(0)
+  %a.i = getelementptr i32, i32* %a, i64 %index
+  %b.i = getelementptr i32, i32* %b, i64 %index
+  %c.i = getelementptr i32, i32* %c, i64 %index
+  %a.load = load i32, i32* %a.i, align 4
+  %b.load = load i32, i32* %b.i, align 4
+  %add = add i32 %a.load, %b.load
+  store i32 %add, i32* %c.i ; c[index] = a[index] + b[index]; no explicit alignment on this store
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; This test checks if the if blocks are vectorized without masks and if the phi
+; node is also vectorized properly
+; CHECK: define spir_kernel void @__vecz_v4_test_uniform_branch(i32 %a, ptr %b)
+; CHECK: %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: %[[SPLATINSERT:.+]] = insertelement <4 x i64> {{poison|undef}}, i64 %call, {{i32|i64}} 0
+; CHECK: %[[SPLAT:.+]] = shufflevector <4 x i64> %[[SPLATINSERT]], <4 x i64> {{poison|undef}}, <4 x i32> zeroinitializer
+; CHECK: %[[GID:.+]] = add <4 x i64> %[[SPLAT]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK: %cmp = icmp eq i32 %a, 42
+; CHECK: br i1 %cmp, label %if.then, label %if.else
+
+; CHECK: if.then:
+; CHECK: %[[GEP1:.+]] = getelementptr inbounds i32, ptr %b, <4 x i64>
+; CHECK: store <4 x i32> <i32 11, i32 11, i32 11, i32 11>, ptr %{{.+}}, align 4
+; CHECK: br label %if.end
+
+; CHECK: if.else:
+; CHECK: %[[GEP2:.+]] = getelementptr inbounds i32, ptr %b, <4 x i64>
+; CHECK: store <4 x i32> <i32 13, i32 13, i32 13, i32 13>, ptr %{{.+}}, align 4
+; CHECK: br label %if.end
+
+; CHECK: if.end:
+; CHECK: %[[PTR:.+]] = phi <4 x ptr> [ %[[GEP1]], %if.then ], [ %[[GEP2]], %if.else ]
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_struct_gep.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_struct_gep.ll
new file mode 100644
index 0000000000000..7a0ef406be97e
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_struct_gep.ll
@@ -0,0 +1,46 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test -vecz-simd-width=4 -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+%struct.T = type { i32, i8, float, i64 }
+
+; Function Attrs: nounwind
+define spir_kernel void @test(%struct.T addrspace(1)* %in, %struct.T addrspace(1)* %out, i32 addrspace(1)* %offsets) { ; kernel named by the RUN line (-k test)
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; mangled get_global_id(0)
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %offsets, i64 %call
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 ; offsets[id]
+  %conv = sext i32 %0 to i64
+  %add = add i64 %conv, %call ; struct index = offsets[id] + id
+  %c = getelementptr inbounds %struct.T, %struct.T addrspace(1)* %in, i64 %add, i32 2 ; field 2 of %struct.T is the float member
+  %1 = load float, float addrspace(1)* %c, align 8
+  %c3 = getelementptr inbounds %struct.T, %struct.T addrspace(1)* %out, i64 %add, i32 2 ; same field in %out; the constant field index 2 stays scalar in the expected output
+  store float %1, float addrspace(1)* %c3, align 8
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; Check if we can packetize GEPs on structs
+; Note that we only need to packetize the non-uniform operands.
+; CHECK: define spir_kernel void @__vecz_v4_test
+; CHECK: getelementptr inbounds %struct.T, ptr addrspace(1) %{{.+}}, <4 x i64> %{{.+}}, i32 2
+; CHECK: getelementptr inbounds %struct.T, ptr addrspace(1) %{{.+}}, <4 x i64> %{{.+}}, i32 2
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_conditional.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_conditional.ll
new file mode 100644
index 0000000000000..d918b4cbd2fd1
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_conditional.ll
@@ -0,0 +1,159 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k conditional -vecz-choices=PacketizeUniform -vecz-simd-width=4 -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z12get_local_idj(i32)
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func i64 @_Z14get_local_sizej(i32)
+
+; reduce (not the -k kernel of this file): for s = 1, 2, 4, ... while s < local_size, stores 5 to out[lid] if lid % d == 0, d = 3*s.
+define spir_kernel void @reduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
+entry:
+  %call = call spir_func i64 @_Z12get_local_idj(i32 0)
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %storemerge = phi i32 [ 1, %entry ], [ %mul6, %for.inc ]
+  %conv = zext i32 %storemerge to i64
+  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)
+  %cmp = icmp ult i64 %conv, %call1
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %mul = mul i32 %storemerge, 3
+  %conv3 = zext i32 %mul to i64
+  %0 = icmp eq i32 %mul, 0
+  %1 = select i1 %0, i64 1, i64 %conv3 ; divisor: 1 when %mul is 0, so the urem below never divides by zero
+  %rem = urem i64 %call, %1 ; remainder of the local id, so the branch below depends on the work-item
+  %cmp4 = icmp eq i64 %rem, 0
+  br i1 %cmp4, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %call
+  store i32 5, i32 addrspace(3)* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %mul6 = shl i32 %storemerge, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; noreduce (not the -k kernel of this file): for s = 1, 2, 4, ... while s < local_size, stores 5 to out[lid] if 3 % d == 0, d = (s == 0 ? 1 : s).
+define spir_kernel void @noreduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
+entry:
+  %call = call spir_func i64 @_Z12get_local_idj(i32 0)
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ]
+  %conv = zext i32 %storemerge to i64
+  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)
+  %cmp = icmp ult i64 %conv, %call1
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %0 = icmp eq i32 %storemerge, 0
+  %1 = select i1 %0, i32 1, i32 %storemerge ; divisor: 1 when the counter is 0, so the urem below never divides by zero
+  %rem = urem i32 3, %1 ; built only from the loop counter, not from %call
+  %cmp3 = icmp eq i32 %rem, 0
+  br i1 %cmp3, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %call
+  store i32 5, i32 addrspace(3)* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %mul = shl i32 %storemerge, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; noreduce2 (not the -k kernel of this file): for s = 1, 2, 4, ... while s < local_size, stores 5 to out[s] if 3 % d == 0, d = (s == 0 ? 1 : s).
+define spir_kernel void @noreduce2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
+entry:
+  %call = call spir_func i64 @_Z12get_local_idj(i32 0) ; result is not used by any other instruction in this kernel
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ]
+  %conv = zext i32 %storemerge to i64
+  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)
+  %cmp = icmp ult i64 %conv, %call1
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %0 = icmp eq i32 %storemerge, 0
+  %1 = select i1 %0, i32 1, i32 %storemerge ; divisor: 1 when the counter is 0, so the urem below never divides by zero
+  %rem = urem i32 3, %1
+  %cmp3 = icmp eq i32 %rem, 0
+  br i1 %cmp3, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %idxprom = zext i32 %storemerge to i64 ; store index comes from the loop counter, not from the local id
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %idxprom
+  store i32 5, i32 addrspace(3)* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %mul = shl i32 %storemerge, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; conditional (the -k kernel of this file): loads v = in[0]; when v is odd, copies in[v] to out[gid].
+define spir_kernel void @conditional(i32 addrspace(1)* %in, i32 addrspace(1)* %out) #0 {
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #3 ; NOTE(review): no 'attributes #0' or '#3' group is defined in this file - confirm that is intended
+  %0 = load i32, i32 addrspace(1)* %in, align 4 ; same address for every work-item
+  %rem1 = and i32 %0, 1
+  %tobool = icmp eq i32 %rem1, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  %idxprom = sext i32 %0 to i64
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom
+  %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call ; only this address depends on the global id
+  store i32 %1, i32 addrspace(1)* %arrayidx2, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.then
+  ret void
+}
+
+; This test checks if the "packetize uniform" Vecz choice works on uniform
+; values used by varying values, but not on uniform values used by other uniform
+; values only.
+
+; CHECK: define spir_kernel void @__vecz_v4_conditional(ptr addrspace(1) %in, ptr addrspace(1) %out)
+; CHECK: insertelement <4 x ptr addrspace(1)> {{poison|undef}}, ptr addrspace(1) %in, {{(i32|i64)}} 0
+; CHECK: shufflevector <4 x ptr addrspace(1)>
+; CHECK: call <4 x i32> @__vecz_b_gather_load4_Dv4_jDv4_u3ptrU3AS1
+; CHECK: store <4 x i32>
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_conditional.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_conditional.ll
new file mode 100644
index 0000000000000..4d18b8b5ef103
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_conditional.ll
@@ -0,0 +1,159 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k conditional -vecz-simd-width=4 -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z12get_local_idj(i32)
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func i64 @_Z14get_local_sizej(i32)
+
+; reduce (not the -k kernel of this file): for s = 1, 2, 4, ... while s < local_size, stores 5 to out[lid] if lid % d == 0, d = 3*s.
+define spir_kernel void @reduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
+entry:
+  %call = call spir_func i64 @_Z12get_local_idj(i32 0)
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %storemerge = phi i32 [ 1, %entry ], [ %mul6, %for.inc ]
+  %conv = zext i32 %storemerge to i64
+  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)
+  %cmp = icmp ult i64 %conv, %call1
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %mul = mul i32 %storemerge, 3
+  %conv3 = zext i32 %mul to i64
+  %0 = icmp eq i32 %mul, 0
+  %1 = select i1 %0, i64 1, i64 %conv3 ; divisor: 1 when %mul is 0, so the urem below never divides by zero
+  %rem = urem i64 %call, %1 ; remainder of the local id, so the branch below depends on the work-item
+  %cmp4 = icmp eq i64 %rem, 0
+  br i1 %cmp4, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %call
+  store i32 5, i32 addrspace(3)* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %mul6 = shl i32 %storemerge, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; noreduce (not the -k kernel of this file): for s = 1, 2, 4, ... while s < local_size, stores 5 to out[lid] if 3 % d == 0, d = (s == 0 ? 1 : s).
+define spir_kernel void @noreduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
+entry:
+  %call = call spir_func i64 @_Z12get_local_idj(i32 0)
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ]
+  %conv = zext i32 %storemerge to i64
+  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)
+  %cmp = icmp ult i64 %conv, %call1
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %0 = icmp eq i32 %storemerge, 0
+  %1 = select i1 %0, i32 1, i32 %storemerge ; divisor: 1 when the counter is 0, so the urem below never divides by zero
+  %rem = urem i32 3, %1 ; built only from the loop counter, not from %call
+  %cmp3 = icmp eq i32 %rem, 0
+  br i1 %cmp3, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %call
+  store i32 5, i32 addrspace(3)* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %mul = shl i32 %storemerge, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; noreduce2 (not the -k kernel of this file): for s = 1, 2, 4, ... while s < local_size, stores 5 to out[s] if 3 % d == 0, d = (s == 0 ? 1 : s).
+define spir_kernel void @noreduce2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
+entry:
+  %call = call spir_func i64 @_Z12get_local_idj(i32 0) ; result is not used by any other instruction in this kernel
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ]
+  %conv = zext i32 %storemerge to i64
+  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)
+  %cmp = icmp ult i64 %conv, %call1
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %0 = icmp eq i32 %storemerge, 0
+  %1 = select i1 %0, i32 1, i32 %storemerge ; divisor: 1 when the counter is 0, so the urem below never divides by zero
+  %rem = urem i32 3, %1
+  %cmp3 = icmp eq i32 %rem, 0
+  br i1 %cmp3, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %idxprom = zext i32 %storemerge to i64 ; store index comes from the loop counter, not from the local id
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %idxprom
+  store i32 5, i32 addrspace(3)* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %mul = shl i32 %storemerge, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; conditional (the -k kernel of this file): loads v = in[0]; when v is odd, copies in[v] to out[gid].
+define spir_kernel void @conditional(i32 addrspace(1)* %in, i32 addrspace(1)* %out) #0 {
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #3 ; NOTE(review): no 'attributes #0' or '#3' group is defined in this file - confirm that is intended
+  %0 = load i32, i32 addrspace(1)* %in, align 4 ; same address for every work-item
+  %rem1 = and i32 %0, 1
+  %tobool = icmp eq i32 %rem1, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  %idxprom = sext i32 %0 to i64
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom
+  %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call ; only this address depends on the global id
+  store i32 %1, i32 addrspace(1)* %arrayidx2, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.then
+  ret void
+}
+
+; This test checks the kernel when the "packetize uniform" Vecz choice is not
+; explicitly set. Currently, this means that the uniform values should not be
+; packetized.
+
+; CHECK: define spir_kernel void @__vecz_v4_conditional(ptr addrspace(1) %in, ptr addrspace(1) %out)
+; CHECK: load i32, ptr
+; CHECK: insertelement <4 x i32> {{poison|undef}}
+; CHECK: shufflevector <4 x i32>
+; CHECK: store <4 x i32>
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_noreduce.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_noreduce.ll
new file mode 100644
index 0000000000000..5115d2dd1ee1c
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_noreduce.ll
@@ -0,0 +1,159 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k noreduce -vecz-simd-width=4 -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z12get_local_idj(i32)
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func i64 @_Z14get_local_sizej(i32)
+
+; reduce (not the -k kernel of this file): for s = 1, 2, 4, ... while s < local_size, stores 5 to out[lid] if lid % d == 0, d = 3*s.
+define spir_kernel void @reduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
+entry:
+  %call = call spir_func i64 @_Z12get_local_idj(i32 0)
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %storemerge = phi i32 [ 1, %entry ], [ %mul6, %for.inc ]
+  %conv = zext i32 %storemerge to i64
+  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)
+  %cmp = icmp ult i64 %conv, %call1
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %mul = mul i32 %storemerge, 3
+  %conv3 = zext i32 %mul to i64
+  %0 = icmp eq i32 %mul, 0
+  %1 = select i1 %0, i64 1, i64 %conv3 ; divisor: 1 when %mul is 0, so the urem below never divides by zero
+  %rem = urem i64 %call, %1 ; remainder of the local id, so the branch below depends on the work-item
+  %cmp4 = icmp eq i64 %rem, 0
+  br i1 %cmp4, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %call
+  store i32 5, i32 addrspace(3)* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %mul6 = shl i32 %storemerge, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; noreduce (the -k kernel of this file): for s = 1, 2, 4, ... while s < local_size, stores 5 to out[lid] if 3 % d == 0, d = (s == 0 ? 1 : s).
+define spir_kernel void @noreduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
+entry:
+  %call = call spir_func i64 @_Z12get_local_idj(i32 0)
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ]
+  %conv = zext i32 %storemerge to i64
+  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)
+  %cmp = icmp ult i64 %conv, %call1
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %0 = icmp eq i32 %storemerge, 0
+  %1 = select i1 %0, i32 1, i32 %storemerge ; divisor: 1 when the counter is 0, so the urem below never divides by zero
+  %rem = urem i32 3, %1 ; built only from the loop counter, not from %call
+  %cmp3 = icmp eq i32 %rem, 0
+  br i1 %cmp3, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %call ; the store address, unlike the branch, uses the local id
+  store i32 5, i32 addrspace(3)* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %mul = shl i32 %storemerge, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; noreduce2 (not the -k kernel of this file): for s = 1, 2, 4, ... while s < local_size, stores 5 to out[s] if 3 % d == 0, d = (s == 0 ? 1 : s).
+define spir_kernel void @noreduce2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
+entry:
+  %call = call spir_func i64 @_Z12get_local_idj(i32 0) ; result is not used by any other instruction in this kernel
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ]
+  %conv = zext i32 %storemerge to i64
+  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)
+  %cmp = icmp ult i64 %conv, %call1
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %0 = icmp eq i32 %storemerge, 0
+  %1 = select i1 %0, i32 1, i32 %storemerge ; divisor: 1 when the counter is 0, so the urem below never divides by zero
+  %rem = urem i32 3, %1
+  %cmp3 = icmp eq i32 %rem, 0
+  br i1 %cmp3, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %idxprom = zext i32 %storemerge to i64 ; store index comes from the loop counter, not from the local id
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %idxprom
+  store i32 5, i32 addrspace(3)* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %mul = shl i32 %storemerge, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; conditional (not the -k kernel of this file): loads v = in[0]; when v is odd, copies in[v] to out[gid].
+define spir_kernel void @conditional(i32 addrspace(1)* %in, i32 addrspace(1)* %out) #0 {
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #3 ; NOTE(review): no 'attributes #0' or '#3' group is defined in this file - confirm that is intended
+  %0 = load i32, i32 addrspace(1)* %in, align 4 ; same address for every work-item
+  %rem1 = and i32 %0, 1
+  %tobool = icmp eq i32 %rem1, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  %idxprom = sext i32 %0 to i64
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom
+  %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call ; only this address depends on the global id
+  store i32 %1, i32 addrspace(1)* %arrayidx2, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.then
+  ret void
+}
+
+; This test checks the kernel when the "packetize uniform" Vecz choice is not
+; explicitly set. Currently, this means that the uniform values should not be
+; packetized.
+
+; CHECK: define spir_kernel void @__vecz_v4_noreduce(ptr addrspace(3) %in, ptr addrspace(3) %out)
+; CHECK: icmp ugt i64
+; CHECK: and i32{{.*}}, 3
+; CHECK: icmp eq i32
+; CHECK: shl i32
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_noreduce2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_noreduce2.ll
new file mode 100644
index 0000000000000..92fcc3273a0dc
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_noreduce2.ll
@@ -0,0 +1,73 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k noreduce2 -vecz-simd-width=4 -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z12get_local_idj(i32)
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func i64 @_Z14get_local_sizej(i32)
+
+; noreduce2 (the -k kernel of this file): for s = 1, 2, 4, ... while s < local_size, stores 5 to out[s] if 37 % d == 0, d = (s == 8 ? 17 : s).
+define spir_kernel void @noreduce2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
+entry:
+  %call = call spir_func i64 @_Z12get_local_idj(i32 0) ; result is not used by any other instruction in this kernel
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ]
+  %conv = zext i32 %storemerge to i64
+  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)
+  %cmp = icmp ult i64 %conv, %call1
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %0 = icmp eq i32 %storemerge, 8
+  %1 = select i1 %0, i32 17, i32 %storemerge ; divisor: 17 when the counter is 8, otherwise the counter itself
+  %rem = urem i32 37, %1 ; built only from the loop counter and constants
+  %cmp3 = icmp eq i32 %rem, 0
+  br i1 %cmp3, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %idxprom = zext i32 %storemerge to i64 ; store index comes from the loop counter, not from the local id
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %idxprom
+  store i32 5, i32 addrspace(3)* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %mul = shl i32 %storemerge, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; This test checks the kernel when the "packetize uniform" Vecz choice is not
+; explicitly set. Currently, this means that the uniform values should not be
+; packetized.
+
+; CHECK: define spir_kernel void @__vecz_v4_noreduce2(ptr addrspace(3) %in, ptr addrspace(3) %out)
+; CHECK: icmp ugt i64 %{{.+}}, 1
+; CHECK: phi i32
+; CHECK: icmp eq i32 %{{.+}}, 8
+; CHECK: urem i32 37
+; CHECK: icmp eq i32 %{{.+}}, 0
+; CHECK: store i32 5
+; CHECK: shl i32 %{{.+}}, 1
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_reduce.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_reduce.ll
new file mode 100644
index 0000000000000..6f32f316032dc
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_reduce.ll
@@ -0,0 +1,165 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k reduce -vecz-simd-width=4 -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z12get_local_idj(i32)
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func i64 @_Z14get_local_sizej(i32)
+
+; reduce (the -k kernel of this file): for s = 1, 2, 4, ... while s < local_size, stores 5 to out[lid] if lid % d == 0, d = 3*s.
+define spir_kernel void @reduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
+entry:
+  %call = call spir_func i64 @_Z12get_local_idj(i32 0)
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %storemerge = phi i32 [ 1, %entry ], [ %mul6, %for.inc ]
+  %conv = zext i32 %storemerge to i64
+  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)
+  %cmp = icmp ult i64 %conv, %call1
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %mul = mul i32 %storemerge, 3
+  %conv3 = zext i32 %mul to i64
+  %0 = icmp eq i32 %mul, 0
+  %1 = select i1 %0, i64 1, i64 %conv3 ; divisor: 1 when %mul is 0, so the urem below never divides by zero
+  %rem = urem i64 %call, %1 ; remainder of the local id, so the branch below depends on the work-item
+  %cmp4 = icmp eq i64 %rem, 0
+  br i1 %cmp4, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %call
+  store i32 5, i32 addrspace(3)* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %mul6 = shl i32 %storemerge, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; noreduce (not the -k kernel of this file): for s = 1, 2, 4, ... while s < local_size, stores 5 to out[lid] if 3 % d == 0, d = (s == 0 ? 1 : s).
+define spir_kernel void @noreduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
+entry:
+  %call = call spir_func i64 @_Z12get_local_idj(i32 0)
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ]
+  %conv = zext i32 %storemerge to i64
+  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)
+  %cmp = icmp ult i64 %conv, %call1
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %0 = icmp eq i32 %storemerge, 0
+  %1 = select i1 %0, i32 1, i32 %storemerge ; divisor: 1 when the counter is 0, so the urem below never divides by zero
+  %rem = urem i32 3, %1 ; built only from the loop counter, not from %call
+  %cmp3 = icmp eq i32 %rem, 0
+  br i1 %cmp3, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %call
+  store i32 5, i32 addrspace(3)* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %mul = shl i32 %storemerge, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; noreduce2 (not the -k kernel of this file): for s = 1, 2, 4, ... while s < local_size, stores 5 to out[s] if 3 % d == 0, d = (s == 0 ? 1 : s).
+define spir_kernel void @noreduce2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
+entry:
+  %call = call spir_func i64 @_Z12get_local_idj(i32 0) ; result is not used by any other instruction in this kernel
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ]
+  %conv = zext i32 %storemerge to i64
+  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)
+  %cmp = icmp ult i64 %conv, %call1
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %0 = icmp eq i32 %storemerge, 0
+  %1 = select i1 %0, i32 1, i32 %storemerge ; divisor: 1 when the counter is 0, so the urem below never divides by zero
+  %rem = urem i32 3, %1
+  %cmp3 = icmp eq i32 %rem, 0
+  br i1 %cmp3, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %idxprom = zext i32 %storemerge to i64 ; store index comes from the loop counter, not from the local id
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %idxprom
+  store i32 5, i32 addrspace(3)* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %mul = shl i32 %storemerge, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; conditional (not the -k kernel of this file): loads v = in[0]; when v is odd, copies in[v] to out[gid].
+define spir_kernel void @conditional(i32 addrspace(1)* %in, i32 addrspace(1)* %out) #0 {
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #3 ; NOTE(review): no 'attributes #0' or '#3' group is defined in this file - confirm that is intended
+  %0 = load i32, i32 addrspace(1)* %in, align 4 ; same address for every work-item
+  %rem1 = and i32 %0, 1
+  %tobool = icmp eq i32 %rem1, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  %idxprom = sext i32 %0 to i64
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom
+  %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call ; only this address depends on the global id
+  store i32 %1, i32 addrspace(1)* %arrayidx2, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.then
+  ret void
+}
+
+; This test checks the kernel when the "packetize uniform" Vecz choice is not
+; explicitly set. Currently, this means that the uniform values should not be
+; packetized.
+
+; CHECK: define spir_kernel void @__vecz_v4_reduce(ptr addrspace(3) %in, ptr addrspace(3) %out)
+; CHECK: insertelement <4 x i64> {{poison|undef}}, i64
+; CHECK: shufflevector <4 x i64>
+; CHECK: %[[LOCAL_SIZE:[^ ]+]] = call spir_func i64 @_Z14get_local_sizej(i32 0)
+; CHECK: icmp {{(ugt|ult)}} i64 %[[LOCAL_SIZE]], {{(1|2)}}
+; CHECK-NEXT: br
+; CHECK: phi i32
+; CHECK: mul i32 %{{.+}}, 3
+; CHECK: icmp eq <4 x i64> %{{.+}}, zeroinitializer
+; CHECK: call void @__vecz_b_masked_store4_Dv4_ju3ptrU3AS3Dv4_b(<4 x i32> <i32 5, i32 5, i32 5, i32 5>
+; CHECK: shl i32 %{{.+}}, 1
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_conditional.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_conditional.ll
new file mode 100644
index 0000000000000..530a6010f1c1b
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_conditional.ll
@@ -0,0 +1,160 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k conditional -vecz-choices=PacketizeUniformInLoops -vecz-simd-width=4 -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z12get_local_idj(i32)
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func i64 @_Z14get_local_sizej(i32)
+
+; Kernel @reduce: loop with a doubling counter i; stores 5 to out[get_local_id(0)] when get_local_id(0) % (i * 3) == 0.
+define spir_kernel void @reduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
+entry:                                            ; not named by this file's RUN line (-k conditional)
+  %call = call spir_func i64 @_Z12get_local_idj(i32 0)  ; get_local_id(0)
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %storemerge = phi i32 [ 1, %entry ], [ %mul6, %for.inc ]  ; loop counter i: 1 on entry, then doubled
+  %conv = zext i32 %storemerge to i64
+  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)  ; get_local_size(0)
+  %cmp = icmp ult i64 %conv, %call1  ; loop while i < get_local_size(0)
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %mul = mul i32 %storemerge, 3
+  %conv3 = zext i32 %mul to i64
+  %0 = icmp eq i32 %mul, 0
+  %1 = select i1 %0, i64 1, i64 %conv3  ; divisor = i * 3, or 1 if that product is 0
+  %rem = urem i64 %call, %1
+  %cmp4 = icmp eq i64 %rem, 0  ; guard depends on get_local_id(0)
+  br i1 %cmp4, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %call
+  store i32 5, i32 addrspace(3)* %arrayidx, align 4  ; out[get_local_id(0)] = 5
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %mul6 = shl i32 %storemerge, 1  ; i <<= 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; Kernel @noreduce: loop with a doubling counter i; stores 5 to out[get_local_id(0)] when 3 % i == 0 (guard uses only i).
+define spir_kernel void @noreduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
+entry:                                            ; not named by this file's RUN line (-k conditional)
+  %call = call spir_func i64 @_Z12get_local_idj(i32 0)  ; get_local_id(0); used only for the store address
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ]  ; loop counter i: 1 on entry, then doubled
+  %conv = zext i32 %storemerge to i64
+  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)  ; get_local_size(0)
+  %cmp = icmp ult i64 %conv, %call1  ; loop while i < get_local_size(0)
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %0 = icmp eq i32 %storemerge, 0
+  %1 = select i1 %0, i32 1, i32 %storemerge  ; divisor = i, or 1 if i is 0
+  %rem = urem i32 3, %1
+  %cmp3 = icmp eq i32 %rem, 0  ; guard: 3 % divisor == 0, no work-item ID involved
+  br i1 %cmp3, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %call
+  store i32 5, i32 addrspace(3)* %arrayidx, align 4  ; out[get_local_id(0)] = 5
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %mul = shl i32 %storemerge, 1  ; i <<= 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; Kernel @noreduce2: loop with a doubling counter i; stores 5 to out[i] when 3 % i == 0. The get_local_id(0) result is unused.
+define spir_kernel void @noreduce2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
+entry:                                            ; not named by this file's RUN line (-k conditional)
+  %call = call spir_func i64 @_Z12get_local_idj(i32 0)  ; get_local_id(0); no later instruction reads %call
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ]  ; loop counter i: 1 on entry, then doubled
+  %conv = zext i32 %storemerge to i64
+  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)  ; get_local_size(0)
+  %cmp = icmp ult i64 %conv, %call1  ; loop while i < get_local_size(0)
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %0 = icmp eq i32 %storemerge, 0
+  %1 = select i1 %0, i32 1, i32 %storemerge  ; divisor = i, or 1 if i is 0
+  %rem = urem i32 3, %1
+  %cmp3 = icmp eq i32 %rem, 0  ; guard: 3 % divisor == 0, no work-item ID involved
+  br i1 %cmp3, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %idxprom = zext i32 %storemerge to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %idxprom
+  store i32 5, i32 addrspace(3)* %arrayidx, align 4  ; out[i] = 5
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %mul = shl i32 %storemerge, 1  ; i <<= 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; Kernel @conditional: if the low bit of in[0] is set, copy in[in[0]] to out[get_global_id(0)]; otherwise do nothing.
+define spir_kernel void @conditional(i32 addrspace(1)* %in, i32 addrspace(1)* %out) #0 {
+entry:                                            ; named by this file's RUN line (-k conditional)
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #3  ; get_global_id(0). NOTE(review): #0/#3 have no 'attributes' definition in this file
+  %0 = load i32, i32 addrspace(1)* %in, align 4  ; in[0]; the address does not involve %call
+  %rem1 = and i32 %0, 1
+  %tobool = icmp eq i32 %rem1, 0  ; true when in[0] is even
+  br i1 %tobool, label %if.end, label %if.then  ; even: skip the store
+
+if.then:                                          ; preds = %entry
+  %idxprom = sext i32 %0 to i64
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom
+  %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4  ; in[in[0]]
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %1, i32 addrspace(1)* %arrayidx2, align 4  ; out[get_global_id(0)] = in[in[0]]
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.then
+  ret void
+}
+
+; This test checks if the "packetize uniform in loops" Vecz choice works on
+; uniform values used by varying values in loops, but not on uniform values used
+; by other uniform values only.
+
+; CHECK: define spir_kernel void @__vecz_v4_conditional(ptr addrspace(1) %in, ptr addrspace(1) %out)
+; CHECK: load i32, {{(ptr|i32)}}
+; CHECK: load i32, {{(ptr|i32)}}
+; CHECK: insertelement <4 x i32> {{poison|undef}}
+; CHECK: shufflevector <4 x i32>
+; CHECK: store <4 x i32>
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_noreduce.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_noreduce.ll
new file mode 100644
index 0000000000000..6fbc520e4ed02
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_noreduce.ll
@@ -0,0 +1,159 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k noreduce -vecz-choices=PacketizeUniformInLoops -vecz-simd-width=4 -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z12get_local_idj(i32)
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func i64 @_Z14get_local_sizej(i32)
+
+; Kernel @reduce: loop with a doubling counter i; stores 5 to out[get_local_id(0)] when get_local_id(0) % (i * 3) == 0.
+define spir_kernel void @reduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
+entry:                                            ; not named by this file's RUN line (-k noreduce)
+  %call = call spir_func i64 @_Z12get_local_idj(i32 0)  ; get_local_id(0)
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %storemerge = phi i32 [ 1, %entry ], [ %mul6, %for.inc ]  ; loop counter i: 1 on entry, then doubled
+  %conv = zext i32 %storemerge to i64
+  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)  ; get_local_size(0)
+  %cmp = icmp ult i64 %conv, %call1  ; loop while i < get_local_size(0)
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %mul = mul i32 %storemerge, 3
+  %conv3 = zext i32 %mul to i64
+  %0 = icmp eq i32 %mul, 0
+  %1 = select i1 %0, i64 1, i64 %conv3  ; divisor = i * 3, or 1 if that product is 0
+  %rem = urem i64 %call, %1
+  %cmp4 = icmp eq i64 %rem, 0  ; guard depends on get_local_id(0)
+  br i1 %cmp4, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %call
+  store i32 5, i32 addrspace(3)* %arrayidx, align 4  ; out[get_local_id(0)] = 5
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %mul6 = shl i32 %storemerge, 1  ; i <<= 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; Kernel @noreduce: loop with a doubling counter i; stores 5 to out[get_local_id(0)] when 3 % i == 0 (guard uses only i).
+define spir_kernel void @noreduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
+entry:                                            ; named by this file's RUN line (-k noreduce)
+  %call = call spir_func i64 @_Z12get_local_idj(i32 0)  ; get_local_id(0); used only for the store address
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ]  ; loop counter i: 1 on entry, then doubled
+  %conv = zext i32 %storemerge to i64
+  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)  ; get_local_size(0)
+  %cmp = icmp ult i64 %conv, %call1  ; loop while i < get_local_size(0)
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %0 = icmp eq i32 %storemerge, 0
+  %1 = select i1 %0, i32 1, i32 %storemerge  ; divisor = i, or 1 if i is 0
+  %rem = urem i32 3, %1
+  %cmp3 = icmp eq i32 %rem, 0  ; guard: 3 % divisor == 0, no work-item ID involved
+  br i1 %cmp3, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %call
+  store i32 5, i32 addrspace(3)* %arrayidx, align 4  ; out[get_local_id(0)] = 5
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %mul = shl i32 %storemerge, 1  ; i <<= 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; Kernel @noreduce2: loop with a doubling counter i; stores 5 to out[i] when 3 % i == 0. The get_local_id(0) result is unused.
+define spir_kernel void @noreduce2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
+entry:                                            ; not named by this file's RUN line (-k noreduce)
+  %call = call spir_func i64 @_Z12get_local_idj(i32 0)  ; get_local_id(0); no later instruction reads %call
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ]  ; loop counter i: 1 on entry, then doubled
+  %conv = zext i32 %storemerge to i64
+  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)  ; get_local_size(0)
+  %cmp = icmp ult i64 %conv, %call1  ; loop while i < get_local_size(0)
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %0 = icmp eq i32 %storemerge, 0
+  %1 = select i1 %0, i32 1, i32 %storemerge  ; divisor = i, or 1 if i is 0
+  %rem = urem i32 3, %1
+  %cmp3 = icmp eq i32 %rem, 0  ; guard: 3 % divisor == 0, no work-item ID involved
+  br i1 %cmp3, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %idxprom = zext i32 %storemerge to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %idxprom
+  store i32 5, i32 addrspace(3)* %arrayidx, align 4  ; out[i] = 5
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %mul = shl i32 %storemerge, 1  ; i <<= 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; Kernel @conditional: if the low bit of in[0] is set, copy in[in[0]] to out[get_global_id(0)]; otherwise do nothing.
+define spir_kernel void @conditional(i32 addrspace(1)* %in, i32 addrspace(1)* %out) #0 {
+entry:                                            ; not named by this file's RUN line (-k noreduce)
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #3  ; get_global_id(0). NOTE(review): #0/#3 have no 'attributes' definition in this file
+  %0 = load i32, i32 addrspace(1)* %in, align 4  ; in[0]; the address does not involve %call
+  %rem1 = and i32 %0, 1
+  %tobool = icmp eq i32 %rem1, 0  ; true when in[0] is even
+  br i1 %tobool, label %if.end, label %if.then  ; even: skip the store
+
+if.then:                                          ; preds = %entry
+  %idxprom = sext i32 %0 to i64
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom
+  %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4  ; in[in[0]]
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %1, i32 addrspace(1)* %arrayidx2, align 4  ; out[get_global_id(0)] = in[in[0]]
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.then
+  ret void
+}
+
+; This test checks if the "packetize uniform in loops" Vecz choice works on
+; uniform values used by varying values in loops, but not on uniform values used
+; by other uniform values only.
+
+; CHECK: define spir_kernel void @__vecz_v4_noreduce(ptr addrspace(3) %in, ptr addrspace(3) %out)
+; CHECK: icmp ugt i64
+; CHECK: and i32{{.*}}, 3
+; CHECK: icmp eq i32
+; CHECK: shl i32
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_noreduce2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_noreduce2.ll
new file mode 100644
index 0000000000000..90a3c3d401a00
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_noreduce2.ll
@@ -0,0 +1,73 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k noreduce2 -vecz-choices=PacketizeUniformInLoops -vecz-simd-width=4 -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z12get_local_idj(i32)
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func i64 @_Z14get_local_sizej(i32)
+
+; Kernel @noreduce2: loop with a doubling counter i; stores 5 to out[i] when 37 % (i == 8 ? 17 : i) == 0. The get_local_id(0) result is unused.
+define spir_kernel void @noreduce2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
+entry:                                            ; named by this file's RUN line (-k noreduce2)
+  %call = call spir_func i64 @_Z12get_local_idj(i32 0)  ; get_local_id(0); no later instruction reads %call
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ]  ; loop counter i: 1 on entry, then doubled
+  %conv = zext i32 %storemerge to i64
+  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)  ; get_local_size(0)
+  %cmp = icmp ult i64 %conv, %call1  ; loop while i < get_local_size(0)
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %0 = icmp eq i32 %storemerge, 8
+  %1 = select i1 %0, i32 17, i32 %storemerge  ; divisor = 17 when i is 8, otherwise i
+  %rem = urem i32 37, %1
+  %cmp3 = icmp eq i32 %rem, 0  ; guard: 37 % divisor == 0, no work-item ID involved
+  br i1 %cmp3, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %idxprom = zext i32 %storemerge to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %idxprom
+  store i32 5, i32 addrspace(3)* %arrayidx, align 4  ; out[i] = 5
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %mul = shl i32 %storemerge, 1  ; i <<= 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; This test checks if the "packetize uniform in loops" Vecz choice works on
+; uniform values used by varying values in loops, but not on uniform values used
+; by other uniform values only.
+
+; CHECK: define spir_kernel void @__vecz_v4_noreduce2(ptr addrspace(3) %in, ptr addrspace(3) %out)
+; CHECK: icmp ugt i64
+; CHECK: phi i32
+; CHECK: icmp eq i32
+; CHECK: urem i32 37
+; CHECK: icmp eq i32
+; CHECK: store i32 5
+; CHECK: shl i32 %{{.+}}, 1
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_reduce.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_reduce.ll
new file mode 100644
index 0000000000000..3bef75fe015ca
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_reduce.ll
@@ -0,0 +1,76 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k reduce -vecz-choices=PacketizeUniformInLoops -vecz-simd-width=4 -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z12get_local_idj(i32)
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func i64 @_Z14get_local_sizej(i32)
+
+; Kernel @reduce: loop with a doubling counter i; stores 5 to out[get_local_id(0)] when get_local_id(0) % (i * 3) == 0.
+define spir_kernel void @reduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
+entry:                                            ; named by this file's RUN line (-k reduce)
+  %call = call spir_func i64 @_Z12get_local_idj(i32 0)  ; get_local_id(0)
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %storemerge = phi i32 [ 1, %entry ], [ %mul6, %for.inc ]  ; loop counter i: 1 on entry, then doubled
+  %conv = zext i32 %storemerge to i64
+  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)  ; get_local_size(0)
+  %cmp = icmp ult i64 %conv, %call1  ; loop while i < get_local_size(0)
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %mul = mul i32 %storemerge, 3
+  %conv3 = zext i32 %mul to i64
+  %0 = icmp eq i32 %mul, 0
+  %1 = select i1 %0, i64 1, i64 %conv3  ; divisor = i * 3, or 1 if that product is 0
+  %rem = urem i64 %call, %1
+  %cmp4 = icmp eq i64 %rem, 0  ; guard depends on get_local_id(0)
+  br i1 %cmp4, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %call
+  store i32 5, i32 addrspace(3)* %arrayidx, align 4  ; out[get_local_id(0)] = 5
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %mul6 = shl i32 %storemerge, 1  ; i <<= 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; This test checks if the "packetize uniform in loops" Vecz choice works on
+; uniform values used by varying values in loops, but not on uniform values used
+; by other uniform values only.
+
+; CHECK: define spir_kernel void @__vecz_v4_reduce(ptr addrspace(3) %in, ptr addrspace(3) %out)
+; CHECK: insertelement <4 x i64> {{poison|undef}}, i64 %{{.+}}, {{(i32|i64)}} 0
+; CHECK: shufflevector <4 x i64> %{{.+}}, <4 x i64> {{poison|undef}}, <4 x i32> zeroinitializer
+; CHECK: phi <4 x i32>
+; CHECK: mul <4 x i32> %{{.+}}, <i32 3, i32 3, i32 3, i32 3>
+; CHECK: urem <4 x i64>
+; CHECK: icmp eq <4 x i64> %{{.+}}, zeroinitializer
+
+; The branch condition is actually Uniform, despite the divergence analysis
+; CHECK: icmp ugt i64
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_noreduce.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_noreduce.ll
new file mode 100644
index 0000000000000..a1563ab8c4e75
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_noreduce.ll
@@ -0,0 +1,159 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k noreduce -vecz-choices=PacketizeUniform -vecz-simd-width=4 -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z12get_local_idj(i32)
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func i64 @_Z14get_local_sizej(i32)
+
+; Kernel @reduce: loop with a doubling counter i; stores 5 to out[get_local_id(0)] when get_local_id(0) % (i * 3) == 0.
+define spir_kernel void @reduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
+entry:                                            ; not named by this file's RUN line (-k noreduce)
+  %call = call spir_func i64 @_Z12get_local_idj(i32 0)  ; get_local_id(0)
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %storemerge = phi i32 [ 1, %entry ], [ %mul6, %for.inc ]  ; loop counter i: 1 on entry, then doubled
+  %conv = zext i32 %storemerge to i64
+  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)  ; get_local_size(0)
+  %cmp = icmp ult i64 %conv, %call1  ; loop while i < get_local_size(0)
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %mul = mul i32 %storemerge, 3
+  %conv3 = zext i32 %mul to i64
+  %0 = icmp eq i32 %mul, 0
+  %1 = select i1 %0, i64 1, i64 %conv3  ; divisor = i * 3, or 1 if that product is 0
+  %rem = urem i64 %call, %1
+  %cmp4 = icmp eq i64 %rem, 0  ; guard depends on get_local_id(0)
+  br i1 %cmp4, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %call
+  store i32 5, i32 addrspace(3)* %arrayidx, align 4  ; out[get_local_id(0)] = 5
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %mul6 = shl i32 %storemerge, 1  ; i <<= 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; Kernel @noreduce: loop with a doubling counter i; stores 5 to out[get_local_id(0)] when 3 % i == 0 (guard uses only i).
+define spir_kernel void @noreduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
+entry:                                            ; named by this file's RUN line (-k noreduce)
+  %call = call spir_func i64 @_Z12get_local_idj(i32 0)  ; get_local_id(0); used only for the store address
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ]  ; loop counter i: 1 on entry, then doubled
+  %conv = zext i32 %storemerge to i64
+  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)  ; get_local_size(0)
+  %cmp = icmp ult i64 %conv, %call1  ; loop while i < get_local_size(0)
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %0 = icmp eq i32 %storemerge, 0
+  %1 = select i1 %0, i32 1, i32 %storemerge  ; divisor = i, or 1 if i is 0
+  %rem = urem i32 3, %1
+  %cmp3 = icmp eq i32 %rem, 0  ; guard: 3 % divisor == 0, no work-item ID involved
+  br i1 %cmp3, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %call
+  store i32 5, i32 addrspace(3)* %arrayidx, align 4  ; out[get_local_id(0)] = 5
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %mul = shl i32 %storemerge, 1  ; i <<= 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; Kernel @noreduce2: loop with a doubling counter i; stores 5 to out[i] when 3 % i == 0. The get_local_id(0) result is unused.
+define spir_kernel void @noreduce2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
+entry:                                            ; not named by this file's RUN line (-k noreduce)
+  %call = call spir_func i64 @_Z12get_local_idj(i32 0)  ; get_local_id(0); no later instruction reads %call
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ]  ; loop counter i: 1 on entry, then doubled
+  %conv = zext i32 %storemerge to i64
+  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)  ; get_local_size(0)
+  %cmp = icmp ult i64 %conv, %call1  ; loop while i < get_local_size(0)
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %0 = icmp eq i32 %storemerge, 0
+  %1 = select i1 %0, i32 1, i32 %storemerge  ; divisor = i, or 1 if i is 0
+  %rem = urem i32 3, %1
+  %cmp3 = icmp eq i32 %rem, 0  ; guard: 3 % divisor == 0, no work-item ID involved
+  br i1 %cmp3, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %idxprom = zext i32 %storemerge to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %idxprom
+  store i32 5, i32 addrspace(3)* %arrayidx, align 4  ; out[i] = 5
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %mul = shl i32 %storemerge, 1  ; i <<= 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; Kernel @conditional: if the low bit of in[0] is set, copy in[in[0]] to out[get_global_id(0)]; otherwise do nothing.
+define spir_kernel void @conditional(i32 addrspace(1)* %in, i32 addrspace(1)* %out) #0 {
+entry:                                            ; not named by this file's RUN line (-k noreduce)
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #3  ; get_global_id(0). NOTE(review): #0/#3 have no 'attributes' definition in this file
+  %0 = load i32, i32 addrspace(1)* %in, align 4  ; in[0]; the address does not involve %call
+  %rem1 = and i32 %0, 1
+  %tobool = icmp eq i32 %rem1, 0  ; true when in[0] is even
+  br i1 %tobool, label %if.end, label %if.then  ; even: skip the store
+
+if.then:                                          ; preds = %entry
+  %idxprom = sext i32 %0 to i64
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom
+  %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4  ; in[in[0]]
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %1, i32 addrspace(1)* %arrayidx2, align 4  ; out[get_global_id(0)] = in[in[0]]
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.then
+  ret void
+}
+
+; This test checks if the "packetize uniform" Vecz choice works on uniform
+; values used by varying values, but not on uniform values used by other uniform
+; values only.
+
+; CHECK: define spir_kernel void @__vecz_v4_noreduce(ptr addrspace(3) %in, ptr addrspace(3) %out)
+; CHECK: icmp ugt i64
+; CHECK: and i32{{.*}}, 3
+; CHECK: icmp eq i32
+; CHECK: shl i32
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_noreduce2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_noreduce2.ll
new file mode 100644
index 0000000000000..1c11c30d70168
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_noreduce2.ll
@@ -0,0 +1,73 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k noreduce2 -vecz-choices=PacketizeUniform -vecz-simd-width=4 -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z12get_local_idj(i32)
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func i64 @_Z14get_local_sizej(i32)
+
+; Kernel @noreduce2: loop with a doubling counter i; stores 5 to out[i] when 37 % (i == 8 ? 17 : i) == 0. The get_local_id(0) result is unused.
+define spir_kernel void @noreduce2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
+entry:                                            ; named by this file's RUN line (-k noreduce2)
+  %call = call spir_func i64 @_Z12get_local_idj(i32 0)  ; get_local_id(0); no later instruction reads %call
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ]  ; loop counter i: 1 on entry, then doubled
+  %conv = zext i32 %storemerge to i64
+  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)  ; get_local_size(0)
+  %cmp = icmp ult i64 %conv, %call1  ; loop while i < get_local_size(0)
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %0 = icmp eq i32 %storemerge, 8
+  %1 = select i1 %0, i32 17, i32 %storemerge  ; divisor = 17 when i is 8, otherwise i
+  %rem = urem i32 37, %1
+  %cmp3 = icmp eq i32 %rem, 0  ; guard: 37 % divisor == 0, no work-item ID involved
+  br i1 %cmp3, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %idxprom = zext i32 %storemerge to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %idxprom
+  store i32 5, i32 addrspace(3)* %arrayidx, align 4  ; out[i] = 5
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %mul = shl i32 %storemerge, 1  ; i <<= 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; This test checks if the "packetize uniform" Vecz choice works on uniform
+; values used by varying values, but not on uniform values used by other uniform
+; values only.
+
+; CHECK: define spir_kernel void @__vecz_v4_noreduce2(ptr addrspace(3) %in, ptr addrspace(3) %out)
+; CHECK: icmp ugt i64
+; CHECK: phi i32
+; CHECK: icmp eq i32
+; CHECK: urem i32 37
+; CHECK: icmp eq i32
+; CHECK: store i32 5
+; CHECK: shl i32 %{{.+}}, 1
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_reduce.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_reduce.ll
new file mode 100644
index 0000000000000..af46221d31cbd
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_reduce.ll
@@ -0,0 +1,76 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k reduce -vecz-choices=PacketizeUniform -vecz-simd-width=4 -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z12get_local_idj(i32)
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func i64 @_Z14get_local_sizej(i32)
+
+; Kernel @reduce: loop with a doubling counter i; stores 5 to out[get_local_id(0)] when get_local_id(0) % (i * 3) == 0.
+define spir_kernel void @reduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
+entry:                                            ; named by this file's RUN line (-k reduce)
+  %call = call spir_func i64 @_Z12get_local_idj(i32 0)  ; get_local_id(0)
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %storemerge = phi i32 [ 1, %entry ], [ %mul6, %for.inc ]  ; loop counter i: 1 on entry, then doubled
+  %conv = zext i32 %storemerge to i64
+  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)  ; get_local_size(0)
+  %cmp = icmp ult i64 %conv, %call1  ; loop while i < get_local_size(0)
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %mul = mul i32 %storemerge, 3
+  %conv3 = zext i32 %mul to i64
+  %0 = icmp eq i32 %mul, 0
+  %1 = select i1 %0, i64 1, i64 %conv3  ; divisor = i * 3, or 1 if that product is 0
+  %rem = urem i64 %call, %1
+  %cmp4 = icmp eq i64 %rem, 0  ; guard depends on get_local_id(0)
+  br i1 %cmp4, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %call
+  store i32 5, i32 addrspace(3)* %arrayidx, align 4  ; out[get_local_id(0)] = 5
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %mul6 = shl i32 %storemerge, 1  ; i <<= 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; This test checks if the "packetize uniform" Vecz choice works on uniform
+; values used by varying values, but not on uniform values used by other uniform
+; values only.
+
+; CHECK: define spir_kernel void @__vecz_v4_reduce(ptr addrspace(3) %in, ptr addrspace(3) %out)
+; CHECK: insertelement <4 x i64> {{poison|undef}}, i64 %{{.+}}, {{(i32|i64)}} 0
+; CHECK: shufflevector <4 x i64> %{{.+}}, <4 x i64> {{poison|undef}}, <4 x i32> zeroinitializer
+; CHECK: phi <4 x i32>
+; CHECK: mul <4 x i32> %{{.+}}, <i32 3, i32 3, i32 3, i32 3>
+; CHECK: urem <4 x i64>
+; CHECK: icmp eq <4 x i64> %{{.+}}, zeroinitializer
+
+; The branch condition is actually Uniform, despite the divergence analysis
+; CHECK: icmp ugt i64
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization0.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization0.ll
new file mode 100644
index 0000000000000..46bcececcc3ad
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization0.ll
@@ -0,0 +1,377 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k partial_linearization0 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | %filecheck %s
+
+; The CFG of the following kernel is:
+;
+;        a
+;       / \
+;      b   c
+;       \ /
+;        d
+;        |
+;        e
+;       / \
+;      /   \
+;     f     g
+;    / \   / \
+;   h   i j   k
+;    \ /   \ /
+;     l     m
+;      \   /
+;       \ /
+;        n
+;
+; * where node e is a uniform branch, and nodes a, f and g are varying
+;   branches.
+; * where nodes b, c, d, h, i, j, k, l, m are divergent.
+;
+; With partial linearization, it will be transformed as follows:
+;
+;     a
+;     |
+;     c
+;     |
+;     b
+;     |
+;     d
+;     |
+;     e
+;    / \
+;   f   g
+;   |   |
+;   i   k
+;   |   |
+;   h   j
+;   |   |
+;   l   m
+;    \ /
+;     n
+;
+; instead of:
+;
+;   a
+;   |
+;   b
+;   |
+;   c
+;   |
+;   d
+;   |
+;   e
+;   |
+;   g
+;   |
+;   j
+;   |
+;   k
+;   |
+;   m
+;   |
+;   f
+;   |
+;   i
+;   |
+;   h
+;   |
+;   l
+;   |
+;   n
+;
+; __kernel void partial_linearization0(__global int *out, int n) {
+;   int id = get_global_id(0);
+;   int ret = 0;
+;
+;   if (id % 5 == 0) {
+;     for (int i = 0; i < n * 2; i++) ret++;
+;   } else {
+;     for (int i = 0; i < n / 4; i++) ret++;
+;   }
+;
+;   if (n > 10) { // uniform
+;     if (id % 2 == 0) { // varying
+;       for (int i = 0; i < n + 10; i++) ret++;
+;     } else { // varying
+;       for (int i = 0; i < n + 10; i++) ret *= 2;
+;     }
+;     ret += id * 10;
+;   } else { // uniform
+;     if (id % 2 == 0) { // varying
+;       for (int i = 0; i < n + 8; i++) ret++;
+;     } else { // varying
+;       for (int i = 0; i < n + 8; i++) ret *= 2;
+;     }
+;     ret += id / 2;
+;   }
+;   out[id] = ret;
+; }
+
+; ModuleID = 'Unknown buffer'
+source_filename = "Unknown buffer"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @partial_linearization0(i32 addrspace(1)* %out, i32 %n) #0 {  ; IR form of the OpenCL C kernel quoted in the header comment
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2  ; get_global_id(0)
+  %conv = trunc i64 %call to i32  ; id
+  %rem = srem i32 %conv, 5  ; id % 5
+  %cmp = icmp eq i32 %rem, 0  ; id % 5 == 0 (derived from the global id)
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %if.then
+  %ret.0 = phi i32 [ 0, %if.then ], [ %inc, %for.body ]  ; ret, starts at 0
+  %storemerge8 = phi i32 [ 0, %if.then ], [ %inc4, %for.body ]  ; loop counter i
+  %mul = shl nsw i32 %n, 1  ; n * 2
+  %cmp2 = icmp slt i32 %storemerge8, %mul  ; i < n * 2
+  br i1 %cmp2, label %for.body, label %if.end
+
+for.body:                                         ; preds = %for.cond
+  %inc = add nsw i32 %ret.0, 1  ; ret++
+  %inc4 = add nsw i32 %storemerge8, 1  ; i++
+  br label %for.cond
+
+if.else:                                          ; preds = %entry
+  br label %for.cond6
+
+for.cond6:                                        ; preds = %for.body9, %if.else
+  %ret.1 = phi i32 [ 0, %if.else ], [ %inc10, %for.body9 ]  ; ret, starts at 0
+  %storemerge = phi i32 [ 0, %if.else ], [ %inc12, %for.body9 ]  ; loop counter i
+  %div = sdiv i32 %n, 4  ; n / 4
+  %cmp7 = icmp slt i32 %storemerge, %div  ; i < n / 4
+  br i1 %cmp7, label %for.body9, label %if.end
+
+for.body9:                                        ; preds = %for.cond6
+  %inc10 = add nsw i32 %ret.1, 1  ; ret++
+  %inc12 = add nsw i32 %storemerge, 1  ; i++
+  br label %for.cond6
+
+if.end:                                           ; preds = %for.cond6, %for.cond
+  %ret.2 = phi i32 [ %ret.0, %for.cond ], [ %ret.1, %for.cond6 ]  ; ret merged from the two loops above
+  %cmp14 = icmp sgt i32 %n, 10  ; n > 10, depends only on the kernel argument
+  %rem175 = and i32 %conv, 1  ; low bit of id
+  %cmp18 = icmp eq i32 %rem175, 0  ; low bit clear, i.e. the source's id % 2 == 0
+  br i1 %cmp14, label %if.then16, label %if.else44  ; branch on the n > 10 test
+
+if.then16:                                        ; preds = %if.end
+  br i1 %cmp18, label %if.then20, label %if.else30  ; branch on the id parity test
+
+if.then20:                                        ; preds = %if.then16
+  br label %for.cond22
+
+for.cond22:                                       ; preds = %for.body25, %if.then20
+  %ret.3 = phi i32 [ %ret.2, %if.then20 ], [ %inc26, %for.body25 ]  ; ret
+  %storemerge7 = phi i32 [ 0, %if.then20 ], [ %inc28, %for.body25 ]  ; loop counter i
+  %add = add nsw i32 %n, 10  ; n + 10
+  %cmp23 = icmp slt i32 %storemerge7, %add  ; i < n + 10
+  br i1 %cmp23, label %for.body25, label %if.end41
+
+for.body25:                                       ; preds = %for.cond22
+  %inc26 = add nsw i32 %ret.3, 1  ; ret++
+  %inc28 = add nsw i32 %storemerge7, 1  ; i++
+  br label %for.cond22
+
+if.else30:                                        ; preds = %if.then16
+  br label %for.cond32
+
+for.cond32:                                       ; preds = %for.body36, %if.else30
+  %ret.4 = phi i32 [ %ret.2, %if.else30 ], [ %mul37, %for.body36 ]  ; ret
+  %storemerge6 = phi i32 [ 0, %if.else30 ], [ %inc39, %for.body36 ]  ; loop counter i
+  %add33 = add nsw i32 %n, 10  ; n + 10
+  %cmp34 = icmp slt i32 %storemerge6, %add33  ; i < n + 10
+  br i1 %cmp34, label %for.body36, label %if.end41
+
+for.body36:                                       ; preds = %for.cond32
+  %mul37 = shl nsw i32 %ret.4, 1  ; ret *= 2
+  %inc39 = add nsw i32 %storemerge6, 1  ; i++
+  br label %for.cond32
+
+if.end41:                                         ; preds = %for.cond32, %for.cond22
+  %ret.5 = phi i32 [ %ret.3, %for.cond22 ], [ %ret.4, %for.cond32 ]  ; ret merged from the two n + 10 loops
+  %mul42 = mul nsw i32 %conv, 10  ; id * 10
+  %add43 = add nsw i32 %ret.5, %mul42  ; ret += id * 10
+  br label %if.end73
+
+if.else44:                                        ; preds = %if.end
+  br i1 %cmp18, label %if.then48, label %if.else59  ; branch on the id parity test
+
+if.then48:                                        ; preds = %if.else44
+  br label %for.cond50
+
+for.cond50:                                       ; preds = %for.body54, %if.then48
+  %ret.6 = phi i32 [ %ret.2, %if.then48 ], [ %inc55, %for.body54 ]  ; ret
+  %storemerge4 = phi i32 [ 0, %if.then48 ], [ %inc57, %for.body54 ]  ; loop counter i
+  %add51 = add nsw i32 %n, 8  ; n + 8
+  %cmp52 = icmp slt i32 %storemerge4, %add51  ; i < n + 8
+  br i1 %cmp52, label %for.body54, label %if.end70
+
+for.body54:                                       ; preds = %for.cond50
+  %inc55 = add nsw i32 %ret.6, 1  ; ret++
+  %inc57 = add nsw i32 %storemerge4, 1  ; i++
+  br label %for.cond50
+
+if.else59:                                        ; preds = %if.else44
+  br label %for.cond61
+
+for.cond61:                                       ; preds = %for.body65, %if.else59
+  %ret.7 = phi i32 [ %ret.2, %if.else59 ], [ %mul66, %for.body65 ]  ; ret
+  %storemerge2 = phi i32 [ 0, %if.else59 ], [ %inc68, %for.body65 ]  ; loop counter i
+  %add62 = add nsw i32 %n, 8  ; n + 8
+  %cmp63 = icmp slt i32 %storemerge2, %add62  ; i < n + 8
+  br i1 %cmp63, label %for.body65, label %if.end70
+
+for.body65:                                       ; preds = %for.cond61
+  %mul66 = shl nsw i32 %ret.7, 1  ; ret *= 2
+  %inc68 = add nsw i32 %storemerge2, 1  ; i++
+  br label %for.cond61
+
+if.end70:                                         ; preds = %for.cond61, %for.cond50
+  %ret.8 = phi i32 [ %ret.6, %for.cond50 ], [ %ret.7, %for.cond61 ]  ; ret merged from the two n + 8 loops
+  %div71 = sdiv i32 %conv, 2  ; id / 2
+  %add72 = add nsw i32 %ret.8, %div71  ; ret += id / 2
+  br label %if.end73
+
+if.end73:                                         ; preds = %if.end70, %if.end41
+  %storemerge3 = phi i32 [ %add72, %if.end70 ], [ %add43, %if.end41 ]  ; final ret from either side of the n > 10 split
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom  ; &out[id]
+  store i32 %storemerge3, i32 addrspace(1)* %arrayidx, align 4  ; out[id] = ret
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nobuiltin nounwind readonly }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.kernels = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization0, !3, !4, !5, !6, !7, !8}
+!3 = !{!"kernel_arg_addr_space", i32 1, i32 0}
+!4 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!5 = !{!"kernel_arg_type", !"int*", !"int"}
+!6 = !{!"kernel_arg_base_type", !"int*", !"int"}
+!7 = !{!"kernel_arg_type_qual", !"", !""}
+!8 = !{!"kernel_arg_name", !"out", !"n"}
+
+; CHECK: spir_kernel void @__vecz_v4_partial_linearization0
+; CHECK: br label %[[FORCOND6PREHEADER:.+]]
+
+; CHECK: [[FORCOND6PREHEADER]]:
+; CHECK: br label %[[FORCOND6:.+]]
+
+; CHECK: [[FORCONDPREHEADER:.+]]:
+; CHECK: br label %[[FORCOND:.+]]
+
+; CHECK: [[FORCOND]]:
+; CHECK: br i1 {{((%[0-9A-Za-z\.]+))|(false)}}, label %[[FORBODY:.+]], label %[[IFENDLOOPEXIT:.+]]
+
+; CHECK: [[FORBODY]]:
+; CHECK: br label %[[FORCOND]]
+
+; CHECK: [[FORCOND6]]:
+; CHECK: %[[CMP7:.+]] = icmp
+; CHECK: br i1 %[[CMP7]], label %[[FORBODY9:.+]], label %[[IFENDLOOPEXIT6:.+]]
+
+; CHECK: [[FORBODY9]]:
+; CHECK: br label %[[FORCOND6]]
+
+; CHECK: [[IFENDLOOPEXIT]]:
+; CHECK: br label %[[IFEND:.+]]
+
+; CHECK: [[IFENDLOOPEXIT6]]:
+; CHECK: br label %[[FORCONDPREHEADER]]
+
+; CHECK: [[IFEND]]:
+; CHECK: %[[CMP14:.+]] = icmp
+; CHECK: br i1 %[[CMP14]], label %[[IFTHEN16:.+]], label %[[IFELSE44:.+]]
+
+; CHECK: [[IFTHEN16]]:
+; CHECK: br label %[[FORCOND32PREHEADER:.+]]
+
+; CHECK: [[FORCOND32PREHEADER]]:
+; CHECK: br label %[[FORCOND32:.+]]
+
+; CHECK: [[FORCOND22PREHEADER:.+]]:
+; CHECK: br label %[[FORCOND22:.+]]
+
+; CHECK: [[FORCOND22]]:
+; CHECK: br i1 {{(%([0-9A-Za-z\.])+)|(false)}}, label %[[FORBODY25:.+]], label %[[IFEND41LOOPEXIT:.+]]
+
+; CHECK: [[FORBODY25]]:
+; CHECK: br label %[[FORCOND22]]
+
+; CHECK: [[FORCOND32]]:
+; CHECK: %[[CMP34:.+]] = icmp
+; CHECK: br i1 %[[CMP34]], label %[[FORBODY36:.+]], label %[[IFEND41LOOPEXIT4:.+]]
+
+; CHECK: [[FORBODY36]]:
+; CHECK: br label %[[FORCOND32]]
+
+; CHECK: [[IFEND41LOOPEXIT]]:
+; CHECK: br label %[[IFEND41:.+]]
+
+; CHECK: [[IFEND41LOOPEXIT4]]:
+; CHECK: br label %[[FORCOND22PREHEADER]]
+
+; CHECK: [[IFEND41]]:
+; CHECK: br label %[[IFEND73:.+]]
+
+; CHECK: [[IFELSE44]]:
+; CHECK: br label %[[FORCOND61PREHEADER:.+]]
+
+; CHECK: [[FORCOND61PREHEADER]]:
+; CHECK: br label %[[FORCOND61:.+]]
+
+; CHECK: [[FORCOND50PREHEADER:.+]]:
+; CHECK: br label %[[FORCOND50:.+]]
+
+; CHECK: [[FORCOND50]]:
+; CHECK: br i1 {{(%([0-9A-Za-z\.])+)|(false)}}, label %[[FORBODY54:.+]], label %[[IFEND70LOOPEXIT:.+]]
+
+; CHECK: [[FORBODY54]]:
+; CHECK: br label %[[FORCOND50]]
+
+; CHECK: [[FORCOND61]]:
+; CHECK: %[[CMP63:.+]] = icmp
+; CHECK: br i1 %[[CMP63]], label %[[FORBODY65:.+]], label %[[IFEND70LOOPEXIT5:.+]]
+
+; CHECK: [[FORBODY65]]:
+; CHECK: br label %[[FORCOND61]]
+
+; CHECK: [[IFEND70LOOPEXIT]]:
+; CHECK: br label %[[IFEND70:.+]]
+
+; CHECK: [[IFEND70LOOPEXIT5]]:
+; CHECK: br label %[[FORCOND50PREHEADER]]
+
+; CHECK: [[IFEND70]]:
+; CHECK: br label %[[IFEND73]]
+
+; CHECK: [[IFEND73]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization1.ll
new file mode 100644
index 0000000000000..c30b1da79f024
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization1.ll
@@ -0,0 +1,261 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k partial_linearization1 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | %filecheck %s
+
+; The CFG of the following kernel is:
+;
+;       a
+;       |
+;       b <-.
+;      / \  |
+;     c   d |
+;    / \ /  |
+;   e   f --'
+;    \  |
+;     \ g
+;      \|
+;       h
+;
+; * where nodes c and f are uniform branches, and node b is a varying
+;   branch.
+; * where nodes c, d, e, f, g and h are divergent.
+;
+; With partial linearization, it can be transformed in the following way:
+;
+;   a
+;   |
+;   b <.
+;   |  |
+;   d  |
+;   |  |
+;   c  |
+;   |  |
+;   f -'
+;   |
+;   g
+;   |
+;   e
+;   |
+;   h
+;
+; __kernel void partial_linearization1(__global int *out, int n) {
+;   int id = get_global_id(0);
+;   int ret = 0;
+;   int i = 0;
+;
+;   while (1) {
+;     if (id + i % 2 == 0) {
+;       if (n > 2) {
+;         goto e;
+;       }
+;     } else {
+;       for (int i = 0; i < n + 10; i++) ret++;
+;     }
+;     if (n <= 2) break;
+;   }
+;
+;   ret += n * 2;
+;   for (int i = 0; i < n * 2; i++) ret -= i;
+;   ret /= n;
+;   goto early;
+;
+; e:
+;   for (int i = 0; i < n + 5; i++) ret /= 2;
+;   ret -= n;
+;
+; early:
+;   out[id] = ret;
+; }
+
+; ModuleID = 'Unknown buffer'
+source_filename = "Unknown buffer"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @partial_linearization1(i32 addrspace(1)* %out, i32 %n) #0 {  ; IR form of the OpenCL C kernel quoted in the header comment
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2  ; get_global_id(0)
+  %conv = trunc i64 %call to i32  ; id
+  br label %while.body
+
+while.body:                                       ; preds = %if.end14, %entry
+  %ret.0 = phi i32 [ 0, %entry ], [ %ret.2, %if.end14 ]  ; ret carried around the while loop
+  %cmp = icmp eq i32 %conv, 0  ; id == 0 (derived from the global id)
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %while.body
+  %cmp2 = icmp sgt i32 %n, 2  ; n > 2, depends only on the kernel argument
+  br i1 %cmp2, label %e, label %if.end10
+
+if.else:                                          ; preds = %while.body
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %if.else
+  %ret.1 = phi i32 [ %ret.0, %if.else ], [ %inc, %for.body ]  ; ret
+  %storemerge = phi i32 [ 0, %if.else ], [ %inc9, %for.body ]  ; loop counter i
+  %add6 = add nsw i32 %n, 10  ; n + 10
+  %cmp7 = icmp slt i32 %storemerge, %add6  ; i < n + 10
+  br i1 %cmp7, label %for.body, label %if.end10
+
+for.body:                                         ; preds = %for.cond
+  %inc = add nsw i32 %ret.1, 1  ; ret++
+  %inc9 = add nsw i32 %storemerge, 1  ; i++
+  br label %for.cond
+
+if.end10:                                         ; preds = %for.cond, %if.then
+  %ret.2 = phi i32 [ %ret.0, %if.then ], [ %ret.1, %for.cond ]  ; ret merged from both arms
+  %cmp11 = icmp slt i32 %n, 3  ; n < 3, i.e. the source's n <= 2
+  br i1 %cmp11, label %while.end, label %if.end14  ; leave the while loop when n < 3
+
+if.end14:                                         ; preds = %if.end10
+  br label %while.body
+
+while.end:                                        ; preds = %if.end10
+  %mul = mul i32 %n, 2  ; n * 2
+  %add15 = add nsw i32 %ret.2, %mul  ; ret += n * 2
+  br label %for.cond17
+
+for.cond17:                                       ; preds = %for.body21, %while.end
+  %ret.3 = phi i32 [ %add15, %while.end ], [ %sub, %for.body21 ]  ; ret
+  %storemerge1 = phi i32 [ 0, %while.end ], [ %inc23, %for.body21 ]  ; loop counter i
+  %mul18 = shl nsw i32 %n, 1  ; n * 2
+  %cmp19 = icmp slt i32 %storemerge1, %mul18  ; i < n * 2
+  br i1 %cmp19, label %for.body21, label %for.end24
+
+for.body21:                                       ; preds = %for.cond17
+  %sub = sub nsw i32 %ret.3, %storemerge1  ; ret -= i
+  %inc23 = add nsw i32 %storemerge1, 1  ; i++
+  br label %for.cond17
+
+for.end24:                                        ; preds = %for.cond17
+  %0 = icmp eq i32 %ret.3, -2147483648  ; ret == INT_MIN
+  %1 = icmp eq i32 %n, -1  ; n == -1
+  %2 = and i1 %1, %0  ; INT_MIN / -1 case
+  %3 = icmp eq i32 %n, 0  ; n == 0
+  %4 = or i1 %3, %2  ; either case that sdiv cannot handle
+  %5 = select i1 %4, i32 1, i32 %n  ; divisor: 1 in those cases, otherwise n
+  %div = sdiv i32 %ret.3, %5  ; ret /= n, using the guarded divisor
+  br label %early
+
+e:                                                ; preds = %if.then
+  br label %for.cond26
+
+for.cond26:                                       ; preds = %for.body30, %e
+  %ret.4 = phi i32 [ %ret.0, %e ], [ %div31, %for.body30 ]  ; ret
+  %storemerge3 = phi i32 [ 0, %e ], [ %inc33, %for.body30 ]  ; loop counter i
+  %add27 = add nsw i32 %n, 5  ; n + 5
+  %cmp28 = icmp slt i32 %storemerge3, %add27  ; i < n + 5
+  br i1 %cmp28, label %for.body30, label %for.end34
+
+for.body30:                                       ; preds = %for.cond26
+  %div31 = sdiv i32 %ret.4, 2  ; ret /= 2
+  %inc33 = add nsw i32 %storemerge3, 1  ; i++
+  br label %for.cond26
+
+for.end34:                                        ; preds = %for.cond26
+  %sub35 = sub nsw i32 %ret.4, %n  ; ret -= n
+  br label %early
+
+early:                                            ; preds = %for.end34, %for.end24
+  %storemerge2 = phi i32 [ %div, %for.end24 ], [ %sub35, %for.end34 ]  ; final ret from either exit path
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom  ; &out[id]
+  store i32 %storemerge2, i32 addrspace(1)* %arrayidx, align 4  ; out[id] = ret
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nobuiltin nounwind readonly }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.kernels = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization1, !3, !4, !5, !6, !7, !8}
+!3 = !{!"kernel_arg_addr_space", i32 1, i32 0}
+!4 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!5 = !{!"kernel_arg_type", !"int*", !"int"}
+!6 = !{!"kernel_arg_base_type", !"int*", !"int"}
+!7 = !{!"kernel_arg_type_qual", !"", !""}
+!8 = !{!"kernel_arg_name", !"out", !"n"}
+
+; CHECK: spir_kernel void @__vecz_v4_partial_linearization1
+; CHECK: br label %[[WHILEBODY:.+]]
+
+; CHECK: [[WHILEBODY]]:
+; CHECK: br label %[[FORCONDPREHEADER:.+]]
+
+; CHECK: [[FORCONDPREHEADER]]:
+; CHECK: br label %[[FORCOND:.+]]
+
+; CHECK: [[IFTHEN:.+]]:
+; CHECK: br label %[[IFEND10:.+]]
+
+; CHECK: [[FORCOND26PREHEADER:.+]]:
+; CHECK: br label %[[FORCOND26:.+]]
+
+; CHECK: [[FORCOND]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[IFEND10LOOPEXIT:.+]]
+
+; CHECK: [[FORBODY]]:
+; CHECK: br label %[[FORCOND]]
+
+; CHECK: [[IFEND10LOOPEXIT]]:
+; CHECK: br label %[[IFTHEN]]
+
+; CHECK: [[IFEND10]]:
+; CHECK: %[[CMP11:.+]] = icmp
+; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]]
+
+; CHECK: [[WHILEBODYPUREEXIT]]:
+; CHECK: br label %[[WHILEEND:.+]]
+
+; CHECK: [[WHILEEND]]:
+; CHECK: br label %[[FORCOND17:.+]]
+
+; CHECK: [[WHILEENDELSE:.+]]:
+; CHECK: br label %[[FORCOND26PREHEADER]]
+
+; CHECK: [[FORCOND17]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY21:.+]], label %[[FOREND24:.+]]
+
+; CHECK: [[FORBODY21]]:
+; CHECK: br label %[[FORCOND17]]
+
+; CHECK: [[FOREND24]]:
+; CHECK: br label %[[WHILEENDELSE]]
+
+; CHECK: [[FORCOND26]]:
+; CHECK: %[[CMP28:.+]] = icmp
+; CHECK: br i1 %[[CMP28]], label %[[FORBODY30:.+]], label %[[FOREND34:.+]]
+
+; CHECK: [[FORBODY30]]:
+; CHECK: br label %[[FORCOND26]]
+
+; CHECK: [[FOREND34]]:
+; CHECK: br label %[[EARLY:.+]]
+
+; CHECK: [[EARLY]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization10.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization10.ll
new file mode 100644
index 0000000000000..0f515398e5f40
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization10.ll
@@ -0,0 +1,465 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k partial_linearization10 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | %filecheck %s
+
+; The CFG of the following kernel is:
+;
+;            a
+;            |
+;            b <-----.
+;           / \      |
+;          c   d     |
+;         / \ /      |
+;        /   e       |
+;       /    |       |
+;      /     g <---. |
+;     /     / \    | |
+;    /     h   i   | |
+;   f     / \ / \  | |
+;   |    j   k   l | |
+;   |   /|  / \ /  | |
+;   |  m | n   o --' |
+;   | /  |/          |
+;   |/   q ----------'
+;   p    |
+;    \   r
+;     \ /
+;      s
+;
+; * where nodes b, c, g, h, j, k and q are uniform branches, and node i is a
+;   varying branch.
+; * where nodes k, l, o, n, m, p, q, r and s are divergent.
+;
+; With partial linearization, it will be transformed as follows:
+;
+;          a
+;          |
+;          b <-----.
+;         / \      |
+;        c   d     |
+;       / \ /      |
+;      /   e       |
+;     /    |       |
+;    /     g <---. |
+;   f     / \    | |
+;   |    /   \   | |
+;   |   h     i  | |
+;   |  / \    |  | |
+;   | j   |   l  | |
+;   | |    \ /   | |
+;   | |     k    | |
+;   |  \    |    | |
+;   |   \   o ---' |
+;   |    \ /       |
+;   |     n        |
+;    \    |        |
+;     \   q -------'
+;      \ /
+;       m
+;       |
+;       r
+;       |
+;       p
+;       |
+;       s
+;
+; __kernel void partial_linearization10(__global int *out, int n) {
+;   int id = get_global_id(0);
+;   int ret = 0;
+;
+;   while (1) {
+;     if (n > 0) { // b
+;       // c
+;       for (int i = 0; i < n * 2; i++) ret++;
+;       if (n <= 10) {
+;         // f
+;         goto f;
+;       }
+;     } else {
+;       // d
+;       for (int i = 0; i < n / 4; i++) ret++;
+;     }
+;     // e
+;     ret++;
+;     while (1) {
+;       if (n & 1) { // g
+;         // h
+;         if (n < 3) {
+;           // j
+;           goto j;
+;         }
+;       } else {
+;         // i
+;         if (ret + id >= n) {
+;           // l
+;           ret /= n * n + ret;
+;           goto o;
+;         }
+;       }
+;       // k
+;       if (n & 1) {
+;         // n
+;         ret += n * ret;
+;         goto n;
+;       }
+;       // o
+; o:
+;       ret++;
+;     }
+; j:
+;     if (n < 2) {
+;       // m
+;       ret += n * 2 + 20;
+;       goto p;
+;     } else {
+;       goto q;
+;     }
+; n:
+;     ret *= 4;
+; q:
+;     if (n & 1) {
+;       // r
+;       ret++;
+;       goto r;
+;     }
+;   }
+;
+; r:
+;   for (int i = 0; i < n / 4; i++) ret++;
+;   goto s;
+;
+; f:
+;   ret /= n;
+;   goto p;
+;
+; p:
+;   for (int i = 0; i < n * 2; i++) ret++;
+;
+; s:
+;   out[id] = ret;
+; }
+
+; ModuleID = 'Unknown buffer'
+source_filename = "Unknown buffer"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @partial_linearization10(i32 addrspace(1)* %out, i32 %n) #0 {  ; IR form of the OpenCL C kernel quoted in the header comment
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2  ; get_global_id(0)
+  %conv = trunc i64 %call to i32  ; id
+  br label %while.body
+
+while.body:                                       ; preds = %if.end55, %entry
+  %ret.0 = phi i32 [ 0, %entry ], [ %ret.5, %if.end55 ]  ; ret carried around the outer while loop
+  %cmp = icmp sgt i32 %n, 0  ; n > 0
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %while.body
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %if.then
+  %ret.1 = phi i32 [ %ret.0, %if.then ], [ %inc, %for.body ]  ; ret
+  %storemerge5 = phi i32 [ 0, %if.then ], [ %inc4, %for.body ]  ; loop counter i
+  %mul = shl nsw i32 %n, 1  ; n * 2
+  %cmp2 = icmp slt i32 %storemerge5, %mul  ; i < n * 2
+  br i1 %cmp2, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %inc = add nsw i32 %ret.1, 1  ; ret++
+  %inc4 = add nsw i32 %storemerge5, 1  ; i++
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %cmp5 = icmp slt i32 %n, 11  ; n < 11, i.e. the source's n <= 10
+  br i1 %cmp5, label %f, label %if.end17
+
+if.else:                                          ; preds = %while.body
+  br label %for.cond9
+
+for.cond9:                                        ; preds = %for.body12, %if.else
+  %ret.2 = phi i32 [ %ret.0, %if.else ], [ %inc13, %for.body12 ]  ; ret
+  %storemerge = phi i32 [ 0, %if.else ], [ %inc15, %for.body12 ]  ; loop counter i
+  %div = sdiv i32 %n, 4  ; n / 4
+  %cmp10 = icmp slt i32 %storemerge, %div  ; i < n / 4
+  br i1 %cmp10, label %for.body12, label %if.end17
+
+for.body12:                                       ; preds = %for.cond9
+  %inc13 = add nsw i32 %ret.2, 1  ; ret++
+  %inc15 = add nsw i32 %storemerge, 1  ; i++
+  br label %for.cond9
+
+if.end17:                                         ; preds = %for.cond9, %for.end
+  %ret.3 = phi i32 [ %ret.1, %for.end ], [ %ret.2, %for.cond9 ]  ; ret merged from the two loops above
+  br label %while.body20
+
+while.body20:                                     ; preds = %o, %if.end17
+  %storemerge1.in = phi i32 [ %ret.3, %if.end17 ], [ %ret.4, %o ]  ; ret on entry to the inner while loop header
+  %storemerge1 = add nsw i32 %storemerge1.in, 1  ; ret + 1, applied on every pass through this header
+  %and = and i32 %n, 1  ; n & 1
+  %tobool = icmp eq i32 %and, 0  ; true when (n & 1) is 0
+  br i1 %tobool, label %if.else26, label %if.then21
+
+if.then21:                                        ; preds = %while.body20
+  %cmp22 = icmp slt i32 %n, 3  ; n < 3
+  br i1 %cmp22, label %j, label %if.end34
+
+if.else26:                                        ; preds = %while.body20
+  %add = add nsw i32 %storemerge1, %conv  ; ret + id
+  %cmp27 = icmp slt i32 %add, %n  ; ret + id < n (uses the global id)
+  br i1 %cmp27, label %if.end34, label %if.then29
+
+if.then29:                                        ; preds = %if.else26
+  %mul30 = mul nsw i32 %n, %n  ; n * n
+  %add31 = add nsw i32 %storemerge1, %mul30  ; n * n + ret
+  %0 = icmp eq i32 %add31, 0  ; divisor would be zero
+  %1 = select i1 %0, i32 1, i32 %add31  ; divisor: 1 when it would be zero, otherwise n * n + ret
+  %div32 = sdiv i32 %storemerge1, %1  ; ret /= n * n + ret, using the guarded divisor
+  br label %o
+
+if.end34:                                         ; preds = %if.else26, %if.then21
+  %and35 = and i32 %n, 1  ; n & 1
+  %tobool36 = icmp eq i32 %and35, 0  ; true when (n & 1) is 0
+  br i1 %tobool36, label %o, label %if.then37
+
+if.then37:                                        ; preds = %if.end34
+  %mul38 = mul nsw i32 %storemerge1, %n  ; n * ret
+  %add39 = add nsw i32 %mul38, %storemerge1  ; ret += n * ret
+  %mul50 = shl nsw i32 %add39, 2  ; ret *= 4
+  br label %q
+
+o:                                                ; preds = %if.end34, %if.then29
+  %ret.4 = phi i32 [ %div32, %if.then29 ], [ %storemerge1, %if.end34 ]  ; ret fed back to the inner loop header
+  br label %while.body20
+
+j:                                                ; preds = %if.then21
+  %cmp42 = icmp eq i32 %n, 2  ; n == 2
+  br i1 %cmp42, label %q, label %if.then44
+
+if.then44:                                        ; preds = %j
+  %mul45 = mul i32 %n, 2  ; n * 2
+  %add46 = add nsw i32 %mul45, 20  ; n * 2 + 20
+  %add47 = add nsw i32 %add46, %storemerge1  ; ret += n * 2 + 20
+  br label %p
+
+q:                                                ; preds = %j, %if.then37
+  %ret.5 = phi i32 [ %mul50, %if.then37 ], [ %storemerge1, %j ]  ; ret on reaching q
+  %and51 = and i32 %n, 1  ; n & 1
+  %tobool52 = icmp eq i32 %and51, 0  ; true when (n & 1) is 0
+  br i1 %tobool52, label %if.end55, label %if.then53
+
+if.then53:                                        ; preds = %q
+  br label %for.cond57
+
+if.end55:                                         ; preds = %q
+  br label %while.body  ; back edge of the outer while loop
+
+for.cond57:                                       ; preds = %for.body61, %if.then53
+  %ret.6.in = phi i32 [ %ret.5, %if.then53 ], [ %ret.6, %for.body61 ]  ; ret on entry to this loop header
+  %storemerge2 = phi i32 [ 0, %if.then53 ], [ %inc64, %for.body61 ]  ; loop counter i
+  %ret.6 = add nsw i32 %ret.6.in, 1  ; ret + 1, applied on every pass through this header
+  %div58 = sdiv i32 %n, 4  ; n / 4
+  %cmp59 = icmp slt i32 %storemerge2, %div58  ; i < n / 4
+  br i1 %cmp59, label %for.body61, label %s
+
+for.body61:                                       ; preds = %for.cond57
+  %inc64 = add nsw i32 %storemerge2, 1  ; i++
+  br label %for.cond57
+
+f:                                                ; preds = %for.end
+  %2 = icmp eq i32 %ret.1, -2147483648  ; ret == INT_MIN
+  %3 = icmp eq i32 %n, -1  ; n == -1
+  %4 = and i1 %3, %2  ; INT_MIN / -1 case
+  %5 = icmp eq i32 %n, 0  ; n == 0
+  %6 = or i1 %5, %4  ; either case that sdiv cannot handle
+  %7 = select i1 %6, i32 1, i32 %n  ; divisor: 1 in those cases, otherwise n
+  %div66 = sdiv i32 %ret.1, %7  ; ret /= n, using the guarded divisor
+  br label %p
+
+p:                                                ; preds = %f, %if.then44
+  %storemerge3 = phi i32 [ %add47, %if.then44 ], [ %div66, %f ]  ; ret on reaching p
+  br label %for.cond68
+
+for.cond68:                                       ; preds = %for.body72, %p
+  %ret.7 = phi i32 [ %storemerge3, %p ], [ %inc73, %for.body72 ]  ; ret
+  %storemerge4 = phi i32 [ 0, %p ], [ %inc75, %for.body72 ]  ; loop counter i
+  %mul69 = shl nsw i32 %n, 1  ; n * 2
+  %cmp70 = icmp slt i32 %storemerge4, %mul69  ; i < n * 2
+  br i1 %cmp70, label %for.body72, label %s
+
+for.body72:                                       ; preds = %for.cond68
+  %inc73 = add nsw i32 %ret.7, 1  ; ret++
+  %inc75 = add nsw i32 %storemerge4, 1  ; i++
+  br label %for.cond68
+
+s:                                                ; preds = %for.cond68, %for.cond57
+  %ret.8 = phi i32 [ %ret.6, %for.cond57 ], [ %ret.7, %for.cond68 ]  ; final ret from either exit loop
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom  ; &out[id]
+  store i32 %ret.8, i32 addrspace(1)* %arrayidx, align 4  ; out[id] = ret
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nobuiltin nounwind readonly }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.kernels = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization10, !3, !4, !5, !6, !7, !8}
+!3 = !{!"kernel_arg_addr_space", i32 1, i32 0}
+!4 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!5 = !{!"kernel_arg_type", !"int*", !"int"}
+!6 = !{!"kernel_arg_base_type", !"int*", !"int"}
+!7 = !{!"kernel_arg_type_qual", !"", !""}
+!8 = !{!"kernel_arg_name", !"out", !"n"}
+
+; CHECK: spir_kernel void @__vecz_v4_partial_linearization10
+; CHECK: br label %[[WHILEBODY:.+]]
+
+; CHECK: [[WHILEBODY]]:
+; CHECK: %[[CMP:.+]] = icmp
+; CHECK: br i1 %[[CMP]], label %[[FORCONDPREHEADER:.+]], label %[[FORCOND9PREHEADER:.+]]
+
+; CHECK: [[FORCOND9PREHEADER]]:
+; CHECK: br label %[[FORCOND9:.+]]
+
+; CHECK: [[FORCONDPREHEADER]]:
+; CHECK: br label %[[FORCOND:.+]]
+
+; CHECK: [[FORCOND]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[FOREND:.+]]
+
+; CHECK: [[FORBODY]]:
+; CHECK: br label %[[FORCOND]]
+
+; CHECK: [[FOREND]]:
+; CHECK: %[[CMP5:.+]] = icmp
+; CHECK: br i1 %[[CMP5]], label %[[F:.+]], label %[[IFEND17:.+]]
+
+; CHECK: [[FORCOND9]]:
+; CHECK: %[[CMP10:.+]] = icmp
+; CHECK: br i1 %[[CMP10]], label %[[FORBODY12:.+]], label %[[IFEND17LOOPEXIT:.+]]
+
+; CHECK: [[FORBODY12]]:
+; CHECK: br label %[[FORCOND9]]
+
+; CHECK: [[IFEND17LOOPEXIT]]:
+; CHECK: br label %[[IFEND17]]
+
+; CHECK: [[IFEND17]]:
+; CHECK: br label %[[WHILEBODY20:.+]]
+
+; CHECK: [[WHILEBODY20]]:
+; CHECK: %[[AND:.+]] = and i32
+; CHECK: %[[TOBOOL:.+]] = icmp eq i32 %[[AND]]
+; CHECK: br i1 %[[TOBOOL]], label %[[IFELSE26:.+]], label %[[IFTHEN21:.+]]
+
+; CHECK: [[IFTHEN21]]:
+; CHECK: %[[CMP22:.+]] = icmp
+; CHECK: br i1 %[[CMP22]], label %[[J:.+]], label %[[IFEND34:.+]]
+
+; CHECK: [[IFELSE26]]:
+; CHECK: br label %[[IFTHEN29:.+]]
+
+; CHECK: [[IFTHEN29]]:
+; CHECK: br label %[[IFEND34]]
+
+; CHECK: [[IFEND34]]:
+; CHECK: br label %[[O:.+]]
+
+; CHECK: [[IFTHEN37:.+]]:
+; CHECK: br label %[[IFTHEN37ELSE:.+]]
+
+; CHECK: [[IFTHEN37ELSE]]:
+; CHECK: br i1 %{{.+}}, label %[[JELSE:.+]], label %[[JSPLIT:.+]]
+
+; CHECK: [[O]]:
+; CHECK: br i1 %{{.+}}, label %[[WHILEBODY20]], label %[[WHILEBODY20PUREEXIT:.+]]
+
+; CHECK: [[WHILEBODY20PUREEXIT]]:
+; CHECK: br label %[[IFTHEN37]]
+
+; CHECK: [[J]]:
+; CHECK: br label %[[WHILEBODY20PUREEXIT]]
+
+; CHECK: [[JELSE]]:
+; CHECK: br label %[[Q:.+]]
+
+; CHECK: [[JSPLIT]]:
+; CHECK: br label %[[Q]]
+
+; CHECK: [[IFTHEN44:.+]]:
+; CHECK: br label %[[IFTHEN44ELSE:.+]]
+
+; CHECK: [[IFTHEN44ELSE]]:
+; CHECK: br label %[[FORCOND57PREHEADER:.+]]
+
+; CHECK: [[Q]]:
+; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]]
+
+; CHECK: [[WHILEBODYPUREEXIT]]:
+; CHECK: br label %[[IFTHEN44]]
+
+; CHECK: [[FORCOND57PREHEADER]]:
+; CHECK: br label %[[FORCOND57:.+]]
+
+; CHECK: [[FORCOND57PREHEADERELSE:.+]]:
+; CHECK: br i1 %{{.+}}, label %[[FELSE:.+]], label %[[FSPLIT:.+]]
+
+; CHECK: [[FORCOND57]]:
+; CHECK: %[[CMP59:.+]] = icmp
+; CHECK: br i1 %[[CMP59]], label %[[FORBODY61:.+]], label %[[SLOOPEXIT2:.+]]
+
+; CHECK: [[FORBODY61]]:
+; CHECK: br label %[[FORCOND57]]
+
+; CHECK: [[F]]:
+; CHECK: br label %[[WHILEBODYPUREEXIT]]
+
+; CHECK: [[FELSE]]:
+; CHECK: br label %[[P:.+]]
+
+; CHECK: [[FSPLIT]]:
+; CHECK: br label %[[P]]
+
+; CHECK: [[P]]:
+; CHECK: br label %[[FORCOND68:.+]]
+
+; CHECK: [[FORCOND68]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY72:.+]], label %[[SLOOPEXIT:.+]]
+
+; CHECK: [[FORBODY72]]:
+; CHECK: br label %[[FORCOND68]]
+
+; CHECK: [[SLOOPEXIT]]:
+; CHECK: br label %[[S:.+]]
+
+; CHECK: [[SLOOPEXIT2]]:
+; CHECK: br label %[[FORCOND57PREHEADERELSE]]
+
+; CHECK: [[S]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization11.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization11.ll
new file mode 100644
index 0000000000000..8baab708fcb5d
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization11.ll
@@ -0,0 +1,357 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k partial_linearization11 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | %filecheck %s
+
+; The CFG of the following kernel is:
+;
+;       a
+;       |
+;       b <-------.
+;       |         |
+;       c <---.   |
+;      / \    |   |
+;     d   e   |   |
+;    / \ / \  |   |
+;   i   f   g |   |
+;   |  / \ / \|   |
+;   | j   h --'   |
+;   | |        \  |
+;   | |         k |
+;   |  \       /  |
+;   |   \     /   |
+;   |    \   /    |
+;   |     \ /     |
+;   |      l -----'
+;   |     /
+;    \   m
+;     \ /
+;      n
+;
+; * where nodes c, d, f, g, and l are uniform branches, and node e is a
+;   varying branch.
+; * where nodes i, f, g, j, h, k, l, m and n are divergent.
+;
+; With partial linearization, it will be transformed as follows:
+;
+;     a
+;     |
+;     b <----.
+;     |      |
+;     c <--. |
+;    / \   | |
+;   d   e  | |
+;   |   |  | |
+;   |   g  | |
+;    \ /   | |
+;     f    | |
+;     |    | |
+;     h ---' |
+;     |      |
+;     k      |
+;     |      |
+;     j      |
+;     |      |
+;     l -----'
+;     |
+;     m
+;     |
+;     i
+;     |
+;     n
+;
+; __kernel void partial_linearization11(__global int *out, int n) {
+;   // a
+;   int id = get_global_id(0);
+;   int ret = 0;
+;
+;   while (1) {
+;     // b
+;     while (1) {
+;       if (n < 5) { // c
+;         // d
+;         for (int i = 0; i < n * 2; i++) ret++;
+;         if (n <= 3) {
+;           // i
+;           goto i;
+;         }
+;       } else {
+;         // e
+;         if (ret + id >= n) {
+;           // g
+;           ret /= n * n + ret;
+;           if (n <= 10) {
+;             goto k;
+;           } else {
+;             goto h;
+;           }
+;         }
+;       }
+;       // f
+;       ret *= n;
+;       if (n & 1) {
+;         goto j;
+;       }
+;
+;       // h
+; h:
+;       ret++;
+;     }
+;
+; j:
+;     ret += n * 2 + 20;
+;     goto l;
+;
+; k:
+;     ret *= n;
+;     goto l;
+;
+; l:
+;     if (n & 1) {
+;       // m
+;       ret++;
+;       goto m;
+;     }
+;   }
+;
+; m:
+;   for (int i = 0; i < n / 4; i++) ret++;
+;   goto n;
+;
+; i:
+;   ret /= n;
+;
+; n:
+;   out[id] = ret;
+; }
+
+; ModuleID = 'Unknown buffer'
+source_filename = "Unknown buffer"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @partial_linearization11(i32 addrspace(1)* %out, i32 %n) #0 { ; test input; blocks below are tagged with the CFG node letters of the diagram in this file's header
+entry:                                            ; node a: fetch the work-item id
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %conv = trunc i64 %call to i32
+  br label %while.body
+
+while.body:                                       ; preds = %if.end33, %entry | node b: outer loop header
+  %ret.0 = phi i32 [ 0, %entry ], [ %storemerge, %if.end33 ]
+  br label %while.body2
+
+while.body2:                                      ; preds = %h, %while.body | node c: inner loop header, tests n < 5
+  %ret.1 = phi i32 [ %ret.0, %while.body ], [ %inc24, %h ]
+  %cmp = icmp slt i32 %n, 5
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %while.body2 | node d: entry to the counted loop
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %if.then | node d: loop while i < n * 2
+  %ret.2 = phi i32 [ %ret.1, %if.then ], [ %inc, %for.body ]
+  %storemerge2 = phi i32 [ 0, %if.then ], [ %inc6, %for.body ]
+  %mul = shl nsw i32 %n, 1
+  %cmp4 = icmp slt i32 %storemerge2, %mul
+  br i1 %cmp4, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond | node d: loop body (ret++, i++)
+  %inc = add nsw i32 %ret.2, 1
+  %inc6 = add nsw i32 %storemerge2, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond | node d: exit test, n < 4 leaves both loops for node i
+  %cmp7 = icmp slt i32 %n, 4
+  br i1 %cmp7, label %i44, label %if.end20
+
+if.else:                                          ; preds = %while.body2 | node e: condition uses the work-item id (%conv)
+  %add = add nsw i32 %ret.1, %conv
+  %cmp10 = icmp slt i32 %add, %n
+  br i1 %cmp10, label %if.end20, label %if.then12
+
+if.then12:                                        ; preds = %if.else | node g: guarded ret /= n * n + ret
+  %mul13 = mul nsw i32 %n, %n
+  %add14 = add nsw i32 %ret.1, %mul13
+  %0 = icmp eq i32 %ret.1, -2147483648
+  %1 = icmp eq i32 %add14, -1
+  %2 = and i1 %0, %1
+  %3 = icmp eq i32 %add14, 0
+  %4 = or i1 %3, %2
+  %5 = select i1 %4, i32 1, i32 %add14 ; divisor becomes 1 when it is 0 or when the sdiv would be INT_MIN / -1
+  %div = sdiv i32 %ret.1, %5
+  %cmp15 = icmp slt i32 %n, 11
+  br i1 %cmp15, label %k, label %h
+
+if.end20:                                         ; preds = %if.else, %for.end | node f: ret *= n, then tests n & 1
+  %ret.3 = phi i32 [ %ret.2, %for.end ], [ %ret.1, %if.else ]
+  %mul21 = mul nsw i32 %ret.3, %n
+  %and = and i32 %n, 1
+  %tobool = icmp eq i32 %and, 0
+  br i1 %tobool, label %h, label %j
+
+h:                                                ; preds = %if.end20, %if.then12 | node h: inner loop latch (ret++)
+  %ret.4 = phi i32 [ %div, %if.then12 ], [ %mul21, %if.end20 ]
+  %inc24 = add nsw i32 %ret.4, 1
+  br label %while.body2
+
+j:                                                ; preds = %if.end20 | node j: ret += n * 2 + 20
+  %mul25 = mul i32 %n, 2
+  %add26 = add nsw i32 %mul25, 20
+  %add27 = add nsw i32 %add26, %mul21
+  br label %l
+
+k:                                                ; preds = %if.then12 | node k: ret *= n
+  %mul28 = mul nsw i32 %div, %n
+  br label %l
+
+l:                                                ; preds = %k, %j | node l: tests n & 1 to leave the outer loop
+  %storemerge = phi i32 [ %add27, %j ], [ %mul28, %k ]
+  %and29 = and i32 %n, 1
+  %tobool30 = icmp eq i32 %and29, 0
+  br i1 %tobool30, label %if.end33, label %if.then31
+
+if.then31:                                        ; preds = %l | outer loop exit towards node m
+  br label %for.cond35
+
+if.end33:                                         ; preds = %l | outer loop latch, back to node b
+  br label %while.body
+
+for.cond35:                                       ; preds = %for.body39, %if.then31 | node m: loop while i < n / 4
+  %ret.5.in = phi i32 [ %storemerge, %if.then31 ], [ %ret.5, %for.body39 ]
+  %storemerge1 = phi i32 [ 0, %if.then31 ], [ %inc42, %for.body39 ]
+  %ret.5 = add nsw i32 %ret.5.in, 1
+  %div36 = sdiv i32 %n, 4
+  %cmp37 = icmp slt i32 %storemerge1, %div36
+  br i1 %cmp37, label %for.body39, label %n46
+
+for.body39:                                       ; preds = %for.cond35 | node m: loop body (i++)
+  %inc42 = add nsw i32 %storemerge1, 1
+  br label %for.cond35
+
+i44:                                              ; preds = %for.end | node i: guarded ret /= n
+  %6 = icmp eq i32 %ret.2, -2147483648
+  %7 = icmp eq i32 %n, -1
+  %8 = and i1 %7, %6
+  %9 = icmp eq i32 %n, 0
+  %10 = or i1 %9, %8
+  %11 = select i1 %10, i32 1, i32 %n ; divisor becomes 1 when n is 0 or when the sdiv would be INT_MIN / -1
+  %div45 = sdiv i32 %ret.2, %11
+  br label %n46
+
+n46:                                              ; preds = %i44, %for.cond35 | node n: store ret to out[id]
+  %ret.6 = phi i32 [ %div45, %i44 ], [ %ret.5, %for.cond35 ]
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %ret.6, i32 addrspace(1)* %arrayidx, align 4
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nobuiltin nounwind readonly }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.kernels = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization11, !3, !4, !5, !6, !7, !8}
+!3 = !{!"kernel_arg_addr_space", i32 1, i32 0}
+!4 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!5 = !{!"kernel_arg_type", !"int*", !"int"}
+!6 = !{!"kernel_arg_base_type", !"int*", !"int"}
+!7 = !{!"kernel_arg_type_qual", !"", !""}
+!8 = !{!"kernel_arg_name", !"out", !"n"}
+
+; CHECK: spir_kernel void @__vecz_v4_partial_linearization11
+; CHECK: br label %[[WHILEBODY:.+]]
+
+; CHECK: [[WHILEBODY]]:
+; CHECK: br label %[[WHILEBODY2:.+]]
+
+; CHECK: [[WHILEBODY2]]:
+; CHECK: %[[CMP:.+]] = icmp
+; CHECK: br i1 %[[CMP]], label %[[FORCONDPREHEADER:.+]], label %[[IFELSE:.+]]
+
+; CHECK: [[FORCONDPREHEADER]]:
+; CHECK: br label %[[FORCOND:.+]]
+
+; CHECK: [[FORCOND]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[FOREND:.+]]
+
+; CHECK: [[FORBODY]]:
+; CHECK: br label %[[FORCOND]]
+
+; CHECK: [[FOREND]]:
+; CHECK: br label %[[IFEND20:.+]]
+
+; CHECK: [[IFELSE]]:
+; CHECK: br label %[[IFTHEN12:.+]]
+
+; CHECK: [[IFTHEN12]]:
+; CHECK: br label %[[IFEND20]]
+
+; CHECK: [[IFEND20]]:
+; CHECK: br label %[[H:.+]]
+
+; CHECK: [[H]]:
+; CHECK: br i1 %{{.+}}, label %[[WHILEBODY2]], label %[[WHILEBODY2PUREEXIT:.+]]
+
+; CHECK: [[WHILEBODY2PUREEXIT]]:
+; CHECK: br label %[[K:.+]]
+
+; CHECK: [[J:.+]]:
+; CHECK: br label %[[L:.+]]
+
+; CHECK: [[K]]:
+; CHECK: br label %[[KELSE:.+]]
+
+; CHECK: [[KELSE]]:
+; CHECK: br label %[[J]]
+
+; CHECK: [[L]]:
+; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]]
+
+; CHECK: [[WHILEBODYPUREEXIT]]:
+; CHECK: br label %[[FORCOND35PREHEADER:.+]]
+
+; CHECK: [[FORCOND35PREHEADER]]:
+; CHECK: br label %[[FORCOND35:.+]]
+
+; CHECK: [[FORCOND35PREHEADERELSE:.+]]:
+; CHECK: br label %[[I44:.+]]
+
+; CHECK: [[FORCOND35]]:
+; CHECK: %[[CMP37:.+]] = icmp
+; CHECK: br i1 %[[CMP37]], label %[[FORBODY39:.+]], label %[[N46LOOPEXIT:.+]]
+
+; CHECK: [[FORBODY39]]:
+; CHECK: br label %[[FORCOND35]]
+
+; CHECK: [[I44]]:
+; CHECK: br label %[[N46:.+]]
+
+; CHECK: [[N46LOOPEXIT]]:
+; CHECK: br label %[[FORCOND35PREHEADERELSE]]
+
+; CHECK: [[N46]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization12.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization12.ll
new file mode 100644
index 0000000000000..8ba5b404bd7c8
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization12.ll
@@ -0,0 +1,627 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k partial_linearization12 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | %filecheck %s
+
+; The CFG of the following kernel is:
+;
+;              a
+;              |
+;              b <-----.
+;             / \      |
+;            c   d     |
+;           / \ /      |
+;          /   e       |
+;         /    |       |
+;        /     g <---. |
+;       f     / \    | |
+;       |    h   i   | |
+;       |   /   / \  | |
+;       |  /   k   l | |
+;       | /    |\ /| | |
+;       |/     |/ \| | |
+;       j      m   n | |
+;      /|     / \ /  | |
+;     / |    o   p --' |
+;    /  |   /   /      |
+;   |   |  /   r       |
+;   |   | /    |       |
+;   |   |/     s ------'
+;   |   |     /
+;   |  /|    t
+;   | / |   /
+;   |/  |  /
+;   q   | /
+;   |   |/
+;   |   u
+;    \ /
+;     v
+;
+; * where nodes b, c, g, j, k, l, m, p and s are uniform branches,
+;   and node i is a varying branch.
+; * where nodes k, l, o, n, m, p, q, s, r, t and v are divergent.
+;
+; With partial linearization, it will be transformed as follows:
+;
+;         a
+;         |
+;         b <----.
+;        / \     |
+;       c   d    |
+;      / \ /     |
+;     /   e      |
+;    /    |      |
+;   f     g <--. |
+;   |    / \   | |
+;   |   h   i  | |
+;   |  /    |  | |
+;   | /     l  | |
+;   |/      |  | |
+;   j       k  | |
+;   |\      |  | |
+;   | \     n  | |
+;   |  \    |  | |
+;   |   |   m  | |
+;   |   |   |  | |
+;   |   |   p -' |
+;   |   |  /     |
+;   |   | r      |
+;   |   | |      |
+;   |   | s -----'
+;   |   |/
+;   |   o
+;   |  /
+;   | t
+;   |/
+;   u
+;   |
+;   q
+;   |
+;   v
+;
+; __kernel void partial_linearization12(__global int *out, int n) {
+;   // a
+;   int id = get_global_id(0);
+;   int ret = 0;
+;
+;   while (1) {
+;     if (n > 0) { // b
+;       // c
+;       for (int i = 0; i < n * 2; i++) ret++;
+;       if (n < 5) {
+;         // f
+;         goto f;
+;       }
+;     } else {
+;       // d
+;       for (int i = 0; i < n / 4; i++) ret++;
+;     }
+;     // e
+;     ret++;
+;     while (1) {
+;       if (n <= 2) { // g
+;         // h
+;         ret -= n * ret;
+;         for (int i = 0; i < n * 2; i++) ret++;
+;         // j
+;         goto j;
+;       } else {
+;         // i
+;         if (ret + id >= n) {
+;           // k
+;           ret /= n * n + ret;
+;           if (n < 5) {
+;             // m
+;             ret -= n;
+;             goto m;
+;           } else {
+;             // n
+;             ret += n;
+;             goto n;
+;           }
+;         } else {
+;           // l
+;           if (n >= 5) {
+;             // m
+;             ret += n;
+;             goto m;
+;           } else {
+;             // n
+;             ret -= n;
+;             goto n;
+;           }
+;         }
+;       }
+;       // m
+; m:
+;       if (n & 1) {
+;         // o
+;         ret *= n;
+;         goto q;
+;       } else {
+;         // p
+;         goto p;
+;       }
+;
+;       // n
+; n:
+;       ret *= ret;
+;       // p
+; p:
+;       if (n > 3) {
+;         goto r;
+;       }
+;       ret++;
+;     }
+;
+;     // r
+; r:
+;     ret *= 4;
+;     for (int i = 0; i < n / 4; i++) ret++;
+;
+;     // s
+;     if (n & 1) {
+;       goto t;
+;     }
+;     ret++;
+;   }
+;
+; f:
+;   ret /= n;
+;   goto j;
+;
+; j:
+;   if (n == 2) {
+;     goto q;
+;   } else {
+;     goto u;
+;   }
+;
+; t:
+;   for (int i = 0; i < n + 1; i++) ret++;
+;   goto u;
+;
+; q:
+;   for (int i = 0; i < n / 4; i++) ret++;
+;   goto v;
+;
+; u:
+;   for (int i = 0; i < n * 2; i++) ret++;
+;
+; v:
+;   out[id] = ret;
+; }
+
+; ModuleID = 'Unknown buffer'
+source_filename = "Unknown buffer"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @partial_linearization12(i32 addrspace(1)* %out, i32 %n) #0 { ; test input; blocks below are tagged with the CFG node letters of the diagram in this file's header
+entry:                                            ; node a: fetch the work-item id
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %conv = trunc i64 %call to i32
+  br label %while.body
+
+while.body:                                       ; preds = %if.end79, %entry | node b: outer loop header, tests n > 0
+  %storemerge = phi i32 [ 0, %entry ], [ %inc80, %if.end79 ]
+  %cmp = icmp sgt i32 %n, 0
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %while.body | node c: entry to the counted loop
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %if.then | node c: loop while i < n * 2
+  %ret.0 = phi i32 [ %storemerge, %if.then ], [ %inc, %for.body ]
+  %storemerge10 = phi i32 [ 0, %if.then ], [ %inc4, %for.body ]
+  %mul = shl nsw i32 %n, 1
+  %cmp2 = icmp slt i32 %storemerge10, %mul
+  br i1 %cmp2, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond | node c: loop body (ret++, i++)
+  %inc = add nsw i32 %ret.0, 1
+  %inc4 = add nsw i32 %storemerge10, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond | node c: exit test, n < 5 leaves the outer loop for node f
+  %cmp5 = icmp slt i32 %n, 5
+  br i1 %cmp5, label %f, label %if.end17
+
+if.else:                                          ; preds = %while.body | node d: entry to the counted loop
+  br label %for.cond9
+
+for.cond9:                                        ; preds = %for.body12, %if.else | node d: loop while i < n / 4
+  %ret.1 = phi i32 [ %storemerge, %if.else ], [ %inc13, %for.body12 ]
+  %storemerge1 = phi i32 [ 0, %if.else ], [ %inc15, %for.body12 ]
+  %div = sdiv i32 %n, 4
+  %cmp10 = icmp slt i32 %storemerge1, %div
+  br i1 %cmp10, label %for.body12, label %if.end17
+
+for.body12:                                       ; preds = %for.cond9 | node d: loop body (ret++, i++)
+  %inc13 = add nsw i32 %ret.1, 1
+  %inc15 = add nsw i32 %storemerge1, 1
+  br label %for.cond9
+
+if.end17:                                         ; preds = %for.cond9, %for.end | node e: merge of nodes c and d
+  %ret.2 = phi i32 [ %ret.0, %for.end ], [ %ret.1, %for.cond9 ]
+  br label %while.body20
+
+while.body20:                                     ; preds = %if.end63, %if.end17 | node g: inner loop header, tests n < 3
+  %storemerge2.in = phi i32 [ %ret.2, %if.end17 ], [ %ret.4, %if.end63 ]
+  %storemerge2 = add nsw i32 %storemerge2.in, 1 ; single increment applied to both the loop-entry and back-edge values
+  %cmp21 = icmp slt i32 %n, 3
+  br i1 %cmp21, label %if.then23, label %if.else35
+
+if.then23:                                        ; preds = %while.body20 | node h: ret -= n * ret
+  %mul24 = mul nsw i32 %storemerge2, %n
+  %sub = sub nsw i32 %storemerge2, %mul24
+  br label %for.cond26
+
+for.cond26:                                       ; preds = %for.body30, %if.then23 | node h: loop while i < n * 2, exits to node j
+  %ret.3 = phi i32 [ %sub, %if.then23 ], [ %inc31, %for.body30 ]
+  %storemerge9 = phi i32 [ 0, %if.then23 ], [ %inc33, %for.body30 ]
+  %mul27 = shl nsw i32 %n, 1
+  %cmp28 = icmp slt i32 %storemerge9, %mul27
+  br i1 %cmp28, label %for.body30, label %j
+
+for.body30:                                       ; preds = %for.cond26 | node h: loop body (ret++, i++)
+  %inc31 = add nsw i32 %ret.3, 1
+  %inc33 = add nsw i32 %storemerge9, 1
+  br label %for.cond26
+
+if.else35:                                        ; preds = %while.body20 | node i: condition uses the work-item id (%conv)
+  %add = add nsw i32 %storemerge2, %conv
+  %cmp36 = icmp slt i32 %add, %n
+  br i1 %cmp36, label %if.else48, label %if.then38
+
+if.then38:                                        ; preds = %if.else35 | node k: guarded ret /= n * n + ret, then tests n < 5
+  %mul39 = mul nsw i32 %n, %n
+  %add40 = add nsw i32 %storemerge2, %mul39
+  %0 = icmp eq i32 %add40, 0
+  %1 = select i1 %0, i32 1, i32 %add40 ; divisor becomes 1 when it is 0
+  %div41 = sdiv i32 %storemerge2, %1
+  %cmp42 = icmp slt i32 %n, 5
+  br i1 %cmp42, label %if.then44, label %if.else46
+
+if.then44:                                        ; preds = %if.then38 | edge k -> m (ret -= n)
+  %sub45 = sub nsw i32 %div41, %n
+  br label %m
+
+if.else46:                                        ; preds = %if.then38 | edge k -> n (ret += n)
+  %add47 = add nsw i32 %div41, %n
+  br label %n58
+
+if.else48:                                        ; preds = %if.else35 | node l: tests n > 4
+  %cmp49 = icmp sgt i32 %n, 4
+  br i1 %cmp49, label %if.then51, label %if.else53
+
+if.then51:                                        ; preds = %if.else48 | edge l -> m (ret += n)
+  %add52 = add nsw i32 %storemerge2, %n
+  br label %m
+
+if.else53:                                        ; preds = %if.else48 | edge l -> n (ret -= n)
+  %sub54 = sub nsw i32 %storemerge2, %n
+  br label %n58
+
+m:                                                ; preds = %if.then51, %if.then44 | node m: tests n & 1
+  %storemerge7 = phi i32 [ %add52, %if.then51 ], [ %sub45, %if.then44 ]
+  %and = and i32 %n, 1
+  %tobool = icmp eq i32 %and, 0
+  br i1 %tobool, label %p, label %if.then55
+
+if.then55:                                        ; preds = %m | node o: ret *= n, leaves both loops for node q
+  %mul56 = mul nsw i32 %storemerge7, %n
+  br label %q
+
+n58:                                              ; preds = %if.else53, %if.else46 | node n: ret *= ret
+  %storemerge3 = phi i32 [ %sub54, %if.else53 ], [ %add47, %if.else46 ]
+  %mul59 = mul nsw i32 %storemerge3, %storemerge3
+  br label %p
+
+p:                                                ; preds = %n58, %m | node p: n > 3 leaves the inner loop for node r
+  %ret.4 = phi i32 [ %mul59, %n58 ], [ %storemerge7, %m ]
+  %cmp60 = icmp sgt i32 %n, 3
+  br i1 %cmp60, label %r, label %if.end63
+
+if.end63:                                         ; preds = %p | inner loop latch, back to node g
+  br label %while.body20
+
+r:                                                ; preds = %p | node r: ret *= 4
+  %mul65 = shl nsw i32 %ret.4, 2
+  br label %for.cond67
+
+for.cond67:                                       ; preds = %for.body71, %r | node r: loop while i < n / 4
+  %ret.5 = phi i32 [ %mul65, %r ], [ %inc72, %for.body71 ]
+  %storemerge4 = phi i32 [ 0, %r ], [ %inc74, %for.body71 ]
+  %div68 = sdiv i32 %n, 4
+  %cmp69 = icmp slt i32 %storemerge4, %div68
+  br i1 %cmp69, label %for.body71, label %for.end75
+
+for.body71:                                       ; preds = %for.cond67 | node r: loop body (ret++, i++)
+  %inc72 = add nsw i32 %ret.5, 1
+  %inc74 = add nsw i32 %storemerge4, 1
+  br label %for.cond67
+
+for.end75:                                        ; preds = %for.cond67 | node s: n & 1 leaves the outer loop for node t
+  %and76 = and i32 %n, 1
+  %tobool77 = icmp eq i32 %and76, 0
+  br i1 %tobool77, label %if.end79, label %t
+
+if.end79:                                         ; preds = %for.end75 | outer loop latch (ret++), back to node b
+  %inc80 = add nsw i32 %ret.5, 1
+  br label %while.body
+
+f:                                                ; preds = %for.end | node f: guarded ret /= n
+  %2 = icmp eq i32 %n, 0
+  %3 = select i1 %2, i32 1, i32 %n ; divisor becomes 1 when n is 0
+  %div81 = sdiv i32 %ret.0, %3
+  br label %j
+
+j:                                                ; preds = %f, %for.cond26 | node j: n == 2 goes to node q, otherwise node u
+  %ret.6 = phi i32 [ %div81, %f ], [ %ret.3, %for.cond26 ]
+  %cmp82 = icmp eq i32 %n, 2
+  br i1 %cmp82, label %q, label %u
+
+t:                                                ; preds = %for.end75 | node t: entry to the counted loop
+  br label %for.cond87
+
+for.cond87:                                       ; preds = %for.body91, %t | node t: loop exits to node u once i > n
+  %ret.7 = phi i32 [ %ret.5, %t ], [ %inc92, %for.body91 ]
+  %storemerge5 = phi i32 [ 0, %t ], [ %inc94, %for.body91 ]
+  %cmp89 = icmp sgt i32 %storemerge5, %n
+  br i1 %cmp89, label %u, label %for.body91
+
+for.body91:                                       ; preds = %for.cond87 | node t: loop body (ret++, i++)
+  %inc92 = add nsw i32 %ret.7, 1
+  %inc94 = add nsw i32 %storemerge5, 1
+  br label %for.cond87
+
+q:                                                ; preds = %j, %if.then55 | node q: entry to the counted loop
+  %ret.8 = phi i32 [ %mul56, %if.then55 ], [ %ret.6, %j ]
+  br label %for.cond97
+
+for.cond97:                                       ; preds = %for.body101, %q | node q: loop while i < n / 4, exits to node v
+  %ret.9 = phi i32 [ %ret.8, %q ], [ %inc102, %for.body101 ]
+  %storemerge8 = phi i32 [ 0, %q ], [ %inc104, %for.body101 ]
+  %div98 = sdiv i32 %n, 4
+  %cmp99 = icmp slt i32 %storemerge8, %div98
+  br i1 %cmp99, label %for.body101, label %v
+
+for.body101:                                      ; preds = %for.cond97 | node q: loop body (ret++, i++)
+  %inc102 = add nsw i32 %ret.9, 1
+  %inc104 = add nsw i32 %storemerge8, 1
+  br label %for.cond97
+
+u:                                                ; preds = %for.cond87, %j | node u: entry to the counted loop
+  %ret.10 = phi i32 [ %ret.6, %j ], [ %ret.7, %for.cond87 ]
+  br label %for.cond107
+
+for.cond107:                                      ; preds = %for.body111, %u | node u: loop while i < n * 2, exits to node v
+  %ret.11 = phi i32 [ %ret.10, %u ], [ %inc112, %for.body111 ]
+  %storemerge6 = phi i32 [ 0, %u ], [ %inc114, %for.body111 ]
+  %mul108 = shl nsw i32 %n, 1
+  %cmp109 = icmp slt i32 %storemerge6, %mul108
+  br i1 %cmp109, label %for.body111, label %v
+
+for.body111:                                      ; preds = %for.cond107 | node u: loop body (ret++, i++)
+  %inc112 = add nsw i32 %ret.11, 1
+  %inc114 = add nsw i32 %storemerge6, 1
+  br label %for.cond107
+
+v:                                                ; preds = %for.cond107, %for.cond97 | node v: store ret to out[id]
+  %ret.12 = phi i32 [ %ret.9, %for.cond97 ], [ %ret.11, %for.cond107 ]
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %ret.12, i32 addrspace(1)* %arrayidx, align 4
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nobuiltin nounwind readonly }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.kernels = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization12, !3, !4, !5, !6, !7, !8}
+!3 = !{!"kernel_arg_addr_space", i32 1, i32 0}
+!4 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!5 = !{!"kernel_arg_type", !"int*", !"int"}
+!6 = !{!"kernel_arg_base_type", !"int*", !"int"}
+!7 = !{!"kernel_arg_type_qual", !"", !""}
+!8 = !{!"kernel_arg_name", !"out", !"n"}
+
+; CHECK: spir_kernel void @__vecz_v4_partial_linearization12
+; CHECK: br label %[[WHILEBODY:.+]]
+
+; CHECK: [[WHILEBODY]]:
+; CHECK: %[[CMP:.+]] = icmp
+; CHECK: br i1 %[[CMP]], label %[[FORCONDPREHEADER:.+]], label %[[FORCOND9PREHEADER:.+]]
+
+; CHECK: [[FORCOND9PREHEADER]]:
+; CHECK: br label %[[FORCOND9:.+]]
+
+; CHECK: [[FORCONDPREHEADER]]:
+; CHECK: br label %[[FORCOND:.+]]
+
+; CHECK: [[FORCOND]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[FOREND:.+]]
+
+; CHECK: [[FORBODY]]:
+; CHECK: br label %[[FORCOND]]
+
+; CHECK: [[FOREND]]:
+; CHECK: %[[CMP5:.+]] = icmp
+; CHECK: br i1 %[[CMP5]], label %[[F:.+]], label %[[IFEND17:.+]]
+
+; CHECK: [[FORCOND9]]:
+; CHECK: %[[CMP10:.+]] = icmp
+; CHECK: br i1 %[[CMP10]], label %[[FORBODY12:.+]], label %[[IFEND17LOOPEXIT:.+]]
+
+; CHECK: [[FORBODY12]]:
+; CHECK: br label %[[FORCOND9]]
+
+; CHECK: [[IFEND17LOOPEXIT]]:
+; CHECK: br label %[[IFEND17]]
+
+; CHECK: [[IFEND17]]:
+; CHECK: br label %[[WHILEBODY20:.+]]
+
+; CHECK: [[WHILEBODY20]]:
+; CHECK: %[[CMP21:.+]] = icmp
+; CHECK: br i1 %[[CMP21]], label %[[IFTHEN23:.+]], label %[[IFELSE35:.+]]
+
+; CHECK: [[IFTHEN23]]:
+; CHECK: br label %[[WHILEBODYPUREEXIT:.+]]
+
+; CHECK: [[IFTHEN23ELSE:.+]]:
+; CHECK: br i1 %{{.+}}, label %[[FELSE:.+]], label %[[FSPLIT:.+]]
+
+; CHECK: [[IFTHEN23SPLIT:.+]]:
+; CHECK: br label %[[FORCOND26:.+]]
+
+; CHECK: [[FORCOND26]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY30:.+]], label %[[JLOOPEXIT:.+]]
+
+; CHECK: [[FORBODY30]]:
+; CHECK: br label %[[FORCOND26]]
+
+; CHECK: [[IFELSE35]]:
+; CHECK: br label %[[IFTHEN38:.+]]
+
+; CHECK: [[IFTHEN38]]:
+; CHECK: %[[CMP42:.+]] = icmp slt i32
+; CHECK: br i1 %[[CMP42]], label %[[IFTHEN44:.+]], label %[[IFELSE46:.+]]
+
+; CHECK: [[IFTHEN44]]:
+; CHECK: br label %[[IFELSE48:.+]]
+
+; CHECK: [[IFELSE46]]:
+; CHECK: br label %[[IFELSE48]]
+
+; CHECK: [[IFELSE48]]:
+; CHECK: %[[CMP49:.+]] = icmp
+; CHECK: br i1 %[[CMP49]], label %[[IFTHEN51:.+]], label %[[IFELSE53:.+]]
+
+; CHECK: [[IFTHEN51]]:
+; CHECK: br label %[[N58:.+]]
+
+; CHECK: [[IFELSE53]]:
+; CHECK: br label %[[N58]]
+
+; CHECK: [[M:.+]]:
+; CHECK: br label %[[P:.+]]
+
+; CHECK: [[IFTHEN55:.+]]:
+; CHECK: br label %[[IFTHEN55ELSE:.+]]
+
+; CHECK: [[IFTHEN55ELSE]]:
+; CHECK: br label %[[FORCOND87PREHEADER:.+]]
+
+; CHECK: [[N58]]:
+; CHECK: br label %[[M]]
+
+; CHECK: [[P]]:
+; CHECK: br i1 %{{.+}}, label %[[WHILEBODY20]], label %[[WHILEBODY20PUREEXIT:.+]]
+
+; CHECK: [[WHILEBODY20PUREEXIT]]:
+; CHECK: br label %[[R:.+]]
+
+; CHECK: [[R]]:
+; CHECK: br label %[[FORCOND67:.+]]
+
+; CHECK: [[FORCOND67]]:
+; CHECK: %[[CMP69:.+]] = icmp
+; CHECK: br i1 %[[CMP69]], label %[[FORBODY71:.+]], label %[[FOREND75:.+]]
+
+; CHECK: [[FORBODY71]]:
+; CHECK: br label %[[FORCOND67]]
+
+; CHECK: [[FOREND75]]:
+; CHECK: br label %[[IFEND79:.+]]
+
+; CHECK: [[FORCOND87PREHEADER]]:
+; CHECK: br label %[[FORCOND87:.+]]
+
+; CHECK: [[FORCOND87PREHEADERELSE:.+]]:
+; CHECK: br i1 %{{.+}}, label %[[IFTHEN23ELSE]], label %[[IFTHEN23SPLIT]]
+
+; CHECK: [[IFEND79]]:
+; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT]]
+
+; CHECK: [[WHILEBODYPUREEXIT]]:
+; CHECK: br label %[[IFTHEN55]]
+
+; CHECK: [[F]]:
+; CHECK: br label %[[WHILEBODYPUREEXIT]]
+
+; CHECK: [[FELSE]]:
+; CHECK: br label %[[U:.+]]
+
+; CHECK: [[FSPLIT]]:
+; CHECK: br label %[[J:.+]]
+
+; CHECK: [[JLOOPEXIT]]:
+; CHECK: br label %[[J]]
+
+; CHECK: [[J]]:
+; CHECK: %[[CMP82:.+]] = icmp
+; CHECK: br i1 %[[CMP82]], label %[[Q:.+]], label %[[U]]
+
+; CHECK: [[FORCOND87]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(true)}}, label %[[ULOOPEXIT:.+]], label %[[FORBODY91:.+]]
+
+; CHECK: [[FORBODY91]]:
+; CHECK: br label %[[FORCOND87]]
+
+; CHECK: [[Q]]:
+; CHECK: br label %[[FORCOND97:.+]]
+
+; CHECK: [[FORCOND97]]:
+; CHECK: %[[CMP99:.+]] = icmp
+; CHECK: br i1 %[[CMP99]], label %[[FORBODY101:.+]], label %[[VLOOPEXIT:.+]]
+
+; CHECK: [[FORBODY101]]:
+; CHECK: br label %[[FORCOND97]]
+
+; CHECK: [[ULOOPEXIT]]:
+; CHECK: br label %[[FORCOND87PREHEADERELSE]]
+
+; CHECK: [[U]]:
+; CHECK: br label %[[FORCOND107:.+]]
+
+; CHECK: [[FORCOND107]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY111:.+]], label %[[VLOOPEXIT2:.+]]
+
+; CHECK: [[FORBODY111]]:
+; CHECK: br label %[[FORCOND107]]
+
+; CHECK: [[VLOOPEXIT]]:
+; CHECK: br label %[[V:.+]]
+
+; CHECK: [[VLOOPEXIT2]]:
+; CHECK: br label %[[Q]]
+
+; CHECK: [[V]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization13.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization13.ll
new file mode 100644
index 0000000000000..237b24cf1605b
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization13.ll
@@ -0,0 +1,218 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k partial_linearization13 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -S < %s | %filecheck %s
+
+; The CFG of the following kernel is:
+;
+;     a
+;    / \
+;   b   c
+;    \ / \
+;     |   \
+;     |    d
+;     |   / \
+;     |  |   e
+;     |   \ /
+;     |    f
+;     |   / \
+;     |  |   g
+;     |   \ /
+;      \   h
+;       \ /
+;        i
+;
+; * where nodes d and f are uniform branches, and nodes a and c are varying
+;   branches.
+; * where nodes b, c, i are divergent.
+;
+; With partial linearization, it will be transformed as follows:
+;
+;     a
+;     |
+;     c
+;     |
+;     d
+;    / \
+;   |   e
+;    \ /
+;     f
+;    / \
+;   |   g
+;    \ /
+;     h
+;     |
+;     b
+;     |
+;     i
+;
+; __kernel void partial_linearization13(__global int *out, int n) {
+;   size_t tid = get_global_id(0);
+;   size_t size = get_global_size(0);
+;   // a
+;   if (tid + 1 < size) {
+;     // b
+;     out[tid] = n;
+;   } else if (tid + 1 == size) { // c
+;     size_t leftovers = 1 + (size & 1);
+;     switch (leftovers) { // d
+;       case 2: // e
+;         out[tid] = 2 * n + 1;
+;         // fall through
+;       case 1: // f
+;         out[tid] += 3 * n - 1;
+;         break;
+;     }
+;     switch (leftovers) { // still f: this branch closes node f
+;       case 2: // g
+;         out[tid] /= n;
+;         // fall through
+;       case 1: // h
+;         out[tid]--;
+;         break;
+;     }
+;   }
+;   // i
+; }
+
+; ModuleID = 'Unknown buffer'
+source_filename = "Unknown buffer"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @partial_linearization13(i32 addrspace(1)* %out, i32 %n) #0 { ; scalar input kernel; trailing block comments name the CFG node (a-i) from the diagram above
+entry:                                            ; CFG node a: branches on tid + 1 < size
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call1 = call spir_func i64 @_Z15get_global_sizej(i32 0) #2
+  %add = add i64 %call, 1
+  %cmp = icmp ult i64 %add, %call1
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry -- CFG node b: out[tid] = n
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %n, i32 addrspace(1)* %arrayidx, align 4
+  br label %if.end17
+
+if.else:                                          ; preds = %entry -- CFG node c: branches on tid + 1 == size
+  %add2 = add i64 %call, 1
+  %cmp3 = icmp eq i64 %add2, %call1
+  br i1 %cmp3, label %if.then4, label %if.end17
+
+if.then4:                                         ; preds = %if.else -- CFG node d: first switch, skips node e when size is even
+  %0 = and i64 %call1, 1
+  %trunc = icmp eq i64 %0, 0
+  br i1 %trunc, label %sw.bb8, label %sw.bb
+
+sw.bb:                                            ; preds = %if.then4 -- CFG node e: out[tid] = 2 * n + 1
+  %mul = shl nsw i32 %n, 1
+  %add6 = or i32 %mul, 1
+  %arrayidx7 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %add6, i32 addrspace(1)* %arrayidx7, align 4
+  br label %sw.bb8
+
+sw.bb8:                                           ; preds = %sw.bb, %if.then4 -- CFG node f: out[tid] += 3 * n - 1, then the second switch's branch
+  %mul9 = mul nsw i32 %n, 3
+  %sub = add nsw i32 %mul9, -1
+  %arrayidx10 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  %1 = load i32, i32 addrspace(1)* %arrayidx10, align 4
+  %add11 = add nsw i32 %sub, %1
+  store i32 %add11, i32 addrspace(1)* %arrayidx10, align 4
+  %2 = and i64 %call1, 1
+  %trunc2 = icmp ne i64 %2, 0
+  %trunc2.off = add i1 %trunc2, true ; adding true to an i1 flips it
+  %switch = icmp ult i1 %trunc2.off, true ; true iff %trunc2.off is false, i.e. when size is odd
+  br i1 %switch, label %sw.bb12, label %sw.bb14
+
+sw.bb12:                                          ; preds = %sw.bb8 -- CFG node g: out[tid] /= n
+  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  %3 = load i32, i32 addrspace(1)* %arrayidx13, align 4
+  %4 = icmp eq i32 %3, -2147483648
+  %5 = icmp eq i32 %n, -1
+  %6 = and i1 %5, %4
+  %7 = icmp eq i32 %n, 0
+  %8 = or i1 %7, %6
+  %9 = select i1 %8, i32 1, i32 %n ; divisor becomes 1 when n == 0 or for INT_MIN / -1, so the sdiv below cannot trap
+  %div = sdiv i32 %3, %9
+  store i32 %div, i32 addrspace(1)* %arrayidx13, align 4
+  br label %sw.bb14
+
+sw.bb14:                                          ; preds = %sw.bb12, %sw.bb8 -- CFG node h: out[tid]--
+  %arrayidx15 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  %10 = load i32, i32 addrspace(1)* %arrayidx15, align 4
+  %dec = add nsw i32 %10, -1
+  store i32 %dec, i32 addrspace(1)* %arrayidx15, align 4
+  br label %if.end17
+
+if.end17:                                         ; preds = %sw.bb14, %if.else, %if.then -- CFG node i: common exit
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+; Function Attrs: nounwind readonly
+declare spir_func i64 @_Z15get_global_sizej(i32) #1
+
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nobuiltin nounwind readonly }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.kernels = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization13, !3, !4, !5, !6, !7, !8}
+!3 = !{!"kernel_arg_addr_space", i32 1, i32 0}
+!4 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!5 = !{!"kernel_arg_type", !"int*", !"int"}
+!6 = !{!"kernel_arg_base_type", !"int*", !"int"}
+!7 = !{!"kernel_arg_type_qual", !"", !""}
+!8 = !{!"kernel_arg_name", !"out", !"n"}
+
+; CHECK: spir_kernel void @__vecz_v4_partial_linearization13
+; CHECK: br label %[[IFELSE:.+]]
+
+; CHECK: [[IFTHEN:.+]]:
+; CHECK: br label %[[IFEND17:.+]]
+
+; CHECK: [[IFELSE]]:
+; CHECK: br label %[[IFTHEN4:.+]]
+
+; CHECK: [[IFTHEN4]]:
+; CHECK: %[[TMP:.+]] = and i64 %call1, 1
+; CHECK: %[[TRUNC:.+]] = icmp eq i64 %[[TMP]], 0
+; CHECK: br i1 %[[TRUNC]], label %[[SWBB8:.+]], label %[[SWBB:.+]]
+
+; CHECK: [[SWBB]]:
+; CHECK: br label %[[SWBB8]]
+
+; CHECK: [[SWBB8]]:
+; CHECK: %[[TMP2:.+]] = and i64 %call1, 1
+; CHECK: %[[TRUNC2:.+]] = icmp eq i64 %[[TMP2]], 0
+; CHECK: br i1 %[[TRUNC2]], label %[[SWBB14:.+]], label %[[SWBB12:.+]]
+
+; CHECK: [[SWBB12]]:
+; CHECK: br label %[[SWBB14]]
+
+; CHECK: [[SWBB14]]:
+; CHECK: br label %[[IFTHEN]]
+
+; CHECK: [[IFEND17]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization14.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization14.ll
new file mode 100644
index 0000000000000..8f45ed6d60907
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization14.ll
@@ -0,0 +1,292 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k partial_linearization14 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | %filecheck %s
+
+; The CFG of the following kernel is:
+;
+;     a
+;    / \
+;   b   c <-.
+;   |  / \  |
+;   | d   e |
+;   |/ \ /  |
+;   f   g --'
+;    \  |
+;     \ h
+;      \|
+;       i
+;
+; * where nodes a, d and g are uniform branches, and node c is a varying
+;   branch.
+; * where nodes d, e, f, g, h and i are divergent.
+;
+; With partial linearization, it can be transformed in the following way:
+;
+;     a
+;    / \
+;   b   c <.
+;   |   |  |
+;   |   e  |
+;   |   |  |
+;   |   d  |
+;   |   |  |
+;   |   g -'
+;    \  |
+;     \ h
+;      \|
+;       f
+;       |
+;       i
+;
+; __kernel void partial_linearization14(__global int *out, int n) {
+;   int id = get_global_id(0);
+;   int ret = 0;
+;   int i = 0;
+;
+;   if (n < 5) {
+;     for (int i = 0; i < n + 10; i++) ret++;
+;     goto f;
+;   } else {
+;     while (1) {
+;       if (id + i % 2 == 0) {
+;         if (n > 2) {
+;           goto f;
+;         }
+;       } else {
+;         for (int i = 0; i < n + 10; i++) ret++;
+;       }
+;       if (n <= 2) break;
+;     }
+;   }
+;
+;   ret += n * 2;
+;   for (int i = 0; i < n * 2; i++) ret -= i;
+;   ret /= n;
+;   goto early;
+;
+; f:
+;   for (int i = 0; i < n + 5; i++) ret /= 2;
+;   ret -= n;
+;
+; early:
+;   out[id] = ret;
+; }
+
+; ModuleID = 'Unknown buffer'
+source_filename = "kernel.opencl"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @partial_linearization14(i32 addrspace(1)* %out, i32 %n) #0 { ; scalar input kernel; node letters in block comments are inferred by matching edges against the diagram above -- TODO confirm
+entry:                                            ; CFG node a: branches on n < 5
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %conv = trunc i64 %call to i32
+  %cmp = icmp slt i32 %n, 5
+  br i1 %cmp, label %for.cond, label %while.body
+
+for.cond:                                         ; preds = %for.body, %entry -- CFG node b (loop header), leaves to f
+  %ret.0 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %storemerge4 = phi i32 [ %inc5, %for.body ], [ 0, %entry ]
+  %add = add nsw i32 %n, 10
+  %cmp3 = icmp slt i32 %storemerge4, %add
+  br i1 %cmp3, label %for.body, label %f
+
+for.body:                                         ; preds = %for.cond -- CFG node b (loop body)
+  %inc = add nuw nsw i32 %ret.0, 1
+  %inc5 = add nuw nsw i32 %storemerge4, 1
+  br label %for.cond
+
+while.body:                                       ; preds = %if.end24, %entry -- CFG node c: branches on id == 0
+  %ret.1 = phi i32 [ 0, %entry ], [ %ret.3, %if.end24 ]
+  %cmp7 = icmp eq i32 %conv, 0
+  br i1 %cmp7, label %if.then9, label %for.cond15
+
+if.then9:                                         ; preds = %while.body -- CFG node d: branches on n > 2 (to f or to g)
+  %cmp10 = icmp sgt i32 %n, 2
+  br i1 %cmp10, label %f, label %if.end24
+
+for.cond15:                                       ; preds = %for.body19, %while.body -- CFG node e (loop header)
+  %ret.2 = phi i32 [ %inc20, %for.body19 ], [ %ret.1, %while.body ]
+  %storemerge = phi i32 [ %inc22, %for.body19 ], [ 0, %while.body ]
+  %add16 = add nsw i32 %n, 10
+  %cmp17 = icmp slt i32 %storemerge, %add16
+  br i1 %cmp17, label %for.body19, label %if.end24
+
+for.body19:                                       ; preds = %for.cond15 -- CFG node e (loop body)
+  %inc20 = add nsw i32 %ret.2, 1
+  %inc22 = add nuw nsw i32 %storemerge, 1
+  br label %for.cond15
+
+if.end24:                                         ; preds = %for.cond15, %if.then9 -- CFG node g: leaves the while loop when n < 3, else back to c
+  %ret.3 = phi i32 [ %ret.1, %if.then9 ], [ %ret.2, %for.cond15 ]
+  %cmp25 = icmp slt i32 %n, 3
+  br i1 %cmp25, label %if.end29, label %while.body
+
+if.end29:                                         ; preds = %if.end24 -- CFG node h (first block): ret += n * 2
+  %mul = mul i32 %n, 2
+  %add30 = add nsw i32 %ret.3, %mul
+  br label %for.cond32
+
+for.cond32:                                       ; preds = %for.body36, %if.end29 -- CFG node h (loop header)
+  %ret.4 = phi i32 [ %add30, %if.end29 ], [ %sub, %for.body36 ]
+  %storemerge1 = phi i32 [ 0, %if.end29 ], [ %inc38, %for.body36 ]
+  %mul33 = shl nsw i32 %n, 1
+  %cmp34 = icmp slt i32 %storemerge1, %mul33
+  br i1 %cmp34, label %for.body36, label %for.end39
+
+for.body36:                                       ; preds = %for.cond32 -- CFG node h (loop body)
+  %sub = sub nsw i32 %ret.4, %storemerge1
+  %inc38 = add nuw nsw i32 %storemerge1, 1
+  br label %for.cond32
+
+for.end39:                                        ; preds = %for.cond32 -- CFG node h (last block): ret /= n
+  %0 = icmp eq i32 %ret.4, -2147483648
+  %1 = icmp eq i32 %n, -1
+  %2 = and i1 %1, %0
+  %3 = icmp eq i32 %n, 0
+  %4 = or i1 %3, %2
+  %5 = select i1 %4, i32 1, i32 %n ; divisor becomes 1 when n == 0 or for INT_MIN / -1, so the sdiv below cannot trap
+  %div = sdiv i32 %ret.4, %5
+  br label %early
+
+f:                                                ; preds = %if.then9, %for.cond -- CFG node f (first block)
+  %ret.5 = phi i32 [ %ret.0, %for.cond ], [ %ret.1, %if.then9 ]
+  br label %for.cond41
+
+for.cond41:                                       ; preds = %for.body45, %f -- CFG node f (loop header)
+  %ret.6 = phi i32 [ %ret.5, %f ], [ %div46, %for.body45 ]
+  %storemerge3 = phi i32 [ 0, %f ], [ %inc48, %for.body45 ]
+  %add42 = add nsw i32 %n, 5
+  %cmp43 = icmp slt i32 %storemerge3, %add42
+  br i1 %cmp43, label %for.body45, label %for.end49
+
+for.body45:                                       ; preds = %for.cond41 -- CFG node f (loop body)
+  %div46 = sdiv i32 %ret.6, 2
+  %inc48 = add nuw nsw i32 %storemerge3, 1
+  br label %for.cond41
+
+for.end49:                                        ; preds = %for.cond41 -- CFG node f (last block): ret -= n
+  %sub50 = sub nsw i32 %ret.6, %n
+  br label %early
+
+early:                                            ; preds = %for.end49, %for.end39 -- CFG node i: out[id] = ret
+  %storemerge2 = phi i32 [ %div, %for.end39 ], [ %sub50, %for.end49 ]
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %storemerge2, i32 addrspace(1)* %arrayidx, align 4
+  ret void
+}
+
+; Function Attrs: convergent nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { convergent nobuiltin nounwind readonly }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.kernels = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization14, !3, !4, !5, !6, !7, !8}
+!3 = !{!"kernel_arg_addr_space", i32 1, i32 0}
+!4 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!5 = !{!"kernel_arg_type", !"int*", !"int"}
+!6 = !{!"kernel_arg_base_type", !"int*", !"int"}
+!7 = !{!"kernel_arg_type_qual", !"", !""}
+!8 = !{!"kernel_arg_name", !"out", !"n"}
+
+; CHECK: spir_kernel void @__vecz_v4_partial_linearization14
+; CHECK: %[[CMP:.+]] = icmp
+; CHECK: br i1 %[[CMP]], label %[[FORCONDPREHEADER:.+]], label %[[WHILEBODYPREHEADER:.+]]
+
+; CHECK: [[WHILEBODYPREHEADER]]:
+; CHECK: br label %[[WHILEBODY:.+]]
+
+; CHECK: [[FORCONDPREHEADER]]:
+; CHECK: br label %[[FORCOND:.+]]
+
+; CHECK: [[FORCOND]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[FLOOPEXIT:.+]]
+
+; CHECK: [[FORBODY]]:
+; CHECK: br label %[[FORCOND]]
+
+; CHECK: [[WHILEBODY]]:
+; CHECK: br label %[[FORCOND15PREHEADER:.+]]
+
+; CHECK: [[FORCOND15PREHEADER]]:
+; CHECK: br label %[[FORCOND15:.+]]
+
+; CHECK: [[IFTHEN9:.+]]:
+; CHECK: br label %[[IFEND24:.+]]
+
+; CHECK: [[FORCOND15]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY19:.+]], label %[[IFEND24LOOPEXIT:.+]]
+
+; CHECK: [[FORBODY19]]:
+; CHECK: br label %[[FORCOND15]]
+
+; CHECK: [[IFEND24LOOPEXIT]]:
+; CHECK: br label %[[IFTHEN9]]
+
+; CHECK: [[IFEND24]]:
+; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]]
+
+; CHECK: [[WHILEBODYPUREEXIT]]:
+; CHECK: br label %[[IFEND29:.+]]
+
+; CHECK: [[IFEND29]]:
+; CHECK: br label %[[FORCOND32:.+]]
+
+; CHECK: [[IFEND29ELSE:.+]]:
+; CHECK: br label %[[FLOOPEXIT2:.+]]
+
+; CHECK: [[FORCOND32]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY36:.+]], label %[[FOREND39:.+]]
+
+; CHECK: [[FORBODY36]]:
+; CHECK: br label %[[FORCOND32]]
+
+; CHECK: [[FOREND39]]:
+; CHECK: br label %[[IFEND29ELSE]]
+
+; CHECK: [[FLOOPEXIT]]:
+; CHECK: br label %[[F:.+]]
+
+; CHECK: [[FLOOPEXIT2]]:
+; CHECK: br label %[[F]]
+
+; CHECK: [[F]]:
+; CHECK: br label %[[FORCOND41:.+]]
+
+; CHECK: [[FORCOND41]]:
+; CHECK: %[[CMP43:.+]] = icmp
+; CHECK: br i1 %[[CMP43]], label %[[FORBODY45:.+]], label %[[FOREND49:.+]]
+
+; CHECK: [[FORBODY45]]:
+; CHECK: br label %[[FORCOND41]]
+
+; CHECK: [[FOREND49]]:
+; CHECK: br label %[[EARLY:.+]]
+
+; CHECK: [[EARLY]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization15.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization15.ll
new file mode 100644
index 0000000000000..c856b5afbc106
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization15.ll
@@ -0,0 +1,385 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k partial_linearization15 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -S < %s | %filecheck %s
+
+; The CFG of the following kernel is:
+;
+;           a
+;           |
+;           b <-----.
+;          / \      |
+;         c   d     |
+;        / \ /      |
+;       /   e       |
+;      /    |       |
+;     /     g <---. |
+;    /     / \    | |
+;   f     h   i   | |
+;   |    / \ / \  | |
+;   |   |   j   k | |
+;   |    \ / \ /  | |
+;   |     l   m --' |
+;   |    /          |
+;   |   o ----------'
+;   |   |
+;   n   p
+;    \ /
+;     q
+;
+; * where nodes b, c, g, h, j and o are uniform branches, and node i is a
+;   varying branch.
+; * where nodes j, k, m, l, and o are divergent.
+;
+; With partial linearization, it will be transformed as follows:
+;
+;       a
+;       |
+;       b <-----.
+;      / \      |
+;     c   d     |
+;    / \ /      |
+;   f   e       |
+;   |   |       |
+;   |   g <---. |
+;   |  / \    | |
+;   | h   i   | |
+;   | |   |   | |
+;   | |   k   | |
+;   |  \ /    | |
+;   |   j     | |
+;   |   |     | |
+;   |   m ----' |
+;   |   |       |
+;   |   l       |
+;   |   |       |
+;   |   o ------'
+;   |   |
+;   n   p
+;    \ /
+;     q
+;
+; __kernel void partial_linearization15(__global int *out, int n) {
+;   int id = get_global_id(0);
+;   int ret = 0;
+;
+;   while (1) {
+;     if (n > 0) { // b
+;       // c
+;       for (int i = 0; i < n * 2; i++) ret++;
+;       if (n <= 10) {
+;         // f
+;         goto f;
+;       }
+;     } else {
+;       // d
+;       for (int i = 0; i < n / 4; i++) ret++;
+;     }
+;     // e
+;     ret++;
+;     while (1) {
+;       if (n & 1) { // g
+;         // h
+;         if (n < 3) {
+;           goto l;
+;         }
+;       } else {
+;         // i
+;         if (ret + id >= n) {
+;           // k
+;           ret /= n * n + ret;
+;           goto m;
+;         }
+;       }
+;       // j
+;       if (n & 1) {
+;         goto l;
+;       }
+;       // m
+; m:
+;       ret++;
+;     }
+; l:
+;     ret *= 4;
+; o:
+;     if (n & 1) {
+;       // p
+;       ret++;
+;       goto p;
+;     }
+;   }
+;
+; p:
+;   for (int i = 0; i < n / 4; i++) ret++;
+;   goto q;
+;
+; f:
+;   ret /= n;
+;   goto n;
+;
+; n:
+;   for (int i = 0; i < n * 2; i++) ret++;
+;
+; q:
+;   out[id] = ret;
+; }
+
+; ModuleID = 'Unknown buffer'
+source_filename = "kernel.opencl"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @partial_linearization15(i32 addrspace(1)* %out, i32 %n) #0 { ; scalar input kernel; trailing block comments name the CFG node (a-q) from the diagram and OpenCL listing above
+entry:                                            ; CFG node a
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %conv = trunc i64 %call to i32
+  br label %while.body
+
+while.body:                                       ; preds = %l, %entry -- CFG node b (outer loop header): branches on n > 0
+  %ret.0 = phi i32 [ 0, %entry ], [ %mul40, %l ]
+  %cmp = icmp sgt i32 %n, 0
+  br i1 %cmp, label %for.cond, label %for.cond9
+
+for.cond:                                         ; preds = %for.body, %while.body -- CFG node c (loop header)
+  %ret.1 = phi i32 [ %inc, %for.body ], [ %ret.0, %while.body ]
+  %storemerge3 = phi i32 [ %inc4, %for.body ], [ 0, %while.body ]
+  %mul = shl nsw i32 %n, 1
+  %cmp2 = icmp slt i32 %storemerge3, %mul
+  br i1 %cmp2, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond -- CFG node c (loop body)
+  %inc = add nsw i32 %ret.1, 1
+  %inc4 = add nuw nsw i32 %storemerge3, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond -- CFG node c (last block): goes to f when n < 11, i.e. n <= 10
+  %cmp5 = icmp slt i32 %n, 11
+  br i1 %cmp5, label %f, label %if.end17
+
+for.cond9:                                        ; preds = %for.body12, %while.body -- CFG node d (loop header)
+  %ret.2 = phi i32 [ %inc13, %for.body12 ], [ %ret.0, %while.body ]
+  %storemerge = phi i32 [ %inc15, %for.body12 ], [ 0, %while.body ]
+  %div = sdiv i32 %n, 4
+  %cmp10 = icmp slt i32 %storemerge, %div
+  br i1 %cmp10, label %for.body12, label %if.end17
+
+for.body12:                                       ; preds = %for.cond9 -- CFG node d (loop body)
+  %inc13 = add nsw i32 %ret.2, 1
+  %inc15 = add nuw nsw i32 %storemerge, 1
+  br label %for.cond9
+
+if.end17:                                         ; preds = %for.cond9, %for.end -- CFG node e
+  %ret.3 = phi i32 [ %ret.1, %for.end ], [ %ret.2, %for.cond9 ]
+  br label %while.body20
+
+while.body20:                                     ; preds = %m, %if.end17 -- CFG node g (inner loop header): branches on n & 1
+  %storemerge1.in = phi i32 [ %ret.3, %if.end17 ], [ %ret.4, %m ]
+  %storemerge1 = add nsw i32 %storemerge1.in, 1
+  %and = and i32 %n, 1
+  %tobool = icmp eq i32 %and, 0
+  br i1 %tobool, label %if.else26, label %if.then21
+
+if.then21:                                        ; preds = %while.body20 -- CFG node h: goes to l when n < 3
+  %cmp22 = icmp slt i32 %n, 3
+  br i1 %cmp22, label %l, label %if.end34
+
+if.else26:                                        ; preds = %while.body20 -- CFG node i: branches on ret + id < n, which depends on the work-item id
+  %add = add nsw i32 %storemerge1, %conv
+  %cmp27 = icmp slt i32 %add, %n
+  br i1 %cmp27, label %if.end34, label %if.then29
+
+if.then29:                                        ; preds = %if.else26 -- CFG node k: ret /= n * n + ret
+  %mul30 = mul nsw i32 %n, %n
+  %add31 = add nsw i32 %storemerge1, %mul30
+  %0 = icmp eq i32 %add31, 0
+  %1 = select i1 %0, i32 1, i32 %add31 ; divisor becomes 1 when it would be zero, so the sdiv below cannot divide by zero
+  %div32 = sdiv i32 %storemerge1, %1
+  br label %m
+
+if.end34:                                         ; preds = %if.else26, %if.then21 -- CFG node j: goes to l when n is odd, else to m
+  %and35 = and i32 %n, 1
+  %tobool36 = icmp eq i32 %and35, 0
+  br i1 %tobool36, label %m, label %l
+
+m:                                                ; preds = %if.end34, %if.then29 -- CFG node m (inner loop latch)
+  %ret.4 = phi i32 [ %div32, %if.then29 ], [ %storemerge1, %if.end34 ]
+  br label %while.body20
+
+l:                                                ; preds = %if.end34, %if.then21 -- CFG nodes l and o share this block: ret *= 4, then branch on n & 1
+  %mul40 = shl nsw i32 %storemerge1, 2
+  %and41 = and i32 %n, 1
+  %tobool42 = icmp eq i32 %and41, 0
+  br i1 %tobool42, label %while.body, label %if.then43
+
+if.then43:                                        ; preds = %l -- CFG node p (first block)
+  %inc44 = or i32 %mul40, 1 ; acts as ret++: the shl by 2 above left the low bit clear
+  br label %for.cond47
+
+for.cond47:                                       ; preds = %for.body51, %if.then43 -- CFG node p (loop header)
+  %ret.5 = phi i32 [ %inc44, %if.then43 ], [ %inc52, %for.body51 ]
+  %storemerge2 = phi i32 [ 0, %if.then43 ], [ %inc54, %for.body51 ]
+  %div48 = sdiv i32 %n, 4
+  %cmp49 = icmp slt i32 %storemerge2, %div48
+  br i1 %cmp49, label %for.body51, label %q
+
+for.body51:                                       ; preds = %for.cond47 -- CFG node p (loop body)
+  %inc52 = add nsw i32 %ret.5, 1
+  %inc54 = add nuw nsw i32 %storemerge2, 1
+  br label %for.cond47
+
+f:                                                ; preds = %for.end -- CFG node f: ret /= n
+  %2 = icmp eq i32 %ret.1, -2147483648
+  %3 = icmp eq i32 %n, -1
+  %4 = and i1 %3, %2
+  %5 = icmp eq i32 %n, 0
+  %6 = or i1 %5, %4
+  %7 = select i1 %6, i32 1, i32 %n ; divisor becomes 1 when n == 0 or for INT_MIN / -1, so the sdiv below cannot trap
+  %div56 = sdiv i32 %ret.1, %7
+  br label %for.cond59
+
+for.cond59:                                       ; preds = %for.body63, %f -- CFG node n (loop header)
+  %ret.6 = phi i32 [ %div56, %f ], [ %inc64, %for.body63 ]
+  %storemerge4 = phi i32 [ 0, %f ], [ %inc66, %for.body63 ]
+  %mul60 = shl nsw i32 %n, 1
+  %cmp61 = icmp slt i32 %storemerge4, %mul60
+  br i1 %cmp61, label %for.body63, label %q
+
+for.body63:                                       ; preds = %for.cond59 -- CFG node n (loop body)
+  %inc64 = add nsw i32 %ret.6, 1
+  %inc66 = add nuw nsw i32 %storemerge4, 1
+  br label %for.cond59
+
+q:                                                ; preds = %for.cond59, %for.cond47 -- CFG node q: out[id] = ret
+  %ret.7 = phi i32 [ %ret.5, %for.cond47 ], [ %ret.6, %for.cond59 ]
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %ret.7, i32 addrspace(1)* %arrayidx, align 4
+  ret void
+}
+
+; Function Attrs: convergent nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { convergent nobuiltin nounwind readonly }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.kernels = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization15, !3, !4, !5, !6, !7, !8}
+!3 = !{!"kernel_arg_addr_space", i32 1, i32 0}
+!4 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!5 = !{!"kernel_arg_type", !"int*", !"int"}
+!6 = !{!"kernel_arg_base_type", !"int*", !"int"}
+!7 = !{!"kernel_arg_type_qual", !"", !""}
+!8 = !{!"kernel_arg_name", !"out", !"n"}
+
+; CHECK: spir_kernel void @__vecz_v4_partial_linearization15
+; CHECK: br label %[[WHILEBODY:.+]]
+
+; CHECK: [[WHILEBODY]]:
+; CHECK: %[[CMP:.+]] = icmp
+; CHECK: br i1 %[[CMP]], label %[[FORCONDPREHEADER:.+]], label %[[FORCOND9PREHEADER:.+]]
+
+; CHECK: [[FORCOND9PREHEADER]]:
+; CHECK: br label %[[FORCOND9:.+]]
+
+; CHECK: [[FORCONDPREHEADER]]:
+; CHECK: br label %[[FORCOND:.+]]
+
+; CHECK: [[FORCOND]]:
+; CHECK: br i1 false, label %[[FORBODY:.+]], label %[[FOREND:.+]]
+
+; CHECK: [[FORBODY]]:
+; CHECK: br label %[[FORCOND]]
+
+; CHECK: [[FOREND]]:
+; CHECK: %[[CMP5:.+]] = icmp
+; CHECK: br i1 %[[CMP5]], label %[[F:.+]], label %[[IFEND17:.+]]
+
+; CHECK: [[FORCOND9]]:
+; CHECK: %[[CMP10:.+]] = icmp
+; CHECK: br i1 %[[CMP10]], label %[[FORBODY12:.+]], label %[[IFEND17LOOPEXIT:.+]]
+
+; CHECK: [[FORBODY12]]:
+; CHECK: br label %[[FORCOND9]]
+
+; CHECK: [[IFEND17LOOPEXIT]]:
+; CHECK: br label %[[IFEND17]]
+
+; CHECK: [[IFEND17]]:
+; CHECK: br label %[[WHILEBODY20:.+]]
+
+; CHECK: [[WHILEBODY20]]:
+; CHECK: %[[AND:.+]] = and i32
+; CHECK: %[[TOBOOL:.+]] = icmp eq i32 %[[AND]]
+; CHECK: br i1 %[[TOBOOL]], label %[[IFELSE26:.+]], label %[[IFTHEN21:.+]]
+
+; CHECK: [[IFTHEN21]]:
+; CHECK: br label %[[M:.+]]
+
+; CHECK: [[IFELSE26]]:
+; CHECK: br label %[[IFTHEN29:.+]]
+
+; CHECK: [[IFTHEN29]]:
+; CHECK: br label %[[IFEND34:.+]]
+
+; CHECK: [[IFEND34]]:
+; CHECK: br label %[[M:.+]]
+
+; CHECK: [[M]]:
+; CHECK: br i1 %{{.+}}, label %[[WHILEBODY20]], label %[[WHILEBODY20PUREEXIT:.+]]
+
+; CHECK: [[WHILEBODY20PUREEXIT]]:
+; CHECK: br label %[[L:.+]]
+
+; CHECK: [[L]]:
+; CHECK: %[[TOBOOL42:.+]] = icmp
+; CHECK: br i1 %[[TOBOOL42]], label %[[WHILEBODY]], label %[[IFTHEN43:.+]]
+
+; CHECK: [[IFTHEN43]]:
+; CHECK: br label %[[FORCOND47:.+]]
+
+; CHECK: [[FORCOND47]]:
+; CHECK: %[[CMP49:.+]] = icmp
+; CHECK: br i1 %[[CMP49]], label %[[FORBODY51:.+]], label %[[QLOOPEXIT2:.+]]
+
+; CHECK: [[FORBODY51]]:
+; CHECK: br label %[[FORCOND47]]
+
+; CHECK: [[F]]:
+; CHECK: br label %[[FORCOND59:.+]]
+
+; CHECK: [[FORCOND59]]:
+; CHECK: br i1 false, label %[[FORBODY63:.+]], label %[[QLOOPEXIT:.+]]
+
+; CHECK: [[FORBODY63]]:
+; CHECK: br label %[[FORCOND59]]
+
+; CHECK: [[QLOOPEXIT]]:
+; CHECK: br label %[[Q:.+]]
+
+; CHECK: [[QLOOPEXIT2]]:
+; CHECK: br label %[[Q]]
+
+; CHECK: [[Q]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization16.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization16.ll
new file mode 100644
index 0000000000000..6bbae842606b2
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization16.ll
@@ -0,0 +1,319 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k partial_linearization16 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | %filecheck %s
+
+; The CFG of the following kernel is:
+;
+;       a
+;      / \
+;     b   c <-.
+;    /   / \  |
+;   |   d   e |
+;   |  / \ /  |
+;   | f   g --'
+;   |/    |
+;   h     i
+;    \   /
+;     \ /
+;      j
+;
+; * where nodes a, d and g are uniform branches, and node c is a varying
+;   branch.
+; * where nodes d, e, f, g, i and j are divergent.
+;
+; With partial linearization, it can be transformed in the following way:
+;
+;     a
+;    / \
+;   b   c <.
+;   |   |  |
+;   |   e  |
+;   |   |  |
+;   |   d  |
+;   |   |  |
+;   |   g -'
+;   |   |
+;   |   i
+;    \  |
+;     \ f
+;      \|
+;       h
+;       |
+;       j
+;
+; __kernel void partial_linearization16(__global int *out, int n) {
+;   int id = get_global_id(0);
+;   int ret = 0;
+;   int i = 0;
+;
+;   if (n < 5) {
+;     for (int i = 0; i < n + 10; i++) ret++;
+;     goto h;
+;   } else {
+;     while (1) {
+;       if (id + i % 2 == 0) {
+;         if (n > 2) {
+;           goto f;
+;         }
+;       } else {
+;         for (int i = 0; i < n + 10; i++) ret++;
+;       }
+;       if (n <= 2) break;
+;     }
+;   }
+;
+;   ret += n * 2;
+;   for (int i = 0; i < n * 2; i++) ret -= i;
+;   ret /= n;
+;   goto early;
+;
+; f:
+;   for (int i = 0; i < n + 5; i++) ret /= 2;
+;   ret -= n;
+;
+; h:
+;   for (int i = 0; i < n * 2; i++) ret -= i;
+;
+; early:
+;   out[id] = ret;
+; }
+
+; ModuleID = 'Unknown buffer'
+source_filename = "kernel.opencl"  ; name recorded for the source this module was produced from
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"  ; "e" = little-endian, "m:e" = ELF mangling, "S128" = 128-bit stack alignment
+target triple = "spir64-unknown-unknown"  ; 64-bit SPIR target
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @partial_linearization16(i32 addrspace(1)* %out, i32 %n) #0 {  ; scalar input kernel; matches the OpenCL C listing above
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %conv = trunc i64 %call to i32  ; id = (int)get_global_id(0)
+  %cmp = icmp slt i32 %n, 5  ; n < 5
+  br i1 %cmp, label %for.cond, label %while.body  ; condition depends only on the argument %n
+
+for.cond:                                         ; preds = %for.body, %entry
+  %ret.0 = phi i32 [ %inc, %for.body ], [ 0, %entry ]  ; ret
+  %storemerge4 = phi i32 [ %inc5, %for.body ], [ 0, %entry ]  ; loop counter i
+  %add = add nsw i32 %n, 10
+  %cmp3 = icmp slt i32 %storemerge4, %add  ; i < n + 10
+  br i1 %cmp3, label %for.body, label %h  ; loop exit is the listing's "goto h"
+
+for.body:                                         ; preds = %for.cond
+  %inc = add nuw nsw i32 %ret.0, 1  ; ret++
+  %inc5 = add nuw nsw i32 %storemerge4, 1  ; i++
+  br label %for.cond
+
+while.body:                                       ; preds = %if.end24, %entry
+  %ret.1 = phi i32 [ 0, %entry ], [ %ret.3, %if.end24 ]  ; ret carried around the while(1) loop
+  %cmp7 = icmp eq i32 %conv, 0  ; id + i % 2 == 0, where the outer i stays 0
+  br i1 %cmp7, label %if.then9, label %for.cond15  ; condition depends on the work-item id
+
+if.then9:                                         ; preds = %while.body
+  %cmp10 = icmp sgt i32 %n, 2  ; n > 2
+  br i1 %cmp10, label %for.cond41, label %if.end24  ; true edge is the listing's "goto f"
+
+for.cond15:                                       ; preds = %for.body19, %while.body
+  %ret.2 = phi i32 [ %inc20, %for.body19 ], [ %ret.1, %while.body ]
+  %storemerge = phi i32 [ %inc22, %for.body19 ], [ 0, %while.body ]
+  %add16 = add nsw i32 %n, 10
+  %cmp17 = icmp slt i32 %storemerge, %add16  ; i < n + 10 (loop in the else branch)
+  br i1 %cmp17, label %for.body19, label %if.end24
+
+for.body19:                                       ; preds = %for.cond15
+  %inc20 = add nsw i32 %ret.2, 1  ; ret++
+  %inc22 = add nuw nsw i32 %storemerge, 1  ; i++
+  br label %for.cond15
+
+if.end24:                                         ; preds = %for.cond15, %if.then9
+  %ret.3 = phi i32 [ %ret.1, %if.then9 ], [ %ret.2, %for.cond15 ]
+  %cmp25 = icmp slt i32 %n, 3  ; n <= 2
+  br i1 %cmp25, label %if.end29, label %while.body  ; true edge is the "break"; false edge is the while(1) back edge
+
+if.end29:                                         ; preds = %if.end24
+  %mul = mul i32 %n, 2
+  %add30 = add nsw i32 %ret.3, %mul  ; ret += n * 2
+  br label %for.cond32
+
+for.cond32:                                       ; preds = %for.body36, %if.end29
+  %ret.4 = phi i32 [ %add30, %if.end29 ], [ %sub, %for.body36 ]
+  %storemerge1 = phi i32 [ 0, %if.end29 ], [ %inc38, %for.body36 ]
+  %mul33 = shl nsw i32 %n, 1
+  %cmp34 = icmp slt i32 %storemerge1, %mul33  ; i < n * 2
+  br i1 %cmp34, label %for.body36, label %for.end39
+
+for.body36:                                       ; preds = %for.cond32
+  %sub = sub nsw i32 %ret.4, %storemerge1  ; ret -= i
+  %inc38 = add nuw nsw i32 %storemerge1, 1  ; i++
+  br label %for.cond32
+
+for.end39:                                        ; preds = %for.cond32
+  %0 = icmp eq i32 %ret.4, -2147483648
+  %1 = icmp eq i32 %n, -1
+  %2 = and i1 %1, %0  ; n == -1 and ret == -2147483648
+  %3 = icmp eq i32 %n, 0
+  %4 = or i1 %3, %2
+  %5 = select i1 %4, i32 1, i32 %n  ; divisor: 1 when n == 0 or in the case flagged by %2, otherwise n
+  %div = sdiv i32 %ret.4, %5  ; ret /= n, using the guarded divisor
+  br label %early  ; "goto early"
+
+for.cond41:                                       ; preds = %for.body45, %if.then9
+  %ret.5 = phi i32 [ %div46, %for.body45 ], [ %ret.1, %if.then9 ]
+  %storemerge2 = phi i32 [ %inc48, %for.body45 ], [ 0, %if.then9 ]
+  %add42 = add nsw i32 %n, 5
+  %cmp43 = icmp slt i32 %storemerge2, %add42  ; i < n + 5 (loop under label f in the listing)
+  br i1 %cmp43, label %for.body45, label %for.end49
+
+for.body45:                                       ; preds = %for.cond41
+  %div46 = sdiv i32 %ret.5, 2  ; ret /= 2
+  %inc48 = add nuw nsw i32 %storemerge2, 1  ; i++
+  br label %for.cond41
+
+for.end49:                                        ; preds = %for.cond41
+  %sub50 = sub nsw i32 %ret.5, %n  ; ret -= n
+  br label %h  ; falls through to label h in the listing
+
+h:                                                ; preds = %for.end49, %for.cond
+  %ret.6 = phi i32 [ %sub50, %for.end49 ], [ %ret.0, %for.cond ]
+  br label %for.cond52
+
+for.cond52:                                       ; preds = %for.body56, %h
+  %ret.7 = phi i32 [ %ret.6, %h ], [ %sub57, %for.body56 ]
+  %storemerge3 = phi i32 [ 0, %h ], [ %inc59, %for.body56 ]
+  %mul53 = shl nsw i32 %n, 1
+  %cmp54 = icmp slt i32 %storemerge3, %mul53  ; i < n * 2 (loop under label h in the listing)
+  br i1 %cmp54, label %for.body56, label %early
+
+for.body56:                                       ; preds = %for.cond52
+  %sub57 = sub nsw i32 %ret.7, %storemerge3  ; ret -= i
+  %inc59 = add nuw nsw i32 %storemerge3, 1  ; i++
+  br label %for.cond52
+
+early:                                            ; preds = %for.cond52, %for.end39
+  %ret.8 = phi i32 [ %div, %for.end39 ], [ %ret.7, %for.cond52 ]
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %ret.8, i32 addrspace(1)* %arrayidx, align 4  ; out[id] = ret
+  ret void
+}
+
+; Function Attrs: convergent nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1  ; get_global_id builtin (mangled name); the kernel calls it with i32 0
+
+attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }  ; used by the kernel definition
+attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }  ; used by the get_global_id declaration
+attributes #2 = { convergent nobuiltin nounwind readonly }  ; used at the get_global_id call site
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.kernels = !{!2}  ; one entry: the kernel under test
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}  ; shared by the ocl.version and spir.version nodes above
+!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization16, !3, !4, !5, !6, !7, !8}  ; kernel function followed by its argument-info nodes
+!3 = !{!"kernel_arg_addr_space", i32 1, i32 0}  ; one value per argument, in order (out, n)
+!4 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!5 = !{!"kernel_arg_type", !"int*", !"int"}
+!6 = !{!"kernel_arg_base_type", !"int*", !"int"}
+!7 = !{!"kernel_arg_type_qual", !"", !""}
+!8 = !{!"kernel_arg_name", !"out", !"n"}
+
+; CHECK: spir_kernel void @__vecz_v4_partial_linearization16
+; CHECK: %[[CMP:.+]] = icmp
+; CHECK: br i1 %[[CMP]], label %[[FORCONDPREHEADER:.+]], label %[[WHILEBODYPREHEADER:.+]]
+
+; CHECK: [[WHILEBODYPREHEADER]]:
+; CHECK: br label %[[WHILEBODY:.+]]
+
+; CHECK: [[FORCONDPREHEADER]]:
+; CHECK: br label %[[FORCOND:.+]]
+
+; CHECK: [[FORCOND]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[HLOOPEXIT:.+]]
+
+; CHECK: [[FORBODY]]:
+; CHECK: br label %[[FORCOND]]
+
+; CHECK: [[WHILEBODY]]:
+; CHECK: br label %[[FORCOND15PREHEADER:.+]]
+
+; CHECK: [[FORCOND15PREHEADER]]:
+; CHECK: br label %[[FORCOND15:.+]]
+
+; CHECK: [[IFTHEN9:.+]]:
+; CHECK: br label %[[IFEND24:.+]]
+
+; CHECK: [[FORCOND41PREHEADER:.+]]:
+; CHECK: br label %[[FORCOND41:.+]]
+
+; CHECK: [[FORCOND15]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY19:.+]], label %[[IFEND24LOOPEXIT:.+]]
+
+; CHECK: [[FORBODY19]]:
+; CHECK: br label %[[FORCOND15]]
+
+; CHECK: [[IFEND24LOOPEXIT]]:
+; CHECK: br label %[[IFTHEN9]]
+
+; CHECK: [[IFEND24]]:
+; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]]
+
+; CHECK: [[WHILEBODYPUREEXIT]]:
+; CHECK: br label %[[IFEND29:.+]]
+
+; CHECK: [[IFEND29]]:
+; CHECK: br label %[[FORCOND32:.+]]
+
+; CHECK: [[IFEND29ELSE:.+]]:
+; CHECK: br label %[[FORCOND41PREHEADER]]
+
+; CHECK: [[FORCOND32]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY36:.+]], label %[[FOREND39:.+]]
+
+; CHECK: [[FORBODY36]]:
+; CHECK: br label %[[FORCOND32]]
+
+; CHECK: [[FOREND39]]:
+; CHECK: br label %[[IFEND29ELSE]]
+
+; CHECK: [[FORCOND41]]:
+; CHECK: %[[CMP43:.+]] = icmp
+; CHECK: br i1 %[[CMP43]], label %[[FORBODY45:.+]], label %[[FOREND49:.+]]
+
+; CHECK: [[FORBODY45]]:
+; CHECK: br label %[[FORCOND41]]
+
+; CHECK: [[FOREND49]]:
+; CHECK: br label %[[H:.+]]
+
+; CHECK: [[HLOOPEXIT]]:
+; CHECK: br label %[[H]]
+
+; CHECK: [[H]]:
+; CHECK: br label %[[FORCOND52:.+]]
+
+; CHECK: [[FORCOND52]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY56:.+]], label %[[EARLYLOOPEXIT:.+]]
+
+; CHECK: [[FORBODY56]]:
+; CHECK: br label %[[FORCOND52]]
+
+; CHECK: [[EARLYLOOPEXIT]]:
+; CHECK: br label %[[EARLY:.+]]
+
+; CHECK: [[EARLY]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization17.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization17.ll
new file mode 100644
index 0000000000000..d87b91b19c9d4
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization17.ll
@@ -0,0 +1,376 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k partial_linearization17 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -S < %s | %filecheck %s
+
+; The CFG of the following kernel is:
+;
+;              a
+;              |
+;              b <----.
+;             / \     |
+;            c   d    |
+;           /   / \   |
+;          e   f   g -'
+;         / \  |   |
+;   .--> h   | i   j
+;   |   / \  |  \ /
+;   '- k   l '-> m
+;      |    \   /
+;      n     \ /
+;       \     o
+;        \   /
+;         \ /
+;          p
+;
+; * where nodes b, d, and h are uniform branches, and nodes e and g are varying
+;   branches.
+; * where nodes h, j, m, o, and p are divergent.
+;
+; With partial linearization, it can be transformed in the following way:
+;
+;              a
+;              |
+;              b <----.
+;             / \     |
+;            c   d    |
+;           /   / \   |
+;          e   f   g -'
+;         /    |   |
+;   .--> h     i   |
+;   |   / \    |   |
+;   '- k   l   |   |
+;       \   \  |  /
+;        n   \ | /
+;         \   \|/
+;          `-> j
+;              |
+;              m
+;              |
+;              o
+;              |
+;              p
+;
+; __kernel void partial_linearization17(__global int *out, int n) {
+;   int id = get_global_id(0);
+;   int ret = 0;
+;   int i = 0;
+;
+;   while (1) {
+;     if (n > 10) {
+;       goto c;
+;     } else if (n > 5) {
+;       goto f;
+;     }
+;     if (id + i++ % 2 == 0) {
+;       break;
+;     }
+;   }
+;
+;   // j
+;   for (int i = 0; i < n + 10; i++) ret++;
+;   goto m;
+;
+; f:
+;   ret += n * 2;
+;   for (int i = 0; i < n * 2; i++) ret += i;
+;   goto m;
+;
+; c:
+;   for (int i = 0; i < n + 5; i++) ret += 2;
+;   // e
+;   if (id % 2 == 0) {
+;     goto h;
+;   } else {
+;     goto m;
+;   }
+;
+; m:
+;   ret <<= 2;
+;   goto o;
+;
+; h:
+;   for (int i = 0; i < n * 2; i++) {
+;     if (n > 5) {
+;       goto l;
+;     }
+;   }
+;   // n
+;   ret += id << 3;
+;   goto p;
+;
+; l:
+;   ret += id << 3;
+;
+; o:
+;   for (int i = 0; i < n * 2; i++) ret += i;
+;
+; p:
+;   out[id] = ret;
+; }
+
+; ModuleID = 'Unknown buffer'
+source_filename = "kernel.opencl"  ; name recorded for the source this module was produced from
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"  ; "e" = little-endian, "m:e" = ELF mangling, "S128" = 128-bit stack alignment
+target triple = "spir64-unknown-unknown"  ; 64-bit SPIR target
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @partial_linearization17(i32 addrspace(1)* %out, i32 %n) #0 {  ; scalar input kernel; matches the OpenCL C listing above
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %conv = trunc i64 %call to i32  ; id = (int)get_global_id(0)
+  br label %while.body
+
+while.body:                                       ; preds = %if.end5, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %if.end5 ]  ; outer i
+  %cmp = icmp sgt i32 %n, 10  ; n > 10
+  br i1 %cmp, label %for.cond28, label %if.else  ; depends only on %n; true edge is "goto c"
+
+if.else:                                          ; preds = %while.body
+  %cmp2 = icmp sgt i32 %n, 5  ; n > 5
+  br i1 %cmp2, label %f, label %if.end5  ; depends only on %n; true edge is "goto f"
+
+if.end5:                                          ; preds = %if.else
+  %inc = add nuw nsw i32 %i.0, 1  ; i++
+  %rem = and i32 %i.0, 1  ; low bit of i, standing in for i % 2
+  %add = sub nsw i32 0, %rem
+  %cmp6 = icmp eq i32 %conv, %add  ; id == -(i & 1), i.e. id + i % 2 == 0
+  br i1 %cmp6, label %for.cond, label %while.body  ; depends on the work-item id; true edge is the "break"
+
+for.cond:                                         ; preds = %for.body, %if.end5
+  %ret.0 = phi i32 [ %inc14, %for.body ], [ 0, %if.end5 ]
+  %storemerge = phi i32 [ %inc15, %for.body ], [ 0, %if.end5 ]
+  %add11 = add nsw i32 %n, 10
+  %cmp12 = icmp slt i32 %storemerge, %add11  ; i < n + 10 (the "// j" loop in the listing)
+  br i1 %cmp12, label %for.body, label %m  ; loop exit is "goto m"
+
+for.body:                                         ; preds = %for.cond
+  %inc14 = add nuw nsw i32 %ret.0, 1  ; ret++
+  %inc15 = add nuw nsw i32 %storemerge, 1  ; i++
+  br label %for.cond
+
+f:                                                ; preds = %if.else
+  %mul = shl i32 %n, 1  ; ret += n * 2, with ret still 0 on this path
+  br label %for.cond18
+
+for.cond18:                                       ; preds = %for.body22, %f
+  %ret.1 = phi i32 [ %mul, %f ], [ %add23, %for.body22 ]
+  %storemerge3 = phi i32 [ 0, %f ], [ %inc25, %for.body22 ]
+  %mul19 = shl nsw i32 %n, 1
+  %cmp20 = icmp slt i32 %storemerge3, %mul19  ; i < n * 2
+  br i1 %cmp20, label %for.body22, label %m  ; loop exit is "goto m"
+
+for.body22:                                       ; preds = %for.cond18
+  %add23 = add nsw i32 %storemerge3, %ret.1  ; ret += i
+  %inc25 = add nuw nsw i32 %storemerge3, 1  ; i++
+  br label %for.cond18
+
+for.cond28:                                       ; preds = %for.body32, %while.body
+  %ret.2 = phi i32 [ %add33, %for.body32 ], [ 0, %while.body ]
+  %storemerge4 = phi i32 [ %inc35, %for.body32 ], [ 0, %while.body ]
+  %add29 = add nsw i32 %n, 5
+  %cmp30 = icmp slt i32 %storemerge4, %add29  ; i < n + 5 (loop under label c in the listing)
+  br i1 %cmp30, label %for.body32, label %for.end36
+
+for.body32:                                       ; preds = %for.cond28
+  %add33 = add nuw nsw i32 %ret.2, 2  ; ret += 2
+  %inc35 = add nuw nsw i32 %storemerge4, 1  ; i++
+  br label %for.cond28
+
+for.end36:                                        ; preds = %for.cond28
+  %rem375 = and i32 %conv, 1
+  %cmp38 = icmp eq i32 %rem375, 0  ; id % 2 == 0 (the "// e" test in the listing)
+  br i1 %cmp38, label %for.cond43, label %m  ; depends on the work-item id; "goto h" or "goto m"
+
+m:                                                ; preds = %for.end36, %for.cond18, %for.cond
+  %ret.3 = phi i32 [ %ret.0, %for.cond ], [ %ret.1, %for.cond18 ], [ %ret.2, %for.end36 ]
+  %shl = shl i32 %ret.3, 2  ; ret <<= 2
+  br label %o  ; "goto o"
+
+for.cond43:                                       ; preds = %for.inc52, %for.end36
+  %storemerge6 = phi i32 [ %inc53, %for.inc52 ], [ 0, %for.end36 ]
+  %mul44 = shl nsw i32 %n, 1
+  %cmp45 = icmp slt i32 %storemerge6, %mul44  ; i < n * 2 (loop under label h in the listing)
+  br i1 %cmp45, label %for.body47, label %for.end54
+
+for.body47:                                       ; preds = %for.cond43
+  %cmp48 = icmp sgt i32 %n, 5  ; n > 5
+  br i1 %cmp48, label %l, label %for.inc52  ; true edge is "goto l"
+
+for.inc52:                                        ; preds = %for.body47
+  %inc53 = add nuw nsw i32 %storemerge6, 1  ; i++
+  br label %for.cond43
+
+for.end54:                                        ; preds = %for.cond43
+  %shl55 = mul i32 %conv, 8  ; id << 3, written as a multiply by 8
+  %add56 = add nsw i32 %ret.2, %shl55  ; ret += id << 3 (the "// n" statement in the listing)
+  br label %p  ; "goto p"
+
+l:                                                ; preds = %for.body47
+  %shl57 = mul i32 %conv, 8  ; id << 3, written as a multiply by 8
+  %add58 = add nsw i32 %ret.2, %shl57  ; ret += id << 3
+  br label %o
+
+o:                                                ; preds = %l, %m
+  %storemerge1 = phi i32 [ %shl, %m ], [ %add58, %l ]  ; ret on entry to label o
+  br label %for.cond60
+
+for.cond60:                                       ; preds = %for.body64, %o
+  %ret.4 = phi i32 [ %storemerge1, %o ], [ %add65, %for.body64 ]
+  %storemerge2 = phi i32 [ 0, %o ], [ %inc67, %for.body64 ]
+  %mul61 = shl nsw i32 %n, 1
+  %cmp62 = icmp slt i32 %storemerge2, %mul61  ; i < n * 2
+  br i1 %cmp62, label %for.body64, label %p
+
+for.body64:                                       ; preds = %for.cond60
+  %add65 = add nsw i32 %storemerge2, %ret.4  ; ret += i
+  %inc67 = add nuw nsw i32 %storemerge2, 1  ; i++
+  br label %for.cond60
+
+p:                                                ; preds = %for.cond60, %for.end54
+  %ret.5 = phi i32 [ %add56, %for.end54 ], [ %ret.4, %for.cond60 ]
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %ret.5, i32 addrspace(1)* %arrayidx, align 4  ; out[id] = ret
+  ret void
+}
+
+; Function Attrs: convergent nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1  ; get_global_id builtin (mangled name); the kernel calls it with i32 0
+
+attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }  ; used by the kernel definition
+attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }  ; used by the get_global_id declaration
+attributes #2 = { convergent nobuiltin nounwind readonly }  ; used at the get_global_id call site
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.kernels = !{!2}  ; one entry: the kernel under test
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}  ; shared by the ocl.version and spir.version nodes above
+!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization17, !3, !4, !5, !6, !7, !8}  ; kernel function followed by its argument-info nodes
+!3 = !{!"kernel_arg_addr_space", i32 1, i32 0}  ; one value per argument, in order (out, n)
+!4 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!5 = !{!"kernel_arg_type", !"int*", !"int"}
+!6 = !{!"kernel_arg_base_type", !"int*", !"int"}
+!7 = !{!"kernel_arg_type_qual", !"", !""}
+!8 = !{!"kernel_arg_name", !"out", !"n"}
+
+; CHECK: spir_kernel void @__vecz_v4_partial_linearization17
+; CHECK: br label %[[WHILEBODY:.+]]
+
+; CHECK: [[WHILEBODY]]:
+; CHECK: %[[CMP:.+]] = icmp
+; CHECK: br i1 %[[CMP]], label %[[FORCOND28PREHEADER:.+]], label %[[IFELSE:.+]]
+
+; CHECK: [[FORCOND28PREHEADER]]:
+; CHECK: br label %[[WHILEBODYPUREEXIT:.+]]
+
+; CHECK: [[FORCOND28PREHEADERELSE:.+]]:
+; CHECK: br label %[[M:.+]]
+
+; CHECK: [[FORCOND28PREHEADERSPLIT:.+]]:
+; CHECK: br label %[[FORCOND28:.+]]
+
+; CHECK: [[IFELSE]]:
+; CHECK: %[[CMP2:.+]] = icmp
+; CHECK: br i1 %[[CMP2]], label %[[F:.+]], label %[[IFEND5:.+]]
+
+; CHECK: [[IFEND5]]:
+; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT]]
+
+; CHECK: [[WHILEBODYPUREEXIT]]:
+; CHECK: br label %[[FORCONDPREHEADER:.+]]
+
+; CHECK: [[FORCONDPREHEADER]]:
+; CHECK: br label %[[FORCOND:.+]]
+
+; CHECK: [[FORCONDPREHEADERELSE:.+]]:
+; CHECK: br i1 %{{.+}}, label %[[FELSE:.+]], label %[[FSPLIT:.+]]
+
+; CHECK: [[FORCOND]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[MLOOPEXIT2:.+]]
+
+; CHECK: [[FORBODY]]:
+; CHECK: br label %[[FORCOND]]
+
+; CHECK: [[F]]:
+; CHECK: br label %[[WHILEBODYPUREEXIT]]
+
+; CHECK: [[FELSE]]:
+; CHECK: br i1 %{{.+}}, label %[[FORCOND28PREHEADERELSE]], label %[[FORCOND28PREHEADERSPLIT]]
+
+; CHECK: [[FSPLIT]]:
+; CHECK: br label %[[FORCOND18:.+]]
+
+; CHECK: [[FORCOND18]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY22:.+]], label %[[MLOOPEXIT:.+]]
+
+; CHECK: [[FORBODY22]]:
+; CHECK: br label %[[FORCOND18]]
+
+; CHECK: [[FORCOND28]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY32:.+]], label %[[FOREND36:.+]]
+
+; CHECK: [[FORBODY32]]:
+; CHECK: br label %[[FORCOND28]]
+
+; CHECK: [[FOREND36]]:
+; CHECK: br label %[[FORCOND43PREHEADER:.+]]
+
+; CHECK: [[FORCOND43PREHEADER]]:
+; CHECK: %[[CMP14:.+]] = icmp
+; CHECK: br label %[[FORCOND43:.+]]
+
+; CHECK: [[MLOOPEXIT]]:
+; CHECK: br label %[[M]]
+
+; CHECK: [[MLOOPEXIT2]]:
+; CHECK: br label %[[FORCONDPREHEADERELSE]]
+
+; CHECK: [[M]]:
+; CHECK: br label %[[O:.+]]
+
+; CHECK: [[FORCOND43]]:
+; CHECK: br i1 %[[CMP14]], label %[[FORBODY47:.+]], label %[[FOREND54:.+]]
+
+; CHECK: [[FORBODY47]]:
+; CHECK: br i1 true, label %[[L:.+]], label %[[FORINC52:.+]]
+
+; CHECK: [[FORINC52]]:
+; CHECK: br label %[[FORCOND43]]
+
+; CHECK: [[FOREND54]]:
+; CHECK: br label %[[M]]
+
+; CHECK: [[L]]:
+; CHECK: br label %[[M]]
+
+; CHECK: [[O]]:
+; CHECK: br label %[[FORCOND60:.+]]
+
+; CHECK: [[FORCOND60]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY64:.+]], label %[[PLOOPEXIT:.+]]
+
+; CHECK: [[FORBODY64]]:
+; CHECK: br label %[[FORCOND60]]
+
+; CHECK: [[PLOOPEXIT]]:
+; CHECK: br label %[[P:.+]]
+
+; CHECK: [[P]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization18.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization18.ll
new file mode 100644
index 0000000000000..d870ca351ce11
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization18.ll
@@ -0,0 +1,289 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k partial_linearization18 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | %filecheck %s
+
+; The CFG of the following kernel is:
+;
+;       a
+;       |
+;       b <--.
+;      / \   |
+;     c   d -'
+;    / \  |
+;   e   f |
+;   |    \|
+;   |     g
+;   |    /
+;   |   h
+;    \ / \
+;     i   j
+;      \ /
+;       k
+;
+; * where nodes b and h are uniform branches, and nodes c and d are varying
+;   branches.
+; * where nodes e, f, g, i and k are divergent.
+;
+; With partial linearization, it can be transformed in the following way:
+;
+;     a
+;     |
+;     b <--.
+;    / \   |
+;   c   d -'
+;   |   |
+;   f   |
+;   |   |
+;   e   |
+;    \ /
+;     g
+;     |
+;     h
+;    / \
+;   |   j
+;    \ /
+;     i
+;     |
+;     k
+;
+; __kernel void partial_linearization18(__global int *out, int n) {
+;   int id = get_global_id(0);
+;   int ret = 0;
+;   int i = 0;
+;
+;   while (1) {
+;     if (n > 5) {
+;       if (id + i % 2 == 0) {
+;         goto e;
+;       } else {
+;         goto f;
+;       }
+;     }
+;     if (++i + id > 3) {
+;       goto g;
+;     }
+;   }
+;
+; f:
+;   for (int i = 0; i < n + 5; i++) ret += 2;
+;   goto g;
+;
+; g:
+;   for (int i = 1; i < n * 2; i++) ret *= i;
+;   goto h;
+;
+; e:
+;   for (int i = 0; i < n + 5; i++) ret++;
+;   goto i;
+;
+; h:
+;   if (n > 3) {
+; i:
+;     ret++;
+;   } else {
+;     ret *= 3;
+;   }
+;
+;   out[id] = ret;
+; }
+
+; ModuleID = 'Unknown buffer'
+source_filename = "kernel.opencl"  ; name recorded for the source this module was produced from
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"  ; "e" = little-endian, "m:e" = ELF mangling, "S128" = 128-bit stack alignment
+target triple = "spir64-unknown-unknown"  ; 64-bit SPIR target
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @partial_linearization18(i32 addrspace(1)* %out, i32 %n) #0 {  ; scalar input kernel; matches the OpenCL C listing above
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %conv = trunc i64 %call to i32  ; id = (int)get_global_id(0)
+  br label %while.body
+
+while.body:                                       ; preds = %if.end, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %if.end ]  ; outer i
+  %cmp = icmp sgt i32 %n, 5  ; n > 5
+  br i1 %cmp, label %if.then, label %if.end  ; condition depends only on the argument %n
+
+if.then:                                          ; preds = %while.body
+  %rem = and i32 %i.0, 1  ; low bit of i, standing in for i % 2
+  %add = sub nsw i32 0, %rem
+  %cmp2 = icmp eq i32 %conv, %add  ; id == -(i & 1), i.e. id + i % 2 == 0
+  br i1 %cmp2, label %for.cond26, label %for.cond  ; depends on the work-item id; "goto e" or "goto f"
+
+if.end:                                           ; preds = %while.body
+  %inc = add nuw nsw i32 %i.0, 1  ; ++i
+  %add5 = add nsw i32 %inc, %conv
+  %cmp6 = icmp sgt i32 %add5, 3  ; ++i + id > 3
+  br i1 %cmp6, label %g, label %while.body  ; depends on the work-item id; true edge is "goto g"
+
+for.cond:                                         ; preds = %for.body, %if.then
+  %ret.0 = phi i32 [ %add14, %for.body ], [ 0, %if.then ]
+  %storemerge2 = phi i32 [ %inc15, %for.body ], [ 0, %if.then ]
+  %add11 = add nsw i32 %n, 5
+  %cmp12 = icmp slt i32 %storemerge2, %add11  ; i < n + 5 (loop under label f in the listing)
+  br i1 %cmp12, label %for.body, label %g  ; loop exit is "goto g"
+
+for.body:                                         ; preds = %for.cond
+  %add14 = add nuw nsw i32 %ret.0, 2  ; ret += 2
+  %inc15 = add nuw nsw i32 %storemerge2, 1  ; i++
+  br label %for.cond
+
+g:                                                ; preds = %for.cond, %if.end
+  %ret.1 = phi i32 [ 0, %if.end ], [ %ret.0, %for.cond ]  ; ret on entry to label g
+  br label %for.cond17
+
+for.cond17:                                       ; preds = %for.body20, %g
+  %ret.2 = phi i32 [ %ret.1, %g ], [ %mul21, %for.body20 ]
+  %storemerge = phi i32 [ 1, %g ], [ %inc23, %for.body20 ]  ; loop counter, starting at 1
+  %mul = shl nsw i32 %n, 1
+  %cmp18 = icmp slt i32 %storemerge, %mul  ; i < n * 2
+  br i1 %cmp18, label %for.body20, label %h  ; loop exit is "goto h"
+
+for.body20:                                       ; preds = %for.cond17
+  %mul21 = mul nsw i32 %storemerge, %ret.2  ; ret *= i
+  %inc23 = add nuw nsw i32 %storemerge, 1  ; i++
+  br label %for.cond17
+
+for.cond26:                                       ; preds = %for.body30, %if.then
+  %ret.3 = phi i32 [ %inc31, %for.body30 ], [ 0, %if.then ]
+  %storemerge3 = phi i32 [ %inc33, %for.body30 ], [ 0, %if.then ]
+  %add27 = add nsw i32 %n, 5
+  %cmp28 = icmp slt i32 %storemerge3, %add27  ; i < n + 5 (loop under label e in the listing)
+  br i1 %cmp28, label %for.body30, label %i38  ; loop exit is "goto i"
+
+for.body30:                                       ; preds = %for.cond26
+  %inc31 = add nuw nsw i32 %ret.3, 1  ; ret++
+  %inc33 = add nuw nsw i32 %storemerge3, 1  ; i++
+  br label %for.cond26
+
+h:                                                ; preds = %for.cond17
+  %cmp35 = icmp sgt i32 %n, 3  ; n > 3
+  br i1 %cmp35, label %i38, label %if.else40  ; condition depends only on the argument %n
+
+i38:                                              ; preds = %h, %for.cond26
+  %ret.4 = phi i32 [ %ret.3, %for.cond26 ], [ %ret.2, %h ]
+  %inc39 = add nsw i32 %ret.4, 1  ; ret++ (label i in the listing)
+  br label %if.end42
+
+if.else40:                                        ; preds = %h
+  %mul41 = mul nsw i32 %ret.2, 3  ; ret *= 3
+  br label %if.end42
+
+if.end42:                                         ; preds = %if.else40, %i38
+  %storemerge1 = phi i32 [ %mul41, %if.else40 ], [ %inc39, %i38 ]
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %storemerge1, i32 addrspace(1)* %arrayidx, align 4  ; out[id] = ret
+  ret void
+}
+
+; Function Attrs: convergent nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1  ; get_global_id builtin (mangled name); the kernel calls it with i32 0
+
+attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }  ; used by the kernel definition
+attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }  ; used by the get_global_id declaration
+attributes #2 = { convergent nobuiltin nounwind readonly }  ; used at the get_global_id call site
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.kernels = !{!2}  ; one entry: the kernel under test
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}  ; shared by the ocl.version and spir.version nodes above
+!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization18, !3, !4, !5, !6, !7, !8}  ; kernel function followed by its argument-info nodes
+!3 = !{!"kernel_arg_addr_space", i32 1, i32 0}  ; one value per argument, in order (out, n)
+!4 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!5 = !{!"kernel_arg_type", !"int*", !"int"}
+!6 = !{!"kernel_arg_base_type", !"int*", !"int"}
+!7 = !{!"kernel_arg_type_qual", !"", !""}
+!8 = !{!"kernel_arg_name", !"out", !"n"}
+
+; CHECK: spir_kernel void @__vecz_v4_partial_linearization18
+; CHECK: br label %[[WHILEBODY:.+]]
+
+; CHECK: [[WHILEBODY]]:
+; CHECK: %[[CMP:.+]] = icmp
+; CHECK: br i1 %[[CMP]], label %[[IFTHEN:.+]], label %[[IFEND:.+]]
+
+; CHECK: [[IFTHEN]]:
+; CHECK: br label %[[WHILEBODYPUREEXIT:.+]]
+
+; CHECK: [[IFTHENELSE:.+]]:
+; CHECK: br label %[[G:.+]]
+
+; CHECK: [[IFTHENSPLIT:.+]]:
+; CHECK: br label %[[FORCONDPREHEADER:.+]]
+
+; CHECK: [[FORCONDPREHEADER]]:
+; CHECK: br label %[[FORCOND:.+]]
+
+; CHECK: [[FORCOND26PREHEADER:.+]]:
+; CHECK: br label %[[FORCOND26:.+]]
+
+; CHECK: [[IFEND]]:
+; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT]]
+
+; CHECK: [[WHILEBODYPUREEXIT]]:
+; CHECK: br label %[[GLOOPEXIT2:.+]]
+
+; CHECK: [[FORCOND]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[GLOOPEXIT:.+]]
+
+; CHECK: [[FORBODY]]:
+; CHECK: br label %[[FORCOND]]
+
+; CHECK: [[GLOOPEXIT]]:
+; CHECK: br label %[[FORCOND26PREHEADER]]
+
+; CHECK: [[GLOOPEXIT2]]:
+; CHECK: br label %[[GLOOPEXIT2ELSE:.+]]
+
+; CHECK: [[GLOOPEXIT2ELSE]]:
+; CHECK: br i1 %{{.+}}, label %[[IFTHENELSE]], label %[[IFTHENSPLIT]]
+
+; CHECK: [[G]]:
+; CHECK: br label %[[FORCOND17:.+]]
+
+; CHECK: [[FORCOND17]]:
+; CHECK: %[[CMP18:.+]] = icmp
+; CHECK: br i1 %[[CMP18]], label %[[FORBODY20:.+]], label %[[H:.+]]
+
+; CHECK: [[FORBODY20]]:
+; CHECK: br label %[[FORCOND17]]
+
+; CHECK: [[FORCOND26]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY30:.+]], label %[[I38LOOPEXIT:.+]]
+
+; CHECK: [[FORBODY30]]:
+; CHECK: br label %[[FORCOND26]]
+
+; CHECK: [[H]]:
+; CHECK: %[[CMP35:.+]] = icmp
+; CHECK: br i1 %[[CMP35]], label %[[I38:.+]], label %[[IFELSE40:.+]]
+
+; CHECK: [[I38LOOPEXIT]]:
+; CHECK: br label %[[G]]
+
+; CHECK: [[I38]]:
+; CHECK: br label %[[IFEND42:.+]]
+
+; CHECK: [[IFELSE40]]:
+; CHECK: br label %[[I38]]
+
+; CHECK: [[IFEND42]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization19.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization19.ll
new file mode 100644
index 0000000000000..69e8ecea9d0a5
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization19.ll
@@ -0,0 +1,308 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k partial_linearization19 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | %filecheck %s
+
+; The CFG of the following kernel is:
+;
+;       a
+;       |
+;       b <----.
+;      / \     |
+;     c   \    |
+;    / \   \   |
+;   d   e   f -'
+;   |   |   |
+;    \  \   g
+;     \  \ / \
+;      \  h   i <,
+;       \  \ /  /
+;        \  j  /
+;         \   /
+;          `-'
+;
+; * where nodes b, c, and g are uniform branches, and node f is a varying
+;   branch.
+; * where nodes g, h, i and j are divergent.
+;
+; With partial linearization, it can be transformed in the following way:
+;
+;       a
+;       |
+;       b <----.
+;      / \     |
+;     c   \    |
+;    / \   \   |
+;   d   e   f -'
+;   |   |   |
+;    \  |  /
+;     \ | /
+;      \|/
+;       g
+;       |
+;       i
+;       |
+;       h
+;       |
+;       j
+;
+; The uniform branch `g` has been linearized because both its successors are
+; divergent. Leaving `g` as a branch would mean that only one of its two
+; successors could be executed, depending on how the uniform condition
+; evaluates, whereas what we want is to possibly execute both successors no
+; matter what the uniform condition evaluates to.
+;
+; __kernel void partial_linearization19(__global int *out, int n) {
+;   int id = get_global_id(0);
+;   int ret = 0;
+;   int i = 0;
+;
+;   while (1) {
+;     if (n > 5) {
+;       if (n == 6) {
+;         goto d;
+;       } else {
+;         goto e;
+;       }
+;     }
+;     if (++i + id > 3) {
+;       break;
+;     }
+;   }
+;
+;   // g
+;   if (n == 3) {
+;     goto h;
+;   } else {
+;     goto i;
+;   }
+;
+; d:
+;   for (int i = 0; i < n + 5; i++) ret += 2;
+;   goto i;
+;
+; e:
+;   for (int i = 1; i < n * 2; i++) ret += i;
+;   goto h;
+;
+; i:
+;   for (int i = 0; i < n + 5; i++) ret++;
+;   goto j;
+;
+; h:
+;   for (int i = 0; i < n; i++) ret++;
+;   goto j;
+;
+; j:
+;   out[id] = ret;
+; }
+
+; ModuleID = 'Unknown buffer'
+source_filename = "kernel.opencl"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @partial_linearization19(i32 addrspace(1)* %out, i32 %n) #0 {
+entry:                                            ; CFG node a
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %conv = trunc i64 %call to i32
+  br label %while.body
+
+while.body:                                       ; preds = %if.end, %entry -- CFG node b: loop header, uniform branch on n > 5
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %if.end ]
+  %cmp = icmp sgt i32 %n, 5
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %while.body -- CFG node c: uniform branch on n == 6
+  %cmp2 = icmp eq i32 %n, 6
+  br i1 %cmp2, label %for.cond, label %for.cond20
+
+if.end:                                           ; preds = %while.body -- CFG node f: varying branch (%add depends on the global id in %conv)
+  %inc = add nuw nsw i32 %i.0, 1
+  %add = add nsw i32 %inc, %conv
+  %cmp5 = icmp sgt i32 %add, 3
+  br i1 %cmp5, label %while.end, label %while.body
+
+while.end:                                        ; preds = %if.end -- CFG node g: uniform branch on n == 3
+  %cmp9 = icmp eq i32 %n, 3
+  br i1 %cmp9, label %h, label %i28
+
+for.cond:                                         ; preds = %for.body, %if.then -- CFG node d: loop header, exits to i28
+  %ret.0 = phi i32 [ %add17, %for.body ], [ 0, %if.then ]
+  %storemerge3 = phi i32 [ %inc18, %for.body ], [ 0, %if.then ]
+  %add14 = add nsw i32 %n, 5
+  %cmp15 = icmp slt i32 %storemerge3, %add14
+  br i1 %cmp15, label %for.body, label %i28
+
+for.body:                                         ; preds = %for.cond -- CFG node d: loop body (ret += 2)
+  %add17 = add nuw nsw i32 %ret.0, 2
+  %inc18 = add nuw nsw i32 %storemerge3, 1
+  br label %for.cond
+
+for.cond20:                                       ; preds = %for.body23, %if.then -- CFG node e: loop header, exits to h
+  %ret.1 = phi i32 [ %add24, %for.body23 ], [ 0, %if.then ]
+  %storemerge2 = phi i32 [ %inc26, %for.body23 ], [ 1, %if.then ]
+  %mul = shl nsw i32 %n, 1
+  %cmp21 = icmp slt i32 %storemerge2, %mul
+  br i1 %cmp21, label %for.body23, label %h
+
+for.body23:                                       ; preds = %for.cond20 -- CFG node e: loop body (ret += i)
+  %add24 = add nuw nsw i32 %storemerge2, %ret.1
+  %inc26 = add nuw nsw i32 %storemerge2, 1
+  br label %for.cond20
+
+i28:                                              ; preds = %for.cond, %while.end -- CFG node i
+  %ret.2 = phi i32 [ 0, %while.end ], [ %ret.0, %for.cond ]
+  br label %for.cond30
+
+for.cond30:                                       ; preds = %for.body34, %i28 -- CFG node i: loop header, exits to j
+  %ret.3 = phi i32 [ %ret.2, %i28 ], [ %inc35, %for.body34 ]
+  %storemerge = phi i32 [ 0, %i28 ], [ %inc37, %for.body34 ]
+  %add31 = add nsw i32 %n, 5
+  %cmp32 = icmp slt i32 %storemerge, %add31
+  br i1 %cmp32, label %for.body34, label %j
+
+for.body34:                                       ; preds = %for.cond30 -- CFG node i: loop body (ret++)
+  %inc35 = add nuw nsw i32 %ret.3, 1
+  %inc37 = add nuw nsw i32 %storemerge, 1
+  br label %for.cond30
+
+h:                                                ; preds = %for.cond20, %while.end -- CFG node h
+  %ret.4 = phi i32 [ 0, %while.end ], [ %ret.1, %for.cond20 ]
+  br label %for.cond40
+
+for.cond40:                                       ; preds = %for.body43, %h -- CFG node h: loop header, exits to j
+  %ret.5 = phi i32 [ %ret.4, %h ], [ %inc44, %for.body43 ]
+  %storemerge1 = phi i32 [ 0, %h ], [ %inc46, %for.body43 ]
+  %cmp41 = icmp slt i32 %storemerge1, %n
+  br i1 %cmp41, label %for.body43, label %j
+
+for.body43:                                       ; preds = %for.cond40 -- CFG node h: loop body (ret++)
+  %inc44 = add nsw i32 %ret.5, 1
+  %inc46 = add nuw nsw i32 %storemerge1, 1
+  br label %for.cond40
+
+j:                                                ; preds = %for.cond40, %for.cond30 -- CFG node j: stores ret to out[id]
+  %ret.6 = phi i32 [ %ret.3, %for.cond30 ], [ %ret.5, %for.cond40 ]
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %ret.6, i32 addrspace(1)* %arrayidx, align 4
+  ret void
+}
+
+; Function Attrs: convergent nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { convergent nobuiltin nounwind readonly }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.kernels = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization19, !3, !4, !5, !6, !7, !8}
+!3 = !{!"kernel_arg_addr_space", i32 1, i32 0}
+!4 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!5 = !{!"kernel_arg_type", !"int*", !"int"}
+!6 = !{!"kernel_arg_base_type", !"int*", !"int"}
+!7 = !{!"kernel_arg_type_qual", !"", !""}
+!8 = !{!"kernel_arg_name", !"out", !"n"}
+
+; CHECK: spir_kernel void @__vecz_v4_partial_linearization19
+; CHECK: br label %[[WHILEBODY:.+]]
+
+; CHECK: [[WHILEBODY]]:
+; CHECK: %[[CMP:.+]] = icmp
+; CHECK: br i1 %[[CMP]], label %[[IFTHEN:.+]], label %[[IFEND:.+]]
+
+; CHECK: [[IFTHEN]]:
+; CHECK: %[[CMP2:.+]] = icmp
+; CHECK: br label %[[WHILEBODYPUREEXIT:.+]]
+
+; CHECK: [[IFTHENELSE:.+]]:
+; CHECK: br label %[[H:.+]]
+
+; CHECK: [[IFTHENSPLIT:.+]]:
+; CHECK: br i1 %[[CMP2MERGE:.+]], label %[[FORCONDPREHEADER:.+]], label %[[FORCOND20PREHEADER:.+]]
+
+; CHECK: [[FORCOND20PREHEADER]]:
+; CHECK: br label %[[FORCOND20:.+]]
+
+; CHECK: [[FORCONDPREHEADER]]:
+; CHECK: br label %[[FORCOND:.+]]
+
+; CHECK: [[IFEND]]:
+; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT]]
+
+; CHECK: [[WHILEBODYPUREEXIT]]:
+; CHECK: %[[CMP2MERGE]] = phi i1 [ %[[CMP2]], %[[IFTHEN]] ], [ false, %[[IFEND]] ]
+; CHECK: br label %[[WHILEEND:.+]]
+
+; CHECK: [[WHILEEND]]:
+; CHECK: br label %[[WHILEENDELSE:.+]]
+
+; CHECK: [[WHILEENDELSE]]:
+; CHECK: br i1 %{{.+}}, label %[[IFTHENELSE]], label %[[IFTHENSPLIT]]
+
+; CHECK: [[FORCOND]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[I28LOOPEXIT:.+]]
+
+; CHECK: [[FORBODY]]:
+; CHECK: br label %[[FORCOND]]
+
+; CHECK: [[FORCOND20]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY23:.+]], label %[[HLOOPEXIT:.+]]
+
+; CHECK: [[FORBODY23]]:
+; CHECK: br label %[[FORCOND20]]
+
+; CHECK: [[I28LOOPEXIT]]:
+; CHECK: br label %[[H:.+]]
+
+; CHECK: [[I28:.+]]:
+; CHECK: br label %[[FORCOND30:.+]]
+
+; CHECK: [[FORCOND30]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY34:.+]], label %[[JLOOPEXIT:.+]]
+
+; CHECK: [[FORBODY34]]:
+; CHECK: br label %[[FORCOND30]]
+
+; CHECK: [[HLOOPEXIT]]:
+; CHECK: br label %[[H]]
+
+; CHECK: [[H]]:
+; CHECK: br label %[[FORCOND40:.+]]
+
+; CHECK: [[FORCOND40]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY43:.+]], label %[[JLOOPEXIT2:.+]]
+
+; CHECK: [[FORBODY43]]:
+; CHECK: br label %[[FORCOND40]]
+
+; CHECK: [[JLOOPEXIT]]:
+; CHECK: br label %[[J:.+]]
+
+; CHECK: [[JLOOPEXIT2]]:
+; CHECK: br label %[[I28]]
+
+; CHECK: [[J]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization2.ll
new file mode 100644
index 0000000000000..85ef2577e5a43
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization2.ll
@@ -0,0 +1,274 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k partial_linearization2 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -S < %s | %filecheck %s
+
+; The CFG of the following kernel is:
+;
+;         a
+;        / \
+;       /   \
+;      /     \
+;     b       c
+;    / \     / \
+;   d   e   f   g
+;    \   \ /   /
+;     \   X   /
+;      \ / \ /
+;       h   i
+;        \ /
+;         j
+;
+; * where node a is a uniform branch, and nodes b and c are varying branches.
+; * where nodes d, e, f, g are divergent.
+;
+; With partial linearization we will have a CFG of the form:
+;
+;         a
+;        / \
+;       /   \
+;      /     \
+;     b       c
+;    /         \
+;   e - d   f - g
+;        \ /
+;         i
+;         |
+;         h
+;         |
+;         j
+;
+; __kernel void partial_linearization2(__global int *out, int n) {
+;   int id = get_global_id(0);
+;   int ret = 0;
+;
+;   if (n > 10) { // uniform
+;     if (id % 3 == 0) { // varying
+;       for (int i = 0; i < n - 1; i++) { ret /= 2; } goto h;
+;     } else { // varying
+;       for (int i = 0; i < n / 3; i++) { ret -= 2; } goto i;
+;     }
+;   } else { // uniform
+;     if (id % 2 == 0) { // varying
+;       for (int i = 0; i < n * 2; i++) { ret += 1; } goto h;
+;     } else { // varying
+;       for (int i = 0; i < n + 5; i++) { ret *= 2; } goto i;
+;     }
+;   }
+;
+; h:
+;   ret += 5;
+;   goto end;
+;
+; i:
+;   ret *= 10;
+;   goto end;
+;
+; end:
+;   out[id] = ret;
+; }
+
+; ModuleID = 'Unknown buffer'
+source_filename = "Unknown buffer"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @partial_linearization2(i32 addrspace(1)* %out, i32 %n) #0 {
+entry:                                            ; CFG node a: uniform branch on n > 10
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %conv = trunc i64 %call to i32
+  %cmp = icmp sgt i32 %n, 10
+  br i1 %cmp, label %if.then, label %if.else17
+
+if.then:                                          ; preds = %entry -- CFG node b: varying branch on id % 3 == 0
+  %rem = srem i32 %conv, 3
+  %cmp2 = icmp eq i32 %rem, 0
+  br i1 %cmp2, label %if.then4, label %if.else
+
+if.then4:                                         ; preds = %if.then -- CFG node d: entry to the first loop
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %if.then4 -- CFG node d: loop header, exits to h
+  %ret.0 = phi i32 [ 0, %if.then4 ], [ %div, %for.body ]
+  %storemerge5 = phi i32 [ 0, %if.then4 ], [ %inc, %for.body ]
+  %sub = add nsw i32 %n, -1
+  %cmp5 = icmp slt i32 %storemerge5, %sub
+  br i1 %cmp5, label %for.body, label %h
+
+for.body:                                         ; preds = %for.cond -- CFG node d: loop body (ret /= 2)
+  %div = sdiv i32 %ret.0, 2
+  %inc = add nsw i32 %storemerge5, 1
+  br label %for.cond
+
+if.else:                                          ; preds = %if.then -- CFG node e: entry to the second loop
+  br label %for.cond8
+
+for.cond8:                                        ; preds = %for.body12, %if.else -- CFG node e: loop header, exits to i42
+  %ret.1 = phi i32 [ 0, %if.else ], [ %sub13, %for.body12 ]
+  %storemerge4 = phi i32 [ 0, %if.else ], [ %inc15, %for.body12 ]
+  %div9 = sdiv i32 %n, 3
+  %cmp10 = icmp slt i32 %storemerge4, %div9
+  br i1 %cmp10, label %for.body12, label %i42
+
+for.body12:                                       ; preds = %for.cond8 -- CFG node e: loop body (ret -= 2)
+  %sub13 = add nsw i32 %ret.1, -2
+  %inc15 = add nsw i32 %storemerge4, 1
+  br label %for.cond8
+
+if.else17:                                        ; preds = %entry -- CFG node c: varying branch on id % 2 == 0, tested here as (id & 1) == 0
+  %rem181 = and i32 %conv, 1
+  %cmp19 = icmp eq i32 %rem181, 0
+  br i1 %cmp19, label %if.then21, label %if.else30
+
+if.then21:                                        ; preds = %if.else17 -- CFG node f: entry to the third loop
+  br label %for.cond23
+
+for.cond23:                                       ; preds = %for.body26, %if.then21 -- CFG node f: loop header, exits to h
+  %ret.2 = phi i32 [ 0, %if.then21 ], [ %add, %for.body26 ]
+  %storemerge3 = phi i32 [ 0, %if.then21 ], [ %inc28, %for.body26 ]
+  %mul = shl nsw i32 %n, 1
+  %cmp24 = icmp slt i32 %storemerge3, %mul
+  br i1 %cmp24, label %for.body26, label %h
+
+for.body26:                                       ; preds = %for.cond23 -- CFG node f: loop body (ret += 1)
+  %add = add nsw i32 %ret.2, 1
+  %inc28 = add nsw i32 %storemerge3, 1
+  br label %for.cond23
+
+if.else30:                                        ; preds = %if.else17 -- CFG node g: entry to the fourth loop
+  br label %for.cond32
+
+for.cond32:                                       ; preds = %for.body36, %if.else30 -- CFG node g: loop header, exits to i42
+  %ret.3 = phi i32 [ 0, %if.else30 ], [ %mul37, %for.body36 ]
+  %storemerge = phi i32 [ 0, %if.else30 ], [ %inc39, %for.body36 ]
+  %add33 = add nsw i32 %n, 5
+  %cmp34 = icmp slt i32 %storemerge, %add33
+  br i1 %cmp34, label %for.body36, label %i42
+
+for.body36:                                       ; preds = %for.cond32 -- CFG node g: loop body (ret *= 2)
+  %mul37 = shl nsw i32 %ret.3, 1
+  %inc39 = add nsw i32 %storemerge, 1
+  br label %for.cond32
+
+h:                                                ; preds = %for.cond23, %for.cond -- CFG node h (ret += 5)
+  %ret.4 = phi i32 [ %ret.0, %for.cond ], [ %ret.2, %for.cond23 ]
+  %add41 = add nsw i32 %ret.4, 5
+  br label %end
+
+i42:                                              ; preds = %for.cond32, %for.cond8 -- CFG node i (ret *= 10)
+  %ret.5 = phi i32 [ %ret.1, %for.cond8 ], [ %ret.3, %for.cond32 ]
+  %mul43 = mul nsw i32 %ret.5, 10
+  br label %end
+
+end:                                              ; preds = %i42, %h -- CFG node j: stores ret to out[id]
+  %storemerge2 = phi i32 [ %mul43, %i42 ], [ %add41, %h ]
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %storemerge2, i32 addrspace(1)* %arrayidx, align 4
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nobuiltin nounwind readonly }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.kernels = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization2, !3, !4, !5, !6, !7, !8}
+!3 = !{!"kernel_arg_addr_space", i32 1, i32 0}
+!4 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!5 = !{!"kernel_arg_type", !"int*", !"int"}
+!6 = !{!"kernel_arg_base_type", !"int*", !"int"}
+!7 = !{!"kernel_arg_type_qual", !"", !""}
+!8 = !{!"kernel_arg_name", !"out", !"n"}
+
+; CHECK: spir_kernel void @__vecz_v4_partial_linearization2
+; CHECK: %[[CMP:.+]] = icmp
+; CHECK: br i1 %[[CMP]], label %[[IFTHEN:.+]], label %[[IFELSE17:.+]]
+
+; CHECK: [[IFTHEN]]:
+; CHECK: br label %[[FORCOND8PREHEADER:.+]]
+
+; CHECK: [[FORCOND8PREHEADER]]:
+; CHECK: br label %[[FORCOND8:.+]]
+
+; CHECK: [[FORCONDPREHEADER:.+]]:
+; CHECK: br label %[[FORCOND:.+]]
+
+; CHECK: [[FORCOND]]:
+; CHECK: %[[CMP5:.+]] = icmp
+; CHECK: br i1 %[[CMP5]], label %[[FORBODY:.+]], label %[[HLOOPEXIT:.+]]
+
+; CHECK: [[FORBODY]]:
+; CHECK: br label %[[FORCOND]]
+
+; CHECK: [[FORCOND8]]:
+; CHECK: %[[CMP10:.+]] = icmp
+; CHECK: br i1 %[[CMP10]], label %[[FORBODY12:.+]], label %[[I42LOOPEXIT:.+]]
+
+; CHECK: [[FORBODY12]]:
+; CHECK: br label %[[FORCOND8]]
+
+; CHECK: [[IFELSE17]]:
+; CHECK: br label %[[FORCOND32PREHEADER:.+]]
+
+; CHECK: [[FORCOND32PREHEADER]]:
+; CHECK: br label %[[FORCOND32:.+]]
+
+; CHECK: [[FORCOND23PREHEADER:.+]]:
+; CHECK: br label %[[FORCOND23:.+]]
+
+; CHECK: [[FORCOND23]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY26:.+]], label %[[HLOOPEXIT3:.+]]
+
+; CHECK: [[FORBODY26]]:
+; CHECK: br label %[[FORCOND23]]
+
+; CHECK: [[FORCOND32]]:
+; CHECK: br i1 false, label %[[FORBODY36:.+]], label %[[I42LOOPEXIT4:.+]]
+
+; CHECK: [[FORBODY36]]:
+; CHECK: br label %[[FORCOND32]]
+
+; CHECK: [[HLOOPEXIT]]:
+; CHECK: br label %[[I42:.+]]
+
+; CHECK: [[HLOOPEXIT3]]:
+; CHECK: br label %[[I42]]
+
+; CHECK: [[H:.+]]:
+; CHECK: br label %[[END:.+]]
+
+; CHECK: [[I42LOOPEXIT]]:
+; CHECK: br label %[[FORCONDPREHEADER]]
+
+; CHECK: [[I42LOOPEXIT4]]:
+; CHECK: br label %[[FORCOND23PREHEADER]]
+
+; CHECK: [[I42]]:
+; CHECK: br label %[[H]]
+
+; CHECK: [[END]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization20.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization20.ll
new file mode 100644
index 0000000000000..6ee82d98f8009
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization20.ll
@@ -0,0 +1,236 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k partial_linearization20 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | %filecheck %s
+
+; The CFG of the following kernel is:
+;
+;     a
+;     |
+;     b <--------.
+;    / \         |
+;   |   c        |
+;   |  / \       |
+;   | f   h <--. |
+;   | |  / \   | |
+;   | | |   d -' |
+;   | | |   |    |
+;   | | |   e ---'
+;   | | |  /
+;   | | | /
+;   | | |/
+;   | | /
+;    \|/
+;     g
+;
+; * where nodes b, d, and e are uniform branches, and node h is a varying
+;   branch.
+; * where nodes b, d and g are divergent.
+;
+; With partial linearization, it will be transformed as follows:
+;
+;     a
+;     |
+;     b <--.
+;     |    |
+;     c    |
+;    /|    |
+;   f h <. |
+;   | |  | |
+;   | d -' |
+;   | |    |
+;   | e ---'
+;    \|
+;     g
+;
+; __kernel void partial_linearization20(__global int *out, int n) {
+;   int id = get_global_id(0);
+;   int ret = 0;
+;
+;   while (1) {
+;     if (n > 0 && n < 5) {
+;       goto g;
+;     }
+;     if (n == 6) {
+;       goto f;
+;     }
+;     while (1) {
+;       if (ret++ + id >= n) {
+;         goto d;
+;       }
+;       if (n & 1) {
+;         goto g;
+;       }
+;
+; d:
+;       if (n > 3) {
+;         goto e;
+;       }
+;     }
+; e:
+;     if (n & 1) {
+;       goto g;
+;     }
+;   }
+;
+; f:
+;   for (int i = 0; i < n + 1; i++) ret++;
+; g:
+;   out[id] = ret;
+; }
+
+; ModuleID = 'kernel.opencl'
+source_filename = "kernel.opencl"
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @partial_linearization20(i32 addrspace(1)* %out, i32 %n) #0 {
+entry:                                            ; CFG node a
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %conv = trunc i64 %call to i32
+  br label %while.body
+
+while.body:                                       ; preds = %e, %entry -- CFG node b: outer loop header, uniform test (n > 0 && n < 5) done as (n - 1) <u 4
+  %ret.0 = phi i32 [ 0, %entry ], [ %inc, %e ]
+  %n.off = add i32 %n, -1
+  %0 = icmp ult i32 %n.off, 4
+  br i1 %0, label %g, label %if.end
+
+if.end:                                           ; preds = %while.body -- CFG node c: uniform branch on n == 6
+  %cmp4 = icmp eq i32 %n, 6
+  br i1 %cmp4, label %for.cond, label %while.body9
+
+while.body9:                                      ; preds = %d, %if.end -- CFG node h: inner loop header, varying branch (%cmp10 depends on %conv)
+  %ret.1 = phi i32 [ %ret.0, %if.end ], [ %inc, %d ]
+  %inc = add nsw i32 %ret.1, 1
+  %add = add nsw i32 %ret.1, %conv
+  %cmp10 = icmp sge i32 %add, %n
+  %and = and i32 %n, 1
+  %tobool = icmp eq i32 %and, 0
+  %or.cond1 = or i1 %tobool, %cmp10
+  br i1 %or.cond1, label %d, label %g
+
+d:                                                ; preds = %while.body9 -- CFG node d: uniform branch on n > 3, inner loop latch
+  %cmp16 = icmp sgt i32 %n, 3
+  br i1 %cmp16, label %e, label %while.body9
+
+e:                                                ; preds = %d -- CFG node e: uniform branch on (n & 1) == 0, outer loop latch
+  %and20 = and i32 %n, 1
+  %tobool21 = icmp eq i32 %and20, 0
+  br i1 %tobool21, label %while.body, label %g
+
+for.cond:                                         ; preds = %for.body, %if.end -- CFG node f: loop header, exits to g
+  %ret.2 = phi i32 [ %inc27, %for.body ], [ %ret.0, %if.end ]
+  %storemerge = phi i32 [ %inc28, %for.body ], [ 0, %if.end ]
+  %cmp25 = icmp sgt i32 %storemerge, %n
+  br i1 %cmp25, label %g, label %for.body
+
+for.body:                                         ; preds = %for.cond -- CFG node f: loop body (ret++)
+  %inc27 = add nsw i32 %ret.2, 1
+  %inc28 = add nuw nsw i32 %storemerge, 1
+  br label %for.cond
+
+g:                                                ; preds = %for.cond, %e, %while.body9, %while.body -- CFG node g: stores ret to out[id]
+  %ret.3 = phi i32 [ %ret.0, %while.body ], [ %inc, %e ], [ %ret.2, %for.cond ], [ %inc, %while.body9 ]
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %ret.3, i32 addrspace(1)* %arrayidx, align 4
+  ret void
+}
+
+; Function Attrs: convergent nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { convergent nobuiltin nounwind readonly }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.kernels = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization20, !3, !4, !5, !6, !7, !8}
+!3 = !{!"kernel_arg_addr_space", i32 1, i32 0}
+!4 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!5 = !{!"kernel_arg_type", !"int*", !"int"}
+!6 = !{!"kernel_arg_base_type", !"int*", !"int"}
+!7 = !{!"kernel_arg_type_qual", !"", !""}
+!8 = !{!"kernel_arg_name", !"out", !"n"}
+
+; CHECK: spir_kernel void @__vecz_v4_partial_linearization20
+; CHECK: br label %[[WHILEBODY:.+]]
+
+; CHECK: [[WHILEBODY]]:
+; CHECK: br label %[[IFEND:.+]]
+
+; CHECK: [[IFEND]]:
+; CHECK: %[[CMP4:.+]] = icmp
+; CHECK: br i1 %[[CMP4]], label %[[FORCONDPREHEADER:.+]], label %[[WHILEBODY9PREHEADER:.+]]
+
+; CHECK: [[WHILEBODY9PREHEADER]]:
+; CHECK: br label %[[WHILEBODY9:.+]]
+
+; CHECK: [[FORCONDPREHEADER]]:
+; CHECK: br label %[[WHILEBODYPUREEXIT:.+]]
+
+; CHECK: [[FORCONDPREHEADERELSE:.+]]:
+; CHECK: br label %[[G:.+]]
+
+; CHECK: [[FORCONDPREHEADERSPLIT:.+]]:
+; CHECK: br label %[[FORCOND:.+]]
+
+; CHECK: [[WHILEBODY9]]:
+; CHECK: br label %[[D:.+]]
+
+; CHECK: [[D]]:
+; CHECK: br i1 %{{.+}}, label %[[WHILEBODY9]], label %[[WHILEBODY9PUREEXIT:.+]]
+
+; CHECK: [[WHILEBODY9PUREEXIT]]:
+; CHECK: br label %[[E:.+]]
+
+; CHECK: [[E]]:
+; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT]]
+
+; CHECK: [[WHILEBODYPUREEXIT]]:
+; CHECK: br label %[[GLOOPEXIT1:.+]]
+
+; CHECK: [[FORCOND]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(true)}}, label %[[GLOOPEXIT:.+]], label %[[FORBODY:.+]]
+
+; CHECK: [[FORBODY]]:
+; CHECK: br label %[[FORCOND]]
+
+; CHECK: [[GLOOPEXIT]]:
+; CHECK: br label %[[G]]
+
+; CHECK: [[GLOOPEXIT1]]:
+; CHECK: br label %[[GLOOPEXIT1ELSE:.+]]
+
+; CHECK: [[GLOOPEXIT1ELSE]]:
+; CHECK: br label %[[GLOOPEXIT2:.+]]
+
+; CHECK: [[GLOOPEXIT2]]:
+; CHECK: br label %[[GLOOPEXIT2ELSE:.+]]
+
+; CHECK: [[GLOOPEXIT2ELSE]]:
+; CHECK: br i1 %{{.+}}, label %[[FORCONDPREHEADERELSE]], label %[[FORCONDPREHEADERSPLIT]]
+
+; CHECK: [[G]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization21.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization21.ll
new file mode 100644
index 0000000000000..d71e84ab8facb
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization21.ll
@@ -0,0 +1,197 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k partial_linearization21 -vecz-passes=vecz-loop-rotate,cfg-convert -S < %s | %filecheck %s
+
+; The CFG of the following kernel is:
+;
+;     a
+;     |
+;     b <------.
+;    / \       |
+;   |   c <--. |
+;   |  / \   | |
+;   | |   d -' |
+;   | |  / \   |
+;   | | |   e -'
+;   | | |  /
+;   | | | /
+;   | | |/
+;   | | /
+;    \|/
+;     f
+;
+; * where nodes b, d, and e are uniform branches, and node c is a varying
+;   branch.
+; * where nodes b, d, e and f are divergent.
+;
+; With partial linearization, it will be transformed as follows:
+;
+;   a
+;   |
+;   b <--.
+;   |    |
+;   c <. |
+;   |  | |
+;   d -' |
+;   |    |
+;   e ---'
+;   |
+;   f
+;
+; __kernel void partial_linearization21(__global int *out, int n) {
+;   int id = get_global_id(0);
+;   int ret = 0;
+;
+;   while (1) {
+;     if (n > 0 && n < 5) {
+;       goto f;
+;     }
+;     while (1) {
+;       if (n <= 2) {
+;         goto f;
+;       } else {
+;         if (ret + id >= n) {
+;           goto d;
+;         }
+;       }
+;       if (n & 1) {
+;         goto f;
+;       }
+;
+; d:
+;       if (n > 3) {
+;         goto e;
+;       }
+;     }
+;
+; e:
+;     if (n & 1) {
+;       goto f;
+;     }
+;   }
+;
+; f:
+;   out[id] = ret;
+; }
+
+; ModuleID = 'Unknown buffer'
+source_filename = "kernel.opencl"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @partial_linearization21(i32 addrspace(1)* %out, i32 %n) #0 {
+entry:                                            ; CFG node a
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %conv = trunc i64 %call to i32
+  br label %while.body
+
+while.body:                                       ; preds = %e, %entry -- CFG node b: outer loop header, uniform test on n only
+  %n.off = add i32 %n, -1
+  %0 = icmp ult i32 %n.off, 4
+  %cmp6 = icmp slt i32 %n, 3
+  %or.cond1 = or i1 %cmp6, %0
+  br i1 %or.cond1, label %f, label %if.else
+
+while.body5:                                      ; preds = %d -- inner loop re-entry from d, uniform test on n; presumably part of CFG node c (TODO confirm)
+  %cmp6.old = icmp eq i32 %n, 3
+  br i1 %cmp6.old, label %if.else, label %f
+
+if.else:                                          ; preds = %while.body5, %while.body -- varying branch (%cmp9 depends on %conv); presumably CFG node c (TODO confirm)
+  %cmp9 = icmp sge i32 %conv, %n
+  %and = and i32 %n, 1
+  %tobool = icmp eq i32 %and, 0
+  %or.cond2 = or i1 %tobool, %cmp9
+  br i1 %or.cond2, label %d, label %f
+
+d:                                                ; preds = %if.else -- CFG node d: uniform branch on n > 3
+  %cmp16 = icmp sgt i32 %n, 3
+  br i1 %cmp16, label %e, label %while.body5
+
+e:                                                ; preds = %d -- CFG node e: uniform branch on (n & 1) == 0, outer loop latch
+  %and20 = and i32 %n, 1
+  %tobool21 = icmp eq i32 %and20, 0
+  br i1 %tobool21, label %while.body, label %f
+
+f:                                                ; preds = %e, %if.else, %while.body5, %while.body -- CFG node f: stores constant 0 to out[id]
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 0, i32 addrspace(1)* %arrayidx, align 4
+  ret void
+}
+
+; Function Attrs: convergent nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { convergent nobuiltin nounwind readonly }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.kernels = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization21, !3, !4, !5, !6, !7, !8}
+!3 = !{!"kernel_arg_addr_space", i32 1, i32 0}
+!4 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!5 = !{!"kernel_arg_type", !"int*", !"int"}
+!6 = !{!"kernel_arg_base_type", !"int*", !"int"}
+!7 = !{!"kernel_arg_type_qual", !"", !""}
+!8 = !{!"kernel_arg_name", !"out", !"n"}
+
+; CHECK: spir_kernel void @__vecz_v4_partial_linearization21
+; CHECK: br label %[[WHILEBODY:.+]]
+
+; CHECK: [[WHILEBODY]]:
+; CHECK: br label %[[IFELSEPREHEADER:.+]]
+
+; CHECK: [[IFELSEPREHEADER]]:
+; CHECK: br label %[[IFELSE:.+]]
+
+; CHECK: [[WHILEBODY5:.+]]:
+
+; CHECK: br i1 %{{.+}}, label %[[IFELSE]], label %[[IFELSEPUREEXIT:.+]]
+
+; CHECK: [[IFELSEPUREEXIT]]:
+; CHECK: br label %[[E:.+]]
+
+; CHECK: [[IFELSE]]:
+; CHECK: br label %[[D:.+]]
+
+; CHECK: [[D]]:
+; CHECK: br label %[[WHILEBODY5]]
+
+; CHECK: [[E]]:
+; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]]
+
+; CHECK: [[WHILEBODYPUREEXIT]]:
+; CHECK: br label %[[FLOOPEXIT:.+]]
+
+; CHECK: [[FLOOPEXIT]]:
+; CHECK: br label %[[FLOOPEXITELSE:.+]]
+
+; CHECK: [[FLOOPEXITELSE]]:
+; CHECK: br label %[[FLOOPEXIT1:.+]]
+
+; CHECK: [[FLOOPEXIT1]]:
+; CHECK: br label %[[F:.+]]
+
+; CHECK: [[F]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization22.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization22.ll
new file mode 100644
index 0000000000000..0bcd836a36de0
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization22.ll
@@ -0,0 +1,264 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; REQUIRES: llvm-12+
+; RUN: %veczc -k partial_linearization22 -vecz-passes="function(lowerswitch),vecz-loop-rotate,indvars,cfg-convert" -S < %s | %filecheck %s
+
+; The CFG of the following kernel is:
+;
+;     a
+;     |
+;     b <------.
+;    / \       |
+;   f   c <--. |
+;   |\ / \   | |
+;   | |   d -' |
+;   | |\ / \   |
+;   | | |   e -'
+;   | | |\ /
+;   | | | g
+;   | | |/
+;   | | /
+;    \|/
+;     h
+;
+; * where nodes b, d, and e are uniform branches, and node c is a varying
+;   branch.
+; * where nodes b, d, e and f are divergent.
+;
+; With partial linearization, it will be transformed as follows:
+;
+;     a
+;     |
+;     b <--.
+;    /|    |
+;   f c <. |
+;   | |  | |
+;   | d -' |
+;   | |    |
+;   | e ---'
+;    \|
+;     g
+;     |
+;     h
+;
+; __kernel void partial_linearization22(__global int *out, int n) {
+;   int id = get_global_id(0);
+;   int ret = 0;
+;
+;   while (1) {
+;     if (n > 0 && n < 5) {
+;       goto f;
+;     }
+;     while (1) {
+;       if (n <= 2) {
+;         goto f;
+;       } else {
+;         if (ret + id >= n) {
+;           goto d;
+;         }
+;       }
+;       if (n & 1) {
+;         goto h;
+;       }
+;
+; d:
+;       if (n > 3) {
+;         goto e;
+;       }
+;     }
+;
+; e:
+;     if (n & 1) {
+;       goto g;
+;     }
+;   }
+;
+; f:
+;   if (n == 2) {
+;     goto h;
+;   }
+;
+; g:
+;   for (int i = 0; i < n + 1; i++) ret++;
+;   goto h;
+;
+; h:
+;   out[id] = ret;
+; }
+
+; ModuleID = 'Unknown buffer'
+source_filename = "kernel.opencl"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @partial_linearization22(i32 addrspace(1)* %out, i32 %n) #0 { ; kernel under test; node letters below refer to the CFG diagram in this test's header
+entry:                                            ; node a
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %conv = trunc i64 %call to i32 ; per-work-item id, the only value here derived from get_global_id
+  br label %while.body
+
+while.body:                                       ; preds = %e, %entry ; node b (outer loop header): condition uses %n only
+  %n.off = add i32 %n, -1
+  %0 = icmp ult i32 %n.off, 4 ; range test on %n - 1, i.e. the source's (n > 0 && n < 5)
+  %cmp6 = icmp slt i32 %n, 3
+  %or.cond1 = or i1 %cmp6, %0
+  br i1 %or.cond1, label %f, label %if.else
+
+while.body5:                                      ; preds = %d ; dispatch on %n between d and c; not drawn as its own node in the header diagram
+  switch i32 %n, label %g [
+    i32 3, label %if.else
+    i32 2, label %h
+  ]
+
+if.else:                                          ; preds = %while.body5, %while.body ; node c (inner loop header): condition uses %conv, so it can differ per work-item
+  %cmp9 = icmp sge i32 %conv, %n
+  %and = and i32 %n, 1
+  %tobool = icmp eq i32 %and, 0
+  %or.cond2 = or i1 %tobool, %cmp9
+  br i1 %or.cond2, label %d, label %h
+
+d:                                                ; preds = %if.else ; node d: condition uses %n only
+  %cmp16 = icmp sgt i32 %n, 3
+  br i1 %cmp16, label %e, label %while.body5
+
+e:                                                ; preds = %d ; node e (outer loop latch): condition uses %n only
+  %and20 = and i32 %n, 1
+  %tobool21 = icmp eq i32 %and20, 0
+  br i1 %tobool21, label %while.body, label %g
+
+f:                                                ; preds = %while.body ; node f
+  %cmp24 = icmp eq i32 %n, 2
+  br i1 %cmp24, label %h, label %g
+
+g:                                                ; preds = %f, %e, %while.body5 ; node g: entry to the counting loop below
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %g
+  %ret.0 = phi i32 [ 0, %g ], [ %inc, %for.body ]
+  %storemerge = phi i32 [ 0, %g ], [ %inc31, %for.body ]
+  %cmp29 = icmp sgt i32 %storemerge, %n ; loop runs while the counter is <= %n
+  br i1 %cmp29, label %h, label %for.body
+
+for.body:                                         ; preds = %for.cond
+  %inc = add nuw nsw i32 %ret.0, 1
+  %inc31 = add nuw nsw i32 %storemerge, 1
+  br label %for.cond
+
+h:                                                ; preds = %for.cond, %f, %if.else, %while.body5 ; node h: common exit
+  %ret.1 = phi i32 [ 0, %f ], [ %ret.0, %for.cond ], [ 0, %if.else ], [ 0, %while.body5 ] ; 0 on every path that skipped the counting loop
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %ret.1, i32 addrspace(1)* %arrayidx, align 4
+  ret void
+}
+
+; Function Attrs: convergent nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { convergent nobuiltin nounwind readonly }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.kernels = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization22, !3, !4, !5, !6, !7, !8}
+!3 = !{!"kernel_arg_addr_space", i32 1, i32 0}
+!4 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!5 = !{!"kernel_arg_type", !"int*", !"int"}
+!6 = !{!"kernel_arg_base_type", !"int*", !"int"}
+!7 = !{!"kernel_arg_type_qual", !"", !""}
+!8 = !{!"kernel_arg_name", !"out", !"n"}
+
+; CHECK: spir_kernel void @__vecz_v4_partial_linearization22
+; CHECK: br label %[[WHILEBODY:.+]]
+
+; CHECK: [[WHILEBODY]]:
+; CHECK: %[[CMP6:.+]] = icmp slt
+; CHECK: %[[ORCOND1:.+]] = or i1 %[[CMP6]]
+; CHECK: %[[F_EXIT_MASK:.+]] = select i1
+; CHECK: %[[ORCOND2:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[ORCOND1]])
+; CHECK: br i1 %[[ORCOND2]], label %[[F:.+]], label %[[IFELSEPREHEADER:.+]]
+
+; CHECK: [[IFELSEPREHEADER]]:
+; CHECK: br label %[[IFELSE:.+]]
+
+; CHECK: [[LEAFBLOCK1:.*]]:
+; CHECK: %[[SWITCHLEAF:.+]] = icmp eq i32 %n, 3
+; CHECK: br i1 %{{.+}}, label %[[IFELSE]], label %[[IFELSEPUREEXIT:.+]]
+
+; CHECK: [[IFELSEPUREEXIT]]:
+; CHECK: br label %[[E:.+]]
+
+; CHECK: [[IFELSE]]:
+; CHECK: br label %[[D:.+]]
+
+; CHECK: [[D]]:
+; CHECK: br label %[[LEAFBLOCK1]]
+
+; CHECK: [[E]]:
+; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]]
+
+; CHECK: [[WHILEBODYPUREEXIT]]:
+; CHECK: %[[CMP24MERGE:.+]] = phi i1 [ %[[G_EXIT_MASK:.+]], %[[F]] ], [ false, %[[E]] ]
+; CHECK: br label %[[HLOOPEXIT1:.+]]
+
+; CHECK: [[F]]:
+; CHECK: %[[CMP24:.+]] = icmp eq i32 %n, 2
+; CHECK: %[[G_EXIT_MASK]] = select i1 %[[CMP24]], i1 false, i1 %[[F_EXIT_MASK]]
+; CHECK: br label %[[WHILEBODYPUREEXIT]]
+
+; CHECK: [[FELSE:.+]]:
+; CHECK: br label %[[G:.+]]
+
+; CHECK: [[FSPLIT:.+]]:
+; CHECK: %[[CMP24_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[CMP24MERGE]])
+; CHECK: br i1 %[[CMP24_ANY]], label %[[H:.+]], label %[[G]]
+
+; CHECK: [[GLOOPEXIT:.+]]:
+; CHECK: br label %[[GLOOPEXITELSE:.+]]
+
+; CHECK: [[GLOOPEXITELSE]]:
+; CHECK: br i1 %{{.+}}, label %[[FELSE]], label %[[FSPLIT]]
+
+; CHECK: [[G]]:
+; CHECK: br label %[[FORCOND:.+]]
+
+; CHECK: [[FORCOND]]:
+; CHECK: br i1 true, label %[[HLOOPEXIT:.+]], label %[[FORBODY:.+]]
+
+; CHECK: [[FORBODY]]:
+; CHECK: br label %[[FORCOND]]
+
+
+
+; CHECK: [[HLOOPEXIT]]:
+; CHECK: br label %[[H:.+]]
+
+; CHECK: [[HLOOPEXIT1]]:
+; CHECK: br label %[[HLOOPEXIT1ELSE:.+]]
+
+; CHECK: [[HLOOPEXIT1ELSE]]:
+; CHECK: br label %[[GLOOPEXIT]]
+
+; CHECK: [[H]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization23.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization23.ll
new file mode 100644
index 0000000000000..1f04b8bf7143d
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization23.ll
@@ -0,0 +1,247 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k partial_linearization23 -vecz-passes=cfg-convert -S < %s | %filecheck %s
+
+; The CFG of the following kernel is:
+;
+;         a
+;        / \
+;       /   \
+;      /     \
+;     b       c
+;    / \     / \
+;   d   e   f   g
+;    \   \ /   /
+;     \   X   /
+;      \ / \ /
+;       h   i
+;        \ /
+;         j
+;
+; * where node a is a uniform branch, and nodes b and c are varying branches.
+; * where nodes d, e, f, g are divergent.
+;
+; With partial linearization we will have a CFG of the form:
+;
+;         a
+;        / \
+;       /   \
+;      /     \
+;     b       c
+;    /         \
+;   e - d   f - g
+;        \ /
+;         i
+;         |
+;         h
+;         |
+;         j
+;
+; The purpose of this test is to make sure we correctly handle blending in `i`,
+; which cannot be considered a blend block since it is not the join point of
+; either of the divergence-causing blocks.
+; We want to make sure the incoming blocks of the phi nodes in `i` are correctly
+; translated into select instructions for the predecessors which get linearized.
+;
+;
+;
+; __kernel void partial_linearization23(__global int *out, int n) {
+;   int id = get_global_id(0);
+;   int ret = 0;
+;
+;   if (n > 10) {
+;     if (id % 3 == 0) {
+;       ret = n - 1; goto h;
+;     } else {
+;       for (int i = 0; i < n / 3; i++) { ret += 2; } goto i;
+;     }
+;   } else {
+;     if (id % 2 == 0) {
+;       ret = n * 2; goto h;
+;     } else {
+;       for (int i = 0; i < n + 5; i++) { ret *= 2; } goto i;
+;     }
+;   }
+;
+; h:
+;   ret += 5;
+;   goto end;
+;
+; i:
+;   ret *= 10;
+;   goto end;
+;
+; end:
+;   out[id] = ret;
+; }
+
+; ModuleID = 'Unknown buffer'
+source_filename = "Unknown buffer"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @partial_linearization23(i32 addrspace(1)* %out, i32 %n) { ; kernel under test; node letters below refer to the CFG diagram in this test's header
+entry:                                            ; node a: condition uses %n only
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %conv = trunc i64 %call to i32
+  %cmp = icmp sgt i32 %n, 10
+  br i1 %cmp, label %if.then, label %if.else7
+
+if.then:                                          ; preds = %entry ; node b: condition uses %conv, so it can differ per work-item
+  %rem = srem i32 %conv, 3
+  %cmp2 = icmp eq i32 %rem, 0
+  br i1 %cmp2, label %if.then4, label %for.cond.preheader
+
+for.cond.preheader:                               ; preds = %if.then ; node e: start of the "ret += 2" loop nest
+  %div = sdiv i32 %n, 3
+  %cmp52 = icmp sgt i32 %n, 2
+  br i1 %cmp52, label %for.body.lr.ph, label %i24
+
+for.body.lr.ph:                                   ; preds = %for.cond.preheader
+  %min.iters.check = icmp ult i32 %div, 8 ; fewer than 8 iterations: skip the 8-wide body
+  br i1 %min.iters.check, label %scalar.ph, label %vector.ph
+
+vector.ph:                                        ; preds = %for.body.lr.ph
+  %n.vec = and i32 %div, -8 ; %div rounded down to a multiple of 8
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph ; eight independent accumulators, each += 2; %index advances by 8
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.phi6 = phi i32 [ 0, %vector.ph ], [ %0, %vector.body ]
+  %vec.phi11 = phi i32 [ 0, %vector.ph ], [ %1, %vector.body ]
+  %vec.phi17 = phi i32 [ 0, %vector.ph ], [ %2, %vector.body ]
+  %vec.phi22 = phi i32 [ 0, %vector.ph ], [ %3, %vector.body ]
+  %vec.phi104 = phi i32 [ 0, %vector.ph ], [ %4, %vector.body ]
+  %vec.phi109 = phi i32 [ 0, %vector.ph ], [ %5, %vector.body ]
+  %vec.phi1015 = phi i32 [ 0, %vector.ph ], [ %6, %vector.body ]
+  %vec.phi1020 = phi i32 [ 0, %vector.ph ], [ %7, %vector.body ]
+  %0 = add nuw nsw i32 %vec.phi6, 2
+  %1 = add nuw nsw i32 %vec.phi11, 2
+  %2 = add nuw nsw i32 %vec.phi17, 2
+  %3 = add nuw nsw i32 %vec.phi22, 2
+  %4 = add nuw nsw i32 %vec.phi104, 2
+  %5 = add nuw nsw i32 %vec.phi109, 2
+  %6 = add nuw nsw i32 %vec.phi1015, 2
+  %7 = add nuw nsw i32 %vec.phi1020, 2
+  %index.next = add i32 %index, 8
+  %8 = icmp eq i32 %index.next, %n.vec
+  br i1 %8, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body ; sums the eight accumulators into %bin.rdx1325
+  %.lcssa25 = phi i32 [ %0, %vector.body ]
+  %.lcssa210 = phi i32 [ %1, %vector.body ]
+  %.lcssa216 = phi i32 [ %2, %vector.body ]
+  %.lcssa221 = phi i32 [ %3, %vector.body ]
+  %.lcssa3 = phi i32 [ %4, %vector.body ]
+  %.lcssa8 = phi i32 [ %5, %vector.body ]
+  %.lcssa14 = phi i32 [ %6, %vector.body ]
+  %.lcssa19 = phi i32 [ %7, %vector.body ]
+  %bin.rdx7 = add nuw i32 %.lcssa3, %.lcssa25
+  %bin.rdx12 = add nuw i32 %.lcssa8, %.lcssa210
+  %bin.rdx18 = add nuw i32 %.lcssa14, %.lcssa216
+  %bin.rdx23 = add nuw i32 %.lcssa19, %.lcssa221
+  %bin.rdx1113 = add i32 %bin.rdx7, %bin.rdx12
+  %bin.rdx1124 = add i32 %bin.rdx18, %bin.rdx23
+  %bin.rdx1325 = add i32 %bin.rdx1113, %bin.rdx1124
+  %cmp.n = icmp eq i32 %div, %n.vec ; no remainder iterations left: go straight to i24
+  br i1 %cmp.n, label %i24, label %scalar.ph
+
+scalar.ph:                                        ; preds = %middle.block, %for.body.lr.ph ; entry to the remainder loop
+  %bc.resume.val = phi i32 [ %n.vec, %middle.block ], [ 0, %for.body.lr.ph ]
+  %bc.merge.rdx = phi i32 [ %bin.rdx1325, %middle.block ], [ 0, %for.body.lr.ph ]
+  %9 = add i32 %bc.resume.val, 1
+  %10 = icmp sgt i32 %div, %9
+  %smax = select i1 %10, i32 %div, i32 %9 ; signed max(%div, %bc.resume.val + 1)
+  %11 = shl i32 %smax, 1
+  %12 = shl i32 %bc.resume.val, 1
+  br label %for.body
+
+if.then4:                                         ; preds = %if.then ; node d: ret = n - 1
+  %sub = add nsw i32 %n, -1
+  br label %h
+
+for.body:                                         ; preds = %for.body, %scalar.ph ; remainder loop only counts; the sum is formed in i24.loopexit
+  %storemerge44 = phi i32 [ %bc.resume.val, %scalar.ph ], [ %inc, %for.body ]
+  %inc = add nuw nsw i32 %storemerge44, 1
+  %cmp5 = icmp slt i32 %inc, %div
+  br i1 %cmp5, label %for.body, label %i24.loopexit
+
+if.else7:                                         ; preds = %entry ; node c: condition uses %conv, so it can differ per work-item
+  %rem81 = and i32 %conv, 1
+  %cmp9 = icmp eq i32 %rem81, 0
+  br i1 %cmp9, label %if.then11, label %for.cond14.preheader
+
+for.cond14.preheader:                             ; preds = %if.else7 ; node g: start of the "ret *= 2" loop
+  %add15 = add nsw i32 %n, 5
+  %cmp165 = icmp sgt i32 %add15, 0
+  br i1 %cmp165, label %for.body18.preheader, label %i24
+
+for.body18.preheader:                             ; preds = %for.cond14.preheader
+  %13 = add i32 %n, 5
+  br label %for.body18
+
+if.then11:                                        ; preds = %if.else7 ; node f: ret = n * 2
+  %mul = shl nsw i32 %n, 1
+  br label %h
+
+for.body18:                                       ; preds = %for.body18.preheader, %for.body18
+  %storemerge7 = phi i32 [ %inc21, %for.body18 ], [ 0, %for.body18.preheader ]
+  %ret.16 = phi i32 [ %mul19, %for.body18 ], [ 0, %for.body18.preheader ]
+  %mul19 = shl nsw i32 %ret.16, 1
+  %inc21 = add nuw nsw i32 %storemerge7, 1
+  %exitcond = icmp ne i32 %inc21, %13
+  br i1 %exitcond, label %for.body18, label %i24.loopexit1
+
+h:                                                ; preds = %if.then11, %if.then4 ; node h: ret += 5
+  %storemerge3 = phi i32 [ %mul, %if.then11 ], [ %sub, %if.then4 ]
+  %add23 = add nsw i32 %storemerge3, 5
+  br label %end
+
+i24.loopexit:                                     ; preds = %for.body
+  %14 = add i32 %bc.merge.rdx, %11
+  %15 = sub i32 %14, %12 ; %bc.merge.rdx + 2 * (%smax - %bc.resume.val)
+  br label %i24
+
+i24.loopexit1:                                    ; preds = %for.body18
+  %mul19.lcssa = phi i32 [ %mul19, %for.body18 ]
+  br label %i24
+
+i24:                                              ; preds = %i24.loopexit1, %i24.loopexit, %for.cond14.preheader, %middle.block, %for.cond.preheader ; node i: five-way merge whose entry mask this test inspects
+  %ret.2 = phi i32 [ 0, %for.cond.preheader ], [ %bin.rdx1325, %middle.block ], [ 0, %for.cond14.preheader ], [ %15, %i24.loopexit ], [ %mul19.lcssa, %i24.loopexit1 ]
+  %mul25 = mul nsw i32 %ret.2, 10
+  br label %end
+
+end:                                              ; preds = %i24, %h ; node j: store the result
+  %storemerge2 = phi i32 [ %mul25, %i24 ], [ %add23, %h ]
+  %sext = shl i64 %call, 32
+  %idxprom = ashr exact i64 %sext, 32 ; shl/ashr pair sign-extends the low 32 bits of %call
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %storemerge2, i32 addrspace(1)* %arrayidx, align 4
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CHECK: spir_kernel void @__vecz_v4_partial_linearization23
+; CHECK: i24:
+; CHECK: %i24.entry_mask{{.+}} = or i1
+; CHECK: %i24.entry_mask{{.+}} = or i1
+; CHECK: %i24.entry_mask{{.+}} = or i1
+; CHECK: %i24.entry_mask{{.+}} = or i1
+
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization3.ll
new file mode 100644
index 0000000000000..57c57890c1d10
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization3.ll
@@ -0,0 +1,269 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k partial_linearization3 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -S < %s | %filecheck %s
+
+; The CFG of the following kernel is:
+;
+;         a
+;        / \
+;       /   \
+;      /     \
+;     b       c
+;    / \     / \
+;   d   e   f   g
+;    \   \ /   /
+;     \   h   /
+;      \   \ /
+;       \   i
+;        \ /
+;         j
+;
+; * where node a is a uniform branch, and nodes b and c are varying branches.
+; * where nodes d, e, f, g, i and j are divergent.
+;
+; With partial linearization we will have a CFG of the form:
+;
+;     a
+;    / \
+;   b   c
+;   |   |
+;   e   g
+;   |   |
+;   d   f
+;    \ /
+;     h
+;     |
+;     i
+;     |
+;     j
+;
+; __kernel void partial_linearization3(__global int *out, int n) {
+;   int id = get_global_id(0);
+;   int ret = 0;
+;
+;   if (n > 10) { // uniform
+;     if (id % 3 == 0) { // varying
+;       for (int i = 0; i < n - 1; i++) { ret /= 2; } goto end;
+;     } else { // varying
+;       for (int i = 0; i < n / 3; i++) { ret -= 2; } goto h;
+;     }
+;   } else { // uniform
+;     if (id % 2 == 0) { // varying
+;       for (int i = 0; i < n * 2; i++) { ret += 1; } goto h;
+;     } else { // varying
+;       for (int i = 0; i < n + 5; i++) { ret *= 2; } goto i;
+;     }
+;   }
+;
+; h:
+;   ret += 5;
+;
+; i:
+;   ret *= 10;
+;
+; end:
+;   out[id] = ret;
+; }
+
+; ModuleID = 'Unknown buffer'
+source_filename = "Unknown buffer"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @partial_linearization3(i32 addrspace(1)* %out, i32 %n) #0 { ; kernel under test; node letters below refer to the CFG diagram in this test's header
+entry:                                            ; node a: condition uses %n only
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %conv = trunc i64 %call to i32
+  %cmp = icmp sgt i32 %n, 10
+  br i1 %cmp, label %if.then, label %if.else17
+
+if.then:                                          ; preds = %entry ; node b: condition uses %conv, so it can differ per work-item
+  %rem = srem i32 %conv, 3
+  %cmp2 = icmp eq i32 %rem, 0
+  br i1 %cmp2, label %if.then4, label %if.else
+
+if.then4:                                         ; preds = %if.then ; node d: entry to the "ret /= 2" loop
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %if.then4 ; exits directly to end, bypassing h and i42
+  %ret.0 = phi i32 [ 0, %if.then4 ], [ %div, %for.body ]
+  %storemerge4 = phi i32 [ 0, %if.then4 ], [ %inc, %for.body ]
+  %sub = add nsw i32 %n, -1
+  %cmp5 = icmp slt i32 %storemerge4, %sub
+  br i1 %cmp5, label %for.body, label %end
+
+for.body:                                         ; preds = %for.cond
+  %div = sdiv i32 %ret.0, 2
+  %inc = add nsw i32 %storemerge4, 1
+  br label %for.cond
+
+if.else:                                          ; preds = %if.then ; node e: entry to the "ret -= 2" loop
+  br label %for.cond8
+
+for.cond8:                                        ; preds = %for.body12, %if.else ; exits to h
+  %ret.1 = phi i32 [ 0, %if.else ], [ %sub13, %for.body12 ]
+  %storemerge3 = phi i32 [ 0, %if.else ], [ %inc15, %for.body12 ]
+  %div9 = sdiv i32 %n, 3
+  %cmp10 = icmp slt i32 %storemerge3, %div9
+  br i1 %cmp10, label %for.body12, label %h
+
+for.body12:                                       ; preds = %for.cond8
+  %sub13 = add nsw i32 %ret.1, -2
+  %inc15 = add nsw i32 %storemerge3, 1
+  br label %for.cond8
+
+if.else17:                                        ; preds = %entry ; node c: condition uses %conv, so it can differ per work-item
+  %rem181 = and i32 %conv, 1
+  %cmp19 = icmp eq i32 %rem181, 0
+  br i1 %cmp19, label %if.then21, label %if.else30
+
+if.then21:                                        ; preds = %if.else17 ; node f: entry to the "ret += 1" loop
+  br label %for.cond23
+
+for.cond23:                                       ; preds = %for.body26, %if.then21 ; exits to h
+  %ret.2 = phi i32 [ 0, %if.then21 ], [ %add, %for.body26 ]
+  %storemerge2 = phi i32 [ 0, %if.then21 ], [ %inc28, %for.body26 ]
+  %mul = shl nsw i32 %n, 1
+  %cmp24 = icmp slt i32 %storemerge2, %mul
+  br i1 %cmp24, label %for.body26, label %h
+
+for.body26:                                       ; preds = %for.cond23
+  %add = add nsw i32 %ret.2, 1
+  %inc28 = add nsw i32 %storemerge2, 1
+  br label %for.cond23
+
+if.else30:                                        ; preds = %if.else17 ; node g: entry to the "ret *= 2" loop
+  br label %for.cond32
+
+for.cond32:                                       ; preds = %for.body36, %if.else30 ; exits to i42, bypassing h
+  %ret.3 = phi i32 [ 0, %if.else30 ], [ %mul37, %for.body36 ]
+  %storemerge = phi i32 [ 0, %if.else30 ], [ %inc39, %for.body36 ]
+  %add33 = add nsw i32 %n, 5
+  %cmp34 = icmp slt i32 %storemerge, %add33
+  br i1 %cmp34, label %for.body36, label %i42
+
+for.body36:                                       ; preds = %for.cond32
+  %mul37 = shl nsw i32 %ret.3, 1
+  %inc39 = add nsw i32 %storemerge, 1
+  br label %for.cond32
+
+h:                                                ; preds = %for.cond23, %for.cond8 ; node h: ret += 5
+  %ret.4 = phi i32 [ %ret.1, %for.cond8 ], [ %ret.2, %for.cond23 ]
+  %add41 = add nsw i32 %ret.4, 5
+  br label %i42
+
+i42:                                              ; preds = %h, %for.cond32 ; node i: ret *= 10
+  %ret.5 = phi i32 [ %add41, %h ], [ %ret.3, %for.cond32 ]
+  %mul43 = mul nsw i32 %ret.5, 10
+  br label %end
+
+end:                                              ; preds = %i42, %for.cond ; node j: store the result
+  %ret.6 = phi i32 [ %mul43, %i42 ], [ %ret.0, %for.cond ]
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %ret.6, i32 addrspace(1)* %arrayidx, align 4
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nobuiltin nounwind readonly }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.kernels = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization3, !3, !4, !5, !6, !7, !8}
+!3 = !{!"kernel_arg_addr_space", i32 1, i32 0}
+!4 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!5 = !{!"kernel_arg_type", !"int*", !"int"}
+!6 = !{!"kernel_arg_base_type", !"int*", !"int"}
+!7 = !{!"kernel_arg_type_qual", !"", !""}
+!8 = !{!"kernel_arg_name", !"out", !"n"}
+
+; CHECK: spir_kernel void @__vecz_v4_partial_linearization3
+; CHECK: %[[CMP:.+]] = icmp
+; CHECK: br i1 %[[CMP]], label %[[IFTHEN:.+]], label %[[IFELSE17:.+]]
+
+; CHECK: [[IFTHEN]]:
+; CHECK: br label %[[FORCOND8PREHEADER:.+]]
+
+; CHECK: [[FORCOND8PREHEADER]]:
+; CHECK: br label %[[FORCOND8:.+]]
+
+; CHECK: [[FORCONDPREHEADER:.+]]:
+; CHECK: br label %[[FORCOND:.+]]
+
+; CHECK: [[FORCOND]]:
+; CHECK: %[[CMP5:.+]] = icmp
+; CHECK: br i1 %[[CMP5]], label %[[FORBODY:.+]], label %[[ENDLOOPEXIT:.+]]
+
+; CHECK: [[FORBODY]]:
+; CHECK: br label %[[FORCOND]]
+
+; CHECK: [[FORCOND8]]:
+; CHECK: %[[CMP10:.+]] = icmp
+; CHECK: br i1 %[[CMP10]], label %[[FORBODY12:.+]], label %[[HLOOPEXIT:.+]]
+
+; CHECK: [[FORBODY12]]:
+; CHECK: br label %[[FORCOND8]]
+
+; CHECK: [[IFELSE17]]:
+; CHECK: br label %[[FORCOND32PREHEADER:.+]]
+
+; CHECK: [[FORCOND32PREHEADER]]:
+; CHECK: br label %[[FORCOND32:.+]]
+
+; CHECK: [[FORCOND23PREHEADER:.+]]:
+; CHECK: br label %[[FORCOND23:.+]]
+
+; CHECK: [[FORCOND23]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY26:.+]], label %[[HLOOPEXIT2:.+]]
+
+; CHECK: [[FORBODY26]]:
+; CHECK: br label %[[FORCOND23]]
+
+; CHECK: [[FORCOND32]]:
+; CHECK: br i1 false, label %[[FORBODY36:.+]], label %[[ENDLOOPEXIT2:.+]]
+
+; CHECK: [[FORBODY36]]:
+; CHECK: br label %[[FORCOND32]]
+
+; CHECK: [[HLOOPEXIT]]:
+; CHECK: br label %[[FORCONDPREHEADER]]
+
+; CHECK: [[HLOOPEXIT2]]:
+; CHECK: br label %[[H:.+]]
+
+; CHECK: [[H]]:
+; CHECK: br label %[[END:.+]]
+
+; CHECK: [[ENDLOOPEXIT]]:
+; CHECK: br label %[[H]]
+
+; CHECK: [[ENDLOOPEXIT2]]:
+; CHECK: br label %[[FORCOND23PREHEADER]]
+
+; CHECK: [[END]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization4.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization4.ll
new file mode 100644
index 0000000000000..1c557d6ae1ca2
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization4.ll
@@ -0,0 +1,195 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k partial_linearization4 -vecz-passes=cfg-convert,cleanup-divergence -S < %s | %filecheck %s
+
+; The CFG of the following kernel is:
+;
+;     a
+;     |
+;     b <-.
+;    / \  |
+;   e   c |
+;   |  / \|
+;   | f   d
+;   |/
+;   g
+;
+; * where node b is a uniform branch, and node c is a varying branch.
+; * where nodes f, d and g are divergent.
+;
+; With partial linearization we will have a CFG of the form:
+;
+;     a
+;     |
+;     b <--.
+;    / \   |
+;   e   c  |
+;   |   |  |
+;   |   d -'
+;    \ /
+;     f
+;     |
+;     g
+;
+; __kernel void partial_linearization4(__global int *out, int n) {
+;   int id = get_global_id(0);
+;
+;   int x = id / n;
+;   int y = id % n;
+;   int i = 0;
+;   for (;;) {
+;     if (n > 20) goto e;
+;     if (x + y > n) goto f;
+;     y++;
+;     x++;
+;     i++;
+;   }
+;
+; goto g;
+;
+; e:
+;   i *= 2 + n;
+;   goto g;
+;
+; f:
+;   i /= i + n;
+;
+; g:
+;   out[id] = x + y + i;
+; }
+
+; ModuleID = 'Unknown buffer'
+source_filename = "Unknown buffer"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @partial_linearization4(i32 addrspace(1)* %out, i32 %n) #0 { ; kernel under test; node letters below refer to the CFG diagram in this test's header
+entry:                                            ; node a: computes x = id / n and y = id % n with a guarded divisor
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %conv = trunc i64 %call to i32
+  %0 = icmp eq i32 %conv, -2147483648
+  %1 = icmp eq i32 %n, -1
+  %2 = and i1 %1, %0
+  %3 = icmp eq i32 %n, 0
+  %4 = or i1 %3, %2
+  %5 = select i1 %4, i32 1, i32 %n ; divisor becomes 1 when %n == 0, or when %n == -1 and %conv == -2147483648
+  %div = sdiv i32 %conv, %5
+  %6 = icmp eq i32 %conv, -2147483648
+  %7 = icmp eq i32 %n, -1
+  %8 = and i1 %7, %6
+  %9 = icmp eq i32 %n, 0
+  %10 = or i1 %9, %8
+  %11 = select i1 %10, i32 1, i32 %n ; same guard repeated for the remainder
+  %rem = srem i32 %conv, %11
+  br label %for.cond
+
+for.cond:                                         ; preds = %if.end5, %entry ; node b (loop header): exit test uses %n only
+  %x.0 = phi i32 [ %div, %entry ], [ %inc6, %if.end5 ]
+  %y.0 = phi i32 [ %rem, %entry ], [ %inc, %if.end5 ]
+  %storemerge = phi i32 [ 0, %entry ], [ %inc7, %if.end5 ]
+  %cmp = icmp sgt i32 %n, 20
+  br i1 %cmp, label %e, label %if.end
+
+if.end:                                           ; preds = %for.cond ; node c: exit test uses %x.0 + %y.0, both derived from %conv
+  %add = add nsw i32 %y.0, %x.0
+  %cmp2 = icmp sgt i32 %add, %n
+  br i1 %cmp2, label %f, label %if.end5
+
+if.end5:                                          ; preds = %if.end ; node d (loop latch): increments y, x and i
+  %inc = add nsw i32 %y.0, 1
+  %inc6 = add nsw i32 %x.0, 1
+  %inc7 = add nsw i32 %storemerge, 1
+  br label %for.cond
+
+e:                                                ; preds = %for.cond ; node e: i *= 2 + n
+  %add8 = add nsw i32 %n, 2
+  %mul = mul nsw i32 %storemerge, %add8
+  br label %g
+
+f:                                                ; preds = %if.end ; node f: i /= i + n
+  %add9 = add nsw i32 %storemerge, %n
+  %12 = icmp eq i32 %add9, 0
+  %13 = select i1 %12, i32 1, i32 %add9 ; divisor becomes 1 when %add9 == 0
+  %div10 = sdiv i32 %storemerge, %13
+  br label %g
+
+g:                                                ; preds = %f, %e ; node g: store x + y + i
+  %storemerge1 = phi i32 [ %div10, %f ], [ %mul, %e ]
+  %add11 = add i32 %y.0, %x.0
+  %add12 = add i32 %add11, %storemerge1
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %add12, i32 addrspace(1)* %arrayidx, align 4
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nobuiltin nounwind readonly }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.kernels = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization4, !3, !4, !5, !6, !7, !8}
+!3 = !{!"kernel_arg_addr_space", i32 1, i32 0}
+!4 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!5 = !{!"kernel_arg_type", !"int*", !"int"}
+!6 = !{!"kernel_arg_base_type", !"int*", !"int"}
+!7 = !{!"kernel_arg_type_qual", !"", !""}
+!8 = !{!"kernel_arg_name", !"out", !"n"}
+
+; CHECK: spir_kernel void @__vecz_v4_partial_linearization4
+; CHECK: br label %[[FORCOND:.+]]
+
+; CHECK: [[FORCOND]]:
+; CHECK: %[[CMP:.+]] = icmp
+; CHECK: br i1 %[[CMP]], label %[[E:.+]], label %[[IFEND:.+]]
+
+; CHECK: [[IFEND]]:
+; CHECK: br label %[[IFEND5:.+]]
+
+; CHECK: [[IFEND5]]:
+; CHECK: br i1 %{{.+}}, label %[[FORCOND]], label %[[FORCONDPUREEXIT:.+]]
+
+; CHECK: [[FORCONDPUREEXIT]]:
+; CHECK: br label %[[F:.+]]
+
+; CHECK: [[E]]:
+; CHECK: br label %[[FORCONDPUREEXIT]]
+
+; CHECK: [[EELSE:.+]]:
+; CHECK: br label %[[G:.+]]
+
+; CHECK: [[ESPLIT:.+]]:
+; CHECK: br label %[[G]]
+
+; CHECK: [[F]]:
+; CHECK: br label %[[FELSE:.+]]
+
+; CHECK: [[FELSE]]:
+; CHECK: br i1 %{{.+}}, label %[[EELSE]], label %[[ESPLIT]]
+
+; CHECK: [[G]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization5.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization5.ll
new file mode 100644
index 0000000000000..c469d64eec092
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization5.ll
@@ -0,0 +1,221 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k partial_linearization5 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | %filecheck %s
+
+; The CFG of the following kernel is:
+;
+;     a
+;    / \
+;   b   c
+;   |\ / \
+;   | d   e
+;   |  \ /
+;   |   f
+;    \ /
+;     g
+;
+; * where node c is a uniform branch, and nodes a and b are varying branches.
+; * where nodes b, c, d, f, g are divergent.
+;
+; With partial linearization we will have a CFG of the form:
+;
+;     a
+;     |
+;     c
+;    / \
+;   |   e
+;    \ /
+;     b
+;     |
+;     d
+;     |
+;     f
+;     |
+;     g
+;
+; __kernel void partial_linearization5(__global int *out, int n) {
+;   int id = get_global_id(0);
+;   int ret = 0;
+;
+;   if (id % 2 == 0) { // a
+;     if (id == 4) { // b
+;       goto g;
+;     } else {
+;       goto d;
+;     }
+;   } else { // c
+;     if (n % 2 == 0) {
+;       goto d;
+;     } else {
+;       goto e;
+;     }
+;   }
+;
+; d:
+;   for (int i = 0; i < n / 4; i++) { ret += i - 2; }
+;   goto f;
+;
+; e:
+;   for (int i = 0; i < n + 5; i++) { ret += i + 5; }
+;
+; f:
+;   ret *= ret % n;
+;   ret *= ret + 4;
+;
+; g:
+;   out[id] = ret;
+; }
+
+; ModuleID = 'Unknown buffer'
+source_filename = "Unknown buffer"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @partial_linearization5(i32 addrspace(1)* %out, i32 %n) #0 { ; kernel under test: stores one i32 to %out at the global id
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %conv = trunc i64 %call to i32 ; id, narrowed to 32 bits
+  %rem1 = and i32 %conv, 1
+  %cmp = icmp eq i32 %rem1, 0 ; id % 2 == 0
+  br i1 %cmp, label %if.then, label %if.else5 ; node a: varying, the condition derives from the global id
+
+if.then:                                          ; preds = %entry
+  %cmp2 = icmp eq i32 %conv, 4
+  br i1 %cmp2, label %g, label %d ; node b: varying, compares the global id
+
+if.else5:                                         ; preds = %entry
+  %rem62 = and i32 %n, 1
+  %cmp7 = icmp eq i32 %rem62, 0
+  br i1 %cmp7, label %d, label %e ; node c: uniform, the condition only reads the kernel argument %n
+
+d:                                                ; preds = %if.else5, %if.then
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %d
+  %ret.0 = phi i32 [ 0, %d ], [ %add, %for.body ]
+  %storemerge3 = phi i32 [ 0, %d ], [ %inc, %for.body ]
+  %div = sdiv i32 %n, 4
+  %cmp11 = icmp slt i32 %storemerge3, %div ; i < n / 4, trip count depends only on %n
+  br i1 %cmp11, label %for.body, label %f
+
+for.body:                                         ; preds = %for.cond
+  %sub = add i32 %ret.0, -2
+  %add = add i32 %sub, %storemerge3 ; ret += i - 2
+  %inc = add nsw i32 %storemerge3, 1
+  br label %for.cond
+
+e:                                                ; preds = %if.else5
+  br label %for.cond14
+
+for.cond14:                                       ; preds = %for.body18, %e
+  %ret.1 = phi i32 [ 0, %e ], [ %add20, %for.body18 ]
+  %storemerge = phi i32 [ 0, %e ], [ %inc22, %for.body18 ]
+  %add15 = add nsw i32 %n, 5
+  %cmp16 = icmp slt i32 %storemerge, %add15 ; i < n + 5, trip count depends only on %n
+  br i1 %cmp16, label %for.body18, label %f
+
+for.body18:                                       ; preds = %for.cond14
+  %add19 = add i32 %ret.1, 5
+  %add20 = add i32 %add19, %storemerge ; ret += i + 5
+  %inc22 = add nsw i32 %storemerge, 1
+  br label %for.cond14
+
+f:                                                ; preds = %for.cond14, %for.cond
+  %ret.2 = phi i32 [ %ret.0, %for.cond ], [ %ret.1, %for.cond14 ]
+  %0 = icmp eq i32 %ret.2, -2147483648 ; %0..%5 choose a safe divisor for the srem below
+  %1 = icmp eq i32 %n, -1
+  %2 = and i1 %1, %0
+  %3 = icmp eq i32 %n, 0
+  %4 = or i1 %3, %2
+  %5 = select i1 %4, i32 1, i32 %n ; 1 when %n is 0, or when %n is -1 and the dividend is INT_MIN
+  %rem24 = srem i32 %ret.2, %5
+  %mul = mul nsw i32 %rem24, %ret.2 ; ret *= ret % n
+  %add25 = add nsw i32 %mul, 4
+  %mul26 = mul nsw i32 %add25, %mul ; ret *= ret + 4
+  br label %g
+
+g:                                                ; preds = %f, %if.then
+  %ret.3 = phi i32 [ %mul26, %f ], [ 0, %if.then ] ; 0 on the direct edge from node b
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %ret.3, i32 addrspace(1)* %arrayidx, align 4 ; out[id] = ret
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nobuiltin nounwind readonly }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.kernels = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization5, !3, !4, !5, !6, !7, !8}
+!3 = !{!"kernel_arg_addr_space", i32 1, i32 0}
+!4 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!5 = !{!"kernel_arg_type", !"int*", !"int"}
+!6 = !{!"kernel_arg_base_type", !"int*", !"int"}
+!7 = !{!"kernel_arg_type_qual", !"", !""}
+!8 = !{!"kernel_arg_name", !"out", !"n"}
+
+; CHECK: spir_kernel void @__vecz_v4_partial_linearization5
+; CHECK: br label %[[IFELSE5:.+]]
+
+; CHECK: [[IFTHEN:.+]]:
+; CHECK: br label %[[D:.+]]
+
+; CHECK: [[IFELSE5]]:
+; CHECK: %[[CMP7:.+]] = icmp
+; CHECK: br i1 %[[CMP7]], label %[[IFTHEN]], label %[[FORCOND14PREHEADER:.+]]
+
+; CHECK: [[FORCOND14PREHEADER]]:
+; CHECK: br label %[[FORCOND14:.+]]
+
+; CHECK: [[D]]:
+; CHECK: br label %[[FORCOND:.+]]
+
+; CHECK: [[FORCOND]]:
+; CHECK: %[[CMP11:.+]] = icmp
+; CHECK: br i1 %[[CMP11]], label %[[FORBODY:.+]], label %[[FLOOPEXIT:.+]]
+
+; CHECK: [[FORBODY]]:
+; CHECK: br label %[[FORCOND]]
+
+; CHECK: [[FORCOND14]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY18:.+]], label %[[FLOOPEXIT2:.+]]
+
+; CHECK: [[FORBODY18]]:
+; CHECK: br label %[[FORCOND14]]
+
+; CHECK: [[FLOOPEXIT]]:
+; CHECK: br label %[[F:.+]]
+
+; CHECK: [[FLOOPEXIT2]]:
+; CHECK: br label %[[IFTHEN]]
+
+; CHECK: [[F]]:
+; CHECK: br label %[[G:.+]]
+
+; CHECK: [[G]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization6.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization6.ll
new file mode 100644
index 0000000000000..a81e663bb0575
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization6.ll
@@ -0,0 +1,200 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k partial_linearization6 -vecz-passes=cfg-convert,cleanup-divergence -S < %s | %filecheck %s
+
+; The CFG of the following kernel is:
+;
+;       a
+;       |
+;       b <-.
+;      / \  |
+;     c   d |
+;    / \ /  |
+;   e   f --'
+;    \  |
+;     \ g
+;      \|
+;       h
+;
+; * where nodes b and c are uniform branches, and node f is a varying
+;   branch.
+; * where nodes g and h are divergent.
+;
+; With partial linearization, it can be transformed in the following way:
+;
+;       a
+;       |
+;       b <-.
+;      / \  |
+;     c   d |
+;    / \ /  |
+;   e   f --'
+;    \  |
+;     \ |
+;      \|
+;       g
+;       |
+;       h
+;
+; __kernel void partial_linearization6(__global int *out, int n) {
+;   int id = get_global_id(0);
+;   int ret = 0;
+;
+;   while (1) {
+;     if (n % 2 == 0) {
+;       if (n > 2) {
+;         goto e;
+;       }
+;     } else {
+;       ret += n + 1;
+;     }
+;     if (id == n) break;
+;   }
+;
+;   ret += n * 2;
+;   ret /= n;
+;   goto early;
+;
+; e:
+;   ret += n * 4;
+;   ret -= n;
+;
+; early:
+;   out[id] = ret;
+; }
+
+; ModuleID = 'Unknown buffer'
+source_filename = "Unknown buffer"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @partial_linearization6(i32 addrspace(1)* %out, i32 %n) #0 { ; kernel under test: stores one i32 to %out at the global id
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %conv = trunc i64 %call to i32 ; id, narrowed to 32 bits
+  br label %while.body
+
+while.body:                                       ; preds = %if.end10, %entry
+  %ret.0 = phi i32 [ 0, %entry ], [ %ret.1, %if.end10 ]
+  %rem1 = and i32 %n, 1
+  %cmp = icmp eq i32 %rem1, 0 ; n % 2 == 0
+  br i1 %cmp, label %if.then, label %if.else ; node b: uniform, the condition only reads %n
+
+if.then:                                          ; preds = %while.body
+  %cmp2 = icmp sgt i32 %n, 2
+  br i1 %cmp2, label %e, label %if.end6 ; node c: uniform, taking %e leaves the loop
+
+if.else:                                          ; preds = %while.body
+  %add = add nsw i32 %n, 1
+  %add5 = add nsw i32 %add, %ret.0 ; ret += n + 1
+  br label %if.end6
+
+if.end6:                                          ; preds = %if.else, %if.then
+  %ret.1 = phi i32 [ %add5, %if.else ], [ %ret.0, %if.then ]
+  %cmp7 = icmp eq i32 %conv, %n
+  br i1 %cmp7, label %while.end, label %if.end10 ; node f: varying loop exit, compares the global id with %n
+
+if.end10:                                         ; preds = %if.end6
+  br label %while.body ; loop back edge
+
+while.end:                                        ; preds = %if.end6
+  %mul = shl nsw i32 %n, 1
+  %add11 = add nsw i32 %ret.1, %mul ; ret += n * 2
+  %0 = icmp eq i32 %add11, -2147483648 ; %0..%5 choose a safe divisor for the sdiv below
+  %1 = icmp eq i32 %n, -1
+  %2 = and i1 %1, %0
+  %3 = icmp eq i32 %n, 0
+  %4 = or i1 %3, %2
+  %5 = select i1 %4, i32 1, i32 %n ; 1 when %n is 0, or when %n is -1 and the dividend is INT_MIN
+  %div = sdiv i32 %add11, %5
+  br label %early
+
+e:                                                ; preds = %if.then
+  %mul12 = mul i32 %n, 4
+  %n.neg = sub i32 0, %n
+  %add13 = add i32 %mul12, %n.neg
+  %sub = add i32 %add13, %ret.0 ; ret + n * 4 - n
+  br label %early
+
+early:                                            ; preds = %e, %while.end
+  %storemerge = phi i32 [ %div, %while.end ], [ %sub, %e ]
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %storemerge, i32 addrspace(1)* %arrayidx, align 4 ; out[id] = ret
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nobuiltin nounwind readonly }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.kernels = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization6, !3, !4, !5, !6, !7, !8}
+!3 = !{!"kernel_arg_addr_space", i32 1, i32 0}
+!4 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!5 = !{!"kernel_arg_type", !"int*", !"int"}
+!6 = !{!"kernel_arg_base_type", !"int*", !"int"}
+!7 = !{!"kernel_arg_type_qual", !"", !""}
+!8 = !{!"kernel_arg_name", !"out", !"n"}
+
+; CHECK: spir_kernel void @__vecz_v4_partial_linearization6
+; CHECK: br label %[[WHILEBODY:.+]]
+
+; CHECK: [[WHILEBODY]]:
+; CHECK: %[[CMP:.+]] = icmp
+; CHECK: br i1 %[[CMP]], label %[[IFTHEN:.+]], label %[[IFELSE:.+]]
+
+; CHECK: [[IFTHEN]]:
+; CHECK: %[[CMP2:.+]] = icmp
+; CHECK: br i1 %[[CMP2]], label %[[E:.+]], label %[[IFEND6:.+]]
+
+; CHECK: [[IFELSE]]:
+; CHECK: br label %[[IFEND6]]
+
+; CHECK: [[IFEND6]]:
+; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]]
+
+; CHECK: [[WHILEBODYPUREEXIT]]:
+; CHECK: br label %[[WHILEEND:.+]]
+
+; CHECK: [[WHILEEND]]:
+; CHECK: br label %[[WHILEENDELSE:.+]]
+
+; CHECK: [[WHILEENDELSE]]:
+; CHECK: br i1 %{{.+}}, label %[[EELSE:.+]], label %[[ESPLIT:.+]]
+
+; CHECK: [[E]]:
+; CHECK: br label %[[WHILEBODYPUREEXIT]]
+
+; CHECK: [[EELSE]]:
+; CHECK: br label %[[EARLY:.+]]
+
+; CHECK: [[ESPLIT]]:
+; CHECK: br label %[[EARLY]]
+
+; CHECK: [[EARLY]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization7.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization7.ll
new file mode 100644
index 0000000000000..9ca98830fd917
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization7.ll
@@ -0,0 +1,228 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k partial_linearization7 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | %filecheck %s
+
+; The CFG of the following kernel is:
+;
+;       a
+;      / \
+;     b   c
+;    / \ / \
+;   d   e   f
+;    \ / \ /
+;     g   h
+;      \ /
+;       i
+;
+; * where nodes a, c and e are uniform branches, and node b is a varying
+;   branch.
+; * where nodes d, e, g and i are divergent.
+;
+; With partial linearization, it can be transformed in the following way:
+;
+;     a
+;    / \
+;   b   c
+;   |  /|
+;   d / |
+;   |/  |
+;   e   f
+;   |\  |
+;   | \ |
+;   |  \|
+;   g - h
+;   |
+;   i
+;
+; __kernel void partial_linearization7(__global int *out, int n) {
+;   int id = get_global_id(0);
+;   int i = 0;
+;
+;   if (n > 10) { // a
+;     if (n + id > 10) { // b
+;       i = n * 10; // d
+;       goto g;
+;     } else {
+;       goto e;
+;     }
+;   } else {
+;     if (n < 5) { // c
+;       goto e;
+;     } else {
+;       for (int j = 0; j < n; j++) { i++; }
+;       goto h;
+;     }
+;   }
+;
+; e:
+;   if (n > 5) {
+;     goto g;
+;   } else {
+;     i = n * 3 / 5;
+;     goto h;
+;   }
+;
+; g:
+;   for (int j = 0; j < n; j++) { i++; }
+;   goto i;
+;
+; h:
+;   i = n + id / 3;
+;
+; i:
+;   out[id] = i;
+; }
+
+; ModuleID = 'Unknown buffer'
+source_filename = "Unknown buffer"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @partial_linearization7(i32 addrspace(1)* %out, i32 %n) #0 { ; kernel under test: stores one i32 to %out at the global id
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %conv = trunc i64 %call to i32 ; id, narrowed to 32 bits
+  %cmp = icmp sgt i32 %n, 10
+  br i1 %cmp, label %if.then, label %if.else5 ; node a: uniform, the condition only reads %n
+
+if.then:                                          ; preds = %entry
+  %add = add nsw i32 %conv, %n
+  %cmp2 = icmp sgt i32 %add, 10 ; n + id > 10
+  br i1 %cmp2, label %if.then4, label %e ; node b: varying, the condition involves the global id
+
+if.then4:                                         ; preds = %if.then
+  %mul = mul nsw i32 %n, 10 ; node d: i = n * 10
+  br label %g
+
+if.else5:                                         ; preds = %entry
+  %cmp6 = icmp slt i32 %n, 5
+  br i1 %cmp6, label %e, label %if.else9 ; node c: uniform, the condition only reads %n
+
+if.else9:                                         ; preds = %if.else5
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %if.else9
+  %storemerge = phi i32 [ 0, %if.else9 ], [ %inc12, %for.body ]
+  %cmp10 = icmp slt i32 %storemerge, %n ; j < n, trip count depends only on %n
+  br i1 %cmp10, label %for.body, label %h
+
+for.body:                                         ; preds = %for.cond
+  %inc12 = add nsw i32 %storemerge, 1
+  br label %for.cond
+
+e:                                                ; preds = %if.else5, %if.then
+  %cmp13 = icmp sgt i32 %n, 5
+  br i1 %cmp13, label %g, label %h ; node e: uniform, the condition only reads %n
+
+g:                                                ; preds = %e, %if.then4
+  %i.1 = phi i32 [ %mul, %if.then4 ], [ 0, %e ]
+  br label %for.cond19
+
+for.cond19:                                       ; preds = %for.body22, %g
+  %i.2 = phi i32 [ %i.1, %g ], [ %inc23, %for.body22 ]
+  %storemerge1 = phi i32 [ 0, %g ], [ %inc25, %for.body22 ]
+  %cmp20 = icmp slt i32 %storemerge1, %n ; j < n, trip count depends only on %n
+  br i1 %cmp20, label %for.body22, label %i29
+
+for.body22:                                       ; preds = %for.cond19
+  %inc23 = add nsw i32 %i.2, 1 ; i++
+  %inc25 = add nsw i32 %storemerge1, 1
+  br label %for.cond19
+
+h:                                                ; preds = %e, %for.cond
+  %div27 = sdiv i32 %conv, 3
+  %add28 = add nsw i32 %div27, %n ; i = n + id / 3
+  br label %i29
+
+i29:                                              ; preds = %h, %for.cond19
+  %i.3 = phi i32 [ %add28, %h ], [ %i.2, %for.cond19 ]
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %i.3, i32 addrspace(1)* %arrayidx, align 4 ; out[id] = i
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nobuiltin nounwind readonly }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.kernels = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization7, !3, !4, !5, !6, !7, !8}
+!3 = !{!"kernel_arg_addr_space", i32 1, i32 0}
+!4 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!5 = !{!"kernel_arg_type", !"int*", !"int"}
+!6 = !{!"kernel_arg_base_type", !"int*", !"int"}
+!7 = !{!"kernel_arg_type_qual", !"", !""}
+!8 = !{!"kernel_arg_name", !"out", !"n"}
+
+; CHECK: spir_kernel void @__vecz_v4_partial_linearization7
+; CHECK: %[[CMP:.+]] = icmp
+; CHECK: br i1 %[[CMP]], label %[[IFTHEN:.+]], label %[[IFELSE5:.+]]
+
+; CHECK: [[IFTHEN]]:
+; CHECK: br label %[[IFTHEN4:.+]]
+
+; CHECK: [[IFTHEN4]]:
+; CHECK: br label %[[E:.+]]
+
+; CHECK: [[IFELSE5]]:
+; CHECK: %[[CMP6:.+]] = icmp
+; CHECK: br i1 %[[CMP6]], label %[[E]], label %[[FORCONDPREHEADER:.+]]
+
+; CHECK: [[FORCONDPREHEADER]]:
+; CHECK: br label %[[FORCOND:.+]]
+
+; CHECK: [[FORCOND]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[HLOOPEXIT:.+]]
+
+; CHECK: [[FORBODY]]:
+; CHECK: br label %[[FORCOND]]
+
+; CHECK: [[E]]:
+; CHECK: %[[CMP13:.+]] = icmp
+; CHECK: br i1 %[[CMP13]], label %[[G:.+]], label %[[H:.+]]
+
+; CHECK: [[G]]:
+; CHECK: br label %[[FORCOND19:.+]]
+
+; CHECK: [[FORCOND19]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY22:.+]], label %[[I29LOOPEXIT:.+]]
+
+; CHECK: [[FORBODY22]]:
+; CHECK: br label %[[FORCOND19]]
+
+; CHECK: [[HLOOPEXIT]]:
+; CHECK: br label %[[H]]
+
+; CHECK: [[H]]:
+; CHECK: br label %[[G]]
+
+; CHECK: [[I29LOOPEXIT]]:
+; CHECK: br label %[[I29:.+]]
+
+; CHECK: [[I29]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization8.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization8.ll
new file mode 100644
index 0000000000000..8b7dd995ba092
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization8.ll
@@ -0,0 +1,191 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k partial_linearization8 -vecz-passes=cfg-convert -S < %s | %filecheck %s
+
+; The CFG of the following kernel is:
+;
+;     a
+;     |
+;     b <-.
+;    / \  |
+;   e   c |
+;   |  / \|
+;   | f   d
+;   |/
+;   g
+;
+; * where nodes b and c are varying branches.
+; * where nodes e, f, d and g are divergent.
+;
+; With partial linearization we will have a CFG of the form:
+;
+;   a
+;   |
+;   b <.
+;   |  |
+;   c  |
+;   |  |
+;   d -'
+;   |
+;   f
+;   |
+;   e
+;   |
+;   g
+;
+; __kernel void partial_linearization8(__global int *out, int n) {
+;   int id = get_global_id(0);
+;
+;   int x = id / n;
+;   int y = id % n;
+;   int i = 0;
+;   for (;;) {
+;     if (i + id > n) goto e;
+;     if (x + y > n) goto f;
+;     y++;
+;     x++;
+;     i++;
+;   }
+;
+; goto g;
+;
+; e:
+;   i *= 2 + n;
+;   goto g;
+;
+; f:
+;   i /= i + n;
+;
+; g:
+;   out[id] = x + y + i;
+; }
+
+; ModuleID = 'Unknown buffer'
+source_filename = "Unknown buffer"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @partial_linearization8(i32 addrspace(1)* %out, i32 %n) #0 { ; kernel under test: stores one i32 to %out at the global id
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %conv = trunc i64 %call to i32 ; id, narrowed to 32 bits
+  %0 = icmp eq i32 %conv, -2147483648 ; %0..%5 choose a safe divisor for the sdiv below
+  %1 = icmp eq i32 %n, -1
+  %2 = and i1 %1, %0
+  %3 = icmp eq i32 %n, 0
+  %4 = or i1 %3, %2
+  %5 = select i1 %4, i32 1, i32 %n ; 1 when %n is 0, or when %n is -1 and id is INT_MIN
+  %div = sdiv i32 %conv, %5 ; x = id / n
+  %6 = icmp eq i32 %conv, -2147483648 ; %6..%11 repeat the same guard for the srem below
+  %7 = icmp eq i32 %n, -1
+  %8 = and i1 %7, %6
+  %9 = icmp eq i32 %n, 0
+  %10 = or i1 %9, %8
+  %11 = select i1 %10, i32 1, i32 %n
+  %rem = srem i32 %conv, %11 ; y = id % n
+  br label %for.cond
+
+for.cond:                                         ; preds = %if.end6, %entry
+  %x.0 = phi i32 [ %div, %entry ], [ %inc7, %if.end6 ]
+  %y.0 = phi i32 [ %rem, %entry ], [ %inc, %if.end6 ]
+  %storemerge = phi i32 [ 0, %entry ], [ %inc8, %if.end6 ]
+  %add = add nsw i32 %storemerge, %conv
+  %cmp = icmp sgt i32 %add, %n ; i + id > n
+  br i1 %cmp, label %e, label %if.end ; node b: varying loop exit, the condition involves the global id
+
+if.end:                                           ; preds = %for.cond
+  %add2 = add nsw i32 %y.0, %x.0
+  %cmp3 = icmp sgt i32 %add2, %n ; x + y > n
+  br i1 %cmp3, label %f, label %if.end6 ; node c: varying loop exit, x and y derive from the global id
+
+if.end6:                                          ; preds = %if.end
+  %inc = add nsw i32 %y.0, 1
+  %inc7 = add nsw i32 %x.0, 1
+  %inc8 = add nsw i32 %storemerge, 1
+  br label %for.cond ; node d: loop latch
+
+e:                                                ; preds = %for.cond
+  %add9 = add nsw i32 %n, 2
+  %mul = mul nsw i32 %storemerge, %add9 ; i *= 2 + n
+  br label %g
+
+f:                                                ; preds = %if.end
+  %add10 = add nsw i32 %storemerge, %n
+  %12 = icmp eq i32 %add10, 0
+  %13 = select i1 %12, i32 1, i32 %add10 ; replace a zero divisor with 1
+  %div11 = sdiv i32 %storemerge, %13 ; i /= i + n
+  br label %g
+
+g:                                                ; preds = %f, %e
+  %storemerge1 = phi i32 [ %div11, %f ], [ %mul, %e ]
+  %add12 = add i32 %y.0, %x.0 ; uses the loop-carried x and y after the loop has exited
+  %add13 = add i32 %add12, %storemerge1
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %add13, i32 addrspace(1)* %arrayidx, align 4 ; out[id] = x + y + i
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nobuiltin nounwind readonly }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.kernels = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization8, !3, !4, !5, !6, !7, !8}
+!3 = !{!"kernel_arg_addr_space", i32 1, i32 0}
+!4 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!5 = !{!"kernel_arg_type", !"int*", !"int"}
+!6 = !{!"kernel_arg_base_type", !"int*", !"int"}
+!7 = !{!"kernel_arg_type_qual", !"", !""}
+!8 = !{!"kernel_arg_name", !"out", !"n"}
+
+; CHECK: spir_kernel void @__vecz_v4_partial_linearization8
+; CHECK: br label %[[FORCOND:.+]]
+
+; CHECK: [[FORCOND]]:
+; CHECK: br label %[[IFEND:.+]]
+
+; CHECK: [[IFEND]]:
+; CHECK: br label %[[IFEND6:.+]]
+
+; CHECK: [[IFEND6]]:
+; CHECK: br i1 %{{.+}}, label %[[FORCOND]], label %[[FORCONDPUREEXIT:.+]]
+
+; CHECK: [[FORCONDPUREEXIT]]:
+; CHECK: br label %[[F:.+]]
+
+; CHECK: [[E:.+]]:
+; CHECK: br label %[[G:.+]]
+
+; CHECK: [[F]]:
+; CHECK: br label %[[FELSE:.+]]
+
+; CHECK: [[FELSE]]:
+; CHECK: br label %[[E]]
+
+; CHECK: [[G]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization9.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization9.ll
new file mode 100644
index 0000000000000..bff1b5b466b6d
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization9.ll
@@ -0,0 +1,148 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k partial_linearization9 -vecz-passes=cfg-convert,cleanup-divergence -S < %s | %filecheck %s
+
+; The CFG of the following kernel is:
+;
+;   a
+;   |
+;   b <--.
+;   |    |
+;   c <. |
+;   |  | |
+;   d -' |
+;   |    |
+;   e ---'
+;   |
+;   f
+;
+; * where node e is a varying branch.
+; * where node f is divergent.
+;
+; With partial linearization we will have a CFG of the form:
+;
+;   a
+;   |
+;   b <--.
+;   |    |
+;   c <. |
+;   |  | |
+;   d -' |
+;   |    |
+;   e ---'
+;   |
+;   f
+;
+; __kernel void partial_linearization9(__global int *out, int n) {
+;   int id = get_global_id(0);
+;   int i = 0;
+;
+;   while (1) {
+;     int j = 0;
+;     for (; ; i++) {
+;       if (j++ > n) break;
+;     }
+;     if (i++ + id > n) break;
+;   }
+;
+;   out[id] = i;
+; }
+
+; ModuleID = 'Unknown buffer'
+source_filename = "Unknown buffer"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @partial_linearization9(i32 addrspace(1)* %out, i32 %n) #0 { ; kernel under test: stores one i32 to %out at the global id
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %conv = trunc i64 %call to i32 ; id, narrowed to 32 bits
+  br label %while.body
+
+while.body:                                       ; preds = %if.end7, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc3, %if.end7 ]
+  br label %for.cond ; outer loop header, falls straight into the inner loop
+
+for.cond:                                         ; preds = %for.inc, %while.body
+  %i.1 = phi i32 [ %i.0, %while.body ], [ %inc3, %for.inc ]
+  %j.0 = phi i32 [ 0, %while.body ], [ %inc, %for.inc ]
+  %cmp = icmp sgt i32 %j.0, %n ; j > n
+  %inc3 = add nsw i32 %i.1, 1 ; computed before the branch, so it is available on both successors
+  br i1 %cmp, label %for.end, label %for.inc ; inner loop exit: uniform, depends only on %n and the counter j
+
+for.inc:                                          ; preds = %for.cond
+  %inc = add nsw i32 %j.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %add = add nsw i32 %i.1, %conv
+  %cmp4 = icmp sgt i32 %add, %n ; i + id > n
+  br i1 %cmp4, label %while.end, label %if.end7 ; node e: varying outer loop exit, the condition involves the global id
+
+if.end7:                                          ; preds = %for.end
+  br label %while.body ; outer loop back edge
+
+while.end:                                        ; preds = %for.end
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %inc3, i32 addrspace(1)* %arrayidx, align 4 ; out[id] = i, after the post-increment
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nobuiltin nounwind readonly }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.kernels = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization9, !3, !4, !5, !6, !7, !8}
+!3 = !{!"kernel_arg_addr_space", i32 1, i32 0}
+!4 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!5 = !{!"kernel_arg_type", !"int*", !"int"}
+!6 = !{!"kernel_arg_base_type", !"int*", !"int"}
+!7 = !{!"kernel_arg_type_qual", !"", !""}
+!8 = !{!"kernel_arg_name", !"out", !"n"}
+
+; CHECK: spir_kernel void @__vecz_v4_partial_linearization9
+; CHECK: br label %[[WHILEBODY:.+]]
+
+; CHECK: [[WHILEBODY]]:
+; CHECK: br label %[[FORCOND:.+]]
+
+; CHECK: [[FORCOND]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(true)}}, label %[[FOREND:.+]], label %[[FORINC:.+]]
+
+; CHECK: [[FORINC]]:
+; CHECK: br label %[[FORCOND]]
+
+; CHECK: [[FOREND]]:
+; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]]
+
+; CHECK: [[WHILEBODYPUREEXIT]]:
+; CHECK: br label %[[WHILEEND:.+]]
+
+; CHECK: [[WHILEEND]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization_exit_masks.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization_exit_masks.ll
new file mode 100644
index 0000000000000..20f649a8ffa45
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization_exit_masks.ll
@@ -0,0 +1,65 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test < %s
+
+; This test ensures that VECZ does not crash during control flow conversion due
+; to a missing exit mask. As such, we need only verify that the return code from
+; veczc is 0, and FileCheck is not required. See CA-3117 for details.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test(i32 addrspace(1)* %out, i32 %n) {  ; Crash-test input for control flow conversion; every path that returns stores to element 0 of %out.
+entry:
+  %call = tail call spir_func i32 @_Z13get_global_idj(i32 0)
+  %cmp = icmp sgt i32 %n, 0  ; depends only on the kernel argument %n
+  br i1 %cmp, label %for.body.preheader, label %if.end.thread
+
+for.body.preheader:
+  %cmp2 = icmp sgt i32 %n, 1
+  %0 = and i32 %call, 1
+  %cmp3 = icmp eq i32 %0, 0  ; true when the low bit of %call is clear; consumed by the branch in %if.else
+  br i1 %cmp2, label %if.end2, label %if.else
+
+if.end.thread:
+  %cmp4 = icmp eq i32 %call, 0  ; depends on %call, the @_Z13get_global_idj result
+  br i1 %cmp4, label %if.end, label %for.cond.preheader
+
+if.else:
+  br i1 %cmp3, label %if.end, label %for.body
+
+for.cond.preheader:
+  %cmp5 = icmp sgt i32 %n, 1
+  br i1 %cmp5, label %for.body, label %if.end
+
+for.body:
+  br i1 0, label %if.end, label %for.body  ; constant-false condition: the self-loop edge is always selected
+
+if.end:
+  %div = sdiv i32 %call, 2
+  br label %if.end2
+
+if.end2:
+  %ret = phi i32 [ 0, %for.body.preheader ], [ %div, %if.end ]
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 0  ; index is the constant 0
+  store i32 %ret, i32 addrspace(1)* %arrayidx, align 4
+  ret void
+}
+
+declare spir_func i32 @_Z13get_global_idj(i32)
+
+declare spir_func i32 @_Z3maxii(i32, i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline.ll
new file mode 100644
index 0000000000000..052cf3ed75b9e
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline.ll
@@ -0,0 +1,48 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k foo -w 2 -debug-vecz-pipeline -S < %s 2>&1 | %filecheck %s
+; RUN: %veczc -k foo -w 2 -vecz-passes scalarize -debug-vecz-pipeline -S < %s 2>&1 | %filecheck %s --check-prefix=PASSES1
+; RUN: %veczc -k foo -w 2 -vecz-passes scalarize,packetizer -debug-vecz-pipeline -S < %s 2>&1 | %filecheck %s --check-prefix=PASSES2
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Don't check specific passes, but assume that *some* analyses and passes are run.
+; CHECK: Running analysis: {{.*}}> on __vecz_v2_foo
+; CHECK: Running pass: {{.*}} on __vecz_v2_foo
+
+; PASSES1: Running pass: RequireAnalysisPass<{{(class )?}}compiler::utils::DeviceInfoAnalysis,
+; PASSES1-NOT: Running pass:
+; PASSES1: Running pass: Function scalarization on __vecz_v2_foo
+; PASSES1-NOT: Running pass:
+; PASSES1-NOT: Running pass:
+
+; PASSES2: Running pass: RequireAnalysisPass<{{(class )?}}compiler::utils::DeviceInfoAnalysis,
+; PASSES2-NOT: Running pass:
+; PASSES2: Running pass: Function scalarization on __vecz_v2_foo
+; PASSES2: Running pass: Function packetization on __vecz_v2_foo
+; PASSES2-NOT: Running pass:
+; PASSES2-NOT: Running pass:
+
+define spir_kernel void @foo(i32 addrspace(1)* %out) {  ; Minimal straight-line kernel used only to drive the pass pipeline.
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idx  ; element of %out selected by %idx
+  store i32 0, i32 addrspace(1)* %arrayidx, align 4  ; writes the constant 0
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline_printafter.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline_printafter.ll
new file mode 100644
index 0000000000000..a05a1957e8a68
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline_printafter.ll
@@ -0,0 +1,45 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; REQUIRES: llvm-12+
+; RUN: %veczc -k foo -w 2 -vecz-passes scalarize,mask-memops,packetizer -print-after mask-memops -S < %s 2>&1 | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CHECK: IR Dump After Simplify masked memory operations{{( on __vecz_v2_foo)?}}
+; CHECK-NEXT: define spir_kernel void @__vecz_v2_foo(ptr addrspace(1) %out) #0 {
+; CHECK-NEXT:   %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK-NEXT:   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %idx
+; CHECK-NEXT:   store i32 0, ptr addrspace(1) %arrayidx, align 4
+; CHECK-NEXT:   ret void
+; CHECK-NEXT: }
+
+; CHECK: define spir_kernel void @__vecz_v2_foo(ptr addrspace(1) %out) {{.*}} {
+; CHECK-NEXT:   %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK-NEXT:   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %idx
+; CHECK-NEXT:   store <2 x i32> zeroinitializer, ptr addrspace(1) %arrayidx, align 4
+; CHECK-NEXT:   ret void
+; CHECK-NEXT: }
+
+define spir_kernel void @foo(i32 addrspace(1)* %out) {  ; Minimal kernel; the value names %idx and %arrayidx are matched literally by the expected dumps above.
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idx
+  store i32 0, i32 addrspace(1)* %arrayidx, align 4  ; scalar store of 0; the second expected dump shows it as a <2 x i32> store
+  ret void
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_interleaved.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_interleaved.ll
new file mode 100644
index 0000000000000..57d979a591c71
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_interleaved.ll
@@ -0,0 +1,89 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k codegen_2 -vecz-simd-width 16 -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @codegen_2(i32 addrspace(1)* nocapture readonly %in, i32 addrspace(1)* nocapture %out, i32 %size, i32 %reps) local_unnamed_addr {  ; Sums %in[i] for i in [%call * %reps, (%call + 1) * %reps) where i < %size, then stores the sum to %out[%call].
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %conv = sext i32 %reps to i64
+  %mul = mul i64 %call, %conv  ; first index visited by this invocation
+  %add = add i64 %call, 1
+  %mul2 = mul i64 %add, %conv  ; exclusive upper bound of the index range
+  %cmp19 = icmp ult i64 %mul, %mul2  ; skip the loop entirely when the range is empty
+  br i1 %cmp19, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:                                   ; preds = %entry
+  %conv4 = sext i32 %size to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.inc, %entry
+  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %sum.1, %for.inc ]
+  %arrayidx8 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %sum.0.lcssa, i32 addrspace(1)* %arrayidx8, align 4, !tbaa !9
+  ret void
+
+for.body:                                         ; preds = %for.inc, %for.body.lr.ph
+  %i.021 = phi i64 [ %mul, %for.body.lr.ph ], [ %inc, %for.inc ]
+  %sum.020 = phi i32 [ 0, %for.body.lr.ph ], [ %sum.1, %for.inc ]
+  %cmp5 = icmp ult i64 %i.021, %conv4  ; guard: only load while the index is below the sign-extended %size
+  br i1 %cmp5, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %i.021
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4, !tbaa !9
+  %add7 = add nsw i32 %0, %sum.020
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.then, %for.body
+  %sum.1 = phi i32 [ %add7, %if.then ], [ %sum.020, %for.body ]
+  %inc = add nuw i64 %i.021, 1
+  %cmp = icmp ult i64 %inc, %mul2
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32) local_unnamed_addr
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.kernels = !{!2}
+!host.build_options = !{!8}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{void (i32 addrspace(1)*, i32 addrspace(1)*, i32, i32)* @codegen_2, !3, !4, !5, !6, !7}
+!3 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 0, i32 0}
+!4 = !{!"kernel_arg_access_qual", !"none", !"none", !"none", !"none"}
+!5 = !{!"kernel_arg_type", !"int*", !"int*", !"int", !"int"}
+!6 = !{!"kernel_arg_base_type", !"int*", !"int*", !"int", !"int"}
+!7 = !{!"kernel_arg_type_qual", !"const", !"", !"", !""}
+!8 = !{!""}
+!9 = !{!10, !10, i64 0}
+!10 = !{!"int", !11, i64 0}
+!11 = !{!"omnipotent char", !12, i64 0}
+!12 = !{!"Simple C/C++ TBAA"}
+
+
+; It checks that the PHI node did not prevent the interleave factor from being determined
+; CHECK: define spir_kernel void @__vecz_v16_codegen_2
+; CHECK-NOT: call <16 x i32> @__vecz_b_masked_gather_load4_4_Dv16_jDv16_u3ptrU3AS1Dv16_b
+; CHECK: call <16 x i32> @__vecz_b_masked_interleaved_load4_V_Dv16_ju3ptrU3AS1Dv16_b
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_node_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_node_debug_info.ll
new file mode 100644
index 0000000000000..ec2b2e9ccb618
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_node_debug_info.ll
@@ -0,0 +1,135 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; Check that debug info intrinsics are correctly placed after
+; phi nodes.
+
+; RUN: %veczc -k loop_phi -vecz-simd-width=4 -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @loop_phi(i32 addrspace(3)* %a, i32 addrspace(3)* %b) #0 !dbg !4 {  ; Alloca-based loop with debug info: i starts at the truncated local id, runs while i < 128 in steps of 32, copying b[i] to a[i].
+entry:
+  %a.addr = alloca i32 addrspace(3)*, align 8
+  %b.addr = alloca i32 addrspace(3)*, align 8
+  %tid = alloca i64, align 8
+  %i = alloca i32, align 4
+  store i32 addrspace(3)* %a, i32 addrspace(3)** %a.addr, align 8
+  call void @llvm.dbg.declare(metadata i32 addrspace(3)** %a.addr, metadata !12, metadata !30), !dbg !31
+  store i32 addrspace(3)* %b, i32 addrspace(3)** %b.addr, align 8
+  call void @llvm.dbg.declare(metadata i32 addrspace(3)** %b.addr, metadata !13, metadata !30), !dbg !31
+  call void @llvm.dbg.declare(metadata i64* %tid, metadata !14, metadata !30), !dbg !32
+  %call = call spir_func i64 @_Z12get_local_idj(i32 0) #3, !dbg !32
+  store i64 %call, i64* %tid, align 8, !dbg !32
+  call void @llvm.dbg.declare(metadata i32* %i, metadata !19, metadata !30), !dbg !33
+  %0 = load i64, i64* %tid, align 8, !dbg !33
+  %conv = trunc i64 %0 to i32, !dbg !33
+  store i32 %conv, i32* %i, align 4, !dbg !33
+  br label %for.cond, !dbg !33
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %1 = load i32, i32* %i, align 4, !dbg !34
+  %cmp = icmp slt i32 %1, 128, !dbg !34
+  br i1 %cmp, label %for.body, label %for.end, !dbg !33
+
+for.body:                                         ; preds = %for.cond
+  %2 = load i32, i32* %i, align 4, !dbg !36
+  %idxprom = sext i32 %2 to i64, !dbg !36
+  %3 = load i32 addrspace(3)*, i32 addrspace(3)** %b.addr, align 8, !dbg !36
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %3, i64 %idxprom, !dbg !36
+  %4 = load i32, i32 addrspace(3)* %arrayidx, align 4, !dbg !36
+  %5 = load i32, i32* %i, align 4, !dbg !36
+  %idxprom2 = sext i32 %5 to i64, !dbg !36
+  %6 = load i32 addrspace(3)*, i32 addrspace(3)** %a.addr, align 8, !dbg !36
+  %arrayidx3 = getelementptr inbounds i32, i32 addrspace(3)* %6, i64 %idxprom2, !dbg !36
+  store i32 %4, i32 addrspace(3)* %arrayidx3, align 4, !dbg !36
+  br label %for.inc, !dbg !38
+
+for.inc:                                          ; preds = %for.body
+  %7 = load i32, i32* %i, align 4, !dbg !34
+  %add = add nsw i32 %7, 32, !dbg !34
+  store i32 %add, i32* %i, align 4, !dbg !34
+  br label %for.cond, !dbg !34
+
+for.end:                                          ; preds = %for.cond
+  ret void, !dbg !39
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+declare spir_func i64 @_Z12get_local_idj(i32) #2
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nobuiltin }
+
+!llvm.dbg.cu = !{!0}
+!opencl.kernels = !{!21}
+!llvm.module.flags = !{!28}
+!llvm.ident = !{!29}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.8.1 ", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2)
+!1 = !DIFile(filename: "kernel.opencl", directory: "/home/Aorta/build")
+!2 = !{}
+!3 = !{!4}
+!4 = distinct !DISubprogram(name: "loop_phi", scope: !1, file: !1, line: 2, type: !5, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !11)
+!5 = !DISubroutineType(types: !6)
+!6 = !{null, !7, !9}
+!7 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !8, size: 64, align: 64)
+!8 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+!9 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 64, align: 64)
+!10 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !8)
+!11 = !{!12, !13, !14, !19}
+!12 = !DILocalVariable(name: "a", arg: 1, scope: !4, file: !1, line: 2, type: !7)
+!13 = !DILocalVariable(name: "b", arg: 2, scope: !4, file: !1, line: 2, type: !9)
+!14 = !DILocalVariable(name: "tid", scope: !4, file: !1, line: 3, type: !15)
+!15 = !DIDerivedType(tag: DW_TAG_typedef, name: "size_t", file: !16, line: 33, baseType: !17)
+!16 = !DIFile(filename: "/home/Aorta/OCL/modules/builtins/include/builtins/builtins.h", directory: "/home/Aorta/build")
+!17 = !DIDerivedType(tag: DW_TAG_typedef, name: "ulong", file: !16, line: 31, baseType: !18)
+!18 = !DIBasicType(name: "long unsigned int", size: 64, align: 64, encoding: DW_ATE_unsigned)
+!19 = !DILocalVariable(name: "i", scope: !20, file: !1, line: 4, type: !8)
+!20 = distinct !DILexicalBlock(scope: !4, file: !1, line: 4)
+!21 = !{void (i32 addrspace(3)*, i32 addrspace(3)*)* @loop_phi, !22, !23, !24, !25, !26, !27}
+!22 = !{!"kernel_arg_addr_space", i32 3, i32 3}
+!23 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!24 = !{!"kernel_arg_type", !"int*", !"int*"}
+!25 = !{!"kernel_arg_base_type", !"int*", !"int*"}
+!26 = !{!"kernel_arg_type_qual", !"", !"const"}
+!27 = !{!"reqd_work_group_size", i32 32, i32 1, i32 1}
+!28 = !{i32 2, !"Debug Info Version", i32 3}
+!29 = !{!"clang version 3.8.1 "}
+!30 = !DIExpression()
+!31 = !DILocation(line: 2, scope: !4)
+!32 = !DILocation(line: 3, scope: !4)
+!33 = !DILocation(line: 4, scope: !20)
+!34 = !DILocation(line: 4, scope: !35)
+!35 = distinct !DILexicalBlock(scope: !20, file: !1, line: 4)
+!36 = !DILocation(line: 5, scope: !37)
+!37 = distinct !DILexicalBlock(scope: !35, file: !1, line: 4)
+!38 = !DILocation(line: 6, scope: !37)
+!39 = !DILocation(line: 7, scope: !4)
+
+; CHECK: for.cond:
+; CHECK: %[[PHI1:.+]] = phi <4 x [[TYPE:i[0-9]+]]> [ %{{.+}}, %entry ], [ %{{.+}}, %for.cond ]
+; CHECK: call void @llvm.dbg.value(metadata <4 x [[TYPE]]> %[[PHI1]], metadata !{{[0-9]+}}, metadata !DIExpression()), !dbg !{{[0-9]+}}
+; CHECK-NOT: phi
+
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_scatter_gather.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_scatter_gather.ll
new file mode 100644
index 0000000000000..fe4bfed65f3ce
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_scatter_gather.ll
@@ -0,0 +1,65 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k phi_memory -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @phi_memory(i32 addrspace(1)* %input, i32 addrspace(1)* %output, i32 %size) #0 {  ; Copies %size elements from %input to %output through a pointer PHI that advances one element per iteration.
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %conv = trunc i64 %call to i32
+  %idx.ext = sext i32 %conv to i64
+  %add.ptr = getelementptr inbounds i32, i32 addrspace(1)* %output, i64 %idx.ext  ; initial store address: %output offset by the truncated %call
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %output.addr.0 = phi i32 addrspace(1)* [ %add.ptr, %entry ], [ %add.ptr2, %for.body ]
+  %storemerge = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %cmp = icmp slt i32 %storemerge, %size
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %add = add nsw i32 %storemerge, %conv
+  %idxprom = sext i32 %add to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %input, i64 %idxprom
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  store i32 %0, i32 addrspace(1)* %output.addr.0, align 4
+  %add.ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %output.addr.0, i64 1  ; pointer PHI advances by the constant 1
+  %inc = add nsw i32 %storemerge, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nobuiltin nounwind }
+
+; It checks that the contiguity of the load and store is identified through the
+; loop-incrementing pointer PHI node
+;
+; CHECK: void @__vecz_v4_phi_memory
+; CHECK: %[[LD:.+]] = load <4 x i32>
+; CHECK: store <4 x i32> %[[LD]]
+; CHECK-NOT: scatter_store
+; CHECK-NOT: gather_load
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_scatter_gather_2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_scatter_gather_2.ll
new file mode 100644
index 0000000000000..0daa9f7a40bd2
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_scatter_gather_2.ll
@@ -0,0 +1,60 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k phi_memory -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @phi_memory(i32 addrspace(1)* %input, i32 addrspace(1)* %output, i64 %size) #0 {  ; Copy loop whose store pointer PHI advances by %call elements per iteration.
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %add.ptr = getelementptr inbounds i32, i32 addrspace(1)* %output, i64 %call  ; initial store address: %output offset by %call
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %output.addr.0 = phi i32 addrspace(1)* [ %add.ptr, %entry ], [ %add.ptr2, %for.body ]
+  %storemerge = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %cmp = icmp slt i64 %storemerge, %size
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %add = add nsw i64 %storemerge, %call
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %input, i64 %add
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  store i32 %0, i32 addrspace(1)* %output.addr.0, align 4
+  %add.ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %output.addr.0, i64 %call  ; step is %call, not a constant
+  %inc = add nsw i64 %storemerge, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nobuiltin nounwind }
+
+; It checks that the NON-contiguity of the store is identified through the
+; loop-incrementing pointer PHI node
+;
+; CHECK: void @__vecz_v4_phi_memory
+; CHECK: %[[LD:.+]] = load <4 x i32>
+; CHECK: call void @__vecz_b_scatter_store4_Dv4_jDv4_u3ptrU3AS1(<4 x i32> %[[LD]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/predicate_with_switch.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/predicate_with_switch.ll
new file mode 100644
index 0000000000000..6b76b45458390
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/predicate_with_switch.ll
@@ -0,0 +1,61 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k predicate_with_switch -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z12get_local_idj(i32)
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+@predicate_with_switch.tmpIn = internal addrspace(3) global [16 x i32] undef, align 4
+
+define spir_kernel void @predicate_with_switch(i32 addrspace(1)* %A, i32 addrspace(1)* %B) #0 {  ; Stores are guarded by a switch on the @_Z12get_local_idj result.
+entry:
+  %call = call spir_func i64 @_Z12get_local_idj(i32 0) #2
+  %call1 = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  switch i64 %call, label %if.end [
+    i64 0, label %return
+    i64 200, label %return
+  ]
+
+if.end:
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %A, i64 %call1
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %arrayidx3 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* @predicate_with_switch.tmpIn, i64 0, i64 %call
+  store i32 %0, i32 addrspace(3)* %arrayidx3, align 4  ; first guarded store: into the addrspace(3) array at index %call
+  %sub = add i64 %call, -1  ; index %call - 1
+  %arrayidx4 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* @predicate_with_switch.tmpIn, i64 0, i64 %sub
+  %1 = load i32, i32 addrspace(3)* %arrayidx4, align 4
+  %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %B, i64 %call1
+  store i32 %1, i32 addrspace(1)* %arrayidx5, align 4  ; second guarded store: into %B at index %call1
+  br label %return
+
+return:
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_predicate_with_switch
+
+; We should use masked stores
+; CHECK: vecz_b_masked_store4
+; CHECK: vecz_b_masked_store4
+
+; We should *not* have unconditional stores
+; CHECK-NOT: store <4 x i32>
+; CHECK-NOT: store <4 x i32>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/preserve-fast-math.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/preserve-fast-math.ll
new file mode 100644
index 0000000000000..b5ac1cf16b6f0
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/preserve-fast-math.ll
@@ -0,0 +1,35 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -S -vecz-passes=packetizer < %s | %filecheck %s
+
+; CHECK: %{{.*}} = fcmp nnan ninf olt <4 x float> %{{.*}}, %{{.*}}
+
+define spir_kernel void @fast_nan(float addrspace(1)* %src1, float addrspace(1)* %src2, i16 addrspace(1)* %dst, i32 %width) {  ; Compares %src1[%call] with %src2[%call] and stores the zero-extended result to %dst[%call]; %width is unused.
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %src1, i64 %call
+  %0 = load float, float addrspace(1)* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %src2, i64 %call
+  %1 = load float, float addrspace(1)* %arrayidx2, align 4
+  %cmp = fcmp nnan ninf olt float %0, %1  ; scalar compare carrying the nnan and ninf fast-math flags
+  %conv4 = zext i1 %cmp to i16
+  %arrayidx6 = getelementptr inbounds i16, i16 addrspace(1)* %dst, i64 %call
+  store i16 %conv4, i16 addrspace(1)* %arrayidx6, align 2
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/printf_float.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/printf_float.ll
new file mode 100644
index 0000000000000..3f3eb7c48d2af
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/printf_float.ll
@@ -0,0 +1,88 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_float -vecz-simd-width=4 -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+@.str = private unnamed_addr addrspace(2) constant [8 x i8] c"blah %d\00", align 1
+@.strf = private unnamed_addr addrspace(2) constant [7 x i8] c"%#16A\0A\00", align 1
+
+; Function Attrs: nounwind
+define spir_kernel void @printf_kernel(i32 addrspace(1)* %in, i32 addrspace(1)* %stridesX, i32 addrspace(1)* %dst, i32 %width, i32 %height) #0 {
+entry: ; prints in[id] with the @.str format, only when %width == 13; the veczc invocation above selects test_float via -k, not this kernel
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #3 ; NOTE(review): no attribute group #3 is defined in this file (only #0-#2) - confirm intended
+  %cmp = icmp eq i32 %width, 13
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %call1 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([8 x i8], [8
+ x i8] addrspace(2)* @.str, i64 0, i64 0), i32 %0) #3
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void
+}
+
+define spir_kernel void @test_float(float* %in) {
+entry: ; kernel under test: squares in[id], widens the result to double and prints it with the @.strf format
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds float, float* %in, i64 %call
+  %0 = load float, float* %arrayidx, align 4
+  %mul = fmul float %0, %0
+  %conv = fpext float %mul to double ; expected output: one <4 x double> fpext, four extractelements, then one scalar printf call per lane
+  %call8 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([7 x i8], [7 x i8] addrspace(2)* @.strf, i64 0, i64 0), double %conv)
+  ret void
+}
+
+
+
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+declare extern_weak spir_func i32 @printf(i8 addrspace(2)*, ...) #1
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nobuiltin nounwind }
+
+!opencl.kernels = !{!0}
+!llvm.ident = !{!6}
+
+!0 = !{void (i32 addrspace(1)*, i32 addrspace(1)*, i32 addrspace(1)*, i32, i32)* @printf_kernel, !1, !2, !3, !4, !5}
+!1 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1, i32 0, i32 0}
+!2 = !{!"kernel_arg_access_qual", !"none", !"none", !"none", !"none", !"none"}
+!3 = !{!"kernel_arg_type", !"int*", !"int*", !"int*", !"int", !"int"}
+!4 = !{!"kernel_arg_base_type", !"int*", !"int*", !"int*", !"int", !"int"}
+!5 = !{!"kernel_arg_type_qual", !"", !"", !"", !"", !""}
+!6 = !{!"clang version 3.8.0 "}
+
+; CHECK: @[[STR:.+]] = private unnamed_addr addrspace(2) constant [7 x i8] c"%#16A\0A\00", align 1
+
+; CHECK: define spir_kernel void @__vecz_v4_test_float
+; CHECK: %[[CONV2:.+]] = fpext <4 x float> %{{.+}} to <4 x double>
+; CHECK: %[[V2:[0-9]+]] = extractelement <4 x double> %[[CONV2]], {{(i32|i64)}} 0
+; CHECK: %[[V3:[0-9]+]] = extractelement <4 x double> %[[CONV2]], {{(i32|i64)}} 1
+; CHECK: %[[V4:[0-9]+]] = extractelement <4 x double> %[[CONV2]], {{(i32|i64)}} 2
+; CHECK: %[[V5:[0-9]+]] = extractelement <4 x double> %[[CONV2]], {{(i32|i64)}} 3
+; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], double %[[V2]])
+; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], double %[[V3]])
+; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], double %[[V4]])
+; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], double %[[V5]])
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/regression_by_all.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/regression_by_all.ll
new file mode 100644
index 0000000000000..ae102ec12fc8f
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/regression_by_all.ll
@@ -0,0 +1,126 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+
+; RUN: %veczc -k regression_by_all -vecz-passes=vecz-loop-rotate,cfg-convert -S < %s | %filecheck %s
+
+; The purpose of this test is to make sure the block `c` is not considered
+; by_all merely because one of its predecessors is by_all. Because `c` has a
+; divergence-causing block (`b`) as one of its predecessors, it cannot be
+; considered by_all.
+
+; The CFG of the following kernel is:
+;
+;   a
+;   |\
+;   | b
+;   |/ \
+;   c   d
+;    \ /
+;     e
+;
+; * where node a is a uniform branch, and node b is a varying branch.
+; * where nodes c, d and e are divergent.
+;
+; With partial linearization we will have a CFG of the form:
+;
+;     a
+;    /|
+;   | b
+;   | |
+;   | d
+;    \|
+;     c
+;     |
+;     e
+;
+; __kernel void regression_by_all(__global int *out, int n) {
+;   int id = get_global_id(0);
+;   int ret = 0;
+;
+;   if (n % 2 == 0) {
+;     goto d;
+;   } else {
+;     ret = 1;
+;     if (id % 2 != 0) {
+;       goto d;
+;     } else {
+;       for (int i = 0; i < n; ++i) { ret++; }
+;       goto e;
+;     }
+;   }
+;
+; d:
+;   ret += id;
+;   ret *= n;
+;
+; e:
+;   out[id] = ret;
+; }
+
+; ModuleID = 'kernel.opencl'
+source_filename = "kernel.opencl"
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @regression_by_all(i32 addrspace(1)* %out, i32 %n) {
+entry: ; node `a` of the CFG sketch above: the branch condition depends only on the kernel argument %n
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %conv = trunc i64 %call to i32
+  %rem1 = and i32 %n, 1
+  %cmp = icmp eq i32 %rem1, 0
+  br i1 %cmp, label %d, label %if.else
+
+if.else:                                          ; preds = %entry
+  %rem22 = and i32 %conv, 1
+  %cmp3 = icmp eq i32 %rem22, 0 ; condition derived from the global id (node `b`, the varying branch)
+  br i1 %cmp3, label %for.cond, label %d
+
+for.cond:                                         ; preds = %if.else, %for.body
+  %ret.0 = phi i32 [ %inc, %for.body ], [ 1, %if.else ]
+  %storemerge = phi i32 [ %inc9, %for.body ], [ 0, %if.else ]
+  %cmp7 = icmp slt i32 %storemerge, %n ; loop bound is the kernel argument %n
+  br i1 %cmp7, label %for.body, label %e
+
+for.body:                                         ; preds = %for.cond
+  %inc = add nuw nsw i32 %ret.0, 1
+  %inc9 = add nuw nsw i32 %storemerge, 1
+  br label %for.cond
+
+d:                                                ; preds = %if.else, %entry
+  %ret.1 = phi i32 [ 0, %entry ], [ 1, %if.else ] ; this block matches node `c` of the CFG sketch: reached from both `a` (entry) and `b` (if.else)
+  %add = add nsw i32 %ret.1, %conv
+  %mul = mul nsw i32 %add, %n
+  br label %e
+
+e:                                                ; preds = %for.cond, %d
+  %ret.2 = phi i32 [ %mul, %d ], [ %ret.0, %for.cond ]
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %ret.2, i32 addrspace(1)* %arrayidx, align 4 ; out[id] = ret
+  ret void
+}
+
+; Function Attrs: convergent nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CHECK: spir_kernel void @__vecz_v4_regression_by_all
+; CHECK: br i1 %[[CMP:.+]], label %[[D:.+]], label %[[IFELSE:.+]]
+
+; CHECK: [[D]]:
+; CHECK-NOT: %d.entry_mask = and i1 true, true
+; CHECK: %d.entry_mask = phi i1
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr.ll
new file mode 100644
index 0000000000000..5be2aa5aff22e
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr.ll
@@ -0,0 +1,52 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -vecz-passes=remove-int-ptr -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_intptr_cast_i8(
+; CHECK: %shl = shl i64 %call, 2
+; CHECK: %remove_intptr = getelementptr i8, ptr addrspace(1) %in, i64 %shl
+; CHECK: %remove_intptr1 = ptrtoint ptr addrspace(1) %remove_intptr to i64
+; CHECK: store i64 %remove_intptr1, ptr addrspace(1) %out, align 8
+define spir_kernel void @intptr_cast_i8(i8 addrspace(1)* %in, i64 addrspace(1)* %out) {
+entry: ; stores the integer value of (in + (id << 2)) to *out, computing the offset in the integer domain
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %0 = ptrtoint i8 addrspace(1)* %in to i64
+  %shl = shl i64 %call, 2
+  %add = add i64 %shl, %0 ; expected output replaces this add with an i8 getelementptr on %in followed by a ptrtoint
+  store i64 %add, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_intptr_cast_i16(
+; CHECK: %shl = shl i64 %call, 2
+; CHECK: %remove_intptr = getelementptr i8, ptr addrspace(1) %in, i64 %shl
+; CHECK: %remove_intptr1 = ptrtoint ptr addrspace(1) %remove_intptr to i64
+; CHECK: store i64 %remove_intptr1, ptr addrspace(1) %out, align 8
+define spir_kernel void @intptr_cast_i16(i16 addrspace(1)* %in, i64 addrspace(1)* %out) {
+entry: ; same arithmetic as @intptr_cast_i8, but the source pointer is declared as i16*
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %0 = ptrtoint i16 addrspace(1)* %in to i64
+  %shl = shl i64 %call, 2
+  %add = add i64 %shl, %0 ; expected output still uses an i8 getelementptr, i.e. the offset stays in bytes
+  store i64 %add, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr_2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr_2.ll
new file mode 100644
index 0000000000000..615b307f584f6
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr_2.ll
@@ -0,0 +1,42 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @remove_intptr(i8 addrspace(1)* %in, i32 addrspace(1)* %out) {
+entry: ; out[id] = the i32 found (id << 2) bytes past %in, with the address formed by an integer round trip
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %0 = ptrtoint i8 addrspace(1)* %in to i64
+  %shl = shl nuw nsw i64 %call, 2
+  %add = add i64 %shl, %0
+  %1 = inttoptr i64 %add to i32 addrspace(1)* ; expected output contains neither ptrtoint nor inttoptr, only an i8 getelementptr on %in
+  %2 = load i32, i32 addrspace(1)* %1, align 4 ; expected to become a single <4 x i32> load
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %2, i32 addrspace(1)* %arrayidx, align 4
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CHECK: spir_kernel void @__vecz_v4_remove_intptr
+; CHECK-NOT: ptrtoint
+; CHECK-NOT: inttoptr
+; CHECK: %remove_intptr = getelementptr i8, ptr addrspace(1) %in
+; CHECK: %[[LOAD:.+]] = load <4 x i32>, ptr addrspace(1) %remove_intptr, align 4
+; CHECK: store <4 x i32> %[[LOAD]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr_phi.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr_phi.ll
new file mode 100644
index 0000000000000..3121b9b26c55d
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr_phi.ll
@@ -0,0 +1,52 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @remove_intptr(i8 addrspace(1)* %in, i32 addrspace(1)* %out) {
+entry: ; loop variant: the pointer is carried around the loop as an i64 phi and re-materialised with inttoptr each iteration
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %0 = ptrtoint i8 addrspace(1)* %in to i64
+  %shl = shl nuw nsw i64 %call, 2
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %shl
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %x.07 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %intin.06 = phi i64 [ %0, %entry ], [ %add, %for.body ] ; expected output turns this into a phi of ptr addrspace(1) starting at %in
+  %add = add i64 %intin.06, 4 ; expected output: an i8 getelementptr with constant offset 4
+  %1 = inttoptr i64 %add to i32 addrspace(1)*
+  %2 = load i32, i32 addrspace(1)* %1, align 4
+  store i32 %2, i32 addrspace(1)* %arrayidx, align 4
+  %inc = add nuw nsw i32 %x.07, 1
+  %exitcond.not = icmp eq i32 %inc, 4 ; the loop body executes four times
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CHECK: spir_kernel void @__vecz_v4_remove_intptr
+; CHECK-NOT: ptrtoint
+; CHECK-NOT: inttoptr
+; CHECK: %[[RPHI:.+]] = phi ptr addrspace(1) [ %in, %entry ], [ %[[RGEP:.+]], %for.body ]
+; CHECK: %[[RGEP]] = getelementptr i8, ptr addrspace(1) %[[RPHI]], i{{32|64}} 4
+; CHECK: load i32, ptr addrspace(1) %[[RGEP]], align 4
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/roscc_simplify.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/roscc_simplify.ll
new file mode 100644
index 0000000000000..9b081570b8ea6
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/roscc_simplify.ll
@@ -0,0 +1,53 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -S < %s -w 16 | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @add(i32 addrspace(1)* %in1, i32 addrspace(1)* %in2, i32 addrspace(1)* %out, i64 addrspace(1)* %N) {
+entry: ; bounds-guarded add: out[id] = in1[id] + in2[id], performed only when id < *N (unsigned compare)
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %0 = load i64, i64 addrspace(1)* %N, align 8
+  %cmp = icmp ult i64 %call, %0
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %call
+  %1 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %in2, i64 %call
+  %2 = load i32, i32 addrspace(1)* %arrayidx1, align 4
+  %add = add nsw i32 %2, %1
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %add, i32 addrspace(1)* %arrayidx2, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void ; expected output keeps a three-block shape: entry branch, one body block, then a block holding only this return
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CHECK: spir_kernel void @__vecz_v16_add
+; CHECK: entry:
+; CHECK: br i1 %{{.+}}, label %[[END:.+]], label %[[THEN:.+]]
+; CHECK-EMPTY:
+; CHECK-NEXT: [[THEN]]:
+; CHECK: br label %[[END]]
+; CHECK-EMPTY:
+; CHECK-NEXT: [[END]]:
+; CHECK-NEXT: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_load_store_in_varying_branch.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_load_store_in_varying_branch.ll
new file mode 100644
index 0000000000000..385119729092f
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_load_store_in_varying_branch.ll
@@ -0,0 +1,51 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test -w 4 -S < %s | %filecheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
+target triple = "spir-unknown-unknown"
+
+declare spir_func i32 @get_local_id(i32);
+declare spir_func i32 @get_global_id(i32);
+
+define spir_kernel void @test(i32 addrspace(1)* %in) {
+entry: ; branch taken only by the work-item whose local id in dimension 0 is zero
+  %lid = call i32 @get_local_id(i32 0)
+  %cmp = icmp eq i32 %lid, 0
+  br i1 %cmp, label %if, label %merge
+
+if:
+  %single_load = load i32, i32 addrspace(1)* %in ; expected output: a scalar masked load whose i1 mask is "any lane active"
+  %single_add = add i32 %single_load, 42
+  store i32 %single_add, i32 addrspace(1)* %in
+  br label %merge
+
+merge:
+  %multi_load = load i32, i32 addrspace(1)* %in ; expected output: stays a plain scalar load of %in
+  %multi_add = add i32 %multi_load, 42
+  %gid = call i32 @get_global_id(i32 0)
+  %slot = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %gid
+  store i32 %multi_add, i32 addrspace(1)* %slot
+
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_test
+; CHECK: %[[BITCAST:[0-9]+]] = bitcast <4 x i1> %cmp3 to i4
+; CHECK: %[[MASK:.+]] = icmp ne i4 %[[BITCAST]], 0
+; CHECK: %single_load{{[0-9]*}} = call i32 @__vecz_b_masked_load4_ju3ptrU3AS1b(ptr addrspace(1) %in, i1 %[[MASK]])
+; CHECK: %multi_load = load i32, ptr addrspace(1) %in
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat.ll
new file mode 100644
index 0000000000000..86ddd892043c9
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat.ll
@@ -0,0 +1,38 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test -w 4 -S < %s | %filecheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
+target triple = "spir-unknown-unknown"
+
+declare spir_func void @barrier(i32);
+declare spir_func i32 @get_local_id(i32);
+declare spir_func i32 @get_global_id(i32);
+
+define spir_kernel void @test(i32 addrspace(1)* %in) {
+entry: ; every work-item reads the same address (%in) and writes the value to in[get_global_id(0)]
+  %load = load i32, i32 addrspace(1)* %in ; expected output: still a scalar i32 load in the entry block
+  %gid = call i32 @get_global_id(i32 0)
+  %slot = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %gid
+  store i32 %load, i32 addrspace(1)* %slot
+
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_test
+; CHECK: entry:
+; CHECK: %load = load i32, ptr addrspace(1) %in
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_after_load_store_in_varying_branch.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_after_load_store_in_varying_branch.ll
new file mode 100644
index 0000000000000..1f7952620f6aa
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_after_load_store_in_varying_branch.ll
@@ -0,0 +1,48 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test -w 4 -S < %s | %filecheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
+target triple = "spir-unknown-unknown"
+
+declare spir_func void @barrier(i32);
+declare spir_func i32 @get_local_id(i32);
+declare spir_func i32 @get_global_id(i32);
+
+define spir_kernel void @test(i32 addrspace(1)* %in) {
+entry: ; branch taken only by the work-item whose local id in dimension 0 is zero
+  %lid = call i32 @get_local_id(i32 0)
+  %cmp = icmp eq i32 %lid, 0
+  br i1 %cmp, label %if, label %merge
+
+if:
+  %secretly_scalar_load = load i32, i32 addrspace(1)* %in ; read-modify-write of *in inside the conditional block
+  %add = add i32 %secretly_scalar_load, 42
+  store i32 %add, i32 addrspace(1)* %in
+  br label %merge
+
+merge:
+  %load = load i32, i32 addrspace(1)* %in ; expected output: remains a scalar i32 load of %in
+  %gid = call i32 @get_global_id(i32 0)
+  %slot = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %gid
+  store i32 %load, i32 addrspace(1)* %slot
+
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_test
+; CHECK: %load = load i32, ptr addrspace(1) %in
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_after_varying_branch.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_after_varying_branch.ll
new file mode 100644
index 0000000000000..66a2ae70067d5
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_after_varying_branch.ll
@@ -0,0 +1,45 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test -w 4 -S < %s | %filecheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
+target triple = "spir-unknown-unknown"
+
+declare spir_func void @barrier(i32);
+declare spir_func i32 @get_local_id(i32);
+declare spir_func i32 @get_global_id(i32);
+
+define spir_kernel void @test(i32 addrspace(1)* %in) {
+entry: ; branch condition depends on the local id, but the conditional block below is empty
+  %lid = call i32 @get_local_id(i32 0)
+  %cmp = icmp eq i32 %lid, 0
+  br i1 %cmp, label %if, label %merge
+
+if:
+  br label %merge
+
+merge:
+  %load = load i32, i32 addrspace(1)* %in ; expected output: remains a scalar i32 load of %in
+  %gid = call i32 @get_global_id(i32 0)
+  %slot = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %gid
+  store i32 %load, i32 addrspace(1)* %slot
+
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_test
+; CHECK: %load = load i32, ptr addrspace(1) %in
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_in_varying_branch.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_in_varying_branch.ll
new file mode 100644
index 0000000000000..2e06ea7193ab2
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_in_varying_branch.ll
@@ -0,0 +1,55 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test -w 4 -S < %s | %filecheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
+target triple = "spir-unknown-unknown"
+
+declare spir_func i32 @get_local_id(i32);
+declare spir_func i32 @get_global_id(i32);
+
+define spir_kernel void @test(i32 addrspace(1)* %in) {
+entry: ; outer branch: taken when the low bit of the dimension-0 local id is clear
+  %lid = call i32 @get_local_id(i32 0)
+  %and = and i32 %lid, 1
+  %cmp = icmp eq i32 %and, 0
+  br i1 %cmp, label %if, label %merge
+
+if:
+  %lid1 = call i32 @get_local_id(i32 1) ; inner branch tests the dimension-1 local id
+  %cmp1 = icmp eq i32 %lid1, 0
+  br i1 %cmp1, label %deeper_if, label %deeper_merge
+
+deeper_if:
+  br label %deeper_merge
+
+deeper_merge:
+  %load = load i32, i32 addrspace(1)* %in ; expected output: scalar load, then splat to <4 x i32>
+  %gid = call i32 @get_global_id(i32 0)
+  %slot = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %gid
+  store i32 %load, i32 addrspace(1)* %slot ; expected output: a masked <4 x i32> store of the splat
+  br label %merge
+
+merge:
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_test
+; CHECK: %[[LOAD:.+]] = load i32, ptr addrspace(1) %in
+; CHECK: %[[SPLAT_IN:.+]] = insertelement <4 x i32> {{poison|undef}}, i32 %[[LOAD]], {{(i32|i64)}} 0
+; CHECK: %[[SPLAT:.+]] = shufflevector <4 x i32> %[[SPLAT_IN]], <4 x i32> {{poison|undef}}, <4 x i32> zeroinitializer
+; CHECK: call void @__vecz_b_masked_store4_Dv4_ju3ptrU3AS1Dv4_b(<4 x i32> %[[SPLAT]], ptr addrspace(1){{( nonnull)? %.*}}, <4 x i1> %{{.+}})
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_vector_user.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_vector_user.ll
new file mode 100644
index 0000000000000..f8d4531c7cb68
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_vector_user.ll
@@ -0,0 +1,78 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k scalar_vector_user -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | %filecheck %s
+
+; ModuleID = 'Unknown buffer'
+source_filename = "Unknown buffer"
+target datalayout = "e-m:e-i64:64-f80:128-n8:1:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind readnone
+declare spir_func i64 @_Z12get_local_idj(i32) #0
+
+; Function Attrs: nounwind readnone
+declare spir_func <4 x float> @_Z3madDv4_fS_S_(<4 x float>, <4 x float>, <4 x float>) #0
+
+declare spir_func void @_Z7vstore4Dv4_fmPU3AS1f(<4 x float>, i64, float addrspace(1)*)
+
+declare spir_func <4 x float> @_Z6vload4mPU3AS3Kf(i64, float addrspace(1)*)  ; NOTE(review): mangled name spells AS3 but the pointer parameter is addrspace(1) - confirm this is intended
+; Function Attrs: inlinehint norecurse nounwind readnone
+declare spir_func float @_Z3madfff(float, float, float) local_unnamed_addr #2
+
+define spir_kernel void @scalar_vector_user(float addrspace(1)* %inout, i64 %n) {  ; accumulates a <4 x float> mad in a loop, then uses the result both as a scalar and as a vector
+entry:
+  %lid = tail call spir_func i64 @_Z12get_local_idj(i32 0) #0
+  %inout.address = getelementptr inbounds float, float addrspace(1)* %inout, i64 %lid  ; per-work-item address, offset by the local id
+  br label %loop
+
+loop:                                              ; preds = %entry, %loop
+  %madv4.prev = phi <4 x float> [ zeroinitializer, %entry ], [ %madv4, %loop ]  ; loop-carried accumulator, zero on entry
+  %i = phi i64 [ 0, %entry ], [ %i.inc, %loop ]
+  %i.inc = add nuw nsw i64 %i, 1
+  %cmp = icmp slt i64 %i.inc, %n  ; compares the already-incremented counter, so the body runs at least once
+  %inout.vload = tail call spir_func <4 x float> @_Z6vload4mPU3AS3Kf(i64 0, float addrspace(1)* %inout.address)
+  %inout.vec0 = shufflevector <4 x float> %inout.vload, <4 x float> undef, <4 x i32> zeroinitializer  ; all-zero mask broadcasts element 0 of the loaded vector
+  %madv4 = tail call spir_func <4 x float> @_Z3madDv4_fS_S_(<4 x float> %inout.vload, <4 x float> %inout.vec0, <4 x float> %madv4.prev) #0  ; feeds the phi above and both users in %end
+  br i1 %cmp, label %loop, label %end
+
+end:                                             ; preds = %loop
+  %mad.vec0 = extractelement <4 x float> %madv4, i32 0  ; scalar user of %madv4
+  store float %mad.vec0, float addrspace(1)* %inout.address, align 4
+  tail call spir_func void @_Z7vstore4Dv4_fmPU3AS1f(<4 x float> %madv4, i64 0, float addrspace(1)* %inout.address)  ; vector user of the same %madv4
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { noduplicate }
+attributes #2 = { inlinehint norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+; The purpose of this test is to make sure we correctly scalarize an instruction
+; used by both a scalar and a vector instruction. Previously we would try to
+; scalarize its users twice, resulting in invalid IR.
+
+; CHECK: define spir_kernel void @__vecz_v4_scalar_vector_user
+; CHECK: loop:
+; CHECK: %madv4.prev{{.*}} = phi <4 x float> [ zeroinitializer, %entry ], [ %madv4[[S0:[0-9]+]], %loop ]{{$}}
+; CHECK: %madv4.prev{{.*}} = phi <4 x float> [ zeroinitializer, %entry ], [ %madv4[[S1:[0-9]+]], %loop ]{{$}}
+; CHECK: %madv4.prev{{.*}} = phi <4 x float> [ zeroinitializer, %entry ], [ %madv4[[S2:[0-9]+]], %loop ]{{$}}
+; CHECK: %madv4.prev{{.*}} = phi <4 x float> [ zeroinitializer, %entry ], [ %madv4[[S3:[0-9]+]], %loop ]{{$}}
+
+; make sure the above PHI incomings are unique by looking for their definitions
+; CHECK: %madv4[[S0]] =
+; CHECK: %madv4[[S1]] =
+; CHECK: %madv4[[S2]] =
+; CHECK: %madv4[[S3]] =
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_calls.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_calls.ll
new file mode 100644
index 0000000000000..5ee40b4c1f247
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_calls.ll
@@ -0,0 +1,86 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_calls -vecz-passes=scalarize -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+define spir_kernel void @test_calls(<4 x float>* %pa, <4 x float>* %pb, <4 x i32>* %pc, <4 x float>* %pd) {  ; d[idx] = fmuladd(a[idx], b[idx], convert_float4(c[idx])) on 4-element vectors
+entry:
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)  ; global id in dimension 0; every buffer below is indexed by it
+  %a = getelementptr <4 x float>, <4 x float>* %pa, i64 %idx
+  %b = getelementptr <4 x float>, <4 x float>* %pb, i64 %idx
+  %c = getelementptr <4 x i32>, <4 x i32>* %pc, i64 %idx
+  %d = getelementptr <4 x float>, <4 x float>* %pd, i64 %idx
+  %0 = load <4 x float>, <4 x float>* %a, align 16
+  %1 = load <4 x float>, <4 x float>* %b, align 16
+  %2 = load <4 x i32>, <4 x i32>* %c, align 16
+  %call = call spir_func <4 x float> @_Z14convert_float4Dv4_i(<4 x i32> %2)  ; vector builtin call; the expected output names the scalar @_Z13convert_floati instead
+  %3 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %call)  ; vector intrinsic call; the expected output names @llvm.fmuladd.f32 instead
+  store <4 x float> %3, <4 x float>* %d, align 16
+  ret void
+}
+
+declare spir_func <4 x float> @_Z14convert_float4Dv4_i(<4 x i32>)  ; vector builtin called by @test_calls
+declare spir_func float @_Z13convert_floati(i32)  ; scalar form: not called in the input, but named by the expected output
+declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>)  ; llvm.fmuladd on <4 x float>, called by @test_calls
+
+; CHECK: define spir_kernel void @__vecz_v4_test_calls(ptr %pa, ptr %pb, ptr %pc, ptr %pd)
+; CHECK: entry:
+; CHECK: %[[A_0:.+]] = getelementptr float, ptr %a, i32 0
+; CHECK: %[[A_1:.+]] = getelementptr float, ptr %a, i32 1
+; CHECK: %[[A_2:.+]] = getelementptr float, ptr %a, i32 2
+; CHECK: %[[A_3:.+]] = getelementptr float, ptr %a, i32 3
+; CHECK: %[[LA_0:.+]] = load float, ptr %[[A_0]]
+; CHECK: %[[LA_1:.+]] = load float, ptr %[[A_1]]
+; CHECK: %[[LA_2:.+]] = load float, ptr %[[A_2]]
+; CHECK: %[[LA_3:.+]] = load float, ptr %[[A_3]]
+; CHECK: %[[B_0:.+]] = getelementptr float, ptr %b, i32 0
+; CHECK: %[[B_1:.+]] = getelementptr float, ptr %b, i32 1
+; CHECK: %[[B_2:.+]] = getelementptr float, ptr %b, i32 2
+; CHECK: %[[B_3:.+]] = getelementptr float, ptr %b, i32 3
+; CHECK: %[[LB_0:.+]] = load float, ptr %[[B_0]]
+; CHECK: %[[LB_1:.+]] = load float, ptr %[[B_1]]
+; CHECK: %[[LB_2:.+]] = load float, ptr %[[B_2]]
+; CHECK: %[[LB_3:.+]] = load float, ptr %[[B_3]]
+; CHECK: %[[C_0:.+]] = getelementptr i32, ptr %c, i32 0
+; CHECK: %[[C_1:.+]] = getelementptr i32, ptr %c, i32 1
+; CHECK: %[[C_2:.+]] = getelementptr i32, ptr %c, i32 2
+; CHECK: %[[C_3:.+]] = getelementptr i32, ptr %c, i32 3
+; CHECK: %[[LC_0:.+]] = load i32, ptr %[[C_0]]
+; CHECK: %[[LC_1:.+]] = load i32, ptr %[[C_1]]
+; CHECK: %[[LC_2:.+]] = load i32, ptr %[[C_2]]
+; CHECK: %[[LC_3:.+]] = load i32, ptr %[[C_3]]
+; CHECK: %[[CALL1:.+]] = call spir_func float @_Z13convert_floati(i32 %[[LC_0]])
+; CHECK: %[[CALL2:.+]] = call spir_func float @_Z13convert_floati(i32 %[[LC_1]])
+; CHECK: %[[CALL3:.+]] = call spir_func float @_Z13convert_floati(i32 %[[LC_2]])
+; CHECK: %[[CALL4:.+]] = call spir_func float @_Z13convert_floati(i32 %[[LC_3]])
+; CHECK: %[[FMAD_0:.+]] = call float @llvm.fmuladd.f32(float %[[LA_0]], float %[[LB_0]], float %[[CALL1]])
+; CHECK: %[[FMAD_1:.+]] = call float @llvm.fmuladd.f32(float %[[LA_1]], float %[[LB_1]], float %[[CALL2]])
+; CHECK: %[[FMAD_2:.+]] = call float @llvm.fmuladd.f32(float %[[LA_2]], float %[[LB_2]], float %[[CALL3]])
+; CHECK: %[[FMAD_3:.+]] = call float @llvm.fmuladd.f32(float %[[LA_3]], float %[[LB_3]], float %[[CALL4]])
+; CHECK: %[[D_0:.+]] = getelementptr float, ptr %d, i32 0
+; CHECK: %[[D_1:.+]] = getelementptr float, ptr %d, i32 1
+; CHECK: %[[D_2:.+]] = getelementptr float, ptr %d, i32 2
+; CHECK: %[[D_3:.+]] = getelementptr float, ptr %d, i32 3
+; CHECK: store float %[[FMAD_0]], ptr %[[D_0]]
+; CHECK: store float %[[FMAD_1]], ptr %[[D_1]]
+; CHECK: store float %[[FMAD_2]], ptr %[[D_2]]
+; CHECK: store float %[[FMAD_3]], ptr %[[D_3]]
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_calls_uniform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_calls_uniform.ll
new file mode 100644
index 0000000000000..cac1ea1b23ea9
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_calls_uniform.ll
@@ -0,0 +1,47 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_calls -vecz-passes=scalarize -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test_calls(<4 x float>* %a, <4 x float>* %b, <4 x i32>* %c, <4 x float>* %d) {  ; same computation as a work-item-indexed kernel would do, but no work-item id is read here
+entry:
+  %0 = load <4 x float>, <4 x float>* %a, align 16  ; every address below is a kernel argument used directly
+  %1 = load <4 x float>, <4 x float>* %b, align 16
+  %2 = load <4 x i32>, <4 x i32>* %c, align 16
+  %call = call spir_func <4 x float> @_Z14convert_float4Dv4_i(<4 x i32> %2)  ; expected to remain a vector call in the output
+  %3 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %call)  ; expected to remain a vector intrinsic call in the output
+  store <4 x float> %3, <4 x float>* %d, align 16
+  ret void
+}
+
+declare spir_func <4 x float> @_Z14convert_float4Dv4_i(<4 x i32>)  ; vector builtin called by @test_calls
+declare spir_func float @_Z13convert_floati(i32)  ; scalar form: declared but not called by this file's kernel
+declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>)  ; llvm.fmuladd on <4 x float>, called by @test_calls
+
+; Checks that this function gets vectorized, although because every instruction is
+; uniform, the process of vectorization makes no actual changes whatsoever!
+; CHECK: define spir_kernel void @__vecz_v4_test_calls(ptr %a, ptr %b, ptr %c, ptr %d)
+; CHECK: entry:
+; CHECK: %[[LA:.+]] = load <4 x float>, ptr %a, align 16
+; CHECK: %[[LB:.+]] = load <4 x float>, ptr %b, align 16
+; CHECK: %[[LC:.+]] = load <4 x i32>, ptr %c, align 16
+; CHECK: %[[CALL:.+]] = call spir_func <4 x float> @_Z14convert_float4Dv4_i(<4 x i32> %[[LC]])
+; CHECK: %[[FMAD:.+]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %[[LA]], <4 x float> %[[LB]], <4 x float> %[[CALL]])
+; CHECK: store <4 x float> %[[FMAD]], ptr %d, align 16
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_debug_info.ll
new file mode 100644
index 0000000000000..49c0e8fd4cb2c
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_debug_info.ll
@@ -0,0 +1,183 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; Check that debug info is preserved in the vectorized kernel.
+; Specifically, check that the scalarization pass doesn't destroy DI
+; intrinsics attached to the vector instructions it scalarizes.
+
+; RUN: %veczc -k mul2 -vecz-passes="scalarize,function(mem2reg)" -vecz-choices=FullScalarization -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+
+; Function Attrs: nounwind
+define spir_kernel void @mul2(<2 x i32> addrspace(1)* %in1, <2 x i32> addrspace(1)* %in2, <2 x i32> addrspace(1)* %out) #0 !dbg !4 {  ; out[tid] = in1[tid] * in2[tid] on <2 x i32>, with every value kept in a stack slot
+entry:
+  %in1.addr = alloca <2 x i32> addrspace(1)*, align 8
+  %in2.addr = alloca <2 x i32> addrspace(1)*, align 8
+  %out.addr = alloca <2 x i32> addrspace(1)*, align 8
+  %tid = alloca i64, align 8
+  %a = alloca <2 x i32>, align 8
+  %b = alloca <2 x i32>, align 8
+  %tmp = alloca <2 x i32>, align 8
+  store <2 x i32> addrspace(1)* %in1, <2 x i32> addrspace(1)** %in1.addr, align 8
+  call void @llvm.dbg.declare(metadata <2 x i32> addrspace(1)** %in1.addr, metadata !16, metadata !34), !dbg !35  ; !16 is the DILocalVariable for "in1"
+  store <2 x i32> addrspace(1)* %in2, <2 x i32> addrspace(1)** %in2.addr, align 8
+  call void @llvm.dbg.declare(metadata <2 x i32> addrspace(1)** %in2.addr, metadata !17, metadata !34), !dbg !35  ; !17 is the DILocalVariable for "in2"
+  store <2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)** %out.addr, align 8
+  call void @llvm.dbg.declare(metadata <2 x i32> addrspace(1)** %out.addr, metadata !18, metadata !34), !dbg !35  ; !18 is the DILocalVariable for "out"
+  call void @llvm.dbg.declare(metadata i64* %tid, metadata !19, metadata !34), !dbg !36  ; !19 is the DILocalVariable for "tid"
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #3, !dbg !36
+  store i64 %call, i64* %tid, align 8, !dbg !36
+  call void @llvm.dbg.declare(metadata <2 x i32>* %a, metadata !23, metadata !34), !dbg !37  ; !23 is the DILocalVariable for "a"
+  %0 = load i64, i64* %tid, align 8, !dbg !37
+  %1 = load <2 x i32> addrspace(1)*, <2 x i32> addrspace(1)** %in1.addr, align 8, !dbg !37
+  %arrayidx = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %1, i64 %0, !dbg !37
+  %2 = load <2 x i32>, <2 x i32> addrspace(1)* %arrayidx, align 8, !dbg !37
+  store <2 x i32> %2, <2 x i32>* %a, align 8, !dbg !37
+  call void @llvm.dbg.declare(metadata <2 x i32>* %b, metadata !24, metadata !34), !dbg !38  ; !24 is the DILocalVariable for "b"
+  %3 = load i64, i64* %tid, align 8, !dbg !38
+  %4 = load <2 x i32> addrspace(1)*, <2 x i32> addrspace(1)** %in2.addr, align 8, !dbg !38
+  %arrayidx1 = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %4, i64 %3, !dbg !38
+  %5 = load <2 x i32>, <2 x i32> addrspace(1)* %arrayidx1, align 8, !dbg !38
+  store <2 x i32> %5, <2 x i32>* %b, align 8, !dbg !38
+  call void @llvm.dbg.declare(metadata <2 x i32>* %tmp, metadata !25, metadata !34), !dbg !39  ; !25 is the DILocalVariable for "tmp"
+  %6 = load <2 x i32>, <2 x i32>* %a, align 8, !dbg !39
+  %7 = load <2 x i32>, <2 x i32>* %b, align 8, !dbg !39
+  %mul = mul <2 x i32> %6, %7, !dbg !39  ; the vector multiply whose result is stored to %tmp
+  store <2 x i32> %mul, <2 x i32>* %tmp, align 8, !dbg !39
+  %8 = load <2 x i32>, <2 x i32>* %tmp, align 8, !dbg !40
+  %9 = load i64, i64* %tid, align 8, !dbg !40
+  %10 = load <2 x i32> addrspace(1)*, <2 x i32> addrspace(1)** %out.addr, align 8, !dbg !40
+  %arrayidx2 = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %10, i64 %9, !dbg !40
+  store <2 x i32> %8, <2 x i32> addrspace(1)* %arrayidx2, align 8, !dbg !40  ; final result written to out[tid]
+  ret void, !dbg !41
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+declare spir_func i64 @_Z13get_global_idj(i32) #2
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nobuiltin }
+
+!llvm.dbg.cu = !{!0}
+!opencl.kernels = !{!26}
+!llvm.module.flags = !{!32}
+!llvm.ident = !{!33}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.8.0 ", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2)
+!1 = !DIFile(filename: "<stdin>", directory: "Aorta/vecz_build")
+!2 = !{}
+!3 = !{!4}
+!4 = distinct !DISubprogram(name: "mul2", scope: !5, file: !5, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !15)
+!5 = !DIFile(filename: "kernel.opencl", directory: "Aorta/vecz_build")
+!6 = !DISubroutineType(types: !7)
+!7 = !{null, !8, !8, !8}
+!8 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !9, size: 64, align: 64)
+!9 = !DIDerivedType(tag: DW_TAG_typedef, name: "int2", file: !10, line: 63, baseType: !11)
+!10 = !DIFile(filename: "Aorta/OCL/modules/builtins/include/builtins/builtins.h", directory: "Aorta/vecz_build")
+!11 = !DICompositeType(tag: DW_TAG_array_type, baseType: !12, size: 64, align: 64, flags: DIFlagVector, elements: !13)
+!12 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+!13 = !{!14}
+!14 = !DISubrange(count: 2)
+!15 = !{!16, !17, !18, !19, !23, !24, !25}
+!16 = !DILocalVariable(name: "in1", arg: 1, scope: !4, file: !5, line: 1, type: !8)
+!17 = !DILocalVariable(name: "in2", arg: 2, scope: !4, file: !5, line: 1, type: !8)
+!18 = !DILocalVariable(name: "out", arg: 3, scope: !4, file: !5, line: 1, type: !8)
+!19 = !DILocalVariable(name: "tid", scope: !4, file: !5, line: 3, type: !20)
+!20 = !DIDerivedType(tag: DW_TAG_typedef, name: "size_t", file: !10, line: 33, baseType: !21)
+!21 = !DIDerivedType(tag: DW_TAG_typedef, name: "ulong", file: !10, line: 31, baseType: !22)
+!22 = !DIBasicType(name: "long unsigned int", size: 64, align: 64, encoding: DW_ATE_unsigned)
+!23 = !DILocalVariable(name: "a", scope: !4, file: !5, line: 4, type: !9)
+!24 = !DILocalVariable(name: "b", scope: !4, file: !5, line: 5, type: !9)
+!25 = !DILocalVariable(name: "tmp", scope: !4, file: !5, line: 6, type: !9)
+!26 = !{void (<2 x i32> addrspace(1)*, <2 x i32> addrspace(1)*, <2 x i32> addrspace(1)*)* @mul2, !27, !28, !29, !30, !31}
+!27 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1}
+!28 = !{!"kernel_arg_access_qual", !"none", !"none", !"none"}
+!29 = !{!"kernel_arg_type", !"int2*", !"int2*", !"int2*"}
+!30 = !{!"kernel_arg_base_type", !"int __attribute__((ext_vector_type(2)))*", !"int __attribute__((ext_vector_type(2)))*", !"int __attribute__((ext_vector_type(2)))*"}
+!31 = !{!"kernel_arg_type_qual", !"", !"", !""}
+!32 = !{i32 2, !"Debug Info Version", i32 3}
+!33 = !{!"clang version 3.8.0 "}
+!34 = !DIExpression()
+!35 = !DILocation(line: 1, scope: !4)
+!36 = !DILocation(line: 3, scope: !4)
+!37 = !DILocation(line: 4, scope: !4)
+!38 = !DILocation(line: 5, scope: !4)
+!39 = !DILocation(line: 6, scope: !4)
+!40 = !DILocation(line: 7, scope: !4)
+!41 = !DILocation(line: 8, scope: !4)
+
+; Vectorized kernel function
+; CHECK: @__vecz_v[[WIDTH:[0-9]+]]_mul2({{.*}} !dbg [[VECZ_SUBPROG:![0-9]+]]
+
+; Check that intrinsics for user variable locations are still present
+; CHECK: call void @llvm.dbg.value(metadata {{.*}} %in1, metadata [[DI_IN1:![0-9]+]], metadata [[EXPR:!DIExpression()]]
+; CHECK-SAME: !dbg [[PARAM_LOC:![0-9]+]]
+
+; CHECK: call void @llvm.dbg.value(metadata {{.*}} %in2, metadata [[DI_IN2:![0-9]+]], metadata [[EXPR]]
+; CHECK-SAME: !dbg [[PARAM_LOC]]
+
+; CHECK: call void @llvm.dbg.value(metadata {{.*}} %out, metadata [[DI_OUT:![0-9]+]], metadata [[EXPR]]
+; CHECK-SAME: !dbg [[PARAM_LOC]]
+
+; CHECK: call void @llvm.dbg.value(metadata i64 %call, metadata [[DI_TID:![0-9]+]], metadata [[EXPR]]
+; CHECK-SAME: !dbg [[TID_LOC:![0-9]+]]
+
+; CHECK: call void @llvm.dbg.declare(metadata ptr %a, metadata [[DI_A:![0-9]+]], metadata [[EXPR]]
+; CHECK-SAME:!dbg [[A_LOC:![0-9]+]]
+
+; CHECK: call void @llvm.dbg.declare(metadata ptr %b, metadata [[DI_B:![0-9]+]], metadata [[EXPR]]
+; CHECK-SAME:!dbg [[B_LOC:![0-9]+]]
+
+; CHECK: call void @llvm.dbg.declare(metadata ptr %tmp, metadata [[DI_TMP:![0-9]+]], metadata [[EXPR]]
+; CHECK-SAME:!dbg [[TMP_LOC:![0-9]+]]
+
+; Debug info metadata entries
+; CHECK:[[PTR_TYPE:![0-9]+]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: [[DI_INT2:![0-9]+]], size: 64, align: 64)
+; CHECK:[[DI_INT2]] = !DIDerivedType(tag: DW_TAG_typedef, name: "int2"
+
+; CHECK: [[VECZ_SUBPROG]] = distinct !DISubprogram(name: "mul2"
+; CHECK-SAME: retainedNodes: [[VECZ_VARS:![0-9]+]]
+
+; CHECK: [[VECZ_VARS]] = !{[[DI_IN1]], [[DI_IN2]], [[DI_OUT]], [[DI_TID]], [[DI_A]], [[DI_B]], [[DI_TMP]]}
+
+; CHECK: [[DI_IN1]] = !DILocalVariable(name: "in1", arg: 1, scope: [[VECZ_SUBPROG]],
+; CHECK-SAME:line: 1, type: [[PTR_TYPE]]
+
+; CHECK: [[DI_IN2]] = !DILocalVariable(name: "in2", arg: 2, scope: [[VECZ_SUBPROG]],
+; CHECK-SAME:line: 1, type: [[PTR_TYPE]]
+
+; CHECK: [[DI_OUT]] = !DILocalVariable(name: "out", arg: 3, scope: [[VECZ_SUBPROG]],
+; CHECK-SAME: line: 1, type: [[PTR_TYPE]]
+
+; CHECK: [[DI_TID]] = !DILocalVariable(name: "tid", scope: [[VECZ_SUBPROG]]
+; CHECK-SAME:line: 3
+
+; CHECK: [[DI_A]] = !DILocalVariable(name: "a", scope: [[VECZ_SUBPROG]],
+; CHECK-SAME:line: 4
+
+; CHECK: [[DI_B]] = !DILocalVariable(name: "b", scope: [[VECZ_SUBPROG]],
+; CHECK-SAME: line: 5
+
+; CHECK: [[DI_TMP]] = !DILocalVariable(name: "tmp", scope: [[VECZ_SUBPROG]],
+; CHECK-SAME: line: 6
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_instructions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_instructions.ll
new file mode 100644
index 0000000000000..936aab9504adc
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_instructions.ll
@@ -0,0 +1,142 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_instructions -vecz-passes=scalarize -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+define spir_kernel void @test_instructions(<4 x i32>* %pa, <4 x i32>* %pb, <4 x i32>* %pc) {  ; three <4 x i32> results per work-item: an add, a vector-vector compare and a vector-constant compare
+entry:
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)  ; global id in dimension 0; %a, %b and %c are all indexed by it
+  %a = getelementptr <4 x i32>, <4 x i32>* %pa, i64 %idx
+  %b = getelementptr <4 x i32>, <4 x i32>* %pb, i64 %idx
+  %c = getelementptr <4 x i32>, <4 x i32>* %pc, i64 %idx
+  %0 = load <4 x i32>, <4 x i32>* %a, align 16
+  %1 = load <4 x i32>, <4 x i32>* %b, align 16
+  %add = add <4 x i32> %1, %0  ; c[0] = b[0] + a[0], relative to the per-work-item base pointers
+  store <4 x i32> %add, <4 x i32>* %c, align 16
+  %arrayidx3 = getelementptr inbounds <4 x i32>, <4 x i32>* %a, i64 1
+  %2 = load <4 x i32>, <4 x i32>* %arrayidx3, align 16
+  %arrayidx4 = getelementptr inbounds <4 x i32>, <4 x i32>* %b, i64 1
+  %3 = load <4 x i32>, <4 x i32>* %arrayidx4, align 16
+  %cmp = icmp sgt <4 x i32> %2, %3  ; element-wise signed a[1] > b[1]
+  %sext = sext <4 x i1> %cmp to <4 x i32>  ; true lanes become -1 (all bits set), false lanes 0
+  %arrayidx5 = getelementptr inbounds <4 x i32>, <4 x i32>* %c, i64 1
+  store <4 x i32> %sext, <4 x i32>* %arrayidx5, align 16
+  %arrayidx6 = getelementptr inbounds <4 x i32>, <4 x i32>* %a, i64 2
+  %4 = load <4 x i32>, <4 x i32>* %arrayidx6, align 16
+  %cmp7 = icmp slt <4 x i32> %4, <i32 11, i32 12, i32 13, i32 14>  ; a different constant per element, so each lane's compare is distinguishable
+  %sext8 = sext <4 x i1> %cmp7 to <4 x i32>
+  %arrayidx9 = getelementptr inbounds <4 x i32>, <4 x i32>* %c, i64 2
+  store <4 x i32> %sext8, <4 x i32>* %arrayidx9, align 16
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_test_instructions(ptr %pa, ptr %pb, ptr %pc)
+; CHECK: entry:
+; CHECK: %[[A_0:.+]] = getelementptr i32, ptr %a, i32 0
+; CHECK: %[[A_1:.+]] = getelementptr i32, ptr %a, i32 1
+; CHECK: %[[A_2:.+]] = getelementptr i32, ptr %a, i32 2
+; CHECK: %[[A_3:.+]] = getelementptr i32, ptr %a, i32 3
+; CHECK: %[[LA_0:.+]] = load i32, ptr %[[A_0]]
+; CHECK: %[[LA_1:.+]] = load i32, ptr %[[A_1]]
+; CHECK: %[[LA_2:.+]] = load i32, ptr %[[A_2]]
+; CHECK: %[[LA_3:.+]] = load i32, ptr %[[A_3]]
+; CHECK: %[[B_0:.+]] = getelementptr i32, ptr %b, i32 0
+; CHECK: %[[B_1:.+]] = getelementptr i32, ptr %b, i32 1
+; CHECK: %[[B_2:.+]] = getelementptr i32, ptr %b, i32 2
+; CHECK: %[[B_3:.+]] = getelementptr i32, ptr %b, i32 3
+; CHECK: %[[LB_0:.+]] = load i32, ptr %[[B_0]]
+; CHECK: %[[LB_1:.+]] = load i32, ptr %[[B_1]]
+; CHECK: %[[LB_2:.+]] = load i32, ptr %[[B_2]]
+; CHECK: %[[LB_3:.+]] = load i32, ptr %[[B_3]]
+; CHECK: %[[ADD1:.+]] = add i32 %[[LB_0]], %[[LA_0]]
+; CHECK: %[[ADD2:.+]] = add i32 %[[LB_1]], %[[LA_1]]
+; CHECK: %[[ADD3:.+]] = add i32 %[[LB_2]], %[[LA_2]]
+; CHECK: %[[ADD4:.+]] = add i32 %[[LB_3]], %[[LA_3]]
+; CHECK: %[[C_0:.+]] = getelementptr i32, ptr %c, i32 0
+; CHECK: %[[C_1:.+]] = getelementptr i32, ptr %c, i32 1
+; CHECK: %[[C_2:.+]] = getelementptr i32, ptr %c, i32 2
+; CHECK: %[[C_3:.+]] = getelementptr i32, ptr %c, i32 3
+; CHECK: store i32 %[[ADD1]], ptr %[[C_0]]
+; CHECK: store i32 %[[ADD2]], ptr %[[C_1]]
+; CHECK: store i32 %[[ADD3]], ptr %[[C_2]]
+; CHECK: store i32 %[[ADD4]], ptr %[[C_3]]
+; CHECK: %arrayidx3 = getelementptr inbounds <4 x i32>, ptr %a, i64 1
+; CHECK: %[[A1_0:.+]] = getelementptr inbounds i32, ptr %arrayidx3, i32 0
+; CHECK: %[[A1_1:.+]] = getelementptr inbounds i32, ptr %arrayidx3, i32 1
+; CHECK: %[[A1_2:.+]] = getelementptr inbounds i32, ptr %arrayidx3, i32 2
+; CHECK: %[[A1_3:.+]] = getelementptr inbounds i32, ptr %arrayidx3, i32 3
+; CHECK: %[[LA1_0:.+]] = load i32, ptr %[[A1_0]]
+; CHECK: %[[LA1_1:.+]] = load i32, ptr %[[A1_1]]
+; CHECK: %[[LA1_2:.+]] = load i32, ptr %[[A1_2]]
+; CHECK: %[[LA1_3:.+]] = load i32, ptr %[[A1_3]]
+; CHECK: %arrayidx4 = getelementptr inbounds <4 x i32>, ptr %b, i64 1
+; CHECK: %[[B1_0:.+]] = getelementptr inbounds i32, ptr %arrayidx4, i32 0
+; CHECK: %[[B1_1:.+]] = getelementptr inbounds i32, ptr %arrayidx4, i32 1
+; CHECK: %[[B1_2:.+]] = getelementptr inbounds i32, ptr %arrayidx4, i32 2
+; CHECK: %[[B1_3:.+]] = getelementptr inbounds i32, ptr %arrayidx4, i32 3
+; CHECK: %[[LB1_0:.+]] = load i32, ptr %[[B1_0]]
+; CHECK: %[[LB1_1:.+]] = load i32, ptr %[[B1_1]]
+; CHECK: %[[LB1_2:.+]] = load i32, ptr %[[B1_2]]
+; CHECK: %[[LB1_3:.+]] = load i32, ptr %[[B1_3]]
+; CHECK: %[[CMP5:.+]] = icmp sgt i32 %[[LA1_0]], %[[LB1_0]]
+; CHECK: %[[CMP6:.+]] = icmp sgt i32 %[[LA1_1]], %[[LB1_1]]
+; CHECK: %[[CMP8:.+]] = icmp sgt i32 %[[LA1_2]], %[[LB1_2]]
+; CHECK: %[[CMP9:.+]] = icmp sgt i32 %[[LA1_3]], %[[LB1_3]]
+; CHECK: %[[SEXT10:.+]] = sext i1 %[[CMP5]] to i32
+; CHECK: %[[SEXT11:.+]] = sext i1 %[[CMP6]] to i32
+; CHECK: %[[SEXT12:.+]] = sext i1 %[[CMP8]] to i32
+; CHECK: %[[SEXT13:.+]] = sext i1 %[[CMP9]] to i32
+; CHECK: %arrayidx5 = getelementptr inbounds <4 x i32>, ptr %c, i64 1
+; CHECK: %[[C1_0:.+]] = getelementptr inbounds i32, ptr %arrayidx5, i32 0
+; CHECK: %[[C1_1:.+]] = getelementptr inbounds i32, ptr %arrayidx5, i32 1
+; CHECK: %[[C1_2:.+]] = getelementptr inbounds i32, ptr %arrayidx5, i32 2
+; CHECK: %[[C1_3:.+]] = getelementptr inbounds i32, ptr %arrayidx5, i32 3
+; CHECK: store i32 %[[SEXT10]], ptr %[[C1_0]]
+; CHECK: store i32 %[[SEXT11]], ptr %[[C1_1]]
+; CHECK: store i32 %[[SEXT12]], ptr %[[C1_2]]
+; CHECK: store i32 %[[SEXT13]], ptr %[[C1_3]]
+; CHECK: %arrayidx6 = getelementptr inbounds <4 x i32>, ptr %a, i64 2
+; CHECK: %[[A2_0:.+]] = getelementptr inbounds i32, ptr %arrayidx6, i32 0
+; CHECK: %[[A2_1:.+]] = getelementptr inbounds i32, ptr %arrayidx6, i32 1
+; CHECK: %[[A2_2:.+]] = getelementptr inbounds i32, ptr %arrayidx6, i32 2
+; CHECK: %[[A2_3:.+]] = getelementptr inbounds i32, ptr %arrayidx6, i32 3
+; CHECK: %[[LA2_0:.+]] = load i32, ptr %[[A2_0]]
+; CHECK: %[[LA2_1:.+]] = load i32, ptr %[[A2_1]]
+; CHECK: %[[LA2_2:.+]] = load i32, ptr %[[A2_2]]
+; CHECK: %[[LA2_3:.+]] = load i32, ptr %[[A2_3]]
+; CHECK: %[[CMP714:.+]] = icmp slt i32 %[[LA2_0]], 11
+; CHECK: %[[CMP715:.+]] = icmp slt i32 %[[LA2_1]], 12
+; CHECK: %[[CMP716:.+]] = icmp slt i32 %[[LA2_2]], 13
+; CHECK: %[[CMP717:.+]] = icmp slt i32 %[[LA2_3]], 14
+; CHECK: %[[SEXT818:.+]] = sext i1 %[[CMP714]] to i32
+; CHECK: %[[SEXT819:.+]] = sext i1 %[[CMP715]] to i32
+; CHECK: %[[SEXT820:.+]] = sext i1 %[[CMP716]] to i32
+; CHECK: %[[SEXT821:.+]] = sext i1 %[[CMP717]] to i32
+; CHECK: %arrayidx9 = getelementptr inbounds <4 x i32>, ptr %c, i64 2
+; CHECK: %[[C2_0:.+]] = getelementptr inbounds i32, ptr %arrayidx9, i32 0
+; CHECK: %[[C2_1:.+]] = getelementptr inbounds i32, ptr %arrayidx9, i32 1
+; CHECK: %[[C2_2:.+]] = getelementptr inbounds i32, ptr %arrayidx9, i32 2
+; CHECK: %[[C2_3:.+]] = getelementptr inbounds i32, ptr %arrayidx9, i32 3
+; CHECK: store i32 %[[SEXT818]], ptr %[[C2_0]]
+; CHECK: store i32 %[[SEXT819]], ptr %[[C2_1]]
+; CHECK: store i32 %[[SEXT820]], ptr %[[C2_2]]
+; CHECK: store i32 %[[SEXT821]], ptr %[[C2_3]]
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_instructions_uniform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_instructions_uniform.ll
new file mode 100644
index 0000000000000..93df5a9d99fa1
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_instructions_uniform.ll
@@ -0,0 +1,67 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_instructions -vecz-passes=scalarize -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test_instructions(<4 x i32>* %a, <4 x i32>* %b, <4 x i32>* %c) { ; kernel never queries a work-item ID
+entry:
+  %0 = load <4 x i32>, <4 x i32>* %a, align 16
+  %1 = load <4 x i32>, <4 x i32>* %b, align 16
+  %add = add <4 x i32> %1, %0 ; c[0] = b[0] + a[0]
+  store <4 x i32> %add, <4 x i32>* %c, align 16
+  %arrayidx3 = getelementptr inbounds <4 x i32>, <4 x i32>* %a, i64 1 ; &a[1]
+  %2 = load <4 x i32>, <4 x i32>* %arrayidx3, align 16
+  %arrayidx4 = getelementptr inbounds <4 x i32>, <4 x i32>* %b, i64 1 ; &b[1]
+  %3 = load <4 x i32>, <4 x i32>* %arrayidx4, align 16
+  %cmp = icmp sgt <4 x i32> %2, %3 ; lane-wise signed a[1] > b[1]
+  %sext = sext <4 x i1> %cmp to <4 x i32> ; true lanes become -1, false lanes 0
+  %arrayidx5 = getelementptr inbounds <4 x i32>, <4 x i32>* %c, i64 1 ; &c[1]
+  store <4 x i32> %sext, <4 x i32>* %arrayidx5, align 16
+  %arrayidx6 = getelementptr inbounds <4 x i32>, <4 x i32>* %a, i64 2 ; &a[2]
+  %4 = load <4 x i32>, <4 x i32>* %arrayidx6, align 16
+  %cmp7 = icmp slt <4 x i32> %4, <i32 11, i32 12, i32 13, i32 14> ; lane-wise signed a[2] < {11, 12, 13, 14}
+  %sext8 = sext <4 x i1> %cmp7 to <4 x i32>
+  %arrayidx9 = getelementptr inbounds <4 x i32>, <4 x i32>* %c, i64 2 ; &c[2]
+  store <4 x i32> %sext8, <4 x i32>* %arrayidx9, align 16
+  ret void
+}
+
+; Checks that this function gets vectorized, although because every instruction is
+; uniform, the process of vectorization makes no actual changes whatsoever!
+; CHECK: define spir_kernel void @__vecz_v4_test_instructions(ptr %a, ptr %b, ptr %c)
+; CHECK: entry:
+; CHECK: %[[LA:.+]] = load <4 x i32>, ptr %a, align 16
+; CHECK: %[[LB:.+]] = load <4 x i32>, ptr %b, align 16
+; CHECK: %[[ADD:.+]] = add <4 x i32> %[[LB]], %[[LA]]
+; CHECK: store <4 x i32> %[[ADD]], ptr %c, align 16
+; CHECK: %[[A1:.+]] = getelementptr inbounds <4 x i32>, ptr %a, i64 1
+; CHECK: %[[LA1:.+]] = load <4 x i32>, ptr %[[A1]], align 16
+; CHECK: %[[B1:.+]] = getelementptr inbounds <4 x i32>, ptr %b, i64 1
+; CHECK: %[[LB1:.+]] = load <4 x i32>, ptr %[[B1]], align 16
+; CHECK: %[[CMP:.+]] = icmp sgt <4 x i32> %[[LA1]], %[[LB1]]
+; CHECK: %[[SEXT:.+]] = sext <4 x i1> %[[CMP]] to <4 x i32>
+; CHECK: %[[C1:.+]] = getelementptr inbounds <4 x i32>, ptr %c, i64 1
+; CHECK: store <4 x i32> %[[SEXT]], ptr %[[C1]], align 16
+; CHECK: %[[A2:.+]] = getelementptr inbounds <4 x i32>, ptr %a, i64 2
+; CHECK: %[[LA2:.+]] = load <4 x i32>, ptr %[[A2]], align 16
+; CHECK: %[[CMP7:.+]] = icmp slt <4 x i32> %[[LA2]], <i32 11, i32 12, i32 13, i32 14>
+; CHECK: %[[SEXT8:.+]] = sext <4 x i1> %[[CMP7]] to <4 x i32>
+; CHECK: %[[C2:.+]] = getelementptr inbounds <4 x i32>, ptr %c, i64 2
+; CHECK: store <4 x i32> %[[SEXT8]], ptr %[[C2]], align 16
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_masked_load_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_masked_load_store.ll
new file mode 100644
index 0000000000000..890fa663c968f
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_masked_load_store.ll
@@ -0,0 +1,56 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -vecz-passes=scalarize -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+declare <2 x float> @__vecz_b_masked_load4_Dv2_fPDv2_fDv2_b(<2 x float>*, <2 x i1>)
+declare void @__vecz_b_masked_store4_Dv2_fPDv2_fDv2_b(<2 x float>, <2 x float>*, <2 x i1>)
+
+define spir_kernel void @scalarize_masked_memops(<2 x float>* %pa, <2 x float>* %pz) {
+entry:
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %head = insertelement <2 x i64> undef, i64 %idx, i64 0 ; %idx placed in lane 0
+  %splat = shufflevector <2 x i64> %head, <2 x i64> undef, <2 x i32> zeroinitializer ; lane 0 copied to both lanes
+  %idxs = add <2 x i64> %splat, <i64 0, i64 1> ; {%idx, %idx + 1}
+  %mask = icmp slt <2 x i64> %idxs, <i64 8, i64 8> ; per-lane predicate: index < 8
+  %aptr = getelementptr <2 x float>, <2 x float>* %pa, i64 %idx
+  %ld = call <2 x float> @__vecz_b_masked_load4_Dv2_fPDv2_fDv2_b(<2 x float>* %aptr, <2 x i1> %mask) ; builtin is only declared here; presumably reads just the lanes enabled in %mask
+  %zptr = getelementptr <2 x float>, <2 x float>* %pz, i64 %idx
+  call void @__vecz_b_masked_store4_Dv2_fPDv2_fDv2_b(<2 x float> %ld, <2 x float>* %zptr, <2 x i1> %mask) ; same %mask guards the store
+  ret void
+ ; CHECK:  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+ ; CHECK:  %[[IDXS0:.*]] = add i64 %idx, 0
+ ; CHECK:  %[[IDXS1:.*]] = add i64 %idx, 1
+ ; CHECK:  %[[MASK0:.*]] = icmp slt i64 %[[IDXS0]], 8
+ ; CHECK:  %[[MASK1:.*]] = icmp slt i64 %[[IDXS1]], 8
+ ; CHECK:  %aptr = getelementptr <2 x float>, ptr %pa, i64 %idx
+ ; CHECK:  %[[TMP1:.*]] = getelementptr float, ptr %aptr, i32 0
+ ; CHECK:  %[[TMP2:.*]] = getelementptr float, ptr %aptr, i32 1
+ ; CHECK:  %[[TMP3:.*]] = call float @__vecz_b_masked_load4_fu3ptrb(ptr %[[TMP1]], i1 %[[MASK0]])
+ ; CHECK:  %[[TMP4:.*]] = call float @__vecz_b_masked_load4_fu3ptrb(ptr %[[TMP2]], i1 %[[MASK1]])
+ ; CHECK:  %zptr = getelementptr <2 x float>, ptr %pz, i64 %idx
+ ; CHECK:  %[[TMP6:.*]] = getelementptr float, ptr %zptr, i32 0
+ ; CHECK:  %[[TMP7:.*]] = getelementptr float, ptr %zptr, i32 1
+ ; CHECK:  call void @__vecz_b_masked_store4_fu3ptrb(float %[[TMP3]], ptr %[[TMP6]], i1 %[[MASK0]])
+ ; CHECK:  call void @__vecz_b_masked_store4_fu3ptrb(float %[[TMP4]], ptr %[[TMP7]], i1 %[[MASK1]])
+ ; CHECK:  ret void
+
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-gather.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-gather.ll
new file mode 100644
index 0000000000000..eba06f982f35b
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-gather.ll
@@ -0,0 +1,55 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k splat -vecz-simd-width=4 -vecz-passes=scalarize -vecz-choices=FullScalarization -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define dso_local spir_kernel void @splat(i32 addrspace(1)* %data, i32 addrspace(1)* %out) {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 noundef 0)
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %data, i64 %call
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 ; single scalar load feeding every lane below
+  %splat.splatinsert = insertelement <4 x i32> poison, i32 %0, i64 0
+  %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer ; lane 0 copied to all four lanes
+  %add = add <4 x i32> %splat.splat, <i32 2, i32 3, i32 5, i32 7> ; a different constant per lane, so the result is no longer a splat
+  %call1 = tail call spir_func i32 @not_scalarizable(<4 x i32> noundef %add) ; callee takes the whole <4 x i32> as one argument
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32 noundef)
+declare spir_func i32 @not_scalarizable(<4 x i32> noundef)
+
+; It checks that the scalarizer scalarizes the add and reconstructs the vector
+; using insert element instructions to be consumed by the unscalarizable
+; function.
+; CHECK: void @__vecz_v4_splat({{.*}})
+; CHECK: entry:
+; CHECK:   %[[LD:.*]] = load i32
+; CHECK:   %[[ADD0:.*]] = add i32 %[[LD]]
+; CHECK:   %[[ADD1:.*]] = add i32 %[[LD]]
+; CHECK:   %[[ADD2:.*]] = add i32 %[[LD]]
+; CHECK:   %[[ADD3:.*]] = add i32 %[[LD]]
+; CHECK:   %[[INS0:.*]] = insertelement <4 x i32> {{undef|poison}}, i32 %[[ADD0]], i32 0
+; CHECK:   %[[INS1:.+]] = insertelement <4 x i32> %[[INS0]], i32 %[[ADD1]], i32 1
+; CHECK:   %[[INS2:.+]] = insertelement <4 x i32> %[[INS1]], i32 %[[ADD2]], i32 2
+; CHECK:   %[[INS3:.+]] = insertelement <4 x i32> %[[INS2]], i32 %[[ADD3]], i32 3
+; CHECK-NOT: shufflevector <4 x i32>
+; CHECK:   %{{.*}} = tail call spir_func i32 @not_scalarizable(<4 x i32> noundef %[[INS3]])
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-splat.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-splat.ll
new file mode 100644
index 0000000000000..9049e0dc02f77
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-splat.ll
@@ -0,0 +1,49 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k splat -vecz-simd-width=4 -vecz-passes=scalarize -vecz-choices=FullScalarization -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define dso_local spir_kernel void @splat(float addrspace(1)* %data, float addrspace(1)* %out) {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 noundef 0)
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %data, i64 %call
+  %0 = load float, float addrspace(1)* %arrayidx, align 4 ; the one scalar that is broadcast below
+  %splat.splatinsert = insertelement <4 x float> poison, float %0, i64 0
+  %splat.splat = shufflevector <4 x float> %splat.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer ; lane 0 copied to all four lanes
+  %call1 = tail call spir_func float @not_scalarizable(<4 x float> noundef %splat.splat) ; the splat is passed on unmodified as a whole vector
+  %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call
+  store float %call1, float addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32 noundef)
+declare spir_func float @not_scalarizable(<4 x float> noundef)
+
+; It checks that the scalarizer turns the original vector splat back into a vector splat,
+; instead of a series of insertelement instructions.
+; CHECK: void @__vecz_v4_splat({{.*}})
+; CHECK: entry:
+; CHECK:   %[[LD:.*]] = load float
+; CHECK:   %[[INS0:.*]] = insertelement <4 x float> {{undef|poison}}, float %[[LD]], {{i32|i64}} 0
+; CHECK-NOT: %{{.*}} = insertelement <4 x float> %{{.*}}, float %[[LD]], {{i32|i64}} 1
+; CHECK-NOT: %{{.*}} = insertelement <4 x float> %{{.*}}, float %[[LD]], {{i32|i64}} 2
+; CHECK-NOT: %{{.*}} = insertelement <4 x float> %{{.*}}, float %[[LD]], {{i32|i64}} 3
+; CHECK:   %[[SPLAT:.*]] = shufflevector <4 x float> %[[INS0]], <4 x float> {{undef|poison}}, <4 x i32> zeroinitializer
+; CHECK:   %{{.*}} = tail call spir_func float @not_scalarizable(<4 x float> noundef %[[SPLAT]])
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize_mixed_gep.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize_mixed_gep.ll
new file mode 100644
index 0000000000000..3cc0bb9c40984
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize_mixed_gep.ll
@@ -0,0 +1,46 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+; RUN: %veczc -k bar -vecz-simd-width=4 -S -o - %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+define void @bar(i64** %ptrptrs, i64 %val) {
+  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidxa = getelementptr inbounds i64*, i64** %ptrptrs, i64 %idx
+  %ptrs = load i64*, i64** %arrayidxa, align 4 ; one scalar pointer, selected by %idx
+  %addr = getelementptr inbounds i64, i64* %ptrs, <4 x i32> <i32 2, i32 2, i32 2, i32 2> ; scalar base with a vector index: yields <4 x i64*>
+
+  %elt0 = extractelement <4 x i64*> %addr, i32 0
+  %elt1 = extractelement <4 x i64*> %addr, i32 1
+  %elt2 = extractelement <4 x i64*> %addr, i32 2
+  %elt3 = extractelement <4 x i64*> %addr, i32 3
+
+  store i64 %val, i64* %elt0 ; %val is written through each of the four extracted pointers
+  store i64 %val, i64* %elt1
+  store i64 %val, i64* %elt2
+  store i64 %val, i64* %elt3
+  ret void
+}
+
+; It checks that the GEP with mixed scalar/vector operands in the kernel
+; gets scalarized/re-packetized correctly.
+
+; CHECK: define void @__vecz_v4_bar
+; CHECK: %[[ADDR:.+]] = getelementptr inbounds i64, <4 x ptr> %{{.+}}, i64 2
+; CHECK: call void @__vecz_b_scatter_store8_Dv4_mDv4_u3ptr(<4 x i64> %.splat{{.*}}, <4 x ptr> %[[ADDR]])
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scan_fact.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scan_fact.ll
new file mode 100644
index 0000000000000..bda25dd467c25
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scan_fact.ll
@@ -0,0 +1,197 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k scan_fact -vecz-passes=cfg-convert -S < %s | %filecheck %s
+
+; ModuleID = 'Unknown buffer'
+source_filename = "kernel.opencl"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+@scan_fact.temp = internal addrspace(3) global [16 x i32] undef, align 4
+
+; Function Attrs: convergent nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #0
+
+; Function Attrs: convergent nounwind readonly
+declare spir_func i64 @_Z12get_local_idj(i32) #0
+
+; Function Attrs: convergent nounwind readonly
+declare spir_func i64 @_Z14get_local_sizej(i32) #0
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @scan_fact(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+entry:
+  %call = call spir_func i64 @_Z12get_local_idj(i32 0) #3 ; get_local_id(0)
+  %call1 = call spir_func i64 @_Z13get_global_idj(i32 0) #3 ; get_global_id(0)
+  %call2 = call spir_func i64 @_Z14get_local_sizej(i32 0) #3 ; get_local_size(0)
+  %mul = shl i64 %call1, 1
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %mul
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %mul3 = shl i64 %call, 1
+  %arrayidx4 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* @scan_fact.temp, i64 0, i64 %mul3
+  store i32 %0, i32 addrspace(3)* %arrayidx4, align 4 ; temp[2 * local id] = in[2 * global id]
+  %mul5 = shl i64 %call1, 1
+  %add = or i64 %mul5, 1
+  %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %add
+  %1 = load i32, i32 addrspace(1)* %arrayidx6, align 4
+  %mul7 = shl i64 %call, 1
+  %add8 = or i64 %mul7, 1
+  %arrayidx9 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* @scan_fact.temp, i64 0, i64 %add8
+  store i32 %1, i32 addrspace(3)* %arrayidx9, align 4 ; temp[2 * local id + 1] = in[2 * global id + 1]
+  %mul10 = shl i64 %call, 1
+  %add11 = or i64 %mul10, 1
+  %arrayidx12 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* @scan_fact.temp, i64 0, i64 %add11
+  %2 = load i32, i32 addrspace(3)* %arrayidx12, align 4 ; kept in %2 and reused in if.then71
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %storemerge = phi i64 [ 1, %entry ], [ %mul29, %for.inc ] ; starts at 1, doubled in for.inc
+  %mul13 = shl i64 %call2, 1
+  %cmp = icmp ult i64 %storemerge, %mul13 ; continue while %storemerge < 2 * local size
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  call void @__mux_work_group_barrier(i32 1, i32 1, i32 272) #4 ; barrier inside the first loop body
+  %mul14 = shl i64 %call, 1
+  %mul15 = mul i64 %storemerge, %mul14
+  %mul16 = shl i64 %call2, 1
+  %cmp17 = icmp ult i64 %mul15, %mul16 ; depends on the local id through %mul14
+  br i1 %cmp17, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %mul18 = mul i64 %storemerge, 2
+  %add19 = add i64 %mul15, -1
+  %sub = add i64 %add19, %mul18
+  %arrayidx20 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* @scan_fact.temp, i64 0, i64 %sub
+  %3 = load i32, i32 addrspace(3)* %arrayidx20, align 4
+  %add21 = add i64 %mul15, -1
+  %sub22 = add i64 %add21, %storemerge
+  %arrayidx23 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* @scan_fact.temp, i64 0, i64 %sub22
+  %4 = load i32, i32 addrspace(3)* %arrayidx23, align 4
+  %mul24 = mul nsw i32 %4, %3 ; product of the two temp entries loaded above
+  %mul25 = mul i64 %storemerge, 2
+  %add26 = add i64 %mul15, -1
+  %sub27 = add i64 %add26, %mul25
+  %arrayidx28 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* @scan_fact.temp, i64 0, i64 %sub27
+  store i32 %mul24, i32 addrspace(3)* %arrayidx28, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.then, %for.body
+  %mul29 = shl i64 %storemerge, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %cmp30 = icmp eq i64 %call, 0 ; true only for local id 0
+  br i1 %cmp30, label %if.then31, label %if.end35
+
+if.then31:                                        ; preds = %for.end
+  %mul32 = mul i64 %call2, 2
+  %sub33 = add i64 %mul32, -1
+  %arrayidx34 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* @scan_fact.temp, i64 0, i64 %sub33
+  store i32 1, i32 addrspace(3)* %arrayidx34, align 4 ; temp[2 * local size - 1] = 1
+  br label %if.end35
+
+if.end35:                                         ; preds = %if.then31, %for.end
+  br label %for.cond37
+
+for.cond37:                                       ; preds = %for.inc62, %if.end35
+  %storemerge1 = phi i64 [ %call2, %if.end35 ], [ %shr, %for.inc62 ] ; starts at the local size, halved in for.inc62
+  %cmp38 = icmp eq i64 %storemerge1, 0
+  call void @__mux_work_group_barrier(i32 1, i32 1, i32 272) #4 ; barrier sits in the loop header, ahead of the exit branch
+  %mul64 = shl i64 %call, 1
+  br i1 %cmp38, label %for.end63, label %for.body39
+
+for.body39:                                       ; preds = %for.cond37
+  %mul42 = mul i64 %storemerge1, %mul64
+  %mul43 = shl i64 %call2, 1
+  %cmp44 = icmp ult i64 %mul42, %mul43 ; depends on the local id through %mul64
+  br i1 %cmp44, label %if.then45, label %for.inc62
+
+if.then45:                                        ; preds = %for.body39
+  %add46 = add i64 %mul42, -1
+  %sub47 = add i64 %add46, %storemerge1
+  %arrayidx48 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* @scan_fact.temp, i64 0, i64 %sub47
+  %5 = load i32, i32 addrspace(3)* %arrayidx48, align 4
+  %mul49 = mul i64 %storemerge1, 2
+  %add50 = add i64 %mul42, -1
+  %sub51 = add i64 %add50, %mul49
+  %arrayidx52 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* @scan_fact.temp, i64 0, i64 %sub51
+  %6 = load i32, i32 addrspace(3)* %arrayidx52, align 4
+  %add53 = add i64 %mul42, -1
+  %sub54 = add i64 %add53, %storemerge1
+  %arrayidx55 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* @scan_fact.temp, i64 0, i64 %sub54
+  store i32 %6, i32 addrspace(3)* %arrayidx55, align 4 ; slot read into %5 is overwritten with %6
+  %mul56 = mul nsw i32 %6, %5
+  %mul57 = mul i64 %storemerge1, 2
+  %add58 = add i64 %mul42, -1
+  %sub59 = add i64 %add58, %mul57
+  %arrayidx60 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* @scan_fact.temp, i64 0, i64 %sub59
+  store i32 %mul56, i32 addrspace(3)* %arrayidx60, align 4 ; slot read into %6 receives %6 * %5
+  br label %for.inc62
+
+for.inc62:                                        ; preds = %if.then45, %for.body39
+  %shr = lshr i64 %storemerge1, 1
+  br label %for.cond37
+
+for.end63:                                        ; preds = %for.cond37
+  %add65 = or i64 %mul64, 1
+  %arrayidx66 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* @scan_fact.temp, i64 0, i64 %add65
+  %7 = load i32, i32 addrspace(3)* %arrayidx66, align 4
+  %mul67 = shl i64 %call1, 1
+  %arrayidx68 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %mul67
+  store i32 %7, i32 addrspace(1)* %arrayidx68, align 4 ; out[2 * global id] = temp[2 * local id + 1]
+  %sub69 = add i64 %call2, -1
+  %cmp70 = icmp eq i64 %call, %sub69 ; true only when local id == local size - 1
+  br i1 %cmp70, label %if.then71, label %if.else
+
+if.then71:                                        ; preds = %for.end63
+  %mul72 = shl i64 %call, 1
+  %add73 = or i64 %mul72, 1
+  %arrayidx74 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* @scan_fact.temp, i64 0, i64 %add73
+  %8 = load i32, i32 addrspace(3)* %arrayidx74, align 4
+  %mul75 = mul nsw i32 %8, %2 ; %2 was loaded in the entry block
+  %mul76 = shl i64 %call1, 1
+  %add77 = or i64 %mul76, 1
+  %arrayidx78 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %add77
+  store i32 %mul75, i32 addrspace(1)* %arrayidx78, align 4
+  br label %if.end85
+
+if.else:                                          ; preds = %for.end63
+  %mul79 = mul i64 %call, 2
+  %add80 = add i64 %mul79, 2
+  %arrayidx81 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* @scan_fact.temp, i64 0, i64 %add80
+  %9 = load i32, i32 addrspace(3)* %arrayidx81, align 4
+  %mul82 = shl i64 %call1, 1
+  %add83 = or i64 %mul82, 1
+  %arrayidx84 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %add83
+  store i32 %9, i32 addrspace(1)* %arrayidx84, align 4 ; out[2 * global id + 1] = temp[2 * local id + 2]
+  br label %if.end85
+
+if.end85:                                         ; preds = %if.else, %if.then71
+  ret void
+}
+
+declare void @__mux_work_group_barrier(i32, i32, i32)
+
+; The purpose of this test is simply to make sure we manage to vectorize this
+; kernel. Previously we would not, because a phi node of a uniform loop has an
+; incoming value from a divergent block, even though all the incoming values
+; of the phi node are the same.
+; We would thus previously consider the phi node varying, and that would make
+; the loop divergent, with a barrier in it.
+
+; CHECK: spir_kernel void @__vecz_v4_scan_fact
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/secretly_scalar_load_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/secretly_scalar_load_store.ll
new file mode 100644
index 0000000000000..fd89387589746
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/secretly_scalar_load_store.ll
@@ -0,0 +1,49 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test -w 4 -S < %s | %filecheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
+target triple = "spir-unknown-unknown"
+
+declare spir_func i32 @get_global_id(i32);
+
+define spir_kernel void @test(i32 addrspace(1)* %in) {
+entry:
+  %gid = call spir_func i32 @get_global_id(i32 0) ; spir_func matches the convention on the declaration
+  %and = and i32 %gid, 1
+  %cmp = icmp eq i32 %and, 0 ; true when the low bit of %gid is clear
+  br i1 %cmp, label %if, label %early_ret
+
+early_ret:
+; just to prevent ROSCC from sticking its oar in
+  %gid1 = call spir_func i32 @get_global_id(i32 1) ; result is unused
+  ret void
+
+if:
+  %single_load = load i32, i32 addrspace(1)* %in ; address is the kernel argument itself, independent of %gid
+  %single_add = add i32 %single_load, 42
+  store i32 %single_add, i32 addrspace(1)* %in ; written back to the same address
+
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_test
+; CHECK: %[[BITCAST:.*]] = bitcast <4 x i1> %cmp{{[0-9]*}} to i4
+; CHECK: %[[MASK:.*]] = icmp ne i4 %[[BITCAST]], 0
+; CHECK: %[[single_load:single_load[0-9]*]] = call i32 @__vecz_b_masked_load4_ju3ptrU3AS1b(ptr addrspace(1) %in, i1 %[[MASK]])
+; CHECK: %[[single_add:single_add[0-9]*]] = add i32 %[[single_load]], 42
+; CHECK: call void @__vecz_b_masked_store4_ju3ptrU3AS1b(i32 %[[single_add]], ptr addrspace(1) %in, i1 %[[MASK]])
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/select-no-crash.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/select-no-crash.ll
new file mode 100644
index 0000000000000..a6559e6a728be
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/select-no-crash.ll
@@ -0,0 +1,93 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -vecz-fail-quietly -k test -vecz-passes="cfg-convert" -S < %s
+
+; This tests only that the kernel does not crash the vectorizer.
+
+; ModuleID = 'Unknown buffer'
+source_filename = "kernel.opencl"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @test(i32 addrspace(1)* %out, i32 %n) #0 {
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %conv = trunc i64 %call to i32 ; the only value in this kernel derived from the global id
+  br label %while.body
+
+while.body:                                       ; preds = %e, %entry
+  %n.off = add i32 %n, -1
+  %0 = icmp ult i32 %n.off, 4
+  %cmp6 = icmp slt i32 %n, 3
+  %or.cond1 = or i1 %cmp6, %0 ; computed from the kernel argument %n alone
+  br i1 %or.cond1, label %f, label %if.else
+
+while.body5:                                      ; preds = %d
+  switch i32 %n, label %g [
+    i32 3, label %if.else
+    i32 2, label %h
+  ]
+
+if.else:                                          ; preds = %while.body5, %while.body
+  %cmp9 = icmp sge i32 %conv, %n ; the one comparison that involves %conv
+  %and = and i32 %n, 1
+  %tobool = icmp eq i32 %and, 0
+  %or.cond2 = or i1 %tobool, %cmp9 ; so this is the only branch condition that depends on the global id
+  br i1 %or.cond2, label %d, label %h
+
+d:                                                ; preds = %if.else
+  %cmp16 = icmp sgt i32 %n, 3
+  br i1 %cmp16, label %e, label %while.body5
+
+e:                                                ; preds = %d
+  %and20 = and i32 %n, 1
+  %tobool21 = icmp eq i32 %and20, 0
+  br i1 %tobool21, label %while.body, label %g
+
+f:                                                ; preds = %while.body
+  %cmp24 = icmp eq i32 %n, 2
+  br i1 %cmp24, label %h, label %g
+
+g:                                                ; preds = %f, %e, %while.body5
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %g
+  %ret.0 = phi i32 [ 0, %g ], [ %inc, %for.body ]
+  %storemerge = phi i32 [ 0, %g ], [ %inc31, %for.body ]
+  %cmp29 = icmp sgt i32 %storemerge, %n ; leave the counting loop once the counter exceeds %n
+  br i1 %cmp29, label %h, label %for.body
+
+for.body:                                         ; preds = %for.cond
+  %inc = add nuw nsw i32 %ret.0, 1
+  %inc31 = add nuw nsw i32 %storemerge, 1
+  br label %for.cond
+
+h:                                                ; preds = %for.cond, %f, %if.else, %while.body5
+  %ret.1 = phi i32 [ 0, %f ], [ %ret.0, %for.cond ], [ 0, %if.else ], [ 0, %while.body5 ] ; 0 on every path except the one through for.cond
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %ret.1, i32 addrspace(1)* %arrayidx, align 4 ; out[global id] = %ret.1
+  ret void
+}
+
+; Function Attrs: convergent nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { convergent nobuiltin nounwind readonly }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_1.ll
new file mode 100644
index 0000000000000..310c502fe1ad9
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_1.ll
@@ -0,0 +1,54 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @load16(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %stride) {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %conv = trunc i64 %call to i32
+  %call1 = tail call spir_func i64 @_Z13get_global_idj(i32 1)
+  %conv2 = trunc i64 %call1 to i32
+  %mul = mul nsw i32 %conv2, %stride
+  %add = add nsw i32 %mul, %conv
+  %mul3 = shl nsw i32 %add, 1
+  %idx.ext = sext i32 %mul3 to i64
+  %add.ptr = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 %idx.ext
+  %0 = load i8, i8 addrspace(1)* %add.ptr, align 1
+  %arrayidx4 = getelementptr inbounds i8, i8 addrspace(1)* %add.ptr, i64 1
+  %1 = load i8, i8 addrspace(1)* %arrayidx4, align 1
+  %add7 = add i8 %1, %0
+  %idxprom = sext i32 %add to i64
+  %arrayidx11 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 %idxprom
+  store i8 %add7, i8 addrspace(1)* %arrayidx11, align 1
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CHECK: spir_kernel void @load16
+; CHECK: load <4 x i8>
+; CHECK: load <4 x i8>
+; CHECK-NOT: load <4 x i8>
+; CHECK-NOT: call <4 x i8> @__vecz_b_interleaved_load
+; CHECK-NOT: call <4 x i8> @__vecz_b_gather_load
+; CHECK: shufflevector <4 x i8>
+; CHECK: shufflevector <4 x i8>
+; CHECK-NOT: shufflevector <4 x i8>
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_2.ll
new file mode 100644
index 0000000000000..785db3ed704af
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_2.ll
@@ -0,0 +1,55 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @load16(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %stride) {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %conv = trunc i64 %call to i32
+  %call1 = tail call spir_func i64 @_Z13get_global_idj(i32 1)
+  %conv2 = trunc i64 %call1 to i32
+  %mul = mul nsw i32 %conv2, %stride
+  %add = add nsw i32 %mul, %conv
+  %mul3 = shl nsw i32 %add, 1
+  %conv4 = sext i32 %mul3 to i64
+  %arrayidx = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 %conv4
+  %0 = load i8, i8 addrspace(1)* %arrayidx, align 1
+  %add5 = or i64 %conv4, 1
+  %arrayidx6 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 %add5
+  %1 = load i8, i8 addrspace(1)* %arrayidx6, align 1
+  %add9 = add i8 %1, %0
+  %idxprom = sext i32 %add to i64
+  %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 %idxprom
+  store i8 %add9, i8 addrspace(1)* %arrayidx13, align 1
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CHECK: spir_kernel void @load16
+; CHECK: load <4 x i8>
+; CHECK: load <4 x i8>
+; CHECK-NOT: load <4 x i8>
+; CHECK-NOT: call <4 x i8> @__vecz_b_interleaved_load
+; CHECK-NOT: call <4 x i8> @__vecz_b_gather_load
+; CHECK: shufflevector <4 x i8>
+; CHECK: shufflevector <4 x i8>
+; CHECK-NOT: shufflevector <4 x i8>
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_3.ll
new file mode 100644
index 0000000000000..e2f37483be09e
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_3.ll
@@ -0,0 +1,56 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @load16(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %stride) {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %conv = trunc i64 %call to i32
+  %call1 = tail call spir_func i64 @_Z13get_global_idj(i32 1)
+  %conv2 = trunc i64 %call1 to i32
+  %mul = mul nsw i32 %conv2, %stride
+  %add = add nsw i32 %mul, %conv
+  %mul3 = shl nsw i32 %add, 1
+  %idxprom = sext i32 %mul3 to i64
+  %arrayidx = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 %idxprom
+  %0 = load i8, i8 addrspace(1)* %arrayidx, align 1
+  %add7 = or i32 %mul3, 1
+  %idxprom8 = sext i32 %add7 to i64
+  %arrayidx9 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 %idxprom8
+  %1 = load i8, i8 addrspace(1)* %arrayidx9, align 1
+  %add12 = add i8 %1, %0
+  %idxprom16 = sext i32 %add to i64
+  %arrayidx17 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 %idxprom16
+  store i8 %add12, i8 addrspace(1)* %arrayidx17, align 1
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CHECK: spir_kernel void @load16
+; CHECK: load <4 x i8>
+; CHECK: load <4 x i8>
+; CHECK-NOT: load <4 x i8>
+; CHECK-NOT: call <4 x i8> @__vecz_b_interleaved_load
+; CHECK-NOT: call <4 x i8> @__vecz_b_gather_load
+; CHECK: shufflevector <4 x i8>
+; CHECK: shufflevector <4 x i8>
+; CHECK-NOT: shufflevector <4 x i8>
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_4.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_4.ll
new file mode 100644
index 0000000000000..8c1f4bf96a105
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_4.ll
@@ -0,0 +1,56 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @load16(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %stride) {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %conv = trunc i64 %call to i32
+  %call1 = tail call spir_func i64 @_Z13get_global_idj(i32 1)
+  %conv2 = trunc i64 %call1 to i32
+  %mul = mul nsw i32 %conv2, %stride
+  %add = add nsw i32 %mul, %conv
+  %mul3 = shl nsw i32 %add, 1
+  %add4 = or i32 %mul3, 1
+  %idxprom = sext i32 %add4 to i64
+  %arrayidx = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 %idxprom
+  %0 = load i8, i8 addrspace(1)* %arrayidx, align 1
+  %idxprom8 = sext i32 %mul3 to i64
+  %arrayidx9 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 %idxprom8
+  %1 = load i8, i8 addrspace(1)* %arrayidx9, align 1
+  %sub = sub i8 %0, %1
+  %idxprom15 = sext i32 %add to i64
+  %arrayidx16 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 %idxprom15
+  store i8 %sub, i8 addrspace(1)* %arrayidx16, align 1
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CHECK: spir_kernel void @load16
+; CHECK: load <4 x i8>
+; CHECK: load <4 x i8>
+; CHECK-NOT: load <4 x i8>
+; CHECK-NOT: call <4 x i8> @__vecz_b_interleaved_load
+; CHECK-NOT: call <4 x i8> @__vecz_b_gather_load
+; CHECK: shufflevector <4 x i8>
+; CHECK: shufflevector <4 x i8>
+; CHECK-NOT: shufflevector <4 x i8>
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_5.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_5.ll
new file mode 100644
index 0000000000000..e541aaddc8515
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_5.ll
@@ -0,0 +1,70 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @load16(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %stride) {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %conv = trunc i64 %call to i32
+  %call1 = tail call spir_func i64 @_Z13get_global_idj(i32 1)
+  %conv2 = trunc i64 %call1 to i32
+  %mul = mul nsw i32 %conv2, %stride
+  %add = add nsw i32 %mul, %conv
+  %mul3 = shl nsw i32 %add, 1
+  %idxprom = sext i32 %mul3 to i64
+  %arrayidx = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 %idxprom
+  %0 = load i8, i8 addrspace(1)* %arrayidx, align 1
+  %add7 = or i32 %mul3, 1
+  %idxprom8 = sext i32 %add7 to i64
+  %arrayidx9 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 %idxprom8
+  %1 = load i8, i8 addrspace(1)* %arrayidx9, align 1
+  %add13 = add nsw i32 %mul3, 2
+  %idxprom14 = sext i32 %add13 to i64
+  %arrayidx15 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 %idxprom14
+  %2 = load i8, i8 addrspace(1)* %arrayidx15, align 1
+  %add19 = add nsw i32 %mul3, 3
+  %idxprom20 = sext i32 %add19 to i64
+  %arrayidx21 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 %idxprom20
+  %3 = load i8, i8 addrspace(1)* %arrayidx21, align 1
+  %add24 = add i8 %1, %0
+  %add26 = add i8 %add24, %2
+  %add28 = add i8 %add26, %3
+  %idxprom32 = sext i32 %add to i64
+  %arrayidx33 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 %idxprom32
+  store i8 %add28, i8 addrspace(1)* %arrayidx33, align 1
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CHECK: spir_kernel void @load16
+; CHECK: load <4 x i8>
+; CHECK: load <4 x i8>
+; CHECK: shufflevector <4 x i8>
+; CHECK: shufflevector <4 x i8>
+; CHECK: load <4 x i8>
+; CHECK: load <4 x i8>
+; CHECK: shufflevector <4 x i8>
+; CHECK: shufflevector <4 x i8>
+; CHECK-NOT: load <4 x i8>
+; CHECK-NOT: call <4 x i8> @__vecz_b_interleaved_load
+; CHECK-NOT: call <4 x i8> @__vecz_b_gather_load
+; CHECK-NOT: shufflevector <4 x i8>
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_6.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_6.ll
new file mode 100644
index 0000000000000..f0e79d8b30ce1
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_6.ll
@@ -0,0 +1,58 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @load16(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %stride) {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %conv = trunc i64 %call to i32
+  %call1 = tail call spir_func i64 @_Z13get_global_idj(i32 1)
+  %conv2 = trunc i64 %call1 to i32
+  %mul = mul nsw i32 %conv2, %stride
+  %add = add nsw i32 %mul, %conv
+  %mul3 = shl nsw i32 %add, 1
+  %add4 = add nsw i32 %mul3, 3
+  %idxprom = sext i32 %add4 to i64
+  %arrayidx = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 %idxprom
+  %0 = load i8, i8 addrspace(1)* %arrayidx, align 1
+  %shl = shl i8 %0, 1
+  %add10 = add nsw i32 %mul3, 2
+  %idxprom11 = sext i32 %add10 to i64
+  %arrayidx12 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 %idxprom11
+  %1 = load i8, i8 addrspace(1)* %arrayidx12, align 1
+  %sub = sub i8 %shl, %1
+  %idxprom18 = sext i32 %add to i64
+  %arrayidx19 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 %idxprom18
+  store i8 %sub, i8 addrspace(1)* %arrayidx19, align 1
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; CHECK: spir_kernel void @load16
+; CHECK: load <4 x i8>
+; CHECK: load <4 x i8>
+; CHECK-NOT: load <4 x i8>
+; CHECK-NOT: call <4 x i8> @__vecz_b_interleaved_load
+; CHECK-NOT: call <4 x i8> @__vecz_b_gather_load
+; CHECK: shufflevector <4 x i8>
+; CHECK: shufflevector <4 x i8>
+; CHECK-NOT: shufflevector <4 x i8>
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext.ll
new file mode 100644
index 0000000000000..5d7a033d24799
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext.ll
@@ -0,0 +1,69 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k squash -vecz-choices=TargetIndependentPacketization -vecz-passes="squash-small-vecs,function(dce),packetizer" -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @squash(<4 x i8> addrspace(1)* %data, i32 addrspace(1)* %output) #0 {
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i64 0) #2
+  %data.ptr = getelementptr inbounds <4 x i8>, <4 x i8> addrspace(1)* %data, i64 %gid
+  %data.ld = load <4 x i8>, <4 x i8> addrspace(1)* %data.ptr, align 8
+  %ele0 = extractelement <4 x i8> %data.ld, i32 0
+  %ele1 = extractelement <4 x i8> %data.ld, i32 1
+  %ele2 = extractelement <4 x i8> %data.ld, i32 2
+  %ele3 = extractelement <4 x i8> %data.ld, i32 3
+  %zext0 = sext i8 %ele0 to i32
+  %zext1 = sext i8 %ele1 to i32
+  %zext2 = sext i8 %ele2 to i32
+  %zext3 = sext i8 %ele3 to i32
+  %sum1 = add i32 %zext0, %zext1
+  %sum2 = xor i32 %sum1, %zext2
+  %sum3 = and i32 %sum2, %zext3
+  %output.ptr = getelementptr inbounds i32, i32 addrspace(1)* %output, i64 %gid
+  store i32 %sum3, i32 addrspace(1)* %output.ptr, align 8
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i64) #1
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nobuiltin nounwind }
+
+; It checks that the <4 x i8> is converted into an i32 and uses shifts
+; to implement the extract elements and sexts.
+;
+; CHECK: void @__vecz_v4_squash
+; CHECK:  %[[DATA:.+]] = load <16 x i8>
+; CHECK-NOT: shufflevector
+; CHECK:  %[[FREEZE:.+]] = freeze <16 x i8> %[[DATA]]
+; CHECK:  %[[SQUASH:.+]] = bitcast <16 x i8> %[[FREEZE]] to <4 x i32>
+; CHECK:  %[[EXTR0:.+]] = shl <4 x i32> %[[SQUASH]], <i32 24, i32 24, i32 24, i32 24>
+; CHECK:  %[[SEXT0:.+]] = ashr <4 x i32> %[[EXTR0]], <i32 24, i32 24, i32 24, i32 24>
+; CHECK:  %[[EXTR1:.+]] = shl <4 x i32> %[[SQUASH]], <i32 16, i32 16, i32 16, i32 16>
+; CHECK:  %[[SEXT1:.+]] = ashr <4 x i32> %[[EXTR1]], <i32 24, i32 24, i32 24, i32 24>
+; CHECK:  %[[EXTR2:.+]] = shl <4 x i32> %[[SQUASH]], <i32 8, i32 8, i32 8, i32 8>
+; CHECK:  %[[SEXT2:.+]] = ashr <4 x i32> %[[EXTR2]], <i32 24, i32 24, i32 24, i32 24>
+; CHECK:  %[[SEXT3:.+]] = ashr <4 x i32> %[[SQUASH]], <i32 24, i32 24, i32 24, i32 24>
+; CHECK:  %[[SUM1:.+]] = add <4 x i32> %[[SEXT0]], %[[SEXT1]]
+; CHECK:  %[[SUM2:.+]] = xor <4 x i32> %[[SUM1]], %[[SEXT2]]
+; CHECK:  %[[SUM3:.+]] = and <4 x i32> %[[SUM2]], %[[SEXT3]]
+; CHECK:  ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext_bigendian.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext_bigendian.ll
new file mode 100644
index 0000000000000..0e1e562e4e99d
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext_bigendian.ll
@@ -0,0 +1,69 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k squash -vecz-choices=TargetIndependentPacketization -vecz-passes="squash-small-vecs,function(dce),packetizer" -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "E-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @squash(<4 x i8> addrspace(1)* %data, i32 addrspace(1)* %output) #0 {
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i64 0) #2
+  %data.ptr = getelementptr inbounds <4 x i8>, <4 x i8> addrspace(1)* %data, i64 %gid
+  %data.ld = load <4 x i8>, <4 x i8> addrspace(1)* %data.ptr, align 8
+  %ele0 = extractelement <4 x i8> %data.ld, i32 3
+  %ele1 = extractelement <4 x i8> %data.ld, i32 2
+  %ele2 = extractelement <4 x i8> %data.ld, i32 1
+  %ele3 = extractelement <4 x i8> %data.ld, i32 0
+  %zext0 = sext i8 %ele0 to i32
+  %zext1 = sext i8 %ele1 to i32
+  %zext2 = sext i8 %ele2 to i32
+  %zext3 = sext i8 %ele3 to i32
+  %sum1 = add i32 %zext0, %zext1
+  %sum2 = xor i32 %sum1, %zext2
+  %sum3 = and i32 %sum2, %zext3
+  %output.ptr = getelementptr inbounds i32, i32 addrspace(1)* %output, i64 %gid
+  store i32 %sum3, i32 addrspace(1)* %output.ptr, align 8
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i64) #1
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nobuiltin nounwind }
+
+; It checks that the <4 x i8> is converted into an i32 and uses shifts
+; to implement the extract elements and sexts.
+;
+; CHECK: void @__vecz_v4_squash
+; CHECK:  %[[DATA:.+]] = load <16 x i8>
+; CHECK-NOT: shufflevector
+; CHECK:  %[[FREEZE:.+]] = freeze <16 x i8> %[[DATA]]
+; CHECK:  %[[SQUASH:.+]] = bitcast <16 x i8> %[[FREEZE]] to <4 x i32>
+; CHECK:  %[[EXTR0:.+]] = shl <4 x i32> %[[SQUASH]], <i32 24, i32 24, i32 24, i32 24>
+; CHECK:  %[[SEXT0:.+]] = ashr <4 x i32> %[[EXTR0]], <i32 24, i32 24, i32 24, i32 24>
+; CHECK:  %[[EXTR1:.+]] = shl <4 x i32> %[[SQUASH]], <i32 16, i32 16, i32 16, i32 16>
+; CHECK:  %[[SEXT1:.+]] = ashr <4 x i32> %[[EXTR1]], <i32 24, i32 24, i32 24, i32 24>
+; CHECK:  %[[EXTR2:.+]] = shl <4 x i32> %[[SQUASH]], <i32 8, i32 8, i32 8, i32 8>
+; CHECK:  %[[SEXT2:.+]] = ashr <4 x i32> %[[EXTR2]], <i32 24, i32 24, i32 24, i32 24>
+; CHECK:  %[[SEXT3:.+]] = ashr <4 x i32> %[[SQUASH]], <i32 24, i32 24, i32 24, i32 24>
+; CHECK:  %[[SUM1:.+]] = add <4 x i32> %[[SEXT0]], %[[SEXT1]]
+; CHECK:  %[[SUM2:.+]] = xor <4 x i32> %[[SUM1]], %[[SEXT2]]
+; CHECK:  %[[SUM3:.+]] = and <4 x i32> %[[SUM2]], %[[SEXT3]]
+; CHECK:  ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext.ll
new file mode 100644
index 0000000000000..2bc9e829aaef9
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext.ll
@@ -0,0 +1,69 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k squash -vecz-choices=TargetIndependentPacketization -vecz-passes="squash-small-vecs,function(dce),packetizer" -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @squash(<4 x i8> addrspace(1)* %data, i32 addrspace(1)* %output) #0 {
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i64 0) #2
+  %data.ptr = getelementptr inbounds <4 x i8>, <4 x i8> addrspace(1)* %data, i64 %gid
+  %data.ld = load <4 x i8>, <4 x i8> addrspace(1)* %data.ptr, align 8
+  %ele0 = extractelement <4 x i8> %data.ld, i32 0
+  %ele1 = extractelement <4 x i8> %data.ld, i32 1
+  %ele2 = extractelement <4 x i8> %data.ld, i32 2
+  %ele3 = extractelement <4 x i8> %data.ld, i32 3
+  %zext0 = zext i8 %ele0 to i32
+  %zext1 = zext i8 %ele1 to i32
+  %zext2 = zext i8 %ele2 to i32
+  %zext3 = zext i8 %ele3 to i32
+  %sum1 = add i32 %zext0, %zext1
+  %sum2 = xor i32 %sum1, %zext2
+  %sum3 = and i32 %sum2, %zext3
+  %output.ptr = getelementptr inbounds i32, i32 addrspace(1)* %output, i64 %gid
+  store i32 %sum3, i32 addrspace(1)* %output.ptr, align 8
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i64) #1
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nobuiltin nounwind }
+
+; It checks that the <4 x i8> is converted into an i32 and uses shifts and masks
+; to implement the extract elements and zexts.
+;
+; CHECK: void @__vecz_v4_squash
+; CHECK:  %[[DATA:.+]] = load <16 x i8>
+; CHECK-NOT: shufflevector
+; CHECK:  %[[FREEZE:.+]] = freeze <16 x i8> %[[DATA]]
+; CHECK:  %[[SQUASH:.+]] = bitcast <16 x i8> %[[FREEZE]] to <4 x i32>
+; CHECK:  %[[ZEXT0:.+]] = and <4 x i32> %[[SQUASH]], <i32 255, i32 255, i32 255, i32 255>
+; CHECK:  %[[EXTR1:.+]] = lshr <4 x i32> %[[SQUASH]], <i32 8, i32 8, i32 8, i32 8>
+; CHECK:  %[[ZEXT1:.+]] = and <4 x i32> %[[EXTR1]], <i32 255, i32 255, i32 255, i32 255>
+; CHECK:  %[[EXTR2:.+]] = lshr <4 x i32> %[[SQUASH]], <i32 16, i32 16, i32 16, i32 16>
+; CHECK:  %[[ZEXT2:.+]] = and <4 x i32> %[[EXTR2]], <i32 255, i32 255, i32 255, i32 255>
+; CHECK:  %[[EXTR3:.+]] = lshr <4 x i32> %[[SQUASH]], <i32 24, i32 24, i32 24, i32 24>
+; CHECK:  %[[ZEXT3:.+]] = and <4 x i32> %[[EXTR3]], <i32 255, i32 255, i32 255, i32 255>
+; CHECK:  %[[SUM1:.+]] = add <4 x i32> %[[ZEXT0]], %[[ZEXT1]]
+; CHECK:  %[[SUM2:.+]] = xor <4 x i32> %[[SUM1]], %[[ZEXT2]]
+; CHECK:  %[[SUM3:.+]] = and <4 x i32> %[[SUM2]], %[[ZEXT3]]
+; CHECK:  ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext_bigendian.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext_bigendian.ll
new file mode 100644
index 0000000000000..4b28f9a3722d6
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext_bigendian.ll
@@ -0,0 +1,69 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k squash -vecz-choices=TargetIndependentPacketization -vecz-passes="squash-small-vecs,function(dce),packetizer" -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "E-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @squash(<4 x i8> addrspace(1)* %data, i32 addrspace(1)* %output) #0 {
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i64 0) #2
+  %data.ptr = getelementptr inbounds <4 x i8>, <4 x i8> addrspace(1)* %data, i64 %gid
+  %data.ld = load <4 x i8>, <4 x i8> addrspace(1)* %data.ptr, align 8
+  %ele0 = extractelement <4 x i8> %data.ld, i32 3
+  %ele1 = extractelement <4 x i8> %data.ld, i32 2
+  %ele2 = extractelement <4 x i8> %data.ld, i32 1
+  %ele3 = extractelement <4 x i8> %data.ld, i32 0
+  %zext0 = zext i8 %ele0 to i32
+  %zext1 = zext i8 %ele1 to i32
+  %zext2 = zext i8 %ele2 to i32
+  %zext3 = zext i8 %ele3 to i32
+  %sum1 = add i32 %zext0, %zext1
+  %sum2 = xor i32 %sum1, %zext2
+  %sum3 = and i32 %sum2, %zext3
+  %output.ptr = getelementptr inbounds i32, i32 addrspace(1)* %output, i64 %gid
+  store i32 %sum3, i32 addrspace(1)* %output.ptr, align 8
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i64) #1
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nobuiltin nounwind }
+
+; It checks that the <4 x i8> is converted into an i32 and uses shifts and masks
+; to implement the extract elements and zexts.
+;
+; CHECK: void @__vecz_v4_squash
+; CHECK:  %[[DATA:.+]] = load <16 x i8>
+; CHECK-NOT: shufflevector
+; CHECK:  %[[FREEZE:.+]] = freeze <16 x i8> %[[DATA]]
+; CHECK:  %[[SQUASH:.+]] = bitcast <16 x i8> %[[FREEZE]] to <4 x i32>
+; CHECK:  %[[ZEXT0:.+]] = and <4 x i32> %[[SQUASH]], <i32 255, i32 255, i32 255, i32 255>
+; CHECK:  %[[EXTR1:.+]] = lshr <4 x i32> %[[SQUASH]], <i32 8, i32 8, i32 8, i32 8>
+; CHECK:  %[[ZEXT1:.+]] = and <4 x i32> %[[EXTR1]], <i32 255, i32 255, i32 255, i32 255>
+; CHECK:  %[[EXTR2:.+]] = lshr <4 x i32> %[[SQUASH]], <i32 16, i32 16, i32 16, i32 16>
+; CHECK:  %[[ZEXT2:.+]] = and <4 x i32> %[[EXTR2]], <i32 255, i32 255, i32 255, i32 255>
+; CHECK:  %[[EXTR3:.+]] = lshr <4 x i32> %[[SQUASH]], <i32 24, i32 24, i32 24, i32 24>
+; CHECK:  %[[ZEXT3:.+]] = and <4 x i32> %[[EXTR3]], <i32 255, i32 255, i32 255, i32 255>
+; CHECK:  %[[SUM1:.+]] = add <4 x i32> %[[ZEXT0]], %[[ZEXT1]]
+; CHECK:  %[[SUM2:.+]] = xor <4 x i32> %[[SUM1]], %[[ZEXT2]]
+; CHECK:  %[[SUM3:.+]] = and <4 x i32> %[[SUM2]], %[[ZEXT3]]
+; CHECK:  ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_float2_gather.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_float2_gather.ll
new file mode 100644
index 0000000000000..5d4a03930acf7
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_float2_gather.ll
@@ -0,0 +1,56 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k squash -vecz-passes="squash-small-vecs,packetizer" -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @squash(i64 addrspace(1)* %idx, <2 x float> addrspace(1)* %data, <2 x float> addrspace(1)* %output) #0 { ; kernel under test: output[gid] = data[idx[gid]]
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i64 0) #2 ; global id in dimension 0
+  %idx.ptr = getelementptr inbounds i64, i64 addrspace(1)* %idx, i64 %gid ; &idx[gid]
+  %idx.ld = load i64, i64 addrspace(1)* %idx.ptr, align 8 ; index value read from memory
+  %data.ptr = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %data, i64 %idx.ld ; &data[idx[gid]]: the address depends on a loaded value
+  %data.ld = load <2 x float>, <2 x float> addrspace(1)* %data.ptr, align 8 ; the patterns below expect this to become a <4 x i64> gather
+  %output.ptr = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %output, i64 %gid ; &output[gid]
+  store <2 x float> %data.ld, <2 x float> addrspace(1)* %output.ptr, align 8 ; the patterns below expect a single <8 x float> store
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i64) #1
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nobuiltin nounwind }
+
+; It checks that the <2 x float> is converted into an i64 for the purpose of the
+; gather load
+;
+; CHECK: void @__vecz_v4_squash
+; CHECK:  %[[GID:.+]] = call spir_func i64 @_Z13get_global_idj(i64 0) #[[ATTRS:[0-9]+]]
+; CHECK:  %[[IDX_PTR:.+]] = getelementptr inbounds i64, ptr addrspace(1) %idx, i64 %[[GID]]
+; CHECK:  %[[WIDE_LOAD:.+]] = load <4 x i64>, ptr addrspace(1) %[[IDX_PTR]], align 8
+; CHECK:  %[[DATA_PTR:.+]] = getelementptr inbounds <2 x float>, ptr addrspace(1) %data, <4 x i64> %[[WIDE_LOAD]]
+; CHECK:  %[[GATHER:.+]] = call <4 x i64> @__vecz_b_gather_load8_Dv4_mDv4_u3ptrU3AS1(<4 x ptr addrspace(1)> %[[DATA_PTR]])
+; CHECK:  %[[UNSQUASH:.+]] = bitcast <4 x i64> %[[GATHER]] to <8 x float>
+; CHECK:  %[[OUTPUT_PTR:.+]] = getelementptr inbounds <2 x float>, ptr addrspace(1) %output, i64 %[[GID]]
+; CHECK:  store <8 x float> %[[UNSQUASH]], ptr addrspace(1) %[[OUTPUT_PTR]], align 8
+; CHECK:  ret void
+
+; CHECK: attributes #[[ATTRS]] = { nobuiltin nounwind }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned.ll
new file mode 100644
index 0000000000000..9201bd1dfb8b1
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned.ll
@@ -0,0 +1,76 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+%struct.PerItemKernelInfo = type <{ <4 x i64>, i32, i32 }>
+
+; Function start
+; CHECK: spir_kernel void @__vecz_v4_foo(
+
+; There should be exactly 4 vector stores
+; CHECK: store <4 x i64>
+; CHECK: store <4 x i64>
+; CHECK: store <4 x i64>
+; CHECK: store <4 x i64>
+; CHECK-NOT: call void @__vecz_b_scatter_store1_Dv4_mDv4_{{.*}}(<4 x i64>
+; CHECK-NOT: call void @__vecz_b_interleaved_store1_5_Dv4_{{.*}}(<4 x i64>
+ 
+; There is one interleaved store from the scalar write
+; CHECK: call void @__vecz_b_interleaved_store1_10_Dv4_j{{(u3ptrU3AS1|PU3AS1j)}}(<4 x i32>
+ 
+; There shouldn't be any other stores
+; CHECK-NOT: call void @__vecz_b_{{.*}}_store
+ 
+; Function end
+; CHECK: ret void
+
+define dso_local spir_kernel void @foo(%struct.PerItemKernelInfo addrspace(1)* nocapture noundef writeonly %info) !reqd_work_group_size !11 { ; each work-item fills one packed 40-byte %struct.PerItemKernelInfo; !11 requests a 4x1x1 work-group
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 noundef 0) ; global id, dimension 0
+  %call1 = tail call spir_func i64 @_Z13get_global_idj(i32 noundef 1) ; global id, dimension 1
+  %call2 = tail call spir_func i64 @_Z13get_global_idj(i32 noundef 2) ; global id, dimension 2
+  %call3 = tail call spir_func i64 @_Z15get_global_sizej(i32 noundef 0) ; global size, dimension 0
+  %call5 = tail call spir_func i64 @_Z15get_global_sizej(i32 noundef 1) ; global size, dimension 1
+  %mul7 = mul nuw nsw i64 %call5, %call2 ; size1 * id2
+  %reass.add = add nuw nsw i64 %mul7, %call1 ; size1 * id2 + id1
+  %reass.mul = mul nuw nsw i64 %reass.add, %call3 ; (size1 * id2 + id1) * size0
+  %add8 = add nuw nsw i64 %reass.mul, %call ; linearized index: (size1 * id2 + id1) * size0 + id0
+  %vecinit = insertelement <4 x i64> undef, i64 %call3, i64 0 ; lane 0 = size0
+  %vecinit11 = insertelement <4 x i64> %vecinit, i64 %call5, i64 1 ; lane 1 = size1
+  %call12 = tail call spir_func i64 @_Z15get_global_sizej(i32 noundef 2) ; global size, dimension 2
+  %vecinit13 = insertelement <4 x i64> %vecinit11, i64 %call12, i64 2 ; lane 2 = size2
+  %call14 = tail call spir_func i64 @_Z15get_global_sizej(i32 noundef 3) ; global size queried for index 3
+  %vecinit15 = insertelement <4 x i64> %vecinit13, i64 %call14, i64 3 ; lane 3 = result of the index-3 query
+  %global_size = getelementptr inbounds %struct.PerItemKernelInfo, %struct.PerItemKernelInfo addrspace(1)* %info, i64 %add8, i32 0 ; &info[index], field 0 (the <4 x i64>)
+  store <4 x i64> %vecinit15, <4 x i64> addrspace(1)* %global_size, align 1 ; vector store with a 40-byte stride between work-items
+  %call16 = tail call spir_func i32 @_Z12get_work_dimv()
+  %work_dim = getelementptr inbounds %struct.PerItemKernelInfo, %struct.PerItemKernelInfo addrspace(1)* %info, i64 %add8, i32 1 ; &info[index], field 1 (first i32)
+  store i32 %call16, i32 addrspace(1)* %work_dim, align 1 ; scalar store; 40 bytes is 10 x i32, matching the stride-10 interleaved store expected above
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+declare spir_func i64 @_Z15get_global_sizej(i32)
+
+declare spir_func i32 @_Z12get_work_dimv()
+
+!11 = !{i32 4, i32 1, i32 1}
+
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned_scalarized.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned_scalarized.ll
new file mode 100644
index 0000000000000..b91498a8e6e60
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned_scalarized.ll
@@ -0,0 +1,63 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -S < %s -vecz-choices=FullScalarization | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+%struct.PerItemKernelInfo = type <{ <4 x i64>, i32, i32 }>
+
+define spir_kernel void @foo(%struct.PerItemKernelInfo addrspace(1)* %info) { ; each work-item fills one packed 40-byte %struct.PerItemKernelInfo
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %call1 = tail call spir_func i64 @_Z13get_global_idj(i32 1) ; global id, dimension 1
+  %call2 = tail call spir_func i64 @_Z13get_global_idj(i32 2) ; global id, dimension 2
+  %call3 = tail call spir_func i64 @_Z15get_global_sizej(i32 0) ; global size, dimension 0
+  %call5 = tail call spir_func i64 @_Z15get_global_sizej(i32 1) ; global size, dimension 1
+  %mul7 = mul nuw nsw i64 %call5, %call2 ; size1 * id2
+  %reass.add = add nuw nsw i64 %mul7, %call1 ; size1 * id2 + id1
+  %reass.mul = mul nuw nsw i64 %reass.add, %call3 ; (size1 * id2 + id1) * size0
+  %add8 = add nuw nsw i64 %reass.mul, %call ; linearized index: (size1 * id2 + id1) * size0 + id0
+  %vecinit = insertelement <4 x i64> undef, i64 %call3, i64 0 ; lane 0 = size0
+  %vecinit11 = insertelement <4 x i64> %vecinit, i64 %call5, i64 1 ; lane 1 = size1
+  %call12 = tail call spir_func i64 @_Z15get_global_sizej(i32 2) ; global size, dimension 2
+  %vecinit13 = insertelement <4 x i64> %vecinit11, i64 %call12, i64 2 ; lane 2 = size2
+  %call14 = tail call spir_func i64 @_Z15get_global_sizej(i32 3) ; global size queried for index 3
+  %vecinit15 = insertelement <4 x i64> %vecinit13, i64 %call14, i64 3 ; lane 3 = result of the index-3 query
+  %global_size = getelementptr inbounds %struct.PerItemKernelInfo, %struct.PerItemKernelInfo addrspace(1)* %info, i64 %add8, i32 0 ; &info[index], field 0 (the <4 x i64>)
+  store <4 x i64> %vecinit15, <4 x i64> addrspace(1)* %global_size, align 1 ; 40 bytes is 5 x i64, matching the four stride-5 interleaved stores expected below
+  %call16 = tail call spir_func i32 @_Z12get_work_dimv()
+  %work_dim = getelementptr inbounds %struct.PerItemKernelInfo, %struct.PerItemKernelInfo addrspace(1)* %info, i64 %add8, i32 1 ; &info[index], field 1 (first i32)
+  store i32 %call16, i32 addrspace(1)* %work_dim, align 1 ; 40 bytes is 10 x i32, matching the stride-10 interleaved store expected below
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+declare spir_func i64 @_Z15get_global_sizej(i32)
+
+declare spir_func i32 @_Z12get_work_dimv()
+
+; CHECK: spir_kernel void @foo
+; CHECK: call void @__vecz_b_interleaved_store1_5_Dv4_m{{(u3ptrU3AS1|PU3AS1m)}}(<4 x i64>
+; CHECK: call void @__vecz_b_interleaved_store1_5_Dv4_m{{(u3ptrU3AS1|PU3AS1m)}}(<4 x i64>
+; CHECK: call void @__vecz_b_interleaved_store1_5_Dv4_m{{(u3ptrU3AS1|PU3AS1m)}}(<4 x i64>
+; CHECK: call void @__vecz_b_interleaved_store1_5_Dv4_m{{(u3ptrU3AS1|PU3AS1m)}}(<4 x i64>
+; CHECK-NOT: call void @__vecz_b_interleaved_store1_5_Dv4_m{{.*}}(<4 x i64>
+; CHECK: call void @__vecz_b_interleaved_store1_10_Dv4_j{{(u3ptrU3AS1|PU3AS1j)}}(<4 x i32>
+; CHECK-NOT: call void @__vecz_b_{{.*}}_store
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned.ll
new file mode 100644
index 0000000000000..1062dd331c5bd
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned.ll
@@ -0,0 +1,64 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+%struct.PerItemKernelInfo = type <{ <4 x i64>, i32 }>
+
+define spir_kernel void @foo(%struct.PerItemKernelInfo addrspace(1)* %info) { ; each work-item fills one packed 36-byte %struct.PerItemKernelInfo
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %call1 = tail call spir_func i64 @_Z13get_global_idj(i32 1) ; global id, dimension 1
+  %call2 = tail call spir_func i64 @_Z13get_global_idj(i32 2) ; global id, dimension 2
+  %call3 = tail call spir_func i64 @_Z15get_global_sizej(i32 0) ; global size, dimension 0
+  %call5 = tail call spir_func i64 @_Z15get_global_sizej(i32 1) ; global size, dimension 1
+  %mul7 = mul nuw nsw i64 %call5, %call2 ; size1 * id2
+  %reass.add = add nuw nsw i64 %mul7, %call1 ; size1 * id2 + id1
+  %reass.mul = mul nuw nsw i64 %reass.add, %call3 ; (size1 * id2 + id1) * size0
+  %add8 = add nuw nsw i64 %reass.mul, %call ; linearized index: (size1 * id2 + id1) * size0 + id0
+  %vecinit = insertelement <4 x i64> undef, i64 %call3, i64 0 ; lane 0 = size0
+  %vecinit11 = insertelement <4 x i64> %vecinit, i64 %call5, i64 1 ; lane 1 = size1
+  %call12 = tail call spir_func i64 @_Z15get_global_sizej(i32 2) ; global size, dimension 2
+  %vecinit13 = insertelement <4 x i64> %vecinit11, i64 %call12, i64 2 ; lane 2 = size2
+  %call14 = tail call spir_func i64 @_Z15get_global_sizej(i32 3) ; global size queried for index 3
+  %vecinit15 = insertelement <4 x i64> %vecinit13, i64 %call14, i64 3 ; lane 3 = result of the index-3 query
+  %global_size = getelementptr inbounds %struct.PerItemKernelInfo, %struct.PerItemKernelInfo addrspace(1)* %info, i64 %add8, i32 0 ; &info[index], field 0 (the <4 x i64>)
+  store <4 x i64> %vecinit15, <4 x i64> addrspace(1)* %global_size, align 1 ; 36 bytes is not a multiple of 8; the patterns below expect four plain <4 x i64> stores
+  %call16 = tail call spir_func i32 @_Z12get_work_dimv()
+  %work_dim = getelementptr inbounds %struct.PerItemKernelInfo, %struct.PerItemKernelInfo addrspace(1)* %info, i64 %add8, i32 1 ; &info[index], field 1 (the i32)
+  store i32 %call16, i32 addrspace(1)* %work_dim, align 1 ; 36 bytes is 9 x i32, matching the stride-9 interleaved store expected below
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+declare spir_func i64 @_Z15get_global_sizej(i32)
+
+declare spir_func i32 @_Z12get_work_dimv()
+
+; CHECK: spir_kernel void @foo
+; CHECK: store <4 x i64>
+; CHECK: store <4 x i64>
+; CHECK: store <4 x i64>
+; CHECK: store <4 x i64>
+; CHECK-NOT: call void @__vecz_b_scatter_store1_Dv4_mDv4_{{.*}}(<4 x i64>
+; CHECK-NOT: call void @__vecz_b_interleaved_store1_5_Dv4_{{.*}}(<4 x i64>
+; CHECK: call void @__vecz_b_interleaved_store1_9_Dv4_j{{(u3ptrU3AS1|PU3AS1j)}}(<4 x i32>
+; CHECK-NOT: call void @__vecz_b_{{.*}}_store
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned_scalarized.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned_scalarized.ll
new file mode 100644
index 0000000000000..56373b722abe6
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned_scalarized.ll
@@ -0,0 +1,63 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -S < %s -vecz-choices=FullScalarization | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+%struct.PerItemKernelInfo = type <{ <4 x i64>, i32 }>
+
+define spir_kernel void @foo(%struct.PerItemKernelInfo addrspace(1)* %info) { ; each work-item fills one packed 36-byte %struct.PerItemKernelInfo
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %call1 = tail call spir_func i64 @_Z13get_global_idj(i32 1) ; global id, dimension 1
+  %call2 = tail call spir_func i64 @_Z13get_global_idj(i32 2) ; global id, dimension 2
+  %call3 = tail call spir_func i64 @_Z15get_global_sizej(i32 0) ; global size, dimension 0
+  %call5 = tail call spir_func i64 @_Z15get_global_sizej(i32 1) ; global size, dimension 1
+  %mul7 = mul nuw nsw i64 %call5, %call2 ; size1 * id2
+  %reass.add = add nuw nsw i64 %mul7, %call1 ; size1 * id2 + id1
+  %reass.mul = mul nuw nsw i64 %reass.add, %call3 ; (size1 * id2 + id1) * size0
+  %add8 = add nuw nsw i64 %reass.mul, %call ; linearized index: (size1 * id2 + id1) * size0 + id0
+  %vecinit = insertelement <4 x i64> undef, i64 %call3, i64 0 ; lane 0 = size0
+  %vecinit11 = insertelement <4 x i64> %vecinit, i64 %call5, i64 1 ; lane 1 = size1
+  %call12 = tail call spir_func i64 @_Z15get_global_sizej(i32 2) ; global size, dimension 2
+  %vecinit13 = insertelement <4 x i64> %vecinit11, i64 %call12, i64 2 ; lane 2 = size2
+  %call14 = tail call spir_func i64 @_Z15get_global_sizej(i32 3) ; global size queried for index 3
+  %vecinit15 = insertelement <4 x i64> %vecinit13, i64 %call14, i64 3 ; lane 3 = result of the index-3 query
+  %global_size = getelementptr inbounds %struct.PerItemKernelInfo, %struct.PerItemKernelInfo addrspace(1)* %info, i64 %add8, i32 0 ; &info[index], field 0 (the <4 x i64>)
+  store <4 x i64> %vecinit15, <4 x i64> addrspace(1)* %global_size, align 1 ; 36 bytes is not a multiple of 8; the patterns below expect four scatter stores
+  %call16 = tail call spir_func i32 @_Z12get_work_dimv()
+  %work_dim = getelementptr inbounds %struct.PerItemKernelInfo, %struct.PerItemKernelInfo addrspace(1)* %info, i64 %add8, i32 1 ; &info[index], field 1 (the i32)
+  store i32 %call16, i32 addrspace(1)* %work_dim, align 1 ; 36 bytes is 9 x i32, matching the stride-9 interleaved store expected below
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+declare spir_func i64 @_Z15get_global_sizej(i32)
+
+declare spir_func i32 @_Z12get_work_dimv()
+
+; CHECK: spir_kernel void @foo
+; CHECK: call void @__vecz_b_scatter_store1_Dv4_mDv4_{{(u3ptrU3AS1|PU3AS1m)}}(<4 x i64>
+; CHECK: call void @__vecz_b_scatter_store1_Dv4_mDv4_{{(u3ptrU3AS1|PU3AS1m)}}(<4 x i64>
+; CHECK: call void @__vecz_b_scatter_store1_Dv4_mDv4_{{(u3ptrU3AS1|PU3AS1m)}}(<4 x i64>
+; CHECK: call void @__vecz_b_scatter_store1_Dv4_mDv4_{{(u3ptrU3AS1|PU3AS1m)}}(<4 x i64>
+; CHECK-NOT: call void @__vecz_b_scatter_store1_Dv4_mDv4_{{.*}}(<4 x i64>
+; CHECK: call void @__vecz_b_interleaved_store1_9_Dv4_j{{(u3ptrU3AS1|PU3AS1j)}}(<4 x i32>
+; CHECK-NOT: call void @__vecz_b_{{.*}}_store
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_phi.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_phi.ll
new file mode 100644
index 0000000000000..1cb75279d01ed
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_phi.ll
@@ -0,0 +1,107 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+%struct_type = type { i32, i32 }
+
+define spir_kernel void @test(i32* %in, i32* %out, %struct_type* %sin) { ; loop whose carried state is a %struct_type value passed through phi nodes
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %inp = getelementptr inbounds i32, i32* %in, i64 %call ; &in[gid]
+  %oup = getelementptr inbounds i32, i32* %out, i64 %call ; &out[gid]
+  %o = load i32, i32* %oup ; out[gid]; becomes field 1 of the struct built below
+  ; do this little compare + phi to throw off the InstCombine pass and ensure
+  ; we end up with a phi %struct_type that must be instantiated
+  %s = insertvalue %struct_type undef, i32 %o, 1 ; field 1 = out[gid], field 0 left undef
+  %cmpcall = icmp ult i64 16, %call ; true when gid > 16 (unsigned)
+  br i1 %cmpcall, label %lower, label %higher
+
+lower:
+  %lowers = insertvalue %struct_type %s, i32 0, 0 ; field 0 = 0
+  br label %lower.higher.phi
+
+higher:
+  %highers = insertvalue %struct_type %s, i32 1, 0 ; field 0 = 1
+  br label %lower.higher.phi
+
+lower.higher.phi:
+  %lowerhigherstruct = phi %struct_type [%lowers, %lower], [%highers, %higher] ; struct-typed phi merging the two branches
+  br label %for.cond
+
+for.cond:
+  %storemerge = phi %struct_type [ %incv, %for.inc ], [ %lowerhigherstruct, %lower.higher.phi ] ; loop-carried struct value
+  %s1 = extractvalue %struct_type %storemerge, 1
+  %s1ext = zext i32 %s1 to i64
+  %cmp = icmp ult i64 %s1ext, %call ; keep looping while field 1 < gid (unsigned)
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+  %l = load i32, i32* %inp, align 4 ; in[gid]
+  store i32 %l, i32* %oup, align 4 ; out[gid] = in[gid]
+  br label %for.inc
+
+for.inc:
+  %toadd = extractvalue %struct_type %storemerge, 1
+  %toadd64 = zext i32 %toadd to i64
+  %ca = add i64 %toadd64, %call ; field 1 + gid
+  %sinp = getelementptr inbounds %struct_type, %struct_type* %sin, i64 %ca ; &sin[field 1 + gid]
+  %sinv = load %struct_type, %struct_type* %sinp ; whole-struct load
+  %sinintv = extractvalue %struct_type %sinv, 1
+  %incv = insertvalue %struct_type %storemerge, i32 %sinintv, 1 ; next field 1 comes from the loaded struct's field 1
+  br label %for.cond
+
+for.end:
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare void @llvm.memset.p0i8.i32(i8*,i8,i32,i32,i1)
+
+; CHECK: define spir_kernel void @__vecz_v4_test
+
+; Check if the struct creation has been instantiated
+; CHECK: %[[V2:[0-9]+]] = load <4 x i32>, ptr %oup, align 4
+; CHECK: %[[V3:[0-9]+]] = extractelement <4 x i32> %[[V2]], {{(i32|i64)}} 0
+; CHECK: %[[V4:[0-9]+]] = extractelement <4 x i32> %[[V2]], {{(i32|i64)}} 1
+; CHECK: %[[V5:[0-9]+]] = extractelement <4 x i32> %[[V2]], {{(i32|i64)}} 2
+; CHECK: %[[V6:[0-9]+]] = extractelement <4 x i32> %[[V2]], {{(i32|i64)}} 3
+; CHECK: %[[S24:.+]] = insertvalue %struct_type undef, i32 %[[V3]], 1
+; CHECK: %[[S25:.+]] = insertvalue %struct_type undef, i32 %[[V4]], 1
+; CHECK: %[[S26:.+]] = insertvalue %struct_type undef, i32 %[[V5]], 1
+; CHECK: %[[S27:.+]] = insertvalue %struct_type undef, i32 %[[V6]], 1
+
+; Check if the phi node has been instantiated
+; CHECK: phi %struct_type [ %{{.+}}, %entry ], [ %{{.+}}, %for.cond ]
+; CHECK: phi %struct_type [ %{{.+}}, %entry ], [ %{{.+}}, %for.cond ]
+; CHECK: phi %struct_type [ %{{.+}}, %entry ], [ %{{.+}}, %for.cond ]
+; CHECK: phi %struct_type [ %{{.+}}, %entry ], [ %{{.+}}, %for.cond ]
+; CHECK: extractvalue %struct_type %{{.+}}, 1
+; CHECK: extractvalue %struct_type %{{.+}}, 1
+; CHECK: extractvalue %struct_type %{{.+}}, 1
+; CHECK: extractvalue %struct_type %{{.+}}, 1
+
+; Check if the operations that use integer types are vectorized
+; CHECK: zext <4 x i32>
+; CHECK: icmp ugt <4 x i64>
+; CHECK: and <4 x i1>
+; CHECK: %[[L423:.+]] = call <4 x i32> @__vecz_b_masked_load4_Dv4_ju3ptrDv4_b(ptr %{{.*}}, <4 x i1>
+; CHECK: call void @__vecz_b_masked_store4_Dv4_ju3ptrDv4_b(<4 x i32> %[[L423]], ptr{{( nonnull)? %.*}}, <4 x i1>
+
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_select.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_select.ll
new file mode 100644
index 0000000000000..c502c2af024b8
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_select.ll
@@ -0,0 +1,49 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+%struct_type = type { i32, i32 }
+
+define spir_kernel void @test(%struct_type* %in1, %struct_type* %in2, %struct_type* %out) { ; out[gid] = (gid % 3 == 0) ? in1[gid] : in2[gid], selecting whole structs
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %in1p = getelementptr inbounds %struct_type, %struct_type* %in1, i64 %call ; &in1[gid]
+  %in2p = getelementptr inbounds %struct_type, %struct_type* %in2, i64 %call ; &in2[gid]
+  %outp = getelementptr inbounds %struct_type, %struct_type* %out, i64 %call ; &out[gid]
+  %in1v = load %struct_type, %struct_type* %in1p ; whole-struct load
+  %in2v = load %struct_type, %struct_type* %in2p ; whole-struct load
+  %mod = urem i64 %call, 3 ; gid % 3 (unsigned)
+  %cmp = icmp eq i64 %mod, 0 ; condition varies with gid
+  %res = select i1 %cmp, %struct_type %in1v, %struct_type %in2v ; struct-typed select; the patterns below expect four scalar-condition copies
+  store %struct_type %res, %struct_type* %outp
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare void @llvm.memset.p0i8.i32(i8*,i8,i32,i32,i1)
+
+; CHECK: define spir_kernel void @__vecz_v4_test
+
+; CHECK: select i1 %{{.+}}, %struct_type %{{.+}}, %struct_type %{{.+}}
+; CHECK: select i1 %{{.+}}, %struct_type %{{.+}}, %struct_type %{{.+}}
+; CHECK: select i1 %{{.+}}, %struct_type %{{.+}}, %struct_type %{{.+}}
+; CHECK: select i1 %{{.+}}, %struct_type %{{.+}}, %struct_type %{{.+}}
+
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll
new file mode 100644
index 0000000000000..238fbd8703e13
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll
@@ -0,0 +1,88 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -vecz-simd-width=4 -S < %s | %filecheck %s 
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare spir_func i32 @_Z16get_sub_group_idv()
+declare spir_func i32 @_Z18get_sub_group_sizev()
+declare spir_func i32 @_Z22get_sub_group_local_idv()
+declare spir_func i32 @_Z19sub_group_broadcastij(i32, i32)
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func i32 @_Z13sub_group_anyi(i32)
+
+define spir_kernel void @get_sub_group_size(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[sub-group id] = sub-group size; %in is not used
+  %call.i = tail call spir_func i32 @_Z16get_sub_group_idv() ; sub-group id, used as the store index
+  %conv = zext i32 %call.i to i64
+  %call2 = tail call spir_func i32 @_Z18get_sub_group_sizev() ; the patterns below expect this to fold to the constant 4
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv ; &out[sub-group id]
+  store i32 %call2, i32 addrspace(1)* %arrayidx, align 4
+  ret void
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_get_sub_group_size(
+; CHECK: store i32 4, ptr addrspace(1) {{.*}}
+}
+
+define spir_kernel void @get_sub_group_local_id(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[sub-group local id] = sub-group local id; %in is not used
+  %call = tail call spir_func i32 @_Z22get_sub_group_local_idv() ; used both as the store index and as the stored value
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %call ; &out[local id]
+  store i32 %call, i32 addrspace(1)* %arrayidx, align 4 ; the patterns below expect a store of the constant <0, 1, 2, 3> to %out
+  ret void
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_get_sub_group_local_id(
+; CHECK: store <4 x i32> <i32 0, i32 1, i32 2, i32 3>, ptr addrspace(1) %out
+}
+
+define spir_kernel void @sub_group_broadcast(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { ; out[local id] = sub_group_broadcast(in[local id], 0)
+  %call = tail call spir_func i32 @_Z22get_sub_group_local_idv() ; sub-group local id, indexes both buffers
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %call ; &in[local id]
+  %v = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %broadcast = call spir_func i32 @_Z19sub_group_broadcastij(i32 %v, i32 0) ; second argument 0; the patterns below expect a splat via a zeroinitializer shuffle mask
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %call ; &out[local id]
+  store i32 %broadcast, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_sub_group_broadcast(
+; CHECK: [[LD:%.*]] = load <4 x i32>, ptr addrspace(1) {{%.*}}, align 4
+; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[LD]], <4 x i32> {{(undef|poison)}}, <4 x i32> zeroinitializer
+; CHECK: store <4 x i32> [[SPLAT]], ptr addrspace(1)
+}
+
+; This used to crash as packetizing get_sub_group_local_id produces a Constant, which we weren't expecting.
+define spir_kernel void @regression_sub_group_local_id(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %xy, i32 addrspace(1)* %out) { ; no output patterns are attached to this kernel in the visible lines
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %call1 = tail call spir_func i32 @_Z22get_sub_group_local_idv() ; the value referred to by the regression note above
+  %0 = shl i64 %call, 32
+  %idxprom = ashr exact i64 %0, 32 ; shl + ashr by 32: sign-extends the low 32 bits of the global id
+  %arrayidx = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %xy, i64 %idxprom ; &xy[idx]
+  %1 = load <4 x i32>, <4 x i32> addrspace(1)* %arrayidx, align 16
+  %2 = insertelement <4 x i32> %1, i32 %call1, i64 0 ; lane 0 = sub-group local id
+  %3 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arrayidx, i64 0, i64 0 ; address of element 0 of xy[idx]
+  store i32 %call1, i32 addrspace(1)* %3, align 16 ; scalar store of the local id into element 0
+  %call2 = tail call spir_func i32 @_Z16get_sub_group_idv()
+  %4 = insertelement <4 x i32> %2, i32 %call2, i64 1 ; lane 1 = sub-group id
+  store <4 x i32> %4, <4 x i32> addrspace(1)* %arrayidx, align 16 ; whole-vector store back to xy[idx]
+  %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom ; &in[idx]
+  %5 = load i32, i32 addrspace(1)* %arrayidx6, align 4
+  %call7 = tail call spir_func i32 @_Z13sub_group_anyi(i32 %5) ; sub_group_any over in[idx]
+  %arrayidx9 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom ; &out[idx]
+  store i32 %call7, i32 addrspace(1)* %arrayidx9, align 4
+  ret void
+}
+
+!opencl.ocl.version = !{!0}
+
+!0 = !{i32 3, i32 0}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions.ll
new file mode 100644
index 0000000000000..c859f7b830e52
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions.ll
@@ -0,0 +1,246 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -w 4 -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare spir_func i64 @_Z13get_global_idj(i32)  ; get_global_id(unsigned int)
+declare spir_func i32 @_Z16get_sub_group_idv()  ; get_sub_group_id()
+
+declare spir_func i32 @_Z13sub_group_alli(i32)  ; sub_group_all(int)
+declare spir_func i32 @_Z13sub_group_anyi(i32)  ; sub_group_any(int)
+
+declare spir_func i32 @_Z20sub_group_reduce_addi(i32)  ; sub_group_reduce_add(int)
+declare spir_func i64 @_Z20sub_group_reduce_addl(i64)  ; sub_group_reduce_add(long)
+declare spir_func float @_Z20sub_group_reduce_addf(float)  ; sub_group_reduce_add(float)
+declare spir_func i32 @_Z20sub_group_reduce_mini(i32)  ; sub_group_reduce_min(int)
+declare spir_func i32 @_Z20sub_group_reduce_minj(i32)  ; sub_group_reduce_min(unsigned int)
+declare spir_func i32 @_Z20sub_group_reduce_maxi(i32)  ; sub_group_reduce_max(int)
+declare spir_func i32 @_Z20sub_group_reduce_maxj(i32)  ; sub_group_reduce_max(unsigned int)
+declare spir_func float @_Z20sub_group_reduce_minf(float)  ; sub_group_reduce_min(float)
+declare spir_func float @_Z20sub_group_reduce_maxf(float)  ; sub_group_reduce_max(float)
+
+define spir_kernel void @reduce_all_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {  ; out[sub_group_id] = sub_group_all(in[global_id])
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv()
+  %conv = zext i32 %call1 to i64
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %call2 = tail call spir_func i32 @_Z13sub_group_alli(i32 %0)
+  %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %conv
+  store i32 %call2, ptr addrspace(1) %arrayidx3, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_reduce_all_i32(
+; CHECK: [[T2:%.*]] = icmp eq <4 x i32> %{{.*}}, zeroinitializer
+
+; CHECK: [[T3:%.*]] = bitcast <4 x i1> [[T2]] to i4
+; CHECK: [[R:%.*]] = icmp eq i4 [[T3]], 0
+
+; CHECK: [[EXT:%.*]] = sext i1 [[R]] to i32
+; CHECK: store i32 [[EXT]], ptr addrspace(1) {{%.*}}, align 4
+}
+
+define spir_kernel void @reduce_any_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {  ; out[sub_group_id] = sub_group_any(in[global_id])
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv()
+  %conv = zext i32 %call1 to i64
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %call2 = tail call spir_func i32 @_Z13sub_group_anyi(i32 %0)
+  %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %conv
+  store i32 %call2, ptr addrspace(1) %arrayidx3, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_reduce_any_i32(
+; CHECK: [[T2:%.*]] = icmp ne <4 x i32> %{{.*}}, zeroinitializer
+
+; CHECK: [[T3:%.*]] = bitcast <4 x i1> [[T2]] to i4
+; CHECK: [[R:%.*]] = icmp ne i4 [[T3]], 0
+
+; CHECK: [[EXT:%.*]] = sext i1 [[R]] to i32
+; CHECK: store i32 [[EXT]], ptr addrspace(1) {{%.*}}, align 4
+}
+
+define spir_kernel void @reduce_add_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {  ; out[sub_group_id] = sub_group_reduce_add(in[global_id])
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv()
+  %conv = zext i32 %call1 to i64
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %call2 = tail call spir_func i32 @_Z20sub_group_reduce_addi(i32 %0)
+  %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %conv
+  store i32 %call2, ptr addrspace(1) %arrayidx3, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_reduce_add_i32(
+; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %{{.*}})
+; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
+}
+
+; The remaining kernels keep their checks minimal: each one verifies only the
+; reduction call emitted for its builtin and the store of the reduced value.
+
+define spir_kernel void @reduce_add_i32_uniform(ptr addrspace(1) %out, i32 %n) {  ; out[global_id] = sub_group_reduce_add(n), n being a kernel argument
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call1 = tail call spir_func i32 @_Z20sub_group_reduce_addi(i32 %n)
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
+  store i32 %call1, ptr addrspace(1) %arrayidx, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_reduce_add_i32_uniform(
+; The operand is the same for every work-item, yet the reduction is still
+; expected as an intrinsic call; its scalar result is then splatted so that
+; the per-work-item stores become a single <4 x i32> store.
+; CHECK: [[CALL:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> {{%.*}})
+; CHECK: [[INS:%.*]] = insertelement <4 x i32> {{(undef|poison)}}, i32 [[CALL]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[INS]], <4 x i32> {{(undef|poison)}}, <4 x i32> zeroinitializer
+; CHECK: store <4 x i32> [[SPLAT]],
+}
+
+define spir_kernel void @reduce_add_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) {  ; out[sub_group_id] = sub_group_reduce_add(in[global_id]) on i64
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv()
+  %conv = zext i32 %call1 to i64
+  %arrayidx = getelementptr inbounds i64, ptr addrspace(1) %in, i64 %call
+  %0 = load i64, ptr addrspace(1) %arrayidx, align 4
+  %call2 = tail call spir_func i64 @_Z20sub_group_reduce_addl(i64 %0)
+  %arrayidx3 = getelementptr inbounds i64, ptr addrspace(1) %out, i64 %conv
+  store i64 %call2, ptr addrspace(1) %arrayidx3, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_reduce_add_i64(
+; CHECK: [[R:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %{{.*}})
+; CHECK: store i64 [[R]], ptr addrspace(1) {{%.*}}, align 4
+}
+
+define spir_kernel void @reduce_add_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) {  ; out[sub_group_id] = sub_group_reduce_add(in[global_id]) on float
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv()
+  %conv = zext i32 %call1 to i64
+  %arrayidx = getelementptr inbounds float, ptr addrspace(1) %in, i64 %call
+  %0 = load float, ptr addrspace(1) %arrayidx, align 4
+  %call2 = tail call spir_func float @_Z20sub_group_reduce_addf(float %0)
+  %arrayidx3 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %conv
+  store float %call2, ptr addrspace(1) %arrayidx3, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_reduce_add_f32(
+; CHECK: [[R:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> %{{.*}})
+; CHECK: store float [[R]], ptr addrspace(1) {{%.*}}, align 4
+}
+
+define spir_kernel void @reduce_smin_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {  ; out[sub_group_id] = sub_group_reduce_min(in[global_id]), signed int
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv()
+  %conv = zext i32 %call1 to i64
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %call2 = tail call spir_func i32 @_Z20sub_group_reduce_mini(i32 %0)
+  %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %conv
+  store i32 %call2, ptr addrspace(1) %arrayidx3, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_reduce_smin_i32(
+; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %{{.*}})
+; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
+}
+
+define spir_kernel void @reduce_umin_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {  ; out[sub_group_id] = sub_group_reduce_min(in[global_id]), unsigned int
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv()
+  %conv = zext i32 %call1 to i64
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %call2 = tail call spir_func i32 @_Z20sub_group_reduce_minj(i32 %0)
+  %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %conv
+  store i32 %call2, ptr addrspace(1) %arrayidx3, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_reduce_umin_i32(
+; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %{{.*}})
+; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
+}
+
+define spir_kernel void @reduce_smax_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {  ; out[sub_group_id] = sub_group_reduce_max(in[global_id]), signed int
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv()
+  %conv = zext i32 %call1 to i64
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %call2 = tail call spir_func i32 @_Z20sub_group_reduce_maxi(i32 %0)
+  %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %conv
+  store i32 %call2, ptr addrspace(1) %arrayidx3, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_reduce_smax_i32(
+; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %{{.*}})
+; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
+}
+
+define spir_kernel void @reduce_umax_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {  ; out[sub_group_id] = sub_group_reduce_max(in[global_id]), unsigned int
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv()
+  %conv = zext i32 %call1 to i64
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %call2 = tail call spir_func i32 @_Z20sub_group_reduce_maxj(i32 %0)
+  %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %conv
+  store i32 %call2, ptr addrspace(1) %arrayidx3, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_reduce_umax_i32(
+; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %{{.*}})
+; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
+}
+
+define spir_kernel void @reduce_fmin_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) {  ; out[sub_group_id] = sub_group_reduce_min(in[global_id]) on float
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv()
+  %conv = zext i32 %call1 to i64
+  %arrayidx = getelementptr inbounds float, ptr addrspace(1) %in, i64 %call
+  %0 = load float, ptr addrspace(1) %arrayidx, align 4
+  %call2 = tail call spir_func float @_Z20sub_group_reduce_minf(float %0)
+  %arrayidx3 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %conv
+  store float %call2, ptr addrspace(1) %arrayidx3, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_reduce_fmin_f32(
+; CHECK: [[R:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %{{.*}})
+; CHECK: store float [[R]], ptr addrspace(1) {{%.*}}, align 4
+}
+
+define spir_kernel void @reduce_fmax_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) {  ; out[sub_group_id] = sub_group_reduce_max(in[global_id]) on float
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv()
+  %conv = zext i32 %call1 to i64
+  %arrayidx = getelementptr inbounds float, ptr addrspace(1) %in, i64 %call
+  %0 = load float, ptr addrspace(1) %arrayidx, align 4
+  %call2 = tail call spir_func float @_Z20sub_group_reduce_maxf(float %0)
+  %arrayidx3 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %conv
+  store float %call2, ptr addrspace(1) %arrayidx3, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_reduce_fmax_f32(
+; CHECK: [[R:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %{{.*}})
+; CHECK: store float [[R]], ptr addrspace(1) {{%.*}}, align 4
+}
+
+!opencl.ocl.version = !{!0}  ; named metadata referring to the version node below
+
+!0 = !{i32 3, i32 0}  ; version 3.0, presumably encoded as (major, minor)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions_spv_khr_uniform_group_instructions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions_spv_khr_uniform_group_instructions.ll
new file mode 100644
index 0000000000000..2d58eecf4dd99
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions_spv_khr_uniform_group_instructions.ll
@@ -0,0 +1,194 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -w 4 -S < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare spir_func i64 @_Z13get_global_idj(i32)  ; get_global_id(unsigned int)
+declare spir_func i32 @_Z16get_sub_group_idv()  ; get_sub_group_id()
+
+declare spir_func i32 @_Z20sub_group_reduce_muli(i32)  ; sub_group_reduce_mul(int)
+declare spir_func i64 @_Z20sub_group_reduce_mull(i64)  ; sub_group_reduce_mul(long)
+declare spir_func float @_Z20sub_group_reduce_mulf(float)  ; sub_group_reduce_mul(float)
+
+declare spir_func i32 @_Z20sub_group_reduce_andj(i32)  ; sub_group_reduce_and(unsigned int)
+declare spir_func i32 @_Z19sub_group_reduce_ori(i32)  ; sub_group_reduce_or(int)
+declare spir_func i64 @_Z20sub_group_reduce_xorl(i64)  ; sub_group_reduce_xor(long)
+
+declare spir_func i1 @_Z28sub_group_reduce_logical_andb(i1)  ; sub_group_reduce_logical_and(bool)
+declare spir_func i1 @_Z27sub_group_reduce_logical_orb(i1)  ; sub_group_reduce_logical_or(bool)
+declare spir_func i1 @_Z28sub_group_reduce_logical_xorb(i1)  ; sub_group_reduce_logical_xor(bool)
+
+; CHECK-LABEL: @__vecz_v4_reduce_mul_i32(
+; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %{{.*}})
+; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
+define spir_kernel void @reduce_mul_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {  ; out[sub_group_id] = sub_group_reduce_mul(in[global_id])
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv()
+  %conv = zext i32 %call1 to i64
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %call2 = tail call spir_func i32 @_Z20sub_group_reduce_muli(i32 %0)
+  %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %conv
+  store i32 %call2, ptr addrspace(1) %arrayidx3, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_v4_reduce_mul_i64(
+; CHECK: [[R:%.*]] = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> %{{.*}})
+; CHECK: store i64 [[R]], ptr addrspace(1) {{%.*}}, align 4
+define spir_kernel void @reduce_mul_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) {  ; out[sub_group_id] = sub_group_reduce_mul(in[global_id]) on i64
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv()
+  %conv = zext i32 %call1 to i64
+  %arrayidx = getelementptr inbounds i64, ptr addrspace(1) %in, i64 %call
+  %0 = load i64, ptr addrspace(1) %arrayidx, align 4
+  %call2 = tail call spir_func i64 @_Z20sub_group_reduce_mull(i64 %0)
+  %arrayidx3 = getelementptr inbounds i64, ptr addrspace(1) %out, i64 %conv
+  store i64 %call2, ptr addrspace(1) %arrayidx3, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_v4_reduce_mul_f32(
+; CHECK: [[R:%.*]] = call float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> %{{.*}})
+; CHECK: store float [[R]], ptr addrspace(1) {{%.*}}, align 4
+define spir_kernel void @reduce_mul_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) {  ; out[sub_group_id] = sub_group_reduce_mul(in[global_id]) on float
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv()
+  %conv = zext i32 %call1 to i64
+  %arrayidx = getelementptr inbounds float, ptr addrspace(1) %in, i64 %call
+  %0 = load float, ptr addrspace(1) %arrayidx, align 4
+  %call2 = tail call spir_func float @_Z20sub_group_reduce_mulf(float %0)
+  %arrayidx3 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %conv
+  store float %call2, ptr addrspace(1) %arrayidx3, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_v4_reduce_and_i32(
+; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %{{.*}})
+; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
+define spir_kernel void @reduce_and_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {  ; out[sub_group_id] = sub_group_reduce_and(in[global_id])
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv()
+  %conv = zext i32 %call1 to i64
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %call2 = tail call spir_func i32 @_Z20sub_group_reduce_andj(i32 %0)
+  %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %conv
+  store i32 %call2, ptr addrspace(1) %arrayidx3, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_v4_reduce_or_i32(
+; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %{{.*}})
+; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
+define spir_kernel void @reduce_or_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {  ; out[sub_group_id] = sub_group_reduce_or(in[global_id])
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv()
+  %conv = zext i32 %call1 to i64
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %call2 = tail call spir_func i32 @_Z19sub_group_reduce_ori(i32 %0)
+  %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %conv
+  store i32 %call2, ptr addrspace(1) %arrayidx3, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_v4_reduce_xor_i32(
+; CHECK: [[R:%.*]] = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %{{.*}})
+; CHECK: store i64 [[R]], ptr addrspace(1) {{%.*}}, align 4
+define spir_kernel void @reduce_xor_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {  ; NOTE: despite the _i32 suffix, this kernel reduces i64 values
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv()
+  %conv = zext i32 %call1 to i64
+  %arrayidx = getelementptr inbounds i64, ptr addrspace(1) %in, i64 %call  ; index in i64 units to match the i64 load below
+  %0 = load i64, ptr addrspace(1) %arrayidx, align 4
+  %call2 = tail call spir_func i64 @_Z20sub_group_reduce_xorl(i64 %0)
+  %arrayidx3 = getelementptr inbounds i64, ptr addrspace(1) %out, i64 %conv
+  store i64 %call2, ptr addrspace(1) %arrayidx3, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_v4_reduce_logical_and(
+; This doesn't generate a reduction intrinsic...
+; CHECK: [[T:%.*]] = icmp eq i4 {{%.*}}, -1
+; CHECK: [[R:%.*]] = zext i1 [[T]] to i32
+; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
+define spir_kernel void @reduce_logical_and(ptr addrspace(1) %in, ptr addrspace(1) %out) {  ; out[sub_group_id] = sub_group_reduce_logical_and(low bit of in[global_id])
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv()
+  %conv = zext i32 %call1 to i64
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %1 = trunc i32 %0 to i1
+  %call2 = tail call spir_func i1 @_Z28sub_group_reduce_logical_andb(i1 %1)
+  %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %conv
+  %zext = zext i1 %call2 to i32
+  store i32 %zext, ptr addrspace(1) %arrayidx3, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_v4_reduce_logical_or(
+; CHECK: [[T:%.*]] = icmp ne i4 {{%.*}}, 0
+; CHECK: [[R:%.*]] = zext i1 [[T]] to i32
+; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
+define spir_kernel void @reduce_logical_or(ptr addrspace(1) %in, ptr addrspace(1) %out) {  ; out[sub_group_id] = sub_group_reduce_logical_or(low bit of in[global_id])
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv()
+  %conv = zext i32 %call1 to i64
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %1 = trunc i32 %0 to i1
+  %call2 = tail call spir_func i1 @_Z27sub_group_reduce_logical_orb(i1 %1)
+  %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %conv
+  %zext = zext i1 %call2 to i32
+  store i32 %zext, ptr addrspace(1) %arrayidx3, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_v4_reduce_logical_xor(
+; CHECK: [[X:%.*]] = call i4 @llvm.ctpop.i4(i4 {{%.*}})
+; CHECK: [[T:%.*]] = and i4 [[X]], 1
+; CHECK: [[R:%.*]] = zext i4 [[T]] to i32
+; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
+define spir_kernel void @reduce_logical_xor(ptr addrspace(1) %in, ptr addrspace(1) %out) {  ; out[sub_group_id] = sub_group_reduce_logical_xor(low bit of in[global_id])
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv()
+  %conv = zext i32 %call1 to i64
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %1 = trunc i32 %0 to i1
+  %call2 = tail call spir_func i1 @_Z28sub_group_reduce_logical_xorb(i1 %1)
+  %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %conv
+  %zext = zext i1 %call2 to i32
+  store i32 %zext, ptr addrspace(1) %arrayidx3, align 4
+  ret void
+}
+
+!opencl.ocl.version = !{!0}  ; named metadata referring to the version node below
+
+!0 = !{i32 3, i32 0}  ; version 3.0, presumably encoded as (major, minor)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_scans.ll
new file mode 100644
index 0000000000000..917d6628d3a7e
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_scans.ll
@@ -0,0 +1,162 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: sed 's/VERSION/i32 1, i32 2/g' %s | %veczc -w 4 -S -vecz-passes=packetizer | %filecheck %s --check-prefixes CHECK,CHECK-12
+; RUN: sed 's/VERSION/i32 3, i32 0/g' %s | %veczc -w 4 -S -vecz-passes=packetizer | %filecheck %s --check-prefixes CHECK,CHECK-30
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare spir_func i64 @_Z13get_global_idj(i32)  ; get_global_id(unsigned int)
+
+declare spir_func i32 @_Z28sub_group_scan_inclusive_addi(i32)  ; sub_group_scan_inclusive_add(int)
+declare spir_func i64 @_Z28sub_group_scan_inclusive_addl(i64)  ; sub_group_scan_inclusive_add(long)
+declare spir_func float @_Z28sub_group_scan_inclusive_addf(float)  ; sub_group_scan_inclusive_add(float)
+
+declare spir_func i32 @_Z28sub_group_scan_inclusive_mini(i32)  ; sub_group_scan_inclusive_min(int)
+declare spir_func i32 @_Z28sub_group_scan_inclusive_minj(i32)  ; sub_group_scan_inclusive_min(unsigned int)
+declare spir_func i32 @_Z28sub_group_scan_inclusive_maxi(i32)  ; sub_group_scan_inclusive_max(int)
+declare spir_func i32 @_Z28sub_group_scan_inclusive_maxj(i32)  ; sub_group_scan_inclusive_max(unsigned int)
+declare spir_func float @_Z28sub_group_scan_inclusive_minf(float)  ; sub_group_scan_inclusive_min(float)
+declare spir_func float @_Z28sub_group_scan_inclusive_maxf(float)  ; sub_group_scan_inclusive_max(float)
+
+define spir_kernel void @reduce_scan_incl_add_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {  ; out[global_id] = sub_group_scan_inclusive_add(in[global_id])
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_addi(i32 %0)
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
+  store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_add_i32(
+; CHECK-30: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_add_Dv4_j(<4 x i32> %{{.*}})
+; Obviously this codegen doesn't make sense for a real sub-group scan, but in
+; CL1.2 this isn't identified as one. Check instead that the call has been instantiated.
+; CHECK-12: %{{.*}} = call spir_func i32 @_Z28sub_group_scan_inclusive_addi(i32 %{{.*}})
+; CHECK-12: %{{.*}} = call spir_func i32 @_Z28sub_group_scan_inclusive_addi(i32 %{{.*}})
+; CHECK-12: %{{.*}} = call spir_func i32 @_Z28sub_group_scan_inclusive_addi(i32 %{{.*}})
+; CHECK-12: %{{.*}} = call spir_func i32 @_Z28sub_group_scan_inclusive_addi(i32 %{{.*}})
+
+}
+
+define spir_kernel void @reduce_scan_incl_add_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) {  ; out[global_id] = sub_group_scan_inclusive_add(in[global_id]) on i64
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds i64, ptr addrspace(1) %in, i64 %call
+  %0 = load i64, ptr addrspace(1) %arrayidx, align 4
+  %call1 = tail call spir_func i64 @_Z28sub_group_scan_inclusive_addl(i64 %0)
+  %arrayidx2 = getelementptr inbounds i64, ptr addrspace(1) %out, i64 %call
+  store i64 %call1, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_add_i64(
+; CHECK-30: call <4 x i64> @__vecz_b_sub_group_scan_inclusive_add_Dv4_m(<4 x i64> %{{.*}})
+}
+
+define spir_kernel void @reduce_scan_incl_add_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) {  ; out[global_id] = sub_group_scan_inclusive_add(in[global_id]) on float
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds float, ptr addrspace(1) %in, i64 %call
+  %0 = load float, ptr addrspace(1) %arrayidx, align 4
+  %call1 = tail call spir_func float @_Z28sub_group_scan_inclusive_addf(float %0)
+  %arrayidx2 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %call
+  store float %call1, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_add_f32(
+; CHECK-30: call <4 x float> @__vecz_b_sub_group_scan_inclusive_add_Dv4_f(<4 x float> %{{.*}})
+}
+
+define spir_kernel void @reduce_scan_incl_smin_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {  ; out[global_id] = sub_group_scan_inclusive_min(in[global_id]), signed int
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_mini(i32 %0)
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
+  store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_smin_i32(
+; CHECK-30: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_Dv4_i(<4 x i32> %{{.*}})
+}
+
+define spir_kernel void @reduce_scan_incl_umin_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {  ; out[global_id] = sub_group_scan_inclusive_min(in[global_id]), unsigned int
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_minj(i32 %0)
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
+  store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_umin_i32(
+; CHECK-30: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_umin_Dv4_j(<4 x i32> %{{.*}})
+}
+
+define spir_kernel void @reduce_scan_incl_smax_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {  ; out[global_id] = sub_group_scan_inclusive_max(in[global_id]), signed int
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_maxi(i32 %0)
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
+  store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_smax_i32(
+; CHECK-30: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_smax_Dv4_i(<4 x i32> %{{.*}})
+}
+
+define spir_kernel void @reduce_scan_incl_umax_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {  ; out[global_id] = sub_group_scan_inclusive_max(in[global_id]), unsigned int
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_maxj(i32 %0)
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
+  store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_umax_i32(
+; CHECK-30: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_umax_Dv4_j(<4 x i32> %{{.*}})
+}
+
+define spir_kernel void @reduce_scan_incl_fmin_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) {  ; out[global_id] = sub_group_scan_inclusive_min(in[global_id]) on float
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds float, ptr addrspace(1) %in, i64 %call
+  %0 = load float, ptr addrspace(1) %arrayidx, align 4
+  %call1 = tail call spir_func float @_Z28sub_group_scan_inclusive_minf(float %0)
+  %arrayidx2 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %call
+  store float %call1, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_fmin_f32(
+; CHECK-30: call <4 x float> @__vecz_b_sub_group_scan_inclusive_min_Dv4_f(<4 x float> %{{.*}})
+}
+
+define spir_kernel void @reduce_scan_incl_fmax_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) {  ; out[global_id] = sub_group_scan_inclusive_max(in[global_id]) on float
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds float, ptr addrspace(1) %in, i64 %call
+  %0 = load float, ptr addrspace(1) %arrayidx, align 4
+  %call1 = tail call spir_func float @_Z28sub_group_scan_inclusive_maxf(float %0)
+  %arrayidx2 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %call
+  store float %call1, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_fmax_f32(
+; CHECK-30: call <4 x float> @__vecz_b_sub_group_scan_inclusive_max_Dv4_f(<4 x float> %{{.*}})
+}
+
+!opencl.ocl.version = !{!0}  ; named metadata referring to the version node below
+
+!0 = !{VERSION}  ; placeholder: the sed commands in the RUN lines substitute the version operands
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_scans_spv_khr_uniform_group_instructions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_scans_spv_khr_uniform_group_instructions.ll
new file mode 100644
index 0000000000000..c62fa5afbc9db
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_scans_spv_khr_uniform_group_instructions.ll
@@ -0,0 +1,175 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -w 4 -S -vecz-passes=packetizer < %s | %filecheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+declare spir_func i32 @_Z28sub_group_scan_inclusive_muli(i32)
+declare spir_func float @_Z28sub_group_scan_inclusive_mulf(float)
+
+declare spir_func i32 @_Z28sub_group_scan_exclusive_muli(i32)
+declare spir_func float @_Z28sub_group_scan_exclusive_mulf(float)
+
+declare spir_func i32 @_Z28sub_group_scan_inclusive_andi(i32)
+declare spir_func i32 @_Z27sub_group_scan_inclusive_ori(i32)
+declare spir_func i32 @_Z28sub_group_scan_inclusive_xori(i32)
+declare spir_func i1 @_Z36sub_group_scan_inclusive_logical_andb(i1)
+declare spir_func i1 @_Z35sub_group_scan_inclusive_logical_orb(i1)
+declare spir_func i1 @_Z36sub_group_scan_inclusive_logical_xorb(i1)
+
+; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_mul_i32(
+; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_mul_Dv4_j(<4 x i32> %{{.*}})
+define spir_kernel void @reduce_scan_incl_mul_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; kernel: out[gid] = sub_group_scan_inclusive_mul(in[gid]) on i32
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call ; &in[gid]
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_muli(i32 %0) ; scalar builtin call under test
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call ; &out[gid]
+  store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_v4_reduce_scan_excl_mul_i32(
+; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_exclusive_mul_Dv4_j(<4 x i32> %{{.*}})
+define spir_kernel void @reduce_scan_excl_mul_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; kernel: out[gid] = sub_group_scan_exclusive_mul(in[gid]) on i32
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call ; &in[gid]
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %call1 = tail call spir_func i32 @_Z28sub_group_scan_exclusive_muli(i32 %0) ; scalar builtin call under test
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call ; &out[gid]
+  store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_mul_f32(
+; CHECK: call <4 x float> @__vecz_b_sub_group_scan_inclusive_mul_Dv4_f(<4 x float> %{{.*}})
+define spir_kernel void @reduce_scan_incl_mul_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; kernel: out[gid] = sub_group_scan_inclusive_mul(in[gid]) on float
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %arrayidx = getelementptr inbounds float, ptr addrspace(1) %in, i64 %call ; &in[gid]
+  %0 = load float, ptr addrspace(1) %arrayidx, align 4
+  %call1 = tail call spir_func float @_Z28sub_group_scan_inclusive_mulf(float %0) ; scalar builtin call under test
+  %arrayidx2 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %call ; &out[gid]
+  store float %call1, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_v4_reduce_scan_excl_mul_f32(
+; CHECK: call <4 x float> @__vecz_b_sub_group_scan_exclusive_mul_Dv4_f(<4 x float> %{{.*}})
+define spir_kernel void @reduce_scan_excl_mul_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; kernel: out[gid] = sub_group_scan_exclusive_mul(in[gid]) on float
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %arrayidx = getelementptr inbounds float, ptr addrspace(1) %in, i64 %call ; &in[gid]
+  %0 = load float, ptr addrspace(1) %arrayidx, align 4
+  %call1 = tail call spir_func float @_Z28sub_group_scan_exclusive_mulf(float %0) ; scalar builtin call under test
+  %arrayidx2 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %call ; &out[gid]
+  store float %call1, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_and_i32(
+; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_and_Dv4_j(<4 x i32> %{{.*}})
+define spir_kernel void @reduce_scan_incl_and_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; kernel: out[gid] = sub_group_scan_inclusive_and(in[gid]) on i32
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call ; &in[gid]
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_andi(i32 %0) ; scalar builtin call under test
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call ; &out[gid]
+  store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_or_i32(
+; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_or_Dv4_j(<4 x i32> %{{.*}})
+define spir_kernel void @reduce_scan_incl_or_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; kernel: out[gid] = sub_group_scan_inclusive_or(in[gid]) on i32
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call ; &in[gid]
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %call1 = tail call spir_func i32 @_Z27sub_group_scan_inclusive_ori(i32 %0) ; scalar builtin call under test
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call ; &out[gid]
+  store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_xor_i32(
+; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_xor_Dv4_j(<4 x i32> %{{.*}})
+define spir_kernel void @reduce_scan_incl_xor_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; kernel: out[gid] = sub_group_scan_inclusive_xor(in[gid]) on i32
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call ; &in[gid]
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_xori(i32 %0) ; scalar builtin call under test
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call ; &out[gid]
+  store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_logical_and(
+; CHECK: call <4 x i1> @__vecz_b_sub_group_scan_inclusive_and_Dv4_b(<4 x i1> %{{.*}})
+define spir_kernel void @reduce_scan_incl_logical_and(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; kernel: out[gid] = zext(sub_group_scan_inclusive_logical_and(trunc(in[gid])))
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call ; &in[gid]
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %1 = trunc i32 %0 to i1 ; narrow the loaded i32 to the builtin's i1 operand
+  %call1 = tail call spir_func i1 @_Z36sub_group_scan_inclusive_logical_andb(i1 %1) ; scalar builtin call under test
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call ; &out[gid]
+  %2 = zext i1 %call1 to i32 ; widen the i1 result for the i32 store
+  store i32 %2, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_logical_or(
+; CHECK: call <4 x i1> @__vecz_b_sub_group_scan_inclusive_or_Dv4_b(<4 x i1> %{{.*}})
+define spir_kernel void @reduce_scan_incl_logical_or(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; kernel: out[gid] = zext(sub_group_scan_inclusive_logical_or(trunc(in[gid])))
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call ; &in[gid]
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %1 = trunc i32 %0 to i1 ; narrow the loaded i32 to the builtin's i1 operand
+  %call1 = tail call spir_func i1 @_Z35sub_group_scan_inclusive_logical_orb(i1 %1) ; scalar builtin call under test
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call ; &out[gid]
+  %2 = zext i1 %call1 to i32 ; widen the i1 result for the i32 store
+  store i32 %2, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_logical_xor(
+; CHECK: call <4 x i1> @__vecz_b_sub_group_scan_inclusive_xor_Dv4_b(<4 x i1> %{{.*}})
+define spir_kernel void @reduce_scan_incl_logical_xor(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; kernel: out[gid] = zext(sub_group_scan_inclusive_logical_xor(trunc(in[gid])))
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call ; &in[gid]
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %1 = trunc i32 %0 to i1 ; narrow the loaded i32 to the builtin's i1 operand
+  %call1 = tail call spir_func i1 @_Z36sub_group_scan_inclusive_logical_xorb(i1 %1) ; scalar builtin call under test
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call ; &out[gid]
+  %2 = zext i1 %call1 to i32 ; widen the i1 result for the i32 store
+  store i32 %2, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+!opencl.ocl.version = !{!0}
+
+!0 = !{i32 3, i32 0} ; presumably {major, minor}, i.e. OpenCL 3.0 - confirm
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_different_strides.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_different_strides.ll
new file mode 100644
index 0000000000000..91416479c68cd
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_different_strides.ll
@@ -0,0 +1,54 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_ternary -vecz-passes=ternary-transform -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test_ternary(i64 %a, i64 %b, i64* %c) { ; selects between &c[gid] and &c[2*gid], then stores through a GEP on the selected pointer
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %gid_shift = shl i64 %gid, 1 ; gid * 2
+  %cond = icmp eq i64 %a, %gid ; depends on gid, so it differs between work-items
+  %c0 = getelementptr i64, i64* %c, i64 %gid ; &c[gid]
+  store i64 %b, i64* %c0, align 4
+  %c1 = getelementptr i64, i64* %c, i64 %gid_shift ; &c[2 * gid]
+  store i64 0, i64* %c1, align 4
+  %c2 = select i1 %cond, i64* %c0, i64* %c1 ; pointer select the pass is expected to rewrite
+  %c3 = getelementptr i64, i64* %c2, i64 %gid ; GEP through the select
+  store i64 1, i64* %c3, align 4 ; expected to be replaced by two masked stores
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; This checks that the ternary transform is applied when the source GEPs have
+; constant strides, even though they are different.
+
+; CHECK: define spir_kernel void @__vecz_v4_test_ternary(i64 %a, i64 %b, ptr %c)
+; CHECK: %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: %gid_shift = shl i64 %gid, 1
+; CHECK: %cond = icmp eq i64 %a, %gid
+; CHECK: %c0 = getelementptr i64, ptr %c, i64 %gid
+; CHECK: store i64 %b, ptr %c0, align 4
+; CHECK: %c1 = getelementptr i64, ptr %c, i64 %gid_shift
+; CHECK: store i64 0, ptr %c1, align 4
+; CHECK: %[[XOR:.+]] = xor i1 %cond, true
+; CHECK: %[[GEP1:.+]] = getelementptr i64, ptr %c0, i64 %gid
+; CHECK: %[[GEP2:.+]] = getelementptr i64, ptr %c1, i64 %gid
+; CHECK: call void @__vecz_b_masked_store4_mu3ptrb(i64 1, ptr %[[GEP1]], i1 %cond)
+; CHECK: call void @__vecz_b_masked_store4_mu3ptrb(i64 1, ptr %[[GEP2]], i1 %[[XOR]])
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_divergent_gep.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_divergent_gep.ll
new file mode 100644
index 0000000000000..87f5b5ff759ee
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_divergent_gep.ll
@@ -0,0 +1,54 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_ternary -vecz-passes=ternary-transform -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test_ternary(i64 %a, i64 %b, i64* %c) { ; selects between &c[gid] and &c[gid+16], then indexes the result with a xor-scrambled gid
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %gid_offset = add i64 %gid, 16
+  %gid_mashed = xor i64 %gid, 12462 ; the divergent index referred to by the test description
+  %cond = icmp eq i64 %a, %gid ; depends on gid, so it differs between work-items
+  %c0 = getelementptr i64, i64* %c, i64 %gid ; &c[gid]
+  store i64 %b, i64* %c0, align 4
+  %c1 = getelementptr i64, i64* %c, i64 %gid_offset ; &c[gid + 16]
+  store i64 0, i64* %c1, align 4
+  %c2 = select i1 %cond, i64* %c0, i64* %c1 ; expected to survive the pass unchanged
+  %c3 = getelementptr i64, i64* %c2, i64 %gid_mashed ; GEP through the select using the scrambled index
+  store i64 1, i64* %c3, align 4 ; expected to remain a plain store
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; This checks that the ternary transform pass is not applied when the GEP index
+; is divergent, which would result in a scatter store regardless.
+
+; CHECK: define spir_kernel void @__vecz_v4_test_ternary(i64 %a, i64 %b, ptr %c)
+; CHECK: %gid_offset = add i64 %gid, 16
+; CHECK: %gid_mashed = xor i64 %gid, 12462
+; CHECK: %cond = icmp eq i64 %a, %gid
+; CHECK: %c0 = getelementptr i64, ptr %c, i64 %gid
+; CHECK: store i64 %b, ptr %c0, align 4
+; CHECK: %c1 = getelementptr i64, ptr %c, i64 %gid_offset
+; CHECK: store i64 0, ptr %c1, align 4
+; CHECK: %c2 = select i1 %cond, ptr %c0, ptr %c1
+; CHECK: %c3 = getelementptr i64, ptr %c2, i64 %gid_mashed
+; CHECK: store i64 1, ptr %c3, align 4
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_divergent_source.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_divergent_source.ll
new file mode 100644
index 0000000000000..43d427df52ec5
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_divergent_source.ll
@@ -0,0 +1,54 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_ternary -vecz-passes=ternary-transform -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test_ternary(i64 %a, i64 %b, i64* %c) { ; selects between &c[gid] and a GEP indexed by a xor-scrambled gid, then indexes the result by gid
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %gid_offset = add i64 %gid, 16 ; not used by any later instruction in this kernel
+  %gid_mashed = xor i64 %gid, 12462 ; index that makes the %c1 source GEP divergent (per the test description)
+  %cond = icmp eq i64 %a, %gid ; depends on gid, so it differs between work-items
+  %c0 = getelementptr i64, i64* %c, i64 %gid ; &c[gid]
+  store i64 %b, i64* %c0, align 4
+  %c1 = getelementptr i64, i64* %c, i64 %gid_mashed ; source GEP using the scrambled index
+  store i64 0, i64* %c1, align 4
+  %c2 = select i1 %cond, i64* %c0, i64* %c1 ; expected to survive the pass unchanged
+  %c3 = getelementptr i64, i64* %c2, i64 %gid ; GEP through the select
+  store i64 1, i64* %c3, align 4 ; expected to remain a plain store
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; This checks that the ternary transform pass is not applied when a source GEP
+; is divergent, which would result in a scatter store regardless.
+
+; CHECK: define spir_kernel void @__vecz_v4_test_ternary(i64 %a, i64 %b, ptr %c)
+; CHECK: %gid_offset = add i64 %gid, 16
+; CHECK: %gid_mashed = xor i64 %gid, 12462
+; CHECK: %cond = icmp eq i64 %a, %gid
+; CHECK: %c0 = getelementptr i64, ptr %c, i64 %gid
+; CHECK: store i64 %b, ptr %c0, align 4
+; CHECK: %c1 = getelementptr i64, ptr %c, i64 %gid_mashed
+; CHECK: store i64 0, ptr %c1, align 4
+; CHECK: %c2 = select i1 %cond, ptr %c0, ptr %c1
+; CHECK: %c3 = getelementptr i64, ptr %c2, i64 %gid
+; CHECK: store i64 1, ptr %c3, align 4
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_negative.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_negative.ll
new file mode 100644
index 0000000000000..3841b6fb6b3a4
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_negative.ll
@@ -0,0 +1,44 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_negative -vecz-passes=ternary-transform -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test_negative(i64 %a, i64 %b, i64* %c) { ; stores directly through a pointer select, with no GEP applied to the select result
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %cond = icmp eq i64 %a, %gid ; depends on gid, so it differs between work-items
+  %c0 = getelementptr i64, i64* %c, i64 %gid ; &c[gid]
+  %c1 = getelementptr i64, i64* %c, i64 0 ; &c[0]
+  %c2 = select i1 %cond, i64* %c0, i64* %c1 ; expected to survive the pass unchanged
+  store i64 %b, i64* %c2, align 4 ; store uses the select directly, not a GEP on it
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; This checks that the ternary transform is not applied when the select is not
+; accessed through an additional GEP.
+
+; CHECK: define spir_kernel void @__vecz_v4_test_negative(i64 %a, i64 %b, ptr %c)
+; CHECK: %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: %cond = icmp eq i64 %a, %gid
+; CHECK: %c0 = getelementptr i64, ptr %c, i64 %gid
+; CHECK: %c1 = getelementptr i64, ptr %c, i64 0
+; CHECK: %c2 = select i1 %cond, ptr %c0, ptr %c1
+; CHECK: store i64 %b, ptr %c2, align 4
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_positive.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_positive.ll
new file mode 100644
index 0000000000000..9a5c22fd24859
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_positive.ll
@@ -0,0 +1,54 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_ternary -vecz-passes=ternary-transform -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test_ternary(i64 %a, i64 %b, i64* %c) { ; selects between &c[gid] and &c[gid+16], then stores through a GEP on the selected pointer
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %gid_offset = add i64 %gid, 16
+  %cond = icmp eq i64 %a, %gid ; depends on gid, so it differs between work-items
+  %c0 = getelementptr i64, i64* %c, i64 %gid ; &c[gid]
+  store i64 %b, i64* %c0, align 4
+  %c1 = getelementptr i64, i64* %c, i64 %gid_offset ; &c[gid + 16]
+  store i64 0, i64* %c1, align 4
+  %c2 = select i1 %cond, i64* %c0, i64* %c1 ; pointer select the pass is expected to rewrite
+  %c3 = getelementptr i64, i64* %c2, i64 %gid ; GEP through the select
+  store i64 1, i64* %c3, align 4 ; expected to be replaced by two masked stores
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; This checks that the ternary transform is applied when the source GEPs have
+; equal constant strides.
+
+; CHECK: define spir_kernel void @__vecz_v4_test_ternary(i64 %a, i64 %b, ptr %c)
+; CHECK: %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: %gid_offset = add i64 %gid, 16
+; CHECK: %cond = icmp eq i64 %a, %gid
+; CHECK: %c0 = getelementptr i64, ptr %c, i64 %gid
+; CHECK: store i64 %b, ptr %c0, align 4
+; CHECK: %c1 = getelementptr i64, ptr %c, i64 %gid_offset
+; CHECK: store i64 0, ptr %c1, align 4
+; CHECK: %[[XOR:.+]] = xor i1 %cond, true
+; CHECK: %[[GEP1:.+]] = getelementptr i64, ptr %c0, i64 %gid
+; CHECK: %[[GEP2:.+]] = getelementptr i64, ptr %c1, i64 %gid
+; CHECK: call void @__vecz_b_masked_store4_mu3ptrb(i64 1, ptr %[[GEP1]], i1 %cond)
+; CHECK: call void @__vecz_b_masked_store4_mu3ptrb(i64 1, ptr %[[GEP2]], i1 %[[XOR]])
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_cond_diff_strides.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_cond_diff_strides.ll
new file mode 100644
index 0000000000000..84620148e2478
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_cond_diff_strides.ll
@@ -0,0 +1,54 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_ternary -vecz-passes=ternary-transform -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test_ternary(i64 %a, i64 %b, i64* %c) { ; like the different-strides case, but the select condition does not depend on gid
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %gid_shift = shl i64 %gid, 1 ; gid * 2
+  %cond = icmp eq i64 %a, 0 ; compares a kernel argument with a constant: no gid dependence
+  %c0 = getelementptr i64, i64* %c, i64 %gid ; &c[gid]
+  store i64 %b, i64* %c0, align 4
+  %c1 = getelementptr i64, i64* %c, i64 %gid_shift ; &c[2 * gid]
+  store i64 0, i64* %c1, align 4
+  %c2 = select i1 %cond, i64* %c0, i64* %c1 ; pointer select the pass is expected to rewrite
+  %c3 = getelementptr i64, i64* %c2, i64 %gid ; GEP through the select
+  store i64 1, i64* %c3, align 4 ; expected to be replaced by two masked stores
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; This checks that the ternary transform is applied when the condition is
+; uniform, and the source GEPs have different constant strides.
+
+; CHECK: define spir_kernel void @__vecz_v4_test_ternary(i64 %a, i64 %b, ptr %c)
+; CHECK: %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: %gid_shift = shl i64 %gid, 1
+; CHECK: %cond = icmp eq i64 %a, 0
+; CHECK: %c0 = getelementptr i64, ptr %c, i64 %gid
+; CHECK: store i64 %b, ptr %c0, align 4
+; CHECK: %c1 = getelementptr i64, ptr %c, i64 %gid_shift
+; CHECK: store i64 0, ptr %c1, align 4
+; CHECK: %[[XOR:.+]] = xor i1 %cond, true
+; CHECK: %[[GEP1:.+]] = getelementptr i64, ptr %c0, i64 %gid
+; CHECK: %[[GEP2:.+]] = getelementptr i64, ptr %c1, i64 %gid
+; CHECK: call void @__vecz_b_masked_store4_mu3ptrb(i64 1, ptr %[[GEP1]], i1 %cond)
+; CHECK: call void @__vecz_b_masked_store4_mu3ptrb(i64 1, ptr %[[GEP2]], i1 %[[XOR]])
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_condition.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_condition.ll
new file mode 100644
index 0000000000000..4e8f64ad12793
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_condition.ll
@@ -0,0 +1,52 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_ternary -vecz-passes=ternary-transform -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test_ternary(i64 %a, i64 %b, i64* %c) { ; gid-independent condition selecting between &c[gid] and &c[gid+16]
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %gid_offset = add i64 %gid, 16
+  %cond = icmp eq i64 %a, 0 ; compares a kernel argument with a constant: no gid dependence
+  %c0 = getelementptr i64, i64* %c, i64 %gid ; &c[gid]
+  store i64 %b, i64* %c0, align 4
+  %c1 = getelementptr i64, i64* %c, i64 %gid_offset ; &c[gid + 16]
+  store i64 0, i64* %c1, align 4
+  %c2 = select i1 %cond, i64* %c0, i64* %c1 ; expected to survive the pass unchanged
+  %c3 = getelementptr i64, i64* %c2, i64 %gid ; GEP through the select
+  store i64 1, i64* %c3, align 4 ; expected to remain a plain store
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; This checks that the ternary transform is not applied when the condition is
+; uniform, and the two strides are the same.
+
+; CHECK: define spir_kernel void @__vecz_v4_test_ternary(i64 %a, i64 %b, ptr %c)
+; CHECK: %gid_offset = add i64 %gid, 16
+; CHECK: %cond = icmp eq i64 %a, 0
+; CHECK: %c0 = getelementptr i64, ptr %c, i64 %gid
+; CHECK: store i64 %b, ptr %c0, align 4
+; CHECK: %c1 = getelementptr i64, ptr %c, i64 %gid_offset
+; CHECK: store i64 0, ptr %c1, align 4
+; CHECK: %c2 = select i1 %cond, ptr %c0, ptr %c1
+; CHECK: %c3 = getelementptr i64, ptr %c2, i64 %gid
+; CHECK: store i64 1, ptr %c3, align 4
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_condition_packetized.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_condition_packetized.ll
new file mode 100644
index 0000000000000..dfe67263c6480
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_condition_packetized.ll
@@ -0,0 +1,46 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_ternary -vecz-passes=ternary-transform,packetizer -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test_ternary(i64 %a, i64 %b, i64* %c) { ; gid-independent condition; the GEP on the select uses constant index 0; run through ternary-transform then packetizer
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %gid_offset = add i64 %gid, 16
+  %cond = icmp eq i64 %a, 0 ; compares a kernel argument with a constant: no gid dependence
+  %c0 = getelementptr i64, i64* %c, i64 %gid ; &c[gid]
+  store i64 %b, i64* %c0, align 4
+  %c1 = getelementptr i64, i64* %c, i64 %gid_offset ; &c[gid + 16]
+  store i64 0, i64* %c1, align 4
+  %c2 = select i1 %cond, i64* %c0, i64* %c1 ; expected to still be a select after both passes
+  %c3 = getelementptr i64, i64* %c2, i64 0 ; constant zero index on the selected pointer
+  store i64 1, i64* %c3, align 4 ; expected to become a single <4 x i64> store
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; This checks that the ternary transform is not applied when the condition is
+; uniform and the two strides are equal, and that the result is a contiguous
+; vector store.
+
+; CHECK: %[[SELECT:.+]] = select i1 %cond, ptr %c0, ptr %c1
+; CHECK: %[[BASE:.+]] = getelementptr i64, ptr %[[SELECT]], i64 0
+; CHECK: store <4 x i64> <i64 1, i64 1, i64 1, i64 1>, ptr %[[BASE]], align 4
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_source.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_source.ll
new file mode 100644
index 0000000000000..e46aa62838945
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_source.ll
@@ -0,0 +1,52 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_ternary -vecz-passes=ternary-transform -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test_ternary(i64 %a, i64 %b, i64* %c) { ; selects between &c[gid] and the gid-independent &c[0], then stores through a GEP on the result
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0) ; global id, dimension 0
+  %cond = icmp eq i64 %a, %gid ; depends on gid, so it differs between work-items
+  %c0 = getelementptr i64, i64* %c, i64 %gid ; &c[gid]
+  store i64 %b, i64* %c0, align 4
+  %c1 = getelementptr i64, i64* %c, i64 0 ; &c[0], the same address for every work-item
+  store i64 0, i64* %c1, align 4
+  %c2 = select i1 %cond, i64* %c0, i64* %c1 ; pointer select the pass is expected to rewrite
+  %c3 = getelementptr i64, i64* %c2, i64 %gid ; GEP through the select
+  store i64 1, i64* %c3, align 4 ; expected to be replaced by two masked stores
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; This checks that the ternary transform is applied when one of the source GEPs
+; is uniform.
+
+; CHECK: define spir_kernel void @__vecz_v4_test_ternary(i64 %a, i64 %b, ptr %c)
+; CHECK: %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: %cond = icmp eq i64 %a, %gid
+; CHECK: %c0 = getelementptr i64, ptr %c, i64 %gid
+; CHECK: store i64 %b, ptr %c0, align 4
+; CHECK: %c1 = getelementptr i64, ptr %c, i64 0
+; CHECK: store i64 0, ptr %c1, align 4
+; CHECK: %[[XOR:.+]] = xor i1 %cond, true
+; CHECK: %[[GEP1:.+]] = getelementptr i64, ptr %c0, i64 %gid
+; CHECK: %[[GEP2:.+]] = getelementptr i64, ptr %c1, i64 %gid
+; CHECK: call void @__vecz_b_masked_store4_mu3ptrb(i64 1, ptr %[[GEP1]], i1 %cond)
+; CHECK: call void @__vecz_b_masked_store4_mu3ptrb(i64 1, ptr %[[GEP2]], i1 %[[XOR]])
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_sources.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_sources.ll
new file mode 100644
index 0000000000000..bac1dd59268ec
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_sources.ll
@@ -0,0 +1,52 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_ternary -vecz-passes=ternary-transform -vecz-simd-width=4 -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test_ternary(i64 %a, i64 %b, i64* %c) {
+entry:
+  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %cond = icmp eq i64 %a, %gid ; condition depends on the global id
+  %c0 = getelementptr i64, i64* %c, i64 1 ; source GEP with constant index 1
+  store i64 %b, i64* %c0, align 4
+  %c1 = getelementptr i64, i64* %c, i64 0 ; source GEP with constant index 0
+  store i64 0, i64* %c1, align 4
+  %c2 = select i1 %cond, i64* %c0, i64* %c1 ; pointer select targeted by the ternary transform
+  %c3 = getelementptr i64, i64* %c2, i64 %gid
+  store i64 1, i64* %c3, align 4 ; expected output: one masked store per select operand
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; This checks that the ternary transform is applied when the source GEPs are
+; both uniform.
+
+; CHECK: define spir_kernel void @__vecz_v4_test_ternary(i64 %a, i64 %b, ptr %c)
+; CHECK: %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: %cond = icmp eq i64 %a, %gid
+; CHECK: %c0 = getelementptr i64, ptr %c, i64 1
+; CHECK: store i64 %b, ptr %c0, align 4
+; CHECK: %c1 = getelementptr i64, ptr %c, i64 0
+; CHECK: store i64 0, ptr %c1, align 4
+; CHECK: %[[XOR:.+]] = xor i1 %cond, true
+; CHECK: %[[GEP1:.+]] = getelementptr i64, ptr %c0, i64 %gid
+; CHECK: %[[GEP2:.+]] = getelementptr i64, ptr %c1, i64 %gid
+; CHECK: call void @__vecz_b_masked_store4_mu3ptrb(i64 1, ptr %[[GEP1]], i1 %cond)
+; CHECK: call void @__vecz_b_masked_store4_mu3ptrb(i64 1, ptr %[[GEP2]], i1 %[[XOR]])
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_packetization.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_packetization.ll
new file mode 100644
index 0000000000000..f411cacc3328a
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_packetization.ll
@@ -0,0 +1,117 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; REQUIRES: linux
+; RUN: %veczc -k add -vecz-simd-width=128 -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @add(i32 addrspace(1)* %in1, i32 addrspace(1)* %in2, i32 addrspace(1)* %out) #0 !dbg !4 {
+entry:
+  %in1.addr = alloca i32 addrspace(1)*, align 8
+  %in2.addr = alloca i32 addrspace(1)*, align 8
+  %out.addr = alloca i32 addrspace(1)*, align 8
+  %tid = alloca i64, align 8 ; stack slot that receives the global id
+  %a = alloca i32, align 4
+  %b = alloca i32, align 4
+  store i32 addrspace(1)* %in1, i32 addrspace(1)** %in1.addr, align 8
+  call void @llvm.dbg.declare(metadata i32 addrspace(1)** %in1.addr, metadata !11, metadata !29), !dbg !30
+  store i32 addrspace(1)* %in2, i32 addrspace(1)** %in2.addr, align 8
+  call void @llvm.dbg.declare(metadata i32 addrspace(1)** %in2.addr, metadata !12, metadata !29), !dbg !30
+  store i32 addrspace(1)* %out, i32 addrspace(1)** %out.addr, align 8
+  call void @llvm.dbg.declare(metadata i32 addrspace(1)** %out.addr, metadata !13, metadata !29), !dbg !30
+  call void @llvm.dbg.declare(metadata i64* %tid, metadata !14, metadata !29), !dbg !31
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #3, !dbg !31
+  store i64 %call, i64* %tid, align 8, !dbg !31
+  call void @llvm.dbg.declare(metadata i32* %a, metadata !19, metadata !29), !dbg !32
+  %0 = load i64, i64* %tid, align 8, !dbg !32
+  %1 = load i32 addrspace(1)*, i32 addrspace(1)** %in1.addr, align 8, !dbg !32
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %1, i64 %0, !dbg !32
+  %2 = load i32, i32 addrspace(1)* %arrayidx, align 4, !dbg !32
+  store i32 %2, i32* %a, align 4, !dbg !32 ; a = in1[tid]
+  call void @llvm.dbg.declare(metadata i32* %b, metadata !20, metadata !29), !dbg !33
+  %3 = load i64, i64* %tid, align 8, !dbg !33
+  %4 = load i32 addrspace(1)*, i32 addrspace(1)** %in2.addr, align 8, !dbg !33
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %4, i64 %3, !dbg !33
+  %5 = load i32, i32 addrspace(1)* %arrayidx1, align 4, !dbg !33
+  store i32 %5, i32* %b, align 4, !dbg !33 ; b = in2[tid]
+  %6 = load i32, i32* %a, align 4, !dbg !34
+  %7 = load i32, i32* %b, align 4, !dbg !34
+  %add = add nsw i32 %6, %7, !dbg !34 ; a + b, stored to out[tid] below
+  %8 = load i64, i64* %tid, align 8, !dbg !34
+  %9 = load i32 addrspace(1)*, i32 addrspace(1)** %out.addr, align 8, !dbg !34
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %9, i64 %8, !dbg !34
+  store i32 %add, i32 addrspace(1)* %arrayidx2, align 4, !dbg !34
+  ret void, !dbg !35
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+declare spir_func i64 @_Z13get_global_idj(i32) #2
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nobuiltin }
+
+!llvm.dbg.cu = !{!0}
+!opencl.kernels = !{!21}
+!llvm.module.flags = !{!27}
+!llvm.ident = !{!28}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.8.0 ", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2)
+!1 = !DIFile(filename: "<stdin>", directory: "/tmp")
+!2 = !{}
+!3 = !{!4}
+!4 = distinct !DISubprogram(name: "add", scope: !5, file: !5, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !10)
+!5 = !DIFile(filename: "kernel.opencl", directory: "/tmp")
+!6 = !DISubroutineType(types: !7)
+!7 = !{null, !8, !8, !8}
+!8 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !9, size: 64, align: 64)
+!9 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+!10 = !{!11, !12, !13, !14, !19, !20}
+!11 = !DILocalVariable(name: "in1", arg: 1, scope: !4, file: !5, line: 1, type: !8)
+!12 = !DILocalVariable(name: "in2", arg: 2, scope: !4, file: !5, line: 1, type: !8)
+!13 = !DILocalVariable(name: "out", arg: 3, scope: !4, file: !5, line: 1, type: !8)
+!14 = !DILocalVariable(name: "tid", scope: !4, file: !5, line: 3, type: !15)
+!15 = !DIDerivedType(tag: DW_TAG_typedef, name: "size_t", file: !16, line: 33, baseType: !17)
+!16 = !DIFile(filename: "/Aorta/OCL/modules/builtins/include/builtins/builtins.h", directory: "/tmp")
+!17 = !DIDerivedType(tag: DW_TAG_typedef, name: "ulong", file: !16, line: 31, baseType: !18)
+!18 = !DIBasicType(name: "long unsigned int", size: 64, align: 64, encoding: DW_ATE_unsigned)
+!19 = !DILocalVariable(name: "a", scope: !4, file: !5, line: 5, type: !9)
+!20 = !DILocalVariable(name: "b", scope: !4, file: !5, line: 6, type: !9)
+!21 = !{void (i32 addrspace(1)*, i32 addrspace(1)*, i32 addrspace(1)*)* @add, !22, !23, !24, !25, !26}
+!22 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1}
+!23 = !{!"kernel_arg_access_qual", !"none", !"none", !"none"}
+!24 = !{!"kernel_arg_type", !"int*", !"int*", !"int*"}
+!25 = !{!"kernel_arg_base_type", !"int*", !"int*", !"int*"}
+!26 = !{!"kernel_arg_type_qual", !"", !"", !""}
+!27 = !{i32 2, !"Debug Info Version", i32 3}
+!28 = !{!"clang version 3.8.0 "}
+!29 = !DIExpression()
+!30 = !DILocation(line: 1, scope: !4)
+!31 = !DILocation(line: 3, scope: !4)
+!32 = !DILocation(line: 5, scope: !4)
+!33 = !DILocation(line: 6, scope: !4)
+!34 = !DILocation(line: 7, scope: !4)
+!35 = !DILocation(line: 8, scope: !4)
+
+; We do not expect this test to succeed
+; XFAIL: *
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_scalarization.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_scalarization.ll
new file mode 100644
index 0000000000000..1aa122a00703a
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_scalarization.ll
@@ -0,0 +1,41 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; REQUIRES: linux
+; RUN: %veczc -k add -vecz-simd-width=4 -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Kernel under test: element-wise add of <128 x i32> vectors indexed by the global id.
+define spir_kernel void @add(<128 x i32>* %in1, <128 x i32>* %in2, <128 x i32>* %out) {
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %in1p = getelementptr inbounds <128 x i32>, <128 x i32>* %in1, i64 %call
+  %in1v = load <128 x i32>, <128 x i32>* %in1p, align 4
+  %in2p = getelementptr inbounds <128 x i32>, <128 x i32>* %in2, i64 %call
+  %in2v = load <128 x i32>, <128 x i32>* %in2p, align 4
+  %add = add nsw <128 x i32> %in1v, %in2v ; 128-lane vector add, already wide before vectorization
+  %outp = getelementptr inbounds <128 x i32>, <128 x i32>* %out, i64 %call
+  store <128 x i32> %add, <128 x i32>* %outp, align 4
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32) #2
+
+; We do not expect this test to succeed
+; XFAIL: *
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/undef_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/undef_debug_info.ll
new file mode 100644
index 0000000000000..bf906b4421d2c
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/undef_debug_info.ll
@@ -0,0 +1,120 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; Check that debug info intrinsics aren't created using undef values.
+; These cause the backend to assert in codegen.
+
+; RUN: %veczc -k test_fn -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @test_fn(i16 addrspace(1)* %src, <4 x i16> addrspace(1)* %dst) #0 !dbg !4 {
+entry:
+  %src.addr = alloca i16 addrspace(1)*, align 8
+  %dst.addr = alloca <4 x i16> addrspace(1)*, align 8
+  %tid = alloca i32, align 4
+  %tmp = alloca <4 x i16>, align 8
+  store i16 addrspace(1)* %src, i16 addrspace(1)** %src.addr, align 8
+  call void @llvm.dbg.declare(metadata i16 addrspace(1)** %src.addr, metadata !18, metadata !32), !dbg !33
+  store <4 x i16> addrspace(1)* %dst, <4 x i16> addrspace(1)** %dst.addr, align 8
+  call void @llvm.dbg.declare(metadata <4 x i16> addrspace(1)** %dst.addr, metadata !19, metadata !32), !dbg !33
+  call void @llvm.dbg.declare(metadata i32* %tid, metadata !20, metadata !32), !dbg !34
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #3, !dbg !34
+  %conv = trunc i64 %call to i32, !dbg !34 ; global id narrowed to the 32-bit %tid slot
+  store i32 %conv, i32* %tid, align 4, !dbg !34
+  call void @llvm.dbg.declare(metadata <4 x i16>* %tmp, metadata !22, metadata !32), !dbg !35
+  %0 = load i32, i32* %tid, align 4, !dbg !35
+  %conv1 = sext i32 %0 to i64, !dbg !35
+  %1 = load i16 addrspace(1)*, i16 addrspace(1)** %src.addr, align 8, !dbg !35
+  %call2 = call spir_func <3 x i16> @_Z6vload3mPKU3AS1t(i64 %conv1, i16 addrspace(1)* %1) #3, !dbg !35
+  %call3 = call spir_func <4 x i16> @_Z9as_short4Dv3_t(<3 x i16> %call2) #3, !dbg !35
+  store <4 x i16> %call3, <4 x i16>* %tmp, align 8, !dbg !35 ; tmp holds the <4 x i16> built from the <3 x i16> load
+  %2 = load <4 x i16>, <4 x i16>* %tmp, align 8, !dbg !36
+  %3 = load i32, i32* %tid, align 4, !dbg !36
+  %idxprom = sext i32 %3 to i64, !dbg !36
+  %4 = load <4 x i16> addrspace(1)*, <4 x i16> addrspace(1)** %dst.addr, align 8, !dbg !36
+  %arrayidx = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %4, i64 %idxprom, !dbg !36
+  store <4 x i16> %2, <4 x i16> addrspace(1)* %arrayidx, align 8, !dbg !36
+  ret void, !dbg !37
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+declare spir_func i64 @_Z13get_global_idj(i32) #2
+
+declare spir_func <4 x i16> @_Z9as_short4Dv3_t(<3 x i16>) #2
+
+declare spir_func <3 x i16> @_Z6vload3mPKU3AS1t(i64, i16 addrspace(1)*) #2
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nobuiltin }
+
+!llvm.dbg.cu = !{!0}
+!opencl.kernels = !{!23}
+!llvm.module.flags = !{!30}
+!llvm.ident = !{!31}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.8.1 ", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2)
+!1 = !DIFile(filename: "kernel.opencl", directory: "/tmp")
+!2 = !{}
+!3 = !{!4}
+!4 = distinct !DISubprogram(name: "test_fn", scope: !1, file: !1, line: 2, type: !5, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !17)
+!5 = !DISubroutineType(types: !6)
+!6 = !{null, !7, !11}
+!7 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !8, size: 64, align: 64)
+!8 = !DIDerivedType(tag: DW_TAG_typedef, name: "ushort", file: !9, line: 29, baseType: !10)
+!9 = !DIFile(filename: "builtins/include/builtins/builtins.h", directory: "/tmp")
+!10 = !DIBasicType(name: "unsigned short", size: 16, align: 16, encoding: DW_ATE_unsigned)
+!11 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !12, size: 64, align: 64)
+!12 = !DIDerivedType(tag: DW_TAG_typedef, name: "short4", file: !9, line: 55, baseType: !13)
+!13 = !DICompositeType(tag: DW_TAG_array_type, baseType: !14, size: 64, align: 64, flags: DIFlagVector, elements: !15)
+!14 = !DIBasicType(name: "short", size: 16, align: 16, encoding: DW_ATE_signed)
+!15 = !{!16}
+!16 = !DISubrange(count: 4)
+!17 = !{!18, !19, !20, !22}
+!18 = !DILocalVariable(name: "src", arg: 1, scope: !4, file: !1, line: 2, type: !7)
+!19 = !DILocalVariable(name: "dst", arg: 2, scope: !4, file: !1, line: 2, type: !11)
+!20 = !DILocalVariable(name: "tid", scope: !4, file: !1, line: 4, type: !21)
+!21 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+!22 = !DILocalVariable(name: "tmp", scope: !4, file: !1, line: 5, type: !12)
+!23 = !{void (i16 addrspace(1)*, <4 x i16> addrspace(1)*)* @test_fn, !24, !25, !26, !27, !28, !29}
+!24 = !{!"kernel_arg_addr_space", i32 1, i32 1}
+!25 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!26 = !{!"kernel_arg_type", !"ushort*", !"short4*"}
+!27 = !{!"kernel_arg_base_type", !"ushort*", !"short __attribute__((ext_vector_type(4)))*"}
+!28 = !{!"kernel_arg_type_qual", !"", !""}
+!29 = !{!"reqd_work_group_size", i32 32, i32 1, i32 1}
+!30 = !{i32 2, !"Debug Info Version", i32 3}
+!31 = !{!"clang version 3.8.1 "}
+!32 = !DIExpression()
+!33 = !DILocation(line: 2, scope: !4)
+!34 = !DILocation(line: 4, scope: !4)
+!35 = !DILocation(line: 5, scope: !4)
+!36 = !DILocation(line: 6, scope: !4)
+!37 = !DILocation(line: 7, scope: !4)
+
+; Vectorized kernel function
+; CHECK: @__vecz_v[[WIDTH:[0-9]+]]_test_fn({{.*}} !dbg {{![0-9]+}}
+
+; Check that there are no intrinsics using undefs
+; CHECK-NOT: call void @llvm.dbg.value(metadata {{.*}} undef
+; CHECK-NOT: call void @llvm.dbg.declare(metadata {{.*}} undef
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/undef_ub.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/undef_ub.ll
new file mode 100644
index 0000000000000..2912620a122a6
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/undef_ub.ll
@@ -0,0 +1,47 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test -w 4 -S < %s | %filecheck %s
+
+; ModuleID = 'Unknown buffer'
+source_filename = "Unknown buffer"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind readonly
+declare spir_func i32 @_Z14get_local_sizej(i32) #2
+
+; Function Attrs: convergent nounwind readonly
+declare spir_func i32 @_Z12get_local_idj(i32) #2
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @test() #0 {
+entry:
+  %call8 = call spir_func i32 @_Z12get_local_idj(i32 0) #3
+  %arrayidx = getelementptr inbounds i8, i8 addrspace(1)* undef, i32 %call8 ; base pointer is undef
+  %0 = load i8, i8 addrspace(1)* %arrayidx, align 1 ; load through an undef-based address
+  %conv9 = uitofp i8 %0 to float
+  %phitmp = fptoui float %conv9 to i8
+  %arrayidx16 = getelementptr inbounds i8, i8 addrspace(1)* undef, i32 %call8 ; base pointer is undef
+  store i8 %phitmp, i8 addrspace(1)* %arrayidx16, align 1 ; store through an undef-based address
+  ret void
+}
+
+; The "undefs" in the above IR are undefined behaviour; older LLVM versions
+; "optimized" them to a trap call and an unreachable terminator instruction.
+; CHECK: define spir_kernel void @__vecz_v4_test
+; On LLVM 13+ there's no such trap: the UB is just that the function returns early.
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_base.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_base.ll
new file mode 100644
index 0000000000000..8e853bded41c3
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_base.ll
@@ -0,0 +1,56 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k uniform_address_index -w 4 -S < %s | %filecheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
+target triple = "spir-unknown-unknown"
+
+define spir_kernel void @uniform_address_index(i32 addrspace(1)* nocapture readonly %in, i32 addrspace(1)* nocapture %out, i32 %a, i32 %b) local_unnamed_addr #0 {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %0 = icmp eq i32 %a, -2147483648
+  %1 = icmp eq i32 %b, -1
+  %2 = and i1 %0, %1 ; true when %a is INT_MIN and %b is -1 (the overflowing sdiv case)
+  %3 = icmp eq i32 %b, 0
+  %4 = or i1 %3, %2 ; true when the division by %b would be undefined
+  %5 = select i1 %4, i32 1, i32 %b ; safe divisor: 1 replaces %b in the undefined cases
+  %div = sdiv i32 %a, %5 ; depends only on kernel arguments, not on the global id
+  %6 = trunc i64 %call to i32
+  %conv1 = add i32 %div, %6 ; index = argument-only quotient + global id
+  %idxprom = sext i32 %conv1 to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom
+  %7 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %7, i32 addrspace(1)* %arrayidx3, align 4
+  ret void
+}
+
+; Function Attrs: convergent nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) local_unnamed_addr #1
+
+; This test ensures that the array index is correctly identified as having
+; a uniform stride, so that plain vector loads and stores are generated
+; rather than gather/scatter builtin calls
+
+; CHECK: define spir_kernel void @__vecz_v4_uniform_address_index
+; CHECK: entry:
+; CHECK: call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK-DAG: %[[INA:.+]] = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %[[X:.+]]
+; CHECK-DAG: %[[LOAD:.+]] = load <4 x i32>, ptr addrspace(1) %[[INA]]
+; CHECK-DAG: %[[OUTA:.+]] = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %[[X:.+]]
+; CHECK-DAG: store <4 x i32> %[[LOAD]], ptr addrspace(1) %[[OUTA]]
+; CHECK-NOT: call <4 x i32>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_index.ll
new file mode 100644
index 0000000000000..8e853bded41c3
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_index.ll
@@ -0,0 +1,56 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k uniform_address_index -w 4 -S < %s | %filecheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
+target triple = "spir-unknown-unknown"
+
+define spir_kernel void @uniform_address_index(i32 addrspace(1)* nocapture readonly %in, i32 addrspace(1)* nocapture %out, i32 %a, i32 %b) local_unnamed_addr #0 {
+entry:
+  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %0 = icmp eq i32 %a, -2147483648
+  %1 = icmp eq i32 %b, -1
+  %2 = and i1 %0, %1 ; true when %a is INT_MIN and %b is -1 (the overflowing sdiv case)
+  %3 = icmp eq i32 %b, 0
+  %4 = or i1 %3, %2 ; true when the division by %b would be undefined
+  %5 = select i1 %4, i32 1, i32 %b ; safe divisor: 1 replaces %b in the undefined cases
+  %div = sdiv i32 %a, %5 ; depends only on kernel arguments, not on the global id
+  %6 = trunc i64 %call to i32
+  %conv1 = add i32 %div, %6 ; index = argument-only quotient + global id
+  %idxprom = sext i32 %conv1 to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom
+  %7 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %7, i32 addrspace(1)* %arrayidx3, align 4
+  ret void
+}
+
+; Function Attrs: convergent nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) local_unnamed_addr #1
+
+; This test ensures that the array index is correctly identified as having
+; a uniform stride, so that plain vector loads and stores are generated
+; rather than gather/scatter builtin calls
+
+; CHECK: define spir_kernel void @__vecz_v4_uniform_address_index
+; CHECK: entry:
+; CHECK: call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK-DAG: %[[INA:.+]] = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %[[X:.+]]
+; CHECK-DAG: %[[LOAD:.+]] = load <4 x i32>, ptr addrspace(1) %[[INA]]
+; CHECK-DAG: %[[OUTA:.+]] = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %[[X:.+]]
+; CHECK-DAG: store <4 x i32> %[[LOAD]], ptr addrspace(1) %[[OUTA]]
+; CHECK-NOT: call <4 x i32>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop.ll
new file mode 100644
index 0000000000000..fe624c9710b60
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop.ll
@@ -0,0 +1,45 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test -w 4 -S < %s | %filecheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
+target triple = "spir-unknown-unknown"
+
+declare spir_func i32 @get_local_size(i32);
+
+define spir_kernel void @test(i32 addrspace(1)* %in) {
+entry:
+  %size = call i32 @get_local_size(i32 0) ; NOTE(review): call omits the spir_func convention of the declaration - confirm intended
+  br label %loop
+
+loop:
+  %index = phi i32 [0, %entry], [%inc, %loop] ; loop counter starting at 0
+  %load = load i32, i32 addrspace(1)* %in ; address is the kernel argument itself
+  %slot = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %index ; address depends only on the loop counter
+  store i32 %load, i32 addrspace(1)* %slot
+  %inc = add i32 %index, 1
+  %cmp = icmp ne i32 %inc, %size ; keep looping until the counter reaches %size
+  br i1 %cmp, label %loop, label %merge
+
+merge:
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_test
+; CHECK: loop:
+; CHECK: %load = load i32, ptr addrspace(1) %in
+; CHECK: store i32 %load, ptr addrspace(1) %slot
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi1.ll
new file mode 100644
index 0000000000000..227ad0a41eaa8
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi1.ll
@@ -0,0 +1,49 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test -w 4 -S < %s | %filecheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
+target triple = "spir-unknown-unknown"
+
+define spir_kernel void @test(i32 addrspace(1)* %in) {
+entry:
+  %id = call spir_func i64 @_Z13get_global_idj(i64 0) #2
+  %init_addr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id ; starting address indexed by the global id
+  %load = load i32, i32 addrspace(1)* %init_addr
+  br label %loop
+
+loop:
+  %index = phi i64 [0, %entry], [%inc, %loop]
+  %slot = phi i32 addrspace(1)* [%init_addr, %entry], [%inc_addr, %loop] ; pointer carried around the loop
+  store i32 %load, i32 addrspace(1)* %slot
+  %inc_addr = getelementptr inbounds i32, i32 addrspace(1)* %slot, i64 16 ; advance by a constant 16 elements
+  %inc = add i64 %index, 1
+  %cmp = icmp ne i64 %inc, 16 ; exit once 16 iterations have run
+  br i1 %cmp, label %loop, label %merge
+
+merge:
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i64)
+
+; It checks that the stride analysis can tell the store is contiguous through the PHI node.
+
+; CHECK: define spir_kernel void @__vecz_v4_test
+; CHECK: %[[LD:.+]] = load <4 x i32>, ptr addrspace(1) %init_addr
+; CHECK: loop:
+; CHECK: store <4 x i32> %[[LD]], ptr addrspace(1) %slot
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi2.ll
new file mode 100644
index 0000000000000..efb1cd8c26117
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi2.ll
@@ -0,0 +1,50 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test -w 4 -S < %s | %filecheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
+target triple = "spir-unknown-unknown"
+
+define spir_kernel void @test(i32 addrspace(1)* %in) {
+entry:
+  %id = call spir_func i64 @_Z13get_global_idj(i64 0) #2  ; get_global_id builtin, argument 0
+  %init_addr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id  ; &in[id]
+  %load = load i32, i32 addrspace(1)* %init_addr  ; read once before the loop; the same value is stored on every iteration
+  br label %loop
+
+loop:
+  %index = phi i64 [0, %entry], [%inc, %loop]  ; trip counter: 0 on entry, %inc on the back edge
+  %slot = phi i32 addrspace(1)* [%inc_addr, %loop], [%init_addr, %entry]  ; store address; the back-edge value is listed before the entry value
+  store i32 %load, i32 addrspace(1)* %slot
+  %inc_addr = getelementptr inbounds i32, i32 addrspace(1)* %slot, i64 16  ; advance the address by 16 i32 elements
+  %inc = add i64 %index, 1
+  %cmp = icmp ne i64 %inc, 16  ; keep looping until the counter reaches 16
+  br i1 %cmp, label %loop, label %merge
+
+merge:
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i64)
+
+; It checks that the stride analysis can tell the store is contiguous through the PHI node.
+; Same as uniform_loop_contiguous_phi1.ll except with the PHI node incoming values reversed.
+
+; CHECK: define spir_kernel void @__vecz_v4_test
+; CHECK: %[[LD:.+]] = load <4 x i32>, ptr addrspace(1) %init_addr
+; CHECK: loop:
+; CHECK: store <4 x i32> %[[LD]], ptr addrspace(1) %slot
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi3.ll
new file mode 100644
index 0000000000000..f7b64a8e24ced
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi3.ll
@@ -0,0 +1,51 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test -w 4 -S < %s | %filecheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
+target triple = "spir-unknown-unknown"
+
+define spir_kernel void @test(i32 addrspace(1)* %in) {
+entry:
+  %id = call spir_func i64 @_Z13get_global_idj(i64 0) #2  ; get_global_id builtin, argument 0
+  %init_addr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id  ; &in[id]
+  %load = load i32, i32 addrspace(1)* %init_addr  ; read once before the loop; the same value is stored on every iteration
+  br label %loop
+
+loop:
+  %count = phi i64 [0, %entry], [%inc, %loop]  ; trip counter: 0 on entry, %inc on the back edge
+  %index = phi i64 [%id, %entry], [%inc_index, %loop]  ; element index: starts at %id, then %inc_index
+  %slot = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %index  ; &in[index], recomputed inside the loop
+  store i32 %load, i32 addrspace(1)* %slot
+  %inc_index = add i64 %index, 16  ; advance the element index by 16
+  %inc = add i64 %count, 1
+  %cmp = icmp ne i64 %inc, 16  ; keep looping until the counter reaches 16
+  br i1 %cmp, label %loop, label %merge
+
+merge:
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i64)
+
+; It checks that the stride analysis can tell the store is contiguous through the PHI node.
+; Same as uniform_loop_contiguous_phi1.ll except with the index GEP inside the loop.
+
+; CHECK: define spir_kernel void @__vecz_v4_test
+; CHECK: %[[LD:.+]] = load <4 x i32>, ptr addrspace(1) %init_addr
+; CHECK: loop:
+; CHECK: store <4 x i32> %[[LD]], ptr addrspace(1) %slot
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi4.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi4.ll
new file mode 100644
index 0000000000000..5e2d194e69919
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi4.ll
@@ -0,0 +1,51 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test -w 4 -S < %s | %filecheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
+target triple = "spir-unknown-unknown"
+
+define spir_kernel void @test(i32 addrspace(1)* %in) {
+entry:
+  %id = call spir_func i64 @_Z13get_global_idj(i64 0) #2  ; get_global_id builtin, argument 0
+  %init_addr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id  ; &in[id]
+  %load = load i32, i32 addrspace(1)* %init_addr  ; read once before the loop; the same value is stored on every iteration
+  br label %loop
+
+loop:
+  %count = phi i64 [0, %entry], [%inc, %loop]  ; trip counter: 0 on entry, %inc on the back edge
+  %index = phi i64 [%inc_index, %loop], [%id, %entry]  ; element index; the back-edge value is listed before the entry value
+  %slot = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %index  ; &in[index], recomputed inside the loop
+  store i32 %load, i32 addrspace(1)* %slot
+  %inc_index = add i64 %index, 16  ; advance the element index by 16
+  %inc = add i64 %count, 1
+  %cmp = icmp ne i64 %inc, 16  ; keep looping until the counter reaches 16
+  br i1 %cmp, label %loop, label %merge
+
+merge:
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i64)
+
+; It checks that the stride analysis can tell the store is contiguous through the PHI node.
+; Same as uniform_loop_contiguous_phi3.ll except with the PHI node incoming values reversed.
+
+; CHECK: define spir_kernel void @__vecz_v4_test
+; CHECK: %[[LD:.+]] = load <4 x i32>, ptr addrspace(1) %init_addr
+; CHECK: loop:
+; CHECK: store <4 x i32> %[[LD]], ptr addrspace(1) %slot
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_metadata.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_metadata.ll
new file mode 100644
index 0000000000000..434cd9df3af19
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_metadata.ll
@@ -0,0 +1,50 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test -w 4 -S < %s | %filecheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
+target triple = "spir-unknown-unknown"
+
+declare spir_func i32 @get_local_size(i32);
+
+define spir_kernel void @test(i32 addrspace(1)* %in) {
+entry:
+  %size = call i32 @get_local_size(i32 0)  ; loop bound: result of get_local_size(0)
+  br label %loop
+
+loop:
+  %index = phi i32 [0, %entry], [%inc, %loop]  ; trip counter: 0 on entry, %inc on the back edge
+  %load = load i32, i32 addrspace(1)* %in  ; reloads in[0] on every iteration
+  %slot = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %index  ; &in[index]
+  store i32 %load, i32 addrspace(1)* %slot
+  %inc = add i32 %index, 1
+  %cmp = icmp ne i32 %inc, %size  ; leave the loop once the counter equals %size
+  br i1 %cmp, label %loop, label %merge
+
+merge:
+  ret void
+}
+
+; CHECK: define spir_kernel void @test(ptr addrspace(1) %in) !codeplay_ca_vecz.base !0
+; CHECK: entry:
+; CHECK: loop:
+; CHECK: define spir_kernel void @__vecz_v4_test(ptr addrspace(1) %in) #0 !codeplay_ca_vecz.derived !2
+; CHECK: entry:
+; CHECK: loop:
+; CHECK: !0 = !{!1, ptr @__vecz_v4_test}
+; CHECK: !1 = !{i32 4, i32 0, i32 0, i32 0}
+; CHECK: !2 = !{!1, ptr @test}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation1.ll
new file mode 100644
index 0000000000000..3ac51a2c18e02
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation1.ll
@@ -0,0 +1,58 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k uniform_reassociation -vecz-simd-width=4 -S < %s | %filecheck %s
+
+; ModuleID = 'Unknown buffer'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @uniform_reassociation(i32 addrspace(1)* noalias %a, i32 addrspace(1)* noalias %b, i32 addrspace(1)* noalias %d) #0 {
+entry:
+  %x = call spir_func i64 @_Z13get_global_idj(i32 0) #2  ; get_global_id builtin, argument 0
+  %y = call spir_func i64 @_Z13get_global_idj(i32 1) #2  ; get_global_id builtin, argument 1
+  %a_gep = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %x  ; &a[x]
+  %b_gep = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %y  ; &b[y]
+  %c_gep = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %y  ; &a[y]
+  %varying = load i32, i32 addrspace(1)* %a_gep  ; indexed by %x
+  %uniform1 = load i32, i32 addrspace(1)* %b_gep  ; indexed by %y
+  %uniform2 = load i32, i32 addrspace(1)* %c_gep  ; indexed by %y
+  %vu = add i32 %varying, %uniform1
+  %vuu = add i32 %vu, %uniform2  ; input association: (varying + uniform1) + uniform2
+  %d_gep = getelementptr inbounds i32, i32 addrspace(1)* %d, i64 %x  ; &d[x]
+  store i32 %vuu, i32 addrspace(1)* %d_gep
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; This test checks that a sum of a varying value with two uniform values
+; gets re-associated from (Varying + Uniform) + Uniform
+; to Varying + (Uniform + Uniform)
+; CHECK: define spir_kernel void @__vecz_v4_uniform_reassociation
+; CHECK: load
+
+; Ensure the two uniforms are added together directly
+; CHECK: %[[REASSOC:.+]] = add i32 %uniform1, %uniform2
+
+; Ensure there is only one vector splat
+; CHECK: %[[SPLATINS:.+]] = insertelement <4 x i32> {{undef|poison}}, i32 %[[REASSOC]], {{(i32|i64)}} 0
+; CHECK-NOT: insertelement <4 x i32> {{undef|poison}}, i32 %{{.+}}, {{(i32|i64)}} 0
+
+; CHECK: %[[SPLAT:.+]] = shufflevector <4 x i32> %[[SPLATINS]], <4 x i32> {{undef|poison}}, <4 x i32> zeroinitializer
+; CHECK: %[[RESULT:.+]] = add <4 x i32> %{{.*}}, %[[SPLAT]]
+; CHECK: store <4 x i32> %vuu{{.*}}, ptr addrspace(1) %{{.+}}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation2.ll
new file mode 100644
index 0000000000000..44d59f35f3dec
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation2.ll
@@ -0,0 +1,59 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k uniform_reassociation -vecz-simd-width=4 -S < %s | %filecheck %s
+
+; ModuleID = 'Unknown buffer'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @uniform_reassociation(i32 addrspace(1)* noalias %a, i32 addrspace(1)* noalias %b, i32 addrspace(1)* noalias %d) #0 {
+entry:
+  %x = call spir_func i64 @_Z13get_global_idj(i32 0) #2  ; get_global_id builtin, argument 0
+  %y = call spir_func i64 @_Z13get_global_idj(i32 1) #2  ; get_global_id builtin, argument 1
+  %a_gep = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %x  ; &a[x]
+  %b_gep = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %x  ; &b[x]
+  %c_gep = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %y  ; &a[y]
+  %varying1 = load i32, i32 addrspace(1)* %a_gep  ; indexed by %x
+  %varying2 = load i32, i32 addrspace(1)* %b_gep  ; indexed by %x
+  %uniform = load i32, i32 addrspace(1)* %c_gep  ; indexed by %y
+  %vu = add i32 %varying1, %uniform
+  %vvu = add i32 %vu, %varying2  ; input association: (varying1 + uniform) + varying2
+  %d_gep = getelementptr inbounds i32, i32 addrspace(1)* %d, i64 %x  ; &d[x]
+  store i32 %vvu, i32 addrspace(1)* %d_gep
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; This test checks that a sum of two varying values with one uniform value
+; gets re-associated from (Varying + Uniform) + Varying
+; to (Varying + Varying) + Uniform
+; CHECK: define spir_kernel void @__vecz_v4_uniform_reassociation
+
+; CHECK: %[[VARYING1:.+]] = load <4 x i32>
+; CHECK: %[[VARYING2:.+]] = load <4 x i32>
+
+; The splat of the uniform value
+; CHECK: %uniform = load
+; CHECK: %[[SPLATINS:.+]] = insertelement <4 x i32> {{undef|poison}}, i32 %uniform, {{(i32|i64)}} 0
+; CHECK: %[[SPLAT:.+]] = shufflevector <4 x i32> %[[SPLATINS]], <4 x i32> {{undef|poison}}, <4 x i32> zeroinitializer
+
+; Ensure the two varyings are added together directly
+; CHECK: %[[REASSOC:.+]] = add <4 x i32> %[[VARYING1]], %[[VARYING2]]
+; CHECK: %[[VVU:.+]] = add <4 x i32> %{{.*}}, %[[SPLAT]]
+; CHECK: store <4 x i32> %[[VVU]], ptr addrspace(1) %{{.+}}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation3.ll
new file mode 100644
index 0000000000000..538f5e9b8d229
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation3.ll
@@ -0,0 +1,59 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k uniform_reassociation -vecz-simd-width=4 -S < %s | %filecheck %s
+
+; ModuleID = 'Unknown buffer'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @uniform_reassociation(i32 addrspace(1)* noalias %a, i32 addrspace(1)* noalias %b, i32 addrspace(1)* noalias %d) #0 {
+entry:
+  %x = call spir_func i64 @_Z13get_global_idj(i32 0) #2  ; get_global_id builtin, argument 0
+  %y = call spir_func i64 @_Z13get_global_idj(i32 1) #2  ; get_global_id builtin, argument 1
+  %a_gep = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %x  ; &a[x]
+  %b_gep = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %x  ; &b[x]
+  %c_gep = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %y  ; &a[y]
+  %varying1 = load i32, i32 addrspace(1)* %a_gep  ; indexed by %x
+  %varying2 = load i32, i32 addrspace(1)* %b_gep  ; indexed by %x
+  %uniform = load i32, i32 addrspace(1)* %c_gep  ; indexed by %y
+  %vu = add i32 %varying1, %uniform
+  %vvu = add i32 %varying2, %vu  ; input association: varying2 + (varying1 + uniform)
+  %d_gep = getelementptr inbounds i32, i32 addrspace(1)* %d, i64 %x  ; &d[x]
+  store i32 %vvu, i32 addrspace(1)* %d_gep
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+; This test checks that a sum of two varying values with one uniform value
+; gets re-associated from Varying + (Varying + Uniform)
+; to (Varying + Varying) + Uniform
+; CHECK: define spir_kernel void @__vecz_v4_uniform_reassociation
+
+; CHECK: %[[VARYING1:.+]] = load <4 x i32>
+; CHECK: %[[VARYING2:.+]] = load <4 x i32>
+
+; The splat of the uniform value
+; CHECK: %uniform = load
+; CHECK: %[[SPLATINS:.+]] = insertelement <4 x i32> {{undef|poison}}, i32 %uniform, {{(i32|i64)}} 0
+; CHECK: %[[SPLAT:.+]] = shufflevector <4 x i32> %[[SPLATINS]], <4 x i32> {{undef|poison}}, <4 x i32> zeroinitializer
+
+; Ensure the two varyings are added together directly
+; CHECK: %[[REASSOC:.+]] = add <4 x i32> %[[VARYING1]], %[[VARYING2]]
+; CHECK: %[[VVU:.+]] = add <4 x i32> %{{.*}}, %[[SPLAT]]
+; CHECK: store <4 x i32> %[[VVU]], ptr addrspace(1) %{{.+}}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/unmangled_builtin_call.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/unmangled_builtin_call.ll
new file mode 100644
index 0000000000000..365ded52c7027
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/unmangled_builtin_call.ll
@@ -0,0 +1,67 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k k_controlflow_loop_if -S < %s | %filecheck %s
+
+; ModuleID = 'test.cl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind uwtable
+define void @k_controlflow_loop_if(float* nocapture %out, float* nocapture readonly %in1, i32* nocapture readnone %in2) #0 {
+entry:
+  %call = tail call i64 @get_global_id(i32 0) #2  ; call to the unmangled get_global_id declaration
+  %sext = shl i64 %call, 32
+  %idxprom = ashr exact i64 %sext, 32  ; shl then ashr by 32: sign-extends the low 32 bits of the id
+  %arrayidx = getelementptr inbounds float, float* %in1, i64 %idxprom  ; &in1[idxprom]
+  %0 = bitcast float* %arrayidx to i32*
+  %1 = load i32, i32* %0, align 4, !tbaa !7  ; the element is read as i32 through the bitcast pointer
+  %arrayidx2 = getelementptr inbounds float, float* %out, i64 %idxprom  ; &out[idxprom]
+  %2 = bitcast float* %arrayidx2 to i32*
+  store i32 %1, i32* %2, align 4, !tbaa !7  ; and written to out[idxprom] as i32
+  ret void
+}
+
+declare i64 @get_global_id(i32) #1
+
+attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nobuiltin nounwind }
+
+!opencl.kernels = !{!0}
+!llvm.ident = !{!6}
+
+!0 = !{void (float*, float*, i32*)* @k_controlflow_loop_if, !1, !2, !3, !4, !5}
+!1 = !{!"kernel_arg_addr_space", i32 0, i32 0, i32 0}
+!2 = !{!"kernel_arg_access_qual", !"none", !"none", !"none"}
+!3 = !{!"kernel_arg_type", !"float*", !"float*", !"int*"}
+!4 = !{!"kernel_arg_base_type", !"float*", !"float*", !"int*"}
+!5 = !{!"kernel_arg_type_qual", !"", !"", !""}
+!6 = !{!"clang version 3.8.0 "}
+!7 = !{!8, !8, i64 0}
+!8 = !{!"float", !9, i64 0}
+!9 = !{!"omnipotent char", !10, i64 0}
+!10 = !{!"Simple C/C++ TBAA"}
+
+; The vectorized function
+; CHECK: define void @__vecz_v[[WIDTH:[0-9]+]]_k_controlflow_loop_if(
+
+; The unmangled get_global_id call
+; CHECK: tail call i64 @get_global_id(i32 0)
+
+; The vectorized loads and stores
+; CHECK: load <4 x i32>, ptr %arrayidx, align 4
+; CHECK: store <4 x i32> %0, ptr %arrayidx2, align 4
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/user_calls.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/user_calls.ll
new file mode 100644
index 0000000000000..0bee5edba1984
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/user_calls.ll
@@ -0,0 +1,113 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k entry -w 2 -vecz-handle-declaration-only-calls -vecz-passes=cfg-convert,packetizer -S < %s | %filecheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
+target triple = "spir64-unknown-unknown"
+
+@.str.1 = private unnamed_addr addrspace(2) constant [10 x i8] c"Test %ld\0A\00", align 1
+@.str.2 = private unnamed_addr addrspace(2) constant [6 x i8] c"Test\0A\00", align 1
+
+define spir_kernel void @entry(i64* %input, i64* %output) {
+entry:
+  %gid = call i64 @_Z12get_local_idj(i32 0)  ; note: despite the name %gid, this is the get_local_id builtin
+  %i1ptr = getelementptr i64, i64* %output, i64 %gid  ; &output[gid]
+  call spir_func void @_Z9mem_fencej(i32 1)
+  %ii = call i64 @functionD(i64* %input)  ; functionD is defined in this file
+  %ib = trunc i64 %ii to i1
+  call void @functionA(i64* %i1ptr, i1 %ib)  ; declaration-only callee, called before any branch
+  %i1 = load i64, i64* %i1ptr
+  %i2ptr = getelementptr i64, i64* %input, i64 %gid  ; &input[gid]
+  %i2 = load i64, i64* %i2ptr
+  %cond = icmp eq i64 %i1, %i2  ; compares output[gid] with input[gid]
+  br i1 %cond, label %middle, label %end
+
+middle:
+  %ci3ptr = getelementptr i64, i64* %output, i64 %gid  ; &output[gid]
+  %ci3 = load i64, i64* %ci3ptr
+  %fc = call i64 @functionB(i64* %ci3ptr, i64 %ci3, i32 16, i1 false)  ; declaration-only callee, reached only when %cond is true
+  %call2 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([10 x i8], [10 x i8] addrspace(2)* @.str.1, i64 0, i64 0), i64 %ci3)
+  br label %end
+
+end:
+  %rr = phi i64 [42, %entry], [%fc, %middle]  ; 42 when %middle was skipped, otherwise functionB's result
+  call void @functionC(i64 %rr)
+  %nah = call i64 @functionB(i64* %i2ptr, i64 %rr, i32 8, i1 true)  ; second functionB call, executed on every path
+  %call3 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([6 x i8], [6 x i8] addrspace(2)* @.str.2, i64 0, i64 0))
+  ret void
+}
+
+declare void @functionA(i64*, i1)
+
+declare i64 @functionB(i64*, i64, i32, i1)
+
+declare void @functionC(i64)
+
+define i64 @functionD(i64* %input) {  ; has a body, unlike functionA/functionB/functionC which are only declared
+entry:
+  %r = load i64, i64* %input  ; returns the i64 stored at %input
+  ret i64 %r
+}
+
+declare spir_func void @_Z9mem_fencej(i32)
+
+declare extern_weak spir_func i32 @printf(i8 addrspace(2)*, ...)
+
+declare i64 @_Z12get_local_idj(i32)
+
+; CHECK: define spir_kernel void @__vecz_v[[WIDTH:[0-9]+]]_entry
+; CHECK: entry:
+; Check that we didn't mask the get_local_id call
+; CHECK: %gid = call i64 @_Z12get_local_idj(i32 0)
+; Check that we didn't mask the mem_fence call
+; CHECK: call spir_func void @_Z9mem_fencej(i32 1)
+; Check that we instantiated functionA without a mask
+; CHECK: call void @functionA(ptr {{.+}}, i1 %ib)
+; CHECK: call void @functionA(ptr {{.+}}, i1 %ib)
+
+; Get the condition -- Also works as a sanity check for this test
+; CHECK: [[COND:%cond.*]] = icmp eq <[[WIDTH]] x i64>
+
+; Check if we instantiated functionB with a mask
+; CHECK: [[COND1:%[0-9]+]] = extractelement <[[WIDTH]] x i1> [[COND]], {{(i32|i64)}} 0
+; CHECK: [[COND2:%[0-9]+]] = extractelement <[[WIDTH]] x i1> [[COND]], {{(i32|i64)}} 1
+; CHECK: {{.+}} = call i64 @__vecz_b_masked_functionB(ptr {{(nonnull )?}}{{%[0-9]+}}, i64 {{%[0-9]+}}, i32 16, i1 false, i1 [[COND1]])
+; CHECK: {{.+}} = call i64 @__vecz_b_masked_functionB(ptr {{(nonnull )?}}{{%[0-9]+}}, i64 {{%[0-9]+}}, i32 16, i1 false, i1 [[COND2]])
+; CHECK: call spir_func i32 @__vecz_b_masked_printf_u3ptrU3AS2mb(ptr addrspace(2) @.str.1, i64 {{%[0-9]+}}, i1 [[COND1]])
+; CHECK: call spir_func i32 @__vecz_b_masked_printf_u3ptrU3AS2mb(ptr addrspace(2) @.str.1, i64 {{%[0-9]+}}, i1 [[COND2]])
+
+; The following checks check the generated functionB masked function
+; CHECK: define private i64 @__vecz_b_masked_functionB(ptr{{( %0)?}}, i64{{( %1)?}}, i32{{( %2)?}}, i1{{( %3)?}}, i1{{( %4)?}}) {
+; CHECK: entry:
+; CHECK: br i1 %4, label %active, label %exit
+; CHECK: active:
+; CHECK: [[RES:%[0-9]+]] = call i64 @functionB(ptr {{(nonnull )?}}%0, i64 %1, i32 %2, i1 %3)
+; CHECK: br label %exit
+; CHECK: exit:
+; CHECK: [[RET:%[0-9]+]] = phi i64 [ [[RES]], %active ], [ 0, %entry ]
+; CHECK: ret i64 [[RET]]
+
+; The following checks check the generated printf masked function
+; CHECK: define private spir_func i32 @__vecz_b_masked_printf_u3ptrU3AS2mb(ptr addrspace(2){{( %0)?}}, i64{{( %1)?}}, i1{{( %2)?}}) {
+; CHECK: entry:
+; CHECK: br i1 %2, label %active, label %exit
+; CHECK: active:
+; CHECK: [[RES:%[0-9]+]] = call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) %0, i64 %1)
+; CHECK: br label %exit
+; CHECK: exit:
+; CHECK: [[RET:%[0-9]+]] = phi i32 [ [[RES]], %active ], [ 0, %entry ]
+; CHECK: ret i32 [[RET]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/varying_load1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/varying_load1.ll
new file mode 100644
index 0000000000000..b1cbef14bd152
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/varying_load1.ll
@@ -0,0 +1,86 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k varying_load1 -vecz-passes=cfg-convert -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+source_filename = "kernel.opencl"
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @varying_load1(i32 addrspace(1)* %out, i32 %n, i32 addrspace(1)* %meta) #0 {
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %conv = trunc i64 %call to i32  ; get_global_id result truncated to i32
+  %cmp = icmp slt i32 %conv, 11  ; branch condition derived from the work-item id
+  br i1 %cmp, label %if.then, label %if.end16
+
+if.then:                                          ; preds = %entry
+  %0 = load i32, i32 addrspace(1)* %meta, align 4  ; loads through the %meta argument directly, with no id-dependent index
+  %cmp2 = icmp eq i32 %0, 0  ; branch condition that uses the loaded value
+  br i1 %cmp2, label %if.then4, label %if.end
+
+if.then4:                                         ; preds = %if.then
+  %mul5 = mul nsw i32 %conv, %n
+  %1 = icmp eq i32 %mul5, -2147483648
+  %2 = icmp eq i32 %n, -1
+  %3 = and i1 %2, %1  ; true for the (minimum i32) / -1 case
+  %4 = icmp eq i32 %n, 0  ; true for a zero divisor
+  %5 = or i1 %4, %3
+  %6 = select i1 %5, i32 1, i32 %n  ; divisor is replaced by 1 in either case above
+  %div6 = sdiv i32 %mul5, %6
+  %add = add nsw i32 %div6, %conv
+  %shl7 = mul i32 %add, 8
+  %add8 = add nsw i32 %shl7, %mul5
+  %shl9 = shl i32 %add8, 3
+  br label %if.end
+
+if.end:                                           ; preds = %if.then4, %if.then
+  %sum.0 = phi i32 [ %shl9, %if.then4 ], [ %n, %if.then ]  ; %n when %if.then4 was skipped
+  %rem1 = and i32 %conv, 1
+  %cmp10 = icmp eq i32 %rem1, 0  ; true when the low bit of the id is clear
+  br i1 %cmp10, label %if.then12, label %if.end16
+
+if.then12:                                        ; preds = %if.end
+  %7 = load i32, i32 addrspace(1)* %meta, align 4  ; second load through %meta
+  %add13 = add nsw i32 %7, %n
+  %mul14 = mul nsw i32 %add13, %sum.0
+  br label %if.end16
+
+if.end16:                                         ; preds = %if.end, %if.then12, %entry
+  %ret.1 = phi i32 [ 0, %entry ], [ %mul14, %if.then12 ], [ 0, %if.end ]  ; 0 unless %if.then12 ran
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom  ; &out[conv]
+  store i32 %ret.1, i32 addrspace(1)* %arrayidx, align 4
+  ret void
+}
+
+; Function Attrs: convergent nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { convergent nobuiltin nounwind readonly }
+
+; The purpose of this test is to make sure that if a condition is a use of a
+; uniform load that is control dependent on a varying path, then the load will
+; be considered "mask varying" and so the condition is still uniform.
+
+; CHECK: spir_kernel void @__vecz_v4_varying_load1
+; CHECK: if.then:
+; CHECK: %{{.+}} = call i32 @__vecz_b_masked_load4
+; CHECK: br i1
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/varying_load2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/varying_load2.ll
new file mode 100644
index 0000000000000..7bd24a5255c5a
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/varying_load2.ll
@@ -0,0 +1,89 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k varying_load2 -vecz-passes=cfg-convert -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+source_filename = "kernel.opencl"
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @varying_load2(i32 addrspace(1)* %input, i32 addrspace(1)* %out) #0 {
+entry:
+  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0) #3
+  %call2 = call spir_func i64 @_Z12get_local_idj(i32 0) #3
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %input, i64 %call2
+  %cmp = icmp ne i64 %call2, 0
+  br i1 %cmp, label %for.cond.preheader, label %if.end14
+
+for.cond.preheader:                               ; preds = %entry
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.cond.preheader, %for.inc
+  %max.0 = phi i32 [ %max.1, %for.inc ], [ 0, %for.cond.preheader ]
+  %storemerge = phi i64 [ %inc, %for.inc ], [ 0, %for.cond.preheader ]
+  %call6 = call spir_func i64 @_Z14get_local_sizej(i32 0) #3
+  %cmp7 = icmp ult i64 %storemerge, %call6
+  br i1 %cmp7, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %load1 = load i32, i32 addrspace(1)* %input, align 4
+  %cmp9 = icmp ugt i32 %load1, %max.0
+  br i1 %cmp9, label %if.then, label %for.inc
+
+if.then:                                        ; preds = %for.body
+  %load2 = load i32, i32 addrspace(1)* %input, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.then, %for.body
+  %max.1 = phi i32 [ %load2, %if.then ], [ %max.0, %for.body ]
+  %inc = add i64 %storemerge, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %max.0.lcssa = phi i32 [ %max.0, %for.cond ]
+  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call1
+  store i32 %max.0.lcssa, i32 addrspace(1)* %arrayidx13, align 4
+  br label %if.end14
+
+if.end14:                                         ; preds = %for.end, %entry
+  ret void
+}
+
+; Function Attrs: convergent nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+; Function Attrs: convergent nounwind readonly
+declare spir_func i64 @_Z12get_local_idj(i32) #1
+; Function Attrs: convergent nounwind readonly
+declare spir_func i64 @_Z14get_local_sizej(i32) #1
+
+attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { convergent noduplicate "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { convergent nobuiltin nounwind readonly }
+attributes #4 = { nounwind }
+
+; The purpose of this test is to make sure that if a condition uses a uniform
+; load that is control dependent on a varying path, then the load is
+; considered "mask varying" and so the condition is still uniform.
+
+; CHECK: spir_kernel void @__vecz_v4_varying_load2
+; CHECK: for.body:
+; CHECK: %{{.+}} = call i32 @__vecz_b_masked_load4
+; CHECK: br i1
+; CHECK: if.then:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_intrinsics_scalarization.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_intrinsics_scalarization.ll
new file mode 100644
index 0000000000000..edc53c43d5ed1
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_intrinsics_scalarization.ll
@@ -0,0 +1,80 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; Test that vector fmuladd/fma intrinsic calls are scalarized (FullScalarization)
+; RUN: %veczc -vecz-passes=scalarize -vecz-choices=FullScalarization -S < %s | %filecheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @fmuladd(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e) {
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call
+  %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+  %arrayidx1 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call
+  %1 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx1, align 32
+  %arrayidx2 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %d, i64 %call
+  %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx2, align 32
+  %arrayidx3 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %e, i64 %call
+  %3 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx3, align 32
+  %div = fdiv <4 x double> %2, %3
+  %4 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %0, <4 x double> %1, <4 x double> %div)
+  %arrayidx4 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %a, i64 %call
+  %5 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx4, align 32
+  %sub = fsub <4 x double> %5, %4
+  store <4 x double> %sub, <4 x double> addrspace(1)* %arrayidx4, align 32
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v[[WIDTH:[0-9]+]]_fmuladd(
+; The scalarized kernel must call the scalar (f64) fmuladd intrinsic
+; CHECK: call double @llvm.fmuladd.f64(
+; No vector fmuladd call may remain; note the vector form returns <4 x double>
+; CHECK-NOT: call <4 x double> @llvm.fmuladd.v4f64(
+; CHECK: ret void
+
+define spir_kernel void @fma(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e) {
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call
+  %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+  %arrayidx1 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call
+  %1 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx1, align 32
+  %arrayidx2 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %d, i64 %call
+  %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx2, align 32
+  %arrayidx3 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %e, i64 %call
+  %3 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx3, align 32
+  %div = fdiv <4 x double> %2, %3
+  %4 = call <4 x double> @llvm.fma.v4f64(<4 x double> %0, <4 x double> %1, <4 x double> %div)
+  %arrayidx4 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %a, i64 %call
+  %5 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx4, align 32
+  %sub = fsub <4 x double> %5, %4
+  store <4 x double> %sub, <4 x double> addrspace(1)* %arrayidx4, align 32
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v[[WIDTH:[0-9]+]]_fma(
+; The scalarized kernel must call the scalar (f64) fma intrinsic
+; CHECK: call double @llvm.fma.f64(
+; No vector fma call may remain; note the vector form returns <4 x double>
+; CHECK-NOT: call <4 x double> @llvm.fma.v4f64(
+; CHECK: ret void
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>)
+declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_uniform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_uniform.ll
new file mode 100644
index 0000000000000..3440be62739c1
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_uniform.ll
@@ -0,0 +1,87 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k vector_loop -vecz-simd-width=4 -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @vector_loop(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %cmp = icmp eq i64 %call, 0
+  br i1 %cmp, label %for.end, label %for.cond
+
+for.cond:                                         ; preds = %entry, %for.body
+  %storemerge = phi <4 x i32> [ %inc, %for.body ], [ zeroinitializer, %entry ]
+  %call1 = call spir_func i64 @_Z15get_global_sizej(i32 0)
+  %conv = trunc i64 %call1 to i32
+  %splat.splatinsert = insertelement <4 x i32> undef, i32 %conv, i32 0
+  %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %cmp2 = icmp slt <4 x i32> %storemerge, %splat.splat
+  %0 = extractelement <4 x i1> %cmp2, i64 0
+  br i1 %0, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = extractelement <4 x i32> %storemerge, i64 0
+  %idxprom = sext i32 %1 to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom
+  %2 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %3 = extractelement <4 x i32> %storemerge, i64 0
+  %idxprom3 = sext i32 %3 to i64
+  %arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom3
+  store i32 %2, i32 addrspace(1)* %arrayidx4, align 4
+  %4 = extractelement <4 x i32> %storemerge, i64 1
+  %idxprom5 = sext i32 %4 to i64
+  %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom5
+  %5 = load i32, i32 addrspace(1)* %arrayidx6, align 4
+  %6 = extractelement <4 x i32> %storemerge, i64 1
+  %idxprom7 = sext i32 %6 to i64
+  %arrayidx8 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom7
+  store i32 %5, i32 addrspace(1)* %arrayidx8, align 4
+  %7 = extractelement <4 x i32> %storemerge, i64 2
+  %idxprom9 = sext i32 %7 to i64
+  %arrayidx10 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom9
+  %8 = load i32, i32 addrspace(1)* %arrayidx10, align 4
+  %9 = extractelement <4 x i32> %storemerge, i64 2
+  %idxprom11 = sext i32 %9 to i64
+  %arrayidx12 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom11
+  store i32 %8, i32 addrspace(1)* %arrayidx12, align 4
+  %10 = extractelement <4 x i32> %storemerge, i64 3
+  %idxprom13 = sext i32 %10 to i64
+  %arrayidx14 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom13
+  %11 = load i32, i32 addrspace(1)* %arrayidx14, align 4
+  %12 = extractelement <4 x i32> %storemerge, i64 3
+  %idxprom15 = sext i32 %12 to i64
+  %arrayidx16 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom15
+  store i32 %11, i32 addrspace(1)* %arrayidx16, align 4
+  %inc = add <4 x i32> %storemerge, <i32 1, i32 1, i32 1, i32 1>
+  br label %for.cond
+
+for.end:                                          ; preds = %entry, %for.cond
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func i64 @_Z15get_global_sizej(i32)
+
+; This test checks that a uniform <4 x i32> phi is not scalarized
+; CHECK: define spir_kernel void @__vecz_v4_vector_loop
+; CHECK: %[[STOREMERGE:.+]] = phi <4 x i32> [ %[[INC:.+]], %for.body ], [ zeroinitializer, %entry.ROSCC ]
+; CHECK: %[[INC]] = add <4 x i32> %[[STOREMERGE]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_varying.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_varying.ll
new file mode 100644
index 0000000000000..9cd71a06bcedb
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_varying.ll
@@ -0,0 +1,97 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k vector_loop -vecz-simd-width=4 -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @vector_loop(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call.trunc = trunc i64 %call to i32
+  %call.splatinsert = insertelement <4 x i32> undef, i32 %call.trunc, i32 0
+  %call.splat = shufflevector <4 x i32> %call.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %cmp = icmp eq i64 %call, 0
+  br i1 %cmp, label %for.end, label %for.cond
+
+for.cond:                                         ; preds = %entry, %for.body
+  %storemerge = phi <4 x i32> [ %inc, %for.body ], [ zeroinitializer, %entry ]
+  %call1 = call spir_func i64 @_Z15get_global_sizej(i32 0)
+  %conv = trunc i64 %call1 to i32
+  %splat.splatinsert = insertelement <4 x i32> undef, i32 %conv, i32 0
+  %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %cmp2 = icmp slt <4 x i32> %storemerge, %splat.splat
+  %0 = extractelement <4 x i1> %cmp2, i64 0
+  br i1 %0, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = extractelement <4 x i32> %storemerge, i64 0
+  %idxprom = sext i32 %1 to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom
+  %2 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %3 = extractelement <4 x i32> %storemerge, i64 0
+  %idxprom3 = sext i32 %3 to i64
+  %arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom3
+  store i32 %2, i32 addrspace(1)* %arrayidx4, align 4
+  %4 = extractelement <4 x i32> %storemerge, i64 1
+  %idxprom5 = sext i32 %4 to i64
+  %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom5
+  %5 = load i32, i32 addrspace(1)* %arrayidx6, align 4
+  %6 = extractelement <4 x i32> %storemerge, i64 1
+  %idxprom7 = sext i32 %6 to i64
+  %arrayidx8 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom7
+  store i32 %5, i32 addrspace(1)* %arrayidx8, align 4
+  %7 = extractelement <4 x i32> %storemerge, i64 2
+  %idxprom9 = sext i32 %7 to i64
+  %arrayidx10 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom9
+  %8 = load i32, i32 addrspace(1)* %arrayidx10, align 4
+  %9 = extractelement <4 x i32> %storemerge, i64 2
+  %idxprom11 = sext i32 %9 to i64
+  %arrayidx12 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom11
+  store i32 %8, i32 addrspace(1)* %arrayidx12, align 4
+  %10 = extractelement <4 x i32> %storemerge, i64 3
+  %idxprom13 = sext i32 %10 to i64
+  %arrayidx14 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom13
+  %11 = load i32, i32 addrspace(1)* %arrayidx14, align 4
+  %12 = extractelement <4 x i32> %storemerge, i64 3
+  %idxprom15 = sext i32 %12 to i64
+  %arrayidx16 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom15
+  store i32 %11, i32 addrspace(1)* %arrayidx16, align 4
+  %inc = add <4 x i32> %storemerge, %call.splat
+  br label %for.cond
+
+for.end:                                          ; preds = %entry, %for.cond
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func i64 @_Z15get_global_sizej(i32)
+
+; This test checks if a varying <4 x i32> phi is scalarized into 4 i32 phis
+; and then re-packetized
+; CHECK: define spir_kernel void @__vecz_v4_vector_loop
+; CHECK: %[[STOREMERGE1:.+]] = phi <4 x i32> [ zeroinitializer, %entry.ROSCC ], [ %[[INC2:.+]], %for.cond ]
+; CHECK: %[[STOREMERGE4:.+]] = phi <4 x i32> [ zeroinitializer, %entry.ROSCC ], [ %[[INC5:.+]], %for.cond ]
+; CHECK: %[[STOREMERGE6:.+]] = phi <4 x i32> [ zeroinitializer, %entry.ROSCC ], [ %[[INC7:.+]], %for.cond ]
+; CHECK: %[[STOREMERGE8:.+]] = phi <4 x i32> [ zeroinitializer, %entry.ROSCC ], [ %[[INC9:.+]], %for.cond ]
+; CHECK: %[[INC2]] = add <4 x i32> %[[STOREMERGE1]], [[CALL:.+]]
+; CHECK: %[[INC5]] = add <4 x i32> %[[STOREMERGE4]], [[CALL]]
+; CHECK: %[[INC7]] = add <4 x i32> %[[STOREMERGE6]], [[CALL]]
+; CHECK: %[[INC9]] = add <4 x i32> %[[STOREMERGE8]], [[CALL]]
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf.ll
new file mode 100644
index 0000000000000..5b7c052576409
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf.ll
@@ -0,0 +1,92 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+@.str = private unnamed_addr addrspace(2) constant [10 x i8] c"%#4v4hho\0A\00", align 1
+@.str32 = private unnamed_addr addrspace(2) constant [11 x i8] c"%#4v32hho\0A\00", align 1
+@.str64 = private unnamed_addr addrspace(2) constant [11 x i8] c"%#4v64hho\0A\00", align 1
+@.strfv = private unnamed_addr addrspace(2) constant [11 x i8] c"%#16v2hlA\0A\00", align 1
+
+; Function Attrs: nounwind
+define spir_kernel void @test(<4 x i8>* %out, <4 x i8>* %in1, <4 x i8>* %in2) {
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds <4 x i8>, <4 x i8>* %in1, i64 %call
+  %0 = load <4 x i8>, <4 x i8>* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds <4 x i8>, <4 x i8>* %in2, i64 %call
+  %1 = load <4 x i8>, <4 x i8>* %arrayidx1, align 4
+  %add = add <4 x i8> %1, %0
+  %arrayidx2 = getelementptr inbounds <4 x i8>, <4 x i8>* %out, i64 %call
+  store <4 x i8> %add, <4 x i8>* %arrayidx2, align 4
+  %call4 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([10 x i8], [10 x i8] addrspace(2)* @.str, i64 0, i64 0), <4 x i8> %add)
+  ret void
+}
+
+define spir_kernel void @test32(<32 x i8>* %out, <32 x i8>* %in1, <32 x i8>* %in2) {
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds <32 x i8>, <32 x i8>* %in1, i64 %call
+  %0 = load <32 x i8>, <32 x i8>* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds <32 x i8>, <32 x i8>* %in2, i64 %call
+  %1 = load <32 x i8>, <32 x i8>* %arrayidx1, align 4
+  %add = add <32 x i8> %1, %0
+  %arrayidx2 = getelementptr inbounds <32 x i8>, <32 x i8>* %out, i64 %call
+  store <32 x i8> %add, <32 x i8>* %arrayidx2, align 4
+  %call4 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(2)* @.str32, i64 0, i64 0), <32 x i8> %add)
+  ret void
+}
+
+define spir_kernel void @test64(<64 x i8>* %out, <64 x i8>* %in1, <64 x i8>* %in2) {
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds <64 x i8>, <64 x i8>* %in1, i64 %call
+  %0 = load <64 x i8>, <64 x i8>* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds <64 x i8>, <64 x i8>* %in2, i64 %call
+  %1 = load <64 x i8>, <64 x i8>* %arrayidx1, align 4
+  %add = add <64 x i8> %1, %0
+  %arrayidx2 = getelementptr inbounds <64 x i8>, <64 x i8>* %out, i64 %call
+  store <64 x i8> %add, <64 x i8>* %arrayidx2, align 4
+  %call4 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(2)* @.str64, i64 0, i64 0), <64 x i8> %add)
+  ret void
+}
+
+define spir_kernel void @test_float_vectors(<2 x float>* %in) {
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds <2 x float>, <2 x float>* %in, i64 %call
+  %0 = load <2 x float>, <2 x float>* %arrayidx, align 8
+  %mul = fmul <2 x float> %0, %0
+  %call8 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(2)* @.strfv, i64 0, i64 0), <2 x float> %mul)
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+declare extern_weak spir_func i32 @printf(i8 addrspace(2)*, ...)
+
+; CHECK: @[[STR:.+]] = private unnamed_addr addrspace(2) constant [29 x i8] c"%#4hho,%#4hho,%#4hho,%#4hho\0A\00"
+
+; CHECK: define spir_kernel void @__vecz_v4_test(
+; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}})
+; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}})
+; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}})
+; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}})
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf32.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf32.ll
new file mode 100644
index 0000000000000..6678a13d3a18f
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf32.ll
@@ -0,0 +1,92 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test32 -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+@.str = private unnamed_addr addrspace(2) constant [10 x i8] c"%#4v4hho\0A\00", align 1
+@.str32 = private unnamed_addr addrspace(2) constant [11 x i8] c"%#4v32hho\0A\00", align 1
+@.str64 = private unnamed_addr addrspace(2) constant [11 x i8] c"%#4v64hho\0A\00", align 1
+@.strfv = private unnamed_addr addrspace(2) constant [11 x i8] c"%#16v2hlA\0A\00", align 1
+
+; Function Attrs: nounwind
+define spir_kernel void @test(<4 x i8>* %out, <4 x i8>* %in1, <4 x i8>* %in2) {
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds <4 x i8>, <4 x i8>* %in1, i64 %call
+  %0 = load <4 x i8>, <4 x i8>* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds <4 x i8>, <4 x i8>* %in2, i64 %call
+  %1 = load <4 x i8>, <4 x i8>* %arrayidx1, align 4
+  %add = add <4 x i8> %1, %0
+  %arrayidx2 = getelementptr inbounds <4 x i8>, <4 x i8>* %out, i64 %call
+  store <4 x i8> %add, <4 x i8>* %arrayidx2, align 4
+  %call4 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([10 x i8], [10 x i8] addrspace(2)* @.str, i64 0, i64 0), <4 x i8> %add)
+  ret void
+}
+
+define spir_kernel void @test32(<32 x i8>* %out, <32 x i8>* %in1, <32 x i8>* %in2) {
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds <32 x i8>, <32 x i8>* %in1, i64 %call
+  %0 = load <32 x i8>, <32 x i8>* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds <32 x i8>, <32 x i8>* %in2, i64 %call
+  %1 = load <32 x i8>, <32 x i8>* %arrayidx1, align 4
+  %add = add <32 x i8> %1, %0
+  %arrayidx2 = getelementptr inbounds <32 x i8>, <32 x i8>* %out, i64 %call
+  store <32 x i8> %add, <32 x i8>* %arrayidx2, align 4
+  %call4 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(2)* @.str32, i64 0, i64 0), <32 x i8> %add)
+  ret void
+}
+
+define spir_kernel void @test64(<64 x i8>* %out, <64 x i8>* %in1, <64 x i8>* %in2) {
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds <64 x i8>, <64 x i8>* %in1, i64 %call
+  %0 = load <64 x i8>, <64 x i8>* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds <64 x i8>, <64 x i8>* %in2, i64 %call
+  %1 = load <64 x i8>, <64 x i8>* %arrayidx1, align 4
+  %add = add <64 x i8> %1, %0
+  %arrayidx2 = getelementptr inbounds <64 x i8>, <64 x i8>* %out, i64 %call
+  store <64 x i8> %add, <64 x i8>* %arrayidx2, align 4
+  %call4 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(2)* @.str64, i64 0, i64 0), <64 x i8> %add)
+  ret void
+}
+
+define spir_kernel void @test_float_vectors(<2 x float>* %in) {
+entry:
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %arrayidx = getelementptr inbounds <2 x float>, <2 x float>* %in, i64 %call
+  %0 = load <2 x float>, <2 x float>* %arrayidx, align 8
+  %mul = fmul <2 x float> %0, %0
+  %call8 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(2)* @.strfv, i64 0, i64 0), <2 x float> %mul)
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+declare extern_weak spir_func i32 @printf(i8 addrspace(2)*, ...)
+
+; CHECK: @[[STR:.+]] = private unnamed_addr addrspace(2) constant [225 x i8] c"%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho\0A\00"
+
+; CHECK: define spir_kernel void @__vecz_v4_test32(
+; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}})
+; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}})
+; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}})
+; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}})
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf64.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf64.ll
new file mode 100644
index 0000000000000..661c9da4c71a6
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf64.ll
@@ -0,0 +1,92 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test64 -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+@.str = private unnamed_addr addrspace(2) constant [10 x i8] c"%#4v4hho\0A\00", align 1
+@.str32 = private unnamed_addr addrspace(2) constant [11 x i8] c"%#4v32hho\0A\00", align 1
+@.str64 = private unnamed_addr addrspace(2) constant [11 x i8] c"%#4v64hho\0A\00", align 1
+@.strfv = private unnamed_addr addrspace(2) constant [11 x i8] c"%#16v2hlA\0A\00", align 1
+
+; Function Attrs: nounwind
+define spir_kernel void @test(<4 x i8>* %out, <4 x i8>* %in1, <4 x i8>* %in2) {
+entry:                                            ; out[gid] = in1[gid] + in2[gid], then printf the sum; not the kernel named by -k in this file's RUN line
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: Itanium-mangled get_global_id(uint), dimension 0
+  %arrayidx = getelementptr inbounds <4 x i8>, <4 x i8>* %in1, i64 %call
+  %0 = load <4 x i8>, <4 x i8>* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds <4 x i8>, <4 x i8>* %in2, i64 %call
+  %1 = load <4 x i8>, <4 x i8>* %arrayidx1, align 4
+  %add = add <4 x i8> %1, %0
+  %arrayidx2 = getelementptr inbounds <4 x i8>, <4 x i8>* %out, i64 %call
+  store <4 x i8> %add, <4 x i8>* %arrayidx2, align 4
+  %call4 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([10 x i8], [10 x i8] addrspace(2)* @.str, i64 0, i64 0), <4 x i8> %add)
+  ret void
+}
+
+define spir_kernel void @test32(<32 x i8>* %out, <32 x i8>* %in1, <32 x i8>* %in2) {
+entry:                                            ; <32 x i8> variant of @test; not the kernel named by -k in this file's RUN line
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: Itanium-mangled get_global_id(uint), dimension 0
+  %arrayidx = getelementptr inbounds <32 x i8>, <32 x i8>* %in1, i64 %call
+  %0 = load <32 x i8>, <32 x i8>* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds <32 x i8>, <32 x i8>* %in2, i64 %call
+  %1 = load <32 x i8>, <32 x i8>* %arrayidx1, align 4
+  %add = add <32 x i8> %1, %0
+  %arrayidx2 = getelementptr inbounds <32 x i8>, <32 x i8>* %out, i64 %call
+  store <32 x i8> %add, <32 x i8>* %arrayidx2, align 4
+  %call4 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(2)* @.str32, i64 0, i64 0), <32 x i8> %add)
+  ret void
+}
+
+define spir_kernel void @test64(<64 x i8>* %out, <64 x i8>* %in1, <64 x i8>* %in2) {
+entry:                                            ; kernel named by -k in this file's RUN line: out[gid] = in1[gid] + in2[gid] on <64 x i8>, then printf the sum
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: Itanium-mangled get_global_id(uint), dimension 0
+  %arrayidx = getelementptr inbounds <64 x i8>, <64 x i8>* %in1, i64 %call
+  %0 = load <64 x i8>, <64 x i8>* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds <64 x i8>, <64 x i8>* %in2, i64 %call
+  %1 = load <64 x i8>, <64 x i8>* %arrayidx1, align 4
+  %add = add <64 x i8> %1, %0
+  %arrayidx2 = getelementptr inbounds <64 x i8>, <64 x i8>* %out, i64 %call
+  store <64 x i8> %add, <64 x i8>* %arrayidx2, align 4
+  %call4 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(2)* @.str64, i64 0, i64 0), <64 x i8> %add)
+  ret void
+}
+
+define spir_kernel void @test_float_vectors(<2 x float>* %in) {
+entry:                                            ; prints in[gid] * in[gid] (a <2 x float>); not the kernel named by -k in this file's RUN line
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: Itanium-mangled get_global_id(uint), dimension 0
+  %arrayidx = getelementptr inbounds <2 x float>, <2 x float>* %in, i64 %call
+  %0 = load <2 x float>, <2 x float>* %arrayidx, align 8
+  %mul = fmul <2 x float> %0, %0
+  %call8 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(2)* @.strfv, i64 0, i64 0), <2 x float> %mul)
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+declare extern_weak spir_func i32 @printf(i8 addrspace(2)*, ...)
+
+; CHECK: @[[STR:.+]] = private unnamed_addr addrspace(2) constant [449 x i8] c"%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho\0A\00"
+
+; CHECK: define spir_kernel void @__vecz_v4_test64(ptr %out, ptr %in1, ptr %in2)
+; CHECK: %call465130 = call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}})
+; CHECK: %call465131 = call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}})
+; CHECK: %call465132 = call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}})
+; CHECK: %call465133 = call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}})
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_floats.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_floats.ll
new file mode 100644
index 0000000000000..f4061f3326c00
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_floats.ll
@@ -0,0 +1,102 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_float_vectors -vecz-simd-width=4 -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+@.str = private unnamed_addr addrspace(2) constant [10 x i8] c"%#4v4hho\0A\00", align 1
+@.str32 = private unnamed_addr addrspace(2) constant [11 x i8] c"%#4v32hho\0A\00", align 1
+@.str64 = private unnamed_addr addrspace(2) constant [11 x i8] c"%#4v64hho\0A\00", align 1
+@.strfv = private unnamed_addr addrspace(2) constant [11 x i8] c"%#16v2hlA\0A\00", align 1
+
+; Function Attrs: nounwind
+define spir_kernel void @test(<4 x i8>* %out, <4 x i8>* %in1, <4 x i8>* %in2) {
+entry:                                            ; out[gid] = in1[gid] + in2[gid], then printf the sum; not the kernel named by -k in this file's RUN line
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: Itanium-mangled get_global_id(uint), dimension 0
+  %arrayidx = getelementptr inbounds <4 x i8>, <4 x i8>* %in1, i64 %call
+  %0 = load <4 x i8>, <4 x i8>* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds <4 x i8>, <4 x i8>* %in2, i64 %call
+  %1 = load <4 x i8>, <4 x i8>* %arrayidx1, align 4
+  %add = add <4 x i8> %1, %0
+  %arrayidx2 = getelementptr inbounds <4 x i8>, <4 x i8>* %out, i64 %call
+  store <4 x i8> %add, <4 x i8>* %arrayidx2, align 4
+  %call4 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([10 x i8], [10 x i8] addrspace(2)* @.str, i64 0, i64 0), <4 x i8> %add)
+  ret void
+}
+
+define spir_kernel void @test32(<32 x i8>* %out, <32 x i8>* %in1, <32 x i8>* %in2) {
+entry:                                            ; <32 x i8> variant of @test; not the kernel named by -k in this file's RUN line
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: Itanium-mangled get_global_id(uint), dimension 0
+  %arrayidx = getelementptr inbounds <32 x i8>, <32 x i8>* %in1, i64 %call
+  %0 = load <32 x i8>, <32 x i8>* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds <32 x i8>, <32 x i8>* %in2, i64 %call
+  %1 = load <32 x i8>, <32 x i8>* %arrayidx1, align 4
+  %add = add <32 x i8> %1, %0
+  %arrayidx2 = getelementptr inbounds <32 x i8>, <32 x i8>* %out, i64 %call
+  store <32 x i8> %add, <32 x i8>* %arrayidx2, align 4
+  %call4 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(2)* @.str32, i64 0, i64 0), <32 x i8> %add)
+  ret void
+}
+
+define spir_kernel void @test64(<64 x i8>* %out, <64 x i8>* %in1, <64 x i8>* %in2) {
+entry:                                            ; <64 x i8> variant of @test; not the kernel named by -k in this file's RUN line
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: Itanium-mangled get_global_id(uint), dimension 0
+  %arrayidx = getelementptr inbounds <64 x i8>, <64 x i8>* %in1, i64 %call
+  %0 = load <64 x i8>, <64 x i8>* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds <64 x i8>, <64 x i8>* %in2, i64 %call
+  %1 = load <64 x i8>, <64 x i8>* %arrayidx1, align 4
+  %add = add <64 x i8> %1, %0
+  %arrayidx2 = getelementptr inbounds <64 x i8>, <64 x i8>* %out, i64 %call
+  store <64 x i8> %add, <64 x i8>* %arrayidx2, align 4
+  %call4 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(2)* @.str64, i64 0, i64 0), <64 x i8> %add)
+  ret void
+}
+
+define spir_kernel void @test_float_vectors(<2 x float>* %in) {
+entry:                                            ; kernel named by -k in this file's RUN line: prints in[gid] * in[gid] (a <2 x float>)
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: Itanium-mangled get_global_id(uint), dimension 0
+  %arrayidx = getelementptr inbounds <2 x float>, <2 x float>* %in, i64 %call
+  %0 = load <2 x float>, <2 x float>* %arrayidx, align 8
+  %mul = fmul <2 x float> %0, %0                   ; the checks below expect the vectorized lanes to be fpext'ed to double before reaching printf
+  %call8 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(2)* @.strfv, i64 0, i64 0), <2 x float> %mul)
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+declare extern_weak spir_func i32 @printf(i8 addrspace(2)*, ...)
+
+; CHECK: @[[STR:.+]] = private unnamed_addr addrspace(2) constant [13 x i8] c"%#16A,%#16A\0A\00", align 1
+
+; CHECK: define spir_kernel void @__vecz_v4_test_float_vectors
+; CHECK: %[[V4:[0-9]+]] = fpext <4 x float> %{{.+}} to <4 x double>
+; CHECK: %[[V5:[0-9]+]] = extractelement <4 x double> %[[V4]], {{(i32|i64)}} 0
+; CHECK: %[[V6:[0-9]+]] = extractelement <4 x double> %[[V4]], {{(i32|i64)}} 1
+; CHECK: %[[V7:[0-9]+]] = extractelement <4 x double> %[[V4]], {{(i32|i64)}} 2
+; CHECK: %[[V8:[0-9]+]] = extractelement <4 x double> %[[V4]], {{(i32|i64)}} 3
+; CHECK: %[[V9:[0-9]+]] = fpext <4 x float> %{{.+}} to <4 x double>
+; CHECK: %[[V10:[0-9]+]] = extractelement <4 x double> %[[V9]], {{(i32|i64)}} 0
+; CHECK: %[[V11:[0-9]+]] = extractelement <4 x double> %[[V9]], {{(i32|i64)}} 1
+; CHECK: %[[V12:[0-9]+]] = extractelement <4 x double> %[[V9]], {{(i32|i64)}} 2
+; CHECK: %[[V13:[0-9]+]] = extractelement <4 x double> %[[V9]], {{(i32|i64)}} 3
+; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], double %[[V5]], double %[[V10]])
+; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], double %[[V6]], double %[[V11]])
+; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], double %[[V7]], double %[[V12]])
+; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], double %[[V8]], double %[[V13]])
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_floats_no_double_support.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_floats_no_double_support.ll
new file mode 100644
index 0000000000000..06c4e97bd888f
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_floats_no_double_support.ll
@@ -0,0 +1,100 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k test_float_vectors -vecz-simd-width=4 -vecz-double-support=false -vecz-choices=FullScalarization -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+@.str = private unnamed_addr addrspace(2) constant [10 x i8] c"%#4v4hho\0A\00", align 1
+@.str32 = private unnamed_addr addrspace(2) constant [11 x i8] c"%#4v32hho\0A\00", align 1
+@.str64 = private unnamed_addr addrspace(2) constant [11 x i8] c"%#4v64hho\0A\00", align 1
+@.strfv = private unnamed_addr addrspace(2) constant [11 x i8] c"%#16v2hlA\0A\00", align 1
+
+; Function Attrs: nounwind
+define spir_kernel void @test(<4 x i8>* %out, <4 x i8>* %in1, <4 x i8>* %in2) {
+entry:                                            ; out[gid] = in1[gid] + in2[gid], then printf the sum; not the kernel named by -k in this file's RUN line
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: Itanium-mangled get_global_id(uint), dimension 0
+  %arrayidx = getelementptr inbounds <4 x i8>, <4 x i8>* %in1, i64 %call
+  %0 = load <4 x i8>, <4 x i8>* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds <4 x i8>, <4 x i8>* %in2, i64 %call
+  %1 = load <4 x i8>, <4 x i8>* %arrayidx1, align 4
+  %add = add <4 x i8> %1, %0
+  %arrayidx2 = getelementptr inbounds <4 x i8>, <4 x i8>* %out, i64 %call
+  store <4 x i8> %add, <4 x i8>* %arrayidx2, align 4
+  %call4 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([10 x i8], [10 x i8] addrspace(2)* @.str, i64 0, i64 0), <4 x i8> %add)
+  ret void
+}
+
+define spir_kernel void @test32(<32 x i8>* %out, <32 x i8>* %in1, <32 x i8>* %in2) {
+entry:                                            ; <32 x i8> variant of @test; not the kernel named by -k in this file's RUN line
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: Itanium-mangled get_global_id(uint), dimension 0
+  %arrayidx = getelementptr inbounds <32 x i8>, <32 x i8>* %in1, i64 %call
+  %0 = load <32 x i8>, <32 x i8>* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds <32 x i8>, <32 x i8>* %in2, i64 %call
+  %1 = load <32 x i8>, <32 x i8>* %arrayidx1, align 4
+  %add = add <32 x i8> %1, %0
+  %arrayidx2 = getelementptr inbounds <32 x i8>, <32 x i8>* %out, i64 %call
+  store <32 x i8> %add, <32 x i8>* %arrayidx2, align 4
+  %call4 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(2)* @.str32, i64 0, i64 0), <32 x i8> %add)
+  ret void
+}
+
+define spir_kernel void @test64(<64 x i8>* %out, <64 x i8>* %in1, <64 x i8>* %in2) {
+entry:                                            ; <64 x i8> variant of @test; not the kernel named by -k in this file's RUN line
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: Itanium-mangled get_global_id(uint), dimension 0
+  %arrayidx = getelementptr inbounds <64 x i8>, <64 x i8>* %in1, i64 %call
+  %0 = load <64 x i8>, <64 x i8>* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds <64 x i8>, <64 x i8>* %in2, i64 %call
+  %1 = load <64 x i8>, <64 x i8>* %arrayidx1, align 4
+  %add = add <64 x i8> %1, %0
+  %arrayidx2 = getelementptr inbounds <64 x i8>, <64 x i8>* %out, i64 %call
+  store <64 x i8> %add, <64 x i8>* %arrayidx2, align 4
+  %call4 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(2)* @.str64, i64 0, i64 0), <64 x i8> %add)
+  ret void
+}
+
+define spir_kernel void @test_float_vectors(<2 x float>* %in) {
+entry:                                            ; kernel named by -k in this file's RUN line: prints in[gid] * in[gid] (a <2 x float>)
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) ; gid: Itanium-mangled get_global_id(uint), dimension 0
+  %arrayidx = getelementptr inbounds <2 x float>, <2 x float>* %in, i64 %call
+  %0 = load <2 x float>, <2 x float>* %arrayidx, align 8
+  %mul = fmul <2 x float> %0, %0                   ; RUN passes -vecz-double-support=false; the checks below expect float (not double) printf arguments
+  %call8 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(2)* @.strfv, i64 0, i64 0), <2 x float> %mul)
+  ret void
+}
+
+declare spir_func i64 @_Z13get_global_idj(i32)
+
+declare extern_weak spir_func i32 @printf(i8 addrspace(2)*, ...)
+
+; CHECK: @[[STR:.+]] = private unnamed_addr addrspace(2) constant [13 x i8] c"%#16A,%#16A\0A\00", align 1
+
+; CHECK: define spir_kernel void @__vecz_v4_test_float_vectors
+; CHECK: %[[V5:[0-9]+]] = extractelement <4 x float> %{{.+}}, {{(i32|i64)}} 0
+; CHECK: %[[V6:[0-9]+]] = extractelement <4 x float> %{{.+}}, {{(i32|i64)}} 1
+; CHECK: %[[V7:[0-9]+]] = extractelement <4 x float> %{{.+}}, {{(i32|i64)}} 2
+; CHECK: %[[V8:[0-9]+]] = extractelement <4 x float> %{{.+}}, {{(i32|i64)}} 3
+; CHECK: %[[V10:[0-9]+]] = extractelement <4 x float> %{{.+}}, {{(i32|i64)}} 0
+; CHECK: %[[V11:[0-9]+]] = extractelement <4 x float> %{{.+}}, {{(i32|i64)}} 1
+; CHECK: %[[V12:[0-9]+]] = extractelement <4 x float> %{{.+}}, {{(i32|i64)}} 2
+; CHECK: %[[V13:[0-9]+]] = extractelement <4 x float> %{{.+}}, {{(i32|i64)}} 3
+; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], float %[[V5]], float %[[V10]])
+; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], float %[[V6]], float %[[V11]])
+; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], float %[[V7]], float %[[V12]])
+; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], float %[[V8]], float %[[V13]])
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_blend_div_loop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_blend_div_loop.ll
new file mode 100644
index 0000000000000..8e64112885dc9
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_blend_div_loop.ll
@@ -0,0 +1,154 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k blend_div_loop -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | %filecheck %s
+
+; ModuleID = 'Unknown buffer'
+source_filename = "kernel.opencl"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @blend_div_loop(i8 addrspace(1)* %src1ptr, i32 %src1_step, i32 %src1_offset, i8 addrspace(1)* %dstptr, i32 %dst_step, i32 %dst_offset, i32 %dst_rows, i32 %dst_cols, i8 addrspace(1)* %src2ptr, i32 %src2_step, i32 %src2_offset, i8 addrspace(1)* %src3ptr, i32 %src3_step, i32 %src3_offset, i32 %rowsPerWI) #0 {
+entry:                                            ; triple-nested loop kernel; the innermost loop (for.cond26) has an early exit through if.then46
+  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %conv = trunc i64 %call to i32
+  %call1 = call spir_func i64 @_Z13get_global_idj(i32 1) #2
+  %0 = trunc i64 %call1 to i32
+  %conv3 = mul i32 %0, %rowsPerWI                  ; first row handled by this work-item: gid(1) * rowsPerWI
+  %cmp = icmp slt i32 %conv, %dst_cols             ; work-items with gid(0) >= dst_cols go straight to if.end62
+  br i1 %cmp, label %if.then, label %if.end62
+
+if.then:                                          ; preds = %entry
+  %call5 = call spir_func i32 @_Z5mad24iii(i32 %conv, i32 1, i32 %src1_offset) #2 ; NOTE(review): _Z5mad24iii is presumably OpenCL mad24(int,int,int) - confirm
+  %call6 = call spir_func i32 @_Z5mad24iii(i32 %conv3, i32 %src1_step, i32 %call5) #2
+  %call7 = call spir_func i32 @_Z5mad24iii(i32 %conv, i32 1, i32 %dst_offset) #2
+  %call8 = call spir_func i32 @_Z5mad24iii(i32 %conv3, i32 %dst_step, i32 %call7) #2
+  %call9 = call spir_func i32 @_Z5mad24iii(i32 %conv, i32 1, i32 %src2_offset) #2
+  %call10 = call spir_func i32 @_Z5mad24iii(i32 %conv3, i32 %src2_step, i32 %call9) #2
+  %call11 = call spir_func i32 @_Z5mad24iii(i32 %conv, i32 1, i32 %src3_offset) #2
+  %call12 = call spir_func i32 @_Z5mad24iii(i32 %conv3, i32 %src3_step, i32 %call11) #2
+  %add = add nsw i32 %conv3, %rowsPerWI
+  %call13 = call spir_func i32 @_Z3minii(i32 %dst_rows, i32 %add) #2 ; row loop upper bound: min(dst_rows, first row + rowsPerWI)
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.end54, %if.then
+  %src1_index.0 = phi i32 [ %call6, %if.then ], [ %add59, %for.end54 ]
+  %dst_index.0 = phi i32 [ %call8, %if.then ], [ %add60, %for.end54 ]
+  %src2_index.0 = phi i32 [ %call10, %if.then ], [ %add55, %for.end54 ]
+  %src3_index.0 = phi i32 [ %call12, %if.then ], [ %add56, %for.end54 ]
+  %y.0 = phi i32 [ %conv3, %if.then ], [ %inc58, %for.end54 ]
+  %cmp14 = icmp slt i32 %y.0, %call13
+  br i1 %cmp14, label %for.body, label %if.end62
+
+for.body:                                         ; preds = %for.cond
+  %idx.ext = sext i32 %src1_index.0 to i64
+  %add.ptr = getelementptr inbounds i8, i8 addrspace(1)* %src1ptr, i64 %idx.ext
+  %idx.ext16 = sext i32 %dst_index.0 to i64
+  %add.ptr17 = getelementptr inbounds i8, i8 addrspace(1)* %dstptr, i64 %idx.ext16
+  %idx.ext18 = sext i32 %src2_index.0 to i64
+  %add.ptr19 = getelementptr inbounds i8, i8 addrspace(1)* %src2ptr, i64 %idx.ext18
+  %idx.ext20 = sext i32 %src3_index.0 to i64
+  %add.ptr21 = getelementptr inbounds i8, i8 addrspace(1)* %src3ptr, i64 %idx.ext20
+  br label %for.cond22
+
+for.cond22:                                       ; preds = %for.inc49, %for.body
+  %src1.0 = phi i8 addrspace(1)* [ %add.ptr, %for.body ], [ %add.ptr51, %for.inc49 ]
+  %src2.0 = phi i8 addrspace(1)* [ %add.ptr19, %for.body ], [ %add.ptr52, %for.inc49 ]
+  %src3.0 = phi i8 addrspace(1)* [ %add.ptr21, %for.body ], [ %add.ptr53, %for.inc49 ]
+  %px.0 = phi i32 [ 0, %for.body ], [ %inc50, %for.inc49 ]
+  %cmp23 = icmp eq i32 %px.0, 0                    ; the px loop body runs only while px == 0
+  br i1 %cmp23, label %for.body25, label %for.end54
+
+for.body25:                                       ; preds = %for.cond22
+  %1 = zext i32 %px.0 to i64
+  %arrayidx = getelementptr inbounds i8, i8 addrspace(1)* %add.ptr17, i64 %1
+  store i8 -1, i8 addrspace(1)* %arrayidx, align 1 ; default result; if.then46 overwrites the same address with 0
+  br label %for.cond26
+
+for.cond26:                                       ; preds = %for.inc, %for.body25
+  %storemerge = phi i32 [ 0, %for.body25 ], [ %inc, %for.inc ]
+  %cmp27 = icmp eq i32 %storemerge, 0              ; the innermost loop body likewise runs only while its counter == 0
+  br i1 %cmp27, label %for.body29, label %for.inc49
+
+for.body29:                                       ; preds = %for.cond26
+  %2 = zext i32 %storemerge to i64
+  %arrayidx31 = getelementptr inbounds i8, i8 addrspace(1)* %src2.0, i64 %2
+  %3 = load i8, i8 addrspace(1)* %arrayidx31, align 1
+  %4 = zext i32 %storemerge to i64
+  %arrayidx34 = getelementptr inbounds i8, i8 addrspace(1)* %src1.0, i64 %4
+  %5 = load i8, i8 addrspace(1)* %arrayidx34, align 1
+  %cmp36 = icmp ugt i8 %3, %5                      ; src2 byte > src1 byte (unsigned) takes the early exit
+  br i1 %cmp36, label %if.then46, label %lor.lhs.false
+
+lor.lhs.false:                                    ; preds = %for.body29
+  %6 = zext i32 %storemerge to i64
+  %arrayidx39 = getelementptr inbounds i8, i8 addrspace(1)* %src3.0, i64 %6
+  %7 = load i8, i8 addrspace(1)* %arrayidx39, align 1
+  %8 = zext i32 %storemerge to i64
+  %arrayidx42 = getelementptr inbounds i8, i8 addrspace(1)* %src1.0, i64 %8
+  %9 = load i8, i8 addrspace(1)* %arrayidx42, align 1
+  %cmp44 = icmp ult i8 %7, %9                      ; src3 byte < src1 byte (unsigned) also takes the early exit
+  br i1 %cmp44, label %if.then46, label %for.inc
+
+if.then46:                                        ; preds = %lor.lhs.false, %for.body29
+  %10 = zext i32 %px.0 to i64
+  %arrayidx48 = getelementptr inbounds i8, i8 addrspace(1)* %add.ptr17, i64 %10
+  store i8 0, i8 addrspace(1)* %arrayidx48, align 1
+  br label %for.inc49                              ; leaves the for.cond26 loop from inside its body, bypassing the loop header
+
+for.inc:                                          ; preds = %lor.lhs.false
+  %inc = add nuw nsw i32 %storemerge, 1
+  br label %for.cond26
+
+for.inc49:                                        ; preds = %if.then46, %for.cond26
+  %inc50 = add nuw nsw i32 %px.0, 1
+  %add.ptr51 = getelementptr inbounds i8, i8 addrspace(1)* %src1.0, i64 1
+  %add.ptr52 = getelementptr inbounds i8, i8 addrspace(1)* %src2.0, i64 1
+  %add.ptr53 = getelementptr inbounds i8, i8 addrspace(1)* %src3.0, i64 1
+  br label %for.cond22
+
+for.end54:                                        ; preds = %for.cond22
+  %add55 = add nsw i32 %src2_index.0, %src2_step
+  %add56 = add nsw i32 %src3_index.0, %src3_step
+  %inc58 = add nsw i32 %y.0, 1
+  %add59 = add nsw i32 %src1_index.0, %src1_step
+  %add60 = add nsw i32 %dst_index.0, %dst_step
+  br label %for.cond
+
+if.end62:                                         ; preds = %for.cond, %entry
+  ret void
+}
+
+; Function Attrs: convergent nounwind readonly
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+; Function Attrs: convergent nounwind readonly
+declare spir_func i32 @_Z5mad24iii(i32, i32, i32) #1
+
+; Function Attrs: convergent nounwind readonly
+declare spir_func i32 @_Z3minii(i32, i32) #1
+
+attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { convergent nobuiltin nounwind readonly }
+
+; The purpose of this test is to make sure we correctly replace the uses of
+; divergent loop update masks outside the loop, even in the pure exit.
+
+; CHECK: spir_kernel void @__vecz_v4_blend_div_loop
+; CHECK: for.cond26.pure_exit:
+; CHECK: %if.then46.entry_mask{{[0-9]+}} = or i1 %if.then46.loop_exit_mask{{[0-9]+}}.blend, %if.then46.loop_exit_mask.blend
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_scalar_gather_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_scalar_gather_load.ll
new file mode 100644
index 0000000000000..6a5ed7a82ebe2
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_scalar_gather_load.ll
@@ -0,0 +1,111 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k vecz_scalar_gather_load -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | %filecheck %s
+
+; ModuleID = 'Unknown buffer'
+source_filename = "kernel.opencl"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind readonly
+declare spir_func i64 @_Z12get_group_idj(i32)
+
+; Function Attrs: convergent nounwind readonly
+declare spir_func i64 @_Z12get_local_idj(i32)
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @vecz_scalar_gather_load(i32 addrspace(1)* %row_indices, i32 addrspace(1)* %row_blocks, float addrspace(1)* %result) {
+entry:
+  %call1 = call spir_func i64 @_Z12get_group_idj(i32 0) ; group id, dimension 0
+  %call2 = call spir_func i64 @_Z12get_local_idj(i32 0) ; local id, dimension 0
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %row_blocks, i64 %call1
+  %load1 = load i32, i32 addrspace(1)* %arrayidx1, align 4 ; row_blocks[group id]: loop start
+  %add1 = add i64 %call1, 1
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %row_blocks, i64 %add1
+  %load2 = load i32, i32 addrspace(1)* %arrayidx2, align 4 ; row_blocks[group id + 1]: loop bound
+  br label %for.cond
+
+for.cond:                                       ; preds = %entry, %for.inc
+  %storemerge = phi i32 [ %load1, %entry ], [ %inc, %for.inc ] ; loop counter
+  %cmp1 = icmp ult i32 %storemerge, %load2
+  br i1 %cmp1, label %if.then1, label %for.end
+
+if.then1:                                       ; preds = %for.cond
+  %storemerge.zext = zext i32 %storemerge to i64
+  %gep1 = getelementptr inbounds i32, i32 addrspace(1)* %row_indices, i64 %storemerge.zext
+  %load3 = load i32, i32 addrspace(1)* %gep1, align 4
+  %sub1 = sub i32 %load3, %load1
+  %gep2 = getelementptr inbounds i32, i32 addrspace(1)* %row_indices, i64 %storemerge.zext ; same address as %gep1
+  %load4 = load i32, i32 addrspace(1)* %gep2, align 4
+  %sub2 = sub i32 %load4, %load1
+  %cmp2 = icmp ugt i32 %sub2, %sub1
+  br i1 %cmp2, label %if.then2, label %if.else2
+
+if.then2:                                       ; preds = %if.then1
+  %sub1.zext = zext i32 %sub1 to i64
+  %gep3 = getelementptr inbounds float, float addrspace (1)* %result, i64 %sub1.zext
+  %load5 = load float, float addrspace(1)* %gep3, align 4 ; index derived only from the loads above
+  br label %if.else2
+
+if.else2:                                        ; preds = %if.then1, %if.then2
+  %ret = phi float [ %load5, %if.then2 ], [ 0.000000e+00, %if.then1 ]
+  %cmp3 = icmp eq i64 %call2, 0 ; the only branch condition that reads the local id
+  br i1 %cmp3, label %if.then3, label %for.inc
+
+if.then3:                                       ; preds = %if.else2
+  %gep4 = getelementptr inbounds float, float addrspace(1)* %result, i64 %call2
+  store float %ret, float addrspace(1)* %gep4, align 4 ; result[local id] = %ret
+  br label %for.inc
+
+for.inc:                                       ; preds = %if.then3, %if.else2
+  %inc = add i32 %storemerge, 1
+  br label %for.cond
+
+for.end:                                        ; preds = %for.cond
+  ret void
+}
+
+; The purpose of this test is to ensure we don't generate a masked load for a
+; load from a uniform address, even where it is in a divergent control path.
+; It used to be the case that such a load would become a masked load during
+; control flow conversion, therefore causing it to become a varying load due to
+; the varying mask. However, since the introduction of the Mask Varying
+; attribute, it is possible to support a Uniform load with a Varying mask, so
+; it is no longer necessary to mark all loads in divergent paths as Varying.
+; The somewhat circuitous upshot of this is that the load no longer gets a mask
+; at all, since it was previously only considered to be in a divergent path on
+; account of another Mask Varying load!
+
+; CHECK: spir_kernel void @__vecz_v4_vecz_scalar_gather_load
+
+; This load depends only on the uniform loop iterator
+; CHECK: if.then1:
+; CHECK: %[[IND:.+]] = phi i32
+; CHECK: %[[ZIND:.+]] = zext i32 %[[IND]] to i64
+; CHECK: %[[GEP1:.+]] = getelementptr inbounds i32, ptr addrspace(1) %row_indices, i64 %[[ZIND]]
+; CHECK: %{{.+}} = load i32, ptr addrspace(1) %[[GEP1]]
+
+; This load depends only on other uniform loads
+; CHECK: if.then2:
+; CHECK-NOT: declare float @__vecz_b_masked_gather_load4_
+; CHECK-NOT: declare float @__vecz_b_masked_load4_
+; CHECK: %[[GEP2:.+]] = getelementptr inbounds float, ptr addrspace(1) %result
+; CHECK: %{{.+}} = load float, ptr addrspace(1) %[[GEP2]]
+
+; The store instruction is definitely in a divergent path, however, so needs a mask.
+; CHECK: if.then3:
+; CHECK: call void @__vecz_b_masked_store4_f
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_scalar_interleaved_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_scalar_interleaved_load.ll
new file mode 100644
index 0000000000000..410079f3bcbe8
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_scalar_interleaved_load.ll
@@ -0,0 +1,83 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k vecz_scalar_interleaved_load -vecz-passes=cfg-convert,packetizer -vecz-simd-width=4 -S < %s | %filecheck %s
+
+; ModuleID = 'Unknown buffer'
+source_filename = "Unknown buffer"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind readnone
+declare spir_func i64 @_Z13get_global_idj(i32) #0
+
+define spir_kernel void @vecz_scalar_interleaved_load(float addrspace(1)* %out, i64 %n, float %m) {
+entry:
+  %gid0 = tail call spir_func i64 @_Z13get_global_idj(i32 0) #0
+  %gid1 = tail call spir_func i64 @_Z13get_global_idj(i32 1) #0
+  %cmp1 = icmp slt i64 %gid0, %n ; depends on the global id in dimension 0
+  br i1 %cmp1, label %if.then1, label %end
+
+if.then1:                                     ; preds = %entry
+  %gep1 = getelementptr inbounds float, float addrspace(1)* %out, i64 %gid1
+  %cmp2 = fcmp une float %m, 0.000000e+00 ; depends only on the kernel argument %m
+  br i1 %cmp2, label %if.then2, label %if.else2
+
+if.then2:                                     ; preds = %if.then1
+  %mul1 = mul nsw i64 %gid0, %n
+  %gep2 = getelementptr inbounds float, float addrspace(1)* %gep1, i64 %mul1 ; %out + gid1 + gid0 * n (in floats)
+  %cmp3 = icmp slt i64 %gid1, %n
+  %load1 = load float, float addrspace(1)* %gep2, align 4
+  %ie1 = insertelement <4 x float> undef, float %load1, i32 0
+  br i1 %cmp3, label %if.then3, label %if.else3
+
+if.then3:                                     ; preds = %if.then2
+  %laod2 = load float, float addrspace(1)* %gep2, align 4 ; (sic) the phi in if.else3 uses this same spelling
+  br label %if.else3
+
+if.else3:                                     ; preds = %if.then2, %if.then3
+  %phi_load2 = phi float [ %laod2, %if.then3 ], [ 0.000000e+00, %if.then2 ]
+  %ie2 = insertelement <4 x float> %ie1, float %phi_load2, i32 1
+  %load3 = load float, float addrspace(1)* %gep2, align 4
+  %ie3 = insertelement <4 x float> %ie2, float %load3, i32 2
+  %x76 = load float, float addrspace(1)* %gep2, align 4
+  %ie4 = insertelement <4 x float> %ie3, float %x76, i32 3
+  br label %if.else2
+
+if.else2:                                    ; preds = %if.else3, %if.then1
+  %ret_vec = phi <4 x float> [ %ie4, %if.else3 ], [ zeroinitializer, %if.then1 ]
+  %ret = extractelement <4 x float> %ret_vec, i32 0 ; only element 0 of %ret_vec is read
+  %ret_gep = getelementptr inbounds float, float addrspace(1)* %gep1, i64 %gid1 ; %gep1 + gid1, i.e. %out + 2 * gid1
+  store float %ret, float addrspace(1)* %ret_gep, align 4
+  br label %end
+
+end:                                    ; preds = %entry, %if.else2
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
+
+; The purpose of this test is to ensure we correctly generate a scalar
+; masked load for a scalar load that has a strided pointer, instead of
+; generating an interleaved masked load for a non vector load (which is
+; invalid).
+
+; The middle optimizations break this test because after scalarization,
+; some of the vector elements become dead code and thus, an interleaved
+; load is in fact generated (although correctly, in this case)
+
+; CHECK: spir_kernel void @__vecz_v4_vecz_scalar_interleaved_load
+; CHECK: declare float @__vecz_b_masked_load4_fu3ptrU3AS1b
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/workitem_builtins.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/workitem_builtins.ll
new file mode 100644
index 0000000000000..d8955549ab32d
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/workitem_builtins.ll
@@ -0,0 +1,105 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: %veczc -k dont_mask_workitem_builtins -S < %s | %filecheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @dont_mask_workitem_builtins(i32 addrspace(2)* %in, i32 addrspace(1)* %out) #0 {
+entry:
+  %call = call spir_func i64 @_Z12get_local_idj(i32 0) #5
+  %conv = trunc i64 %call to i32
+  %cmp = icmp sgt i32 %conv, 0 ; local id (truncated to i32) > 0
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  fence syncscope("singlethread") acq_rel
+  %call2 = call spir_func i64 @_Z13get_global_idj(i32 0) #5
+  %conv3 = trunc i64 %call2 to i32
+  %idxprom = sext i32 %conv3 to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(2)* %in, i64 %idxprom
+  %0 = load i32, i32 addrspace(2)* %arrayidx, align 4 ; in[global id]
+  %idxprom4 = sext i32 %conv3 to i64
+  %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom4
+  store i32 %0, i32 addrspace(1)* %arrayidx5, align 4 ; out[global id] = in[global id]
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %call8 = call spir_func i64 @_Z14get_local_sizej(i32 0) #5
+  %call9 = call spir_func i64 @_Z12get_group_idj(i32 0) #5
+  %mul = mul i64 %call9, %call8
+  %add = add i64 %mul, %call ; group id * local size + local id
+  %sext = shl i64 %add, 32
+  %idxprom11 = ashr exact i64 %sext, 32 ; shl + ashr by 32 sign-extends the low 32 bits of %add
+  %arrayidx12 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom11
+  store i32 42, i32 addrspace(1)* %arrayidx12, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  ret void
+}
+
+declare spir_func i64 @_Z12get_local_idj(i32) #1
+
+declare spir_func i64 @_Z13get_global_idj(i32) #1
+
+declare spir_func i64 @_Z14get_local_sizej(i32) #1
+
+declare spir_func i64 @_Z12get_group_idj(i32) #1
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { noinline }
+attributes #3 = { argmemonly nounwind }
+attributes #4 = { argmemonly nounwind readonly }
+attributes #5 = { nobuiltin nounwind }
+attributes #6 = { nounwind }
+
+!opencl.kernels = !{!0}
+!llvm.ident = !{!6}
+
+!0 = !{void (i32 addrspace(2)*, i32 addrspace(1)*)* @dont_mask_workitem_builtins, !1, !2, !3, !4, !5}
+!1 = !{!"kernel_arg_addr_space", i32 2, i32 1}
+!2 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!3 = !{!"kernel_arg_type", !"int*", !"int*"}
+!4 = !{!"kernel_arg_base_type", !"int*", !"int*"}
+!5 = !{!"kernel_arg_type_qual", !"const", !""}
+!6 = !{!"clang version 3.8.1 "}
+
+; The vectorized function
+; CHECK: define spir_kernel void @__vecz_v[[WIDTH:[0-9]+]]_dont_mask_workitem_builtins(
+
+; Check if the builtins are still here
+; CHECK: call spir_func i64 @_Z12get_local_idj(i32 0)
+; CHECK: call spir_func i64 @_Z14get_local_sizej(i32 0)
+; CHECK: call spir_func i64 @_Z12get_group_idj(i32 0)
+; CHECK: fence syncscope("singlethread") acq_rel
+; CHECK: call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK-NOT: call spir_func i64 @__vecz_b_masked__Z13get_global_idj(i32
+; CHECK-NOT: call spir_func i64 @__vecz_b_masked__Z14get_local_sizej(i32
+; CHECK-NOT: call spir_func i64 @__vecz_b_masked__Z12get_group_idj(i32
+
+; Function end
+; CHECK: ret void
+
+; Also check that we haven't declared the masked functions
+; CHECK-NOT: define private spir_func void @__vecz_b_masked__Z7barrierj(i32)
+; CHECK-NOT: define private spir_func i64 @__vecz_b_masked__Z13get_global_idj(i32, i1)
+; CHECK-NOT: define private spir_func i64 @__vecz_b_masked__Z14get_local_sizej(i32, i1)
+; CHECK-NOT: define private spir_func i64 @__vecz_b_masked__Z12get_group_idj(i32, i1)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp
new file mode 100644
index 0000000000000..7ff39a21eff04
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp
@@ -0,0 +1,446 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <compiler/utils/builtin_info.h>
+#include <compiler/utils/device_info.h>
+#include <compiler/utils/metadata.h>
+#include <compiler/utils/optimal_builtin_replacement_pass.h>
+#include <compiler/utils/pass_machinery.h>
+#include <compiler/utils/vectorization_factor.h>
+#include <llvm/ADT/Statistic.h>
+#include <llvm/ADT/StringSwitch.h>
+#include <llvm/Analysis/AliasAnalysis.h>
+#include <llvm/Analysis/CGSCCPassManager.h>
+#include <llvm/Analysis/TargetTransformInfo.h>
+#include <llvm/Bitcode/BitcodeReader.h>
+#include <llvm/Bitcode/BitcodeWriter.h>
+#include <llvm/IR/LegacyPassManager.h>
+#include <llvm/IRReader/IRReader.h>
+#include <llvm/InitializePasses.h>
+#include <llvm/MC/TargetRegistry.h>
+#include <llvm/Passes/PassBuilder.h>
+#include <llvm/Support/CommandLine.h>
+#include <llvm/Support/FileSystem.h>
+#include <llvm/Support/MemoryBuffer.h>
+#include <llvm/Support/Process.h>
+#include <llvm/Support/SourceMgr.h>
+#include <llvm/Support/TargetSelect.h>
+#include <llvm/Support/ToolOutputFile.h>
+#include <llvm/Support/raw_ostream.h>
+#include <llvm/Target/TargetLoweringObjectFile.h>
+#include <llvm/Target/TargetMachine.h>
+#include <llvm/Target/TargetOptions.h>
+
+#include <string>
+
+#include "multi_llvm/multi_llvm.h"
+#include "vecz/pass.h"
+#include "vecz/vecz_target_info.h"
+
+static llvm::cl::opt<std::string> InputFilename(
+    llvm::cl::Positional, llvm::cl::desc("<input .bc file>"),
+    llvm::cl::init("-"));
+// Output destination (-o) and the textual-IR/bitcode switch (-S).
+static llvm::cl::opt<std::string> OutputFilename(
+    "o", llvm::cl::desc("Override output filename"),
+    llvm::cl::value_desc("filename"));
+static llvm::cl::opt<bool, false> WriteTextual(
+    "S", llvm::cl::desc("Write module as text"));
+// Repeatable; each value is parsed by parsePassOptionsSwitch.
+static llvm::cl::list<std::string> KernelNameSpecs(
+    "k", llvm::cl::desc("Kernel to vectorize"), llvm::cl::ZeroOrMore,
+    llvm::cl::value_desc("name"));
+// Copied into VeczPassOptions::vec_dim_idx by getDefaultPassOptions.
+static llvm::cl::opt<unsigned> SIMDDimIdx(
+    "d", llvm::cl::desc("Dimension index to vectorize on"), llvm::cl::init(0),
+    llvm::cl::value_desc("dimension"));
+// 0 selects a fixed width of 4; the raw value is also used as local_size.
+static llvm::cl::opt<unsigned> SIMDWidth(
+    "w", llvm::cl::desc("Width to vectorize to"), llvm::cl::init(0),
+    llvm::cl::value_desc("width"));
+// Makes main() return 0 when a requested kernel failed to vectorize.
+static llvm::cl::opt<bool> FailQuietly(
+    "vecz-fail-quietly",
+    llvm::cl::desc("don't return an error code on vectorization failure"));
+// Makes main() list the available choices and return immediately.
+static llvm::cl::opt<bool> ChoicesHelp(
+    "vecz-choices-help",
+    llvm::cl::desc("see information about available choices"));
+// Copied into VeczPassOptions::vecz_auto by getDefaultPassOptions.
+static llvm::cl::opt<bool> VeczAuto(
+    "vecz-auto",
+    llvm::cl::desc("run the vectorizer if it is found to be useful"));
+// When non-zero, replaces the known-minimum width chosen via -w.
+static llvm::cl::opt<unsigned, 0> VeczSimdWidth(
+    "vecz-simd-width",
+    llvm::cl::desc("manually set the SIMD width for the vectorizer"));
+// Tri-state: the factor's scalability is only touched if this was given.
+static llvm::cl::opt<llvm::cl::boolOrDefault> VeczScalable(
+    "vecz-scalable",
+    llvm::cl::desc("force scalable vectorization for the vectorizer"));
+
+// Allow the passing of Vecz Choices string on the command line. This is parsed
+// after the choices environment variable, thus overriding it.
+static llvm::cl::opt<std::string> ChoicesString(
+    "vecz-choices", llvm::cl::desc("Set vecz choices"));
+// Enables LLVM statistics collection; main() prints them before returning.
+static llvm::cl::opt<bool> VeczCollectStats(
+    "vecz-llvm-stats", llvm::cl::desc("enable reporting LLVM statistics"));
+// Target selection: main() only creates a TargetMachine if a triple is given.
+static llvm::cl::opt<std::string> UserTriple(
+    "vecz-target-triple", llvm::cl::desc("the target triple"));
+static llvm::cl::opt<std::string> UserCPU("vecz-target-mcpu",
+                                          llvm::cl::desc("Set the CPU model"));
+static llvm::cl::opt<std::string> CPUFeatures(
+    "vecz-target-features", llvm::cl::desc("Set the CPU feature string"));
+static llvm::cl::opt<bool> DoubleSupport(
+    "vecz-double-support", llvm::cl::init(true),
+    llvm::cl::desc(
+        "Assume the target has double-precision floating point support"));
+// Builds a TargetMachine for the given triple/CPU/features; exits if unknown.
+static llvm::TargetMachine *initLLVMTarget(llvm::StringRef triple_string,
+                                           llvm::StringRef cpu_model,
+                                           llvm::StringRef target_features) {
+  llvm::Triple triple(triple_string);
+  llvm::InitializeAllTargets();
+  llvm::InitializeAllTargetMCs();
+  llvm::InitializeAllAsmPrinters();
+  llvm::InitializeAllAsmParsers();
+  llvm::InitializeAllDisassemblers();
+
+  llvm::TargetOptions opts;
+  opts.DisableIntegratedAS = false;
+  std::string e;  // receives the reason when lookupTarget fails
+  const llvm::Target *target =
+      llvm::TargetRegistry::lookupTarget(triple.getTriple(), e);
+  if (!target) {
+    ::fprintf(stderr, "can't get target %s:%s\n", triple.getTriple().c_str(),
+              e.c_str());
+    ::exit(1);
+  }
+  llvm::PassRegistry &registry = *llvm::PassRegistry::getPassRegistry();
+  llvm::initializeAlwaysInlinerLegacyPassPass(registry);
+  return target->createTargetMachine(triple.getTriple(), cpu_model,
+                                     target_features, opts,
+                                     llvm::Reloc::Model::Static);
+}
+// Builds the baseline VeczPassOptions from the environment and command line.
+static vecz::VeczPassOptions getDefaultPassOptions() {
+  // Enable/disable Choices from the CODEPLAY_VECZ_CHOICES environment
+  // variable.
+  vecz::VectorizationChoices Choices;
+
+  const char *ptr = std::getenv("CODEPLAY_VECZ_CHOICES");
+  if (ptr && !Choices.parseChoicesString(ptr)) {
+    llvm::errs()
+        << "Failed to parse the CODEPLAY_VECZ_CHOICES env variable.\n"
+           "Use --vecz-choices-help for available choices and usage info.\n";
+    ::exit(1);
+  }
+
+  // Parse the Vecz choices given in the command line
+  const std::string &ch = ChoicesString;
+  if (!ch.empty() && !Choices.parseChoicesString(ch)) {
+    llvm::errs()
+        << "Failed to parse the --vecz-choices command line option.\n"
+           "Use --vecz-choices-help for available choices and usage info.\n";
+    ::exit(1);
+  }
+
+  if (VeczCollectStats) {
+    llvm::EnableStatistics(true);
+  }
+  // -w picks the fixed width (4 if unset); -vecz-simd-width then overrides it.
+  auto const factor = SIMDWidth ? SIMDWidth : 4;
+  auto VF = compiler::utils::VectorizationFactor::getFixedWidth(factor);
+  if (VeczSimdWidth) {
+    VF.setKnownMin(VeczSimdWidth);
+  }
+  // Scalability is only changed when -vecz-scalable was given explicitly.
+  if (VeczScalable == llvm::cl::BOU_TRUE) {
+    VF.setIsScalable(true);
+  } else if (VeczScalable == llvm::cl::BOU_FALSE) {
+    VF.setIsScalable(false);
+  }
+  vecz::VeczPassOptions passOpts;
+  passOpts.choices = Choices;
+  passOpts.factor = VF;
+  passOpts.vecz_auto = VeczAuto;
+  passOpts.vec_dim_idx = SIMDDimIdx;
+  passOpts.local_size = SIMDWidth;  // NOTE(review): reuses -w; confirm intent
+  return passOpts;
+}
+
+// Parse one -k kernel specification into name + options; false if malformed.
+// <kernel_spec> ::= <kernel_name> ':' <spec>
+// <kernel_spec> ::= <kernel_name>
+// <spec> ::= <vf><dim>(opt)<simd_width>(opt)
+//            <scalable_spec>(opt)<predicated_spec>(opt)
+// <spec> ::= <spec> ',' <spec>
+// <number> ::= [0-9]+
+// <kernel_name> ::= [a-zA-Z_][a-zA-Z_0-9]+
+// <dim> ::= '.' [123]
+// <vf> ::= <number>
+// <vf> ::= 'a' // automatic vectorization factor
+// <simd_width> ::= '@' <number>
+// <scalable_spec> ::= 's'
+// <predicated_spec> ::= 'p'
+static bool parsePassOptionsSwitch(
+    const llvm::StringRef spec, llvm::StringRef &name,
+    llvm::SmallVectorImpl<vecz::VeczPassOptions> &opts) {
+  auto pair = spec.split(':');
+  name = pair.first;
+  auto vals = pair.second;
+  auto defaults = getDefaultPassOptions();
+  if (!name.size()) {
+    return false;
+  }
+  if (!vals.empty()) {
+    do {
+      // HEREBEDRAGONS: `consumeInteger` returns true on FAILURE, whereas
+      // `consume_front` returns true on SUCCESS. It's opposite day somewhere
+      // in llvm land...
+      unsigned vf;
+      auto opt = defaults;
+      if (vals.consume_front("a")) {
+        opt.vecz_auto = true;
+      } else if (!vals.consumeInteger(10, vf)) {
+        opt.factor = compiler::utils::VectorizationFactor::getFixedWidth(vf);
+      }
+      if (vals.consume_front(".")) {
+        unsigned dim;
+        if (vals.consumeInteger(10, dim)) {
+          return false;
+        }
+        if (!dim || dim > 3) {
+          return false;
+        }
+        opt.vec_dim_idx = dim;  // NOTE(review): 1..3 kept as-is; -d uses 0
+      }
+      if (vals.consume_front("@")) {
+        unsigned simd_width;
+        if (vals.consumeInteger(10, simd_width)) {
+          return false;
+        }
+        opt.local_size = simd_width;
+      }
+      // <scalable_spec> ::= 's'; its absence resets the factor to non-scalable
+      opt.factor.setIsScalable(vals.consume_front("s"));
+      // <predicated_spec> ::= 'p'
+      if (vals.consume_front("p")) {
+        opt.choices.enableVectorPredication();
+      }
+      opts.push_back(opt);
+    } while (vals.consume_front(",") && !vals.empty());
+    if (!vals.empty()) {
+      return false;
+    }
+  } else {
+    opts.push_back(defaults);
+  }
+  return true;
+}
+// Kernel name -> requested option sets. Keys are non-owning StringRefs.
+using KernelOptMap =
+    llvm::SmallDenseMap<llvm::StringRef,
+                        llvm::SmallVector<vecz::VeczPassOptions, 1>, 1>;
+// Entry point: parse options, load the module, vectorize it, write the result.
+int main(const int argc, const char *const argv[]) {
+  llvm::cl::ParseCommandLineOptions(argc, argv);
+
+  if (ChoicesHelp) {
+    const auto &Infos = vecz::VectorizationChoices::queryAvailableChoices();
+    llvm::outs() << "Available Vecz Choices:\n\n";
+    for (const auto &Info : Infos) {
+      llvm::outs() << "  * " << Info.name << ":\n";
+      llvm::outs() << "      " << Info.desc << "\n\n";
+    }
+    llvm::outs() << "Separate multiple items with any one of [:;,].\n"
+                    "Prefix any choice with \"no\" to disable that option.\n";
+    return 0;
+  }
+
+  // If the user didn't specify an output filename, but is reading from stdin,
+  // output to stdout. This may be emitting binary, but trust the user to know
+  // what they're doing. We could also emit a warning.
+  if (OutputFilename.empty() && InputFilename == "-") {
+    OutputFilename = "-";
+  }
+
+  if (OutputFilename.empty()) {
+    llvm::errs() << "Error: no output filename was given (use -o <file>)\n";
+    return 1;
+  }
+
+  llvm::SMDiagnostic err;
+  llvm::LLVMContext context;
+#if LLVM_VERSION_GREATER_EQUAL(15, 0)
+  context.setOpaquePointers(true);
+#endif
+  std::unique_ptr<llvm::Module> module =
+      llvm::parseIRFile(InputFilename, err, context);
+
+  if (!module) {
+    auto errorOrInputFile =
+        llvm::MemoryBuffer::getFileOrSTDIN(InputFilename.getValue());
+
+    // If there was an error in getting the input file.
+    if (!errorOrInputFile) {
+      llvm::errs() << "Error: " << errorOrInputFile.getError().message() << " '"
+                   << InputFilename.getValue() << "'\n";
+      return 1;
+    }
+
+    llvm::errs() << "Error: bitcode file was malformed\n";
+    err.print("veczc", llvm::errs(),
+              llvm::sys::Process::StandardErrHasColors());
+    return 1;
+  }
+  // Decide which kernels to vectorize and with which options.
+  KernelOptMap kernelOpts;
+  if (KernelNameSpecs.empty()) {
+    auto defaults = getDefaultPassOptions();
+    for (const auto &f : *module) {
+      if (f.getCallingConv() != llvm::CallingConv::SPIR_KERNEL) {
+        continue;
+      }
+      kernelOpts[f.getName()].push_back(defaults);
+    }
+  } else {
+    for (const auto &S : KernelNameSpecs) {
+      llvm::StringRef name;
+      llvm::SmallVector<vecz::VeczPassOptions, 1> opts;
+      if (!parsePassOptionsSwitch(S, name, opts)) {
+        fprintf(stderr,
+                "failed to parse kernel vectorization specification '%s'\n",
+                S.c_str());
+        return 1;
+      }
+      if (!module->getFunction(name)) {
+        llvm::errs() << "Error: no such kernel to vectorize ('" << name
+                     << "')\n";
+        return 1;
+      }
+      kernelOpts[name] = std::move(opts);
+    }
+  }
+
+  // Open the file.
+  std::error_code EC;
+  llvm::sys::fs::OpenFlags OpenFlags = llvm::sys::fs::OF_None;
+  if (WriteTextual) {
+    OpenFlags |= llvm::sys::fs::OF_Text;
+  }
+  auto Out =
+      std::make_unique<llvm::ToolOutputFile>(OutputFilename, EC, OpenFlags);
+  if (EC || !Out) {
+    llvm::errs() << EC.message() << '\n';
+    return 1;
+  }
+
+  std::unique_ptr<llvm::TargetMachine> tm(
+      UserTriple.size() ? initLLVMTarget(UserTriple, UserCPU, CPUFeatures)
+                        : nullptr);
+  assert(!UserTriple.size() || tm);
+  if (tm) {
+    module->setTargetTriple(tm->getTargetTriple().getTriple());
+    module->setDataLayout(tm->createDataLayout());
+  }
+  // Register the analyses the vectorizer queries, then build the pipeline.
+  compiler::utils::PassMachinery passMach(context, tm.get());
+
+  auto TICallback = [&](const llvm::Module &) {
+    return vecz::createTargetInfoFromTargetMachine(tm.get());
+  };
+
+  passMach.initializeStart();
+  passMach.getMAM().registerPass(
+      [&] { return vecz::TargetInfoAnalysis(TICallback); });
+  passMach.getMAM().registerPass(
+      [&] { return compiler::utils::BuiltinInfoAnalysis(); });
+  passMach.getFAM().registerPass([] { return llvm::TargetIRAnalysis(); });
+  passMach.getMAM().registerPass([] {
+    compiler::utils::DeviceInfo Info{/*half*/ 0, /*float*/ 0, DoubleSupport,
+                                     /*MaxWorthWidth*/ 64};
+    return compiler::utils::DeviceInfoAnalysis(Info);
+  });
+  passMach.getMAM().registerPass([&kernelOpts] {
+    return vecz::VeczPassOptionsAnalysis(
+        [&kernelOpts](llvm::Function &F, llvm::ModuleAnalysisManager &,
+                      llvm::SmallVectorImpl<vecz::VeczPassOptions> &Opts) {
+          auto it = kernelOpts.find(F.getName());
+          if (it == kernelOpts.end()) {
+            return false;
+          }
+          Opts.assign(it->second.begin(), it->second.end());
+          return true;
+        });
+  });
+  passMach.initializeFinish();
+
+  llvm::ModulePassManager PM;
+
+  // Forcibly compute the BuiltinInfoAnalysis so that cached retrievals work.
+  PM.addPass(llvm::RequireAnalysisPass<compiler::utils::BuiltinInfoAnalysis,
+                                       llvm::Module>());
+
+  PM.addPass(llvm::createModuleToPostOrderCGSCCPassAdaptor(
+      compiler::utils::OptimalBuiltinReplacementPass()));
+  PM.addPass(vecz::RunVeczPass());
+  PM.run(*module, passMach.getMAM());
+
+  // If the user has specified a list of kernels to vectorize, we need to
+  // check we've matched their expectations. If they didn't specify we work on
+  // a "best-effort" basis
+  if (!KernelNameSpecs.empty()) {
+    for (const auto &p : kernelOpts) {
+      auto &f = *module->getFunction(p.first);
+      const auto &requested = p.getSecond();
+      llvm::SmallVector<compiler::utils::LinkMetadataResult, 1> results;
+      compiler::utils::parseOrigToVeczFnLinkMetadata(f, results);
+      for (auto &expected : requested) {
+        if (expected.vecz_auto) {
+          continue;
+        }
+        bool found = false;
+        for (auto &result : results) {
+          // FIXME: this is probably not the best way to do this
+          found |=
+              result.second.vf.getKnownMin() >= expected.factor.getKnownMin();
+        }
+        if (!found) {
+          llvm::errs() << "Error: Failed to vectorize function '" << f.getName()
+                       << "'\n";
+          return FailQuietly ? 0 : 1;
+        }
+      }
+    }
+  }
+
+  // Write the resulting module.
+  if (WriteTextual) {
+    Out->os() << *module;
+  } else {
+    llvm::WriteBitcodeToFile(*module, Out->os());
+  }
+
+  Out->keep();
+
+  if (llvm::AreStatisticsEnabled()) {
+    llvm::PrintStatistics();
+  }
+  return 0;
+}

From 4e8a584ff2ec413e3a4f01e732055ebd81abcd78 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Wed, 24 May 2023 10:28:46 +0100
Subject: [PATCH 002/182] [multi_llvm] Move to compiler module

This is (now) only used by compiler modules.
---
 .../include/multi_llvm/creation_apis_helper.h |  66 +++++++
 .../include/multi_llvm/llvm_version.h         |  38 ++++
 .../include/multi_llvm/multi_llvm.h           | 171 ++++++++++++++++++
 .../include/multi_llvm/opaque_pointers.h      |  56 ++++++
 .../include/multi_llvm/optional_helper.h      |  81 +++++++++
 .../include/multi_llvm/vector_type_helper.h   |  69 +++++++
 6 files changed, 481 insertions(+)
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/creation_apis_helper.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/llvm_version.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/opaque_pointers.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/optional_helper.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/vector_type_helper.h

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/creation_apis_helper.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/creation_apis_helper.h
new file mode 100644
index 0000000000000..cf1e5a80e6f0b
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/creation_apis_helper.h
@@ -0,0 +1,66 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#ifndef MULTI_LLVM_CREATION_APIS_HELPER_H_INCLUDED
+#define MULTI_LLVM_CREATION_APIS_HELPER_H_INCLUDED
+
+#include <llvm/ADT/None.h>
+#include <llvm/IR/BasicBlock.h>
+#include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/Instructions.h>
+#include <llvm/IR/Value.h>
+#include <llvm/Support/TypeSize.h>
+#include <multi_llvm/vector_type_helper.h>
+
+namespace multi_llvm {
+
+inline llvm::Value *createAllTrueMask(llvm::IRBuilder<> &B,
+                                      llvm::ElementCount EC) {
+  return llvm::ConstantInt::getTrue(llvm::VectorType::get(B.getInt1Ty(), EC));
+}  // i.e. the "true" constant of the i1 vector type with EC elements
+
+// Builds the lane-index vector <0, 1, 2, ...> of vector type `Ty`.
+inline llvm::Value *createIndexSequence(llvm::IRBuilder<> &Builder,
+                                        llvm::Type *Ty, llvm::ElementCount EC,
+                                        const llvm::Twine &Name = "") {
+  if (EC.isScalable()) {
+    // FIXME: This intrinsic works on fixed-length types too: should we migrate
+    // to using it starting from LLVM 13?
+    return Builder.CreateStepVector(Ty, Name);
+  }
+
+  llvm::Type *const EltTy = getVectorElementType(Ty);
+  llvm::SmallVector<llvm::Constant *, 16> Indices;
+  for (unsigned i = 0, e = EC.getFixedValue(); i != e; ++i) {
+    Indices.push_back(llvm::ConstantInt::get(EltTy, i));
+  }
+  return llvm::ConstantVector::get(Indices);
+}
+
+inline llvm::CallInst *createRISCVMaskedIntrinsic(
+    llvm::IRBuilder<> &B, llvm::Intrinsic::ID ID,
+    llvm::ArrayRef<llvm::Type *> Types, llvm::ArrayRef<llvm::Value *> Args,
+    unsigned TailPolicy, llvm::Instruction *FMFSource = nullptr,
+    const llvm::Twine &Name = "") {
+  llvm::SmallVector<llvm::Value *> InArgs(Args.begin(), Args.end());
+  InArgs.push_back(  // tail policy, as an integer as wide as the last arg
+      B.getIntN(Args.back()->getType()->getIntegerBitWidth(), TailPolicy));
+  return B.CreateIntrinsic(ID, Types, InArgs, FMFSource, Name);
+}
+
+}  // namespace multi_llvm
+
+#endif  // MULTI_LLVM_CREATION_APIS_HELPER_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/llvm_version.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/llvm_version.h
new file mode 100644
index 0000000000000..f1c9f3bdee1c3
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/llvm_version.h
@@ -0,0 +1,38 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#ifndef MULTI_LLVM_LLVM_VERSION_H_INCLUDED
+#define MULTI_LLVM_LLVM_VERSION_H_INCLUDED
+
+#include <llvm/Config/llvm-config.h>
+
+#define LLVM_VERSION_EQUAL(MAJOR, MINOR) \
+  (LLVM_VERSION_MAJOR == (MAJOR) && LLVM_VERSION_MINOR == (MINOR))
+
+#define LLVM_VERSION_LESS(MAJOR, MINOR) \
+  ((LLVM_VERSION_MAJOR < (MAJOR)) ||    \
+   (LLVM_VERSION_MAJOR == (MAJOR) && LLVM_VERSION_MINOR < (MINOR)))
+
+#define LLVM_VERSION_LESS_EQUAL(MAJOR, MINOR) \
+  (LLVM_VERSION_EQUAL(MAJOR, MINOR) || LLVM_VERSION_LESS(MAJOR, MINOR))
+
+#define LLVM_VERSION_GREATER(MAJOR, MINOR) \
+  ((LLVM_VERSION_MAJOR > (MAJOR)) ||       \
+   (LLVM_VERSION_MAJOR == (MAJOR) && LLVM_VERSION_MINOR > (MINOR)))
+
+#define LLVM_VERSION_GREATER_EQUAL(MAJOR, MINOR) \
+  (LLVM_VERSION_EQUAL(MAJOR, MINOR) || LLVM_VERSION_GREATER(MAJOR, MINOR))
+
+#endif  // MULTI_LLVM_LLVM_VERSION_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
new file mode 100644
index 0000000000000..33238e60f936f
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
@@ -0,0 +1,171 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#ifndef MULTI_LLVM_MULTI_LLVM_H_INCLUDED
+#define MULTI_LLVM_MULTI_LLVM_H_INCLUDED
+
+#include <llvm/ADT/ArrayRef.h>
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/ADT/Triple.h>
+#include <llvm/Analysis/IVDescriptors.h>
+#include <llvm/Analysis/TargetLibraryInfo.h>
+#include <llvm/IR/BasicBlock.h>
+#include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/Instructions.h>
+#include <llvm/Transforms/Utils/Cloning.h>
+#include <multi_llvm/llvm_version.h>
+
+namespace multi_llvm {
+
+/// @brief Wraps `size` elements starting at `data` in an `llvm::ArrayRef<T>`.
+///
+/// The pointer/length constructor used here exists in every LLVM release, so
+/// no version-specific dispatch to `llvm::makeArrayRef` is required.
+template <typename T>
+llvm::ArrayRef<T> ArrayRef(T *data, size_t size) {
+  return llvm::ArrayRef<T>(data, size);
+}
+
+/// @brief Views the elements currently held by `data` as an
+/// `llvm::ArrayRef<T>`, built from `data.data()` and `data.size()`.
+///
+/// The pointer/length constructor used here exists in every LLVM release, so
+/// `llvm::makeArrayRef` is not required.
+template <typename T>
+llvm::ArrayRef<T> ArrayRef(llvm::SmallVectorImpl<T> &data) {
+  return llvm::ArrayRef<T>(data.data(), data.size());
+}
+
+// Inlines the call `CI` (dereferenced unconditionally, so it must not be
+// null) via llvm::InlineFunction. For LLVM 16+ the call additionally passes
+// MergeAttributes=false and ForwardVarArgsTo=nullptr explicitly.
+inline llvm::InlineResult InlineFunction(llvm::CallInst *CI,
+                                         llvm::InlineFunctionInfo &IFI,
+                                         llvm::AAResults *CalleeAAR = nullptr,
+                                         bool InsertLifetime = true) {
+#if LLVM_VERSION_MAJOR >= 16
+  return llvm::InlineFunction(*CI, IFI, /* MergeAttributes */ false, CalleeAAR,
+                              InsertLifetime,
+                              /* *ForwardVarArgsTo */ nullptr);
+#else
+  return llvm::InlineFunction(*CI, IFI, CalleeAAR, InsertLifetime);
+#endif
+}
+
+inline llvm::StructType *getStructTypeByName(llvm::Module &module,
+                                             llvm::StringRef name) {
+  return llvm::StructType::getTypeByName(module.getContext(), name);
+}  // the lookup goes through the module's LLVMContext
+
+inline llvm::DILocation *getDILocation(unsigned Line, unsigned Column,
+                                       llvm::MDNode *Scope,
+                                       llvm::MDNode *InlinedAt = nullptr) {
+  // Without a scope the location is unknown, which is signalled by nullptr.
+  if (!Scope) return nullptr;
+  return llvm::DILocation::get(Scope->getContext(), Line, Column, Scope,
+                               InlinedAt, /*ImplicitCode*/ false);
+}
+
+inline void insertAtEnd(llvm::BasicBlock *bb, llvm::Instruction *newInst) {
+#if LLVM_VERSION_MAJOR < 16
+  bb->getInstList().push_back(newInst);  // append through the block's list
+#else
+  newInst->insertInto(bb, bb->end());  // insert at the block's end iterator
+#endif
+}
+
+template <typename T>
+inline typename std::remove_reference_t<T>::ScalarTy getFixedValue(T &&V) {
+#if LLVM_VERSION_MAJOR < 16
+  return V.getFixedSize();  // spelling used for LLVM releases before 16
+#else
+  return V.getFixedValue();
+#endif
+}
+
+template <typename T>
+inline typename std::remove_reference_t<T>::ScalarTy getKnownMinValue(T &&M) {
+#if LLVM_VERSION_MAJOR < 16
+  return M.getKnownMinSize();  // spelling used for LLVM releases before 16
+#else
+  return M.getKnownMinValue();
+#endif
+}
+
+/// @brief Create a binary operation corresponding to the given
+/// `llvm::RecurKind` with the two provided arguments. It may not
+/// necessarily return one of LLVM's in-built `BinaryOperator`s: the min/max
+/// kinds are emitted as a call to the matching min/max intrinsic.
+///
+/// @param[in] B the IRBuilder to build new instructions
+/// @param[in] lhs the left-hand value for the operation
+/// @param[in] rhs the right-hand value for the operation
+/// @param[in] kind the kind of operation to create
+///
+/// @return The value computing the operation, or nullptr when `kind` is
+/// `llvm::RecurKind::None` or, with assertions compiled out, any kind not
+/// handled below.
+inline llvm::Value *createBinOpForRecurKind(llvm::IRBuilder<> &B,
+                                            llvm::Value *lhs, llvm::Value *rhs,
+                                            llvm::RecurKind kind) {
+  switch (kind) {
+    case llvm::RecurKind::None:
+      return nullptr;
+    case llvm::RecurKind::Add:
+      return B.CreateAdd(lhs, rhs);
+    case llvm::RecurKind::Mul:
+      return B.CreateMul(lhs, rhs);
+    case llvm::RecurKind::Or:
+      return B.CreateOr(lhs, rhs);
+    case llvm::RecurKind::And:
+      return B.CreateAnd(lhs, rhs);
+    case llvm::RecurKind::Xor:
+      return B.CreateXor(lhs, rhs);
+    case llvm::RecurKind::FAdd:
+      return B.CreateFAdd(lhs, rhs);
+    case llvm::RecurKind::FMul:
+      return B.CreateFMul(lhs, rhs);
+    case llvm::RecurKind::FMin:
+      return B.CreateBinaryIntrinsic(llvm::Intrinsic::minnum, lhs, rhs);
+    case llvm::RecurKind::FMax:
+      return B.CreateBinaryIntrinsic(llvm::Intrinsic::maxnum, lhs, rhs);
+    case llvm::RecurKind::SMin:
+      return B.CreateBinaryIntrinsic(llvm::Intrinsic::smin, lhs, rhs);
+    case llvm::RecurKind::SMax:
+      return B.CreateBinaryIntrinsic(llvm::Intrinsic::smax, lhs, rhs);
+    case llvm::RecurKind::UMin:
+      return B.CreateBinaryIntrinsic(llvm::Intrinsic::umin, lhs, rhs);
+    case llvm::RecurKind::UMax:
+      return B.CreateBinaryIntrinsic(llvm::Intrinsic::umax, lhs, rhs);
+    default:
+      break;
+  }
+  // Any other kind is unsupported; fail loudly rather than guess an opcode.
+  assert(false && "Unexpected min/max kind");
+  return nullptr;
+}
+
+inline void addVectorizableFunctionsFromVecLib(
+    llvm::TargetLibraryInfoImpl &TLII,
+    llvm::TargetLibraryInfoImpl::VectorLibrary VecLib, llvm::Triple TT) {
+#if LLVM_VERSION_MAJOR < 16
+  (void)TT;  // the call below takes no triple, so TT is unused here
+  TLII.addVectorizableFunctionsFromVecLib(VecLib);
+#else
+  TLII.addVectorizableFunctionsFromVecLib(VecLib, TT);
+#endif
+}
+}  // namespace multi_llvm
+
+#endif  // MULTI_LLVM_MULTI_LLVM_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/opaque_pointers.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/opaque_pointers.h
new file mode 100644
index 0000000000000..9c871f9a0c3ef
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/opaque_pointers.h
@@ -0,0 +1,56 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#ifndef MULTI_LLVM_OPAQUE_POINTERS_H_INCLUDED
+#define MULTI_LLVM_OPAQUE_POINTERS_H_INCLUDED
+
+#include <llvm/IR/DerivedTypes.h>
+#include <llvm/IR/Type.h>
+
+namespace multi_llvm {
+// Returns true only when `Ty` is a pointer type and that pointer is opaque.
+inline bool isOpaquePointerTy(llvm::Type *Ty) {
+  auto *const PTy = llvm::dyn_cast<llvm::PointerType>(Ty);
+  return PTy && PTy->isOpaque();
+}
+
+// Returns true when `PTy` is opaque or, for typed pointers (LLVM 14 and
+// older only), when its pointee type is `EltTy`.
+inline bool isOpaqueOrPointeeTypeMatches(llvm::PointerType *PTy,
+                                         llvm::Type *EltTy) {
+#if LLVM_VERSION_MAJOR >= 15
+  (void)EltTy;
+  (void)PTy;  // PTy is otherwise unused when assertions are compiled out
+  assert(PTy->isOpaque() && "No support for typed pointers in LLVM 15+");
+  return true;
+#else
+  return PTy->isOpaque() || PTy->getPointerElementType() == EltTy;
+#endif
+}
+
+// Returns the pointee type of a typed pointer, or nullptr for an opaque one.
+inline llvm::Type *getPtrElementType(llvm::PointerType *PTy) {
+  if (PTy->isOpaque()) return nullptr;
+#if LLVM_VERSION_MAJOR >= 15
+  assert(false && "No support for typed pointers in LLVM 15+");
+  return nullptr;
+#else
+  return PTy->getPointerElementType();
+#endif
+}
+
+}  // namespace multi_llvm
+
+#endif  // MULTI_LLVM_OPAQUE_POINTERS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/optional_helper.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/optional_helper.h
new file mode 100644
index 0000000000000..454b091dabfa3
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/optional_helper.h
@@ -0,0 +1,81 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef MULTI_LLVM_OPTIONAL_HELPER_H_INCLUDED
+#define MULTI_LLVM_OPTIONAL_HELPER_H_INCLUDED
+
+#include <llvm/ADT/None.h>
+#include <llvm/ADT/Optional.h>
+
+#if (LLVM_VERSION_MAJOR >= 16)
+#include <optional>
+#endif
+
+namespace multi_llvm {
+
+#if (LLVM_VERSION_MAJOR >= 16)
+
+// LLVM 16 and newer: alias the standard library types directly.
+template <typename T>
+using Optional = std::optional<T>;
+static constexpr std::nullopt_t None = std::nullopt;
+
+#else
+
+using llvm::None;
+using llvm::NoneType;
+// Older LLVM: llvm::Optional extended with std::optional-style accessors.
+template <typename T>
+class Optional : public llvm::Optional<T> {
+ public:
+  constexpr Optional() = default;
+  constexpr Optional(llvm::NoneType) {}
+
+  constexpr Optional(const T &value) : llvm::Optional<T>(value) {}
+  constexpr Optional(T &&value) : llvm::Optional<T>(std::move(value)) {}
+  constexpr Optional(llvm::Optional<T> &&value)
+      : llvm::Optional<T>(std::move(value)) {}
+
+  Optional &operator=(const T &y) {
+    llvm::Optional<T>::operator=(y);
+    return *this;
+  }
+  Optional &operator=(T &&y) {
+    // T is not a deduced type here, so `y` is always an rvalue: move it.
+    llvm::Optional<T>::operator=(std::move(y));
+    return *this;
+  }
+
+  constexpr bool has_value() const { return llvm::Optional<T>::hasValue(); }
+
+#if (LLVM_VERSION_MAJOR <= 14)
+  // Forwarders to llvm::Optional's getValue()/getValueOr() spellings.
+  constexpr const T &value() const { return llvm::Optional<T>::getValue(); }
+  constexpr T &value() { return llvm::Optional<T>::getValue(); }
+
+  template <typename U>
+  constexpr T value_or(U &&alt) const & {
+    // Forward `alt` so an rvalue fallback is not degraded to an lvalue.
+    return llvm::Optional<T>::getValueOr(std::forward<U>(alt));
+  }
+#endif
+};
+
+#endif
+
+}  // namespace multi_llvm
+
+#endif  // MULTI_LLVM_OPTIONAL_HELPER_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/vector_type_helper.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/vector_type_helper.h
new file mode 100644
index 0000000000000..3b281a47b94b0
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/vector_type_helper.h
@@ -0,0 +1,69 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#ifndef MULTI_LLVM_VECTOR_TYPE_HELPER_H_INCLUDED
+#define MULTI_LLVM_VECTOR_TYPE_HELPER_H_INCLUDED
+
+#include <llvm/IR/DerivedTypes.h>
+#include <llvm/IR/Type.h>
+#include <llvm/Support/TypeSize.h>
+
+namespace multi_llvm {
+
+// The functions defined below are common functions to allow us to generically
+// get VectorType information from a base Type class, due to either deprecation
+// or removal of these in LLVM 11 (result of scalable/fixed vectors separation)
+
+// Element type of vector type `ty`; asserts that `ty` really is a vector.
+inline llvm::Type *getVectorElementType(const llvm::Type *ty) {
+  assert(llvm::isa<llvm::VectorType>(ty) && "Not a vector type");
+  return llvm::cast<llvm::VectorType>(ty)->getElementType();
+}
+inline llvm::Type *getVectorElementType(llvm::Type *ty) {
+  return getVectorElementType(static_cast<const llvm::Type *>(ty));
+}
+
+// Number of lanes of the fixed-width vector type `ty`. Asserts if `ty` has
+// any type ID other than FixedVectorTyID.
+inline unsigned getVectorNumElements(const llvm::Type *ty) {
+  assert(ty->getTypeID() == llvm::Type::FixedVectorTyID &&
+         "Not a fixed vector type");
+  const auto *const fixedTy = llvm::cast<llvm::FixedVectorType>(ty);
+  return fixedTy->getElementCount().getFixedValue();
+}
+
+// Overload for non-const types; defers to the const overload for both the
+// check and the lookup.
+inline unsigned getVectorNumElements(llvm::Type *ty) {
+  return getVectorNumElements(static_cast<const llvm::Type *>(ty));
+}
+
+inline llvm::ElementCount getVectorElementCount(const llvm::Type *ty) {
+  return llvm::cast<llvm::VectorType>(ty)->getElementCount();
+}
+inline llvm::ElementCount getVectorElementCount(llvm::Type *ty) {
+  return getVectorElementCount(static_cast<const llvm::Type *>(ty));
+}
+
+// Known-minimum value of the element count of vector type `ty`.
+inline unsigned getVectorKnownMinNumElements(const llvm::Type *ty) {
+  return getVectorElementCount(ty).getKnownMinValue();
+}
+inline unsigned getVectorKnownMinNumElements(llvm::Type *ty) {
+  return getVectorKnownMinNumElements(static_cast<const llvm::Type *>(ty));
+}
+}  // namespace multi_llvm
+
+#endif  // MULTI_LLVM_VECTOR_TYPE_HELPER_H_INCLUDED

From 41376e61e1632ed73af8add0bd4241e583dc353c Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Thu, 25 May 2023 12:22:40 +0100
Subject: [PATCH 003/182] [lit] Have lit configure tools in-house

This removes our dependency on custom tool substitutions and build-time
paths to LLVM tools by having lit perform its own tool substitution.

The tool binaries are sourced from the same places as they are
currently: by checking the bin/ folder, the CA_BUILTINS_TOOL_DIR,
CA_LLVM_TOOLS_DIR, and the PATH, in that order.

Lit will now replace the tool with the full path, removing the need for
our custom substitutions.

Support for switching out tools with environment variables at runtime
has been kept. The emulator support should behave the same way too.
---
 .../lit/llvm/AArch64/shuffled_load_aarch64_1.ll    |  2 +-
 .../lit/llvm/AArch64/shuffled_load_aarch64_2.ll    |  2 +-
 .../lit/llvm/AArch64/shuffled_load_aarch64_3.ll    |  2 +-
 .../lit/llvm/AArch64/shuffled_load_aarch64_4.ll    |  2 +-
 .../lit/llvm/AArch64/shuffled_load_aarch64_5.ll    |  2 +-
 .../lit/llvm/AArch64/shuffled_load_aarch64_6.ll    |  2 +-
 .../vecz/test/lit/llvm/Boscc/boscc_killer.ll       |  2 +-
 .../vecz/test/lit/llvm/Boscc/boscc_merge.ll        |  2 +-
 .../vecz/test/lit/llvm/Boscc/boscc_merge2.ll       |  2 +-
 .../vecz/test/lit/llvm/Boscc/boscc_merge3.ll       |  2 +-
 .../test/lit/llvm/Boscc/duplicate_preheader.ll     |  2 +-
 .../vecz/test/lit/llvm/Boscc/nested_loops1.ll      |  2 +-
 .../vecz/test/lit/llvm/Boscc/nested_loops2.ll      |  2 +-
 .../vecz/test/lit/llvm/Boscc/nested_loops3.ll      |  2 +-
 .../vecz/test/lit/llvm/Boscc/nested_loops4.ll      |  2 +-
 .../vecz/test/lit/llvm/Boscc/nested_loops5.ll      |  2 +-
 .../test/lit/llvm/Boscc/partial_linearization0.ll  |  2 +-
 .../test/lit/llvm/Boscc/partial_linearization1.ll  |  2 +-
 .../test/lit/llvm/Boscc/partial_linearization10.ll |  2 +-
 .../test/lit/llvm/Boscc/partial_linearization11.ll |  2 +-
 .../test/lit/llvm/Boscc/partial_linearization12.ll |  2 +-
 .../test/lit/llvm/Boscc/partial_linearization13.ll |  2 +-
 .../test/lit/llvm/Boscc/partial_linearization14.ll |  2 +-
 .../test/lit/llvm/Boscc/partial_linearization15.ll |  2 +-
 .../test/lit/llvm/Boscc/partial_linearization16.ll |  2 +-
 .../test/lit/llvm/Boscc/partial_linearization17.ll |  2 +-
 .../test/lit/llvm/Boscc/partial_linearization18.ll |  2 +-
 .../test/lit/llvm/Boscc/partial_linearization19.ll |  2 +-
 .../test/lit/llvm/Boscc/partial_linearization2.ll  |  2 +-
 .../test/lit/llvm/Boscc/partial_linearization20.ll |  2 +-
 .../test/lit/llvm/Boscc/partial_linearization21.ll |  2 +-
 .../test/lit/llvm/Boscc/partial_linearization22.ll |  2 +-
 .../test/lit/llvm/Boscc/partial_linearization3.ll  |  2 +-
 .../test/lit/llvm/Boscc/partial_linearization4.ll  |  2 +-
 .../test/lit/llvm/Boscc/partial_linearization5.ll  |  2 +-
 .../test/lit/llvm/Boscc/partial_linearization6.ll  |  2 +-
 .../test/lit/llvm/Boscc/partial_linearization7.ll  |  2 +-
 .../test/lit/llvm/Boscc/partial_linearization8.ll  |  2 +-
 .../test/lit/llvm/Boscc/partial_linearization9.ll  |  2 +-
 .../vecz/test/lit/llvm/Boscc/printf.ll             |  2 +-
 .../test/lit/llvm/Boscc/scalable_linearization.ll  |  2 +-
 .../test/lit/llvm/OpaquePointers/basic_mem2reg.ll  |  2 +-
 .../lit/llvm/OpaquePointers/basic_vecz_mem2reg.ll  |  2 +-
 .../llvm/OpaquePointers/builtin_inlining_mem.ll    |  2 +-
 .../llvm/OpaquePointers/builtin_pointer_return.ll  |  2 +-
 .../OpaquePointers/control_flow_conversion_ptrs.ll |  2 +-
 .../llvm/OpaquePointers/interleaved_load_ooo.ll    |  2 +-
 .../test/lit/llvm/OpaquePointers/load_add_store.ll |  2 +-
 .../test/lit/llvm/OpaquePointers/masked_store.ll   |  2 +-
 .../test/lit/llvm/OpaquePointers/remove_intptr.ll  |  2 +-
 .../lit/llvm/OpaquePointers/ternary_transform.ll   |  2 +-
 .../define_interleaved_store.ll                    |  2 +-
 .../define_interleaved_store_as_masked.ll          |  2 +-
 .../PartialScalarization/vector_phi_uniform.ll     |  2 +-
 .../PartialScalarization/vector_phi_varying.ll     |  2 +-
 .../vecz/test/lit/llvm/RISCV/broadcast_vector.ll   |  2 +-
 .../test/lit/llvm/RISCV/define_subgroup_scans.ll   |  2 +-
 .../lit/llvm/RISCV/define_subgroup_scans_vp.ll     |  2 +-
 .../vecz/test/lit/llvm/RISCV/extract_element.ll    | 12 ++++++------
 .../vecz/test/lit/llvm/RISCV/insert_element.ll     | 10 +++++-----
 .../vecz/test/lit/llvm/RISCV/packetize_shuffle.ll  |  2 +-
 .../test/lit/llvm/RISCV/packetize_shuffle_bool.ll  |  2 +-
 .../lit/llvm/RISCV/packetize_shuffle_concat.ll     |  2 +-
 .../lit/llvm/RISCV/packetize_shuffle_narrow.ll     |  2 +-
 .../test/lit/llvm/RISCV/packetize_shuffle_wider.ll |  2 +-
 .../test/lit/llvm/RISCV/select_scalar_vector.ll    |  2 +-
 .../vecz/test/lit/llvm/RISCV/vp_memops.ll          | 12 ++++++------
 .../vecz/test/lit/llvm/RISCV/vp_vsetvli.ll         |  2 +-
 .../lit/llvm/ScalableVectors/broadcast_vector.ll   |  2 +-
 .../vecz/test/lit/llvm/ScalableVectors/builtins.ll |  2 +-
 .../vecz/test/lit/llvm/ScalableVectors/cast.ll     |  2 +-
 .../ScalableVectors/define_interleaved_store.ll    |  2 +-
 .../define_interleaved_store_as_masked.ll          |  2 +-
 .../lit/llvm/ScalableVectors/define_masked_load.ll |  2 +-
 .../define_masked_scatter_gather.ll                |  2 +-
 .../llvm/ScalableVectors/define_subgroup_scans.ll  |  2 +-
 .../ScalableVectors/define_subgroup_scans_vp.ll    |  2 +-
 .../lit/llvm/ScalableVectors/extract_element.ll    | 10 +++++-----
 .../vecz/test/lit/llvm/ScalableVectors/fadd.ll     |  2 +-
 .../test/lit/llvm/ScalableVectors/fail_builtins.ll |  2 +-
 .../lit/llvm/ScalableVectors/insert_element.ll     |  8 ++++----
 .../lit/llvm/ScalableVectors/interleaved_load.ll   |  2 +-
 .../test/lit/llvm/ScalableVectors/intrinsics.ll    |  6 +++---
 .../test/lit/llvm/ScalableVectors/lit.local.cfg    |  2 +-
 .../lit/llvm/ScalableVectors/load_add_store.ll     |  2 +-
 .../lit/llvm/ScalableVectors/load_binops_store.ll  |  2 +-
 .../vecz/test/lit/llvm/ScalableVectors/metadata.ll |  2 +-
 .../llvm/ScalableVectors/packetize_mask_varying.ll |  2 +-
 .../test/lit/llvm/ScalableVectors/scalable_auto.ll |  2 +-
 .../vecz/test/lit/llvm/ScalableVectors/select.ll   |  2 +-
 .../llvm/ScalableVectors/select_scalar_vector.ll   |  2 +-
 .../vecz/test/lit/llvm/ScalableVectors/shuffle.ll  |  2 +-
 .../lit/llvm/ScalableVectors/subgroup_builtins.ll  |  2 +-
 .../lit/llvm/ScalableVectors/subgroup_scans.ll     |  2 +-
 ...oup_scans_spv_khr_uniform_group_instructions.ll |  2 +-
 ..._scans_spv_khr_uniform_group_instructions_vp.ll |  2 +-
 .../lit/llvm/ScalableVectors/subgroup_scans_vp.ll  |  2 +-
 .../vecz/test/lit/llvm/ScalableVectors/vectors.ll  |  2 +-
 .../llvm/ScalableVectors/verification_fail_phi.ll  |  2 +-
 .../test/lit/llvm/ScalableVectors/widen_vload.ll   |  2 +-
 .../lit/llvm/ScalableVectors/workitem_funcs.ll     |  2 +-
 .../lit/llvm/VectorPredication/boscc_reduction.ll  |  2 +-
 .../vecz/test/lit/llvm/VectorPredication/choice.ll |  2 +-
 .../VectorPredication/compute_vector_length.ll     |  4 ++--
 .../define_interleaved_load_store.ll               |  2 +-
 .../VectorPredication/define_masked_load_store.ll  |  2 +-
 .../define_masked_scatter_gather.ll                |  2 +-
 .../VectorPredication/define_subgroup_scans.ll     |  2 +-
 .../lit/llvm/VectorPredication/load_add_store.ll   |  8 ++++----
 .../VectorPredication/packetize_mask_varying.ll    |  2 +-
 .../lit/llvm/VectorPredication/scatter_gather.ll   |  2 +-
 .../llvm/VectorPredication/subgroup_reductions.ll  |  2 +-
 ...eductions_spv_khr_uniform_group_instructions.ll |  2 +-
 .../lit/llvm/VectorPredication/subgroup_scans.ll   |  2 +-
 ...oup_scans_spv_khr_uniform_group_instructions.ll |  2 +-
 .../vecz/test/lit/llvm/VectorPredication/udiv.ll   |  2 +-
 .../llvm/VectorWidening/define_interleaved_load.ll |  2 +-
 .../define_interleaved_load_as_masked.ll           |  2 +-
 .../llvm/VectorWidening/delete_packetized_memop.ll |  2 +-
 .../extractelement_constant_index.ll               |  2 +-
 .../VectorWidening/extractelement_runtime_index.ll |  2 +-
 .../extractelement_runtime_index2.ll               |  2 +-
 .../extractelement_runtime_index3.ll               |  2 +-
 .../VectorWidening/insertelement_constant_index.ll |  2 +-
 .../insertelement_constant_index_constant_value.ll |  2 +-
 .../VectorWidening/insertelement_runtime_index.ll  |  2 +-
 .../lit/llvm/VectorWidening/interleaved_safety.ll  |  2 +-
 .../onearg_relationals_isfiniteDv4_d.ll            |  2 +-
 .../onearg_relationals_isfiniteDv4_f.ll            |  2 +-
 .../onearg_relationals_isinfDv4_d.ll               |  2 +-
 .../onearg_relationals_isinfDv4_f.ll               |  2 +-
 .../onearg_relationals_isnanDv4_d.ll               |  2 +-
 .../onearg_relationals_isnanDv4_f.ll               |  2 +-
 .../onearg_relationals_isnormalDv4_d.ll            |  2 +-
 .../onearg_relationals_isnormalDv4_f.ll            |  2 +-
 .../lit/llvm/VectorWidening/scalar_vector_user.ll  |  2 +-
 .../test/lit/llvm/VectorWidening/vector_copy.ll    |  2 +-
 .../lit/llvm/VectorWidening/vector_phi_varying.ll  |  2 +-
 .../vecz/test/lit/llvm/VectorWidening/widen_abs.ll |  2 +-
 .../test/lit/llvm/VectorWidening/widen_binops.ll   |  2 +-
 .../test/lit/llvm/VectorWidening/widen_copysign.ll |  2 +-
 .../vecz/test/lit/llvm/VectorWidening/widen_fma.ll |  2 +-
 .../VectorWidening/widen_fmin_vector_scalar.ll     |  2 +-
 .../test/lit/llvm/VectorWidening/widen_fmuladd.ll  |  2 +-
 .../test/lit/llvm/VectorWidening/widen_fmuladd2.ll |  2 +-
 .../lit/llvm/VectorWidening/widen_fmuladd_phi.ll   |  2 +-
 .../test/lit/llvm/VectorWidening/widen_fshl.ll     |  2 +-
 .../test/lit/llvm/VectorWidening/widen_fshr.ll     |  2 +-
 .../lit/llvm/VectorWidening/widen_shufflevector.ll |  2 +-
 .../test/lit/llvm/VectorWidening/widen_sqrt.ll     |  2 +-
 .../vecz/test/lit/llvm/alloca_alias.ll             |  2 +-
 .../vecz/test/lit/llvm/arm_neon_store.ll           |  2 +-
 .../test/lit/llvm/async_workgroup_copy_uniform.ll  |  2 +-
 .../vecz/test/lit/llvm/atomic_cmpxchg.ll           |  2 +-
 .../vecz/test/lit/llvm/atomicrmw.ll                |  2 +-
 .../vecz/test/lit/llvm/atomicrmw_uniform.ll        |  2 +-
 .../vecz/test/lit/llvm/basic_mem2reg.ll            |  2 +-
 .../vecz/test/lit/llvm/bitcast_function.ll         |  2 +-
 .../vecz/test/lit/llvm/branch_splitting_and.ll     |  2 +-
 .../vecz/test/lit/llvm/branch_splitting_or.ll      |  2 +-
 .../vecz/test/lit/llvm/builtin_inlining_addsat.ll  |  2 +-
 .../vecz/test/lit/llvm/builtin_inlining_clamp.ll   |  2 +-
 .../vecz/test/lit/llvm/builtin_inlining_fmax.ll    |  2 +-
 .../vecz/test/lit/llvm/builtin_inlining_fmin.ll    |  2 +-
 .../vecz/test/lit/llvm/builtin_inlining_mem.ll     |  2 +-
 .../test/lit/llvm/builtin_inlining_negative.ll     |  2 +-
 .../test/lit/llvm/builtin_inlining_positive.ll     |  2 +-
 .../vecz/test/lit/llvm/builtin_pointer_return.ll   |  2 +-
 .../call_instantiation_failure_cantduplicate.ll    |  2 +-
 .../llvm/call_instantiation_failure_cantinline.ll  |  2 +-
 .../lit/llvm/call_instantiation_failure_optnone.ll |  2 +-
 .../call_instantiation_failure_user_undefined.ll   |  2 +-
 .../lit/llvm/call_instantiation_success_builtin.ll |  2 +-
 .../llvm/call_instantiation_success_instrinsic.ll  |  2 +-
 .../call_instantiation_success_user_defined.ll     |  2 +-
 .../vecz/test/lit/llvm/constant_address.ll         |  2 +-
 .../test/lit/llvm/constant_address_with_uniform.ll |  2 +-
 .../vecz/test/lit/llvm/contiguous_allocas.ll       |  2 +-
 .../llvm/control_flow_conversion_nested_loops.ll   |  2 +-
 .../lit/llvm/control_flow_conversion_order_y.ll    |  2 +-
 .../lit/llvm/control_flow_conversion_order_z.ll    |  2 +-
 .../test/lit/llvm/control_flow_conversion_ptrs.ll  |  2 +-
 .../lit/llvm/control_flow_conversion_uniform_if.ll |  2 +-
 .../llvm/control_flow_conversion_uniform_loop.ll   |  2 +-
 .../lit/llvm/control_flow_conversion_varying_if.ll |  2 +-
 .../llvm/control_flow_conversion_varying_loop.ll   |  2 +-
 .../compiler_passes/vecz/test/lit/llvm/convert3.ll |  2 +-
 .../compiler_passes/vecz/test/lit/llvm/convert4.ll |  2 +-
 .../vecz/test/lit/llvm/convert_contiguity.ll       |  2 +-
 .../vecz/test/lit/llvm/define_gather_load.ll       |  2 +-
 .../test/lit/llvm/define_gather_load_as_masked.ll  |  2 +-
 .../vecz/test/lit/llvm/define_interleaved_load.ll  |  2 +-
 .../lit/llvm/define_interleaved_load_as_masked.ll  |  2 +-
 .../vecz/test/lit/llvm/define_interleaved_store.ll |  2 +-
 .../lit/llvm/define_interleaved_store_as_masked.ll |  2 +-
 .../vecz/test/lit/llvm/define_internal_builtins.ll |  2 +-
 .../test/lit/llvm/define_masked_gather_load.ll     |  2 +-
 .../vecz/test/lit/llvm/define_masked_load.ll       |  2 +-
 .../test/lit/llvm/define_masked_scatter_store.ll   |  2 +-
 .../vecz/test/lit/llvm/define_masked_store.ll      |  2 +-
 .../vecz/test/lit/llvm/define_scatter_store.ll     |  2 +-
 .../lit/llvm/define_scatter_store_as_masked.ll     |  2 +-
 .../vecz/test/lit/llvm/define_subgroup_scans.ll    |  2 +-
 .../vecz/test/lit/llvm/delete_packetized_memop.ll  |  2 +-
 .../vecz/test/lit/llvm/diverging_loop.ll           |  2 +-
 .../vecz/test/lit/llvm/diverging_nested_loop.ll    |  2 +-
 .../vecz/test/lit/llvm/early-cse-mul-swap.ll       |  2 +-
 .../vecz/test/lit/llvm/emit_memintrinsics.ll       |  2 +-
 .../lit/llvm/emit_no_unaligned_memintrinsics.ll    |  2 +-
 .../vecz/test/lit/llvm/expect_assume.ll            |  2 +-
 .../test/lit/llvm/extractelement_constant_index.ll |  2 +-
 .../test/lit/llvm/extractelement_runtime_index.ll  |  2 +-
 .../vecz/test/lit/llvm/gep_duplication.ll          |  2 +-
 .../vecz/test/lit/llvm/gep_elim_opaque.ll          |  2 +-
 .../vecz/test/lit/llvm/indirect_call.ll            |  2 +-
 .../test/lit/llvm/inlined_function_debug_info.ll   |  2 +-
 .../test/lit/llvm/insert_element_debug_info.ll     |  2 +-
 .../test/lit/llvm/insertelement_constant_index.ll  |  2 +-
 .../test/lit/llvm/insertelement_runtime_index.ll   |  2 +-
 .../vecz/test/lit/llvm/instantiate_constants.ll    |  2 +-
 .../lit/llvm/interleaved_defuse_instantiated.ll    |  2 +-
 .../vecz/test/lit/llvm/interleaved_load16.ll       |  2 +-
 .../vecz/test/lit/llvm/interleaved_load_ooo.ll     |  2 +-
 .../vecz/test/lit/llvm/interleaved_safety.ll       |  2 +-
 .../vecz/test/lit/llvm/intrinsics-scalarize.ll     | 14 +++++++-------
 .../vecz/test/lit/llvm/intrinsics.ll               | 14 +++++++-------
 .../vecz/test/lit/llvm/irreducible_loop.ll         |  2 +-
 .../vecz/test/lit/llvm/loop_call_instantiation.ll  |  2 +-
 .../vecz/test/lit/llvm/masked_calls_max_builtin.ll |  2 +-
 .../vecz/test/lit/llvm/masked_interleaved.ll       |  2 +-
 .../test/lit/llvm/masked_interleaved_as_scatter.ll |  2 +-
 .../vecz/test/lit/llvm/masked_interleaved_group.ll |  2 +-
 .../test/lit/llvm/masked_interleaved_group2.ll     |  2 +-
 .../vecz/test/lit/llvm/masking_exit_blocks.ll      |  2 +-
 .../vecz/test/lit/llvm/mem2reg_alloca_pointer1.ll  |  2 +-
 .../vecz/test/lit/llvm/mem2reg_alloca_pointer2.ll  |  2 +-
 .../vecz/test/lit/llvm/memop_stride.ll             |  2 +-
 .../vecz/test/lit/llvm/memop_stride10.ll           |  2 +-
 .../vecz/test/lit/llvm/memop_stride11.ll           |  2 +-
 .../vecz/test/lit/llvm/memop_stride12.ll           |  2 +-
 .../vecz/test/lit/llvm/memop_stride13.ll           |  2 +-
 .../vecz/test/lit/llvm/memop_stride14.ll           |  2 +-
 .../vecz/test/lit/llvm/memop_stride15.ll           |  2 +-
 .../vecz/test/lit/llvm/memop_stride16.ll           |  2 +-
 .../vecz/test/lit/llvm/memop_stride17.ll           |  2 +-
 .../vecz/test/lit/llvm/memop_stride18.ll           |  2 +-
 .../vecz/test/lit/llvm/memop_stride2.ll            |  2 +-
 .../vecz/test/lit/llvm/memop_stride3.ll            |  2 +-
 .../vecz/test/lit/llvm/memop_stride4.ll            |  2 +-
 .../vecz/test/lit/llvm/memop_stride5.ll            |  2 +-
 .../vecz/test/lit/llvm/memop_stride6.ll            |  2 +-
 .../vecz/test/lit/llvm/memop_stride7.ll            |  2 +-
 .../vecz/test/lit/llvm/memop_stride8.ll            |  2 +-
 .../vecz/test/lit/llvm/memop_stride9.ll            |  2 +-
 .../vecz/test/lit/llvm/multiple_exit_blocks.ll     |  2 +-
 .../test/lit/llvm/multiple_kernels_inlining.ll     |  2 +-
 .../test/lit/llvm/multiple_vectorization_flags.ll  |  2 +-
 .../vecz/test/lit/llvm/multiple_vectorizations.ll  |  2 +-
 .../lit/llvm/multiple_vectorizations_nested.ll     |  4 ++--
 .../test/lit/llvm/multiple_vectorizations_vp.ll    |  2 +-
 .../vecz/test/lit/llvm/no_instantiate_memop.ll     |  2 +-
 .../vecz/test/lit/llvm/no_over_scalarization.ll    |  2 +-
 .../vecz/test/lit/llvm/no_redundant_bitcasts.ll    |  2 +-
 .../compiler_passes/vecz/test/lit/llvm/no_vecz1.ll |  2 +-
 .../compiler_passes/vecz/test/lit/llvm/no_vecz2.ll |  2 +-
 .../vecz/test/lit/llvm/offset_info_analysis.ll     |  2 +-
 .../lit/llvm/onearg_relationals_isfiniteDv4_d.ll   |  2 +-
 .../lit/llvm/onearg_relationals_isfiniteDv4_f.ll   |  2 +-
 .../test/lit/llvm/onearg_relationals_isfinited.ll  |  2 +-
 .../test/lit/llvm/onearg_relationals_isfinitef.ll  |  2 +-
 .../test/lit/llvm/onearg_relationals_isinfDv4_d.ll |  2 +-
 .../test/lit/llvm/onearg_relationals_isinfDv4_f.ll |  2 +-
 .../test/lit/llvm/onearg_relationals_isinfd.ll     |  2 +-
 .../test/lit/llvm/onearg_relationals_isinff.ll     |  2 +-
 .../test/lit/llvm/onearg_relationals_isnanDv4_d.ll |  2 +-
 .../test/lit/llvm/onearg_relationals_isnanDv4_f.ll |  2 +-
 .../test/lit/llvm/onearg_relationals_isnand.ll     |  2 +-
 .../test/lit/llvm/onearg_relationals_isnanf.ll     |  2 +-
 .../lit/llvm/onearg_relationals_isnormalDv4_d.ll   |  2 +-
 .../lit/llvm/onearg_relationals_isnormalDv4_f.ll   |  2 +-
 .../test/lit/llvm/onearg_relationals_isnormald.ll  |  2 +-
 .../test/lit/llvm/onearg_relationals_isnormalf.ll  |  2 +-
 .../vecz/test/lit/llvm/opencl_metadata1.ll         |  2 +-
 .../vecz/test/lit/llvm/opencl_metadata2.ll         |  2 +-
 .../vecz/test/lit/llvm/overaligned_allocas.ll      |  2 +-
 .../vecz/test/lit/llvm/packetization_branch.ll     |  2 +-
 .../vecz/test/lit/llvm/packetization_debug_info.ll |  2 +-
 .../vecz/test/lit/llvm/packetization_nonvarying.ll |  2 +-
 .../test/lit/llvm/packetization_uniform_branch.ll  |  2 +-
 .../vecz/test/lit/llvm/packetize_struct_gep.ll     |  2 +-
 .../test/lit/llvm/packetize_uniform_conditional.ll |  2 +-
 .../llvm/packetize_uniform_default_conditional.ll  |  2 +-
 .../lit/llvm/packetize_uniform_default_noreduce.ll |  2 +-
 .../llvm/packetize_uniform_default_noreduce2.ll    |  2 +-
 .../lit/llvm/packetize_uniform_default_reduce.ll   |  2 +-
 .../llvm/packetize_uniform_loops_conditional.ll    |  2 +-
 .../lit/llvm/packetize_uniform_loops_noreduce.ll   |  2 +-
 .../lit/llvm/packetize_uniform_loops_noreduce2.ll  |  2 +-
 .../lit/llvm/packetize_uniform_loops_reduce.ll     |  2 +-
 .../test/lit/llvm/packetize_uniform_noreduce.ll    |  2 +-
 .../test/lit/llvm/packetize_uniform_noreduce2.ll   |  2 +-
 .../vecz/test/lit/llvm/packetize_uniform_reduce.ll |  2 +-
 .../vecz/test/lit/llvm/partial_linearization0.ll   |  2 +-
 .../vecz/test/lit/llvm/partial_linearization1.ll   |  2 +-
 .../vecz/test/lit/llvm/partial_linearization10.ll  |  2 +-
 .../vecz/test/lit/llvm/partial_linearization11.ll  |  2 +-
 .../vecz/test/lit/llvm/partial_linearization12.ll  |  2 +-
 .../vecz/test/lit/llvm/partial_linearization13.ll  |  2 +-
 .../vecz/test/lit/llvm/partial_linearization14.ll  |  2 +-
 .../vecz/test/lit/llvm/partial_linearization15.ll  |  2 +-
 .../vecz/test/lit/llvm/partial_linearization16.ll  |  2 +-
 .../vecz/test/lit/llvm/partial_linearization17.ll  |  2 +-
 .../vecz/test/lit/llvm/partial_linearization18.ll  |  2 +-
 .../vecz/test/lit/llvm/partial_linearization19.ll  |  2 +-
 .../vecz/test/lit/llvm/partial_linearization2.ll   |  2 +-
 .../vecz/test/lit/llvm/partial_linearization20.ll  |  2 +-
 .../vecz/test/lit/llvm/partial_linearization21.ll  |  2 +-
 .../vecz/test/lit/llvm/partial_linearization22.ll  |  2 +-
 .../vecz/test/lit/llvm/partial_linearization23.ll  |  2 +-
 .../vecz/test/lit/llvm/partial_linearization3.ll   |  2 +-
 .../vecz/test/lit/llvm/partial_linearization4.ll   |  2 +-
 .../vecz/test/lit/llvm/partial_linearization5.ll   |  2 +-
 .../vecz/test/lit/llvm/partial_linearization6.ll   |  2 +-
 .../vecz/test/lit/llvm/partial_linearization7.ll   |  2 +-
 .../vecz/test/lit/llvm/partial_linearization8.ll   |  2 +-
 .../vecz/test/lit/llvm/partial_linearization9.ll   |  2 +-
 .../lit/llvm/partial_linearization_exit_masks.ll   |  2 +-
 .../vecz/test/lit/llvm/pass_pipeline.ll            |  6 +++---
 .../vecz/test/lit/llvm/pass_pipeline_printafter.ll |  2 +-
 .../vecz/test/lit/llvm/phi_interleaved.ll          |  2 +-
 .../vecz/test/lit/llvm/phi_node_debug_info.ll      |  2 +-
 .../vecz/test/lit/llvm/phi_scatter_gather.ll       |  2 +-
 .../vecz/test/lit/llvm/phi_scatter_gather_2.ll     |  2 +-
 .../vecz/test/lit/llvm/predicate_with_switch.ll    |  2 +-
 .../vecz/test/lit/llvm/preserve-fast-math.ll       |  2 +-
 .../vecz/test/lit/llvm/printf_float.ll             |  2 +-
 .../vecz/test/lit/llvm/regression_by_all.ll        |  2 +-
 .../vecz/test/lit/llvm/remove_intptr.ll            |  2 +-
 .../vecz/test/lit/llvm/remove_intptr_2.ll          |  2 +-
 .../vecz/test/lit/llvm/remove_intptr_phi.ll        |  2 +-
 .../vecz/test/lit/llvm/roscc_simplify.ll           |  2 +-
 .../llvm/scalar_load_store_in_varying_branch.ll    |  2 +-
 .../vecz/test/lit/llvm/scalar_splat.ll             |  2 +-
 ...lar_splat_after_load_store_in_varying_branch.ll |  2 +-
 .../lit/llvm/scalar_splat_after_varying_branch.ll  |  2 +-
 .../lit/llvm/scalar_splat_in_varying_branch.ll     |  2 +-
 .../vecz/test/lit/llvm/scalar_vector_user.ll       |  2 +-
 .../vecz/test/lit/llvm/scalarization_calls.ll      |  2 +-
 .../test/lit/llvm/scalarization_calls_uniform.ll   |  2 +-
 .../vecz/test/lit/llvm/scalarization_debug_info.ll |  2 +-
 .../test/lit/llvm/scalarization_instructions.ll    |  2 +-
 .../lit/llvm/scalarization_instructions_uniform.ll |  2 +-
 .../lit/llvm/scalarization_masked_load_store.ll    |  2 +-
 .../vecz/test/lit/llvm/scalarize-gather.ll         |  2 +-
 .../vecz/test/lit/llvm/scalarize-splat.ll          |  2 +-
 .../vecz/test/lit/llvm/scalarize_mixed_gep.ll      |  2 +-
 .../vecz/test/lit/llvm/scan_fact.ll                |  2 +-
 .../test/lit/llvm/secretly_scalar_load_store.ll    |  2 +-
 .../vecz/test/lit/llvm/select-no-crash.ll          |  2 +-
 .../vecz/test/lit/llvm/shuffled_load_1.ll          |  2 +-
 .../vecz/test/lit/llvm/shuffled_load_2.ll          |  2 +-
 .../vecz/test/lit/llvm/shuffled_load_3.ll          |  2 +-
 .../vecz/test/lit/llvm/shuffled_load_4.ll          |  2 +-
 .../vecz/test/lit/llvm/shuffled_load_5.ll          |  2 +-
 .../vecz/test/lit/llvm/shuffled_load_6.ll          |  2 +-
 .../vecz/test/lit/llvm/squash_extract_sext.ll      |  2 +-
 .../test/lit/llvm/squash_extract_sext_bigendian.ll |  2 +-
 .../vecz/test/lit/llvm/squash_extract_zext.ll      |  2 +-
 .../test/lit/llvm/squash_extract_zext_bigendian.ll |  2 +-
 .../vecz/test/lit/llvm/squash_float2_gather.ll     |  2 +-
 .../vecz/test/lit/llvm/stride_aligned.ll           |  2 +-
 .../test/lit/llvm/stride_aligned_scalarized.ll     |  2 +-
 .../vecz/test/lit/llvm/stride_misaligned.ll        |  2 +-
 .../test/lit/llvm/stride_misaligned_scalarized.ll  |  2 +-
 .../vecz/test/lit/llvm/struct_phi.ll               |  2 +-
 .../vecz/test/lit/llvm/struct_select.ll            |  2 +-
 .../vecz/test/lit/llvm/subgroup_builtins.ll        |  2 +-
 .../vecz/test/lit/llvm/subgroup_reductions.ll      |  2 +-
 ...eductions_spv_khr_uniform_group_instructions.ll |  2 +-
 .../vecz/test/lit/llvm/subgroup_scans.ll           |  4 ++--
 ...oup_scans_spv_khr_uniform_group_instructions.ll |  2 +-
 .../llvm/ternary_transform_different_strides.ll    |  2 +-
 .../lit/llvm/ternary_transform_divergent_gep.ll    |  2 +-
 .../lit/llvm/ternary_transform_divergent_source.ll |  2 +-
 .../test/lit/llvm/ternary_transform_negative.ll    |  2 +-
 .../test/lit/llvm/ternary_transform_positive.ll    |  2 +-
 .../ternary_transform_uniform_cond_diff_strides.ll |  2 +-
 .../llvm/ternary_transform_uniform_condition.ll    |  2 +-
 ...rnary_transform_uniform_condition_packetized.ll |  2 +-
 .../lit/llvm/ternary_transform_uniform_source.ll   |  2 +-
 .../lit/llvm/ternary_transform_uniform_sources.ll  |  2 +-
 .../lit/llvm/too_large_simdwidth_packetization.ll  |  2 +-
 .../lit/llvm/too_large_simdwidth_scalarization.ll  |  2 +-
 .../vecz/test/lit/llvm/undef_debug_info.ll         |  2 +-
 .../compiler_passes/vecz/test/lit/llvm/undef_ub.ll |  2 +-
 .../vecz/test/lit/llvm/uniform_address_base.ll     |  2 +-
 .../vecz/test/lit/llvm/uniform_address_index.ll    |  2 +-
 .../vecz/test/lit/llvm/uniform_loop.ll             |  2 +-
 .../test/lit/llvm/uniform_loop_contiguous_phi1.ll  |  2 +-
 .../test/lit/llvm/uniform_loop_contiguous_phi2.ll  |  2 +-
 .../test/lit/llvm/uniform_loop_contiguous_phi3.ll  |  2 +-
 .../test/lit/llvm/uniform_loop_contiguous_phi4.ll  |  2 +-
 .../vecz/test/lit/llvm/uniform_loop_metadata.ll    |  2 +-
 .../vecz/test/lit/llvm/uniform_reassociation1.ll   |  2 +-
 .../vecz/test/lit/llvm/uniform_reassociation2.ll   |  2 +-
 .../vecz/test/lit/llvm/uniform_reassociation3.ll   |  2 +-
 .../vecz/test/lit/llvm/unmangled_builtin_call.ll   |  2 +-
 .../vecz/test/lit/llvm/user_calls.ll               |  2 +-
 .../vecz/test/lit/llvm/varying_load1.ll            |  2 +-
 .../vecz/test/lit/llvm/varying_load2.ll            |  2 +-
 .../lit/llvm/vector_intrinsics_scalarization.ll    |  2 +-
 .../vecz/test/lit/llvm/vector_phi_uniform.ll       |  2 +-
 .../vecz/test/lit/llvm/vector_phi_varying.ll       |  2 +-
 .../vecz/test/lit/llvm/vector_printf.ll            |  2 +-
 .../vecz/test/lit/llvm/vector_printf32.ll          |  2 +-
 .../vecz/test/lit/llvm/vector_printf64.ll          |  2 +-
 .../vecz/test/lit/llvm/vector_printf_floats.ll     |  2 +-
 .../llvm/vector_printf_floats_no_double_support.ll |  2 +-
 .../vecz/test/lit/llvm/vecz_blend_div_loop.ll      |  2 +-
 .../vecz/test/lit/llvm/vecz_scalar_gather_load.ll  |  2 +-
 .../test/lit/llvm/vecz_scalar_interleaved_load.ll  |  2 +-
 .../vecz/test/lit/llvm/workitem_builtins.ll        |  2 +-
 422 files changed, 465 insertions(+), 465 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_1.ll
index 193613154c125..84f54c35b55f6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_1.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_1.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -vecz-target-triple=aarch64-unknown-unknown -S < %s | %filecheck %s
+; RUN: veczc -vecz-target-triple=aarch64-unknown-unknown -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_2.ll
index ae54f068548a0..9e400effc5b9d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_2.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -vecz-target-triple=aarch64-unknown-unknown -S < %s | %filecheck %s
+; RUN: veczc -vecz-target-triple=aarch64-unknown-unknown -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_3.ll
index a87c25a1fc8da..fe94dc4b90812 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_3.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_3.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -vecz-target-triple=aarch64-unknown-unknown -S < %s | %filecheck %s
+; RUN: veczc -vecz-target-triple=aarch64-unknown-unknown -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_4.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_4.ll
index ec0539d5851c0..c27a536614a30 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_4.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_4.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -vecz-target-triple=aarch64-unknown-unknown -S < %s | %filecheck %s
+; RUN: veczc -vecz-target-triple=aarch64-unknown-unknown -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_5.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_5.ll
index 793fdce9984a1..6f866824b68cf 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_5.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_5.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -vecz-target-triple=aarch64-unknown-unknown -S < %s | %filecheck %s
+; RUN: veczc -vecz-target-triple=aarch64-unknown-unknown -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_6.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_6.ll
index 4f20d63d9e6d7..a287d5c3d4f5c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_6.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_6.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -vecz-target-triple=aarch64-unknown-unknown -S < %s | %filecheck %s
+; RUN: veczc -vecz-target-triple=aarch64-unknown-unknown -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_killer.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_killer.ll
index 185d4863140ac..219d7f25dde1b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_killer.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_killer.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k boscc_killer -vecz-passes=vecz-loop-rotate,cfg-convert -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+; RUN: veczc -k boscc_killer -vecz-passes=vecz-loop-rotate,cfg-convert -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s
 
 ; ModuleID = 'Unknown buffer'
 source_filename = "Unknown buffer"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge.ll
index 64b4c3539143e..50ca9820ff272 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k boscc_merge -vecz-passes="function(instcombine),function(simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+; RUN: veczc -k boscc_merge -vecz-passes="function(instcombine),function(simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s
 
 ; ModuleID = 'Unknown buffer'
 source_filename = "Unknown buffer"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge2.ll
index e9f805c0cf2a5..9b40d771ffd59 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge2.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k boscc_merge2 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+; RUN: veczc -k boscc_merge2 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s
 
 ; ModuleID = 'Unknown buffer'
 source_filename = "kernel.opencl"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge3.ll
index 26f1ee02a1fe4..dda323e5be329 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge3.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge3.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k boscc_merge3 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+; RUN: veczc -k boscc_merge3 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s
 
 ; ModuleID = 'Unknown buffer'
 source_filename = "Unknown buffer"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/duplicate_preheader.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/duplicate_preheader.ll
index 153a1fe65b35d..d7d306bedb8eb 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/duplicate_preheader.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/duplicate_preheader.ll
@@ -18,7 +18,7 @@
 ; of the uniform and the predicated paths for a loop that has not been
 ; duplicated (because of the barrier in it).
 
-; RUN: %veczc -k duplicate_preheader -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+; RUN: veczc -k duplicate_preheader -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s
 
 ; ModuleID = 'Unknown buffer'
 source_filename = "Unknown buffer"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops1.ll
index ecfd84c823353..8d7fd3dfe41b3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops1.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops1.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k nested_loops1 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+; RUN: veczc -k nested_loops1 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s
 
 ; ModuleID = 'Unknown buffer'
 source_filename = "Unknown buffer"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops2.ll
index 97ca530d4c5ae..ab53afe07ee59 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops2.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k nested_loops2 -vecz-passes=vecz-loop-rotate,cfg-convert -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+; RUN: veczc -k nested_loops2 -vecz-passes=vecz-loop-rotate,cfg-convert -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 source_filename = "kernel.opencl"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops3.ll
index 95236e3d1aad1..dbcef7f094a7c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops3.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops3.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k nested_loops3 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+; RUN: veczc -k nested_loops3 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s
 
 ; ModuleID = 'Unknown buffer'
 source_filename = "kernel.opencl"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops4.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops4.ll
index 726f2619141d1..03690d55d1146 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops4.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops4.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k nested_loops4 -vecz-passes=vecz-loop-rotate,cfg-convert -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+; RUN: veczc -k nested_loops4 -vecz-passes=vecz-loop-rotate,cfg-convert -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s
 
 source_filename = "Unknown buffer"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops5.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops5.ll
index 3eb72a6a10b6e..443a7d4022268 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops5.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops5.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k nested_loops5 -vecz-passes=vecz-loop-rotate,cfg-convert -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+; RUN: veczc -k nested_loops5 -vecz-passes=vecz-loop-rotate,cfg-convert -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s
 
 ; ModuleID = 'Unknown buffer'
 source_filename = "Unknown buffer"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization0.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization0.ll
index ab362edb0f0db..7ed5152464a06 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization0.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization0.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k partial_linearization0 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+; RUN: veczc -k partial_linearization0 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s
 
 ; The CFG of the following kernel is:
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization1.ll
index a49b7b97e2cab..88fcf417937ea 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization1.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization1.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k partial_linearization1 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+; RUN: veczc -k partial_linearization1 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s
 
 ; The CFG of the following kernel is:
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization10.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization10.ll
index 683031b2e4574..5901638978c51 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization10.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization10.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k partial_linearization10 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+; RUN: veczc -k partial_linearization10 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s
 
 ; The CFG of the following kernel is:
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization11.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization11.ll
index 34a686ff4d7a6..0cd31969ada6c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization11.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization11.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k partial_linearization11 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+; RUN: veczc -k partial_linearization11 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s
 
 ; The CFG of the following kernel is:
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization12.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization12.ll
index 145b4a11627d0..042bb6922a543 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization12.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization12.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k partial_linearization12 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+; RUN: veczc -k partial_linearization12 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s
 
 ; The CFG of the following kernel is:
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization13.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization13.ll
index 67d9468d3bc33..16ca0d0c2dd34 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization13.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization13.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k partial_linearization13 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+; RUN: veczc -k partial_linearization13 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s
 
 ; The CFG of the following kernel is:
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization14.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization14.ll
index dcadd516f3ae2..cff65f417327d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization14.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization14.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k partial_linearization14 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+; RUN: veczc -k partial_linearization14 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s
 
 ; The CFG of the following kernel is:
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization15.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization15.ll
index 81580111e2e4a..103ccaccb3184 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization15.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization15.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k partial_linearization15 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+; RUN: veczc -k partial_linearization15 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s
 
 ; The CFG of the following kernel is:
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization16.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization16.ll
index 02be40e3ae5ae..3272393f06bab 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization16.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization16.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k partial_linearization16 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+; RUN: veczc -k partial_linearization16 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s
 
 ; The CFG of the following kernel is:
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization17.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization17.ll
index e3d20bdef7686..59b355d4eb5a5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization17.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization17.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k partial_linearization17 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+; RUN: veczc -k partial_linearization17 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s
 
 ; The CFG of the following kernel is:
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization18.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization18.ll
index 1a3f9de02d562..d8fc25d7dd7df 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization18.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization18.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k partial_linearization18 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+; RUN: veczc -k partial_linearization18 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s
 
 ; The CFG of the following kernel is:
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization19.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization19.ll
index ed8617e84ac95..b63ea25a6ecc7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization19.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization19.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k partial_linearization19 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+; RUN: veczc -k partial_linearization19 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s
 
 ; The CFG of the following kernel is:
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization2.ll
index 24f059dc49073..d58b2a1e23cfc 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization2.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k partial_linearization2 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+; RUN: veczc -k partial_linearization2 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s
 
 ; The CFG of the following kernel is:
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization20.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization20.ll
index 67926b955c666..f810588857ef9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization20.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization20.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k partial_linearization20 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+; RUN: veczc -k partial_linearization20 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s
 
 ; The CFG of the following kernel is:
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization21.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization21.ll
index 59e104a589feb..243f175fde0f4 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization21.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization21.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k partial_linearization21 -vecz-passes=vecz-loop-rotate,cfg-convert -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+; RUN: veczc -k partial_linearization21 -vecz-passes=vecz-loop-rotate,cfg-convert -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s
 
 ; The CFG of the following kernel is:
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization22.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization22.ll
index 6a05753fd764c..3b969e4c3d09a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization22.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization22.ll
@@ -15,7 +15,7 @@
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 ; REQUIRES: llvm-8.0-only
-; RUN: %veczc -k partial_linearization22 -vecz-passes="function(lowerswitch),vecz-loop-rotate,function(loop(indvars)),cfg-convert" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+; RUN: veczc -k partial_linearization22 -vecz-passes="function(lowerswitch),vecz-loop-rotate,function(loop(indvars)),cfg-convert" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s
 
 ; The CFG of the following kernel is:
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization3.ll
index 02b0cb286fddb..86e8e6396a40d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization3.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization3.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k partial_linearization3 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+; RUN: veczc -k partial_linearization3 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s
 
 ; The CFG of the following kernel is:
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization4.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization4.ll
index 10120d5f5bd10..7347e9af1aa18 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization4.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization4.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k partial_linearization4 -vecz-passes=cfg-convert,cleanup-divergence -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+; RUN: veczc -k partial_linearization4 -vecz-passes=cfg-convert,cleanup-divergence -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s
 
 ; The CFG of the following kernel is:
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization5.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization5.ll
index 81508c657eab6..248d033b44e75 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization5.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization5.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k partial_linearization5 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+; RUN: veczc -k partial_linearization5 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s
 
 ; The CFG of the following kernel is:
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization6.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization6.ll
index fa7033c12f26a..e8f201fbbaf0f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization6.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization6.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k partial_linearization6 -vecz-passes="function(simplifycfg),vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+; RUN: veczc -k partial_linearization6 -vecz-passes="function(simplifycfg),vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s
 
 ; The CFG of the following kernel is:
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization7.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization7.ll
index 257ec93334f3b..eeb047e812ffd 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization7.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization7.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k partial_linearization7 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+; RUN: veczc -k partial_linearization7 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s
 
 ; The CFG of the following kernel is:
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization8.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization8.ll
index 3932ca05554f6..984e5c44676c0 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization8.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization8.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k partial_linearization8 -vecz-passes=cfg-convert -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+; RUN: veczc -k partial_linearization8 -vecz-passes=cfg-convert -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s
 
 ; The CFG of the following kernel is:
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization9.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization9.ll
index 057e704c3cee9..9bd0771950dd3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization9.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization9.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k partial_linearization9 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | %filecheck %s
+; RUN: veczc -k partial_linearization9 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s
 
 ; The CFG of the following kernel is:
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/printf.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/printf.ll
index 621ddf10503e0..593f9a16e687d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/printf.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/printf.ll
@@ -16,7 +16,7 @@
 
 ; TODO(CA-1981): Using `not` in qemu does not work.
 ; REQUIRES: native
-; RUN: %not %veczc -k printf_add -vecz-simd-width=4 -S -vecz-passes=cfg-convert -vecz-choices=LinearizeBOSCC < %s 2>&1 | %filecheck %s
+; RUN: not veczc -k printf_add -vecz-simd-width=4 -S -vecz-passes=cfg-convert -vecz-choices=LinearizeBOSCC < %s 2>&1 | FileCheck %s
 
 ; This test just checks that we don't crash while converting the control flow.
 ; LinearizeBOSCC would leave behind an invalid function when control flow fails
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/scalable_linearization.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/scalable_linearization.ll
index ad4f75c240fad..7aa669f2f6b8b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/scalable_linearization.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/scalable_linearization.ll
@@ -15,7 +15,7 @@
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 ; Check that we don't crash when costing a scalable reduction
-; RUN: %veczc -vecz-scalable -vecz-passes="pre-linearize" -vecz-choices=LinearizeBOSCC -S < %s
+; RUN: veczc -vecz-scalable -vecz-passes="pre-linearize" -vecz-choices=LinearizeBOSCC -S < %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/basic_mem2reg.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/basic_mem2reg.ll
index b1fb99891c7f0..43af98e580e1a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/basic_mem2reg.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/basic_mem2reg.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -vecz-passes="function(mem2reg),vecz-mem2reg" -vecz-simd-width=4 -vecz-handle-declaration-only-calls -S < %s | %filecheck %s
+; RUN: veczc -vecz-passes="function(mem2reg),vecz-mem2reg" -vecz-simd-width=4 -vecz-handle-declaration-only-calls -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/basic_vecz_mem2reg.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/basic_vecz_mem2reg.ll
index eed794573ca46..9aa8796365e19 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/basic_vecz_mem2reg.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/basic_vecz_mem2reg.ll
@@ -16,7 +16,7 @@
 
 ; Note: *not* running LLVM's mem2reg pass as before LLVM 15 it crashes for the
 ; same reason we used to!
-; RUN: %veczc -vecz-passes=vecz-mem2reg -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -vecz-passes=vecz-mem2reg -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/builtin_inlining_mem.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/builtin_inlining_mem.ll
index fab368e4f87ef..ee1cd2ef038c7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/builtin_inlining_mem.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/builtin_inlining_mem.ll
@@ -15,7 +15,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -vecz-passes=builtin-inlining,verify -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -vecz-passes=builtin-inlining,verify -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/builtin_pointer_return.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/builtin_pointer_return.ll
index 3e6ffbe2dcdd9..dbaa44b4f8450 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/builtin_pointer_return.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/builtin_pointer_return.ll
@@ -17,7 +17,7 @@
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-; RUN: %veczc -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -vecz-simd-width=4 -S < %s | FileCheck %s
 
 declare spir_func i64 @_Z13get_global_idj(i32)
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/control_flow_conversion_ptrs.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/control_flow_conversion_ptrs.ll
index 0e14a786bfb7a..c70bf0fad5504 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/control_flow_conversion_ptrs.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/control_flow_conversion_ptrs.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -vecz-passes=cfg-convert,define-builtins -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -vecz-passes=cfg-convert,define-builtins -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/interleaved_load_ooo.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/interleaved_load_ooo.ll
index be956f806b372..66427a18fa693 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/interleaved_load_ooo.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/interleaved_load_ooo.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc --vecz-passes=interleave-combine-loads -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc --vecz-passes=interleave-combine-loads -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/load_add_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/load_add_store.ll
index d7d30c23d963d..d8c0981879967 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/load_add_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/load_add_store.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/masked_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/masked_store.ll
index 9334d060cd878..df32ce419d5e2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/masked_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/masked_store.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -vecz-passes=cfg-convert,define-builtins -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -vecz-passes=cfg-convert,define-builtins -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/remove_intptr.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/remove_intptr.ll
index 899c0a195ed21..4816edba71a2a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/remove_intptr.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/remove_intptr.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -vecz-passes=remove-int-ptr -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -vecz-passes=remove-int-ptr -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/ternary_transform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/ternary_transform.ll
index b0d03c44b89bf..82343981d5f40 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/ternary_transform.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/ternary_transform.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -vecz-passes=ternary-transform,verify -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -vecz-passes=ternary-transform,verify -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store.ll
index e2e11f816701d..478fb3511df33 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k f -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k f -vecz-simd-width=4 -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store_as_masked.ll
index a612d7601767f..de8b28c16a9f5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store_as_masked.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store_as_masked.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k f -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k f -vecz-simd-width=4 -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_uniform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_uniform.ll
index 3440be62739c1..c9e6e5c230429 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_uniform.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_uniform.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k vector_loop -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k vector_loop -vecz-simd-width=4 -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_varying.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_varying.ll
index a5583b224d23c..d849558b6b390 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_varying.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_varying.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k vector_loop -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k vector_loop -vecz-simd-width=4 -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/broadcast_vector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/broadcast_vector.ll
index 9b330a3408c9f..eebeeb800820d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/broadcast_vector.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/broadcast_vector.ll
@@ -16,7 +16,7 @@
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 ; REQUIRES: llvm-13+
-; RUN: %veczc -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans.ll
index 2b90bb3d118a8..8ccca62c6958b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans.ll
@@ -15,7 +15,7 @@
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 ; REQUIRES: llvm-13+
-; RUN: %veczc -k dummy -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -vecz-passes=define-builtins -S < %s | %filecheck %s
+; RUN: veczc -k dummy -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -vecz-passes=define-builtins -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "riscv64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans_vp.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans_vp.ll
index 4fa347583ea33..8df7fa9db66b8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans_vp.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans_vp.ll
@@ -15,7 +15,7 @@
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 ; REQUIRES: llvm-13+
-; RUN: %veczc -k dummy -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -vecz-passes=define-builtins -S < %s | %filecheck %s
+; RUN: veczc -k dummy -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -vecz-passes=define-builtins -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "riscv64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/extract_element.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/extract_element.ll
index f851e2e9f5c3e..27519421f4c68 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/extract_element.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/extract_element.ll
@@ -15,12 +15,12 @@
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 ; REQUIRES: llvm-13+
-; RUN: %veczc -k extract_element -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s --check-prefix=EE
-; RUN: %not %veczc -k extract_element_ilegal -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s
-; RUN: %veczc -k extract_element_uniform -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s --check-prefix=EE-UNI
-; RUN: %veczc -k extract_element_uniform_vec -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s --check-prefix=EE-UNI-VEC
-; RUN: %veczc -k extract_element_varying_indices -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s --check-prefix=EE-INDICES
-; RUN: %veczc -k extract_element_bool -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s --check-prefix=EE-BOOL
+; RUN: veczc -k extract_element -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=EE
+; RUN: not veczc -k extract_element_ilegal -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s
+; RUN: veczc -k extract_element_uniform -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=EE-UNI
+; RUN: veczc -k extract_element_uniform_vec -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=EE-UNI-VEC
+; RUN: veczc -k extract_element_varying_indices -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=EE-INDICES
+; RUN: veczc -k extract_element_bool -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=EE-BOOL
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/insert_element.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/insert_element.ll
index 3f91c699cdf23..7622066dd9751 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/insert_element.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/insert_element.ll
@@ -15,11 +15,11 @@
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 ; REQUIRES: llvm-13+
-; RUN: %veczc -k insert_element -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s --check-prefix=IE
-; RUN: %veczc -k insert_element_uniform -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s --check-prefix=IE-UNI
-; RUN: %veczc -k insert_element_varying_indices -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s --check-prefix=IE-INDICES
-; RUN: %not %veczc -k insert_element_illegal -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s
-; RUN: %veczc -k insert_element_bool -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s --check-prefix=IE-BOOL
+; RUN: veczc -k insert_element -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=IE
+; RUN: veczc -k insert_element_uniform -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=IE-UNI
+; RUN: veczc -k insert_element_varying_indices -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=IE-INDICES
+; RUN: not veczc -k insert_element_illegal -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s
+; RUN: veczc -k insert_element_bool -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=IE-BOOL
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle.ll
index edf3c93c53cfc..9ec8dc44e0f7a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle.ll
@@ -15,7 +15,7 @@
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 ; REQUIRES: llvm-13+
-; RUN: %veczc -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -vecz-passes=packetizer -S < %s | %filecheck %s
+; RUN: veczc -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -vecz-passes=packetizer -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_bool.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_bool.ll
index 9ccda9f36ecdc..13e5ee7a26fe9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_bool.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_bool.ll
@@ -15,7 +15,7 @@
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 ; REQUIRES: llvm-13+
-; RUN: %veczc -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -vecz-passes=packetizer -S < %s | %filecheck %s
+; RUN: veczc -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -vecz-passes=packetizer -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_concat.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_concat.ll
index afed931f2b1e6..b991ee826745a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_concat.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_concat.ll
@@ -15,7 +15,7 @@
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 ; REQUIRES: llvm-13+
-; RUN: %veczc -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -vecz-passes=packetizer -S < %s | %filecheck %s
+; RUN: veczc -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -vecz-passes=packetizer -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_narrow.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_narrow.ll
index cf1961a1d9208..88d113dba469e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_narrow.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_narrow.ll
@@ -15,7 +15,7 @@
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 ; REQUIRES: llvm-13+
-; RUN: %veczc -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -vecz-passes=packetizer -S < %s | %filecheck %s
+; RUN: veczc -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -vecz-passes=packetizer -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_wider.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_wider.ll
index 87145169d836e..66fdd408918a7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_wider.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_wider.ll
@@ -15,7 +15,7 @@
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 ; REQUIRES: llvm-13+
-; RUN: %veczc -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -vecz-passes=packetizer -S < %s | %filecheck %s
+; RUN: veczc -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -vecz-passes=packetizer -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/select_scalar_vector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/select_scalar_vector.ll
index 62eddff9c4af3..6989dc26cf54d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/select_scalar_vector.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/select_scalar_vector.ll
@@ -15,7 +15,7 @@
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 ; REQUIRES: llvm-13+
-; RUN: %veczc -k select_scalar_vector -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k select_scalar_vector -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_memops.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_memops.ll
index b6062349ff2cd..7586c9ef55d37 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_memops.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_memops.ll
@@ -15,12 +15,12 @@
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 ; REQUIRES: llvm-13+
-; RUN: %veczc -k store_element -vecz-target-triple="riscv64-unknown-unknown" -vecz-target-features=+f,+d,%vattr -vecz-simd-width=4 -vecz-scalable -vecz-choices=VectorPredication -S < %s | %filecheck %s --check-prefix CHECK-STORE-4
-; RUN: %veczc -k store_element -vecz-target-triple="riscv64-unknown-unknown" -vecz-target-features=+f,+d,%vattr -vecz-simd-width=8 -vecz-scalable -vecz-choices=VectorPredication -S < %s | %filecheck %s --check-prefix CHECK-STORE-8
-; RUN: %veczc -k store_element -vecz-target-triple="riscv64-unknown-unknown" -vecz-target-features=+f,+d,%vattr -vecz-simd-width=16 -vecz-scalable -vecz-choices=VectorPredication -S < %s | %filecheck %s --check-prefix CHECK-STORE-16
-; RUN: %veczc -k load_element -vecz-target-triple="riscv64-unknown-unknown" -vecz-target-features=+f,+d,%vattr -vecz-simd-width=4 -vecz-scalable -vecz-choices=VectorPredication -S < %s | %filecheck %s --check-prefix CHECK-LOAD-4
-; RUN: %veczc -k load_element -vecz-target-triple="riscv64-unknown-unknown" -vecz-target-features=+f,+d,%vattr -vecz-simd-width=8 -vecz-scalable -vecz-choices=VectorPredication -S < %s | %filecheck %s --check-prefix CHECK-LOAD-8
-; RUN: %veczc -k load_element -vecz-target-triple="riscv64-unknown-unknown" -vecz-target-features=+f,+d,%vattr -vecz-simd-width=16 -vecz-scalable -vecz-choices=VectorPredication -S < %s | %filecheck %s --check-prefix CHECK-LOAD-16
+; RUN: veczc -k store_element -vecz-target-triple="riscv64-unknown-unknown" -vecz-target-features=+f,+d,%vattr -vecz-simd-width=4 -vecz-scalable -vecz-choices=VectorPredication -S < %s | FileCheck %s --check-prefix CHECK-STORE-4
+; RUN: veczc -k store_element -vecz-target-triple="riscv64-unknown-unknown" -vecz-target-features=+f,+d,%vattr -vecz-simd-width=8 -vecz-scalable -vecz-choices=VectorPredication -S < %s | FileCheck %s --check-prefix CHECK-STORE-8
+; RUN: veczc -k store_element -vecz-target-triple="riscv64-unknown-unknown" -vecz-target-features=+f,+d,%vattr -vecz-simd-width=16 -vecz-scalable -vecz-choices=VectorPredication -S < %s | FileCheck %s --check-prefix CHECK-STORE-16
+; RUN: veczc -k load_element -vecz-target-triple="riscv64-unknown-unknown" -vecz-target-features=+f,+d,%vattr -vecz-simd-width=4 -vecz-scalable -vecz-choices=VectorPredication -S < %s | FileCheck %s --check-prefix CHECK-LOAD-4
+; RUN: veczc -k load_element -vecz-target-triple="riscv64-unknown-unknown" -vecz-target-features=+f,+d,%vattr -vecz-simd-width=8 -vecz-scalable -vecz-choices=VectorPredication -S < %s | FileCheck %s --check-prefix CHECK-LOAD-8
+; RUN: veczc -k load_element -vecz-target-triple="riscv64-unknown-unknown" -vecz-target-features=+f,+d,%vattr -vecz-simd-width=16 -vecz-scalable -vecz-choices=VectorPredication -S < %s | FileCheck %s --check-prefix CHECK-LOAD-16
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_vsetvli.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_vsetvli.ll
index e7de572a7d473..1df7553104e6f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_vsetvli.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_vsetvli.ll
@@ -16,7 +16,7 @@
 
 ; REQUIRES: llvm-14+
 
-; RUN: %veczc -vecz-target-triple="riscv64-unknown-unknown" -vecz-target-features=+v -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | %filecheck %s
+; RUN: veczc -vecz-target-triple="riscv64-unknown-unknown" -vecz-target-features=+v -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll
index bf89c3b4fb613..aeaf099a739b8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll
@@ -16,7 +16,7 @@
 
 ; NOTE: Assertions have been autogenerated by scripts/testing/update_veczc_checks.py
 ; REQUIRES: llvm-13+
-; RUN: %veczc -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/builtins.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/builtins.ll
index 8a5985633bfd3..ef7170da1c771 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/builtins.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/builtins.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k builtins -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k builtins -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/cast.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/cast.ll
index fdf25e13b438a..4d0dee32a4f31 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/cast.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/cast.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k cast -vecz-scalable -vecz-simd-width=8 -S < %s | %filecheck %s
+; RUN: veczc -k cast -vecz-scalable -vecz-simd-width=8 -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store.ll
index f12c2871782cd..db32d7c162f6e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store.ll
@@ -15,7 +15,7 @@
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 ; REQUIRES: llvm-13+
-; RUN: %veczc -k f -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k f -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store_as_masked.ll
index 7941d226f9982..25465d1dc70d7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store_as_masked.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store_as_masked.ll
@@ -15,7 +15,7 @@
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 ; REQUIRES: llvm-13+
-; RUN: %veczc -k f -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k f -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_load.ll
index 86d7f65cfce01..ea9851f0a8d01 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_load.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_load.ll
@@ -15,7 +15,7 @@
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 ; REQUIRES: llvm-13+
-; RUN: %veczc -k dont_mask_workitem_builtins -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k dont_mask_workitem_builtins -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_scatter_gather.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_scatter_gather.ll
index d3fa01fd2781d..13a5a35cc17af 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_scatter_gather.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_scatter_gather.ll
@@ -15,7 +15,7 @@
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 ; REQUIRES: llvm-13+
-; RUN: %veczc -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans.ll
index 5c40ea5966351..3cbb091d746e0 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans.ll
@@ -15,7 +15,7 @@
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 ; REQUIRES: llvm-13+
-; RUN: %veczc -k dummy -vecz-scalable -vecz-simd-width=4 -vecz-passes=define-builtins -S < %s | %filecheck %s
+; RUN: veczc -k dummy -vecz-scalable -vecz-simd-width=4 -vecz-passes=define-builtins -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans_vp.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans_vp.ll
index fae026f2ec3ac..8c3b185d0c5f1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans_vp.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans_vp.ll
@@ -15,7 +15,7 @@
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 ; REQUIRES: llvm-13+
-; RUN: %veczc -k dummy -vecz-scalable -vecz-simd-width=4 -vecz-passes=define-builtins -S < %s | %filecheck %s
+; RUN: veczc -k dummy -vecz-scalable -vecz-simd-width=4 -vecz-passes=define-builtins -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/extract_element.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/extract_element.ll
index 67124dcc8fb8b..e64f8ef91b4b8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/extract_element.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/extract_element.ll
@@ -15,11 +15,11 @@
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 ; REQUIRES: llvm-13+
-; RUN: %veczc -k extract_element -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s --check-prefix=EE
-; RUN: %veczc -k extract_element_uniform -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s --check-prefix=EE-UNI
-; RUN: %veczc -k extract_element_uniform_vec -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s --check-prefix=EE-UNI-VEC
-; RUN: %veczc -k extract_element_varying_indices -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s --check-prefix=EE-INDICES
-; RUN: %veczc -k extract_element_bool -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s --check-prefix=EE-BOOL
+; RUN: veczc -k extract_element -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=EE
+; RUN: veczc -k extract_element_uniform -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=EE-UNI
+; RUN: veczc -k extract_element_uniform_vec -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=EE-UNI-VEC
+; RUN: veczc -k extract_element_varying_indices -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=EE-INDICES
+; RUN: veczc -k extract_element_bool -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=EE-BOOL
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/fadd.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/fadd.ll
index 2ec5a36d63112..dd5ece32f6ba6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/fadd.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/fadd.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k fadd -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k fadd -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/fail_builtins.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/fail_builtins.ll
index 4d74540461ed8..adbc320138b63 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/fail_builtins.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/fail_builtins.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %not %veczc -k fail_builtins -vecz-scalable -vecz-simd-width=4 -S < %s 2>&1 | %filecheck %s
+; RUN: not veczc -k fail_builtins -vecz-scalable -vecz-simd-width=4 -S < %s 2>&1 | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/insert_element.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/insert_element.ll
index 56f259fde786a..cef7039e66069 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/insert_element.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/insert_element.ll
@@ -15,10 +15,10 @@
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 ; REQUIRES: llvm-13+
-; RUN: %veczc -k insert_element -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s --check-prefix=IE
-; RUN: %veczc -k insert_element_uniform -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s --check-prefix=IE-UNI
-; RUN: %veczc -k insert_element_varying_indices -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s --check-prefix=IE-INDICES
-; RUN: %veczc -k insert_element_bool -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s --check-prefix=IE-BOOL
+; RUN: veczc -k insert_element -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=IE
+; RUN: veczc -k insert_element_uniform -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=IE-UNI
+; RUN: veczc -k insert_element_varying_indices -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=IE-INDICES
+; RUN: veczc -k insert_element_bool -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=IE-BOOL
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/interleaved_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/interleaved_load.ll
index c9ebeb38f3ddc..e92b693849c67 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/interleaved_load.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/interleaved_load.ll
@@ -15,7 +15,7 @@
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 ; REQUIRES: llvm-13+
-; RUN: %veczc -k load_interleaved -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k load_interleaved -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/intrinsics.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/intrinsics.ll
index 837f87b3eaf40..b9adf794a036b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/intrinsics.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/intrinsics.ll
@@ -15,9 +15,9 @@
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 ; REQUIRES: llvm-13+
-; RUN: %veczc -k ctpop -vecz-scalable -vecz-simd-width=2 -S < %s | %filecheck %s --check-prefix CTPOP
-; RUN: %veczc -k ctlz -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s --check-prefix CTLZ
-; RUN: %veczc -k cttz -vecz-scalable -vecz-simd-width=8 -S < %s | %filecheck %s --check-prefix CTTZ
+; RUN: veczc -k ctpop -vecz-scalable -vecz-simd-width=2 -S < %s | FileCheck %s --check-prefix CTPOP
+; RUN: veczc -k ctlz -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix CTLZ
+; RUN: veczc -k cttz -vecz-scalable -vecz-simd-width=8 -S < %s | FileCheck %s --check-prefix CTTZ
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/lit.local.cfg b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/lit.local.cfg
index d04f11fb7d98a..335a35215ed45 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/lit.local.cfg
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/lit.local.cfg
@@ -15,4 +15,4 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 # Scalable vectorization is only supported on LLVM 12+
-config.unsupported = config.llvm_version_major < 12
+config.unsupported |= config.llvm_version_major < 12
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/load_add_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/load_add_store.ll
index dfbe70086294a..ccf6ee943da0c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/load_add_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/load_add_store.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k load_add_store -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k load_add_store -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/load_binops_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/load_binops_store.ll
index 403eda139fcc0..407fdb382db38 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/load_binops_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/load_binops_store.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k load_binops_store -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k load_binops_store -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/metadata.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/metadata.ll
index 8a6c8a1ccc3cd..2a2b243c4d2e0 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/metadata.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/metadata.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test -vecz-scalable -vecz-simd-width=8 -S < %s | %filecheck %s
+; RUN: veczc -k test -vecz-scalable -vecz-simd-width=8 -S < %s | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
 target triple = "spir-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/packetize_mask_varying.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/packetize_mask_varying.ll
index 48f01f28b3743..8fa603f2c229f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/packetize_mask_varying.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/packetize_mask_varying.ll
@@ -15,7 +15,7 @@
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 ; REQUIRES: llvm-13+
-; RUN: %veczc -k mask_varying -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k mask_varying -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/scalable_auto.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/scalable_auto.ll
index dc14d1322eb4b..cf9183419ed7c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/scalable_auto.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/scalable_auto.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k cast -vecz-scalable -S < %s | %filecheck %s
+; RUN: veczc -k cast -vecz-scalable -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select.ll
index bf64dd619998f..78467dd58f1bf 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select_scalar_vector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select_scalar_vector.ll
index 1f8fa02ef7860..c6e1be52f087e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select_scalar_vector.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select_scalar_vector.ll
@@ -15,7 +15,7 @@
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 ; REQUIRES: llvm-13+
-; RUN: %veczc -k select_scalar_vector -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k select_scalar_vector -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/shuffle.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/shuffle.ll
index 3c9b9a952d298..b3fdbd4fccc4d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/shuffle.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/shuffle.ll
@@ -15,7 +15,7 @@
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 ; REQUIRES: llvm-13+
-; RUN: %veczc -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll
index 93c216ef61c99..81977004bfa90 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll
@@ -15,7 +15,7 @@
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 ; REQUIRES: llvm-13+
-; RUN: %veczc -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans.ll
index 885253dd2a0e3..26eb222c2ec2c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -vecz-scalable -vecz-simd-width=4 -S -vecz-passes=packetizer < %s | %filecheck %s
+; RUN: veczc -vecz-scalable -vecz-simd-width=4 -S -vecz-passes=packetizer < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_spv_khr_uniform_group_instructions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_spv_khr_uniform_group_instructions.ll
index e9caa97be59e3..6b164057d1737 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_spv_khr_uniform_group_instructions.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_spv_khr_uniform_group_instructions.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -vecz-scalable -w 4 -S < %s | %filecheck %s
+; RUN: veczc -vecz-scalable -w 4 -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_spv_khr_uniform_group_instructions_vp.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_spv_khr_uniform_group_instructions_vp.ll
index 6c31bc229af68..90ef5f32c81ab 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_spv_khr_uniform_group_instructions_vp.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_spv_khr_uniform_group_instructions_vp.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -vecz-scalable -w 4 -S -vecz-choices=VectorPredication < %s | %filecheck %s
+; RUN: veczc -vecz-scalable -w 4 -S -vecz-choices=VectorPredication < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_vp.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_vp.ll
index f464a69f2596b..c2aaf24b9904b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_vp.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_vp.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication -S -vecz-passes=packetizer < %s | %filecheck %s
+; RUN: veczc -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication -S -vecz-passes=packetizer < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/vectors.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/vectors.ll
index 7196ed98fc9d2..ea02a0334c436 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/vectors.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/vectors.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k load_add_store -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k load_add_store -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/verification_fail_phi.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/verification_fail_phi.ll
index a3cfb59861b55..e258b0200e3ea 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/verification_fail_phi.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/verification_fail_phi.ll
@@ -16,7 +16,7 @@
 
 ; Check that we fail to vectorize but don't leave behind an invalid function.
 ; REQUIRES: llvm-13+
-; RUN: %not %veczc -k regression_phis -vecz-scalable -w 1 -vecz-passes=packetizer,verify -S < %s
+; RUN: not veczc -k regression_phis -vecz-scalable -w 1 -vecz-passes=packetizer,verify -S < %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/widen_vload.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/widen_vload.ll
index f2748abcc01e2..77ecb04b3605f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/widen_vload.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/widen_vload.ll
@@ -15,7 +15,7 @@
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 ; REQUIRES: llvm-13+
-; RUN: %veczc -k widen_vload -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k widen_vload -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/workitem_funcs.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/workitem_funcs.ll
index 3d7433948c454..f83882767ee4a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/workitem_funcs.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/workitem_funcs.ll
@@ -15,7 +15,7 @@
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 ; REQUIRES: llvm-13+
-; RUN: %veczc -k store_ult -vecz-scalable -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k store_ult -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
 
 ; Check that we can scalably-vectorize a call to get_global_id by using the
 ; stepvector intrinsic
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/boscc_reduction.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/boscc_reduction.ll
index 5d5438246b08c..ba9b82e6708e6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/boscc_reduction.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/boscc_reduction.ll
@@ -15,7 +15,7 @@
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 ; REQUIRES: llvm-13+
-; RUN: %veczc -k foo -vecz-scalable -vecz-simd-width=2 -vecz-choices=VectorPredication -S < %s | %filecheck %s
+; RUN: veczc -k foo -vecz-scalable -vecz-simd-width=2 -vecz-choices=VectorPredication -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/choice.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/choice.ll
index 731cf2729cd17..0e183941a1736 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/choice.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/choice.ll
@@ -16,7 +16,7 @@
 
 ; REQUIRES: llvm-13+
 ; Just check that the VectorPredication choice is valid
-; RUN: %veczc -k foo -vecz-simd-width=2 -vecz-choices=VectorPredication -S < %s
+; RUN: veczc -k foo -vecz-simd-width=2 -vecz-choices=VectorPredication -S < %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/compute_vector_length.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/compute_vector_length.ll
index 326fe8da639db..dc11d9069c739 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/compute_vector_length.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/compute_vector_length.ll
@@ -15,8 +15,8 @@
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 ; REQUIRES: llvm-13+
-; RUN: %veczc -k get_sub_group_size -vecz-simd-width=2 -vecz-choices=VectorPredication -S < %s | %filecheck %s --check-prefix CHECK-F2
-; RUN: %veczc -k get_sub_group_size -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | %filecheck %s --check-prefix CHECK-S4
+; RUN: veczc -k get_sub_group_size -vecz-simd-width=2 -vecz-choices=VectorPredication -S < %s | FileCheck %s --check-prefix CHECK-F2
+; RUN: veczc -k get_sub_group_size -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | FileCheck %s --check-prefix CHECK-S4
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_interleaved_load_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_interleaved_load_store.ll
index 480d0dccf4e7c..6b9bf54fe81b6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_interleaved_load_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_interleaved_load_store.ll
@@ -15,7 +15,7 @@
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 ; REQUIRES: llvm-13+
-; RUN: %veczc -k f -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication:FullScalarization -S < %s | %filecheck %s
+; RUN: veczc -k f -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication:FullScalarization -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_load_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_load_store.ll
index 4a91a8f7ea122..9fc81ca11e898 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_load_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_load_store.ll
@@ -15,7 +15,7 @@
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 ; REQUIRES: llvm-13+
-; RUN: %veczc -k dont_mask_workitem_builtins -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | %filecheck %s
+; RUN: veczc -k dont_mask_workitem_builtins -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_scatter_gather.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_scatter_gather.ll
index 77e9b47c8b866..7374dda89f394 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_scatter_gather.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_scatter_gather.ll
@@ -15,7 +15,7 @@
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 ; REQUIRES: llvm-13+
-; RUN: %veczc -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | %filecheck %s
+; RUN: veczc -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_subgroup_scans.ll
index 12924804cd560..66801a1ca8c60 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_subgroup_scans.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_subgroup_scans.ll
@@ -15,7 +15,7 @@
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 ; REQUIRES: llvm-13+
-; RUN: %veczc -k dummy -vecz-simd-width=4 -vecz-passes=define-builtins -S < %s | %filecheck %s
+; RUN: veczc -k dummy -vecz-simd-width=4 -vecz-passes=define-builtins -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/load_add_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/load_add_store.ll
index 5ec70e2f96116..5e6c428dafa99 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/load_add_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/load_add_store.ll
@@ -16,10 +16,10 @@
 
 ; REQUIRES: llvm-13+
 
-; RUN: %veczc -k load_add_store_i32 -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | %filecheck %s --check-prefix CHECK_4F
-; RUN: %veczc -k load_add_store_i32 -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | %filecheck %s --check-prefix CHECK_1S
-; RUN: %veczc -k load_add_store_v4i32 -vecz-simd-width=2 -vecz-choices=VectorPredication -S < %s | %filecheck %s --check-prefix CHECK_V4_2F
-; RUN: %veczc -k load_add_store_v4i32 -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | %filecheck %s --check-prefix CHECK_V4_1S
+; RUN: veczc -k load_add_store_i32 -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | FileCheck %s --check-prefix CHECK_4F
+; RUN: veczc -k load_add_store_i32 -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | FileCheck %s --check-prefix CHECK_1S
+; RUN: veczc -k load_add_store_v4i32 -vecz-simd-width=2 -vecz-choices=VectorPredication -S < %s | FileCheck %s --check-prefix CHECK_V4_2F
+; RUN: veczc -k load_add_store_v4i32 -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | FileCheck %s --check-prefix CHECK_V4_1S
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/packetize_mask_varying.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/packetize_mask_varying.ll
index 3a4b5e4087072..0a72f44e6e7df 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/packetize_mask_varying.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/packetize_mask_varying.ll
@@ -15,7 +15,7 @@
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 ; REQUIRES: llvm-13+
-; RUN: %veczc -k mask_varying -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | %filecheck %s
+; RUN: veczc -k mask_varying -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/scatter_gather.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/scatter_gather.ll
index 90a1811e3ae30..c8a928a0197f5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/scatter_gather.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/scatter_gather.ll
@@ -15,7 +15,7 @@
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 ; REQUIRES: llvm-13+
-; RUN: %veczc -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | %filecheck %s
+; RUN: veczc -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions.ll
index 0009fbbfcc929..2ce2482d46b8d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -w 4 -vecz-choices=VectorPredication -S < %s | %filecheck %s
+; RUN: veczc -w 4 -vecz-choices=VectorPredication -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions_spv_khr_uniform_group_instructions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions_spv_khr_uniform_group_instructions.ll
index b59835f9e4dc9..00d47f7a22d50 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions_spv_khr_uniform_group_instructions.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions_spv_khr_uniform_group_instructions.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -w 4 -vecz-choices=VectorPredication -S < %s | %filecheck %s
+; RUN: veczc -w 4 -vecz-choices=VectorPredication -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_scans.ll
index e8880f9d01e22..b160f5560d5c3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_scans.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_scans.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -w 4 -S -vecz-passes=packetizer -vecz-choices=VectorPredication < %s | %filecheck %s 
+; RUN: veczc -w 4 -S -vecz-passes=packetizer -vecz-choices=VectorPredication < %s | FileCheck %s 
 
 ; Tests the use of the VectorPredication choice. However, note that this option
 ; currently makes no difference on fixed length vectors.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_scans_spv_khr_uniform_group_instructions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_scans_spv_khr_uniform_group_instructions.ll
index 596e8aa572c59..c8e64421cd0db 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_scans_spv_khr_uniform_group_instructions.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_scans_spv_khr_uniform_group_instructions.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -w 4 -S -vecz-passes=packetizer -vecz-choices=VectorPredication < %s | %filecheck %s
+; RUN: veczc -w 4 -S -vecz-passes=packetizer -vecz-choices=VectorPredication < %s | FileCheck %s
 
 ; Tests the use of the VectorPredication choice. However, note that this option
 ; currently makes no difference on fixed length vectors.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/udiv.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/udiv.ll
index b35f912176ea9..2ea12a5723483 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/udiv.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/udiv.ll
@@ -16,7 +16,7 @@
 
 ; REQUIRES: llvm-13+
 
-; RUN: %veczc -k udiv -vecz-scalable -vecz-simd-width=2 -vecz-choices=VectorPredication -S < %s | %filecheck %s
+; RUN: veczc -k udiv -vecz-scalable -vecz-simd-width=2 -vecz-choices=VectorPredication -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load.ll
index 8e59750414057..ea35619a37f78 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k f -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+; RUN: veczc -k f -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load_as_masked.ll
index 8e59750414057..ea35619a37f78 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load_as_masked.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load_as_masked.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k f -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+; RUN: veczc -k f -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/delete_packetized_memop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/delete_packetized_memop.ll
index a1a36ef885ae6..2fb59dc4ae9c4 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/delete_packetized_memop.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/delete_packetized_memop.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k memop_loop_dep -vecz-simd-width=4 -vecz-passes=builtin-inlining,packetizer -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+; RUN: veczc -k memop_loop_dep -vecz-simd-width=4 -vecz-passes=builtin-inlining,packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-s128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_constant_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_constant_index.ll
index 79adbc8155ed1..9a923d3653b6f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_constant_index.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_constant_index.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k extract_constant_index -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+; RUN: veczc -k extract_constant_index -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index.ll
index d44c3842eebe4..7b956317f5ef1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k extract_runtime_index -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+; RUN: veczc -k extract_runtime_index -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index2.ll
index bb369a4298bbe..5c96c9e4e30d0 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index2.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k extract_runtime_index -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+; RUN: veczc -k extract_runtime_index -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index3.ll
index 6533fdda3e2fe..5b2cc9cf58570 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index3.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index3.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k extract_runtime_index -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+; RUN: veczc -k extract_runtime_index -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_constant_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_constant_index.ll
index 3c330b6f91053..587b067945e65 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_constant_index.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_constant_index.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k constant_index -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+; RUN: veczc -k constant_index -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_constant_index_constant_value.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_constant_index_constant_value.ll
index b64f0d1a8d1bf..4d9fdb9bdd646 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_constant_index_constant_value.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_constant_index_constant_value.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k constant_index -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+; RUN: veczc -k constant_index -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_runtime_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_runtime_index.ll
index c9c4cdab72f5a..10c1641c7bc90 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_runtime_index.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_runtime_index.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k runtime_index -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+; RUN: veczc -k runtime_index -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/interleaved_safety.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/interleaved_safety.ll
index 44255f5e0f923..23619cd2be9f5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/interleaved_safety.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/interleaved_safety.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k f -vecz-simd-width 4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+; RUN: veczc -k f -vecz-simd-width 4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isfiniteDv4_d.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isfiniteDv4_d.ll
index 10d7722650ccf..2293e5aa88f37 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isfiniteDv4_d.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isfiniteDv4_d.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_isfiniteDv4_d -vecz-simd-width=4 -vecz-passes=builtin-inlining,packetizer -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+; RUN: veczc -k test_isfiniteDv4_d -vecz-simd-width=4 -vecz-passes=builtin-inlining,packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isfiniteDv4_f.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isfiniteDv4_f.ll
index f48cfd82fba5a..f60573c4019b2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isfiniteDv4_f.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isfiniteDv4_f.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_isfiniteDv4_f -vecz-simd-width=4 -vecz-passes=builtin-inlining,packetizer -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+; RUN: veczc -k test_isfiniteDv4_f -vecz-simd-width=4 -vecz-passes=builtin-inlining,packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isinfDv4_d.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isinfDv4_d.ll
index cf57f8567354b..6a69d748bee4e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isinfDv4_d.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isinfDv4_d.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_isinfDv4_d -vecz-simd-width=4 -vecz-passes=builtin-inlining,packetizer -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+; RUN: veczc -k test_isinfDv4_d -vecz-simd-width=4 -vecz-passes=builtin-inlining,packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isinfDv4_f.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isinfDv4_f.ll
index 32c46f96ce1fb..3bf5b71d4f23c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isinfDv4_f.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isinfDv4_f.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_isinfDv4_f -vecz-simd-width=4 -vecz-passes=builtin-inlining,packetizer -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+; RUN: veczc -k test_isinfDv4_f -vecz-simd-width=4 -vecz-passes=builtin-inlining,packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnanDv4_d.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnanDv4_d.ll
index 51f68d42f68de..5fd3ea5e6069d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnanDv4_d.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnanDv4_d.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_isnanDv4_d -vecz-simd-width=4 -vecz-passes=builtin-inlining,packetizer -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+; RUN: veczc -k test_isnanDv4_d -vecz-simd-width=4 -vecz-passes=builtin-inlining,packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnanDv4_f.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnanDv4_f.ll
index f39158268fcb9..60572ff89c439 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnanDv4_f.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnanDv4_f.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_isnanDv4_f -vecz-simd-width=4 -vecz-passes=builtin-inlining,packetizer -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+; RUN: veczc -k test_isnanDv4_f -vecz-simd-width=4 -vecz-passes=builtin-inlining,packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnormalDv4_d.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnormalDv4_d.ll
index 95dbae10c76b6..fd73eb1aafc36 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnormalDv4_d.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnormalDv4_d.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_isnormalDv4_d -vecz-simd-width=4 -vecz-passes=builtin-inlining,packetizer -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+; RUN: veczc -k test_isnormalDv4_d -vecz-simd-width=4 -vecz-passes=builtin-inlining,packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnormalDv4_f.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnormalDv4_f.ll
index 76fedc4097831..f5e7253d563c8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnormalDv4_f.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnormalDv4_f.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_isnormalDv4_f -vecz-simd-width=4 -vecz-passes=builtin-inlining,packetizer -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+; RUN: veczc -k test_isnormalDv4_f -vecz-simd-width=4 -vecz-passes=builtin-inlining,packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/scalar_vector_user.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/scalar_vector_user.ll
index 8374890eb5309..00a47e232ee35 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/scalar_vector_user.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/scalar_vector_user.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k scalar_vector_user -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+; RUN: veczc -k scalar_vector_user -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s
 
 ; ModuleID = 'Unknown buffer'
 source_filename = "Unknown buffer"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/vector_copy.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/vector_copy.ll
index 8ee50ab195be2..94771a1202ac7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/vector_copy.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/vector_copy.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k vector_copy -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+; RUN: veczc -k vector_copy -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/vector_phi_varying.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/vector_phi_varying.ll
index 4a51971786778..8ca3ced324a1f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/vector_phi_varying.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/vector_phi_varying.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k vector_loop -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+; RUN: veczc -k vector_loop -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_abs.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_abs.ll
index a361649b5f151..7643a5f7e9edf 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_abs.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_abs.ll
@@ -15,7 +15,7 @@
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 ; REQUIRES: llvm-12+
-; RUN: %veczc -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_binops.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_binops.ll
index 54770ac3795e8..32ee16cfe8ee4 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_binops.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_binops.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k widen_binops -vecz-passes=packetizer -vecz-simd-width=8 -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+; RUN: veczc -k widen_binops -vecz-passes=packetizer -vecz-simd-width=8 -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_copysign.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_copysign.ll
index c6bad1ac63687..5ad8bcc51079d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_copysign.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_copysign.ll
@@ -15,7 +15,7 @@
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 ; REQUIRES: llvm-12+
-; RUN: %veczc -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fma.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fma.ll
index 174696e1a6d8f..401afa8897df2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fma.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fma.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_calls -vecz-passes=packetizer -vecz-simd-width=8 -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+; RUN: veczc -k test_calls -vecz-passes=packetizer -vecz-simd-width=8 -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmin_vector_scalar.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmin_vector_scalar.ll
index 956e9855caa8e..1f301325cabcc 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmin_vector_scalar.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmin_vector_scalar.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k fmin_vector_scalar -vecz-simd-width=4 -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+; RUN: veczc -k fmin_vector_scalar -vecz-simd-width=4 -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd.ll
index f154caf9e51b6..5eb62b4e49493 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_calls -vecz-passes=packetizer -vecz-simd-width=8 -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+; RUN: veczc -k test_calls -vecz-passes=packetizer -vecz-simd-width=8 -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd2.ll
index 4f70e6f57350a..50a3a326b34a7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd2.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_calls -vecz-passes=packetizer -vecz-simd-width=8 -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+; RUN: veczc -k test_calls -vecz-passes=packetizer -vecz-simd-width=8 -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd_phi.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd_phi.ll
index b45474767d66e..079cf4f47796c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd_phi.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd_phi.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_calls -vecz-passes=packetizer -vecz-simd-width=8 -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+; RUN: veczc -k test_calls -vecz-passes=packetizer -vecz-simd-width=8 -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshl.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshl.ll
index 559517fb8a8c1..67a5e45bb1814 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshl.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshl.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_calls -vecz-passes=packetizer -vecz-simd-width=16 -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+; RUN: veczc -k test_calls -vecz-passes=packetizer -vecz-simd-width=16 -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshr.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshr.ll
index 334b68988fdd2..dd6fcf52f7405 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshr.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshr.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_calls -vecz-passes=packetizer -vecz-simd-width=16 -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+; RUN: veczc -k test_calls -vecz-passes=packetizer -vecz-simd-width=16 -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_shufflevector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_shufflevector.ll
index 950704e8c4b7e..4722ba4d10994 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_shufflevector.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_shufflevector.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k widen_shufflevector -vecz-simd-width=2 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+; RUN: veczc -k widen_shufflevector -vecz-simd-width=2 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_sqrt.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_sqrt.ll
index 7bd56cb87287c..17061df108ef2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_sqrt.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_sqrt.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_sqrt -vecz-simd-width=4 -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+; RUN: veczc -k test_sqrt -vecz-simd-width=4 -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/alloca_alias.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/alloca_alias.ll
index 3153b90afd567..ef0beb022572d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/alloca_alias.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/alloca_alias.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -S < %s | %filecheck %s
+; RUN: veczc -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/arm_neon_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/arm_neon_store.ll
index 085479c8e79c8..493e3398e0304 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/arm_neon_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/arm_neon_store.ll
@@ -16,7 +16,7 @@
 
 ; REQUIRES: arm
 
-; RUN: %veczc -k short3_char3_codegen -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k short3_char3_codegen -vecz-simd-width=4 -S < %s | FileCheck %s
 
 ; ModuleID = 'Unknown buffer'
 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/async_workgroup_copy_uniform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/async_workgroup_copy_uniform.ll
index 84ab91fd86cca..4b851dbe19ded 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/async_workgroup_copy_uniform.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/async_workgroup_copy_uniform.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k test -vecz-simd-width=4 -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomic_cmpxchg.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomic_cmpxchg.ll
index 1dc3694249d2f..3796a3f03de37 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomic_cmpxchg.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomic_cmpxchg.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k atomic_rmw -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k atomic_rmw -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomicrmw.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomicrmw.ll
index bb8d22596f31e..ad101c818863d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomicrmw.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomicrmw.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k atomic_rmw -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k atomic_rmw -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomicrmw_uniform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomicrmw_uniform.ll
index 260f96dfc6897..9becb697dea93 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomicrmw_uniform.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomicrmw_uniform.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k atomic_rmw -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k atomic_rmw -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/basic_mem2reg.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/basic_mem2reg.ll
index a8377695cc81c..3f6659e76aeeb 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/basic_mem2reg.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/basic_mem2reg.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test -vecz-passes="function(mem2reg),vecz-mem2reg" -vecz-simd-width=4 -vecz-handle-declaration-only-calls -S < %s | %filecheck %s
+; RUN: veczc -k test -vecz-passes="function(mem2reg),vecz-mem2reg" -vecz-simd-width=4 -vecz-handle-declaration-only-calls -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/bitcast_function.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/bitcast_function.ll
index 3657705a49308..3578ca32c4407 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/bitcast_function.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/bitcast_function.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test -vecz-simd-width=4 -vecz-passes=cfg-convert,packetizer -S < %s | %filecheck %s
+; RUN: veczc -k test -vecz-simd-width=4 -vecz-passes=cfg-convert,packetizer -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/branch_splitting_and.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/branch_splitting_and.ll
index 53dfd1dda9e79..2681f5cab1372 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/branch_splitting_and.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/branch_splitting_and.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k split_branch -vecz-simd-width=4 -vecz-passes=uniform-reassoc -S < %s | %filecheck %s
+; RUN: veczc -k split_branch -vecz-simd-width=4 -vecz-passes=uniform-reassoc -S < %s | FileCheck %s
 
 ; ModuleID = 'Unknown buffer'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/branch_splitting_or.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/branch_splitting_or.ll
index e94aedc791c82..a6999a1893b8f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/branch_splitting_or.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/branch_splitting_or.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k split_branch -vecz-simd-width=4 -vecz-passes=uniform-reassoc -S < %s | %filecheck %s
+; RUN: veczc -k split_branch -vecz-simd-width=4 -vecz-passes=uniform-reassoc -S < %s | FileCheck %s
 
 ; ModuleID = 'Unknown buffer'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_addsat.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_addsat.ll
index dd5957ff3ef58..0b7dd50641c82 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_addsat.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_addsat.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -vecz-passes=builtin-inlining -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -vecz-passes=builtin-inlining -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_clamp.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_clamp.ll
index 87f798a6b4905..f83fc0e1015f5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_clamp.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_clamp.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k clampkernel -vecz-passes=builtin-inlining -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k clampkernel -vecz-passes=builtin-inlining -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_fmax.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_fmax.ll
index 2ca6820a71665..36e59f1a7601a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_fmax.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_fmax.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -vecz-passes=builtin-inlining -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -vecz-passes=builtin-inlining -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_fmin.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_fmin.ll
index 4227ce3161b68..9d802215b4f9e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_fmin.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_fmin.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -vecz-passes=builtin-inlining -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -vecz-passes=builtin-inlining -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_mem.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_mem.ll
index 1aaec11f02648..23099611e2e78 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_mem.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_mem.ll
@@ -15,7 +15,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -vecz-passes=builtin-inlining,verify -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -vecz-passes=builtin-inlining,verify -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_negative.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_negative.ll
index b3e369f899603..db2d59b33f807 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_negative.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_negative.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_rhadd -vecz-passes=builtin-inlining -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k test_rhadd -vecz-passes=builtin-inlining -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_positive.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_positive.ll
index 67c2f054c1c77..d05e657280822 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_positive.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_positive.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test -vecz-passes=builtin-inlining -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k test -vecz-passes=builtin-inlining -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_pointer_return.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_pointer_return.ll
index 30f9c4d6fa472..1c5e4b8f58fcc 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_pointer_return.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_pointer_return.ll
@@ -17,7 +17,7 @@
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-; RUN: %veczc -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -vecz-simd-width=4 -S < %s | FileCheck %s
 
 declare spir_func i64 @_Z13get_global_idj(i32)
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_cantduplicate.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_cantduplicate.ll
index bad5df9fa5bf5..e654d64689809 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_cantduplicate.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_cantduplicate.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k cantduplicate -vecz-passes=packetizer -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k cantduplicate -vecz-passes=packetizer -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_cantinline.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_cantinline.ll
index 1264348d42e68..3bce3301c22c8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_cantinline.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_cantinline.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k cantinline -vecz-passes=packetizer -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k cantinline -vecz-passes=packetizer -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_optnone.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_optnone.ll
index 6ed1ae339e101..034834a50417a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_optnone.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_optnone.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k optnone -vecz-passes=packetizer -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k optnone -vecz-passes=packetizer -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_user_undefined.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_user_undefined.ll
index a359ccaa5c093..7c98863706be5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_user_undefined.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_user_undefined.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k user_undefined -vecz-passes=packetizer -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k user_undefined -vecz-passes=packetizer -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_success_builtin.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_success_builtin.ll
index 3298c21625852..1d20b9e604b6d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_success_builtin.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_success_builtin.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k builtin -vecz-passes=packetizer -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k builtin -vecz-passes=packetizer -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_success_instrinsic.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_success_instrinsic.ll
index 464752bd72179..95c291f4423ae 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_success_instrinsic.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_success_instrinsic.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k instrinsic -vecz-passes=packetizer -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k instrinsic -vecz-passes=packetizer -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_success_user_defined.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_success_user_defined.ll
index 5ee1bfd24c9be..2f1f5d044e9f5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_success_user_defined.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_success_user_defined.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k user_defined -vecz-passes=packetizer -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k user_defined -vecz-passes=packetizer -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address.ll
index bd8280cb08250..eff7950c29474 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test -w 4 -S < %s | %filecheck %s
+; RUN: veczc -k test -w 4 -S < %s | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address_with_uniform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address_with_uniform.ll
index 418514047886c..74e35d42163ba 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address_with_uniform.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address_with_uniform.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test -w 4 -S < %s | %filecheck %s
+; RUN: veczc -k test -w 4 -S < %s | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
 target triple = "spir-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/contiguous_allocas.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/contiguous_allocas.ll
index d0ebb3630d3e4..e58f85ead47e3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/contiguous_allocas.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/contiguous_allocas.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test -vecz-simd-width=4 -vecz-auto -vecz-choices=FullScalarization -S < %s | %filecheck %s
+; RUN: veczc -k test -vecz-simd-width=4 -vecz-auto -vecz-choices=FullScalarization -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_nested_loops.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_nested_loops.ll
index e53f3088929b6..2ec80677342e6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_nested_loops.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_nested_loops.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_nested_loops -vecz-passes=cfg-convert -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k test_nested_loops -vecz-passes=cfg-convert -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_order_y.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_order_y.ll
index 863cb33395418..069b6969e389a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_order_y.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_order_y.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_nested_loops -vecz-passes=cfg-convert -vecz-simd-width=4 -d 1 -S < %s | %filecheck %s
+; RUN: veczc -k test_nested_loops -vecz-passes=cfg-convert -vecz-simd-width=4 -d 1 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_order_z.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_order_z.ll
index 2545484f95565..5f4383d50adfe 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_order_z.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_order_z.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_nested_loops -vecz-passes=cfg-convert -vecz-simd-width=4 -d 2 -S < %s | %filecheck %s
+; RUN: veczc -k test_nested_loops -vecz-passes=cfg-convert -vecz-simd-width=4 -d 2 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_ptrs.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_ptrs.ll
index bc01f0df0f2d7..cb505d4e4a5ab 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_ptrs.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_ptrs.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -vecz-passes=cfg-convert,define-builtins -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -vecz-passes=cfg-convert,define-builtins -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_uniform_if.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_uniform_if.ll
index 02432d3e7d090..bf1b541acb2da 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_uniform_if.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_uniform_if.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_uniform_if -vecz-passes=cfg-convert -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k test_uniform_if -vecz-passes=cfg-convert -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_uniform_loop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_uniform_loop.ll
index 88ac83ac97480..a2b5802814f92 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_uniform_loop.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_uniform_loop.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_uniform_loop -vecz-passes=cfg-convert -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k test_uniform_loop -vecz-passes=cfg-convert -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_varying_if.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_varying_if.ll
index 566048e4009d9..e296fda5c03f6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_varying_if.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_varying_if.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_varying_if -vecz-passes=cfg-convert -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k test_varying_if -vecz-passes=cfg-convert -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_varying_loop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_varying_loop.ll
index f1784ab18d68b..e9a2c8d3abb86 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_varying_loop.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_varying_loop.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_varying_loop -vecz-passes=cfg-convert -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k test_varying_loop -vecz-passes=cfg-convert -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert3.ll
index e413827450cf5..f293ea4a29469 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert3.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert3.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k convert3 -vecz-simd-width=2 -vecz-choices=FullScalarization -S < %s | %filecheck %s
+; RUN: veczc -k convert3 -vecz-simd-width=2 -vecz-choices=FullScalarization -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 source_filename = "kernel.opencl"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert4.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert4.ll
index 8458f7e6a7e83..486b1721f7aff 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert4.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert4.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k convert4 -vecz-simd-width=2 -vecz-choices=FullScalarization -S < %s | %filecheck %s
+; RUN: veczc -k convert4 -vecz-simd-width=2 -vecz-choices=FullScalarization -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 source_filename = "kernel.opencl"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert_contiguity.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert_contiguity.ll
index 2315cb6d2d9b8..d48d1da9a6f01 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert_contiguity.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert_contiguity.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k convert_contiguity -w 4 -S < %s | %filecheck %s
+; RUN: veczc -k convert_contiguity -w 4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load.ll
index b2132ed438cbc..f7297e315c1f9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k test -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load_as_masked.ll
index ca88290e09a4e..9f0002c0e2aec 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load_as_masked.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load_as_masked.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k test -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load.ll
index a82f222e1ef07..baf957450de28 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | %filecheck %s
+; RUN: veczc -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load_as_masked.ll
index 6f61e5f2b2d76..654baf92be9c3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load_as_masked.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load_as_masked.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k f -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | %filecheck %s
+; RUN: veczc -k f -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store.ll
index b637c20eba7c5..9e2ff23dd69ab 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store_as_masked.ll
index 6501a01180f61..75acd7aec4198 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store_as_masked.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store_as_masked.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k f -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k f -vecz-simd-width=4 -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_internal_builtins.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_internal_builtins.ll
index c5e622f735c71..09058bc2f13f7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_internal_builtins.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_internal_builtins.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k dummy -vecz-simd-width=4 -vecz-passes=define-builtins -S < %s | %filecheck %s
+; RUN: veczc -k dummy -vecz-simd-width=4 -vecz-passes=define-builtins -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_gather_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_gather_load.ll
index cac0e72fb210b..d01b84639a4d0 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_gather_load.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_gather_load.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k masked_gather -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k masked_gather -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_load.ll
index abb16c603e52a..b4b1892965ab1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_load.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_load.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k dont_mask_workitem_builtins -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k dont_mask_workitem_builtins -vecz-simd-width=4 -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_scatter_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_scatter_store.ll
index 8d5834321522c..54087d88be1bf 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_scatter_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_scatter_store.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k masked_scatter -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k masked_scatter -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_store.ll
index db4e8461f5aa3..33164117b644c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_store.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k dont_mask_workitem_builtins -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k dont_mask_workitem_builtins -vecz-simd-width=4 -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store.ll
index 3404f4dc87b07..312b02601413a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k test -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store_as_masked.ll
index a6ddefb418556..b2fd7d0beb74b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store_as_masked.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store_as_masked.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k test -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_subgroup_scans.ll
index 0c44c62e34d6d..ca90efb1ecce7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_subgroup_scans.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_subgroup_scans.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k dummy -vecz-simd-width=4 -vecz-passes=define-builtins -S < %s | %filecheck %s
+; RUN: veczc -k dummy -vecz-simd-width=4 -vecz-passes=define-builtins -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/delete_packetized_memop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/delete_packetized_memop.ll
index 3089422ea70e6..1589a02710990 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/delete_packetized_memop.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/delete_packetized_memop.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k memop_loop_dep -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | %filecheck %s
+; RUN: veczc -k memop_loop_dep -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-s128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_loop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_loop.ll
index f24dd98bd7f5e..a8c7d4edf8ee8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_loop.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_loop.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test -w 4 -S < %s | %filecheck %s
+; RUN: veczc -k test -w 4 -S < %s | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
 target triple = "spir-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_nested_loop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_nested_loop.ll
index 59ad68fefd9ab..486d6c200857e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_nested_loop.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_nested_loop.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test -w 4 -S < %s | %filecheck %s
+; RUN: veczc -k test -w 4 -S < %s | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
 target triple = "spir-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/early-cse-mul-swap.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/early-cse-mul-swap.ll
index e3b9583af073f..fa50eb13120b4 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/early-cse-mul-swap.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/early-cse-mul-swap.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k multiple_dimensions_0 -vecz-simd-width 4 -S < %s | %filecheck %s
+; RUN: veczc -k multiple_dimensions_0 -vecz-simd-width 4 -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 source_filename = "kernel.opencl"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_memintrinsics.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_memintrinsics.ll
index 5ea43e4582a75..7087195ebce69 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_memintrinsics.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_memintrinsics.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k entry -vecz-passes="builtin-inlining,function(instcombine,early-cse),cfg-convert,packetizer" -S < %s | %filecheck %s
+; RUN: veczc -k entry -vecz-passes="builtin-inlining,function(instcombine,early-cse),cfg-convert,packetizer" -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_no_unaligned_memintrinsics.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_no_unaligned_memintrinsics.ll
index cb617c0f7517b..4cb13295f14c6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_no_unaligned_memintrinsics.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_no_unaligned_memintrinsics.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k entry -vecz-passes=cfg-convert,packetizer -S < %s | %filecheck %s
+; RUN: veczc -k entry -vecz-passes=cfg-convert,packetizer -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/expect_assume.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/expect_assume.ll
index 4d0117479d970..1f9b36c4c6d2d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/expect_assume.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/expect_assume.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/extractelement_constant_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/extractelement_constant_index.ll
index ca5e39f7c2efa..f1580d1663091 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/extractelement_constant_index.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/extractelement_constant_index.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k extract_constant_index -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | %filecheck %s
+; RUN: veczc -k extract_constant_index -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/extractelement_runtime_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/extractelement_runtime_index.ll
index fdd570bc0f47a..5e2867c7a3796 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/extractelement_runtime_index.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/extractelement_runtime_index.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k extract_runtime_index -vecz-simd-width=4 -vecz-passes=scalarize -vecz-choices=FullScalarization -S < %s | %filecheck %s
+; RUN: veczc -k extract_runtime_index -vecz-simd-width=4 -vecz-passes=scalarize -vecz-choices=FullScalarization -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_duplication.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_duplication.ll
index 02d2fa8c3c4fa..3d41ea9e08c45 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_duplication.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_duplication.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -S < %s | %filecheck %s
+; RUN: veczc -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_elim_opaque.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_elim_opaque.ll
index aef3ff002cff3..003370383d2be 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_elim_opaque.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_elim_opaque.ll
@@ -15,7 +15,7 @@
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 ; REQUIRES: llvm-15+
-; RUN: %veczc -k test -vecz-simd-width=4 -vecz-passes=gep-elim -S < %s | %filecheck %s
+; RUN: veczc -k test -vecz-simd-width=4 -vecz-passes=gep-elim -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/indirect_call.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/indirect_call.ll
index d20bb3e42429b..2b2e85e7c4514 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/indirect_call.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/indirect_call.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test -S < %s | %filecheck %s
+; RUN: veczc -k test -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/inlined_function_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/inlined_function_debug_info.ll
index 4dee3cd867207..d0635ba750e1b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/inlined_function_debug_info.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/inlined_function_debug_info.ll
@@ -16,7 +16,7 @@
 
 ; Check VECZ debug info for inlined DILocation metadata nodes
 
-; RUN: %veczc -k functions_one -vecz-passes=builtin-inlining -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k functions_one -vecz-passes=builtin-inlining -vecz-simd-width=4 -S < %s | FileCheck %s
 
 ; ModuleID = '/tmp/inlined_function.ll'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insert_element_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insert_element_debug_info.ll
index 5dfe0f4e551c4..d1b9f68d77533 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insert_element_debug_info.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insert_element_debug_info.ll
@@ -18,7 +18,7 @@
 ; intrinsics across all lanes even when scalarization masks disable some
 ; of the lanes. This occurs when we scalarize insertelement instructions.
 
-; RUN: %veczc -k unaligned_load -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | %filecheck %s
+; RUN: veczc -k unaligned_load -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insertelement_constant_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insertelement_constant_index.ll
index 827a2debad03e..72678bd74e68e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insertelement_constant_index.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insertelement_constant_index.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k constant_index -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | %filecheck %s
+; RUN: veczc -k constant_index -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insertelement_runtime_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insertelement_runtime_index.ll
index ca1b9b4ce1ad4..dbac3ca4c45af 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insertelement_runtime_index.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insertelement_runtime_index.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k runtime_index -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | %filecheck %s
+; RUN: veczc -k runtime_index -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/instantiate_constants.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/instantiate_constants.ll
index 3e7c52d758602..748862493eb56 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/instantiate_constants.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/instantiate_constants.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test -vecz-simd-width 4 -S < %s | %filecheck %s
+; RUN: veczc -k test -vecz-simd-width 4 -S < %s | FileCheck %s
 
 ; ModuleID = 'Unknown buffer'
 source_filename = "kernel.opencl"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_defuse_instantiated.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_defuse_instantiated.ll
index bd2595f5c40dd..09be8955197d9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_defuse_instantiated.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_defuse_instantiated.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k printf_kernel -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k printf_kernel -vecz-simd-width=4 -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_load16.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_load16.ll
index 637a8ee2a7b47..b691315757940 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_load16.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_load16.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k load16 -vecz-simd-width 4 -S < %s | %filecheck %s
+; RUN: veczc -k load16 -vecz-simd-width 4 -S < %s | FileCheck %s
 
 ; ModuleID = 'Unknown buffer'
 source_filename = "kernel.opencl"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_load_ooo.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_load_ooo.ll
index 602d4ebfd60b0..bc9c052e38ae8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_load_ooo.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_load_ooo.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc --vecz-passes=interleave-combine-loads -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc --vecz-passes=interleave-combine-loads -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_safety.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_safety.ll
index ae8d6eb8e8ed8..a9c4dbbc4ad46 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_safety.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_safety.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k f -vecz-simd-width 4 -vecz-choices=FullScalarization -S < %s | %filecheck %s
+; RUN: veczc -k f -vecz-simd-width 4 -vecz-choices=FullScalarization -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics-scalarize.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics-scalarize.ll
index f272670b419a3..c22625e138831 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics-scalarize.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics-scalarize.ll
@@ -14,13 +14,13 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k ctpop -vecz-simd-width=2 -vecz-choices=FullScalarization -S < %s | %filecheck %s --check-prefix CTPOP
-; RUN: %veczc -k ctlz -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | %filecheck %s --check-prefix CTLZ
-; RUN: %veczc -k cttz -vecz-simd-width=8 -vecz-choices=FullScalarization -S < %s | %filecheck %s --check-prefix CTTZ
-; RUN: %veczc -k sadd_sat -vecz-simd-width=2 -vecz-choices=FullScalarization -S < %s | %filecheck %s --check-prefix SADD_SAT
-; RUN: %veczc -k uadd_sat -vecz-simd-width=2 -vecz-choices=FullScalarization -S < %s | %filecheck %s --check-prefix UADD_SAT
-; RUN: %veczc -k ssub_sat -vecz-simd-width=2 -vecz-choices=FullScalarization -S < %s | %filecheck %s --check-prefix SSUB_SAT
-; RUN: %veczc -k usub_sat -vecz-simd-width=2 -vecz-choices=FullScalarization -S < %s | %filecheck %s --check-prefix USUB_SAT
+; RUN: veczc -k ctpop -vecz-simd-width=2 -vecz-choices=FullScalarization -S < %s | FileCheck %s --check-prefix CTPOP
+; RUN: veczc -k ctlz -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s --check-prefix CTLZ
+; RUN: veczc -k cttz -vecz-simd-width=8 -vecz-choices=FullScalarization -S < %s | FileCheck %s --check-prefix CTTZ
+; RUN: veczc -k sadd_sat -vecz-simd-width=2 -vecz-choices=FullScalarization -S < %s | FileCheck %s --check-prefix SADD_SAT
+; RUN: veczc -k uadd_sat -vecz-simd-width=2 -vecz-choices=FullScalarization -S < %s | FileCheck %s --check-prefix UADD_SAT
+; RUN: veczc -k ssub_sat -vecz-simd-width=2 -vecz-choices=FullScalarization -S < %s | FileCheck %s --check-prefix SSUB_SAT
+; RUN: veczc -k usub_sat -vecz-simd-width=2 -vecz-choices=FullScalarization -S < %s | FileCheck %s --check-prefix USUB_SAT
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics.ll
index 3970e9902af39..ae05f0b8932cf 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics.ll
@@ -14,13 +14,13 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k ctpop -vecz-simd-width=2 -S < %s | %filecheck %s --check-prefix CTPOP
-; RUN: %veczc -k ctlz -vecz-simd-width=4 -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s --check-prefix CTLZ
-; RUN: %veczc -k cttz -vecz-simd-width=8 -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s --check-prefix CTTZ
-; RUN: %veczc -k sadd_sat -vecz-simd-width=2 -S < %s | %filecheck %s --check-prefix SADD_SAT
-; RUN: %veczc -k uadd_sat -vecz-simd-width=2 -S < %s | %filecheck %s --check-prefix UADD_SAT
-; RUN: %veczc -k ssub_sat -vecz-simd-width=2 -S < %s | %filecheck %s --check-prefix SSUB_SAT
-; RUN: %veczc -k usub_sat -vecz-simd-width=2 -S < %s | %filecheck %s --check-prefix USUB_SAT
+; RUN: veczc -k ctpop -vecz-simd-width=2 -S < %s | FileCheck %s --check-prefix CTPOP
+; RUN: veczc -k ctlz -vecz-simd-width=4 -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s --check-prefix CTLZ
+; RUN: veczc -k cttz -vecz-simd-width=8 -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s --check-prefix CTTZ
+; RUN: veczc -k sadd_sat -vecz-simd-width=2 -S < %s | FileCheck %s --check-prefix SADD_SAT
+; RUN: veczc -k uadd_sat -vecz-simd-width=2 -S < %s | FileCheck %s --check-prefix UADD_SAT
+; RUN: veczc -k ssub_sat -vecz-simd-width=2 -S < %s | FileCheck %s --check-prefix SSUB_SAT
+; RUN: veczc -k usub_sat -vecz-simd-width=2 -S < %s | FileCheck %s --check-prefix USUB_SAT
 
 ; It checks that the scalar intrinsics get vectorized,
 ; and the vector intrinsics get widened.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/irreducible_loop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/irreducible_loop.ll
index 9c1d42295ddaf..13f94ed77f102 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/irreducible_loop.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/irreducible_loop.ll
@@ -15,7 +15,7 @@
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 ; RUN: %pp-llvm-ver -o %t < %s --llvm-ver %LLVMVER
-; RUN: %veczc -k irreducible_loop -S < %s | %filecheck %t
+; RUN: veczc -k irreducible_loop -S < %s | FileCheck %t
 
 ; ModuleID = 'Unknown buffer'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/loop_call_instantiation.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/loop_call_instantiation.ll
index b4c4943b6f101..e11efdfe3e407 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/loop_call_instantiation.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/loop_call_instantiation.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test -vecz-choices=InstantiateCallsInLoops -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k test -vecz-choices=InstantiateCallsInLoops -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_calls_max_builtin.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_calls_max_builtin.ll
index bacafd7367582..99cc1e48195c0 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_calls_max_builtin.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_calls_max_builtin.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -S < %s | %filecheck %s
+; RUN: veczc -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved.ll
index 076fee77621f1..fc2adc4ef9a0d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_fn -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k test_fn -vecz-simd-width=4 -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_as_scatter.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_as_scatter.ll
index ffb8f57bf91b9..96bf189710a6e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_as_scatter.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_as_scatter.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_fn -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k test_fn -vecz-simd-width=4 -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group.ll
index 3c47caa892836..cf90d52b80d59 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k mask -vecz-simd-width=16 -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+; RUN: veczc -k mask -vecz-simd-width=16 -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 source_filename = "kernel.opencl"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group2.ll
index c2837cfa07eb6..3ab352a4c4ba8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group2.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k mask -vecz-simd-width=16 -S -vecz-choices=TargetIndependentPacketization < %s | %filecheck %s
+; RUN: veczc -k mask -vecz-simd-width=16 -S -vecz-choices=TargetIndependentPacketization < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 source_filename = "kernel.opencl"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masking_exit_blocks.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masking_exit_blocks.ll
index 7f6674c8b0a51..d24ea6fcfc417 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masking_exit_blocks.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masking_exit_blocks.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test -vecz-simd-width=4 -vecz-passes=cfg-convert,packetizer -S < %s | %filecheck %s
+; RUN: veczc -k test -vecz-simd-width=4 -vecz-passes=cfg-convert,packetizer -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/mem2reg_alloca_pointer1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/mem2reg_alloca_pointer1.ll
index dedfdbb1ce9c5..4cee814b2d641 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/mem2reg_alloca_pointer1.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/mem2reg_alloca_pointer1.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k entry -vecz-passes="function(mem2reg),vecz-mem2reg" -S < %s | %filecheck %s
+; RUN: veczc -k entry -vecz-passes="function(mem2reg),vecz-mem2reg" -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/mem2reg_alloca_pointer2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/mem2reg_alloca_pointer2.ll
index 9ceebfdc7592f..bb6caaa83a627 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/mem2reg_alloca_pointer2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/mem2reg_alloca_pointer2.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k func_10 -vecz-passes="function(mem2reg),vecz-mem2reg" -S < %s | %filecheck %s
+; RUN: veczc -k func_10 -vecz-passes="function(mem2reg),vecz-mem2reg" -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride.ll
index 02af40372812d..ed24ec0fcf73c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -S < %s | %filecheck %s
+; RUN: veczc -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride10.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride10.ll
index 5dd370297701e..766a514a7bd8a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride10.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride10.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -S < %s | %filecheck %s
+; RUN: veczc -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride11.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride11.ll
index fc21fa5f9cd8b..b09cc25da4d88 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride11.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride11.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -S < %s | %filecheck %s
+; RUN: veczc -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride12.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride12.ll
index b34df1dc298c4..b2fa2956e4636 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride12.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride12.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -S < %s | %filecheck %s
+; RUN: veczc -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride13.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride13.ll
index 5dd370297701e..766a514a7bd8a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride13.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride13.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -S < %s | %filecheck %s
+; RUN: veczc -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride14.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride14.ll
index 10b377545fa87..02365aaaa5f0f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride14.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride14.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -S < %s | %filecheck %s
+; RUN: veczc -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride15.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride15.ll
index 0cdf65e4cc162..750a7744df7cc 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride15.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride15.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -S < %s | %filecheck %s
+; RUN: veczc -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride16.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride16.ll
index b8dc5a489cf72..99aab80558349 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride16.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride16.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -S < %s | %filecheck %s
+; RUN: veczc -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride17.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride17.ll
index d6eb78cf728f1..c3256fa47511b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride17.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride17.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -S < %s | %filecheck %s
+; RUN: veczc -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride18.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride18.ll
index b56bd47fbad2d..03a29cf5341d1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride18.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride18.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -S < %s | %filecheck %s
+; RUN: veczc -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride2.ll
index 6e963f28dc809..e88630c2653c9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride2.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -S < %s | %filecheck %s
+; RUN: veczc -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride3.ll
index fb3be14950ec3..e0bd5d66b3db6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride3.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride3.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -S < %s | %filecheck %s
+; RUN: veczc -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride4.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride4.ll
index 05e5392273595..a75d1ba3b58d9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride4.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride4.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -S < %s | %filecheck %s
+; RUN: veczc -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride5.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride5.ll
index d20739cc1d26d..f52a188f3d48e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride5.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride5.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -S < %s | %filecheck %s
+; RUN: veczc -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride6.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride6.ll
index f757f558d89bc..2a41511698681 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride6.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride6.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -S < %s | %filecheck %s
+; RUN: veczc -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride7.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride7.ll
index 8dcc3272b4c2c..7ed4fced4a3f6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride7.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride7.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -S < %s | %filecheck %s
+; RUN: veczc -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride8.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride8.ll
index 61627754b359f..4f5dd3091e553 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride8.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride8.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -S < %s | %filecheck %s
+; RUN: veczc -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride9.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride9.ll
index b689ad4bad742..edc9780cdc00d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride9.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride9.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -S < %s | %filecheck %s
+; RUN: veczc -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_exit_blocks.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_exit_blocks.ll
index 8aa9aefa122a9..952192641fa2b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_exit_blocks.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_exit_blocks.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k multiple_exit_blocks -vecz-passes="function(simplifycfg,dce),mergereturn,cfg-convert" -S < %s | %filecheck %s
+; RUN: veczc -k multiple_exit_blocks -vecz-passes="function(simplifycfg,dce),mergereturn,cfg-convert" -S < %s | FileCheck %s
 
 ; ModuleID = 'Unknown buffer'
 source_filename = "Unknown buffer"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_kernels_inlining.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_kernels_inlining.ll
index 492d3b90deb46..f1eef3daa2571 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_kernels_inlining.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_kernels_inlining.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k foo3 -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k foo3 -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorization_flags.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorization_flags.ll
index 6ed03228c87ad..c6fdc014c284c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorization_flags.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorization_flags.ll
@@ -18,7 +18,7 @@
 ; vectorizations works in various configurations. The kernel outputs here are
 ; not interesting, only their names.
 ; REQUIRES: llvm-12+
-; RUN: %veczc -w 8 -k foo:4,8,16.2@32s -k bar:,64s -S < %s | %filecheck %s
+; RUN: veczc -w 8 -k foo:4,8,16.2@32s -k bar:,64s -S < %s | FileCheck %s
 
 ; CHECK-DAG: define spir_kernel void @foo
 ; CHECK-DAG: define spir_kernel void @bar
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations.ll
index 6682062cc8ce8..67ed397dc63c4 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations.ll
@@ -17,7 +17,7 @@
 ; Check that veczc can vectorize a kernel multiple times in one go, with a
 ; correct mapping between the vectorized versions of the kernels and their
 ; scalar base
-; RUN: %veczc -k add:4,8,16 -S < %s | %filecheck %s
+; RUN: veczc -k add:4,8,16 -S < %s | FileCheck %s
 
 ; CHECK: define spir_kernel void @add(ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %out) {{.*}} !codeplay_ca_vecz.base ![[BASE_1:[0-9]+]] !codeplay_ca_vecz.base ![[BASE_2:[0-9]+]] !codeplay_ca_vecz.base ![[BASE_3:[0-9]+]] {
 ; CHECK: define spir_kernel void @__vecz_v[[DERIVED_1_VF:[0-9]+]]_add(ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %out) {{.*}} !codeplay_ca_vecz.derived ![[DERIVED_1:[0-9]+]] {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations_nested.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations_nested.ll
index 1124314e5fe1d..ae7019f31e923 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations_nested.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations_nested.ll
@@ -17,8 +17,8 @@
 ; Check that veczc can vectorize a kernel then vectorize the vectorized kernel,
 ; with base mappings from 1->2 and 2->3 and derived mappings back from 2->1 and
 ; 3->2.
-; RUN: %veczc -k add:2 -S < %s > %t2
-; RUN: %veczc -k __vecz_v2_add:4 -S < %t2 | %filecheck %s
+; RUN: veczc -k add:2 -S < %s > %t2
+; RUN: veczc -k __vecz_v2_add:4 -S < %t2 | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations_vp.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations_vp.ll
index 04bc0b4489c5c..0d92b325c4594 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations_vp.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations_vp.ll
@@ -16,7 +16,7 @@
 
 ; Check that veczc can vectorize a kernel multiple times in one go, with an
 ; equal width but with one enabling vector predication.
-; RUN: %veczc -k add:1s,1sp -S < %s | %filecheck %s
+; RUN: veczc -k add:1s,1sp -S < %s | FileCheck %s
 
 declare spir_func i64 @_Z13get_global_idj(i32)
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_instantiate_memop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_instantiate_memop.ll
index ecd3e0ca49140..a1f075d4913ed 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_instantiate_memop.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_instantiate_memop.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k priv -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k priv -vecz-simd-width=4 -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_over_scalarization.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_over_scalarization.ll
index f8adf1480dc93..33d160b80140e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_over_scalarization.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_over_scalarization.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k memop_loop_dep -vecz-passes=builtin-inlining,scalarize -vecz-choices=FullScalarization -S < %s | %filecheck %s
+; RUN: veczc -k memop_loop_dep -vecz-passes=builtin-inlining,scalarize -vecz-choices=FullScalarization -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-s128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_redundant_bitcasts.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_redundant_bitcasts.ll
index b7c3fd0218abc..03959379ef109 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_redundant_bitcasts.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_redundant_bitcasts.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k memop_loop_dep -vecz-passes=scalarize -vecz-choices=FullScalarization -S < %s | %filecheck %s
+; RUN: veczc -k memop_loop_dep -vecz-passes=scalarize -vecz-choices=FullScalarization -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-s128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_vecz1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_vecz1.ll
index d33a7fbaf01fe..1add6d1c9dbec 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_vecz1.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_vecz1.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -S < %s -vecz-auto | %filecheck %s
+; RUN: veczc -S < %s -vecz-auto | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_vecz2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_vecz2.ll
index a40a7fcd54261..77cb58fb597e0 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_vecz2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_vecz2.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -S < %s -vecz-auto | %filecheck %s
+; RUN: veczc -S < %s -vecz-auto | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/offset_info_analysis.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/offset_info_analysis.ll
index f51b667c8a5ef..8ade7c8ae0038 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/offset_info_analysis.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/offset_info_analysis.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k offset_info_analysis -vecz-passes=packetizer -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k offset_info_analysis -vecz-passes=packetizer -vecz-simd-width=4 -S < %s | FileCheck %s
 
 ; ModuleID = 'Unknown buffer'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfiniteDv4_d.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfiniteDv4_d.ll
index 5791487af388f..045285825a237 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfiniteDv4_d.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfiniteDv4_d.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_isfiniteDv4_d -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | %filecheck %s
+; RUN: veczc -k test_isfiniteDv4_d -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfiniteDv4_f.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfiniteDv4_f.ll
index 350f8596122f6..7031037728f09 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfiniteDv4_f.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfiniteDv4_f.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_isfiniteDv4_f -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | %filecheck %s
+; RUN: veczc -k test_isfiniteDv4_f -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfinited.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfinited.ll
index 9ef093ee2e04b..1722c0c9b5337 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfinited.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfinited.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_isfinited -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k test_isfinited -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfinitef.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfinitef.ll
index 79a1412459b28..7d12cd2dedc88 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfinitef.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfinitef.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_isfinitef -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k test_isfinitef -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfDv4_d.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfDv4_d.ll
index 18fe8b3bb99d6..22d8656b20f79 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfDv4_d.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfDv4_d.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_isinfDv4_d -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | %filecheck %s
+; RUN: veczc -k test_isinfDv4_d -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfDv4_f.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfDv4_f.ll
index 443e5c45933cc..41b3a682d3ac4 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfDv4_f.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfDv4_f.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_isinfDv4_f -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | %filecheck %s
+; RUN: veczc -k test_isinfDv4_f -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfd.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfd.ll
index b45071ffd9f69..241645030342c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfd.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfd.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_isinfd -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k test_isinfd -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinff.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinff.ll
index 1101efd59f06f..0b014e6271f31 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinff.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinff.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_isinff -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k test_isinff -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanDv4_d.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanDv4_d.ll
index e9188f2764866..66e9cfc2ccc2f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanDv4_d.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanDv4_d.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_isnanDv4_d -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | %filecheck %s
+; RUN: veczc -k test_isnanDv4_d -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanDv4_f.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanDv4_f.ll
index d3bcc7334a3f4..aad20e3bc2fae 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanDv4_f.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanDv4_f.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_isnanDv4_f -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | %filecheck %s
+; RUN: veczc -k test_isnanDv4_f -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnand.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnand.ll
index 8a6105451b12f..8a51ce4fe8cca 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnand.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnand.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_isnand -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k test_isnand -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanf.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanf.ll
index c4ab772d1ee00..ba42441427eea 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanf.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanf.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_isnanf -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k test_isnanf -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalDv4_d.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalDv4_d.ll
index 935a7ac56a58b..efbf4c5d8bd00 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalDv4_d.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalDv4_d.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_isnormalDv4_d -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | %filecheck %s
+; RUN: veczc -k test_isnormalDv4_d -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalDv4_f.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalDv4_f.ll
index 492e9f78dcdd3..516ee88c522c6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalDv4_f.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalDv4_f.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_isnormalDv4_f -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | %filecheck %s
+; RUN: veczc -k test_isnormalDv4_f -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormald.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormald.ll
index fb4b861638f6b..0d3f4a0f3a7eb 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormald.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormald.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_isnormald -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k test_isnormald -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalf.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalf.ll
index d2d55fc36d237..f7f8f21a789dc 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalf.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalf.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_isnormalf -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k test_isnormalf -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/opencl_metadata1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/opencl_metadata1.ll
index eb9c8fcfa878f..5447a00118fa0 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/opencl_metadata1.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/opencl_metadata1.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k test -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/opencl_metadata2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/opencl_metadata2.ll
index 1da14c05fff68..1e80cac1e1b7f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/opencl_metadata2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/opencl_metadata2.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k second_test -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k second_test -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/overaligned_allocas.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/overaligned_allocas.ll
index aaf38c1333647..12b78d09d44aa 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/overaligned_allocas.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/overaligned_allocas.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test -vecz-simd-width=4 -vecz-auto -S < %s | %filecheck %s
+; RUN: veczc -k test -vecz-simd-width=4 -vecz-auto -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_branch.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_branch.ll
index b105a9b6adf77..543302a1688fc 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_branch.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_branch.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_branch -vecz-passes=cfg-convert,packetizer -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k test_branch -vecz-passes=cfg-convert,packetizer -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_debug_info.ll
index 0cecd48c243c0..adf0731cc0392 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_debug_info.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_debug_info.ll
@@ -17,7 +17,7 @@
 ; Check that debug info is preserved in the vectorized kernel.
 ; Specifically that the packetization pass creates vector types
 ; in the DI for the variables.
-; RUN: %veczc -k add -S < %s | %filecheck %s
+; RUN: veczc -k add -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_nonvarying.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_nonvarying.ll
index c51911da17ce4..fc50f154c6a31 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_nonvarying.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_nonvarying.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_nonvarying_loadstore -vecz-passes=packetizer -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k test_nonvarying_loadstore -vecz-passes=packetizer -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_uniform_branch.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_uniform_branch.ll
index a062879f1c413..fccbb32843090 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_uniform_branch.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_uniform_branch.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_uniform_branch -vecz-passes=packetizer -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k test_uniform_branch -vecz-passes=packetizer -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_struct_gep.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_struct_gep.ll
index 7a0ef406be97e..64355afa3c865 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_struct_gep.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_struct_gep.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k test -vecz-simd-width=4 -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_conditional.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_conditional.ll
index d918b4cbd2fd1..c22771258e189 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_conditional.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_conditional.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k conditional -vecz-choices=PacketizeUniform -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k conditional -vecz-choices=PacketizeUniform -vecz-simd-width=4 -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_conditional.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_conditional.ll
index 4d18b8b5ef103..327d93185eadb 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_conditional.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_conditional.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k conditional -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k conditional -vecz-simd-width=4 -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_noreduce.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_noreduce.ll
index 5115d2dd1ee1c..56069ada5aedb 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_noreduce.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_noreduce.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k noreduce -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k noreduce -vecz-simd-width=4 -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_noreduce2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_noreduce2.ll
index 92fcc3273a0dc..c863858541e44 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_noreduce2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_noreduce2.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k noreduce2 -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k noreduce2 -vecz-simd-width=4 -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_reduce.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_reduce.ll
index 6f32f316032dc..5c9f38f5546ca 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_reduce.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_reduce.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k reduce -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k reduce -vecz-simd-width=4 -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_conditional.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_conditional.ll
index 530a6010f1c1b..2f58fa76a8b0c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_conditional.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_conditional.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k conditional -vecz-choices=PacketizeUniformInLoops -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k conditional -vecz-choices=PacketizeUniformInLoops -vecz-simd-width=4 -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_noreduce.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_noreduce.ll
index 6fbc520e4ed02..926d245564712 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_noreduce.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_noreduce.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k noreduce -vecz-choices=PacketizeUniformInLoops -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k noreduce -vecz-choices=PacketizeUniformInLoops -vecz-simd-width=4 -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_noreduce2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_noreduce2.ll
index 90a3c3d401a00..7f0782e9a3968 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_noreduce2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_noreduce2.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k noreduce2 -vecz-choices=PacketizeUniformInLoops -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k noreduce2 -vecz-choices=PacketizeUniformInLoops -vecz-simd-width=4 -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_reduce.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_reduce.ll
index 3bef75fe015ca..c95f0d6f9fb02 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_reduce.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_reduce.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k reduce -vecz-choices=PacketizeUniformInLoops -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k reduce -vecz-choices=PacketizeUniformInLoops -vecz-simd-width=4 -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_noreduce.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_noreduce.ll
index a1563ab8c4e75..e370be4748007 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_noreduce.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_noreduce.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k noreduce -vecz-choices=PacketizeUniform -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k noreduce -vecz-choices=PacketizeUniform -vecz-simd-width=4 -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_noreduce2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_noreduce2.ll
index 1c11c30d70168..46baacb174abd 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_noreduce2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_noreduce2.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k noreduce2 -vecz-choices=PacketizeUniform -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k noreduce2 -vecz-choices=PacketizeUniform -vecz-simd-width=4 -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_reduce.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_reduce.ll
index af46221d31cbd..a1482f948478a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_reduce.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_reduce.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k reduce -vecz-choices=PacketizeUniform -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k reduce -vecz-choices=PacketizeUniform -vecz-simd-width=4 -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization0.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization0.ll
index 46bcececcc3ad..053a647ea32bf 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization0.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization0.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k partial_linearization0 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | %filecheck %s
+; RUN: veczc -k partial_linearization0 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | FileCheck %s
 
 ; The CFG of the following kernel is:
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization1.ll
index c30b1da79f024..32244424ddabe 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization1.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization1.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k partial_linearization1 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | %filecheck %s
+; RUN: veczc -k partial_linearization1 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | FileCheck %s
 
 ; The CFG of the following kernel is:
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization10.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization10.ll
index 0f515398e5f40..d11101c7ffd37 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization10.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization10.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k partial_linearization10 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | %filecheck %s
+; RUN: veczc -k partial_linearization10 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | FileCheck %s
 
 ; The CFG of the following kernel is:
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization11.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization11.ll
index 8baab708fcb5d..9b335e8f3d02f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization11.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization11.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k partial_linearization11 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | %filecheck %s
+; RUN: veczc -k partial_linearization11 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | FileCheck %s
 
 ; The CFG of the following kernel is:
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization12.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization12.ll
index 8ba5b404bd7c8..b38aa0247374d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization12.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization12.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k partial_linearization12 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | %filecheck %s
+; RUN: veczc -k partial_linearization12 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | FileCheck %s
 
 ; The CFG of the following kernel is:
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization13.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization13.ll
index 237b24cf1605b..035b584ab0b0e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization13.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization13.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k partial_linearization13 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -S < %s | %filecheck %s
+; RUN: veczc -k partial_linearization13 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -S < %s | FileCheck %s
 
 ; The CFG of the following kernel is:
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization14.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization14.ll
index 8f45ed6d60907..76b96c54195fe 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization14.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization14.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k partial_linearization14 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | %filecheck %s
+; RUN: veczc -k partial_linearization14 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | FileCheck %s
 
 ; The CFG of the following kernel is:
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization15.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization15.ll
index c856b5afbc106..78578253482b7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization15.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization15.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k partial_linearization15 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -S < %s | %filecheck %s
+; RUN: veczc -k partial_linearization15 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -S < %s | FileCheck %s
 
 ; The CFG of the following kernel is:
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization16.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization16.ll
index 6bbae842606b2..0da9fdfd55ef5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization16.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization16.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k partial_linearization16 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | %filecheck %s
+; RUN: veczc -k partial_linearization16 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | FileCheck %s
 
 ; The CFG of the following kernel is:
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization17.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization17.ll
index d87b91b19c9d4..06e8e72eeba65 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization17.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization17.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k partial_linearization17 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -S < %s | %filecheck %s
+; RUN: veczc -k partial_linearization17 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -S < %s | FileCheck %s
 
 ; The CFG of the following kernel is:
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization18.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization18.ll
index d870ca351ce11..df7dcb93cbca8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization18.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization18.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k partial_linearization18 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | %filecheck %s
+; RUN: veczc -k partial_linearization18 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | FileCheck %s
 
 ; The CFG of the following kernel is:
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization19.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization19.ll
index 69e8ecea9d0a5..7973e9ef46da2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization19.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization19.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k partial_linearization19 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | %filecheck %s
+; RUN: veczc -k partial_linearization19 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | FileCheck %s
 
 ; The CFG of the following kernel is:
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization2.ll
index 85ef2577e5a43..7cefd8bad6526 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization2.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k partial_linearization2 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -S < %s | %filecheck %s
+; RUN: veczc -k partial_linearization2 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -S < %s | FileCheck %s
 
 ; The CFG of the following kernel is:
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization20.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization20.ll
index 6ee82d98f8009..fd2e031325485 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization20.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization20.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k partial_linearization20 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | %filecheck %s
+; RUN: veczc -k partial_linearization20 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | FileCheck %s
 
 ; The CFG of the following kernel is:
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization21.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization21.ll
index d71e84ab8facb..1cbe02fccfb77 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization21.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization21.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k partial_linearization21 -vecz-passes=vecz-loop-rotate,cfg-convert -S < %s | %filecheck %s
+; RUN: veczc -k partial_linearization21 -vecz-passes=vecz-loop-rotate,cfg-convert -S < %s | FileCheck %s
 
 ; The CFG of the following kernel is:
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization22.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization22.ll
index 0bcd836a36de0..5f2f5c3146b18 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization22.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization22.ll
@@ -15,7 +15,7 @@
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 ; REQUIRES: llvm-12+
-; RUN: %veczc -k partial_linearization22 -vecz-passes="function(lowerswitch),vecz-loop-rotate,indvars,cfg-convert" -S < %s | %filecheck %s
+; RUN: veczc -k partial_linearization22 -vecz-passes="function(lowerswitch),vecz-loop-rotate,indvars,cfg-convert" -S < %s | FileCheck %s
 
 ; The CFG of the following kernel is:
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization23.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization23.ll
index 1f04b8bf7143d..ce1252b537ae9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization23.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization23.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k partial_linearization23 -vecz-passes=cfg-convert -S < %s | %filecheck %s
+; RUN: veczc -k partial_linearization23 -vecz-passes=cfg-convert -S < %s | FileCheck %s
 
 ; The CFG of the following kernel is:
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization3.ll
index 57c57890c1d10..c61c6baa9947e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization3.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization3.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k partial_linearization3 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -S < %s | %filecheck %s
+; RUN: veczc -k partial_linearization3 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -S < %s | FileCheck %s
 
 ; The CFG of the following kernel is:
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization4.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization4.ll
index 1c557d6ae1ca2..655ae12b89510 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization4.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization4.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k partial_linearization4 -vecz-passes=cfg-convert,cleanup-divergence -S < %s | %filecheck %s
+; RUN: veczc -k partial_linearization4 -vecz-passes=cfg-convert,cleanup-divergence -S < %s | FileCheck %s
 
 ; The CFG of the following kernel is:
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization5.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization5.ll
index c469d64eec092..218d993ad36ae 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization5.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization5.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k partial_linearization5 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | %filecheck %s
+; RUN: veczc -k partial_linearization5 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | FileCheck %s
 
 ; The CFG of the following kernel is:
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization6.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization6.ll
index a81e663bb0575..ad5918b645550 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization6.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization6.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k partial_linearization6 -vecz-passes=cfg-convert,cleanup-divergence -S < %s | %filecheck %s
+; RUN: veczc -k partial_linearization6 -vecz-passes=cfg-convert,cleanup-divergence -S < %s | FileCheck %s
 
 ; The CFG of the following kernel is:
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization7.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization7.ll
index 9ca98830fd917..7976d741c2eaf 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization7.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization7.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k partial_linearization7 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | %filecheck %s
+; RUN: veczc -k partial_linearization7 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | FileCheck %s
 
 ; The CFG of the following kernel is:
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization8.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization8.ll
index 8b7dd995ba092..bdec2081eeb19 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization8.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization8.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k partial_linearization8 -vecz-passes=cfg-convert -S < %s | %filecheck %s
+; RUN: veczc -k partial_linearization8 -vecz-passes=cfg-convert -S < %s | FileCheck %s
 
 ; The CFG of the following kernel is:
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization9.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization9.ll
index bff1b5b466b6d..cb1212315acc2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization9.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization9.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k partial_linearization9 -vecz-passes=cfg-convert,cleanup-divergence -S < %s | %filecheck %s
+; RUN: veczc -k partial_linearization9 -vecz-passes=cfg-convert,cleanup-divergence -S < %s | FileCheck %s
 
 ; The CFG of the following kernel is:
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization_exit_masks.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization_exit_masks.ll
index 20f649a8ffa45..ad2f4f3d7dde6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization_exit_masks.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization_exit_masks.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test < %s
+; RUN: veczc -k test < %s
 
 ; This test ensures that VECZ does not crash during control flow conversion due
 ; to a missing exit mask. As such, we need only verify that the return code from
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline.ll
index 052cf3ed75b9e..ef109b67827a0 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline.ll
@@ -14,9 +14,9 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k foo -w 2 -debug-vecz-pipeline -S < %s 2>&1 | %filecheck %s
-; RUN: %veczc -k foo -w 2 -vecz-passes scalarize -debug-vecz-pipeline -S < %s 2>&1 | %filecheck %s --check-prefix=PASSES1
-; RUN: %veczc -k foo -w 2 -vecz-passes scalarize,packetizer -debug-vecz-pipeline -S < %s 2>&1 | %filecheck %s --check-prefix=PASSES2
+; RUN: veczc -k foo -w 2 -debug-vecz-pipeline -S < %s 2>&1 | FileCheck %s
+; RUN: veczc -k foo -w 2 -vecz-passes scalarize -debug-vecz-pipeline -S < %s 2>&1 | FileCheck %s --check-prefix=PASSES1
+; RUN: veczc -k foo -w 2 -vecz-passes scalarize,packetizer -debug-vecz-pipeline -S < %s 2>&1 | FileCheck %s --check-prefix=PASSES2
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline_printafter.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline_printafter.ll
index a05a1957e8a68..efe037e65b3d9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline_printafter.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline_printafter.ll
@@ -15,7 +15,7 @@
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 ; REQUIRES: llvm-12+
-; RUN: %veczc -k foo -w 2 -vecz-passes scalarize,mask-memops,packetizer -print-after mask-memops -S < %s 2>&1 | %filecheck %s
+; RUN: veczc -k foo -w 2 -vecz-passes scalarize,mask-memops,packetizer -print-after mask-memops -S < %s 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_interleaved.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_interleaved.ll
index 57d979a591c71..eb6550ae076b3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_interleaved.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_interleaved.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k codegen_2 -vecz-simd-width 16 -vecz-choices=TargetIndependentPacketization -S < %s | %filecheck %s
+; RUN: veczc -k codegen_2 -vecz-simd-width 16 -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_node_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_node_debug_info.ll
index ec2b2e9ccb618..a603cf94b1c63 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_node_debug_info.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_node_debug_info.ll
@@ -17,7 +17,7 @@
 ; Check that debug info intrinsics are correctly placed after
 ; phi nodes.
 
-; RUN: %veczc -k loop_phi -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k loop_phi -vecz-simd-width=4 -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_scatter_gather.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_scatter_gather.ll
index fe4bfed65f3ce..f35d1e5d7f068 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_scatter_gather.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_scatter_gather.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k phi_memory -S < %s | %filecheck %s
+; RUN: veczc -k phi_memory -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_scatter_gather_2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_scatter_gather_2.ll
index 0daa9f7a40bd2..4f780b3e97285 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_scatter_gather_2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_scatter_gather_2.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k phi_memory -S < %s | %filecheck %s
+; RUN: veczc -k phi_memory -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/predicate_with_switch.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/predicate_with_switch.ll
index 6b76b45458390..9af4b5aae0e7e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/predicate_with_switch.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/predicate_with_switch.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k predicate_with_switch -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k predicate_with_switch -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/preserve-fast-math.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/preserve-fast-math.ll
index b5ac1cf16b6f0..5698c31901290 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/preserve-fast-math.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/preserve-fast-math.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -S -vecz-passes=packetizer < %s | %filecheck %s
+; RUN: veczc -S -vecz-passes=packetizer < %s | FileCheck %s
 
 ; CHECK: %{{.*}} = fcmp nnan ninf olt <4 x float> %{{.*}}, %{{.*}}
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/printf_float.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/printf_float.ll
index 3f3eb7c48d2af..c661d2c931d57 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/printf_float.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/printf_float.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_float -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k test_float -vecz-simd-width=4 -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/regression_by_all.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/regression_by_all.ll
index ae102ec12fc8f..0cec9bb1fef51 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/regression_by_all.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/regression_by_all.ll
@@ -15,7 +15,7 @@
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 
-; RUN: %veczc -k regression_by_all -vecz-passes=vecz-loop-rotate,cfg-convert -S < %s | %filecheck %s
+; RUN: veczc -k regression_by_all -vecz-passes=vecz-loop-rotate,cfg-convert -S < %s | FileCheck %s
 
 ; The purpose of this test is to make sure the block `c` does not get considered
 ; as a by_all because one of its predecessors is by_all. In fact, because `c`
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr.ll
index 5be2aa5aff22e..5527d3ab9c78f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -vecz-passes=remove-int-ptr -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -vecz-passes=remove-int-ptr -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr_2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr_2.ll
index 615b307f584f6..3776c68fd9da1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr_2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr_2.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -S < %s | %filecheck %s
+; RUN: veczc -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr_phi.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr_phi.ll
index 3121b9b26c55d..3411e43b95a64 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr_phi.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr_phi.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -S < %s | %filecheck %s
+; RUN: veczc -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/roscc_simplify.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/roscc_simplify.ll
index 9b081570b8ea6..43241ec257ff1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/roscc_simplify.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/roscc_simplify.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -S < %s -w 16 | %filecheck %s
+; RUN: veczc -S < %s -w 16 | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_load_store_in_varying_branch.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_load_store_in_varying_branch.ll
index 385119729092f..ec972edbc80d5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_load_store_in_varying_branch.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_load_store_in_varying_branch.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test -w 4 -S < %s | %filecheck %s
+; RUN: veczc -k test -w 4 -S < %s | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
 target triple = "spir-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat.ll
index 86ddd892043c9..c13008f425340 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test -w 4 -S < %s | %filecheck %s
+; RUN: veczc -k test -w 4 -S < %s | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
 target triple = "spir-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_after_load_store_in_varying_branch.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_after_load_store_in_varying_branch.ll
index 1f7952620f6aa..ba1f94c452384 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_after_load_store_in_varying_branch.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_after_load_store_in_varying_branch.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test -w 4 -S < %s | %filecheck %s
+; RUN: veczc -k test -w 4 -S < %s | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
 target triple = "spir-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_after_varying_branch.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_after_varying_branch.ll
index 66a2ae70067d5..09c2e0b3680bb 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_after_varying_branch.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_after_varying_branch.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test -w 4 -S < %s | %filecheck %s
+; RUN: veczc -k test -w 4 -S < %s | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
 target triple = "spir-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_in_varying_branch.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_in_varying_branch.ll
index 2e06ea7193ab2..c8c424e746203 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_in_varying_branch.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_in_varying_branch.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test -w 4 -S < %s | %filecheck %s
+; RUN: veczc -k test -w 4 -S < %s | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
 target triple = "spir-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_vector_user.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_vector_user.ll
index f8d4531c7cb68..4765e95d902ba 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_vector_user.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_vector_user.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k scalar_vector_user -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | %filecheck %s
+; RUN: veczc -k scalar_vector_user -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s
 
 ; ModuleID = 'Unknown buffer'
 source_filename = "Unknown buffer"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_calls.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_calls.ll
index 5ee40b4c1f247..a98976bebcb4c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_calls.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_calls.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_calls -vecz-passes=scalarize -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | %filecheck %s
+; RUN: veczc -k test_calls -vecz-passes=scalarize -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_calls_uniform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_calls_uniform.ll
index cac1ea1b23ea9..ceba451f571cf 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_calls_uniform.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_calls_uniform.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_calls -vecz-passes=scalarize -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k test_calls -vecz-passes=scalarize -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_debug_info.ll
index 49c0e8fd4cb2c..e4dbe5f61eb36 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_debug_info.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_debug_info.ll
@@ -18,7 +18,7 @@
 ; Specifically that the scalarization pass doesn't destroy DI
 ; intrinsics attached to the vector instructions it scalarizes.
 
-; RUN: %veczc -k mul2 -vecz-passes="scalarize,function(mem2reg)" -vecz-choices=FullScalarization -S < %s | %filecheck %s
+; RUN: veczc -k mul2 -vecz-passes="scalarize,function(mem2reg)" -vecz-choices=FullScalarization -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_instructions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_instructions.ll
index 936aab9504adc..dff0d17cc7d73 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_instructions.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_instructions.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_instructions -vecz-passes=scalarize -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | %filecheck %s
+; RUN: veczc -k test_instructions -vecz-passes=scalarize -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_instructions_uniform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_instructions_uniform.ll
index 93df5a9d99fa1..09f7f00ff9603 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_instructions_uniform.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_instructions_uniform.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_instructions -vecz-passes=scalarize -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k test_instructions -vecz-passes=scalarize -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_masked_load_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_masked_load_store.ll
index 890fa663c968f..0435c80f1be5b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_masked_load_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_masked_load_store.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -vecz-passes=scalarize -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | %filecheck %s
+; RUN: veczc -vecz-passes=scalarize -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-gather.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-gather.ll
index eba06f982f35b..c86a1c6843226 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-gather.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-gather.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k splat -vecz-simd-width=4 -vecz-passes=scalarize -vecz-choices=FullScalarization -S < %s | %filecheck %s
+; RUN: veczc -k splat -vecz-simd-width=4 -vecz-passes=scalarize -vecz-choices=FullScalarization -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-splat.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-splat.ll
index 9049e0dc02f77..b6da6887d0384 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-splat.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-splat.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k splat -vecz-simd-width=4 -vecz-passes=scalarize -vecz-choices=FullScalarization -S < %s | %filecheck %s
+; RUN: veczc -k splat -vecz-simd-width=4 -vecz-passes=scalarize -vecz-choices=FullScalarization -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize_mixed_gep.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize_mixed_gep.ll
index 3cc0bb9c40984..1586feb7c1ea8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize_mixed_gep.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize_mixed_gep.ll
@@ -13,7 +13,7 @@
 ; under the License.
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-; RUN: %veczc -k bar -vecz-simd-width=4 -S -o - %s | %filecheck %s
+; RUN: veczc -k bar -vecz-simd-width=4 -S -o - %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scan_fact.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scan_fact.ll
index bda25dd467c25..e4284b6aa7a32 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scan_fact.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scan_fact.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k scan_fact -vecz-passes=cfg-convert -S < %s | %filecheck %s
+; RUN: veczc -k scan_fact -vecz-passes=cfg-convert -S < %s | FileCheck %s
 
 ; ModuleID = 'Unknown buffer'
 source_filename = "kernel.opencl"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/secretly_scalar_load_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/secretly_scalar_load_store.ll
index fd89387589746..0986e47372bd9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/secretly_scalar_load_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/secretly_scalar_load_store.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test -w 4 -S < %s | %filecheck %s
+; RUN: veczc -k test -w 4 -S < %s | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
 target triple = "spir-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/select-no-crash.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/select-no-crash.ll
index a6559e6a728be..6ab3427e9f117 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/select-no-crash.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/select-no-crash.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -vecz-fail-quietly -k test -vecz-passes="cfg-convert" -S < %s
+; RUN: veczc -vecz-fail-quietly -k test -vecz-passes="cfg-convert" -S < %s
 
 ; This tests only that the kernel does not crash the vectorizer.
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_1.ll
index 310c502fe1ad9..24cc2c542276f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_1.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_1.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -S < %s | %filecheck %s
+; RUN: veczc -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_2.ll
index 785db3ed704af..3060f924d4286 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_2.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -S < %s | %filecheck %s
+; RUN: veczc -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_3.ll
index e2f37483be09e..f7e7aed565711 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_3.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_3.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -S < %s | %filecheck %s
+; RUN: veczc -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_4.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_4.ll
index 8c1f4bf96a105..e3d06caf03fb1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_4.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_4.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -S < %s | %filecheck %s
+; RUN: veczc -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_5.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_5.ll
index e541aaddc8515..01607469b7cf2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_5.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_5.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -S < %s | %filecheck %s
+; RUN: veczc -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_6.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_6.ll
index f0e79d8b30ce1..f138056a51daa 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_6.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_6.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -S < %s | %filecheck %s
+; RUN: veczc -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext.ll
index 5d7a033d24799..182ff8263ae8f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k squash -vecz-choices=TargetIndependentPacketization -vecz-passes="squash-small-vecs,function(dce),packetizer" -S < %s | %filecheck %s
+; RUN: veczc -k squash -vecz-choices=TargetIndependentPacketization -vecz-passes="squash-small-vecs,function(dce),packetizer" -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext_bigendian.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext_bigendian.ll
index 0e1e562e4e99d..26c5a233edf88 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext_bigendian.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext_bigendian.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k squash -vecz-choices=TargetIndependentPacketization -vecz-passes="squash-small-vecs,function(dce),packetizer" -S < %s | %filecheck %s
+; RUN: veczc -k squash -vecz-choices=TargetIndependentPacketization -vecz-passes="squash-small-vecs,function(dce),packetizer" -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "E-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext.ll
index 2bc9e829aaef9..491be71b88b7f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k squash -vecz-choices=TargetIndependentPacketization -vecz-passes="squash-small-vecs,function(dce),packetizer" -S < %s | %filecheck %s
+; RUN: veczc -k squash -vecz-choices=TargetIndependentPacketization -vecz-passes="squash-small-vecs,function(dce),packetizer" -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext_bigendian.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext_bigendian.ll
index 4b28f9a3722d6..ae92e82b5f872 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext_bigendian.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext_bigendian.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k squash -vecz-choices=TargetIndependentPacketization -vecz-passes="squash-small-vecs,function(dce),packetizer" -S < %s | %filecheck %s
+; RUN: veczc -k squash -vecz-choices=TargetIndependentPacketization -vecz-passes="squash-small-vecs,function(dce),packetizer" -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "E-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_float2_gather.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_float2_gather.ll
index 5d4a03930acf7..83e216f489d46 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_float2_gather.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_float2_gather.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k squash -vecz-passes="squash-small-vecs,packetizer" -S < %s | %filecheck %s
+; RUN: veczc -k squash -vecz-passes="squash-small-vecs,packetizer" -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned.ll
index 9201bd1dfb8b1..6af90e4bba69d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -S < %s | %filecheck %s
+; RUN: veczc -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned_scalarized.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned_scalarized.ll
index b91498a8e6e60..9ad94ffa958a9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned_scalarized.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned_scalarized.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -S < %s -vecz-choices=FullScalarization | %filecheck %s
+; RUN: veczc -S < %s -vecz-choices=FullScalarization | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned.ll
index 1062dd331c5bd..1ae62a0924222 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -S < %s | %filecheck %s
+; RUN: veczc -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned_scalarized.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned_scalarized.ll
index 56373b722abe6..3d317f732a4be 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned_scalarized.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned_scalarized.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -S < %s -vecz-choices=FullScalarization | %filecheck %s
+; RUN: veczc -S < %s -vecz-choices=FullScalarization | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_phi.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_phi.ll
index 1cb75279d01ed..74276d0da284a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_phi.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_phi.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k test -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_select.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_select.ll
index c502c2af024b8..9dc3a66c66a56 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_select.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_select.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k test -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll
index 238fbd8703e13..27e08e88a8b5e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -vecz-simd-width=4 -S < %s | %filecheck %s 
+; RUN: veczc -vecz-simd-width=4 -S < %s | FileCheck %s 
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions.ll
index c859f7b830e52..69e4ccc0fb961 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -w 4 -S < %s | %filecheck %s
+; RUN: veczc -w 4 -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions_spv_khr_uniform_group_instructions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions_spv_khr_uniform_group_instructions.ll
index 2d58eecf4dd99..8588ef0099bbd 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions_spv_khr_uniform_group_instructions.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions_spv_khr_uniform_group_instructions.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -w 4 -S < %s | %filecheck %s
+; RUN: veczc -w 4 -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_scans.ll
index 917d6628d3a7e..297310993af81 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_scans.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_scans.ll
@@ -14,8 +14,8 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: sed 's/VERSION/i32 1, i32 2/g' %s | %veczc -w 4 -S -vecz-passes=packetizer | %filecheck %s --check-prefixes CHECK,CHECK-12
-; RUN: sed 's/VERSION/i32 3, i32 0/g' %s | %veczc -w 4 -S -vecz-passes=packetizer | %filecheck %s --check-prefixes CHECK,CHECK-30
+; RUN: sed 's/VERSION/i32 1, i32 2/g' %s | veczc -w 4 -S -vecz-passes=packetizer | FileCheck %s --check-prefixes CHECK,CHECK-12
+; RUN: sed 's/VERSION/i32 3, i32 0/g' %s | veczc -w 4 -S -vecz-passes=packetizer | FileCheck %s --check-prefixes CHECK,CHECK-30
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_scans_spv_khr_uniform_group_instructions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_scans_spv_khr_uniform_group_instructions.ll
index c62fa5afbc9db..961c98612b291 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_scans_spv_khr_uniform_group_instructions.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_scans_spv_khr_uniform_group_instructions.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -w 4 -S -vecz-passes=packetizer < %s | %filecheck %s
+; RUN: veczc -w 4 -S -vecz-passes=packetizer < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_different_strides.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_different_strides.ll
index 91416479c68cd..e1582ba7f8c47 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_different_strides.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_different_strides.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_ternary -vecz-passes=ternary-transform -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k test_ternary -vecz-passes=ternary-transform -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_divergent_gep.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_divergent_gep.ll
index 87f5b5ff759ee..725d747de1fc0 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_divergent_gep.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_divergent_gep.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_ternary -vecz-passes=ternary-transform -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k test_ternary -vecz-passes=ternary-transform -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_divergent_source.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_divergent_source.ll
index 43d427df52ec5..16759e0bf5845 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_divergent_source.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_divergent_source.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_ternary -vecz-passes=ternary-transform -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k test_ternary -vecz-passes=ternary-transform -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_negative.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_negative.ll
index 3841b6fb6b3a4..2de729f9a8a10 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_negative.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_negative.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_negative -vecz-passes=ternary-transform -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k test_negative -vecz-passes=ternary-transform -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_positive.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_positive.ll
index 9a5c22fd24859..36d2eb284c15c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_positive.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_positive.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_ternary -vecz-passes=ternary-transform -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k test_ternary -vecz-passes=ternary-transform -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_cond_diff_strides.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_cond_diff_strides.ll
index 84620148e2478..3fe020663d73b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_cond_diff_strides.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_cond_diff_strides.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_ternary -vecz-passes=ternary-transform -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k test_ternary -vecz-passes=ternary-transform -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_condition.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_condition.ll
index 4e8f64ad12793..4bd5d6e4d1c51 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_condition.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_condition.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_ternary -vecz-passes=ternary-transform -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k test_ternary -vecz-passes=ternary-transform -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_condition_packetized.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_condition_packetized.ll
index dfe67263c6480..e068dd704d13b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_condition_packetized.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_condition_packetized.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_ternary -vecz-passes=ternary-transform,packetizer -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k test_ternary -vecz-passes=ternary-transform,packetizer -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_source.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_source.ll
index e46aa62838945..12b54cd4eb005 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_source.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_source.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_ternary -vecz-passes=ternary-transform -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k test_ternary -vecz-passes=ternary-transform -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_sources.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_sources.ll
index bac1dd59268ec..d27a4186ee311 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_sources.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_sources.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_ternary -vecz-passes=ternary-transform -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k test_ternary -vecz-passes=ternary-transform -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_packetization.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_packetization.ll
index f411cacc3328a..72d03f0395bf5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_packetization.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_packetization.ll
@@ -15,7 +15,7 @@
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 ; REQUIRES: linux
-; RUN: %veczc -k add -vecz-simd-width=128 -S < %s | %filecheck %s
+; RUN: veczc -k add -vecz-simd-width=128 -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_scalarization.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_scalarization.ll
index 1aa122a00703a..c30ed8f1549d5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_scalarization.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_scalarization.ll
@@ -15,7 +15,7 @@
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 ; REQUIRES: linux
-; RUN: %veczc -k add -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k add -vecz-simd-width=4 -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/undef_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/undef_debug_info.ll
index bf906b4421d2c..fdbbdaa70dfdf 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/undef_debug_info.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/undef_debug_info.ll
@@ -17,7 +17,7 @@
 ; Check that debug info intrinsics aren't created using undef values.
 ; These cause the backend to assert in codegen.
 
-; RUN: %veczc -k test_fn -S < %s | %filecheck %s
+; RUN: veczc -k test_fn -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/undef_ub.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/undef_ub.ll
index 2912620a122a6..c6b5c3aadbdb1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/undef_ub.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/undef_ub.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test -w 4 -S < %s | %filecheck %s
+; RUN: veczc -k test -w 4 -S < %s | FileCheck %s
 
 ; ModuleID = 'Unknown buffer'
 source_filename = "Unknown buffer"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_base.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_base.ll
index 8e853bded41c3..e59c91661f1eb 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_base.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_base.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k uniform_address_index -w 4 -S < %s | %filecheck %s
+; RUN: veczc -k uniform_address_index -w 4 -S < %s | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
 target triple = "spir-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_index.ll
index 8e853bded41c3..e59c91661f1eb 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_index.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_index.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k uniform_address_index -w 4 -S < %s | %filecheck %s
+; RUN: veczc -k uniform_address_index -w 4 -S < %s | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
 target triple = "spir-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop.ll
index fe624c9710b60..0b35a0f28a69c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test -w 4 -S < %s | %filecheck %s
+; RUN: veczc -k test -w 4 -S < %s | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
 target triple = "spir-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi1.ll
index 227ad0a41eaa8..b136be330fffd 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi1.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi1.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test -w 4 -S < %s | %filecheck %s
+; RUN: veczc -k test -w 4 -S < %s | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
 target triple = "spir-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi2.ll
index efb1cd8c26117..90b03a8169bb5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi2.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test -w 4 -S < %s | %filecheck %s
+; RUN: veczc -k test -w 4 -S < %s | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
 target triple = "spir-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi3.ll
index f7b64a8e24ced..be591e2e1eb61 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi3.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi3.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test -w 4 -S < %s | %filecheck %s
+; RUN: veczc -k test -w 4 -S < %s | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
 target triple = "spir-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi4.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi4.ll
index 5e2d194e69919..f1a86ad703fe2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi4.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi4.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test -w 4 -S < %s | %filecheck %s
+; RUN: veczc -k test -w 4 -S < %s | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
 target triple = "spir-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_metadata.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_metadata.ll
index 434cd9df3af19..075ec2ea1ac71 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_metadata.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_metadata.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test -w 4 -S < %s | %filecheck %s
+; RUN: veczc -k test -w 4 -S < %s | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
 target triple = "spir-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation1.ll
index 3ac51a2c18e02..2278613d19ebb 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation1.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation1.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k uniform_reassociation -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k uniform_reassociation -vecz-simd-width=4 -S < %s | FileCheck %s
 
 ; ModuleID = 'Unknown buffer'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation2.ll
index 44d59f35f3dec..32396838c6fa3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation2.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k uniform_reassociation -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k uniform_reassociation -vecz-simd-width=4 -S < %s | FileCheck %s
 
 ; ModuleID = 'Unknown buffer'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation3.ll
index 538f5e9b8d229..5d31ca92ed2be 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation3.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation3.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k uniform_reassociation -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k uniform_reassociation -vecz-simd-width=4 -S < %s | FileCheck %s
 
 ; ModuleID = 'Unknown buffer'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/unmangled_builtin_call.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/unmangled_builtin_call.ll
index 365ded52c7027..f694914f58e62 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/unmangled_builtin_call.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/unmangled_builtin_call.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k k_controlflow_loop_if -S < %s | %filecheck %s
+; RUN: veczc -k k_controlflow_loop_if -S < %s | FileCheck %s
 
 ; ModuleID = 'test.cl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/user_calls.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/user_calls.ll
index 0bee5edba1984..8ebf588e8235f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/user_calls.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/user_calls.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k entry -w 2 -vecz-handle-declaration-only-calls -vecz-passes=cfg-convert,packetizer -S < %s | %filecheck %s
+; RUN: veczc -k entry -w 2 -vecz-handle-declaration-only-calls -vecz-passes=cfg-convert,packetizer -S < %s | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/varying_load1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/varying_load1.ll
index b1cbef14bd152..316ae3ab78c98 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/varying_load1.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/varying_load1.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k varying_load1 -vecz-passes=cfg-convert -S < %s | %filecheck %s
+; RUN: veczc -k varying_load1 -vecz-passes=cfg-convert -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 source_filename = "kernel.opencl"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/varying_load2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/varying_load2.ll
index 7bd24a5255c5a..6484400e45602 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/varying_load2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/varying_load2.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k varying_load2 -vecz-passes=cfg-convert -S < %s | %filecheck %s
+; RUN: veczc -k varying_load2 -vecz-passes=cfg-convert -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 source_filename = "kernel.opencl"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_intrinsics_scalarization.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_intrinsics_scalarization.ll
index edc53c43d5ed1..d9912a284866b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_intrinsics_scalarization.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_intrinsics_scalarization.ll
@@ -15,7 +15,7 @@
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 ; Test the -cl-opt-disable compile option
-; RUN: %veczc -vecz-passes=scalarize -vecz-choices=FullScalarization -S < %s | %filecheck %s
+; RUN: veczc -vecz-passes=scalarize -vecz-choices=FullScalarization -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_uniform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_uniform.ll
index 3440be62739c1..c9e6e5c230429 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_uniform.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_uniform.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k vector_loop -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k vector_loop -vecz-simd-width=4 -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_varying.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_varying.ll
index 9cd71a06bcedb..6cccccaa70d7f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_varying.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_varying.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k vector_loop -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k vector_loop -vecz-simd-width=4 -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf.ll
index 5b7c052576409..42841969cf2ba 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | %filecheck %s
+; RUN: veczc -k test -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf32.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf32.ll
index 6678a13d3a18f..09fa585880423 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf32.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf32.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test32 -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | %filecheck %s
+; RUN: veczc -k test32 -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf64.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf64.ll
index 661c9da4c71a6..786d3867f31bb 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf64.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf64.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test64 -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | %filecheck %s
+; RUN: veczc -k test64 -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_floats.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_floats.ll
index f4061f3326c00..0c26001dcef81 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_floats.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_floats.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_float_vectors -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k test_float_vectors -vecz-simd-width=4 -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_floats_no_double_support.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_floats_no_double_support.ll
index 06c4e97bd888f..0098cb81fa0ff 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_floats_no_double_support.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_floats_no_double_support.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k test_float_vectors -vecz-simd-width=4 -vecz-double-support=false -vecz-choices=FullScalarization -S < %s | %filecheck %s
+; RUN: veczc -k test_float_vectors -vecz-simd-width=4 -vecz-double-support=false -vecz-choices=FullScalarization -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_blend_div_loop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_blend_div_loop.ll
index 8e64112885dc9..6fff2283f5329 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_blend_div_loop.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_blend_div_loop.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k blend_div_loop -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | %filecheck %s
+; RUN: veczc -k blend_div_loop -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | FileCheck %s
 
 ; ModuleID = 'Unknown buffer'
 source_filename = "kernel.opencl"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_scalar_gather_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_scalar_gather_load.ll
index 6a5ed7a82ebe2..debd60a1627dd 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_scalar_gather_load.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_scalar_gather_load.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k vecz_scalar_gather_load -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | %filecheck %s
+; RUN: veczc -k vecz_scalar_gather_load -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | FileCheck %s
 
 ; ModuleID = 'Unknown buffer'
 source_filename = "kernel.opencl"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_scalar_interleaved_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_scalar_interleaved_load.ll
index 410079f3bcbe8..749c56abe8b96 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_scalar_interleaved_load.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_scalar_interleaved_load.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k vecz_scalar_interleaved_load -vecz-passes=cfg-convert,packetizer -vecz-simd-width=4 -S < %s | %filecheck %s
+; RUN: veczc -k vecz_scalar_interleaved_load -vecz-passes=cfg-convert,packetizer -vecz-simd-width=4 -S < %s | FileCheck %s
 
 ; ModuleID = 'Unknown buffer'
 source_filename = "Unknown buffer"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/workitem_builtins.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/workitem_builtins.ll
index d8955549ab32d..f66905fcdb66a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/workitem_builtins.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/workitem_builtins.ll
@@ -14,7 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %veczc -k dont_mask_workitem_builtins -S < %s | %filecheck %s
+; RUN: veczc -k dont_mask_workitem_builtins -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

From 6e408436e658f504bcacb04037305196d7121174 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Thu, 22 Jun 2023 10:56:18 +0100
Subject: [PATCH 004/182] [compiler] Replace all uses of llvm::Optional

Since llvm::Optional is removed in LLVM 17, we need to migrate away from
it. Where it forms part of our public APIs I've moved to
multi_llvm::Optional, and for internal uses I've moved to std::optional.
---
 .../include/multi_llvm/optional_helper.h                  | 4 ++++
 .../compiler_passes/vecz/include/vecz/pass.h              | 3 +--
 .../compiler_passes/vecz/source/include/debugging.h       | 1 -
 .../source/transform/control_flow_conversion_pass.cpp     | 2 +-
 .../source/transform/interleaved_group_combine_pass.cpp   | 6 +++---
 .../compiler_passes/vecz/source/transform/packetizer.cpp  | 8 ++++----
 .../compiler_passes/vecz/source/transform/passes.cpp      | 3 ++-
 7 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/optional_helper.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/optional_helper.h
index 454b091dabfa3..d0465bc467e23 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/optional_helper.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/optional_helper.h
@@ -17,8 +17,12 @@
 #ifndef MULTI_LLVM_OPTIONAL_HELPER_H_INCLUDED
 #define MULTI_LLVM_OPTIONAL_HELPER_H_INCLUDED
 
+#include <multi_llvm/llvm_version.h>
+
+#if (LLVM_VERSION_MAJOR < 17)
 #include <llvm/ADT/None.h>
 #include <llvm/ADT/Optional.h>
+#endif
 
 #if (LLVM_VERSION_MAJOR >= 16)
 #include <optional>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/pass.h
index d9419af1024e8..17c429b9b0c7c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/pass.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/pass.h
@@ -22,7 +22,6 @@
 #define VECZ_PASS_H
 
 #include <compiler/utils/vectorization_factor.h>
-#include <llvm/ADT/Optional.h>
 #include <llvm/IR/PassManager.h>
 
 #include <cstdint>
@@ -61,7 +60,7 @@ struct VeczPassOptions {
   /// @brief Index of vectorization dimension to use (0 => x, 1 => y, 2 => z).
   uint32_t vec_dim_idx;
 
-  /// @param local_size Value specifying the local size for the function (0 is
+  /// @brief local_size Value specifying the local size for the function (0 is
   /// unknown)
   uint64_t local_size;
 };
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/debugging.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/debugging.h
index 2911916b2a9da..efe67adef667c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/debugging.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/debugging.h
@@ -21,7 +21,6 @@
 #ifndef VECZ_DEBUGGING_H_INCLUDED
 #define VECZ_DEBUGGING_H_INCLUDED
 
-#include <llvm/ADT/Optional.h>
 #include <llvm/ADT/StringRef.h>
 #include <llvm/IR/DiagnosticInfo.h>
 #include <llvm/IR/Function.h>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
index 05f3602613001..fe83e6b40b483 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
@@ -1100,7 +1100,7 @@ bool ControlFlowConversionState::Impl::applyMask(BasicBlock &BB, Value *mask) {
     if (tryApplyMaskToBinOp(I, mask, toDelete, safeDivisors)) {
       continue;
     }
-    Optional<MemOp> memOp = MemOp::get(&I);
+    multi_llvm::Optional<MemOp> memOp = MemOp::get(&I);
     // Turn loads and stores into masked loads and stores.
     if (memOp && (memOp->isLoad() || memOp->isStore())) {
       if (!tryApplyMaskToMemOp(*memOp, mask, toDelete)) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/interleaved_group_combine_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/interleaved_group_combine_pass.cpp
index 5d63a0a0dbecd..a10f26b8ca2d6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/interleaved_group_combine_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/interleaved_group_combine_pass.cpp
@@ -260,7 +260,7 @@ PreservedAnalyses InterleavedGroupCombinePass::run(
         continue;
       }
 
-      Optional<MemOp> Op = MemOp::get(CI);
+      multi_llvm::Optional<MemOp> Op = MemOp::get(CI);
       // We can't optimize interleaved memops if we don't know the stride at
       // runtime, since we need to check if the stride and the group size match.
       if (!Op || !Op->isStrideConstantInt()) {
@@ -324,7 +324,7 @@ PreservedAnalyses InterleavedGroupCombinePass::run(
             Group.Kind == eMaskedInterleavedLoad) {
           Masks.reserve(Group.Data.size());
           for (auto *V : Group.Data) {
-            Optional<MemOp> Op = MemOp::get(cast<Instruction>(V));
+            multi_llvm::Optional<MemOp> Op = MemOp::get(cast<Instruction>(V));
             assert(Op && "Unanalyzable interleaved access?");
             Masks.push_back(Op->getMaskOperand());
           }
@@ -455,7 +455,7 @@ bool InterleavedGroupCombinePass::findGroup(
           CanMove = canMoveUp(Group.Data, cast<Instruction>(InfoN.Op));
 
           if (InfoN.Kind == eMaskedInterleavedLoad) {
-            Optional<MemOp> Op = MemOp::get(InfoN.Op);
+            multi_llvm::Optional<MemOp> Op = MemOp::get(InfoN.Op);
             assert(Op && "Unanalyzable load?");
             if (auto *MaskInst = dyn_cast<Instruction>(Op->getMaskOperand())) {
               CanMove &= Group.canDeinterleaveMask(*MaskInst);
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
index b377881034085..cfe169738d685 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -38,10 +38,10 @@
 #include <multi_llvm/llvm_version.h>
 #include <multi_llvm/multi_llvm.h>
 #include <multi_llvm/opaque_pointers.h>
-#include <multi_llvm/optional_helper.h>
 #include <multi_llvm/vector_type_helper.h>
 
 #include <memory>
+#include <optional>
 
 #include "analysis/instantiation_analysis.h"
 #include "analysis/packetization_analysis.h"
@@ -1662,13 +1662,13 @@ ValuePacket Packetizer::Impl::packetizeSubgroupScan(
   // operations. The value 'None' here represents an operation where the sign
   // of the operands is unimportant, such as floating-point operations, or
   // integer addition.
-  multi_llvm::Optional<bool> optIsSignedInt;
+  std::optional<bool> optIsSignedInt;
   bool isInt = Tys[0]->isIntOrIntVectorTy();
 
   // Determine whether this is a signed or unsigned integer min/max scan.
-  const auto isSignedArg0 = [isInt, fnName, &mangler]() -> Optional<bool> {
+  const auto isSignedArg0 = [isInt, fnName, &mangler]() -> std::optional<bool> {
     if (!isInt) {
-      return multi_llvm::None;
+      return std::nullopt;
     }
     // Demangle the function name to get the type qualifiers.
     SmallVector<Type *, 2> types;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/passes.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/passes.cpp
index ec52a14847195..bab0daefa5bea 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/passes.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/passes.cpp
@@ -86,7 +86,8 @@ PreservedAnalyses SimplifyMaskedMemOpsPass::run(Function &F,
   TargetInfo &VTI = Ctx.targetInfo();
   std::vector<Instruction *> ToDelete;
   for (Function &Builtin : F.getParent()->functions()) {
-    Optional<MemOpDesc> BuiltinDesc = MemOpDesc::analyzeMaskedMemOp(Builtin);
+    multi_llvm::Optional<MemOpDesc> BuiltinDesc =
+        MemOpDesc::analyzeMaskedMemOp(Builtin);
     if (!BuiltinDesc) {
       continue;
     }

From 5eabd4d14f4c832117929515da8875621d08b0a5 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Thu, 22 Jun 2023 13:32:24 +0100
Subject: [PATCH 005/182] [multi_llvm] Add Triple.h wrapper

Triple.h moves in LLVM 17, so this provides a stable point with which to
include that header.
---
 .../include/multi_llvm/multi_llvm.h           |  2 +-
 .../include/multi_llvm/triple.h               | 27 +++++++++++++++++++
 .../vecz/source/vector_target_info.cpp        |  2 +-
 3 files changed, 29 insertions(+), 2 deletions(-)
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/triple.h

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
index 33238e60f936f..3592fa4128d9a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
@@ -18,7 +18,6 @@
 
 #include <llvm/ADT/ArrayRef.h>
 #include <llvm/ADT/SmallVector.h>
-#include <llvm/ADT/Triple.h>
 #include <llvm/Analysis/IVDescriptors.h>
 #include <llvm/Analysis/TargetLibraryInfo.h>
 #include <llvm/IR/BasicBlock.h>
@@ -26,6 +25,7 @@
 #include <llvm/IR/Instructions.h>
 #include <llvm/Transforms/Utils/Cloning.h>
 #include <multi_llvm/llvm_version.h>
+#include <multi_llvm/triple.h>
 
 namespace multi_llvm {
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/triple.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/triple.h
new file mode 100644
index 0000000000000..1b069001f7019
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/triple.h
@@ -0,0 +1,27 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#ifndef MULTI_LLVM_TRIPLE_H_INCLUDED
+#define MULTI_LLVM_TRIPLE_H_INCLUDED
+
+#include <multi_llvm/llvm_version.h>
+
+#if LLVM_VERSION_MAJOR >= 17
+#include <llvm/TargetParser/Triple.h>
+#else
+#include <llvm/ADT/Triple.h>
+#endif
+
+#endif  // MULTI_LLVM_TRIPLE_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
index 8f3094e93434a..2d22f5d5d3d5f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
@@ -14,7 +14,6 @@
 //
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-#include <llvm/ADT/Triple.h>
 #include <llvm/Analysis/TargetTransformInfo.h>
 #include <llvm/IR/IRBuilder.h>
 #include <llvm/IR/Instructions.h>
@@ -22,6 +21,7 @@
 #include <llvm/Target/TargetMachine.h>
 #include <multi_llvm/creation_apis_helper.h>
 #include <multi_llvm/opaque_pointers.h>
+#include <multi_llvm/triple.h>
 #include <multi_llvm/vector_type_helper.h>
 
 #include "debugging.h"

From f01fae7c7c2b5d0fcb257e2633d1ae06dc3df446 Mon Sep 17 00:00:00 2001
From: Ori Sky Farrell <ori@codeplay.com>
Date: Thu, 22 Jun 2023 12:20:48 +0100
Subject: [PATCH 006/182] Run clang-format-9 on everything

This patch runs clang-format-9 across all relevant files in the
repository, excluding external files.
---
 .../vecz/source/include/transform/scalarizer.h                | 4 ++--
 .../compiler_passes/vecz/source/vector_target_info_arm.cpp    | 1 -
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/scalarizer.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/scalarizer.h
index 224ab9a6cb439..757a7b7fbb926 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/scalarizer.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/scalarizer.h
@@ -189,8 +189,8 @@ class Scalarizer {
   ///
   /// @return Packet containing scalarized values or null.
   SimdPacket *scalarizeBinaryOp(llvm::BinaryOperator *BinOp, PacketMask PM);
-// Freeze instruction is not available in LLVM versions prior 10.0
-// and not used in LLVM versions prior to 11.0
+  // Freeze instruction is not available in LLVM versions prior 10.0
+  // and not used in LLVM versions prior to 11.0
   /// @brief Scalarize a freeze instruction.
   ///
   /// @param[in] FreezeInst Instruction to scalarize.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_arm.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_arm.cpp
index cb7b220cabd45..912aabeb060f6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_arm.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_arm.cpp
@@ -18,7 +18,6 @@
 #include <llvm/IR/Instructions.h>
 #include <llvm/IR/IntrinsicsAArch64.h>
 #include <llvm/IR/IntrinsicsARM.h>
-
 #include <multi_llvm/vector_type_helper.h>
 
 #include "debugging.h"

From f64652a640bf5bfc7a45fa60231eafc494a0b92c Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Mon, 26 Jun 2023 14:19:55 +0100
Subject: [PATCH 007/182] [vecz] Update vsetvli intrinsic for LLVM 17

The 'opt' version was removed and the baseline intrinsic was made
optimizable.
---
 .../compiler_passes/vecz/source/vector_target_info_riscv.cpp  | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp
index a9a58cad5326c..d9046ad27b7f5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp
@@ -745,7 +745,11 @@ Value *TargetInfoRISCV::createVPKernelWidth(IRBuilder<> &B,
   auto *const I64Ty = Type::getInt64Ty(B.getContext());
 
   auto *const VL =
+#if LLVM_VERSION_GREATER_EQUAL(17, 0)
+      B.CreateIntrinsic(Intrinsic::RISCVIntrinsics::riscv_vsetvli, {I64Ty},
+#else
       B.CreateIntrinsic(Intrinsic::RISCVIntrinsics::riscv_vsetvli_opt, {I64Ty},
+#endif
                         {RemainingIters, VSEW, VLMul});
 
   return B.CreateTrunc(VL, I32Ty);

From 148a386747e8907e34a53926e12d435d3f03601f Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Mon, 26 Jun 2023 14:30:08 +0100
Subject: [PATCH 008/182] [compiler] Adapt to removal of llvm.dbg.addr in LLVM
 17+

This intrinsic was removed and llvm.dbg.declare is the only intrinsic
we'd expect in this situation.
---
 .../vecz/source/transform/basic_mem2reg_pass.cpp              | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp
index bfc937f441009..15a85bff5b5b8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp
@@ -182,7 +182,11 @@ bool BasicMem2RegPass::promoteAlloca(AllocaInst *Alloca) const {
       StoredValue = Store->getValueOperand();
       ToDelete.push_back(Store);
       DIBuilder DIB(*Alloca->getModule(), /*AllowUnresolved*/ false);
+#if LLVM_VERSION_GREATER_EQUAL(17, 0)
+      auto DbgIntrinsics = FindDbgDeclareUses(Alloca);
+#else
       auto DbgIntrinsics = FindDbgAddrUses(Alloca);
+#endif
       for (auto oldDII : DbgIntrinsics) {
         ConvertDebugDeclareToDebugValue(oldDII, Store, DIB);
       }

From 1c957a041004ca75fbd382cb9af63c31fdf0638e Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Mon, 26 Jun 2023 15:13:52 +0100
Subject: [PATCH 009/182] [compiler] Drop last traces of LLVM 14 support

---
 .../include/multi_llvm/opaque_pointers.h         | 14 ++------------
 .../include/multi_llvm/optional_helper.h         | 12 ------------
 .../transform/control_flow_conversion_pass.cpp   |  7 +------
 .../vecz/source/transform/scalarizer.cpp         |  5 -----
 .../vecz/source/vector_target_info_riscv.cpp     | 16 ++++------------
 .../compiler_passes/vecz/tools/source/veczc.cpp  |  4 +---
 6 files changed, 8 insertions(+), 50 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/opaque_pointers.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/opaque_pointers.h
index 9c871f9a0c3ef..4a3dc772ca944 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/opaque_pointers.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/opaque_pointers.h
@@ -27,28 +27,18 @@ inline bool isOpaquePointerTy(llvm::Type *Ty) {
   return false;
 }
 
-inline bool isOpaqueOrPointeeTypeMatches(llvm::PointerType *PTy,
-                                         llvm::Type *EltTy) {
-#if LLVM_VERSION_MAJOR >= 15
-  (void)EltTy;
+inline bool isOpaqueOrPointeeTypeMatches(llvm::PointerType *PTy, llvm::Type *) {
   (void)PTy;
   assert(PTy->isOpaque() && "No support for typed pointers in LLVM 15+");
   return true;
-#else
-  return PTy->isOpaque() || PTy->getPointerElementType() == EltTy;
-#endif
 }
 
 inline llvm::Type *getPtrElementType(llvm::PointerType *PTy) {
   if (PTy->isOpaque()) {
     return nullptr;
   }
-#if LLVM_VERSION_MAJOR >= 15
-  assert(false && "No support for typed pointers in LLVM 15+");
+  assert(false && "No support for typed pointers");
   return nullptr;
-#else
-  return PTy->getPointerElementType();
-#endif
 }
 
 };  // namespace multi_llvm
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/optional_helper.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/optional_helper.h
index d0465bc467e23..13fdd88569e7c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/optional_helper.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/optional_helper.h
@@ -64,18 +64,6 @@ class Optional : public llvm::Optional<T> {
   inline constexpr bool has_value() const {
     return llvm::Optional<T>::hasValue();
   }
-
-#if (LLVM_VERSION_MAJOR <= 14)
-  inline constexpr const T &value() const {
-    return llvm::Optional<T>::getValue();
-  }
-  inline constexpr T &value() { return llvm::Optional<T>::getValue(); }
-
-  template <typename U>
-  constexpr T value_or(U &&alt) const & {
-    return llvm::Optional<T>::getValueOr(alt);
-  }
-#endif
 };
 
 #endif
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
index fe83e6b40b483..9b60d4dc64467 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
@@ -31,8 +31,7 @@
 #include <llvm/IR/IRBuilder.h>
 #include <llvm/Support/Debug.h>
 #include <llvm/Support/raw_ostream.h>
-#include <multi_llvm/multi_llvm.h>
-#include <multi_llvm/vector_type_helper.h>
+#include <multi_llvm/optional_helper.h>
 
 #include <queue>
 #include <utility>
@@ -3064,11 +3063,7 @@ bool ControlFlowConversionState::Impl::simplifyMasks() {
         if (I.use_empty()) {
           toDelete.push_back(&I);
         } else {
-#if LLVM_VERSION_GREATER_EQUAL(15, 0)
           Value *simpleMask = simplifyInstruction(&I, Q);
-#else
-          Value *simpleMask = SimplifyInstruction(&I, Q);
-#endif
           if (simpleMask && simpleMask != &I) {
             I.replaceAllUsesWith(simpleMask);
             toDelete.push_back(&I);
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
index 25ae2d4433073..d20d7794f5b32 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
@@ -27,7 +27,6 @@
 #include <llvm/IR/Intrinsics.h>
 #include <llvm/Support/Debug.h>
 #include <llvm/Support/raw_ostream.h>
-#include <multi_llvm/llvm_version.h>
 #include <multi_llvm/multi_llvm.h>
 #include <multi_llvm/vector_type_helper.h>
 
@@ -667,11 +666,7 @@ SimdPacket *Scalarizer::extractLanes(llvm::Value *V, PacketMask PM) {
     }
 
     Value *Idx = B.getInt32(i);
-#if LLVM_VERSION_GREATER_EQUAL(15, 0)
     Value *Extract = simplifyExtractElementInst(V, Idx, Q);
-#else
-    Value *Extract = SimplifyExtractElementInst(V, Idx, Q);
-#endif
     if (!Extract) {
       Extract = B.CreateExtractElement(V, Idx);
     }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp
index d9046ad27b7f5..5907b1aa14feb 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp
@@ -360,10 +360,8 @@ llvm::Value *TargetInfoRISCV::createScalableExtractElement(
                                  indices, zero);
 
   SmallVector<Value *, 4> ops;
-#if LLVM_VERSION_GREATER_EQUAL(15, 0)
-  // LLVM 15+ has a pass-through operand - we set it to undef.
+  // Add the pass-through operand - we set it to undef.
   ops.push_back(UndefValue::get(srcTy));
-#endif
   ops.push_back(src);
   ops.push_back(indices);
   ops.push_back(avl);
@@ -428,10 +426,8 @@ llvm::Value *TargetInfoRISCV::createScalableBroadcast(llvm::IRBuilder<> &B,
   auto *const avl = getIntrinsicVL(B, VL, wideTy, getTargetMachine());
 
   SmallVector<Value *, 4> ops;
-#if LLVM_VERSION_GREATER_EQUAL(15, 0)
-  // LLVM 15+ has a pass-through operand - we set it to undef.
+  // Add the pass-through operand - we set it to undef.
   ops.push_back(UndefValue::get(vs2->getType()));
-#endif
   ops.push_back(vs2);
   ops.push_back(vs1);
   ops.push_back(avl);
@@ -641,10 +637,8 @@ llvm::Value *TargetInfoRISCV::createVectorShuffle(llvm::IRBuilder<> &B,
   auto *const avl = getIntrinsicVL(B, VL, gatherTy, getTargetMachine());
 
   SmallVector<Value *, 4> ops;
-#if LLVM_VERSION_GREATER_EQUAL(15, 0)
-  // LLVM 15+ has a pass-through operand - we set it to undef.
+  // Add the pass-through operand - we set it to undef.
   ops.push_back(UndefValue::get(gatherTy));
-#endif
   ops.push_back(src);
   ops.push_back(mask);
   ops.push_back(avl);
@@ -676,10 +670,8 @@ llvm::Value *TargetInfoRISCV::createVectorSlideUp(llvm::IRBuilder<> &B,
   auto *const avl = getIntrinsicVL(B, VL, srcTy, getTargetMachine());
 
   SmallVector<Value *, 4> ops;
-#if LLVM_VERSION_GREATER_EQUAL(15, 0)
-  // LLVM 15+ has a pass-through operand - we set it to undef.
+  // Add the pass-through operand - we set it to undef.
   ops.push_back(UndefValue::get(srcTy));
-#endif
   ops.push_back(src);
   ops.push_back(insert);
   ops.push_back(avl);
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp
index 7ff39a21eff04..3cc04f29a25bd 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp
@@ -46,7 +46,6 @@
 
 #include <string>
 
-#include "multi_llvm/multi_llvm.h"
 #include "vecz/pass.h"
 #include "vecz/vecz_target_info.h"
 
@@ -287,9 +286,8 @@ int main(const int argc, const char *const argv[]) {
 
   llvm::SMDiagnostic err;
   llvm::LLVMContext context;
-#if LLVM_VERSION_GREATER_EQUAL(15, 0)
   context.setOpaquePointers(true);
-#endif
+
   std::unique_ptr<llvm::Module> module =
       llvm::parseIRFile(InputFilename, err, context);
 

From 7a0e6550e13972ceed33e34a624a62d561e1e2f5 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Thu, 29 Jun 2023 16:52:35 +0100
Subject: [PATCH 010/182] [multi_llvm] Provide implicit conversion from
 Optional to std::optional

---
 .../include/multi_llvm/optional_helper.h                 | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/optional_helper.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/optional_helper.h
index 13fdd88569e7c..410606bc736ff 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/optional_helper.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/optional_helper.h
@@ -24,9 +24,7 @@
 #include <llvm/ADT/Optional.h>
 #endif
 
-#if (LLVM_VERSION_MAJOR >= 16)
 #include <optional>
-#endif
 
 namespace multi_llvm {
 
@@ -64,6 +62,13 @@ class Optional : public llvm::Optional<T> {
   inline constexpr bool has_value() const {
     return llvm::Optional<T>::hasValue();
   }
+
+  // Provide implicit conversions to the future-proof std::optional.
+  inline constexpr operator std::optional<T>() const {
+    return llvm::Optional<T>::hasValue()
+               ? std::optional<T>(llvm::Optional<T>::getValue())
+               : std::nullopt;
+  }
 };
 
 #endif

From 0dafdeb07b0a241555c5964e1cb6d357c9dd95f3 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Tue, 4 Jul 2023 15:37:30 +0100
Subject: [PATCH 011/182] [compiler] Remove need for passing Module to
 NameMangler

This was only used for its LLVMContext, which we already provide to the
mangler upon construction - perhaps for legacy reasons.
---
 .../compiler_pipeline/include/multi_llvm/multi_llvm.h         | 4 ++--
 .../compiler_passes/vecz/source/vectorization_context.cpp     | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
index 3592fa4128d9a..a9c99fcb3f61c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
@@ -63,9 +63,9 @@ inline llvm::InlineResult InlineFunction(llvm::CallInst *CI,
 #endif
 }
 
-inline llvm::StructType *getStructTypeByName(llvm::Module &module,
+inline llvm::StructType *getStructTypeByName(llvm::LLVMContext &ctx,
                                              llvm::StringRef name) {
-  return llvm::StructType::getTypeByName(module.getContext(), name);
+  return llvm::StructType::getTypeByName(ctx, name);
 }
 
 inline llvm::DILocation *getDILocation(unsigned Line, unsigned Column,
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
index 8bf445572b7b3..5e27bf0632efc 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
@@ -143,7 +143,7 @@ VectorizationResult &VectorizationContext::getOrCreateBuiltin(
     return result;
   }
 
-  compiler::utils::NameMangler Mangler(&F.getContext(), &Module);
+  compiler::utils::NameMangler Mangler(&F.getContext());
   auto const BuiltinName = Mangler.demangleName(F.getName()).str();
 
   result.func = VectorCallee;

From 29945d212de05ac96b2242200339bf4a31232aa3 Mon Sep 17 00:00:00 2001
From: Amy <135044214+AmyCodeplay@users.noreply.github.com>
Date: Mon, 17 Jul 2023 13:29:38 +0100
Subject: [PATCH 012/182] Subgroup broadcast of uniform value becomes a NOP
 (#59)

---
 .../vecz/source/transform/packetizer.cpp      | 14 +++++-
 .../vecz/test/lit/llvm/subgroup_broadcast.ll  | 46 +++++++++++++++++++
 2 files changed, 58 insertions(+), 2 deletions(-)
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_broadcast.ll

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
index cfe169738d685..412d5dea9a84f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -1258,10 +1258,20 @@ Value *Packetizer::Impl::packetizeSubgroupBroadcast(Instruction *I) {
 
   IRBuilder<> B(buildAfter(CI, F));
 
-  auto *const idx = CI->getArgOperand(1);
+  auto *const src = CI->getArgOperand(0);
 
-  auto op = packetize(CI->getArgOperand(0));
+  auto op = packetize(src);
   PACK_FAIL_IF(!op);
+
+  // If the source operand happened to be a broadcast value already, we can use
+  // it directly.
+  if (op.info->numInstances == 0) {
+    IC.deleteInstructionLater(CI);
+    CI->replaceAllUsesWith(src);
+    return src;
+  }
+
+  auto *const idx = CI->getArgOperand(1);
   Value *val = nullptr;
   // Optimize the constant fixed-vector case, where we can choose the exact
   // subpacket to extract from directly.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_broadcast.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_broadcast.ll
new file mode 100644
index 0000000000000..e43769a8627c4
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_broadcast.ll
@@ -0,0 +1,46 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -vecz-simd-width=4 -S < %s | FileCheck %s 
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare spir_func i32 @_Z16get_sub_group_idv()
+declare spir_func i32 @_Z22get_sub_group_local_idv()
+declare spir_func i32 @_Z19sub_group_broadcastij(i32, i32)
+
+; It makes sure broadcast still works when its source operand is uniform
+define spir_kernel void @sub_group_broadcast(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+  %call = tail call spir_func i32 @_Z16get_sub_group_idv()
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %call
+  %v = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %broadcast = call spir_func i32 @_Z19sub_group_broadcastij(i32 %v, i32 0)
+  %idx = tail call spir_func i32 @_Z22get_sub_group_local_idv()
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %idx
+  store i32 %broadcast, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_sub_group_broadcast(
+; CHECK: [[LD:%.+]] = load i32, ptr addrspace(1) %{{.+}}, align 4
+; CHECK: [[INS:%.+]] = insertelement <4 x i32> poison, i32 [[LD]], i64 0
+; CHECK: [[BCAST:%.+]] = shufflevector <4 x i32> [[INS]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK: store <4 x i32> [[BCAST]], ptr addrspace(1) %out, align 4
+
+!opencl.ocl.version = !{!0}
+
+!0 = !{i32 3, i32 0}

From 84807258677a9fc2693ba2d463118417117b0a01 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Tue, 25 Jul 2023 16:49:29 +0100
Subject: [PATCH 013/182] [vecz] Update some lit tests to account for LLVM 17

* LLVM 17 started introducing new NaNs during a fold.
* LLVM 17 saw the internal 'undef' global variable, and wasn't
  vectorizing as it determined its users to also be undef, which makes
  sense.
---
 .../vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll   | 5 ++---
 .../compiler_passes/vecz/test/lit/llvm/contiguous_allocas.ll | 2 +-
 .../vecz/test/lit/llvm/overaligned_allocas.ll                | 2 +-
 3 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll
index aeaf099a739b8..41c9ec2762099 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll
@@ -15,7 +15,6 @@
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 ; NOTE: Assertions have been autogenerated by scripts/testing/update_veczc_checks.py
-; REQUIRES: llvm-13+
 ; RUN: veczc -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
@@ -29,7 +28,7 @@ entry:
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x float> addrspace(1)*
   %1 = load <4 x float>, <4 x float> addrspace(1)* %0, align 16
-  %2 = fadd <4 x float> %1, <float 0x7FF0000020000000, float 0x7FF0000020000000, float 0x7FF0000020000000, float 0x7FF0000020000000>
+  %2 = fadd <4 x float> %1, <float 0x7FF8000020000000, float 0x7FF8000020000000, float 0x7FF8000020000000, float 0x7FF8000020000000>
   %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call
   store <4 x float> %2, <4 x float> addrspace(1)* %arrayidx3, align 16
   ret void
@@ -102,7 +101,7 @@ entry:
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CALL:%.*]] = tail call spir_func i64 @_Z13get_global_idj(i32 0)
 ; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]]
-; CHECK-NEXT:    store <vscale x 16 x float> shufflevector (<vscale x 16 x float> insertelement (<vscale x 16 x float> {{(undef|poison)}}, float 0x7FF0000020000000, {{(i32|i64)}} 0), <vscale x 16 x float> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer), ptr addrspace(1) [[ARRAYIDX3]], align 16
+; CHECK-NEXT:    store <vscale x 16 x float> shufflevector (<vscale x 16 x float> insertelement (<vscale x 16 x float> {{(undef|poison)}}, float 0x7FF8000020000000, {{(i32|i64)}} 0), <vscale x 16 x float> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer), ptr addrspace(1) [[ARRAYIDX3]], align 16
 ; CHECK-NEXT:    ret void
 
 ; CHECK-LABEL: @__vecz_nxv4_vector_broadcast(
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/contiguous_allocas.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/contiguous_allocas.ll
index e58f85ead47e3..819c952ce071f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/contiguous_allocas.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/contiguous_allocas.ll
@@ -20,7 +20,7 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-@entry_test_alloca.lm = internal unnamed_addr addrspace(3) constant [16 x <2 x float>] undef, align 8
+@entry_test_alloca.lm = external unnamed_addr addrspace(3) constant [16 x <2 x float>], align 8
 
 define spir_kernel void @test(<2 x float> addrspace(1)* nocapture readonly %in, <2 x float> addrspace(1)* nocapture %out, i32 %offset) local_unnamed_addr {
 entry:
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/overaligned_allocas.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/overaligned_allocas.ll
index 12b78d09d44aa..069a82c6f7449 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/overaligned_allocas.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/overaligned_allocas.ll
@@ -20,7 +20,7 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-@entry_test_alloca.lm = internal unnamed_addr addrspace(3) constant [16 x <2 x float>] undef, align 8
+@entry_test_alloca.lm = external unnamed_addr addrspace(3) constant [16 x <2 x float>], align 8
 
 define spir_kernel void @test(<2 x float> addrspace(1)* nocapture readonly %in, <2 x float> addrspace(1)* nocapture %out, i32 %offset) local_unnamed_addr {
 entry:

From 5a5439431e3aa428fb8941fa6c04282ac8ff912d Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Wed, 9 Aug 2023 11:53:48 +0100
Subject: [PATCH 014/182] [compiler] Enable mux subgroup/workgroup builtins

This enables the use of the mux subgroup and workgroup 'collective'
builtins in the pipeline, replacing the OpenCL ones early on.

The passes which used to care about the OpenCL builtins have been
updated to care about the mux builtins. There is no real change in
behaviour, except in a few cases where OpenCL any/all worked on i32
types but the mux versions work on i1 types: vecz was already
translating between these two types, so there's less to do now.

This should simplify a lot of the code.

* The mux sub-group builtins are defined by DefineMuxBuiltinsPass; there
  is no 'trivial' implementation in the OpenCL headers.
* As such, the LinkBuiltinsPass no longer needs to know whether it's
  'early' or not, as the sub-group builtins are declarations and are
  thus safe to link at any point.
* We can get away with doing a lot less function name demangling,
  mangling and manipulation, as we have exposed enums for the builtin
  IDs.
  * The DegenerateSubGroupPass still does name shenanigans, but we can
    fix that up separately.
---
 .../analysis/uniform_value_analysis.cpp       |  17 +-
 .../inline_post_vectorization_pass.cpp        |   2 +-
 .../vecz/source/transform/packetizer.cpp      | 254 +++++-------------
 .../llvm/ScalableVectors/subgroup_builtins.ll |  18 +-
 .../llvm/ScalableVectors/subgroup_scans.ll    |  60 ++---
 ...cans_spv_khr_uniform_group_instructions.ll |  66 +++--
 ...s_spv_khr_uniform_group_instructions_vp.ll |  66 +++--
 .../llvm/ScalableVectors/subgroup_scans_vp.ll |  60 ++---
 .../compute_vector_length.ll                  |   8 +-
 .../VectorPredication/subgroup_reductions.ll  | 104 +++----
 ...ions_spv_khr_uniform_group_instructions.ll |  80 +++---
 .../llvm/VectorPredication/subgroup_scans.ll  |  60 ++---
 ...cans_spv_khr_uniform_group_instructions.ll |  66 +++--
 .../vecz/test/lit/llvm/subgroup_broadcast.ll  |  12 +-
 .../vecz/test/lit/llvm/subgroup_builtins.ll   |  38 ++-
 .../vecz/test/lit/llvm/subgroup_reductions.ll | 108 ++++----
 ...ions_spv_khr_uniform_group_instructions.ll |  80 +++---
 .../vecz/test/lit/llvm/subgroup_scans.ll      |  88 +++---
 ...cans_spv_khr_uniform_group_instructions.ll |  66 +++--
 19 files changed, 547 insertions(+), 706 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp
index 98d6ae406fae2..bd7eb2b452c61 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp
@@ -123,11 +123,20 @@ static bool isSubgroupBroadcastOrReduction(
     return false;
   }
   auto const Builtin = BI.analyzeBuiltin(*Callee);
-  if (compiler::utils::eBuiltinSubgroupReduceInvalid !=
-      BI.getBuiltinSubgroupReductionKind(Builtin)) {
-    return true;
+  if (auto Info = BI.isMuxGroupCollective(Builtin.ID);
+      Info &&
+      Info->Scope == compiler::utils::GroupCollective::ScopeKind::SubGroup) {
+    switch (Info->Op) {
+      default:
+        return false;
+      case compiler::utils::GroupCollective::OpKind::Any:
+      case compiler::utils::GroupCollective::OpKind::All:
+      case compiler::utils::GroupCollective::OpKind::Reduction:
+      case compiler::utils::GroupCollective::OpKind::Broadcast:
+        return true;
+    }
   }
-  return Builtin.isValid() && Builtin.ID == BI.getSubgroupBroadcastBuiltin();
+  return false;
 }
 
 void UniformValueResult::findVectorLeaves(
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/inline_post_vectorization_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/inline_post_vectorization_pass.cpp
index 98b03796c53b1..65c4c120242b6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/inline_post_vectorization_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/inline_post_vectorization_pass.cpp
@@ -78,7 +78,7 @@ Value *processCallSite(CallInst *CI, bool &NeedLLVMInline,
   // order to maintain equivalence between the scalar/vector forms. Do this
   // here due to a tight coupling between the vectorized version and these
   // remaining scalar versions.
-  if (Builtin.isValid() && Builtin.ID == BI.getSubgroupLocalIdBuiltin()) {
+  if (Builtin.ID == compiler::utils::eMuxBuiltinGetSubGroupLocalId) {
     return ConstantInt::getNullValue(CI->getType());
   }
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
index 412d5dea9a84f..a8a01788d3069 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -221,8 +221,8 @@ class Packetizer::Impl : public Packetizer {
   /// @param[in] SubgroupScanKind type of subgroup scan to packetized.
   ///
   /// @return Packetized values.
-  ValuePacket packetizeSubgroupScan(
-      CallInst *CI, compiler::utils::BuiltinSubgroupScanKind SubgroupScanKind);
+  ValuePacket packetizeSubgroupScan(CallInst *CI,
+                                    compiler::utils::GroupCollective Scan);
   /// @brief Perform post-packetization tasks for the given scalar value.
   ///
   /// @param[in] Scalar Scalar value to assign a vectorized value.
@@ -683,10 +683,10 @@ bool Packetizer::Impl::packetize() {
 
   compiler::utils::NameMangler Mangler(&F.getContext());
 
-  // Handle get_sub_group_size specially (i.e., not in BuiltinInfo) since
-  // inlining it requires extra vectorization context, such as the
-  // vectorization width and choices; this inlining is too tightly coupled to
-  // the vectorizer context to exist in a generic sense.
+  // Handle __mux_get_sub_group_size specially (i.e., not in BuiltinInfo) since
+  // inlining it requires extra vectorization context, such as the vectorization
+  // width and choices; this inlining is too tightly coupled to the vectorizer
+  // context to exist in a generic sense.
   for (auto &BB : F) {
     for (auto &I : BB) {
       CallInst *CI = dyn_cast<CallInst>(&I);
@@ -695,8 +695,8 @@ bool Packetizer::Impl::packetize() {
       }
 
       auto *const Callee = CI->getCalledFunction();
-      if (Callee &&
-          "get_sub_group_size" == Mangler.demangleName(Callee->getName())) {
+      if (Callee && Ctx.builtins().analyzeBuiltin(*Callee).ID ==
+                        compiler::utils::eMuxBuiltinGetSubGroupSize) {
         auto *const replacement = [this](CallInst *CI) -> Value * {
           if (VL) {
             return VL;
@@ -1084,11 +1084,20 @@ Value *Packetizer::Impl::packetizeSubgroupReduction(Instruction *I) {
   Function *callee = CI->getCalledFunction();
 
   auto const Builtin = BI.analyzeBuiltin(*callee);
-  auto const subgroupReduceKind = BI.getBuiltinSubgroupReductionKind(Builtin);
+  auto const Info = BI.isMuxGroupCollective(Builtin.ID);
 
-  if (subgroupReduceKind == compiler::utils::eBuiltinSubgroupReduceInvalid) {
+  if (!Info ||
+      Info->Scope != compiler::utils::GroupCollective::ScopeKind::SubGroup) {
     return nullptr;
   }
+  switch (Info->Op) {
+    default:
+      return nullptr;
+    case compiler::utils::GroupCollective::OpKind::Any:
+    case compiler::utils::GroupCollective::OpKind::All:
+    case compiler::utils::GroupCollective::OpKind::Reduction:
+      break;
+  }
 
   SmallVector<Value *, 16> opPackets;
   IRBuilder<> B(buildAfter(CI, F));
@@ -1106,103 +1115,18 @@ Value *Packetizer::Impl::packetizeSubgroupReduction(Instruction *I) {
 
   auto op = packetize(CI->getArgOperand(0));
 
-  bool isSignedInt = false;
-  bool const isFP = argTy->isFPOrFPVectorTy();
-  bool const isBool = argTy->isIntOrIntVectorTy(/*BitWidth*/ 1);
-  (void)isBool;
-
-  // Determine whether this is a signed or unsigned integer min/max reduction.
-  if (!isFP &&
-      (subgroupReduceKind == compiler::utils::eBuiltinSubgroupReduceMax ||
-       subgroupReduceKind == compiler::utils::eBuiltinSubgroupReduceMin)) {
-    // Demangle the function name to get the type qualifiers.
-    SmallVector<Type *, 2> Types;
-    SmallVector<compiler::utils::TypeQualifiers, 2> Quals;
-    compiler::utils::NameMangler Mangler(&F.getContext());
-    if (!Mangler.demangleName(callee->getName(), Types, Quals).empty()) {
-      assert(!Quals.empty());
-      auto &Qual = Quals[0];
-      while (!isSignedInt && Qual.getCount()) {
-        isSignedInt |= Qual.pop_front() == compiler::utils::eTypeQualSignedInt;
-      }
-    }
-  }
-
-  RecurKind recurK;
-  switch (subgroupReduceKind) {
-    default:
-      emitVeczRemarkMissed(&F, nullptr, "Unimplemented subgroup reduction");
-      VECZ_FAIL();
-      break;
-    case compiler::utils::eBuiltinSubgroupAll:
-      recurK = RecurKind::And;
-      break;
-    case compiler::utils::eBuiltinSubgroupAny:
-      recurK = RecurKind::Or;
-      break;
-    case compiler::utils::eBuiltinSubgroupReduceAdd:
-      recurK = isFP ? RecurKind::FAdd : RecurKind::Add;
-      break;
-    case compiler::utils::eBuiltinSubgroupReduceMin:
-      recurK = isFP ? RecurKind::FMin
-                    : (isSignedInt ? RecurKind::SMin : RecurKind::UMin);
-      break;
-    case compiler::utils::eBuiltinSubgroupReduceMax:
-      recurK = isFP ? RecurKind::FMax
-                    : (isSignedInt ? RecurKind::SMax : RecurKind::UMax);
-      break;
-    // SPV_KHR_uniform_group_instructions
-    case compiler::utils::eBuiltinSubgroupReduceMul:
-      recurK = isFP ? RecurKind::FMul : RecurKind::Mul;
-      break;
-    case compiler::utils::eBuiltinSubgroupReduceAnd:
-      assert(!isFP && "Invalid subgroup reduction");
-      recurK = RecurKind::And;
-      break;
-    case compiler::utils::eBuiltinSubgroupReduceOr:
-      assert(!isFP && "Invalid subgroup reduction");
-      recurK = RecurKind::Or;
-      break;
-    case compiler::utils::eBuiltinSubgroupReduceXor:
-      assert(!isFP && "Invalid subgroup reduction");
-      recurK = RecurKind::Xor;
-      break;
-    case compiler::utils::eBuiltinSubgroupReduceLogicalAnd:
-      assert(isBool && "Invalid subgroup reduction");
-      recurK = RecurKind::And;
-      break;
-    case compiler::utils::eBuiltinSubgroupReduceLogicalOr:
-      assert(isBool && "Invalid subgroup reduction");
-      recurK = RecurKind::Or;
-      break;
-    case compiler::utils::eBuiltinSubgroupReduceLogicalXor:
-      assert(isBool && "Invalid subgroup reduction");
-      recurK = RecurKind::Xor;
-      break;
-  }
-
   // Reduce the packet values in-place.
   // TODO: can we add 'reassoc' to the floating-point reductions to absolve
   // them of ordering? See CA-3969.
   op.getPacketValues(packetWidth, opPackets);
 
-  // Any/All reductions are defined as reducing over the i32 value being
-  // "evaluated to non-zero", so emit the required comparisons.
-  if (subgroupReduceKind == compiler::utils::eBuiltinSubgroupAll ||
-      subgroupReduceKind == compiler::utils::eBuiltinSubgroupAny) {
-    for (unsigned i = 0, e = opPackets.size(); i != e; i++) {
-      opPackets[i] = B.CreateICmpNE(
-          opPackets[i], ConstantInt::get(opPackets[i]->getType(), 0));
-    }
-  }
-
   // When in VP mode, pre-sanitize the reduction input (before VP reduction
   // intrinsics, introduced in LLVM 14)
   if (VL) {
     assert(opPackets.size() == 1 &&
            "Should have bailed if dealing with more than one packet");
     Value *&val = opPackets.front();
-    val = sanitizeVPReductionInput(B, val, VL, recurK);
+    val = sanitizeVPReductionInput(B, val, VL, Info->Recurrence);
     if (!val) {
       emitVeczRemarkMissed(&F, CI,
                            "Can not vector-predicate subgroup reduction");
@@ -1222,22 +1146,17 @@ Value *Packetizer::Impl::packetizeSubgroupReduction(Instruction *I) {
     for (decltype(packetWidth) i = 0; i < packetWidth; ++i) {
       Value *const lhs = opPackets[i];
       Value *const rhs = opPackets[i + packetWidth];
-      opPackets[i] = multi_llvm::createBinOpForRecurKind(B, lhs, rhs, recurK);
+      opPackets[i] =
+          multi_llvm::createBinOpForRecurKind(B, lhs, rhs, Info->Recurrence);
     }
   }
 
   // Reduce to a scalar.
-  Value *v = createSimpleTargetReduction(B, &TTI, opPackets.front(), recurK);
+  Value *v =
+      createSimpleTargetReduction(B, &TTI, opPackets.front(), Info->Recurrence);
 
   IC.deleteInstructionLater(CI);
 
-  // For any/all reductions we have to get back from an i1 to the original
-  // type.
-  if (subgroupReduceKind == compiler::utils::eBuiltinSubgroupAll ||
-      subgroupReduceKind == compiler::utils::eBuiltinSubgroupAny) {
-    v = B.CreateSExt(v, CI->getType());
-  }
-
   CI->replaceAllUsesWith(v);
 
   return v;
@@ -1252,7 +1171,12 @@ Value *Packetizer::Impl::packetizeSubgroupBroadcast(Instruction *I) {
   Function *callee = CI->getCalledFunction();
   auto const Builtin = BI.analyzeBuiltin(*callee);
 
-  if (!Builtin.isValid() || Builtin.ID != BI.getSubgroupBroadcastBuiltin()) {
+  if (auto Info = BI.isMuxGroupCollective(Builtin.ID)) {
+    if (Info->Scope != compiler::utils::GroupCollective::ScopeKind::SubGroup ||
+        Info->Op != compiler::utils::GroupCollective::OpKind::Broadcast) {
+      return nullptr;
+    }
+  } else {
     return nullptr;
   }
 
@@ -1542,13 +1466,14 @@ ValuePacket Packetizer::Impl::packetizeCall(CallInst *CI) {
   }
 
   auto const Builtin = Ctx.builtins().analyzeBuiltin(*Callee);
-  auto const subgroupScanKind =
-      Ctx.builtins().getBuiltinSubgroupScanKind(Builtin);
 
   // Handle subgroup scans, which defer to internal builtins.
-  if (Builtin.isValid() &&
-      subgroupScanKind != compiler::utils::eBuiltinSubgroupScanInvalid) {
-    return packetizeSubgroupScan(CI, subgroupScanKind);
+  if (auto Info = Ctx.builtins().isMuxGroupCollective(Builtin.ID)) {
+    if (Info->Scope == compiler::utils::GroupCollective::ScopeKind::SubGroup &&
+        (Info->Op == compiler::utils::GroupCollective::OpKind::ScanExclusive ||
+         Info->Op == compiler::utils::GroupCollective::OpKind::ScanInclusive)) {
+      return packetizeSubgroupScan(CI, *Info);
+    }
   }
 
   // Handle external builtins.
@@ -1651,7 +1576,7 @@ ValuePacket Packetizer::Impl::packetizeCall(CallInst *CI) {
 }
 
 ValuePacket Packetizer::Impl::packetizeSubgroupScan(
-    CallInst *CI, compiler::utils::BuiltinSubgroupScanKind subgroupScanKind) {
+    CallInst *CI, compiler::utils::GroupCollective Scan) {
   ValuePacket results;
 
   Function *callee = CI->getCalledFunction();
@@ -1659,101 +1584,60 @@ ValuePacket Packetizer::Impl::packetizeSubgroupScan(
     return results;
   }
 
-  const StringRef fnName = callee->getName();
   compiler::utils::NameMangler mangler(&CI->getContext());
 
   // The operands and types for the internal builtin
   SmallVector<Value *, 2> Ops = {packetize(CI->getArgOperand(0)).getAsValue()};
   SmallVector<Type *, 2> Tys = {getWideType(CI->getType(), SimdWidth)};
 
-  bool isInclusive = true;
+  bool isInclusive =
+      Scan.Op == compiler::utils::GroupCollective::OpKind::ScanInclusive;
   StringRef op = "add";
   // min/max scans are prefixed with s/u if they are signed/unsigned integer
   // operations. The value 'None' here represents an operation where the sign
   // of the operands is unimportant, such as floating-point operations, or
   // integer addition.
-  std::optional<bool> optIsSignedInt;
-  bool isInt = Tys[0]->isIntOrIntVectorTy();
-
-  // Determine whether this is a signed or unsigned integer min/max scan.
-  const auto isSignedArg0 = [isInt, fnName, &mangler]() -> std::optional<bool> {
-    if (!isInt) {
-      return std::nullopt;
-    }
-    // Demangle the function name to get the type qualifiers.
-    SmallVector<Type *, 2> types;
-    SmallVector<compiler::utils::TypeQualifiers, 2> quals;
-    if (mangler.demangleName(fnName, types, quals).empty()) {
-      return false;
-    }
-    assert(!quals.empty());
-    auto &qual = quals[0];
-    bool isSignedInt = false;
-    while (!isSignedInt && qual.getCount()) {
-      isSignedInt |= qual.pop_front() == compiler::utils::eTypeQualSignedInt;
-    }
-    return isSignedInt;
-  };
-
-  switch (subgroupScanKind) {
+  bool opIsSignedInt = false;
+
+  switch (Scan.Recurrence) {
     default:
       assert(false && "Impossible subgroup scan kind");
       return results;
-    case compiler::utils::eBuiltinSubgroupScanAddExcl:
-      isInclusive = false;
-      LLVM_FALLTHROUGH;
-    case compiler::utils::eBuiltinSubgroupScanAddIncl:
+    case llvm::RecurKind::Add:
+    case llvm::RecurKind::FAdd:
       op = "add";
       break;
-    case compiler::utils::eBuiltinSubgroupScanMinExcl:
-      isInclusive = false;
-      LLVM_FALLTHROUGH;
-    case compiler::utils::eBuiltinSubgroupScanMinIncl:
+    case llvm::RecurKind::SMin:
+      op = "smin";
+      opIsSignedInt = true;
+      break;
+    case llvm::RecurKind::UMin:
+      op = "umin";
+      break;
+    case llvm::RecurKind::FMin:
       op = "min";
-      optIsSignedInt = isSignedArg0();
       break;
-    case compiler::utils::eBuiltinSubgroupScanMaxExcl:
-      isInclusive = false;
-      LLVM_FALLTHROUGH;
-    case compiler::utils::eBuiltinSubgroupScanMaxIncl:
+    case llvm::RecurKind::SMax:
+      op = "smax";
+      opIsSignedInt = true;
+      break;
+    case llvm::RecurKind::UMax:
+      op = "umax";
+      break;
+    case llvm::RecurKind::FMax:
       op = "max";
-      optIsSignedInt = isSignedArg0();
       break;
-      /// Scans provided by SPV_KHR_uniform_group_instructions.
-    case compiler::utils::eBuiltinSubgroupScanMulExcl:
-      isInclusive = false;
-      LLVM_FALLTHROUGH;
-    case compiler::utils::eBuiltinSubgroupScanMulIncl:
+    case llvm::RecurKind::Mul:
+    case llvm::RecurKind::FMul:
       op = "mul";
       break;
-    case compiler::utils::eBuiltinSubgroupScanAndExcl:
-    case compiler::utils::eBuiltinSubgroupScanLogicalAndExcl:
-      isInclusive = false;
-      LLVM_FALLTHROUGH;
-    case compiler::utils::eBuiltinSubgroupScanAndIncl:
-    case compiler::utils::eBuiltinSubgroupScanLogicalAndIncl:
-      // Since we only support logical and on boolean types, we can re-use the
-      // regular bitwise and builtin.
+    case llvm::RecurKind::And:
       op = "and";
       break;
-    case compiler::utils::eBuiltinSubgroupScanOrExcl:
-    case compiler::utils::eBuiltinSubgroupScanLogicalOrExcl:
-      isInclusive = false;
-      LLVM_FALLTHROUGH;
-    case compiler::utils::eBuiltinSubgroupScanOrIncl:
-    case compiler::utils::eBuiltinSubgroupScanLogicalOrIncl:
-      // Since we only support logical or on boolean types, we can re-use the
-      // regular bitwise or builtin.
+    case llvm::RecurKind::Or:
       op = "or";
       break;
-    case compiler::utils::eBuiltinSubgroupScanXorExcl:
-    case compiler::utils::eBuiltinSubgroupScanLogicalXorExcl:
-      isInclusive = false;
-      LLVM_FALLTHROUGH;
-    case compiler::utils::eBuiltinSubgroupScanXorIncl:
-    case compiler::utils::eBuiltinSubgroupScanLogicalXorIncl:
-      // Since we only support logical xor on boolean types, we can re-use the
-      // regular bitwise xor builtin.
+    case llvm::RecurKind::Xor:
       op = "xor";
       break;
   }
@@ -1767,12 +1651,11 @@ ValuePacket Packetizer::Impl::packetizeSubgroupScan(
   bool const VP = VL && SimdWidth.isScalable();
 
   O << VectorizationContext::InternalBuiltinPrefix << "sub_group_scan_"
-    << (isInclusive ? "inclusive" : "exclusive") << "_"
-    << (optIsSignedInt.has_value() ? (*optIsSignedInt ? "s" : "u") : "") << op
+    << (isInclusive ? "inclusive" : "exclusive") << "_" << op
     << (VP ? "_vp" : "") << "_";
 
   compiler::utils::TypeQualifiers VecQuals(
-      compiler::utils::eTypeQualNone, optIsSignedInt == true
+      compiler::utils::eTypeQualNone, opIsSignedInt
                                           ? compiler::utils::eTypeQualSignedInt
                                           : compiler::utils::eTypeQualNone);
   if (!mangler.mangleType(O, Tys[0], VecQuals)) {
@@ -2622,8 +2505,7 @@ Value *Packetizer::Impl::vectorizeCall(CallInst *CI) {
   if (Builtin.properties & compiler::utils::eBuiltinPropertyWorkItem) {
     // The subgroup ID is just a simple index sequence. There is no dimension
     // to it, and we only support 1D workgroups.
-    if (Builtin.isValid() &&
-        Builtin.ID == Ctx.builtins().getSubgroupLocalIdBuiltin()) {
+    if (Builtin.ID == compiler::utils::eMuxBuiltinGetSubGroupLocalId) {
       IRBuilder<> B(buildAfter(CI, F));
       return multi_llvm::createIndexSequence(
           B, VectorType::get(CI->getType(), SimdWidth), SimdWidth,
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll
index 81977004bfa90..1f14ab3d42c23 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll
@@ -20,15 +20,15 @@
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-declare spir_func i32 @_Z16get_sub_group_idv()
-declare spir_func i32 @_Z18get_sub_group_sizev()
-declare spir_func i32 @_Z22get_sub_group_local_idv()
-declare spir_func i32 @_Z19sub_group_broadcastij(i32, i32)
+declare spir_func i32 @__mux_get_sub_group_id()
+declare spir_func i32 @__mux_get_sub_group_size()
+declare spir_func i32 @__mux_get_sub_group_local_id()
+declare spir_func i32 @__mux_sub_group_broadcast_i32(i32, i32)
 
 define spir_kernel void @get_sub_group_size(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
-  %call.i = tail call spir_func i32 @_Z16get_sub_group_idv()
+  %call.i = tail call spir_func i32 @__mux_get_sub_group_id()
   %conv = zext i32 %call.i to i64
-  %call2 = tail call spir_func i32 @_Z18get_sub_group_sizev()
+  %call2 = tail call spir_func i32 @__mux_get_sub_group_size()
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv
   store i32 %call2, i32 addrspace(1)* %arrayidx, align 4
   ret void
@@ -39,7 +39,7 @@ define spir_kernel void @get_sub_group_size(i32 addrspace(1)* %in, i32 addrspace
 }
 
 define spir_kernel void @get_sub_group_local_id(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
-  %call = tail call spir_func i32 @_Z22get_sub_group_local_idv()
+  %call = tail call spir_func i32 @__mux_get_sub_group_local_id()
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %call
   store i32 %call, i32 addrspace(1)* %arrayidx, align 4
   ret void
@@ -49,10 +49,10 @@ define spir_kernel void @get_sub_group_local_id(i32 addrspace(1)* %in, i32 addrs
 }
 
 define spir_kernel void @sub_group_broadcast(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
-  %call = tail call spir_func i32 @_Z22get_sub_group_local_idv()
+  %call = tail call spir_func i32 @__mux_get_sub_group_local_id()
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %call
   %v = load i32, i32 addrspace(1)* %arrayidx, align 4
-  %broadcast = call spir_func i32 @_Z19sub_group_broadcastij(i32 %v, i32 0)
+  %broadcast = call spir_func i32 @__mux_sub_group_broadcast_i32(i32 %v, i32 0)
   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %call
   store i32 %broadcast, i32 addrspace(1)* %arrayidx2, align 4
   ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans.ll
index 26eb222c2ec2c..c6c8a4cf5ce47 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans.ll
@@ -19,25 +19,25 @@
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func i64 @__mux_get_global_id(i32)
 
-declare spir_func i32 @_Z28sub_group_scan_inclusive_addi(i32)
-declare spir_func i64 @_Z28sub_group_scan_inclusive_addl(i64)
-declare spir_func float @_Z28sub_group_scan_inclusive_addf(float)
+declare spir_func i32 @__mux_sub_group_scan_inclusive_add_i32(i32)
+declare spir_func i64 @__mux_sub_group_scan_inclusive_add_i64(i64)
+declare spir_func float @__mux_sub_group_scan_inclusive_fadd_f32(float)
 
-declare spir_func i32 @_Z28sub_group_scan_inclusive_mini(i32)
-declare spir_func i32 @_Z28sub_group_scan_inclusive_minj(i32)
-declare spir_func i32 @_Z28sub_group_scan_inclusive_maxi(i32)
-declare spir_func i32 @_Z28sub_group_scan_inclusive_maxj(i32)
-declare spir_func float @_Z28sub_group_scan_inclusive_minf(float)
-declare spir_func float @_Z28sub_group_scan_inclusive_maxf(float)
+declare spir_func i32 @__mux_sub_group_scan_inclusive_smin_i32(i32)
+declare spir_func i32 @__mux_sub_group_scan_inclusive_umin_i32(i32)
+declare spir_func i32 @__mux_sub_group_scan_inclusive_smax_i32(i32)
+declare spir_func i32 @__mux_sub_group_scan_inclusive_umax_i32(i32)
+declare spir_func float @__mux_sub_group_scan_inclusive_fmin_f32(float)
+declare spir_func float @__mux_sub_group_scan_inclusive_fmax_f32(float)
 
 define spir_kernel void @reduce_scan_incl_add_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
   %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_addi(i32 %0)
+  %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_add_i32(i32 %0)
   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
   store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
   ret void
@@ -47,10 +47,10 @@ entry:
 
 define spir_kernel void @reduce_scan_incl_add_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i64, i64 addrspace(1)* %in, i64 %call
   %0 = load i64, i64 addrspace(1)* %arrayidx, align 4
-  %call1 = tail call spir_func i64 @_Z28sub_group_scan_inclusive_addl(i64 %0)
+  %call1 = tail call spir_func i64 @__mux_sub_group_scan_inclusive_add_i64(i64 %0)
   %arrayidx2 = getelementptr inbounds i64, i64 addrspace(1)* %out, i64 %call
   store i64 %call1, i64 addrspace(1)* %arrayidx2, align 4
   ret void
@@ -60,10 +60,10 @@ entry:
 
 define spir_kernel void @reduce_scan_incl_add_f32(float addrspace(1)* %in, float addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
-  %call1 = tail call spir_func float @_Z28sub_group_scan_inclusive_addf(float %0)
+  %call1 = tail call spir_func float @__mux_sub_group_scan_inclusive_fadd_f32(float %0)
   %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call
   store float %call1, float addrspace(1)* %arrayidx2, align 4
   ret void
@@ -73,10 +73,10 @@ entry:
 
 define spir_kernel void @reduce_scan_incl_smin_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
   %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_mini(i32 %0)
+  %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_smin_i32(i32 %0)
   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
   store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
   ret void
@@ -86,10 +86,10 @@ entry:
 
 define spir_kernel void @reduce_scan_incl_umin_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
   %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_minj(i32 %0)
+  %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_umin_i32(i32 %0)
   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
   store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
   ret void
@@ -99,10 +99,10 @@ entry:
 
 define spir_kernel void @reduce_scan_incl_smax_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
   %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_maxi(i32 %0)
+  %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_smax_i32(i32 %0)
   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
   store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
   ret void
@@ -112,10 +112,10 @@ entry:
 
 define spir_kernel void @reduce_scan_incl_umax_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
   %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_maxj(i32 %0)
+  %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_umax_i32(i32 %0)
   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
   store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
   ret void
@@ -125,10 +125,10 @@ entry:
 
 define spir_kernel void @reduce_scan_incl_fmin_f32(float addrspace(1)* %in, float addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
-  %call1 = tail call spir_func float @_Z28sub_group_scan_inclusive_minf(float %0)
+  %call1 = tail call spir_func float @__mux_sub_group_scan_inclusive_fmin_f32(float %0)
   %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call
   store float %call1, float addrspace(1)* %arrayidx2, align 4
   ret void
@@ -138,17 +138,13 @@ entry:
 
 define spir_kernel void @reduce_scan_incl_fmax_f32(float addrspace(1)* %in, float addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
-  %call1 = tail call spir_func float @_Z28sub_group_scan_inclusive_maxf(float %0)
+  %call1 = tail call spir_func float @__mux_sub_group_scan_inclusive_fmax_f32(float %0)
   %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call
   store float %call1, float addrspace(1)* %arrayidx2, align 4
   ret void
 ; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_fmax_f32(
 ; CHECK: call <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_max_u5nxv4f(<vscale x 4 x float> %{{.*}})
 }
-
-!opencl.ocl.version = !{!0}
-
-!0 = !{i32 3, i32 0}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_spv_khr_uniform_group_instructions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_spv_khr_uniform_group_instructions.ll
index 6b164057d1737..f51a13bfd957c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_spv_khr_uniform_group_instructions.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_spv_khr_uniform_group_instructions.ll
@@ -19,29 +19,29 @@
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func i64 @__mux_get_global_id(i32)
 
-declare spir_func i32 @_Z28sub_group_scan_inclusive_muli(i32)
-declare spir_func float @_Z28sub_group_scan_inclusive_mulf(float)
+declare spir_func i32 @__mux_sub_group_scan_inclusive_mul_i32(i32)
+declare spir_func float @__mux_sub_group_scan_inclusive_fmul_f32(float)
 
-declare spir_func i32 @_Z28sub_group_scan_exclusive_muli(i32)
-declare spir_func float @_Z28sub_group_scan_exclusive_mulf(float)
+declare spir_func i32 @__mux_sub_group_scan_exclusive_mul_i32(i32)
+declare spir_func float @__mux_sub_group_scan_exclusive_fmul_f32(float)
 
-declare spir_func i32 @_Z28sub_group_scan_inclusive_andi(i32)
-declare spir_func i32 @_Z27sub_group_scan_inclusive_ori(i32)
-declare spir_func i32 @_Z28sub_group_scan_inclusive_xori(i32)
-declare spir_func i1 @_Z36sub_group_scan_inclusive_logical_andb(i1)
-declare spir_func i1 @_Z35sub_group_scan_inclusive_logical_orb(i1)
-declare spir_func i1 @_Z36sub_group_scan_inclusive_logical_xorb(i1)
+declare spir_func i32 @__mux_sub_group_scan_inclusive_and_i32(i32)
+declare spir_func i32 @__mux_sub_group_scan_inclusive_or_i32(i32)
+declare spir_func i32 @__mux_sub_group_scan_inclusive_xor_i32(i32)
+declare spir_func i1 @__mux_sub_group_scan_inclusive_logical_and_i1(i1)
+declare spir_func i1 @__mux_sub_group_scan_inclusive_logical_or_i1(i1)
+declare spir_func i1 @__mux_sub_group_scan_inclusive_logical_xor_i1(i1)
 
 ; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_mul_i32(
 ; CHECK: call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_mul_u5nxv4j(<vscale x 4 x i32> %{{.*}})
 define spir_kernel void @reduce_scan_incl_mul_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
   %0 = load i32, ptr addrspace(1) %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_muli(i32 %0)
+  %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_mul_i32(i32 %0)
   %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
   store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
   ret void
@@ -51,10 +51,10 @@ entry:
 ; CHECK: call <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_mul_u5nxv4j(<vscale x 4 x i32> %{{.*}})
 define spir_kernel void @reduce_scan_excl_mul_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
   %0 = load i32, ptr addrspace(1) %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z28sub_group_scan_exclusive_muli(i32 %0)
+  %call1 = tail call spir_func i32 @__mux_sub_group_scan_exclusive_mul_i32(i32 %0)
   %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
   store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
   ret void
@@ -64,10 +64,10 @@ entry:
 ; CHECK: call <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_mul_u5nxv4f(<vscale x 4 x float> %{{.*}})
 define spir_kernel void @reduce_scan_incl_mul_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, ptr addrspace(1) %in, i64 %call
   %0 = load float, ptr addrspace(1) %arrayidx, align 4
-  %call1 = tail call spir_func float @_Z28sub_group_scan_inclusive_mulf(float %0)
+  %call1 = tail call spir_func float @__mux_sub_group_scan_inclusive_fmul_f32(float %0)
   %arrayidx2 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %call
   store float %call1, ptr addrspace(1) %arrayidx2, align 4
   ret void
@@ -77,10 +77,10 @@ entry:
 ; CHECK: call <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_mul_u5nxv4f(<vscale x 4 x float> %{{.*}})
 define spir_kernel void @reduce_scan_excl_mul_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, ptr addrspace(1) %in, i64 %call
   %0 = load float, ptr addrspace(1) %arrayidx, align 4
-  %call1 = tail call spir_func float @_Z28sub_group_scan_exclusive_mulf(float %0)
+  %call1 = tail call spir_func float @__mux_sub_group_scan_exclusive_fmul_f32(float %0)
   %arrayidx2 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %call
   store float %call1, ptr addrspace(1) %arrayidx2, align 4
   ret void
@@ -90,10 +90,10 @@ entry:
 ; CHECK: call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_and_u5nxv4j(<vscale x 4 x i32> %{{.*}})
 define spir_kernel void @reduce_scan_incl_and_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
   %0 = load i32, ptr addrspace(1) %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_andi(i32 %0)
+  %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_and_i32(i32 %0)
   %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
   store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
   ret void
@@ -103,10 +103,10 @@ entry:
 ; CHECK: call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_or_u5nxv4j(<vscale x 4 x i32> %{{.*}})
 define spir_kernel void @reduce_scan_incl_or_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
   %0 = load i32, ptr addrspace(1) %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z27sub_group_scan_inclusive_ori(i32 %0)
+  %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_or_i32(i32 %0)
   %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
   store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
   ret void
@@ -116,10 +116,10 @@ entry:
 ; CHECK: call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_xor_u5nxv4j(<vscale x 4 x i32> %{{.*}})
 define spir_kernel void @reduce_scan_incl_xor_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
   %0 = load i32, ptr addrspace(1) %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_xori(i32 %0)
+  %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_xor_i32(i32 %0)
   %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
   store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
   ret void
@@ -129,11 +129,11 @@ entry:
 ; CHECK: call <vscale x 4 x i1> @__vecz_b_sub_group_scan_inclusive_and_u5nxv4b(<vscale x 4 x i1> %{{.*}})
 define spir_kernel void @reduce_scan_incl_logical_and(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
   %0 = load i32, ptr addrspace(1) %arrayidx, align 4
   %1 = trunc i32 %0 to i1
-  %call1 = tail call spir_func i1 @_Z36sub_group_scan_inclusive_logical_andb(i1 %1)
+  %call1 = tail call spir_func i1 @__mux_sub_group_scan_inclusive_logical_and_i1(i1 %1)
   %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
   %2 = zext i1 %call1 to i32
   store i32 %2, ptr addrspace(1) %arrayidx2, align 4
@@ -144,11 +144,11 @@ entry:
 ; CHECK: call <vscale x 4 x i1> @__vecz_b_sub_group_scan_inclusive_or_u5nxv4b(<vscale x 4 x i1> %{{.*}})
 define spir_kernel void @reduce_scan_incl_logical_or(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
   %0 = load i32, ptr addrspace(1) %arrayidx, align 4
   %1 = trunc i32 %0 to i1
-  %call1 = tail call spir_func i1 @_Z35sub_group_scan_inclusive_logical_orb(i1 %1)
+  %call1 = tail call spir_func i1 @__mux_sub_group_scan_inclusive_logical_or_i1(i1 %1)
   %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
   %2 = zext i1 %call1 to i32
   store i32 %2, ptr addrspace(1) %arrayidx2, align 4
@@ -159,17 +159,13 @@ entry:
 ; CHECK: call <vscale x 4 x i1> @__vecz_b_sub_group_scan_inclusive_xor_u5nxv4b(<vscale x 4 x i1> %{{.*}})
 define spir_kernel void @reduce_scan_incl_logical_xor(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
   %0 = load i32, ptr addrspace(1) %arrayidx, align 4
   %1 = trunc i32 %0 to i1
-  %call1 = tail call spir_func i1 @_Z36sub_group_scan_inclusive_logical_xorb(i1 %1)
+  %call1 = tail call spir_func i1 @__mux_sub_group_scan_inclusive_logical_xor_i1(i1 %1)
   %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
   %2 = zext i1 %call1 to i32
   store i32 %2, ptr addrspace(1) %arrayidx2, align 4
   ret void
 }
-
-!opencl.ocl.version = !{!0}
-
-!0 = !{i32 3, i32 0}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_spv_khr_uniform_group_instructions_vp.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_spv_khr_uniform_group_instructions_vp.ll
index 90ef5f32c81ab..455df1ab0be8a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_spv_khr_uniform_group_instructions_vp.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_spv_khr_uniform_group_instructions_vp.ll
@@ -19,29 +19,29 @@
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func i64 @__mux_get_global_id(i32)
 
-declare spir_func i32 @_Z28sub_group_scan_inclusive_muli(i32)
-declare spir_func float @_Z28sub_group_scan_inclusive_mulf(float)
+declare spir_func i32 @__mux_sub_group_scan_inclusive_mul_i32(i32)
+declare spir_func float @__mux_sub_group_scan_inclusive_fmul_f32(float)
 
-declare spir_func i32 @_Z28sub_group_scan_exclusive_muli(i32)
-declare spir_func float @_Z28sub_group_scan_exclusive_mulf(float)
+declare spir_func i32 @__mux_sub_group_scan_exclusive_mul_i32(i32)
+declare spir_func float @__mux_sub_group_scan_exclusive_fmul_f32(float)
 
-declare spir_func i32 @_Z28sub_group_scan_inclusive_andi(i32)
-declare spir_func i32 @_Z27sub_group_scan_inclusive_ori(i32)
-declare spir_func i32 @_Z28sub_group_scan_inclusive_xori(i32)
-declare spir_func i1 @_Z36sub_group_scan_inclusive_logical_andb(i1)
-declare spir_func i1 @_Z35sub_group_scan_inclusive_logical_orb(i1)
-declare spir_func i1 @_Z36sub_group_scan_inclusive_logical_xorb(i1)
+declare spir_func i32 @__mux_sub_group_scan_inclusive_and_i32(i32)
+declare spir_func i32 @__mux_sub_group_scan_inclusive_or_i32(i32)
+declare spir_func i32 @__mux_sub_group_scan_inclusive_xor_i32(i32)
+declare spir_func i1 @__mux_sub_group_scan_inclusive_logical_and_i1(i1)
+declare spir_func i1 @__mux_sub_group_scan_inclusive_logical_or_i1(i1)
+declare spir_func i1 @__mux_sub_group_scan_inclusive_logical_xor_i1(i1)
 
 ; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_incl_mul_i32(
 ; CHECK: call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_mul_vp_u5nxv4jj(<vscale x 4 x i32> %{{.*}}, i32 %{{.*}})
 define spir_kernel void @reduce_scan_incl_mul_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
   %0 = load i32, ptr addrspace(1) %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_muli(i32 %0)
+  %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_mul_i32(i32 %0)
   %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
   store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
   ret void
@@ -51,10 +51,10 @@ entry:
 ; CHECK: call <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_mul_vp_u5nxv4jj(<vscale x 4 x i32> %{{.*}}, i32 %{{.*}})
 define spir_kernel void @reduce_scan_excl_mul_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
   %0 = load i32, ptr addrspace(1) %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z28sub_group_scan_exclusive_muli(i32 %0)
+  %call1 = tail call spir_func i32 @__mux_sub_group_scan_exclusive_mul_i32(i32 %0)
   %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
   store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
   ret void
@@ -64,10 +64,10 @@ entry:
 ; CHECK: call <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_mul_vp_u5nxv4fj(<vscale x 4 x float> %{{.*}}, i32 %{{.*}})
 define spir_kernel void @reduce_scan_incl_mul_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, ptr addrspace(1) %in, i64 %call
   %0 = load float, ptr addrspace(1) %arrayidx, align 4
-  %call1 = tail call spir_func float @_Z28sub_group_scan_inclusive_mulf(float %0)
+  %call1 = tail call spir_func float @__mux_sub_group_scan_inclusive_fmul_f32(float %0)
   %arrayidx2 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %call
   store float %call1, ptr addrspace(1) %arrayidx2, align 4
   ret void
@@ -77,10 +77,10 @@ entry:
 ; CHECK: call <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_mul_vp_u5nxv4fj(<vscale x 4 x float> %{{.*}}, i32 %{{.*}})
 define spir_kernel void @reduce_scan_excl_mul_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, ptr addrspace(1) %in, i64 %call
   %0 = load float, ptr addrspace(1) %arrayidx, align 4
-  %call1 = tail call spir_func float @_Z28sub_group_scan_exclusive_mulf(float %0)
+  %call1 = tail call spir_func float @__mux_sub_group_scan_exclusive_fmul_f32(float %0)
   %arrayidx2 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %call
   store float %call1, ptr addrspace(1) %arrayidx2, align 4
   ret void
@@ -90,10 +90,10 @@ entry:
 ; CHECK: call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_and_vp_u5nxv4jj(<vscale x 4 x i32> %{{.*}}, i32 %{{.*}})
 define spir_kernel void @reduce_scan_incl_and_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
   %0 = load i32, ptr addrspace(1) %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_andi(i32 %0)
+  %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_and_i32(i32 %0)
   %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
   store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
   ret void
@@ -103,10 +103,10 @@ entry:
 ; CHECK: call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_or_vp_u5nxv4jj(<vscale x 4 x i32> %{{.*}}, i32 %{{.*}})
 define spir_kernel void @reduce_scan_incl_or_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
   %0 = load i32, ptr addrspace(1) %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z27sub_group_scan_inclusive_ori(i32 %0)
+  %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_or_i32(i32 %0)
   %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
   store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
   ret void
@@ -116,10 +116,10 @@ entry:
 ; CHECK: call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_xor_vp_u5nxv4jj(<vscale x 4 x i32> %{{.*}}, i32 %{{.*}})
 define spir_kernel void @reduce_scan_incl_xor_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
   %0 = load i32, ptr addrspace(1) %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_xori(i32 %0)
+  %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_xor_i32(i32 %0)
   %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
   store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
   ret void
@@ -129,11 +129,11 @@ entry:
 ; CHECK: call <vscale x 4 x i1> @__vecz_b_sub_group_scan_inclusive_and_vp_u5nxv4bj(<vscale x 4 x i1> %{{.*}}, i32 %{{.*}})
 define spir_kernel void @reduce_scan_incl_logical_and(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
   %0 = load i32, ptr addrspace(1) %arrayidx, align 4
   %1 = trunc i32 %0 to i1
-  %call1 = tail call spir_func i1 @_Z36sub_group_scan_inclusive_logical_andb(i1 %1)
+  %call1 = tail call spir_func i1 @__mux_sub_group_scan_inclusive_logical_and_i1(i1 %1)
   %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
   %2 = zext i1 %call1 to i32
   store i32 %2, ptr addrspace(1) %arrayidx2, align 4
@@ -144,11 +144,11 @@ entry:
 ; CHECK: call <vscale x 4 x i1> @__vecz_b_sub_group_scan_inclusive_or_vp_u5nxv4bj(<vscale x 4 x i1> %{{.*}}, i32 %{{.*}})
 define spir_kernel void @reduce_scan_incl_logical_or(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
   %0 = load i32, ptr addrspace(1) %arrayidx, align 4
   %1 = trunc i32 %0 to i1
-  %call1 = tail call spir_func i1 @_Z35sub_group_scan_inclusive_logical_orb(i1 %1)
+  %call1 = tail call spir_func i1 @__mux_sub_group_scan_inclusive_logical_or_i1(i1 %1)
   %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
   %2 = zext i1 %call1 to i32
   store i32 %2, ptr addrspace(1) %arrayidx2, align 4
@@ -159,17 +159,13 @@ entry:
 ; CHECK: call <vscale x 4 x i1> @__vecz_b_sub_group_scan_inclusive_xor_vp_u5nxv4bj(<vscale x 4 x i1> %{{.*}}, i32 %{{.*}})
 define spir_kernel void @reduce_scan_incl_logical_xor(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
   %0 = load i32, ptr addrspace(1) %arrayidx, align 4
   %1 = trunc i32 %0 to i1
-  %call1 = tail call spir_func i1 @_Z36sub_group_scan_inclusive_logical_xorb(i1 %1)
+  %call1 = tail call spir_func i1 @__mux_sub_group_scan_inclusive_logical_xor_i1(i1 %1)
   %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
   %2 = zext i1 %call1 to i32
   store i32 %2, ptr addrspace(1) %arrayidx2, align 4
   ret void
 }
-
-!opencl.ocl.version = !{!0}
-
-!0 = !{i32 3, i32 0}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_vp.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_vp.ll
index c2aaf24b9904b..009ed6abda0f8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_vp.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_vp.ll
@@ -19,25 +19,25 @@
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func i64 @__mux_get_global_id(i32)
 
-declare spir_func i32 @_Z28sub_group_scan_inclusive_addi(i32)
-declare spir_func i64 @_Z28sub_group_scan_inclusive_addl(i64)
-declare spir_func float @_Z28sub_group_scan_inclusive_addf(float)
+declare spir_func i32 @__mux_sub_group_scan_inclusive_add_i32(i32)
+declare spir_func i64 @__mux_sub_group_scan_inclusive_add_i64(i64)
+declare spir_func float @__mux_sub_group_scan_inclusive_fadd_f32(float)
 
-declare spir_func i32 @_Z28sub_group_scan_inclusive_mini(i32)
-declare spir_func i32 @_Z28sub_group_scan_inclusive_minj(i32)
-declare spir_func i32 @_Z28sub_group_scan_inclusive_maxi(i32)
-declare spir_func i32 @_Z28sub_group_scan_inclusive_maxj(i32)
-declare spir_func float @_Z28sub_group_scan_inclusive_minf(float)
-declare spir_func float @_Z28sub_group_scan_inclusive_maxf(float)
+declare spir_func i32 @__mux_sub_group_scan_inclusive_smin_i32(i32)
+declare spir_func i32 @__mux_sub_group_scan_inclusive_umin_i32(i32)
+declare spir_func i32 @__mux_sub_group_scan_inclusive_smax_i32(i32)
+declare spir_func i32 @__mux_sub_group_scan_inclusive_umax_i32(i32)
+declare spir_func float @__mux_sub_group_scan_inclusive_fmin_f32(float)
+declare spir_func float @__mux_sub_group_scan_inclusive_fmax_f32(float)
 
 define spir_kernel void @reduce_scan_incl_add_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
   %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_addi(i32 %0)
+  %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_add_i32(i32 %0)
   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
   store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
   ret void
@@ -47,10 +47,10 @@ entry:
 
 define spir_kernel void @reduce_scan_incl_add_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i64, i64 addrspace(1)* %in, i64 %call
   %0 = load i64, i64 addrspace(1)* %arrayidx, align 4
-  %call1 = tail call spir_func i64 @_Z28sub_group_scan_inclusive_addl(i64 %0)
+  %call1 = tail call spir_func i64 @__mux_sub_group_scan_inclusive_add_i64(i64 %0)
   %arrayidx2 = getelementptr inbounds i64, i64 addrspace(1)* %out, i64 %call
   store i64 %call1, i64 addrspace(1)* %arrayidx2, align 4
   ret void
@@ -60,10 +60,10 @@ entry:
 
 define spir_kernel void @reduce_scan_incl_add_f32(float addrspace(1)* %in, float addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
-  %call1 = tail call spir_func float @_Z28sub_group_scan_inclusive_addf(float %0)
+  %call1 = tail call spir_func float @__mux_sub_group_scan_inclusive_fadd_f32(float %0)
   %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call
   store float %call1, float addrspace(1)* %arrayidx2, align 4
   ret void
@@ -73,10 +73,10 @@ entry:
 
 define spir_kernel void @reduce_scan_incl_smin_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
   %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_mini(i32 %0)
+  %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_smin_i32(i32 %0)
   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
   store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
   ret void
@@ -86,10 +86,10 @@ entry:
 
 define spir_kernel void @reduce_scan_incl_umin_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
   %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_minj(i32 %0)
+  %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_umin_i32(i32 %0)
   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
   store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
   ret void
@@ -99,10 +99,10 @@ entry:
 
 define spir_kernel void @reduce_scan_incl_smax_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
   %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_maxi(i32 %0)
+  %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_smax_i32(i32 %0)
   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
   store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
   ret void
@@ -112,10 +112,10 @@ entry:
 
 define spir_kernel void @reduce_scan_incl_umax_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
   %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_maxj(i32 %0)
+  %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_umax_i32(i32 %0)
   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
   store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
   ret void
@@ -125,10 +125,10 @@ entry:
 
 define spir_kernel void @reduce_scan_incl_fmin_f32(float addrspace(1)* %in, float addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
-  %call1 = tail call spir_func float @_Z28sub_group_scan_inclusive_minf(float %0)
+  %call1 = tail call spir_func float @__mux_sub_group_scan_inclusive_fmin_f32(float %0)
   %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call
   store float %call1, float addrspace(1)* %arrayidx2, align 4
   ret void
@@ -138,17 +138,13 @@ entry:
 
 define spir_kernel void @reduce_scan_incl_fmax_f32(float addrspace(1)* %in, float addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
-  %call1 = tail call spir_func float @_Z28sub_group_scan_inclusive_maxf(float %0)
+  %call1 = tail call spir_func float @__mux_sub_group_scan_inclusive_fmax_f32(float %0)
   %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call
   store float %call1, float addrspace(1)* %arrayidx2, align 4
   ret void
 ; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_incl_fmax_f32(
 ; CHECK: call <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_max_vp_u5nxv4fj(<vscale x 4 x float> %{{.*}}, i32 %{{.+}})
 }
-
-!opencl.ocl.version = !{!0}
-
-!0 = !{i32 3, i32 0}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/compute_vector_length.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/compute_vector_length.ll
index dc11d9069c739..92b783c208490 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/compute_vector_length.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/compute_vector_length.ll
@@ -21,13 +21,13 @@
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-declare spir_func i32 @_Z16get_sub_group_idv()
-declare spir_func i32 @_Z18get_sub_group_sizev()
+declare spir_func i32 @__mux_get_sub_group_id()
+declare spir_func i32 @__mux_get_sub_group_size()
 
 define spir_kernel void @get_sub_group_size(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
-  %call.i = tail call spir_func i32 @_Z16get_sub_group_idv()
+  %call.i = tail call spir_func i32 @__mux_get_sub_group_id()
   %conv = zext i32 %call.i to i64
-  %call2 = tail call spir_func i32 @_Z18get_sub_group_sizev()
+  %call2 = tail call spir_func i32 @__mux_get_sub_group_size()
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv
   store i32 %call2, i32 addrspace(1)* %arrayidx, align 4
   ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions.ll
index 2ce2482d46b8d..7cc1b0ec0c5c9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions.ll
@@ -19,32 +19,34 @@
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
-declare spir_func i32 @_Z16get_sub_group_idv()
+declare spir_func i64 @__mux_get_global_id(i32)
+declare spir_func i32 @__mux_get_sub_group_id()
 
-declare spir_func i32 @_Z13sub_group_alli(i32)
-declare spir_func i32 @_Z13sub_group_anyi(i32)
+declare spir_func i1 @__mux_sub_group_all_i1(i1)
+declare spir_func i1 @__mux_sub_group_any_i1(i1)
 
-declare spir_func i32 @_Z20sub_group_reduce_addi(i32)
-declare spir_func i64 @_Z20sub_group_reduce_addl(i64)
-declare spir_func float @_Z20sub_group_reduce_addf(float)
-declare spir_func i32 @_Z20sub_group_reduce_mini(i32)
-declare spir_func i32 @_Z20sub_group_reduce_minj(i32)
-declare spir_func i32 @_Z20sub_group_reduce_maxi(i32)
-declare spir_func i32 @_Z20sub_group_reduce_maxj(i32)
-declare spir_func float @_Z20sub_group_reduce_minf(float)
-declare spir_func float @_Z20sub_group_reduce_maxf(float)
+declare spir_func i32 @__mux_sub_group_reduce_add_i32(i32)
+declare spir_func i64 @__mux_sub_group_reduce_add_i64(i64)
+declare spir_func float @__mux_sub_group_reduce_fadd_f32(float)
+declare spir_func i32 @__mux_sub_group_reduce_smin_i32(i32)
+declare spir_func i32 @__mux_sub_group_reduce_umin_i32(i32)
+declare spir_func i32 @__mux_sub_group_reduce_smax_i32(i32)
+declare spir_func i32 @__mux_sub_group_reduce_umax_i32(i32)
+declare spir_func float @__mux_sub_group_reduce_fmin_f32(float)
+declare spir_func float @__mux_sub_group_reduce_fmax_f32(float)
 
 define spir_kernel void @reduce_all_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
-  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+  %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6
   %conv = zext i32 %call1 to i64
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
   %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
-  %call2 = tail call spir_func i32 @_Z13sub_group_alli(i32 %0)
+  %1 = icmp ne i32 %0, 0
+  %call2 = tail call spir_func i1 @__mux_sub_group_all_i1(i1 %1)
+  %2 = sext i1 %call2 to i32
   %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv
-  store i32 %call2, i32 addrspace(1)* %arrayidx3, align 4
+  store i32 %2, i32 addrspace(1)* %arrayidx3, align 4
   ret void
 ; CHECK-LABEL: @__vecz_v4_vp_reduce_all_i32(
 ; CHECK: [[T2:%.*]] = icmp ne <4 x i32> {{%.*}}, zeroinitializer
@@ -60,14 +62,16 @@ entry:
 
 define spir_kernel void @reduce_any_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
-  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+  %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6
   %conv = zext i32 %call1 to i64
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
   %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
-  %call2 = tail call spir_func i32 @_Z13sub_group_anyi(i32 %0)
+  %1 = icmp ne i32 %0, 0
+  %call2 = tail call spir_func i1 @__mux_sub_group_any_i1(i1 %1)
+  %2 = sext i1 %call2 to i32
   %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv
-  store i32 %call2, i32 addrspace(1)* %arrayidx3, align 4
+  store i32 %2, i32 addrspace(1)* %arrayidx3, align 4
   ret void
 ; CHECK-LABEL: @__vecz_v4_vp_reduce_any_i32(
 ; CHECK: [[T2:%.*]] = icmp ne <4 x i32> {{%.*}}, zeroinitializer
@@ -83,12 +87,12 @@ entry:
 
 define spir_kernel void @reduce_add_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
-  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+  %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6
   %conv = zext i32 %call1 to i64
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
   %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
-  %call2 = tail call spir_func i32 @_Z20sub_group_reduce_addi(i32 %0)
+  %call2 = tail call spir_func i32 @__mux_sub_group_reduce_add_i32(i32 %0)
   %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv
   store i32 %call2, i32 addrspace(1)* %arrayidx3, align 4
   ret void
@@ -103,12 +107,12 @@ entry:
 
 define spir_kernel void @reduce_add_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
-  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+  %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6
   %conv = zext i32 %call1 to i64
   %arrayidx = getelementptr inbounds i64, i64 addrspace(1)* %in, i64 %call
   %0 = load i64, i64 addrspace(1)* %arrayidx, align 4
-  %call2 = tail call spir_func i64 @_Z20sub_group_reduce_addl(i64 %0)
+  %call2 = tail call spir_func i64 @__mux_sub_group_reduce_add_i64(i64 %0)
   %arrayidx3 = getelementptr inbounds i64, i64 addrspace(1)* %out, i64 %conv
   store i64 %call2, i64 addrspace(1)* %arrayidx3, align 4
   ret void
@@ -123,12 +127,12 @@ entry:
 
 define spir_kernel void @reduce_add_f32(float addrspace(1)* %in, float addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
-  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+  %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6
   %conv = zext i32 %call1 to i64
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
-  %call2 = tail call spir_func float @_Z20sub_group_reduce_addf(float %0)
+  %call2 = tail call spir_func float @__mux_sub_group_reduce_fadd_f32(float %0)
   %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %out, i64 %conv
   store float %call2, float addrspace(1)* %arrayidx3, align 4
   ret void
@@ -143,12 +147,12 @@ entry:
 
 define spir_kernel void @reduce_smin_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
-  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+  %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6
   %conv = zext i32 %call1 to i64
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
   %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
-  %call2 = tail call spir_func i32 @_Z20sub_group_reduce_mini(i32 %0)
+  %call2 = tail call spir_func i32 @__mux_sub_group_reduce_smin_i32(i32 %0)
   %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv
   store i32 %call2, i32 addrspace(1)* %arrayidx3, align 4
   ret void
@@ -163,12 +167,12 @@ entry:
 
 define spir_kernel void @reduce_umin_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
-  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+  %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6
   %conv = zext i32 %call1 to i64
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
   %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
-  %call2 = tail call spir_func i32 @_Z20sub_group_reduce_minj(i32 %0)
+  %call2 = tail call spir_func i32 @__mux_sub_group_reduce_umin_i32(i32 %0)
   %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv
   store i32 %call2, i32 addrspace(1)* %arrayidx3, align 4
   ret void
@@ -183,12 +187,12 @@ entry:
 
 define spir_kernel void @reduce_smax_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
-  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+  %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6
   %conv = zext i32 %call1 to i64
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
   %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
-  %call2 = tail call spir_func i32 @_Z20sub_group_reduce_maxi(i32 %0)
+  %call2 = tail call spir_func i32 @__mux_sub_group_reduce_smax_i32(i32 %0)
   %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv
   store i32 %call2, i32 addrspace(1)* %arrayidx3, align 4
   ret void
@@ -203,12 +207,12 @@ entry:
 
 define spir_kernel void @reduce_umax_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
-  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+  %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6
   %conv = zext i32 %call1 to i64
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
   %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
-  %call2 = tail call spir_func i32 @_Z20sub_group_reduce_maxj(i32 %0)
+  %call2 = tail call spir_func i32 @__mux_sub_group_reduce_umax_i32(i32 %0)
   %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv
   store i32 %call2, i32 addrspace(1)* %arrayidx3, align 4
   ret void
@@ -223,12 +227,12 @@ entry:
 
 define spir_kernel void @reduce_fmin_f32(float addrspace(1)* %in, float addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
-  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+  %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6
   %conv = zext i32 %call1 to i64
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
-  %call2 = tail call spir_func float @_Z20sub_group_reduce_minf(float %0)
+  %call2 = tail call spir_func float @__mux_sub_group_reduce_fmin_f32(float %0)
   %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %out, i64 %conv
   store float %call2, float addrspace(1)* %arrayidx3, align 4
   ret void
@@ -243,12 +247,12 @@ entry:
 
 define spir_kernel void @reduce_fmax_f32(float addrspace(1)* %in, float addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
-  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+  %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6
   %conv = zext i32 %call1 to i64
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
-  %call2 = tail call spir_func float @_Z20sub_group_reduce_maxf(float %0)
+  %call2 = tail call spir_func float @__mux_sub_group_reduce_fmax_f32(float %0)
   %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %out, i64 %conv
   store float %call2, float addrspace(1)* %arrayidx3, align 4
   ret void
@@ -260,7 +264,3 @@ entry:
 ; CHECK: [[R:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[I]])
 ; CHECK: store float [[R]], ptr addrspace(1) {{%.*}}, align 4
 }
-
-!opencl.ocl.version = !{!0}
-
-!0 = !{i32 3, i32 0}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions_spv_khr_uniform_group_instructions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions_spv_khr_uniform_group_instructions.ll
index 00d47f7a22d50..032074917d73f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions_spv_khr_uniform_group_instructions.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions_spv_khr_uniform_group_instructions.ll
@@ -19,20 +19,20 @@
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
-declare spir_func i32 @_Z16get_sub_group_idv()
+declare spir_func i64 @__mux_get_global_id(i32)
+declare spir_func i32 @__mux_get_sub_group_id()
 
-declare spir_func i32 @_Z20sub_group_reduce_muli(i32)
-declare spir_func i64 @_Z20sub_group_reduce_mull(i64)
-declare spir_func float @_Z20sub_group_reduce_mulf(float)
+declare spir_func i32 @__mux_sub_group_reduce_mul_i32(i32)
+declare spir_func i64 @__mux_sub_group_reduce_mul_i64(i64)
+declare spir_func float @__mux_sub_group_reduce_fmul_f32(float)
 
-declare spir_func i32 @_Z20sub_group_reduce_andj(i32)
-declare spir_func i32 @_Z19sub_group_reduce_ori(i32)
-declare spir_func i64 @_Z20sub_group_reduce_xorl(i64)
+declare spir_func i32 @__mux_sub_group_reduce_and_i32(i32)
+declare spir_func i32 @__mux_sub_group_reduce_or_i32(i32)
+declare spir_func i64 @__mux_sub_group_reduce_xor_i64(i64)
 
-declare spir_func i1 @_Z28sub_group_reduce_logical_andb(i1)
-declare spir_func i1 @_Z27sub_group_reduce_logical_orb(i1)
-declare spir_func i1 @_Z28sub_group_reduce_logical_xorb(i1)
+declare spir_func i1 @__mux_sub_group_reduce_logical_and_i1(i1)
+declare spir_func i1 @__mux_sub_group_reduce_logical_or_i1(i1)
+declare spir_func i1 @__mux_sub_group_reduce_logical_xor_i1(i1)
 
 ; CHECK-LABEL: @__vecz_v4_vp_reduce_mul_i32(
 ; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0
@@ -43,12 +43,12 @@ declare spir_func i1 @_Z28sub_group_reduce_logical_xorb(i1)
 ; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_mul_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
-  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+  %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6
   %conv = zext i32 %call1 to i64
   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
   %0 = load i32, ptr addrspace(1) %arrayidx, align 4
-  %call2 = tail call spir_func i32 @_Z20sub_group_reduce_muli(i32 %0)
+  %call2 = tail call spir_func i32 @__mux_sub_group_reduce_mul_i32(i32 %0)
   %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %conv
   store i32 %call2, ptr addrspace(1) %arrayidx3, align 4
   ret void
@@ -63,12 +63,12 @@ entry:
 ; CHECK: store i64 [[R]], ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_mul_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
-  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+  %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6
   %conv = zext i32 %call1 to i64
   %arrayidx = getelementptr inbounds i64, ptr addrspace(1) %in, i64 %call
   %0 = load i64, ptr addrspace(1) %arrayidx, align 4
-  %call2 = tail call spir_func i64 @_Z20sub_group_reduce_mull(i64 %0)
+  %call2 = tail call spir_func i64 @__mux_sub_group_reduce_mul_i64(i64 %0)
   %arrayidx3 = getelementptr inbounds i64, ptr addrspace(1) %out, i64 %conv
   store i64 %call2, ptr addrspace(1) %arrayidx3, align 4
   ret void
@@ -83,12 +83,12 @@ entry:
 ; CHECK: store float [[R]], ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_mul_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
-  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+  %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6
   %conv = zext i32 %call1 to i64
   %arrayidx = getelementptr inbounds float, ptr addrspace(1) %in, i64 %call
   %0 = load float, ptr addrspace(1) %arrayidx, align 4
-  %call2 = tail call spir_func float @_Z20sub_group_reduce_mulf(float %0)
+  %call2 = tail call spir_func float @__mux_sub_group_reduce_fmul_f32(float %0)
   %arrayidx3 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %conv
   store float %call2, ptr addrspace(1) %arrayidx3, align 4
   ret void
@@ -103,12 +103,12 @@ entry:
 ; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_and_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
-  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+  %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6
   %conv = zext i32 %call1 to i64
   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
   %0 = load i32, ptr addrspace(1) %arrayidx, align 4
-  %call2 = tail call spir_func i32 @_Z20sub_group_reduce_andj(i32 %0)
+  %call2 = tail call spir_func i32 @__mux_sub_group_reduce_and_i32(i32 %0)
   %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %conv
   store i32 %call2, ptr addrspace(1) %arrayidx3, align 4
   ret void
@@ -123,12 +123,12 @@ entry:
 ; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_or_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
-  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+  %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6
   %conv = zext i32 %call1 to i64
   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
   %0 = load i32, ptr addrspace(1) %arrayidx, align 4
-  %call2 = tail call spir_func i32 @_Z19sub_group_reduce_ori(i32 %0)
+  %call2 = tail call spir_func i32 @__mux_sub_group_reduce_or_i32(i32 %0)
   %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %conv
   store i32 %call2, ptr addrspace(1) %arrayidx3, align 4
   ret void
@@ -143,12 +143,12 @@ entry:
 ; CHECK: store i64 [[R]], ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_xor_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
-  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+  %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6
   %conv = zext i32 %call1 to i64
   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
   %0 = load i64, ptr addrspace(1) %arrayidx, align 4
-  %call2 = tail call spir_func i64 @_Z20sub_group_reduce_xorl(i64 %0)
+  %call2 = tail call spir_func i64 @__mux_sub_group_reduce_xor_i64(i64 %0)
   %arrayidx3 = getelementptr inbounds i64, ptr addrspace(1) %out, i64 %conv
   store i64 %call2, ptr addrspace(1) %arrayidx3, align 4
   ret void
@@ -161,13 +161,13 @@ entry:
 ; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_logical_and(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
-  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+  %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6
   %conv = zext i32 %call1 to i64
   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
   %0 = load i32, ptr addrspace(1) %arrayidx, align 4
   %1 = trunc i32 %0 to i1
-  %call2 = tail call spir_func i1 @_Z28sub_group_reduce_logical_andb(i1 %1)
+  %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_and_i1(i1 %1)
   %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %conv
   %zext = zext i1 %call2 to i32
   store i32 %zext, ptr addrspace(1) %arrayidx3, align 4
@@ -180,13 +180,13 @@ entry:
 ; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_logical_or(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
-  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+  %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6
   %conv = zext i32 %call1 to i64
   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
   %0 = load i32, ptr addrspace(1) %arrayidx, align 4
   %1 = trunc i32 %0 to i1
-  %call2 = tail call spir_func i1 @_Z27sub_group_reduce_logical_orb(i1 %1)
+  %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_or_i1(i1 %1)
   %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %conv
   %zext = zext i1 %call2 to i32
   store i32 %zext, ptr addrspace(1) %arrayidx3, align 4
@@ -200,19 +200,15 @@ entry:
 ; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_logical_xor(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
-  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+  %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6
   %conv = zext i32 %call1 to i64
   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
   %0 = load i32, ptr addrspace(1) %arrayidx, align 4
   %1 = trunc i32 %0 to i1
-  %call2 = tail call spir_func i1 @_Z28sub_group_reduce_logical_xorb(i1 %1)
+  %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_xor_i1(i1 %1)
   %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %conv
   %zext = zext i1 %call2 to i32
   store i32 %zext, ptr addrspace(1) %arrayidx3, align 4
   ret void
 }
-
-!opencl.ocl.version = !{!0}
-
-!0 = !{i32 3, i32 0}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_scans.ll
index b160f5560d5c3..1d186c09d93cc 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_scans.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_scans.ll
@@ -22,25 +22,25 @@
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func i64 @__mux_get_global_id(i32)
 
-declare spir_func i32 @_Z28sub_group_scan_inclusive_addi(i32)
-declare spir_func i64 @_Z28sub_group_scan_inclusive_addl(i64)
-declare spir_func float @_Z28sub_group_scan_inclusive_addf(float)
+declare spir_func i32 @__mux_sub_group_scan_inclusive_add_i32(i32)
+declare spir_func i64 @__mux_sub_group_scan_inclusive_add_i64(i64)
+declare spir_func float @__mux_sub_group_scan_inclusive_fadd_f32(float)
 
-declare spir_func i32 @_Z28sub_group_scan_inclusive_mini(i32)
-declare spir_func i32 @_Z28sub_group_scan_inclusive_minj(i32)
-declare spir_func i32 @_Z28sub_group_scan_inclusive_maxi(i32)
-declare spir_func i32 @_Z28sub_group_scan_inclusive_maxj(i32)
-declare spir_func float @_Z28sub_group_scan_inclusive_minf(float)
-declare spir_func float @_Z28sub_group_scan_inclusive_maxf(float)
+declare spir_func i32 @__mux_sub_group_scan_inclusive_smin_i32(i32)
+declare spir_func i32 @__mux_sub_group_scan_inclusive_umin_i32(i32)
+declare spir_func i32 @__mux_sub_group_scan_inclusive_smax_i32(i32)
+declare spir_func i32 @__mux_sub_group_scan_inclusive_umax_i32(i32)
+declare spir_func float @__mux_sub_group_scan_inclusive_fmin_f32(float)
+declare spir_func float @__mux_sub_group_scan_inclusive_fmax_f32(float)
 
 define spir_kernel void @reduce_scan_incl_add_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
   %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_addi(i32 %0)
+  %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_add_i32(i32 %0)
   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
   store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
   ret void
@@ -50,10 +50,10 @@ entry:
 
 define spir_kernel void @reduce_scan_incl_add_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i64, i64 addrspace(1)* %in, i64 %call
   %0 = load i64, i64 addrspace(1)* %arrayidx, align 4
-  %call1 = tail call spir_func i64 @_Z28sub_group_scan_inclusive_addl(i64 %0)
+  %call1 = tail call spir_func i64 @__mux_sub_group_scan_inclusive_add_i64(i64 %0)
   %arrayidx2 = getelementptr inbounds i64, i64 addrspace(1)* %out, i64 %call
   store i64 %call1, i64 addrspace(1)* %arrayidx2, align 4
   ret void
@@ -63,10 +63,10 @@ entry:
 
 define spir_kernel void @reduce_scan_incl_add_f32(float addrspace(1)* %in, float addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
-  %call1 = tail call spir_func float @_Z28sub_group_scan_inclusive_addf(float %0)
+  %call1 = tail call spir_func float @__mux_sub_group_scan_inclusive_fadd_f32(float %0)
   %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call
   store float %call1, float addrspace(1)* %arrayidx2, align 4
   ret void
@@ -76,10 +76,10 @@ entry:
 
 define spir_kernel void @reduce_scan_incl_smin_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
   %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_mini(i32 %0)
+  %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_smin_i32(i32 %0)
   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
   store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
   ret void
@@ -89,10 +89,10 @@ entry:
 
 define spir_kernel void @reduce_scan_incl_umin_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
   %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_minj(i32 %0)
+  %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_umin_i32(i32 %0)
   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
   store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
   ret void
@@ -102,10 +102,10 @@ entry:
 
 define spir_kernel void @reduce_scan_incl_smax_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
   %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_maxi(i32 %0)
+  %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_smax_i32(i32 %0)
   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
   store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
   ret void
@@ -115,10 +115,10 @@ entry:
 
 define spir_kernel void @reduce_scan_incl_umax_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
   %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_maxj(i32 %0)
+  %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_umax_i32(i32 %0)
   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
   store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
   ret void
@@ -128,10 +128,10 @@ entry:
 
 define spir_kernel void @reduce_scan_incl_fmin_f32(float addrspace(1)* %in, float addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
-  %call1 = tail call spir_func float @_Z28sub_group_scan_inclusive_minf(float %0)
+  %call1 = tail call spir_func float @__mux_sub_group_scan_inclusive_fmin_f32(float %0)
   %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call
   store float %call1, float addrspace(1)* %arrayidx2, align 4
   ret void
@@ -141,17 +141,13 @@ entry:
 
 define spir_kernel void @reduce_scan_incl_fmax_f32(float addrspace(1)* %in, float addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
-  %call1 = tail call spir_func float @_Z28sub_group_scan_inclusive_maxf(float %0)
+  %call1 = tail call spir_func float @__mux_sub_group_scan_inclusive_fmax_f32(float %0)
   %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call
   store float %call1, float addrspace(1)* %arrayidx2, align 4
   ret void
 ; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_incl_fmax_f32(
 ; CHECK: call <4 x float> @__vecz_b_sub_group_scan_inclusive_max_Dv4_f(<4 x float> %{{.*}})
 }
-
-!opencl.ocl.version = !{!0}
-
-!0 = !{i32 3, i32 0}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_scans_spv_khr_uniform_group_instructions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_scans_spv_khr_uniform_group_instructions.ll
index c8e64421cd0db..145faeec0e6f1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_scans_spv_khr_uniform_group_instructions.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_scans_spv_khr_uniform_group_instructions.ll
@@ -22,29 +22,29 @@
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func i64 @__mux_get_global_id(i32)
 
-declare spir_func i32 @_Z28sub_group_scan_inclusive_muli(i32)
-declare spir_func float @_Z28sub_group_scan_inclusive_mulf(float)
+declare spir_func i32 @__mux_sub_group_scan_inclusive_mul_i32(i32)
+declare spir_func float @__mux_sub_group_scan_inclusive_fmul_f32(float)
 
-declare spir_func i32 @_Z28sub_group_scan_exclusive_muli(i32)
-declare spir_func float @_Z28sub_group_scan_exclusive_mulf(float)
+declare spir_func i32 @__mux_sub_group_scan_exclusive_mul_i32(i32)
+declare spir_func float @__mux_sub_group_scan_exclusive_fmul_f32(float)
 
-declare spir_func i32 @_Z28sub_group_scan_inclusive_andi(i32)
-declare spir_func i32 @_Z27sub_group_scan_inclusive_ori(i32)
-declare spir_func i32 @_Z28sub_group_scan_inclusive_xori(i32)
-declare spir_func i1 @_Z36sub_group_scan_inclusive_logical_andb(i1)
-declare spir_func i1 @_Z35sub_group_scan_inclusive_logical_orb(i1)
-declare spir_func i1 @_Z36sub_group_scan_inclusive_logical_xorb(i1)
+declare spir_func i32 @__mux_sub_group_scan_inclusive_and_i32(i32)
+declare spir_func i32 @__mux_sub_group_scan_inclusive_or_i32(i32)
+declare spir_func i32 @__mux_sub_group_scan_inclusive_xor_i32(i32)
+declare spir_func i1 @__mux_sub_group_scan_inclusive_logical_and_i1(i1)
+declare spir_func i1 @__mux_sub_group_scan_inclusive_logical_or_i1(i1)
+declare spir_func i1 @__mux_sub_group_scan_inclusive_logical_xor_i1(i1)
 
 ; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_incl_mul_i32(
 ; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_mul_Dv4_j(<4 x i32> %{{.*}})
 define spir_kernel void @reduce_scan_incl_mul_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
   %0 = load i32, ptr addrspace(1) %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_muli(i32 %0)
+  %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_mul_i32(i32 %0)
   %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
   store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
   ret void
@@ -54,10 +54,10 @@ entry:
 ; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_exclusive_mul_Dv4_j(<4 x i32> %{{.*}})
 define spir_kernel void @reduce_scan_excl_mul_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
   %0 = load i32, ptr addrspace(1) %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z28sub_group_scan_exclusive_muli(i32 %0)
+  %call1 = tail call spir_func i32 @__mux_sub_group_scan_exclusive_mul_i32(i32 %0)
   %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
   store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
   ret void
@@ -67,10 +67,10 @@ entry:
 ; CHECK: call <4 x float> @__vecz_b_sub_group_scan_inclusive_mul_Dv4_f(<4 x float> %{{.*}})
 define spir_kernel void @reduce_scan_incl_mul_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, ptr addrspace(1) %in, i64 %call
   %0 = load float, ptr addrspace(1) %arrayidx, align 4
-  %call1 = tail call spir_func float @_Z28sub_group_scan_inclusive_mulf(float %0)
+  %call1 = tail call spir_func float @__mux_sub_group_scan_inclusive_fmul_f32(float %0)
   %arrayidx2 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %call
   store float %call1, ptr addrspace(1) %arrayidx2, align 4
   ret void
@@ -80,10 +80,10 @@ entry:
 ; CHECK: call <4 x float> @__vecz_b_sub_group_scan_exclusive_mul_Dv4_f(<4 x float> %{{.*}})
 define spir_kernel void @reduce_scan_excl_mul_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, ptr addrspace(1) %in, i64 %call
   %0 = load float, ptr addrspace(1) %arrayidx, align 4
-  %call1 = tail call spir_func float @_Z28sub_group_scan_exclusive_mulf(float %0)
+  %call1 = tail call spir_func float @__mux_sub_group_scan_exclusive_fmul_f32(float %0)
   %arrayidx2 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %call
   store float %call1, ptr addrspace(1) %arrayidx2, align 4
   ret void
@@ -93,10 +93,10 @@ entry:
 ; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_and_Dv4_j(<4 x i32> %{{.*}})
 define spir_kernel void @reduce_scan_incl_and_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
   %0 = load i32, ptr addrspace(1) %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_andi(i32 %0)
+  %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_and_i32(i32 %0)
   %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
   store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
   ret void
@@ -106,10 +106,10 @@ entry:
 ; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_or_Dv4_j(<4 x i32> %{{.*}})
 define spir_kernel void @reduce_scan_incl_or_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
   %0 = load i32, ptr addrspace(1) %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z27sub_group_scan_inclusive_ori(i32 %0)
+  %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_or_i32(i32 %0)
   %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
   store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
   ret void
@@ -119,10 +119,10 @@ entry:
 ; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_xor_Dv4_j(<4 x i32> %{{.*}})
 define spir_kernel void @reduce_scan_incl_xor_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
   %0 = load i32, ptr addrspace(1) %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_xori(i32 %0)
+  %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_xor_i32(i32 %0)
   %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
   store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
   ret void
@@ -132,11 +132,11 @@ entry:
 ; CHECK: call <4 x i1> @__vecz_b_sub_group_scan_inclusive_and_Dv4_b(<4 x i1> %{{.*}})
 define spir_kernel void @reduce_scan_incl_logical_and(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
   %0 = load i32, ptr addrspace(1) %arrayidx, align 4
   %1 = trunc i32 %0 to i1
-  %call1 = tail call spir_func i1 @_Z36sub_group_scan_inclusive_logical_andb(i1 %1)
+  %call1 = tail call spir_func i1 @__mux_sub_group_scan_inclusive_logical_and_i1(i1 %1)
   %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
   %2 = zext i1 %call1 to i32
   store i32 %2, ptr addrspace(1) %arrayidx2, align 4
@@ -147,11 +147,11 @@ entry:
 ; CHECK: call <4 x i1> @__vecz_b_sub_group_scan_inclusive_or_Dv4_b(<4 x i1> %{{.*}})
 define spir_kernel void @reduce_scan_incl_logical_or(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
   %0 = load i32, ptr addrspace(1) %arrayidx, align 4
   %1 = trunc i32 %0 to i1
-  %call1 = tail call spir_func i1 @_Z35sub_group_scan_inclusive_logical_orb(i1 %1)
+  %call1 = tail call spir_func i1 @__mux_sub_group_scan_inclusive_logical_or_i1(i1 %1)
   %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
   %2 = zext i1 %call1 to i32
   store i32 %2, ptr addrspace(1) %arrayidx2, align 4
@@ -162,17 +162,13 @@ entry:
 ; CHECK: call <4 x i1> @__vecz_b_sub_group_scan_inclusive_xor_Dv4_b(<4 x i1> %{{.*}})
 define spir_kernel void @reduce_scan_incl_logical_xor(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
   %0 = load i32, ptr addrspace(1) %arrayidx, align 4
   %1 = trunc i32 %0 to i1
-  %call1 = tail call spir_func i1 @_Z36sub_group_scan_inclusive_logical_xorb(i1 %1)
+  %call1 = tail call spir_func i1 @__mux_sub_group_scan_inclusive_logical_xor_i1(i1 %1)
   %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
   %2 = zext i1 %call1 to i32
   store i32 %2, ptr addrspace(1) %arrayidx2, align 4
   ret void
 }
-
-!opencl.ocl.version = !{!0}
-
-!0 = !{i32 3, i32 0}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_broadcast.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_broadcast.ll
index e43769a8627c4..59b2dee6b099b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_broadcast.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_broadcast.ll
@@ -20,16 +20,16 @@ target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 declare spir_func i32 @_Z16get_sub_group_idv()
-declare spir_func i32 @_Z22get_sub_group_local_idv()
-declare spir_func i32 @_Z19sub_group_broadcastij(i32, i32)
+declare spir_func i32 @__mux_get_sub_group_local_id()
+declare spir_func i32 @__mux_sub_group_broadcast_i32(i32, i32)
 
 ; It makes sure broadcast still works when its source operand is uniform
 define spir_kernel void @sub_group_broadcast(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
   %call = tail call spir_func i32 @_Z16get_sub_group_idv()
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %call
   %v = load i32, i32 addrspace(1)* %arrayidx, align 4
-  %broadcast = call spir_func i32 @_Z19sub_group_broadcastij(i32 %v, i32 0)
-  %idx = tail call spir_func i32 @_Z22get_sub_group_local_idv()
+  %broadcast = call spir_func i32 @__mux_sub_group_broadcast_i32(i32 %v, i32 0)
+  %idx = tail call spir_func i32 @__mux_get_sub_group_local_id()
   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %idx
   store i32 %broadcast, i32 addrspace(1)* %arrayidx2, align 4
   ret void
@@ -40,7 +40,3 @@ define spir_kernel void @sub_group_broadcast(i32 addrspace(1)* %in, i32 addrspac
 ; CHECK: [[INS:%.+]] = insertelement <4 x i32> poison, i32 [[LD]], i64 0
 ; CHECK: [[BCAST:%.+]] = shufflevector <4 x i32> [[INS]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK: store <4 x i32> [[BCAST]], ptr addrspace(1) %out, align 4
-
-!opencl.ocl.version = !{!0}
-
-!0 = !{i32 3, i32 0}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll
index 27e08e88a8b5e..5dfcca1e82f05 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll
@@ -19,17 +19,17 @@
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-declare spir_func i32 @_Z16get_sub_group_idv()
-declare spir_func i32 @_Z18get_sub_group_sizev()
-declare spir_func i32 @_Z22get_sub_group_local_idv()
-declare spir_func i32 @_Z19sub_group_broadcastij(i32, i32)
-declare spir_func i64 @_Z13get_global_idj(i32)
-declare spir_func i32 @_Z13sub_group_anyi(i32)
+declare spir_func i32 @__mux_get_sub_group_id()
+declare spir_func i32 @__mux_get_sub_group_size()
+declare spir_func i32 @__mux_get_sub_group_local_id()
+declare spir_func i32 @__mux_sub_group_broadcast_i32(i32, i32)
+declare spir_func i64 @__mux_get_global_id(i32)
+declare spir_func i1 @__mux_sub_group_any_i1(i1)
 
 define spir_kernel void @get_sub_group_size(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
-  %call.i = tail call spir_func i32 @_Z16get_sub_group_idv()
+  %call.i = tail call spir_func i32 @__mux_get_sub_group_id()
   %conv = zext i32 %call.i to i64
-  %call2 = tail call spir_func i32 @_Z18get_sub_group_sizev()
+  %call2 = tail call spir_func i32 @__mux_get_sub_group_size()
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv
   store i32 %call2, i32 addrspace(1)* %arrayidx, align 4
   ret void
@@ -38,7 +38,7 @@ define spir_kernel void @get_sub_group_size(i32 addrspace(1)* %in, i32 addrspace
 }
 
 define spir_kernel void @get_sub_group_local_id(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
-  %call = tail call spir_func i32 @_Z22get_sub_group_local_idv()
+  %call = tail call spir_func i32 @__mux_get_sub_group_local_id()
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %call
   store i32 %call, i32 addrspace(1)* %arrayidx, align 4
   ret void
@@ -47,10 +47,10 @@ define spir_kernel void @get_sub_group_local_id(i32 addrspace(1)* %in, i32 addrs
 }
 
 define spir_kernel void @sub_group_broadcast(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
-  %call = tail call spir_func i32 @_Z22get_sub_group_local_idv()
+  %call = tail call spir_func i32 @__mux_get_sub_group_local_id()
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %call
   %v = load i32, i32 addrspace(1)* %arrayidx, align 4
-  %broadcast = call spir_func i32 @_Z19sub_group_broadcastij(i32 %v, i32 0)
+  %broadcast = call spir_func i32 @__mux_sub_group_broadcast_i32(i32 %v, i32 0)
   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %call
   store i32 %broadcast, i32 addrspace(1)* %arrayidx2, align 4
   ret void
@@ -63,8 +63,8 @@ define spir_kernel void @sub_group_broadcast(i32 addrspace(1)* %in, i32 addrspac
 ; This used to crash as packetizing get_sub_group_local_id produces a Constant, which we weren't expecting.
 define spir_kernel void @regression_sub_group_local_id(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %xy, i32 addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
-  %call1 = tail call spir_func i32 @_Z22get_sub_group_local_idv()
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+  %call1 = tail call spir_func i32 @__mux_get_sub_group_local_id()
   %0 = shl i64 %call, 32
   %idxprom = ashr exact i64 %0, 32
   %arrayidx = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %xy, i64 %idxprom
@@ -72,17 +72,15 @@ entry:
   %2 = insertelement <4 x i32> %1, i32 %call1, i64 0
   %3 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arrayidx, i64 0, i64 0
   store i32 %call1, i32 addrspace(1)* %3, align 16
-  %call2 = tail call spir_func i32 @_Z16get_sub_group_idv()
+  %call2 = tail call spir_func i32 @__mux_get_sub_group_id()
   %4 = insertelement <4 x i32> %2, i32 %call2, i64 1
   store <4 x i32> %4, <4 x i32> addrspace(1)* %arrayidx, align 16
   %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom
   %5 = load i32, i32 addrspace(1)* %arrayidx6, align 4
-  %call7 = tail call spir_func i32 @_Z13sub_group_anyi(i32 %5)
+  %6 = icmp ne i32 %5, 0
+  %call7 = tail call spir_func i1 @__mux_sub_group_any_i1(i1 %6)
+  %7 = sext i1 %call7 to i32
   %arrayidx9 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
-  store i32 %call7, i32 addrspace(1)* %arrayidx9, align 4
+  store i32 %7, i32 addrspace(1)* %arrayidx9, align 4
   ret void
 }
-
-!opencl.ocl.version = !{!0}
-
-!0 = !{i32 3, i32 0}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions.ll
index 69e4ccc0fb961..7d27f2dc1cce8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions.ll
@@ -19,32 +19,34 @@
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
-declare spir_func i32 @_Z16get_sub_group_idv()
-
-declare spir_func i32 @_Z13sub_group_alli(i32)
-declare spir_func i32 @_Z13sub_group_anyi(i32)
-
-declare spir_func i32 @_Z20sub_group_reduce_addi(i32)
-declare spir_func i64 @_Z20sub_group_reduce_addl(i64)
-declare spir_func float @_Z20sub_group_reduce_addf(float)
-declare spir_func i32 @_Z20sub_group_reduce_mini(i32)
-declare spir_func i32 @_Z20sub_group_reduce_minj(i32)
-declare spir_func i32 @_Z20sub_group_reduce_maxi(i32)
-declare spir_func i32 @_Z20sub_group_reduce_maxj(i32)
-declare spir_func float @_Z20sub_group_reduce_minf(float)
-declare spir_func float @_Z20sub_group_reduce_maxf(float)
+declare spir_func i64 @__mux_get_global_id(i32)
+declare spir_func i32 @__mux_get_sub_group_id()
+
+declare spir_func i1 @__mux_sub_group_all_i1(i1)
+declare spir_func i1 @__mux_sub_group_any_i1(i1)
+
+declare spir_func i32 @__mux_sub_group_reduce_add_i32(i32)
+declare spir_func i64 @__mux_sub_group_reduce_add_i64(i64)
+declare spir_func float @__mux_sub_group_reduce_fadd_f32(float)
+declare spir_func i32 @__mux_sub_group_reduce_smin_i32(i32)
+declare spir_func i32 @__mux_sub_group_reduce_umin_i32(i32)
+declare spir_func i32 @__mux_sub_group_reduce_smax_i32(i32)
+declare spir_func i32 @__mux_sub_group_reduce_umax_i32(i32)
+declare spir_func float @__mux_sub_group_reduce_fmin_f32(float)
+declare spir_func float @__mux_sub_group_reduce_fmax_f32(float)
 
 define spir_kernel void @reduce_all_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
-  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+  %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6
   %conv = zext i32 %call1 to i64
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
   %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
-  %call2 = tail call spir_func i32 @_Z13sub_group_alli(i32 %0)
+  %1 = icmp ne i32 %0, 0
+  %call2 = tail call spir_func i1 @__mux_sub_group_all_i1(i1 %1)
+  %2 = sext i1 %call2 to i32
   %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv
-  store i32 %call2, i32 addrspace(1)* %arrayidx3, align 4
+  store i32 %2, i32 addrspace(1)* %arrayidx3, align 4
   ret void
 ; CHECK-LABEL: @__vecz_v4_reduce_all_i32(
 ; CHECK: [[T2:%.*]] = icmp eq <4 x i32> %{{.*}}, zeroinitializer
@@ -58,14 +60,16 @@ entry:
 
 define spir_kernel void @reduce_any_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
-  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+  %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6
   %conv = zext i32 %call1 to i64
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
   %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
-  %call2 = tail call spir_func i32 @_Z13sub_group_anyi(i32 %0)
+  %1 = icmp ne i32 %0, 0
+  %call2 = tail call spir_func i1 @__mux_sub_group_any_i1(i1 %1)
+  %2 = sext i1 %call2 to i32
   %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv
-  store i32 %call2, i32 addrspace(1)* %arrayidx3, align 4
+  store i32 %2, i32 addrspace(1)* %arrayidx3, align 4
   ret void
 ; CHECK-LABEL: @__vecz_v4_reduce_any_i32(
 ; CHECK: [[T2:%.*]] = icmp ne <4 x i32> %{{.*}}, zeroinitializer
@@ -79,12 +83,12 @@ entry:
 
 define spir_kernel void @reduce_add_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
-  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+  %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6
   %conv = zext i32 %call1 to i64
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
   %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
-  %call2 = tail call spir_func i32 @_Z20sub_group_reduce_addi(i32 %0)
+  %call2 = tail call spir_func i32 @__mux_sub_group_reduce_add_i32(i32 %0)
   %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv
   store i32 %call2, i32 addrspace(1)* %arrayidx3, align 4
   ret void
@@ -98,8 +102,8 @@ entry:
 
 define spir_kernel void @reduce_add_i32_uniform(i32 addrspace(1)* %out, i32 %n) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
-  %call1 = tail call spir_func i32 @_Z20sub_group_reduce_addi(i32 %n)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+  %call1 = tail call spir_func i32 @__mux_sub_group_reduce_add_i32(i32 %n)
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
   store i32 %call1, i32 addrspace(1)* %arrayidx, align 4
   ret void
@@ -115,12 +119,12 @@ entry:
 
 define spir_kernel void @reduce_add_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
-  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+  %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6
   %conv = zext i32 %call1 to i64
   %arrayidx = getelementptr inbounds i64, i64 addrspace(1)* %in, i64 %call
   %0 = load i64, i64 addrspace(1)* %arrayidx, align 4
-  %call2 = tail call spir_func i64 @_Z20sub_group_reduce_addl(i64 %0)
+  %call2 = tail call spir_func i64 @__mux_sub_group_reduce_add_i64(i64 %0)
   %arrayidx3 = getelementptr inbounds i64, i64 addrspace(1)* %out, i64 %conv
   store i64 %call2, i64 addrspace(1)* %arrayidx3, align 4
   ret void
@@ -131,12 +135,12 @@ entry:
 
 define spir_kernel void @reduce_add_f32(float addrspace(1)* %in, float addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
-  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+  %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6
   %conv = zext i32 %call1 to i64
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
-  %call2 = tail call spir_func float @_Z20sub_group_reduce_addf(float %0)
+  %call2 = tail call spir_func float @__mux_sub_group_reduce_fadd_f32(float %0)
   %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %out, i64 %conv
   store float %call2, float addrspace(1)* %arrayidx3, align 4
   ret void
@@ -147,12 +151,12 @@ entry:
 
 define spir_kernel void @reduce_smin_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
-  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+  %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6
   %conv = zext i32 %call1 to i64
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
   %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
-  %call2 = tail call spir_func i32 @_Z20sub_group_reduce_mini(i32 %0)
+  %call2 = tail call spir_func i32 @__mux_sub_group_reduce_smin_i32(i32 %0)
   %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv
   store i32 %call2, i32 addrspace(1)* %arrayidx3, align 4
   ret void
@@ -163,12 +167,12 @@ entry:
 
 define spir_kernel void @reduce_umin_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
-  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+  %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6
   %conv = zext i32 %call1 to i64
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
   %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
-  %call2 = tail call spir_func i32 @_Z20sub_group_reduce_minj(i32 %0)
+  %call2 = tail call spir_func i32 @__mux_sub_group_reduce_umin_i32(i32 %0)
   %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv
   store i32 %call2, i32 addrspace(1)* %arrayidx3, align 4
   ret void
@@ -179,12 +183,12 @@ entry:
 
 define spir_kernel void @reduce_smax_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
-  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+  %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6
   %conv = zext i32 %call1 to i64
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
   %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
-  %call2 = tail call spir_func i32 @_Z20sub_group_reduce_maxi(i32 %0)
+  %call2 = tail call spir_func i32 @__mux_sub_group_reduce_smax_i32(i32 %0)
   %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv
   store i32 %call2, i32 addrspace(1)* %arrayidx3, align 4
   ret void
@@ -195,12 +199,12 @@ entry:
 
 define spir_kernel void @reduce_umax_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
-  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+  %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6
   %conv = zext i32 %call1 to i64
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
   %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
-  %call2 = tail call spir_func i32 @_Z20sub_group_reduce_maxj(i32 %0)
+  %call2 = tail call spir_func i32 @__mux_sub_group_reduce_umax_i32(i32 %0)
   %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv
   store i32 %call2, i32 addrspace(1)* %arrayidx3, align 4
   ret void
@@ -211,12 +215,12 @@ entry:
 
 define spir_kernel void @reduce_fmin_f32(float addrspace(1)* %in, float addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
-  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+  %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6
   %conv = zext i32 %call1 to i64
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
-  %call2 = tail call spir_func float @_Z20sub_group_reduce_minf(float %0)
+  %call2 = tail call spir_func float @__mux_sub_group_reduce_fmin_f32(float %0)
   %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %out, i64 %conv
   store float %call2, float addrspace(1)* %arrayidx3, align 4
   ret void
@@ -227,12 +231,12 @@ entry:
 
 define spir_kernel void @reduce_fmax_f32(float addrspace(1)* %in, float addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
-  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+  %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6
   %conv = zext i32 %call1 to i64
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
-  %call2 = tail call spir_func float @_Z20sub_group_reduce_maxf(float %0)
+  %call2 = tail call spir_func float @__mux_sub_group_reduce_fmax_f32(float %0)
   %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %out, i64 %conv
   store float %call2, float addrspace(1)* %arrayidx3, align 4
   ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions_spv_khr_uniform_group_instructions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions_spv_khr_uniform_group_instructions.ll
index 8588ef0099bbd..067dd7ac7983d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions_spv_khr_uniform_group_instructions.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions_spv_khr_uniform_group_instructions.ll
@@ -19,32 +19,32 @@
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
-declare spir_func i32 @_Z16get_sub_group_idv()
+declare spir_func i64 @__mux_get_global_id(i32)
+declare spir_func i32 @__mux_get_sub_group_id()
 
-declare spir_func i32 @_Z20sub_group_reduce_muli(i32)
-declare spir_func i64 @_Z20sub_group_reduce_mull(i64)
-declare spir_func float @_Z20sub_group_reduce_mulf(float)
+declare spir_func i32 @__mux_sub_group_reduce_mul_i32(i32)
+declare spir_func i64 @__mux_sub_group_reduce_mul_i64(i64)
+declare spir_func float @__mux_sub_group_reduce_fmul_f32(float)
 
-declare spir_func i32 @_Z20sub_group_reduce_andj(i32)
-declare spir_func i32 @_Z19sub_group_reduce_ori(i32)
-declare spir_func i64 @_Z20sub_group_reduce_xorl(i64)
+declare spir_func i32 @__mux_sub_group_reduce_and_i32(i32)
+declare spir_func i32 @__mux_sub_group_reduce_or_i32(i32)
+declare spir_func i64 @__mux_sub_group_reduce_xor_i64(i64)
 
-declare spir_func i1 @_Z28sub_group_reduce_logical_andb(i1)
-declare spir_func i1 @_Z27sub_group_reduce_logical_orb(i1)
-declare spir_func i1 @_Z28sub_group_reduce_logical_xorb(i1)
+declare spir_func i1 @__mux_sub_group_reduce_logical_and_i1(i1)
+declare spir_func i1 @__mux_sub_group_reduce_logical_or_i1(i1)
+declare spir_func i1 @__mux_sub_group_reduce_logical_xor_i1(i1)
 
 ; CHECK-LABEL: @__vecz_v4_reduce_mul_i32(
 ; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %{{.*}})
 ; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_mul_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
-  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+  %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6
   %conv = zext i32 %call1 to i64
   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
   %0 = load i32, ptr addrspace(1) %arrayidx, align 4
-  %call2 = tail call spir_func i32 @_Z20sub_group_reduce_muli(i32 %0)
+  %call2 = tail call spir_func i32 @__mux_sub_group_reduce_mul_i32(i32 %0)
   %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %conv
   store i32 %call2, ptr addrspace(1) %arrayidx3, align 4
   ret void
@@ -55,12 +55,12 @@ entry:
 ; CHECK: store i64 [[R]], ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_mul_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
-  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+  %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6
   %conv = zext i32 %call1 to i64
   %arrayidx = getelementptr inbounds i64, ptr addrspace(1) %in, i64 %call
   %0 = load i64, ptr addrspace(1) %arrayidx, align 4
-  %call2 = tail call spir_func i64 @_Z20sub_group_reduce_mull(i64 %0)
+  %call2 = tail call spir_func i64 @__mux_sub_group_reduce_mul_i64(i64 %0)
   %arrayidx3 = getelementptr inbounds i64, ptr addrspace(1) %out, i64 %conv
   store i64 %call2, ptr addrspace(1) %arrayidx3, align 4
   ret void
@@ -71,12 +71,12 @@ entry:
 ; CHECK: store float [[R]], ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_mul_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
-  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+  %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6
   %conv = zext i32 %call1 to i64
   %arrayidx = getelementptr inbounds float, ptr addrspace(1) %in, i64 %call
   %0 = load float, ptr addrspace(1) %arrayidx, align 4
-  %call2 = tail call spir_func float @_Z20sub_group_reduce_mulf(float %0)
+  %call2 = tail call spir_func float @__mux_sub_group_reduce_fmul_f32(float %0)
   %arrayidx3 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %conv
   store float %call2, ptr addrspace(1) %arrayidx3, align 4
   ret void
@@ -87,12 +87,12 @@ entry:
 ; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_and_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
-  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+  %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6
   %conv = zext i32 %call1 to i64
   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
   %0 = load i32, ptr addrspace(1) %arrayidx, align 4
-  %call2 = tail call spir_func i32 @_Z20sub_group_reduce_andj(i32 %0)
+  %call2 = tail call spir_func i32 @__mux_sub_group_reduce_and_i32(i32 %0)
   %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %conv
   store i32 %call2, ptr addrspace(1) %arrayidx3, align 4
   ret void
@@ -103,12 +103,12 @@ entry:
 ; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_or_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
-  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+  %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6
   %conv = zext i32 %call1 to i64
   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
   %0 = load i32, ptr addrspace(1) %arrayidx, align 4
-  %call2 = tail call spir_func i32 @_Z19sub_group_reduce_ori(i32 %0)
+  %call2 = tail call spir_func i32 @__mux_sub_group_reduce_or_i32(i32 %0)
   %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %conv
   store i32 %call2, ptr addrspace(1) %arrayidx3, align 4
   ret void
@@ -119,12 +119,12 @@ entry:
 ; CHECK: store i64 [[R]], ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_xor_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
-  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+  %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6
   %conv = zext i32 %call1 to i64
   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
   %0 = load i64, ptr addrspace(1) %arrayidx, align 4
-  %call2 = tail call spir_func i64 @_Z20sub_group_reduce_xorl(i64 %0)
+  %call2 = tail call spir_func i64 @__mux_sub_group_reduce_xor_i64(i64 %0)
   %arrayidx3 = getelementptr inbounds i64, ptr addrspace(1) %out, i64 %conv
   store i64 %call2, ptr addrspace(1) %arrayidx3, align 4
   ret void
@@ -137,13 +137,13 @@ entry:
 ; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_logical_and(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
-  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+  %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6
   %conv = zext i32 %call1 to i64
   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
   %0 = load i32, ptr addrspace(1) %arrayidx, align 4
   %1 = trunc i32 %0 to i1
-  %call2 = tail call spir_func i1 @_Z28sub_group_reduce_logical_andb(i1 %1)
+  %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_and_i1(i1 %1)
   %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %conv
   %zext = zext i1 %call2 to i32
   store i32 %zext, ptr addrspace(1) %arrayidx3, align 4
@@ -156,13 +156,13 @@ entry:
 ; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_logical_or(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
-  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+  %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6
   %conv = zext i32 %call1 to i64
   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
   %0 = load i32, ptr addrspace(1) %arrayidx, align 4
   %1 = trunc i32 %0 to i1
-  %call2 = tail call spir_func i1 @_Z27sub_group_reduce_logical_orb(i1 %1)
+  %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_or_i1(i1 %1)
   %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %conv
   %zext = zext i1 %call2 to i32
   store i32 %zext, ptr addrspace(1) %arrayidx3, align 4
@@ -176,19 +176,15 @@ entry:
 ; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_logical_xor(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
-  %call1 = tail call spir_func i32 @_Z16get_sub_group_idv() #6
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+  %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6
   %conv = zext i32 %call1 to i64
   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
   %0 = load i32, ptr addrspace(1) %arrayidx, align 4
   %1 = trunc i32 %0 to i1
-  %call2 = tail call spir_func i1 @_Z28sub_group_reduce_logical_xorb(i1 %1)
+  %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_xor_i1(i1 %1)
   %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %conv
   %zext = zext i1 %call2 to i32
   store i32 %zext, ptr addrspace(1) %arrayidx3, align 4
   ret void
 }
-
-!opencl.ocl.version = !{!0}
-
-!0 = !{i32 3, i32 0}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_scans.ll
index 297310993af81..837726ecee774 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_scans.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_scans.ll
@@ -14,149 +14,137 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: sed 's/VERSION/i32 1, i32 2/g' %s | veczc -w 4 -S -vecz-passes=packetizer | FileCheck %s --check-prefixes CHECK,CHECK-12
-; RUN: sed 's/VERSION/i32 3, i32 0/g' %s | veczc -w 4 -S -vecz-passes=packetizer | FileCheck %s --check-prefixes CHECK,CHECK-30
+; RUN: veczc -w 4 -S -vecz-passes=packetizer < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func i64 @__mux_get_global_id(i32)
 
-declare spir_func i32 @_Z28sub_group_scan_inclusive_addi(i32)
-declare spir_func i64 @_Z28sub_group_scan_inclusive_addl(i64)
-declare spir_func float @_Z28sub_group_scan_inclusive_addf(float)
+declare spir_func i32 @__mux_sub_group_scan_inclusive_add_i32(i32)
+declare spir_func i64 @__mux_sub_group_scan_inclusive_add_i64(i64)
+declare spir_func float @__mux_sub_group_scan_inclusive_fadd_f32(float)
 
-declare spir_func i32 @_Z28sub_group_scan_inclusive_mini(i32)
-declare spir_func i32 @_Z28sub_group_scan_inclusive_minj(i32)
-declare spir_func i32 @_Z28sub_group_scan_inclusive_maxi(i32)
-declare spir_func i32 @_Z28sub_group_scan_inclusive_maxj(i32)
-declare spir_func float @_Z28sub_group_scan_inclusive_minf(float)
-declare spir_func float @_Z28sub_group_scan_inclusive_maxf(float)
+declare spir_func i32 @__mux_sub_group_scan_inclusive_smin_i32(i32)
+declare spir_func i32 @__mux_sub_group_scan_inclusive_umin_i32(i32)
+declare spir_func i32 @__mux_sub_group_scan_inclusive_smax_i32(i32)
+declare spir_func i32 @__mux_sub_group_scan_inclusive_umax_i32(i32)
+declare spir_func float @__mux_sub_group_scan_inclusive_fmin_f32(float)
+declare spir_func float @__mux_sub_group_scan_inclusive_fmax_f32(float)
 
 define spir_kernel void @reduce_scan_incl_add_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
   %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_addi(i32 %0)
+  %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_add_i32(i32 %0)
   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
   store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
   ret void
 ; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_add_i32(
-; CHECK-30: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_add_Dv4_j(<4 x i32> %{{.*}})
-; Obviously this codegen doesn't make sense for a real sub-group scan, but in
-; CL1.2 this isn't identified as one. Check instead that the call has been instantiated.
-; CHECK-12: %{{.*}} = call spir_func i32 @_Z28sub_group_scan_inclusive_addi(i32 %{{.*}})
-; CHECK-12: %{{.*}} = call spir_func i32 @_Z28sub_group_scan_inclusive_addi(i32 %{{.*}})
-; CHECK-12: %{{.*}} = call spir_func i32 @_Z28sub_group_scan_inclusive_addi(i32 %{{.*}})
-; CHECK-12: %{{.*}} = call spir_func i32 @_Z28sub_group_scan_inclusive_addi(i32 %{{.*}})
-
+; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_add_Dv4_j(<4 x i32> %{{.*}})
 }
 
 define spir_kernel void @reduce_scan_incl_add_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i64, i64 addrspace(1)* %in, i64 %call
   %0 = load i64, i64 addrspace(1)* %arrayidx, align 4
-  %call1 = tail call spir_func i64 @_Z28sub_group_scan_inclusive_addl(i64 %0)
+  %call1 = tail call spir_func i64 @__mux_sub_group_scan_inclusive_add_i64(i64 %0)
   %arrayidx2 = getelementptr inbounds i64, i64 addrspace(1)* %out, i64 %call
   store i64 %call1, i64 addrspace(1)* %arrayidx2, align 4
   ret void
 ; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_add_i64(
-; CHECK-30: call <4 x i64> @__vecz_b_sub_group_scan_inclusive_add_Dv4_m(<4 x i64> %{{.*}})
+; CHECK: call <4 x i64> @__vecz_b_sub_group_scan_inclusive_add_Dv4_m(<4 x i64> %{{.*}})
 }
 
 define spir_kernel void @reduce_scan_incl_add_f32(float addrspace(1)* %in, float addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
-  %call1 = tail call spir_func float @_Z28sub_group_scan_inclusive_addf(float %0)
+  %call1 = tail call spir_func float @__mux_sub_group_scan_inclusive_fadd_f32(float %0)
   %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call
   store float %call1, float addrspace(1)* %arrayidx2, align 4
   ret void
 ; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_add_f32(
-; CHECK-30: call <4 x float> @__vecz_b_sub_group_scan_inclusive_add_Dv4_f(<4 x float> %{{.*}})
+; CHECK: call <4 x float> @__vecz_b_sub_group_scan_inclusive_add_Dv4_f(<4 x float> %{{.*}})
 }
 
 define spir_kernel void @reduce_scan_incl_smin_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
   %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_mini(i32 %0)
+  %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_smin_i32(i32 %0)
   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
   store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
   ret void
 ; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_smin_i32(
-; CHECK-30: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_Dv4_i(<4 x i32> %{{.*}})
+; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_Dv4_i(<4 x i32> %{{.*}})
 }
 
 define spir_kernel void @reduce_scan_incl_umin_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
   %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_minj(i32 %0)
+  %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_umin_i32(i32 %0)
   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
   store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
   ret void
 ; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_umin_i32(
-; CHECK-30: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_umin_Dv4_j(<4 x i32> %{{.*}})
+; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_umin_Dv4_j(<4 x i32> %{{.*}})
 }
 
 define spir_kernel void @reduce_scan_incl_smax_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
   %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_maxi(i32 %0)
+  %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_smax_i32(i32 %0)
   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
   store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
   ret void
 ; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_smax_i32(
-; CHECK-30: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_smax_Dv4_i(<4 x i32> %{{.*}})
+; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_smax_Dv4_i(<4 x i32> %{{.*}})
 }
 
 define spir_kernel void @reduce_scan_incl_umax_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
   %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_maxj(i32 %0)
+  %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_umax_i32(i32 %0)
   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
   store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
   ret void
 ; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_umax_i32(
-; CHECK-30: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_umax_Dv4_j(<4 x i32> %{{.*}})
+; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_umax_Dv4_j(<4 x i32> %{{.*}})
 }
 
 define spir_kernel void @reduce_scan_incl_fmin_f32(float addrspace(1)* %in, float addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
-  %call1 = tail call spir_func float @_Z28sub_group_scan_inclusive_minf(float %0)
+  %call1 = tail call spir_func float @__mux_sub_group_scan_inclusive_fmin_f32(float %0)
   %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call
   store float %call1, float addrspace(1)* %arrayidx2, align 4
   ret void
 ; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_fmin_f32(
-; CHECK-30: call <4 x float> @__vecz_b_sub_group_scan_inclusive_min_Dv4_f(<4 x float> %{{.*}})
+; CHECK: call <4 x float> @__vecz_b_sub_group_scan_inclusive_min_Dv4_f(<4 x float> %{{.*}})
 }
 
 define spir_kernel void @reduce_scan_incl_fmax_f32(float addrspace(1)* %in, float addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
-  %call1 = tail call spir_func float @_Z28sub_group_scan_inclusive_maxf(float %0)
+  %call1 = tail call spir_func float @__mux_sub_group_scan_inclusive_fmax_f32(float %0)
   %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call
   store float %call1, float addrspace(1)* %arrayidx2, align 4
   ret void
 ; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_fmax_f32(
-; CHECK-30: call <4 x float> @__vecz_b_sub_group_scan_inclusive_max_Dv4_f(<4 x float> %{{.*}})
+; CHECK: call <4 x float> @__vecz_b_sub_group_scan_inclusive_max_Dv4_f(<4 x float> %{{.*}})
 }
-
-!opencl.ocl.version = !{!0}
-
-!0 = !{VERSION}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_scans_spv_khr_uniform_group_instructions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_scans_spv_khr_uniform_group_instructions.ll
index 961c98612b291..aa3de91133f61 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_scans_spv_khr_uniform_group_instructions.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_scans_spv_khr_uniform_group_instructions.ll
@@ -19,29 +19,29 @@
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare spir_func i64 @__mux_get_global_id(i32)
 
-declare spir_func i32 @_Z28sub_group_scan_inclusive_muli(i32)
-declare spir_func float @_Z28sub_group_scan_inclusive_mulf(float)
+declare spir_func i32 @__mux_sub_group_scan_inclusive_mul_i32(i32)
+declare spir_func float @__mux_sub_group_scan_inclusive_fmul_f32(float)
 
-declare spir_func i32 @_Z28sub_group_scan_exclusive_muli(i32)
-declare spir_func float @_Z28sub_group_scan_exclusive_mulf(float)
+declare spir_func i32 @__mux_sub_group_scan_exclusive_mul_i32(i32)
+declare spir_func float @__mux_sub_group_scan_exclusive_fmul_f32(float)
 
-declare spir_func i32 @_Z28sub_group_scan_inclusive_andi(i32)
-declare spir_func i32 @_Z27sub_group_scan_inclusive_ori(i32)
-declare spir_func i32 @_Z28sub_group_scan_inclusive_xori(i32)
-declare spir_func i1 @_Z36sub_group_scan_inclusive_logical_andb(i1)
-declare spir_func i1 @_Z35sub_group_scan_inclusive_logical_orb(i1)
-declare spir_func i1 @_Z36sub_group_scan_inclusive_logical_xorb(i1)
+declare spir_func i32 @__mux_sub_group_scan_inclusive_and_i32(i32)
+declare spir_func i32 @__mux_sub_group_scan_inclusive_or_i32(i32)
+declare spir_func i32 @__mux_sub_group_scan_inclusive_xor_i32(i32)
+declare spir_func i1 @__mux_sub_group_scan_inclusive_logical_and_i1(i1)
+declare spir_func i1 @__mux_sub_group_scan_inclusive_logical_or_i1(i1)
+declare spir_func i1 @__mux_sub_group_scan_inclusive_logical_xor_i1(i1)
 
 ; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_mul_i32(
 ; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_mul_Dv4_j(<4 x i32> %{{.*}})
 define spir_kernel void @reduce_scan_incl_mul_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
   %0 = load i32, ptr addrspace(1) %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_muli(i32 %0)
+  %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_mul_i32(i32 %0)
   %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
   store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
   ret void
@@ -51,10 +51,10 @@ entry:
 ; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_exclusive_mul_Dv4_j(<4 x i32> %{{.*}})
 define spir_kernel void @reduce_scan_excl_mul_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
   %0 = load i32, ptr addrspace(1) %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z28sub_group_scan_exclusive_muli(i32 %0)
+  %call1 = tail call spir_func i32 @__mux_sub_group_scan_exclusive_mul_i32(i32 %0)
   %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
   store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
   ret void
@@ -64,10 +64,10 @@ entry:
 ; CHECK: call <4 x float> @__vecz_b_sub_group_scan_inclusive_mul_Dv4_f(<4 x float> %{{.*}})
 define spir_kernel void @reduce_scan_incl_mul_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, ptr addrspace(1) %in, i64 %call
   %0 = load float, ptr addrspace(1) %arrayidx, align 4
-  %call1 = tail call spir_func float @_Z28sub_group_scan_inclusive_mulf(float %0)
+  %call1 = tail call spir_func float @__mux_sub_group_scan_inclusive_fmul_f32(float %0)
   %arrayidx2 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %call
   store float %call1, ptr addrspace(1) %arrayidx2, align 4
   ret void
@@ -77,10 +77,10 @@ entry:
 ; CHECK: call <4 x float> @__vecz_b_sub_group_scan_exclusive_mul_Dv4_f(<4 x float> %{{.*}})
 define spir_kernel void @reduce_scan_excl_mul_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, ptr addrspace(1) %in, i64 %call
   %0 = load float, ptr addrspace(1) %arrayidx, align 4
-  %call1 = tail call spir_func float @_Z28sub_group_scan_exclusive_mulf(float %0)
+  %call1 = tail call spir_func float @__mux_sub_group_scan_exclusive_fmul_f32(float %0)
   %arrayidx2 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %call
   store float %call1, ptr addrspace(1) %arrayidx2, align 4
   ret void
@@ -90,10 +90,10 @@ entry:
 ; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_and_Dv4_j(<4 x i32> %{{.*}})
 define spir_kernel void @reduce_scan_incl_and_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
   %0 = load i32, ptr addrspace(1) %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_andi(i32 %0)
+  %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_and_i32(i32 %0)
   %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
   store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
   ret void
@@ -103,10 +103,10 @@ entry:
 ; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_or_Dv4_j(<4 x i32> %{{.*}})
 define spir_kernel void @reduce_scan_incl_or_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
   %0 = load i32, ptr addrspace(1) %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z27sub_group_scan_inclusive_ori(i32 %0)
+  %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_or_i32(i32 %0)
   %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
   store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
   ret void
@@ -116,10 +116,10 @@ entry:
 ; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_xor_Dv4_j(<4 x i32> %{{.*}})
 define spir_kernel void @reduce_scan_incl_xor_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
   %0 = load i32, ptr addrspace(1) %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z28sub_group_scan_inclusive_xori(i32 %0)
+  %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_xor_i32(i32 %0)
   %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
   store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
   ret void
@@ -129,11 +129,11 @@ entry:
 ; CHECK: call <4 x i1> @__vecz_b_sub_group_scan_inclusive_and_Dv4_b(<4 x i1> %{{.*}})
 define spir_kernel void @reduce_scan_incl_logical_and(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
   %0 = load i32, ptr addrspace(1) %arrayidx, align 4
   %1 = trunc i32 %0 to i1
-  %call1 = tail call spir_func i1 @_Z36sub_group_scan_inclusive_logical_andb(i1 %1)
+  %call1 = tail call spir_func i1 @__mux_sub_group_scan_inclusive_logical_and_i1(i1 %1)
   %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
   %2 = zext i1 %call1 to i32
   store i32 %2, ptr addrspace(1) %arrayidx2, align 4
@@ -144,11 +144,11 @@ entry:
 ; CHECK: call <4 x i1> @__vecz_b_sub_group_scan_inclusive_or_Dv4_b(<4 x i1> %{{.*}})
 define spir_kernel void @reduce_scan_incl_logical_or(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
   %0 = load i32, ptr addrspace(1) %arrayidx, align 4
   %1 = trunc i32 %0 to i1
-  %call1 = tail call spir_func i1 @_Z35sub_group_scan_inclusive_logical_orb(i1 %1)
+  %call1 = tail call spir_func i1 @__mux_sub_group_scan_inclusive_logical_or_i1(i1 %1)
   %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
   %2 = zext i1 %call1 to i32
   store i32 %2, ptr addrspace(1) %arrayidx2, align 4
@@ -159,17 +159,13 @@ entry:
 ; CHECK: call <4 x i1> @__vecz_b_sub_group_scan_inclusive_xor_Dv4_b(<4 x i1> %{{.*}})
 define spir_kernel void @reduce_scan_incl_logical_xor(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
   %0 = load i32, ptr addrspace(1) %arrayidx, align 4
   %1 = trunc i32 %0 to i1
-  %call1 = tail call spir_func i1 @_Z36sub_group_scan_inclusive_logical_xorb(i1 %1)
+  %call1 = tail call spir_func i1 @__mux_sub_group_scan_inclusive_logical_xor_i1(i1 %1)
   %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call
   %2 = zext i1 %call1 to i32
   store i32 %2, ptr addrspace(1) %arrayidx2, align 4
   ret void
 }
-
-!opencl.ocl.version = !{!0}
-
-!0 = !{i32 3, i32 0}

From 335019020417ca92179233340a2bf06de92ff485 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Wed, 9 Aug 2023 16:54:11 +0100
Subject: [PATCH 015/182] [compiler] Provide methods for common group operation
 checks

---
 .../vecz/source/analysis/uniform_value_analysis.cpp           | 3 +--
 .../compiler_passes/vecz/source/transform/packetizer.cpp      | 4 +---
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp
index bd7eb2b452c61..c29137f58ed19 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp
@@ -124,8 +124,7 @@ static bool isSubgroupBroadcastOrReduction(
   }
   auto const Builtin = BI.analyzeBuiltin(*Callee);
   if (auto Info = BI.isMuxGroupCollective(Builtin.ID);
-      Info &&
-      Info->Scope == compiler::utils::GroupCollective::ScopeKind::SubGroup) {
+      Info && Info->isSubGroupScope()) {
     switch (Info->Op) {
       default:
         return false;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
index a8a01788d3069..851600797aa46 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -1469,9 +1469,7 @@ ValuePacket Packetizer::Impl::packetizeCall(CallInst *CI) {
 
   // Handle subgroup scans, which defer to internal builtins.
   if (auto Info = Ctx.builtins().isMuxGroupCollective(Builtin.ID)) {
-    if (Info->Scope == compiler::utils::GroupCollective::ScopeKind::SubGroup &&
-        (Info->Op == compiler::utils::GroupCollective::OpKind::ScanExclusive ||
-         Info->Op == compiler::utils::GroupCollective::OpKind::ScanInclusive)) {
+    if (Info->isSubGroupScope() && Info->isScan()) {
       return packetizeSubgroupScan(CI, *Info);
     }
   }

From 3d0063398c0ae90595495326cc971e91b1941a20 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <frasercrmck@gmail.com>
Date: Mon, 14 Aug 2023 12:05:31 +0100
Subject: [PATCH 016/182] [compiler] Add more helper methods for mux group
 operations

Add isReduction and isBroadcast methods to reflect isAnyAll and isScan.
---
 .../analysis/uniform_value_analysis.cpp       | 17 +++-------------
 .../vecz/source/transform/packetizer.cpp      | 20 ++++---------------
 2 files changed, 7 insertions(+), 30 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp
index c29137f58ed19..10fd49cce220e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp
@@ -122,20 +122,9 @@ static bool isSubgroupBroadcastOrReduction(
   if (!Callee) {
     return false;
   }
-  auto const Builtin = BI.analyzeBuiltin(*Callee);
-  if (auto Info = BI.isMuxGroupCollective(Builtin.ID);
-      Info && Info->isSubGroupScope()) {
-    switch (Info->Op) {
-      default:
-        return false;
-      case compiler::utils::GroupCollective::OpKind::Any:
-      case compiler::utils::GroupCollective::OpKind::All:
-      case compiler::utils::GroupCollective::OpKind::Reduction:
-      case compiler::utils::GroupCollective::OpKind::Broadcast:
-        return true;
-    }
-  }
-  return false;
+  auto Info = BI.isMuxGroupCollective(BI.analyzeBuiltin(*Callee).ID);
+  return Info && Info->isSubGroupScope() &&
+         (Info->isAnyAll() || Info->isReduction() || Info->isBroadcast());
 }
 
 void UniformValueResult::findVectorLeaves(
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
index 851600797aa46..32f7803492522 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -1086,18 +1086,10 @@ Value *Packetizer::Impl::packetizeSubgroupReduction(Instruction *I) {
   auto const Builtin = BI.analyzeBuiltin(*callee);
   auto const Info = BI.isMuxGroupCollective(Builtin.ID);
 
-  if (!Info ||
-      Info->Scope != compiler::utils::GroupCollective::ScopeKind::SubGroup) {
+  if (!Info || !Info->isSubGroupScope() ||
+      (!Info->isAnyAll() && !Info->isReduction())) {
     return nullptr;
   }
-  switch (Info->Op) {
-    default:
-      return nullptr;
-    case compiler::utils::GroupCollective::OpKind::Any:
-    case compiler::utils::GroupCollective::OpKind::All:
-    case compiler::utils::GroupCollective::OpKind::Reduction:
-      break;
-  }
 
   SmallVector<Value *, 16> opPackets;
   IRBuilder<> B(buildAfter(CI, F));
@@ -1171,12 +1163,8 @@ Value *Packetizer::Impl::packetizeSubgroupBroadcast(Instruction *I) {
   Function *callee = CI->getCalledFunction();
   auto const Builtin = BI.analyzeBuiltin(*callee);
 
-  if (auto Info = BI.isMuxGroupCollective(Builtin.ID)) {
-    if (Info->Scope != compiler::utils::GroupCollective::ScopeKind::SubGroup ||
-        Info->Op != compiler::utils::GroupCollective::OpKind::Broadcast) {
-      return nullptr;
-    }
-  } else {
+  if (auto Info = BI.isMuxGroupCollective(Builtin.ID);
+      !Info || !Info->isSubGroupScope() || !Info->isBroadcast()) {
     return nullptr;
   }
 

From 106d3b93b892817b6208ab9b28599873a7274246 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <frasercrmck@gmail.com>
Date: Mon, 14 Aug 2023 17:33:30 +0100
Subject: [PATCH 017/182] [vecz] Migrate vecz lit tests to mux builtins

---
 .../llvm/AArch64/shuffled_load_aarch64_1.ll   |  6 +--
 .../llvm/AArch64/shuffled_load_aarch64_2.ll   |  6 +--
 .../llvm/AArch64/shuffled_load_aarch64_3.ll   |  6 +--
 .../llvm/AArch64/shuffled_load_aarch64_4.ll   |  6 +--
 .../llvm/AArch64/shuffled_load_aarch64_5.ll   |  6 +--
 .../llvm/AArch64/shuffled_load_aarch64_6.ll   |  6 +--
 .../vecz/test/lit/llvm/Boscc/boscc_killer.ll  |  8 ++--
 .../vecz/test/lit/llvm/Boscc/boscc_merge.ll   |  8 ++--
 .../vecz/test/lit/llvm/Boscc/boscc_merge2.ll  | 24 +++++------
 .../vecz/test/lit/llvm/Boscc/boscc_merge3.ll  |  6 +--
 .../lit/llvm/Boscc/duplicate_preheader.ll     |  4 +-
 .../vecz/test/lit/llvm/Boscc/nested_loops1.ll |  8 ++--
 .../vecz/test/lit/llvm/Boscc/nested_loops2.ll |  4 +-
 .../vecz/test/lit/llvm/Boscc/nested_loops3.ll |  4 +-
 .../vecz/test/lit/llvm/Boscc/nested_loops4.ll |  8 ++--
 .../vecz/test/lit/llvm/Boscc/nested_loops5.ll |  8 ++--
 .../lit/llvm/Boscc/partial_linearization0.ll  |  4 +-
 .../lit/llvm/Boscc/partial_linearization1.ll  |  4 +-
 .../lit/llvm/Boscc/partial_linearization10.ll |  4 +-
 .../lit/llvm/Boscc/partial_linearization11.ll |  4 +-
 .../lit/llvm/Boscc/partial_linearization12.ll |  4 +-
 .../lit/llvm/Boscc/partial_linearization13.ll |  8 ++--
 .../lit/llvm/Boscc/partial_linearization14.ll |  4 +-
 .../lit/llvm/Boscc/partial_linearization15.ll |  4 +-
 .../lit/llvm/Boscc/partial_linearization16.ll |  4 +-
 .../lit/llvm/Boscc/partial_linearization17.ll |  4 +-
 .../lit/llvm/Boscc/partial_linearization18.ll |  4 +-
 .../lit/llvm/Boscc/partial_linearization19.ll |  4 +-
 .../lit/llvm/Boscc/partial_linearization2.ll  |  4 +-
 .../lit/llvm/Boscc/partial_linearization20.ll |  4 +-
 .../lit/llvm/Boscc/partial_linearization21.ll |  4 +-
 .../lit/llvm/Boscc/partial_linearization22.ll |  4 +-
 .../lit/llvm/Boscc/partial_linearization3.ll  |  4 +-
 .../lit/llvm/Boscc/partial_linearization4.ll  |  4 +-
 .../lit/llvm/Boscc/partial_linearization5.ll  |  4 +-
 .../lit/llvm/Boscc/partial_linearization6.ll  |  4 +-
 .../lit/llvm/Boscc/partial_linearization7.ll  |  4 +-
 .../lit/llvm/Boscc/partial_linearization8.ll  |  4 +-
 .../lit/llvm/Boscc/partial_linearization9.ll  |  4 +-
 .../vecz/test/lit/llvm/Boscc/printf.ll        | 20 ++++-----
 .../lit/llvm/OpaquePointers/basic_mem2reg.ll  |  6 +--
 .../llvm/OpaquePointers/basic_vecz_mem2reg.ll |  8 ++--
 .../OpaquePointers/builtin_pointer_return.ll  |  6 +--
 .../control_flow_conversion_ptrs.ll           |  4 +-
 .../OpaquePointers/interleaved_load_ooo.ll    |  6 +--
 .../lit/llvm/OpaquePointers/load_add_store.ll |  6 +--
 .../lit/llvm/OpaquePointers/masked_store.ll   |  6 +--
 .../lit/llvm/OpaquePointers/remove_intptr.ll  |  6 +--
 .../llvm/OpaquePointers/ternary_transform.ll  | 18 ++++----
 .../define_interleaved_store.ll               |  8 ++--
 .../define_interleaved_store_as_masked.ll     |  8 ++--
 .../vector_phi_uniform.ll                     |  8 ++--
 .../vector_phi_varying.ll                     |  8 ++--
 .../test/lit/llvm/RISCV/broadcast_vector.ll   | 24 +++++------
 .../test/lit/llvm/RISCV/extract_element.ll    | 14 +++----
 .../test/lit/llvm/RISCV/insert_element.ll     | 12 +++---
 .../test/lit/llvm/RISCV/packetize_shuffle.ll  |  4 +-
 .../lit/llvm/RISCV/packetize_shuffle_bool.ll  |  4 +-
 .../llvm/RISCV/packetize_shuffle_concat.ll    |  4 +-
 .../llvm/RISCV/packetize_shuffle_narrow.ll    |  4 +-
 .../lit/llvm/RISCV/packetize_shuffle_wider.ll |  4 +-
 .../lit/llvm/RISCV/select_scalar_vector.ll    |  4 +-
 .../vecz/test/lit/llvm/RISCV/vp_memops.ll     |  6 +--
 .../vecz/test/lit/llvm/RISCV/vp_vsetvli.ll    |  4 +-
 .../llvm/ScalableVectors/broadcast_vector.ll  | 20 ++++-----
 .../test/lit/llvm/ScalableVectors/builtins.ll |  4 +-
 .../test/lit/llvm/ScalableVectors/cast.ll     |  4 +-
 .../define_interleaved_store.ll               |  8 ++--
 .../define_interleaved_store_as_masked.ll     |  8 ++--
 .../ScalableVectors/define_masked_load.ll     | 18 ++++----
 .../define_masked_scatter_gather.ll           |  6 +--
 .../llvm/ScalableVectors/extract_element.ll   | 12 +++---
 .../test/lit/llvm/ScalableVectors/fadd.ll     |  4 +-
 .../lit/llvm/ScalableVectors/fail_builtins.ll |  4 +-
 .../llvm/ScalableVectors/insert_element.ll    | 10 ++---
 .../llvm/ScalableVectors/interleaved_load.ll  |  4 +-
 .../lit/llvm/ScalableVectors/intrinsics.ll    | 16 +++----
 .../llvm/ScalableVectors/load_add_store.ll    |  4 +-
 .../llvm/ScalableVectors/load_binops_store.ll |  4 +-
 .../test/lit/llvm/ScalableVectors/metadata.ll |  4 +-
 .../ScalableVectors/packetize_mask_varying.ll |  4 +-
 .../lit/llvm/ScalableVectors/scalable_auto.ll |  4 +-
 .../test/lit/llvm/ScalableVectors/select.ll   |  6 +--
 .../ScalableVectors/select_scalar_vector.ll   |  4 +-
 .../test/lit/llvm/ScalableVectors/shuffle.ll  |  6 +--
 .../test/lit/llvm/ScalableVectors/vectors.ll  |  4 +-
 .../ScalableVectors/verification_fail_phi.ll  |  4 +-
 .../lit/llvm/ScalableVectors/widen_vload.ll   |  4 +-
 .../llvm/ScalableVectors/workitem_funcs.ll    |  4 +-
 .../llvm/VectorPredication/boscc_reduction.ll |  4 +-
 .../test/lit/llvm/VectorPredication/choice.ll |  4 +-
 .../define_interleaved_load_store.ll          |  6 +--
 .../define_masked_load_store.ll               | 16 +++----
 .../define_masked_scatter_gather.ll           |  6 +--
 .../llvm/VectorPredication/load_add_store.ll  |  6 +--
 .../packetize_mask_varying.ll                 |  4 +-
 .../llvm/VectorPredication/scatter_gather.ll  |  6 +--
 .../test/lit/llvm/VectorPredication/udiv.ll   |  4 +-
 .../VectorWidening/define_interleaved_load.ll |  8 ++--
 .../define_interleaved_load_as_masked.ll      |  8 ++--
 .../VectorWidening/delete_packetized_memop.ll |  4 +-
 .../extractelement_constant_index.ll          |  4 +-
 .../extractelement_runtime_index.ll           |  4 +-
 .../extractelement_runtime_index2.ll          |  4 +-
 .../extractelement_runtime_index3.ll          |  4 +-
 .../insertelement_constant_index.ll           |  4 +-
 ...rtelement_constant_index_constant_value.ll |  4 +-
 .../insertelement_runtime_index.ll            |  4 +-
 .../llvm/VectorWidening/interleaved_safety.ll | 12 +++---
 .../onearg_relationals_isfiniteDv4_d.ll       |  6 +--
 .../onearg_relationals_isfiniteDv4_f.ll       |  4 +-
 .../onearg_relationals_isinfDv4_d.ll          |  4 +-
 .../onearg_relationals_isinfDv4_f.ll          |  4 +-
 .../onearg_relationals_isnanDv4_d.ll          |  4 +-
 .../onearg_relationals_isnanDv4_f.ll          |  4 +-
 .../onearg_relationals_isnormalDv4_d.ll       |  4 +-
 .../onearg_relationals_isnormalDv4_f.ll       |  4 +-
 .../llvm/VectorWidening/scalar_vector_user.ll |  4 +-
 .../lit/llvm/VectorWidening/vector_copy.ll    |  4 +-
 .../llvm/VectorWidening/vector_phi_varying.ll |  8 ++--
 .../test/lit/llvm/VectorWidening/widen_abs.ll | 10 ++---
 .../lit/llvm/VectorWidening/widen_binops.ll   |  4 +-
 .../lit/llvm/VectorWidening/widen_copysign.ll | 10 ++---
 .../test/lit/llvm/VectorWidening/widen_fma.ll |  4 +-
 .../widen_fmin_vector_scalar.ll               |  4 +-
 .../lit/llvm/VectorWidening/widen_fmuladd.ll  |  4 +-
 .../lit/llvm/VectorWidening/widen_fmuladd2.ll |  4 +-
 .../llvm/VectorWidening/widen_fmuladd_phi.ll  |  4 +-
 .../lit/llvm/VectorWidening/widen_fshl.ll     |  4 +-
 .../lit/llvm/VectorWidening/widen_fshr.ll     |  4 +-
 .../VectorWidening/widen_shufflevector.ll     |  4 +-
 .../lit/llvm/VectorWidening/widen_sqrt.ll     |  4 +-
 .../vecz/test/lit/llvm/alloca_alias.ll        |  4 +-
 .../vecz/test/lit/llvm/arm_neon_store.ll      |  4 +-
 .../lit/llvm/async_workgroup_copy_uniform.ll  | 12 +++---
 .../vecz/test/lit/llvm/atomic_cmpxchg.ll      |  8 ++--
 .../vecz/test/lit/llvm/atomicrmw.ll           |  8 ++--
 .../vecz/test/lit/llvm/atomicrmw_uniform.ll   |  8 ++--
 .../vecz/test/lit/llvm/basic_mem2reg.ll       |  6 +--
 .../vecz/test/lit/llvm/bitcast_function.ll    |  4 +-
 .../test/lit/llvm/branch_splitting_and.ll     |  6 +--
 .../vecz/test/lit/llvm/branch_splitting_or.ll |  6 +--
 .../test/lit/llvm/builtin_inlining_addsat.ll  | 14 +++----
 .../test/lit/llvm/builtin_inlining_fmax.ll    |  2 +-
 .../test/lit/llvm/builtin_inlining_fmin.ll    |  2 +-
 .../lit/llvm/builtin_inlining_negative.ll     |  8 ++--
 .../lit/llvm/builtin_inlining_positive.ll     |  6 +--
 .../test/lit/llvm/builtin_pointer_return.ll   |  6 +--
 ...all_instantiation_failure_cantduplicate.ll | 16 +++----
 .../call_instantiation_failure_cantinline.ll  | 16 +++----
 .../call_instantiation_failure_optnone.ll     | 16 +++----
 ...ll_instantiation_failure_user_undefined.ll | 16 +++----
 .../call_instantiation_success_builtin.ll     | 16 +++----
 .../call_instantiation_success_instrinsic.ll  | 16 +++----
 ...call_instantiation_success_user_defined.ll | 16 +++----
 .../vecz/test/lit/llvm/constant_address.ll    |  6 +--
 .../vecz/test/lit/llvm/contiguous_allocas.ll  |  8 ++--
 .../control_flow_conversion_nested_loops.ll   | 10 ++---
 .../llvm/control_flow_conversion_order_y.ll   | 10 ++---
 .../llvm/control_flow_conversion_order_z.ll   | 10 ++---
 .../lit/llvm/control_flow_conversion_ptrs.ll  |  6 +--
 .../control_flow_conversion_uniform_if.ll     | 10 ++---
 .../control_flow_conversion_uniform_loop.ll   | 10 ++---
 .../control_flow_conversion_varying_if.ll     | 10 ++---
 .../control_flow_conversion_varying_loop.ll   | 10 ++---
 .../vecz/test/lit/llvm/convert3.ll            |  4 +-
 .../vecz/test/lit/llvm/convert4.ll            |  4 +-
 .../vecz/test/lit/llvm/convert_contiguity.ll  |  4 +-
 .../vecz/test/lit/llvm/define_gather_load.ll  |  4 +-
 .../lit/llvm/define_gather_load_as_masked.ll  |  4 +-
 .../test/lit/llvm/define_interleaved_load.ll  |  8 ++--
 .../llvm/define_interleaved_load_as_masked.ll |  8 ++--
 .../test/lit/llvm/define_interleaved_store.ll |  8 ++--
 .../define_interleaved_store_as_masked.ll     |  8 ++--
 .../lit/llvm/define_masked_gather_load.ll     |  6 +--
 .../vecz/test/lit/llvm/define_masked_load.ll  | 16 +++----
 .../lit/llvm/define_masked_scatter_store.ll   |  6 +--
 .../vecz/test/lit/llvm/define_masked_store.ll | 16 +++----
 .../test/lit/llvm/define_scatter_store.ll     |  4 +-
 .../llvm/define_scatter_store_as_masked.ll    |  4 +-
 .../test/lit/llvm/delete_packetized_memop.ll  |  4 +-
 .../vecz/test/lit/llvm/early-cse-mul-swap.ll  | 18 ++++----
 .../vecz/test/lit/llvm/emit_memintrinsics.ll  |  4 +-
 .../llvm/emit_no_unaligned_memintrinsics.ll   |  4 +-
 .../vecz/test/lit/llvm/expect_assume.ll       |  6 +--
 .../lit/llvm/extractelement_constant_index.ll |  4 +-
 .../lit/llvm/extractelement_runtime_index.ll  |  4 +-
 .../vecz/test/lit/llvm/gep_duplication.ll     |  4 +-
 .../vecz/test/lit/llvm/gep_elim_opaque.ll     |  8 ++--
 .../lit/llvm/inlined_function_debug_info.ll   |  4 +-
 .../lit/llvm/insert_element_debug_info.ll     |  4 +-
 .../lit/llvm/insertelement_constant_index.ll  |  4 +-
 .../lit/llvm/insertelement_runtime_index.ll   |  4 +-
 .../test/lit/llvm/instantiate_constants.ll    |  4 +-
 .../llvm/interleaved_defuse_instantiated.ll   |  6 +--
 .../vecz/test/lit/llvm/interleaved_load16.ll  |  6 +--
 .../test/lit/llvm/interleaved_load_ooo.ll     |  6 +--
 .../vecz/test/lit/llvm/interleaved_safety.ll  | 12 +++---
 .../test/lit/llvm/intrinsics-scalarize.ll     | 16 +++----
 .../vecz/test/lit/llvm/intrinsics.ll          | 16 +++----
 .../vecz/test/lit/llvm/irreducible_loop.ll    |  4 +-
 .../test/lit/llvm/loop_call_instantiation.ll  |  4 +-
 .../test/lit/llvm/masked_calls_max_builtin.ll |  4 +-
 .../vecz/test/lit/llvm/masked_interleaved.ll  |  4 +-
 .../lit/llvm/masked_interleaved_as_scatter.ll |  4 +-
 .../test/lit/llvm/masked_interleaved_group.ll |  4 +-
 .../lit/llvm/masked_interleaved_group2.ll     |  4 +-
 .../vecz/test/lit/llvm/masking_exit_blocks.ll |  4 +-
 .../vecz/test/lit/llvm/memop_stride.ll        |  4 +-
 .../vecz/test/lit/llvm/memop_stride10.ll      |  4 +-
 .../vecz/test/lit/llvm/memop_stride11.ll      |  4 +-
 .../vecz/test/lit/llvm/memop_stride12.ll      |  4 +-
 .../vecz/test/lit/llvm/memop_stride13.ll      |  4 +-
 .../vecz/test/lit/llvm/memop_stride14.ll      |  4 +-
 .../vecz/test/lit/llvm/memop_stride15.ll      |  4 +-
 .../vecz/test/lit/llvm/memop_stride16.ll      |  4 +-
 .../vecz/test/lit/llvm/memop_stride17.ll      |  6 +--
 .../vecz/test/lit/llvm/memop_stride18.ll      |  4 +-
 .../vecz/test/lit/llvm/memop_stride2.ll       |  4 +-
 .../vecz/test/lit/llvm/memop_stride3.ll       |  4 +-
 .../vecz/test/lit/llvm/memop_stride4.ll       |  4 +-
 .../vecz/test/lit/llvm/memop_stride5.ll       |  4 +-
 .../vecz/test/lit/llvm/memop_stride6.ll       |  4 +-
 .../vecz/test/lit/llvm/memop_stride7.ll       |  4 +-
 .../vecz/test/lit/llvm/memop_stride8.ll       |  4 +-
 .../vecz/test/lit/llvm/memop_stride9.ll       |  4 +-
 .../test/lit/llvm/multiple_exit_blocks.ll     |  8 ++--
 .../lit/llvm/multiple_kernels_inlining.ll     |  6 +--
 .../test/lit/llvm/multiple_vectorizations.ll  |  4 +-
 .../llvm/multiple_vectorizations_nested.ll    |  4 +-
 .../lit/llvm/multiple_vectorizations_vp.ll    |  4 +-
 .../test/lit/llvm/no_instantiate_memop.ll     |  4 +-
 .../test/lit/llvm/no_over_scalarization.ll    |  4 +-
 .../test/lit/llvm/no_redundant_bitcasts.ll    |  4 +-
 .../vecz/test/lit/llvm/no_vecz1.ll            |  4 +-
 .../vecz/test/lit/llvm/no_vecz2.ll            |  4 +-
 .../test/lit/llvm/offset_info_analysis.ll     |  6 +--
 .../llvm/onearg_relationals_isfiniteDv4_d.ll  |  6 +--
 .../llvm/onearg_relationals_isfiniteDv4_f.ll  |  4 +-
 .../lit/llvm/onearg_relationals_isfinited.ll  | 42 +++++++++----------
 .../lit/llvm/onearg_relationals_isfinitef.ll  | 42 +++++++++----------
 .../lit/llvm/onearg_relationals_isinfDv4_d.ll |  4 +-
 .../lit/llvm/onearg_relationals_isinfDv4_f.ll |  4 +-
 .../lit/llvm/onearg_relationals_isinfd.ll     | 42 +++++++++----------
 .../lit/llvm/onearg_relationals_isinff.ll     | 42 +++++++++----------
 .../lit/llvm/onearg_relationals_isnanDv4_d.ll |  4 +-
 .../lit/llvm/onearg_relationals_isnanDv4_f.ll |  4 +-
 .../lit/llvm/onearg_relationals_isnand.ll     | 42 +++++++++----------
 .../lit/llvm/onearg_relationals_isnanf.ll     | 42 +++++++++----------
 .../llvm/onearg_relationals_isnormalDv4_d.ll  |  4 +-
 .../llvm/onearg_relationals_isnormalDv4_f.ll  |  4 +-
 .../lit/llvm/onearg_relationals_isnormald.ll  | 42 +++++++++----------
 .../lit/llvm/onearg_relationals_isnormalf.ll  | 42 +++++++++----------
 .../vecz/test/lit/llvm/opencl_metadata1.ll    |  6 +--
 .../vecz/test/lit/llvm/opencl_metadata2.ll    |  6 +--
 .../vecz/test/lit/llvm/overaligned_allocas.ll |  8 ++--
 .../test/lit/llvm/packetization_branch.ll     |  6 +--
 .../test/lit/llvm/packetization_debug_info.ll |  4 +-
 .../test/lit/llvm/packetization_nonvarying.ll | 10 ++---
 .../lit/llvm/packetization_uniform_branch.ll  | 10 ++---
 .../test/lit/llvm/packetize_struct_gep.ll     |  4 +-
 .../lit/llvm/packetize_uniform_conditional.ll | 20 ++++-----
 .../packetize_uniform_default_conditional.ll  | 20 ++++-----
 .../packetize_uniform_default_noreduce.ll     | 20 ++++-----
 .../packetize_uniform_default_noreduce2.ll    | 10 ++---
 .../llvm/packetize_uniform_default_reduce.ll  | 22 +++++-----
 .../packetize_uniform_loops_conditional.ll    | 20 ++++-----
 .../llvm/packetize_uniform_loops_noreduce.ll  | 20 ++++-----
 .../llvm/packetize_uniform_loops_noreduce2.ll | 10 ++---
 .../llvm/packetize_uniform_loops_reduce.ll    | 10 ++---
 .../lit/llvm/packetize_uniform_noreduce.ll    | 20 ++++-----
 .../lit/llvm/packetize_uniform_noreduce2.ll   | 10 ++---
 .../test/lit/llvm/packetize_uniform_reduce.ll | 10 ++---
 .../test/lit/llvm/partial_linearization0.ll   |  4 +-
 .../test/lit/llvm/partial_linearization1.ll   |  4 +-
 .../test/lit/llvm/partial_linearization10.ll  |  4 +-
 .../test/lit/llvm/partial_linearization11.ll  |  4 +-
 .../test/lit/llvm/partial_linearization12.ll  |  4 +-
 .../test/lit/llvm/partial_linearization13.ll  |  8 ++--
 .../test/lit/llvm/partial_linearization14.ll  |  4 +-
 .../test/lit/llvm/partial_linearization15.ll  |  4 +-
 .../test/lit/llvm/partial_linearization16.ll  |  4 +-
 .../test/lit/llvm/partial_linearization17.ll  |  4 +-
 .../test/lit/llvm/partial_linearization18.ll  |  4 +-
 .../test/lit/llvm/partial_linearization19.ll  |  4 +-
 .../test/lit/llvm/partial_linearization2.ll   |  4 +-
 .../test/lit/llvm/partial_linearization20.ll  |  4 +-
 .../test/lit/llvm/partial_linearization21.ll  |  4 +-
 .../test/lit/llvm/partial_linearization22.ll  |  4 +-
 .../test/lit/llvm/partial_linearization23.ll  |  4 +-
 .../test/lit/llvm/partial_linearization3.ll   |  4 +-
 .../test/lit/llvm/partial_linearization4.ll   |  4 +-
 .../test/lit/llvm/partial_linearization5.ll   |  4 +-
 .../test/lit/llvm/partial_linearization6.ll   |  4 +-
 .../test/lit/llvm/partial_linearization7.ll   |  4 +-
 .../test/lit/llvm/partial_linearization8.ll   |  4 +-
 .../test/lit/llvm/partial_linearization9.ll   |  4 +-
 .../llvm/partial_linearization_exit_masks.ll  |  4 +-
 .../vecz/test/lit/llvm/pass_pipeline.ll       |  4 +-
 .../test/lit/llvm/pass_pipeline_printafter.ll |  8 ++--
 .../vecz/test/lit/llvm/phi_interleaved.ll     |  4 +-
 .../vecz/test/lit/llvm/phi_node_debug_info.ll |  4 +-
 .../vecz/test/lit/llvm/phi_scatter_gather.ll  |  4 +-
 .../test/lit/llvm/phi_scatter_gather_2.ll     |  4 +-
 .../test/lit/llvm/predicate_with_switch.ll    |  8 ++--
 .../vecz/test/lit/llvm/preserve-fast-math.ll  |  4 +-
 .../vecz/test/lit/llvm/printf_float.ll        |  6 +--
 .../vecz/test/lit/llvm/regression_by_all.ll   |  4 +-
 .../vecz/test/lit/llvm/remove_intptr.ll       |  6 +--
 .../vecz/test/lit/llvm/remove_intptr_2.ll     |  4 +-
 .../vecz/test/lit/llvm/remove_intptr_phi.ll   |  4 +-
 .../vecz/test/lit/llvm/roscc_simplify.ll      |  4 +-
 .../vecz/test/lit/llvm/scalar_vector_user.ll  |  4 +-
 .../vecz/test/lit/llvm/scalarization_calls.ll |  4 +-
 .../test/lit/llvm/scalarization_debug_info.ll |  4 +-
 .../lit/llvm/scalarization_instructions.ll    |  4 +-
 .../llvm/scalarization_masked_load_store.ll   |  6 +--
 .../vecz/test/lit/llvm/scalarize-gather.ll    |  4 +-
 .../vecz/test/lit/llvm/scalarize-splat.ll     |  4 +-
 .../vecz/test/lit/llvm/scalarize_mixed_gep.ll |  4 +-
 .../vecz/test/lit/llvm/scan_fact.ll           | 12 +++---
 .../vecz/test/lit/llvm/select-no-crash.ll     |  4 +-
 .../vecz/test/lit/llvm/shuffled_load_1.ll     |  6 +--
 .../vecz/test/lit/llvm/shuffled_load_2.ll     |  6 +--
 .../vecz/test/lit/llvm/shuffled_load_3.ll     |  6 +--
 .../vecz/test/lit/llvm/shuffled_load_4.ll     |  6 +--
 .../vecz/test/lit/llvm/shuffled_load_5.ll     |  6 +--
 .../vecz/test/lit/llvm/shuffled_load_6.ll     |  6 +--
 .../vecz/test/lit/llvm/squash_extract_sext.ll |  4 +-
 .../lit/llvm/squash_extract_sext_bigendian.ll |  4 +-
 .../vecz/test/lit/llvm/squash_extract_zext.ll |  4 +-
 .../lit/llvm/squash_extract_zext_bigendian.ll |  4 +-
 .../test/lit/llvm/squash_float2_gather.ll     |  6 +--
 .../vecz/test/lit/llvm/stride_aligned.ll      | 22 +++++-----
 .../lit/llvm/stride_aligned_scalarized.ll     | 22 +++++-----
 .../vecz/test/lit/llvm/stride_misaligned.ll   | 22 +++++-----
 .../lit/llvm/stride_misaligned_scalarized.ll  | 22 +++++-----
 .../vecz/test/lit/llvm/struct_phi.ll          |  4 +-
 .../vecz/test/lit/llvm/struct_select.ll       |  4 +-
 .../vecz/test/lit/llvm/subgroup_broadcast.ll  | 12 +++---
 .../ternary_transform_different_strides.ll    |  6 +--
 .../llvm/ternary_transform_divergent_gep.ll   |  4 +-
 .../ternary_transform_divergent_source.ll     |  4 +-
 .../lit/llvm/ternary_transform_negative.ll    |  6 +--
 .../lit/llvm/ternary_transform_positive.ll    |  6 +--
 ...ary_transform_uniform_cond_diff_strides.ll |  6 +--
 .../ternary_transform_uniform_condition.ll    |  4 +-
 ..._transform_uniform_condition_packetized.ll |  4 +-
 .../llvm/ternary_transform_uniform_source.ll  |  6 +--
 .../llvm/ternary_transform_uniform_sources.ll |  6 +--
 .../llvm/too_large_simdwidth_packetization.ll |  4 +-
 .../llvm/too_large_simdwidth_scalarization.ll |  4 +-
 .../vecz/test/lit/llvm/undef_debug_info.ll    |  4 +-
 .../vecz/test/lit/llvm/undef_ub.ll            | 11 ++---
 .../test/lit/llvm/uniform_address_base.ll     |  6 +--
 .../test/lit/llvm/uniform_address_index.ll    |  6 +--
 .../lit/llvm/uniform_loop_contiguous_phi1.ll  |  4 +-
 .../lit/llvm/uniform_loop_contiguous_phi2.ll  |  4 +-
 .../lit/llvm/uniform_loop_contiguous_phi3.ll  |  4 +-
 .../lit/llvm/uniform_loop_contiguous_phi4.ll  |  4 +-
 .../test/lit/llvm/uniform_reassociation1.ll   |  6 +--
 .../test/lit/llvm/uniform_reassociation2.ll   |  6 +--
 .../test/lit/llvm/uniform_reassociation3.ll   |  6 +--
 .../vecz/test/lit/llvm/user_calls.ll          | 14 +++----
 .../vecz/test/lit/llvm/varying_load1.ll       |  4 +-
 .../vecz/test/lit/llvm/varying_load2.ll       | 12 +++---
 .../llvm/vector_intrinsics_scalarization.ll   |  6 +--
 .../vecz/test/lit/llvm/vector_phi_uniform.ll  |  8 ++--
 .../vecz/test/lit/llvm/vector_phi_varying.ll  |  8 ++--
 .../vecz/test/lit/llvm/vector_printf.ll       | 10 ++---
 .../vecz/test/lit/llvm/vector_printf32.ll     | 10 ++---
 .../vecz/test/lit/llvm/vector_printf64.ll     | 10 ++---
 .../test/lit/llvm/vector_printf_floats.ll     | 10 ++---
 .../vector_printf_floats_no_double_support.ll | 10 ++---
 .../vecz/test/lit/llvm/vecz_blend_div_loop.ll |  6 +--
 .../test/lit/llvm/vecz_scalar_gather_load.ll  |  8 ++--
 .../lit/llvm/vecz_scalar_interleaved_load.ll  |  6 +--
 .../vecz/test/lit/llvm/workitem_builtins.ll   | 37 ++++++++--------
 378 files changed, 1399 insertions(+), 1403 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_1.ll
index 84f54c35b55f6..16a72cc565aff 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_1.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_1.ll
@@ -21,9 +21,9 @@ target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @load16(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %stride) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %conv = trunc i64 %call to i32
-  %call1 = tail call spir_func i64 @_Z13get_global_idj(i32 1)
+  %call1 = tail call i64 @__mux_get_global_id(i32 1)
   %conv2 = trunc i64 %call1 to i32
   %mul = mul nsw i32 %conv2, %stride
   %add = add nsw i32 %mul, %conv
@@ -40,7 +40,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; CHECK: define {{(dso_local )?}}spir_kernel void @load16
 ; CHECK: [[LOAD:%.+]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p1
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_2.ll
index 9e400effc5b9d..fb45e248d8a4f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_2.ll
@@ -21,9 +21,9 @@ target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @load16(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %stride) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %conv = trunc i64 %call to i32
-  %call1 = tail call spir_func i64 @_Z13get_global_idj(i32 1)
+  %call1 = tail call i64 @__mux_get_global_id(i32 1)
   %conv2 = trunc i64 %call1 to i32
   %mul = mul nsw i32 %conv2, %stride
   %add = add nsw i32 %mul, %conv
@@ -41,7 +41,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; CHECK: define {{(dso_local )?}}spir_kernel void @load16
 ; CHECK: [[LOAD:%.+]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p1
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_3.ll
index fe94dc4b90812..1b402d7e66254 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_3.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_3.ll
@@ -21,9 +21,9 @@ target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @load16(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %stride) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %conv = trunc i64 %call to i32
-  %call1 = tail call spir_func i64 @_Z13get_global_idj(i32 1)
+  %call1 = tail call i64 @__mux_get_global_id(i32 1)
   %conv2 = trunc i64 %call1 to i32
   %mul = mul nsw i32 %conv2, %stride
   %add = add nsw i32 %mul, %conv
@@ -42,7 +42,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; CHECK: define {{(dso_local )?}}spir_kernel void @load16
 ; CHECK: [[LOAD:%.+]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p1
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_4.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_4.ll
index c27a536614a30..88abe95158263 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_4.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_4.ll
@@ -21,9 +21,9 @@ target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @load16(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %stride) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %conv = trunc i64 %call to i32
-  %call1 = tail call spir_func i64 @_Z13get_global_idj(i32 1)
+  %call1 = tail call i64 @__mux_get_global_id(i32 1)
   %conv2 = trunc i64 %call1 to i32
   %mul = mul nsw i32 %conv2, %stride
   %add = add nsw i32 %mul, %conv
@@ -42,7 +42,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; CHECK: define {{(dso_local )?}}spir_kernel void @load16
 ; CHECK: [[LOAD:%.+]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p1
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_5.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_5.ll
index 6f866824b68cf..2750a23db71ff 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_5.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_5.ll
@@ -21,9 +21,9 @@ target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @load16(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %stride) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %conv = trunc i64 %call to i32
-  %call1 = tail call spir_func i64 @_Z13get_global_idj(i32 1)
+  %call1 = tail call i64 @__mux_get_global_id(i32 1)
   %conv2 = trunc i64 %call1 to i32
   %mul = mul nsw i32 %conv2, %stride
   %add = add nsw i32 %mul, %conv
@@ -52,7 +52,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; CHECK: define {{(dso_local )?}}spir_kernel void @load16
 ; CHECK: [[LOAD1:%.+]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p1
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_6.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_6.ll
index a287d5c3d4f5c..c09fcd7f7dbfb 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_6.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_6.ll
@@ -21,9 +21,9 @@ target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @load16(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %stride) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %conv = trunc i64 %call to i32
-  %call1 = tail call spir_func i64 @_Z13get_global_idj(i32 1)
+  %call1 = tail call i64 @__mux_get_global_id(i32 1)
   %conv2 = trunc i64 %call1 to i32
   %mul = mul nsw i32 %conv2, %stride
   %add = add nsw i32 %mul, %conv
@@ -44,7 +44,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; CHECK: define {{(dso_local )?}}spir_kernel void @load16
 ; CHECK: [[LOAD:%.+]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p1
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_killer.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_killer.ll
index 219d7f25dde1b..51191f7a20b95 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_killer.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_killer.ll
@@ -21,15 +21,15 @@ source_filename = "Unknown buffer"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z12get_local_idj(i32)
-declare spir_func i64 @_Z14get_local_sizej(i32)
+declare i64 @__mux_get_local_id(i32)
+declare i64 @__mux_get_local_size(i32)
 
 @boscc_killer.shared = internal unnamed_addr addrspace(3) global i32 undef, align 4
 
 ; Function Attrs: convergent nounwind
 define spir_kernel void @boscc_killer(float addrspace(1)* %A, float addrspace(1)* %B, i32 %N, i32 %lda) {
 entry:
-  %gid0 = tail call spir_func i64 @_Z12get_local_idj(i32 0)
+  %gid0 = tail call i64 @__mux_get_local_id(i32 0)
   %cmp0 = icmp eq i64 %gid0, 0
   br i1 %cmp0, label %if.then, label %if.end
 
@@ -78,7 +78,7 @@ if.then3:                             ; preds = %for.cond.exit, %if.then53
   %v23 = load float, float addrspace(1)* %arrayidxB, align 16
   %arrayidxA = getelementptr inbounds float, float addrspace(1)* %A, i64 %gid0
   store float %v23, float addrspace(1)* %arrayidxA, align 16
-  %call149 = tail call spir_func i64 @_Z14get_local_sizej(i32 0) #6
+  %call149 = tail call i64 @__mux_get_local_size(i32 0) #6
   %conv152 = add i64 %call149, %gid0
   %cmp71 = icmp slt i64 %conv152, 0
   br label %exit
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge.ll
index 50ca9820ff272..e8e6062d818f1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge.ll
@@ -21,13 +21,13 @@ source_filename = "Unknown buffer"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z12get_local_idj(i32) #0
-declare spir_func i64 @_Z14get_local_sizej(i32) #0
+declare i64 @__mux_get_local_id(i32) #0
+declare i64 @__mux_get_local_size(i32) #0
 
 define spir_kernel void @boscc_merge(i32 %n, float addrspace(1)* %out, i64 %x) {
 entry:
-  %lid = tail call spir_func i64 @_Z12get_local_idj(i32 0)
-  %lsize = tail call spir_func i64 @_Z14get_local_sizej(i32 0)
+  %lid = tail call i64 @__mux_get_local_id(i32 0)
+  %lsize = tail call i64 @__mux_get_local_size(i32 0)
   %out_ptr = getelementptr inbounds float, float addrspace(1)* %out, i64 %x
   %lid_sum_lsize = add i64 %lid, %lsize
   %cmp1 = icmp ult i64 %lsize, %x
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge2.ll
index 9b40d771ffd59..88e6392293a9c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge2.ll
@@ -24,8 +24,8 @@ target triple = "spir64-unknown-unknown"
 declare float @llvm.fmuladd.f32(float, float, float) #2
 declare void @__mux_work_group_barrier(i32, i32, i32) #3
 declare spir_func float @_Z3maxff(float, float) #1
-declare spir_func i64 @_Z12get_local_idj(i32) #1
-declare spir_func i64 @_Z12get_group_idj(i32) #1
+declare i64 @__mux_get_local_id(i32) #1
+declare i64 @__mux_get_group_id(i32) #1
 
 @fuse_conv2d_broadcast_add_relu_1_kernel0.pad_temp_shared = internal addrspace(3) global [640 x float] undef, align 4
 @fuse_conv2d_broadcast_add_relu_1_kernel0.input1_shared = internal addrspace(3) global [1152 x float] undef, align 4
@@ -42,38 +42,38 @@ for.cond:                                         ; preds = %for.inc, %entry
   br i1 %cmp1, label %if.then, label %if.else
 
 if.then:                                      ; preds = %for.cond
-  %call1 = call spir_func i64 @_Z12get_local_idj(i32 0) #5
-  %call2 = call spir_func i64 @_Z12get_group_idj(i32 1) #5
+  %call1 = call i64 @__mux_get_local_id(i32 0) #5
+  %call2 = call i64 @__mux_get_group_id(i32 1) #5
   %idx1 = getelementptr inbounds [640 x float], [640 x float] addrspace(3)* @fuse_conv2d_broadcast_add_relu_1_kernel0.pad_temp_shared, i64 0, i64 %call1
   store float 0.000000e+00, float addrspace(3)* %idx1, align 4
   %cmp2 = icmp sgt i64 %call2, %call1
   br i1 %cmp2, label %if.then2, label %land.lhs.true1
 
 land.lhs.true1:                                 ; preds = %if.then
-  %call3 = call spir_func i64 @_Z12get_group_idj(i32 1) #5
-  %call4 = call spir_func i64 @_Z12get_local_idj(i32 0) #5
+  %call3 = call i64 @__mux_get_group_id(i32 1) #5
+  %call4 = call i64 @__mux_get_local_id(i32 0) #5
   %cmp3 = icmp slt i64 %call3, %call4
   br i1 %cmp3, label %land.lhs.true2, label %if.then2
 
 land.lhs.true2:                                 ; preds = %land.lhs.true1
-  %call5 = call spir_func i64 @_Z12get_local_idj(i32 0) #5
-  %call6 = call spir_func i64 @_Z12get_group_idj(i32 0) #5
+  %call5 = call i64 @__mux_get_local_id(i32 0) #5
+  %call6 = call i64 @__mux_get_group_id(i32 0) #5
   %cmp4 = icmp sgt i64 %call6, %call5
   br i1 %cmp4, label %if.then2, label %land.lhs.true3
 
 land.lhs.true3:                                 ; preds = %land.lhs.true2
-  %call7 = call spir_func i64 @_Z12get_group_idj(i32 0) #5
-  %call8 = call spir_func i64 @_Z12get_local_idj(i32 0) #5
+  %call7 = call i64 @__mux_get_group_id(i32 0) #5
+  %call8 = call i64 @__mux_get_local_id(i32 0) #5
   %cmp5 = icmp slt i64 %call7, %call8
   br i1 %cmp5, label %cond.true4, label %if.then2
 
 cond.true4:                                     ; preds = %land.lhs.true3
-  %call9 = call spir_func i64 @_Z12get_local_idj(i32 1) #5
+  %call9 = call i64 @__mux_get_local_id(i32 1) #5
   %idx2 = getelementptr inbounds float, float addrspace(1)* %input0, i64 %call9
   br label %if.then2
 
 if.then2:                                      ; preds = %cond.true4, %land.lhs.true3, %land.lhs.true2, %land.lhs.true1, %if.then
-  %call10 = call spir_func i64 @_Z12get_local_idj(i32 0) #5
+  %call10 = call i64 @__mux_get_local_id(i32 0) #5
   %conv = trunc i64 %call10 to i32
   %idx3 = sext i32 %conv to i64
   %idx4 = getelementptr inbounds [1152 x float], [1152 x float] addrspace(3)* @fuse_conv2d_broadcast_add_relu_1_kernel0.input1_shared, i64 0, i64 %idx3
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge3.ll
index dda323e5be329..ccd79ca20e4b6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge3.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge3.ll
@@ -22,15 +22,15 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
 ; Function Attrs: nounwind readnone
-declare spir_func i64 @_Z13get_global_idj(i32) #0
+declare i64 @__mux_get_global_id(i32) #0
 
 ; Function Attrs: nounwind readnone
 declare spir_func <4 x float> @_Z6vload4mPU3AS1Kf(i64, float addrspace(1)*)
 
 define spir_kernel void @boscc_merge3(float addrspace(1)* %out, i64 %n, float %m) {
 entry:
-  %gid0 = tail call spir_func i64 @_Z13get_global_idj(i32 0) #0
-  %gid1 = tail call spir_func i64 @_Z13get_global_idj(i32 1) #0
+  %gid0 = tail call i64 @__mux_get_global_id(i32 0) #0
+  %gid1 = tail call i64 @__mux_get_global_id(i32 1) #0
   %cmp1 = icmp slt i64 %gid0, %n
   br i1 %cmp1, label %if.then1, label %end
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/duplicate_preheader.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/duplicate_preheader.ll
index d7d306bedb8eb..caac5a4913b05 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/duplicate_preheader.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/duplicate_preheader.ll
@@ -28,11 +28,11 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: noduplicate
 declare void @__mux_work_group_barrier(i32, i32, i32) #1
 ; Function Attrs: nounwind readnone
-declare spir_func i64 @_Z12get_local_idj(i32)
+declare i64 @__mux_get_local_id(i32)
 
 define spir_kernel void @duplicate_preheader(i32 addrspace(1)* %out, i32 %n) {
 entry:
-  %id = tail call spir_func i64 @_Z12get_local_idj(i32 0)
+  %id = tail call i64 @__mux_get_local_id(i32 0)
   %cmp = icmp sgt i64 %id, 3
   br i1 %cmp, label %if.then, label %if.end
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops1.ll
index 8d7fd3dfe41b3..d3fcb5d70f7b6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops1.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops1.ll
@@ -22,10 +22,10 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
 ; Function Attrs: nounwind readnone
-declare spir_func i64 @_Z13get_global_idj(i32) #0
+declare i64 @__mux_get_global_id(i32) #0
 
 ; Function Attrs: nounwind readnone
-declare spir_func i64 @_Z15get_global_sizej(i32) #0
+declare i64 @__mux_get_global_size(i32) #0
 
 ; Function Attrs: nounwind readnone
 declare spir_func float @_Z3madfff(float, float, float) #0
@@ -33,8 +33,8 @@ declare spir_func float @_Z3madfff(float, float, float) #0
 ; Function Attrs: nounwind
 define spir_kernel void @nested_loops1(i32 %n, float addrspace(1)* %out) #1 {
 entry:
-  %gid = tail call spir_func i64 @_Z13get_global_idj(i32 0) #0
-  %gsize = tail call spir_func i64 @_Z15get_global_sizej(i32 0) #0
+  %gid = tail call i64 @__mux_get_global_id(i32 0) #0
+  %gsize = tail call i64 @__mux_get_global_size(i32 0) #0
   %trunc_gid = trunc i64 %gid to i32
   %trunc_gsize = trunc i64 %gsize to i32
   %cmp1 = icmp slt i32 %trunc_gid, %n
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops2.ll
index ab53afe07ee59..0b105a2bd0304 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops2.ll
@@ -24,7 +24,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: convergent nounwind
 define spir_kernel void @nested_loops2(i32 addrspace(1)* %out, i32 %n) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   %cmp = icmp slt i32 %conv, 16
   br i1 %cmp, label %if.then, label %if.end25
@@ -83,7 +83,7 @@ if.end25:                                         ; preds = %for.cond, %entry
 }
 
 ; Function Attrs: convergent nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops3.ll
index dbcef7f094a7c..ab7f0a99dce07 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops3.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops3.ll
@@ -22,7 +22,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
 ; Function Attrs: convergent nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #0
+declare i64 @__mux_get_global_id(i32) #0
 
 ; Function Attrs: nounwind readnone speculatable
 declare float @llvm.fmuladd.f32(float, float, float) #1
@@ -30,7 +30,7 @@ declare float @llvm.fmuladd.f32(float, float, float) #1
 ; Function Attrs: convergent nounwind
 define spir_kernel void @nested_loops3(float addrspace(1)* %symmat, float addrspace(1)* %data, i32 %m, i32 %n) #2 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #3
+  %call = call i64 @__mux_get_global_id(i32 0) #3
   %conv = trunc i64 %call to i32
   %sub = add nsw i32 %m, -1
   %cmp = icmp sgt i32 %sub, %conv
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops4.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops4.ll
index 03690d55d1146..0861376c97c47 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops4.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops4.ll
@@ -21,10 +21,10 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
 ; Function Attrs: nounwind readnone
-declare spir_func i64 @_Z13get_global_idj(i32) #0
+declare i64 @__mux_get_global_id(i32) #0
 
 ; Function Attrs: nounwind readnone
-declare spir_func i64 @_Z15get_global_sizej(i32) #0
+declare i64 @__mux_get_global_size(i32) #0
 
 ; Function Attrs: nounwind readnone
 declare spir_func float @_Z3dotDv2_fS_(<2 x float>, <2 x float>) #0
@@ -36,8 +36,8 @@ declare spir_func i32 @_Z6mul_hijj(i32, i32) #0
 
 define spir_kernel void @nested_loops4(i32 %n, float addrspace(1)* %out) {
 entry:
-  %gid = tail call spir_func i64 @_Z13get_global_idj(i32 0) #0
-  %gsize = tail call spir_func i64 @_Z15get_global_sizej(i32 0) #0
+  %gid = tail call i64 @__mux_get_global_id(i32 0) #0
+  %gsize = tail call i64 @__mux_get_global_size(i32 0) #0
   %trunc_gid = trunc i64 %gid to i32
   %trunc_gsize = trunc i64 %gsize to i32
   %cmp1 = icmp slt i32 %trunc_gid, %n
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops5.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops5.ll
index 443a7d4022268..7c7df2bcaf60e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops5.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops5.ll
@@ -21,14 +21,14 @@ source_filename = "Unknown buffer"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z12get_local_idj(i32)
+declare i64 @__mux_get_local_id(i32)
 
-declare spir_func i64 @_Z14get_local_sizej(i32)
+declare i64 @__mux_get_local_size(i32)
 
 define spir_kernel void @nested_loops5(float addrspace(1)*) {
 entry:
-  %lid = tail call spir_func i64 @_Z12get_local_idj(i32 0)
-  %lsize = tail call spir_func i64 @_Z14get_local_sizej(i32 0)
+  %lid = tail call i64 @__mux_get_local_id(i32 0)
+  %lsize = tail call i64 @__mux_get_local_size(i32 0)
   %cmp1 = icmp ult i64 %lid, %lsize
   br i1 %cmp1, label %loop, label %end
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization0.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization0.ll
index 7ed5152464a06..b6d4689590781 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization0.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization0.ll
@@ -103,7 +103,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @partial_linearization0(i32 addrspace(1)* %out, i32 %n) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   %rem = srem i32 %conv, 5
   %cmp = icmp eq i32 %rem, 0
@@ -233,7 +233,7 @@ if.end73:                                         ; preds = %if.end70, %if.end41
 }
 
 ; Function Attrs: nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization1.ll
index 88fcf417937ea..aa6a0574e072a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization1.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization1.ll
@@ -95,7 +95,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @partial_linearization1(i32 addrspace(1)* %out, i32 %n) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   br label %while.body
 
@@ -186,7 +186,7 @@ early:                                            ; preds = %for.end34, %for.end
 }
 
 ; Function Attrs: nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization10.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization10.ll
index 5901638978c51..339bdf357c1ab 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization10.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization10.ll
@@ -163,7 +163,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @partial_linearization10(i32 addrspace(1)* %out, i32 %n) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   br label %while.body
 
@@ -318,7 +318,7 @@ s:                                                ; preds = %for.cond68, %for.co
 }
 
 ; Function Attrs: nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization11.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization11.ll
index 0cd31969ada6c..180455e75554d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization11.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization11.ll
@@ -148,7 +148,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @partial_linearization11(i32 addrspace(1)* %out, i32 %n) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   br label %while.body
 
@@ -263,7 +263,7 @@ n46:                                              ; preds = %i44, %for.cond35
 }
 
 ; Function Attrs: nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization12.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization12.ll
index 042bb6922a543..eba24fbbd23c1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization12.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization12.ll
@@ -215,7 +215,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @partial_linearization12(i32 addrspace(1)* %out, i32 %n) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   br label %while.body
 
@@ -433,7 +433,7 @@ v:                                                ; preds = %for.cond107, %for.c
 }
 
 ; Function Attrs: nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization13.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization13.ll
index 16ca0d0c2dd34..f7937ad89512f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization13.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization13.ll
@@ -98,8 +98,8 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @partial_linearization13(i32 addrspace(1)* %out, i32 %n) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
-  %call1 = call spir_func i64 @_Z15get_global_sizej(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
+  %call1 = call i64 @__mux_get_global_size(i32 0) #2
   %add = add i64 %call, 1
   %cmp = icmp ult i64 %add, %call1
   br i1 %cmp, label %if.then, label %if.else
@@ -164,10 +164,10 @@ if.end17:                                         ; preds = %sw.bb14, %if.else,
 }
 
 ; Function Attrs: nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 ; Function Attrs: nounwind readonly
-declare spir_func i64 @_Z15get_global_sizej(i32) #1
+declare i64 @__mux_get_global_size(i32) #1
 
 attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization14.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization14.ll
index cff65f417327d..da33027e89833 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization14.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization14.ll
@@ -101,7 +101,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: convergent nounwind
 define spir_kernel void @partial_linearization14(i32 addrspace(1)* %out, i32 %n) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   %cmp = icmp slt i32 %n, 5
   br i1 %cmp, label %for.cond, label %while.body
@@ -200,7 +200,7 @@ early:                                            ; preds = %for.end49, %for.end
 }
 
 ; Function Attrs: convergent nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization15.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization15.ll
index 103ccaccb3184..2dc5add4f2f75 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization15.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization15.ll
@@ -147,7 +147,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: convergent nounwind
 define spir_kernel void @partial_linearization15(i32 addrspace(1)* %out, i32 %n) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   br label %while.body
 
@@ -274,7 +274,7 @@ q:                                                ; preds = %for.cond59, %for.co
 }
 
 ; Function Attrs: convergent nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization16.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization16.ll
index 3272393f06bab..63a13af815eae 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization16.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization16.ll
@@ -109,7 +109,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: convergent nounwind
 define spir_kernel void @partial_linearization16(i32 addrspace(1)* %out, i32 %n) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   %cmp = icmp slt i32 %n, 5
   br i1 %cmp, label %for.cond, label %while.body
@@ -220,7 +220,7 @@ early:                                            ; preds = %for.cond52, %for.en
 }
 
 ; Function Attrs: convergent nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization17.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization17.ll
index 59b355d4eb5a5..5e7b83b787240 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization17.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization17.ll
@@ -132,7 +132,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: convergent nounwind
 define spir_kernel void @partial_linearization17(i32 addrspace(1)* %out, i32 %n) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   br label %while.body
 
@@ -251,7 +251,7 @@ p:                                                ; preds = %for.cond60, %for.en
 }
 
 ; Function Attrs: convergent nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization18.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization18.ll
index d8fc25d7dd7df..571c0a48fdc06 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization18.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization18.ll
@@ -111,7 +111,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: convergent nounwind
 define spir_kernel void @partial_linearization18(i32 addrspace(1)* %out, i32 %n) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   br label %while.body
 
@@ -194,7 +194,7 @@ if.end42:                                         ; preds = %if.else40, %i38
 }
 
 ; Function Attrs: convergent nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization19.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization19.ll
index b63ea25a6ecc7..7b67cbd488fdd 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization19.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization19.ll
@@ -120,7 +120,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: convergent nounwind
 define spir_kernel void @partial_linearization19(i32 addrspace(1)* %out, i32 %n) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   br label %while.body
 
@@ -207,7 +207,7 @@ j:                                                ; preds = %for.cond40, %for.co
 }
 
 ; Function Attrs: convergent nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization2.ll
index d58b2a1e23cfc..371755af3e382 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization2.ll
@@ -93,7 +93,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @partial_linearization2(i32 addrspace(1)* %out, i32 %n) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   %cmp = icmp sgt i32 %n, 10
   br i1 %cmp, label %if.then, label %if.else17
@@ -187,7 +187,7 @@ end:                                              ; preds = %i42, %h
 }
 
 ; Function Attrs: nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization20.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization20.ll
index f810588857ef9..acfb739e81564 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization20.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization20.ll
@@ -106,7 +106,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: convergent nounwind
 define spir_kernel void @partial_linearization20(i32 addrspace(1)* %out, i32 %n) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   br label %while.body
 
@@ -159,7 +159,7 @@ g:                                                ; preds = %for.cond, %e, %whil
 }
 
 ; Function Attrs: convergent nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization21.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization21.ll
index 243f175fde0f4..c08463fce5e5c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization21.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization21.ll
@@ -102,7 +102,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: convergent nounwind
 define spir_kernel void @partial_linearization21(i32 addrspace(1)* %out, i32 %n) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   br label %while.body
 
@@ -141,7 +141,7 @@ f:                                                ; preds = %e, %if.else, %while
 }
 
 ; Function Attrs: convergent nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization22.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization22.ll
index 3b969e4c3d09a..c5fe9d0b5efb2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization22.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization22.ll
@@ -112,7 +112,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: convergent nounwind
 define spir_kernel void @partial_linearization22(i32 addrspace(1)* %out, i32 %n) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   br label %while.body
 
@@ -172,7 +172,7 @@ h:                                                ; preds = %for.cond, %f, %if.e
 }
 
 ; Function Attrs: convergent nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization3.ll
index 86e8e6396a40d..56ec40be4a215 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization3.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization3.ll
@@ -91,7 +91,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @partial_linearization3(i32 addrspace(1)* %out, i32 %n) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   %cmp = icmp sgt i32 %n, 10
   br i1 %cmp, label %if.then, label %if.else17
@@ -185,7 +185,7 @@ end:                                              ; preds = %i42, %for.cond
 }
 
 ; Function Attrs: nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization4.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization4.ll
index 7347e9af1aa18..d82b4174fa513 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization4.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization4.ll
@@ -82,7 +82,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @partial_linearization4(i32 addrspace(1)* %out, i32 %n) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   %0 = icmp eq i32 %conv, -2147483648
   %1 = icmp eq i32 %n, -1
@@ -141,7 +141,7 @@ g:                                                ; preds = %f, %e
 }
 
 ; Function Attrs: nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization5.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization5.ll
index 248d033b44e75..a2ff9ce17de79 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization5.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization5.ll
@@ -90,7 +90,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @partial_linearization5(i32 addrspace(1)* %out, i32 %n) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   %rem1 = and i32 %conv, 1
   %cmp = icmp eq i32 %rem1, 0
@@ -160,7 +160,7 @@ g:                                                ; preds = %f, %if.then
 }
 
 ; Function Attrs: nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization6.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization6.ll
index e8f201fbbaf0f..c5a8af2b8b89b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization6.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization6.ll
@@ -87,7 +87,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @partial_linearization6(i32 addrspace(1)* %out, i32 %n) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   br label %while.body
 
@@ -142,7 +142,7 @@ early:                                            ; preds = %e, %while.end
 }
 
 ; Function Attrs: nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization7.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization7.ll
index eeb047e812ffd..1bbf53c652c9e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization7.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization7.ll
@@ -104,7 +104,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @partial_linearization7(i32 addrspace(1)* %out, i32 %n) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   %cmp = icmp sgt i32 %n, 10
   br i1 %cmp, label %if.then, label %if.else5
@@ -167,7 +167,7 @@ i29:                                              ; preds = %h, %for.cond19
 }
 
 ; Function Attrs: nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization8.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization8.ll
index 984e5c44676c0..e106fc964a61b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization8.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization8.ll
@@ -84,7 +84,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @partial_linearization8(i32 addrspace(1)* %out, i32 %n) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   %0 = icmp eq i32 %conv, -2147483648
   %1 = icmp eq i32 %n, -1
@@ -144,7 +144,7 @@ g:                                                ; preds = %f, %e
 }
 
 ; Function Attrs: nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization9.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization9.ll
index 9bd0771950dd3..8eabade1279f6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization9.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization9.ll
@@ -77,7 +77,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @partial_linearization9(i32 addrspace(1)* %out, i32 %n) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   br label %while.body
 
@@ -112,7 +112,7 @@ while.end:                                        ; preds = %for.end
 }
 
 ; Function Attrs: nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/printf.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/printf.ll
index 593f9a16e687d..d3a4c0600cf14 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/printf.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/printf.ll
@@ -38,7 +38,7 @@ entry:
   store i32 addrspace(1)* %in2, i32 addrspace(1)** %in2.addr, align 8
   store i32 addrspace(1)* %out, i32 addrspace(1)** %out.addr, align 8
   store i32 addrspace(1)* %status, i32 addrspace(1)** %status.addr, align 8
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #4
+  %call = call i64 @__mux_get_global_id(i32 0) #4
   store i64 %call, i64* %tid, align 8
   %0 = load i32 addrspace(1)*, i32 addrspace(1)** %in1.addr, align 8
   %1 = load i64, i64* %tid, align 8
@@ -58,17 +58,17 @@ entry:
   %9 = load i64, i64* %tid, align 8
   %conv = trunc i64 %9 to i32
   %10 = load i32, i32* %sum, align 4
-  %11 = call spir_func i64 @_Z14get_num_groupsj(i32 0)
+  %11 = call i64 @__mux_get_num_groups(i32 0)
   %12 = trunc i64 %11 to i32
-  %13 = call spir_func i64 @_Z14get_num_groupsj(i32 1)
+  %13 = call i64 @__mux_get_num_groups(i32 1)
   %14 = trunc i64 %13 to i32
-  %15 = call spir_func i64 @_Z14get_num_groupsj(i32 2)
+  %15 = call i64 @__mux_get_num_groups(i32 2)
   %16 = trunc i64 %15 to i32
-  %17 = call spir_func i64 @_Z12get_group_idj(i32 0)
+  %17 = call i64 @__mux_get_group_id(i32 0)
   %18 = trunc i64 %17 to i32
-  %19 = call spir_func i64 @_Z12get_group_idj(i32 1)
+  %19 = call i64 @__mux_get_group_id(i32 1)
   %20 = trunc i64 %19 to i32
-  %21 = call spir_func i64 @_Z12get_group_idj(i32 2)
+  %21 = call i64 @__mux_get_group_id(i32 2)
   %22 = trunc i64 %21 to i32
   %23 = mul i32 %12, %20
   %24 = mul i32 %14, %16
@@ -117,9 +117,9 @@ store.i:                                          ; preds = %entry
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
-declare spir_func i64 @_Z12get_group_idj(i32)
-declare spir_func i64 @_Z14get_num_groupsj(i32)
+declare i64 @__mux_get_global_id(i32)
+declare i64 @__mux_get_group_id(i32)
+declare i64 @__mux_get_num_groups(i32)
 
 ; We can't vectorize this control flow
 ; CHECK: Error: Failed to vectorize function 'printf_add'
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/basic_mem2reg.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/basic_mem2reg.ll
index 43af98e580e1a..5a23321bf148a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/basic_mem2reg.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/basic_mem2reg.ll
@@ -24,7 +24,7 @@ entry:
   %d = alloca i32
   %e = alloca i32
   %f = alloca float
-  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %gid = call i64 @__mux_get_global_id(i32 0)
   %sum = add i32 %a, %b
   store i32 %sum, i32* %d, align 4
   store i32 %sum, i32* %e, align 4
@@ -43,12 +43,12 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 declare spir_func i32 @foo(i32*)
 
 ; CHECK: define spir_kernel void @__vecz_v4_test(i32 %a, i32 %b, ptr %c, float %rf)
 ; CHECK: %e = alloca i32
-; CHECK: %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: %gid = call i64 @__mux_get_global_id(i32 0)
 ; CHECK: %sum = add i32 %a, %b
 ; CHECK: store i32 %sum, ptr %e
 ; CHECK: %call = call spir_func i32 @foo(ptr{{.*}} %e)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/basic_vecz_mem2reg.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/basic_vecz_mem2reg.ll
index 9aa8796365e19..fcfbbee2bd38a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/basic_vecz_mem2reg.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/basic_vecz_mem2reg.ll
@@ -23,7 +23,7 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @load_store_type_mismatch_no_bitcast(ptr addrspace(1) %p) {
   %data = alloca i32, align 4
-  %1 = tail call spir_func i64 @_Z13get_global_idj(i32 0) #4
+  %1 = tail call i64 @__mux_get_global_id(i32 0) #4
   %2 = getelementptr inbounds i32, ptr addrspace(1) %p, i64 %1
   %3 = load i32, ptr addrspace(1) %2, align 4
   store i32 %3, ptr %data, align 4
@@ -33,7 +33,7 @@ define spir_kernel void @load_store_type_mismatch_no_bitcast(ptr addrspace(1) %p
 
 define spir_kernel void @load_type_size_mismatch_no_bitcast(ptr addrspace(1) %p) {
   %data = alloca i32, align 4
-  %1 = tail call spir_func i64 @_Z13get_global_idj(i32 0) #4
+  %1 = tail call i64 @__mux_get_global_id(i32 0) #4
   %2 = getelementptr inbounds i32, ptr addrspace(1) %p, i64 %1
   %3 = load i32, ptr addrspace(1) %2, align 4
   store i32 %3, ptr %data, align 4
@@ -43,7 +43,7 @@ define spir_kernel void @load_type_size_mismatch_no_bitcast(ptr addrspace(1) %p)
 
 define spir_kernel void @store_type_size_mismatch_no_bitcast(ptr addrspace(1) %p) {
   %data = alloca i32, align 4
-  %1 = tail call spir_func i64 @_Z13get_global_idj(i32 0) #4
+  %1 = tail call i64 @__mux_get_global_id(i32 0) #4
   %2 = getelementptr inbounds i16, ptr addrspace(1) %p, i64 %1
   %3 = load i16, ptr addrspace(1) %2, align 4
   store i16 %3, ptr %data, align 2
@@ -51,7 +51,7 @@ define spir_kernel void @store_type_size_mismatch_no_bitcast(ptr addrspace(1) %p
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; CHECK: define spir_kernel void @__vecz_v4_load_store_type_mismatch_no_bitcast(ptr addrspace(1) %p)
 ; CHECK-NOT: alloca i32
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/builtin_pointer_return.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/builtin_pointer_return.ll
index dbaa44b4f8450..74f64b5b77c12 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/builtin_pointer_return.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/builtin_pointer_return.ll
@@ -19,7 +19,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 ; RUN: veczc -vecz-simd-width=4 -S < %s | FileCheck %s
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 declare spir_func float @_Z5fractfPf(float, float*)
 declare spir_func <2 x float> @_Z5fractDv2_fPS_(<2 x float>, <2 x float>*)
@@ -31,7 +31,7 @@ declare spir_func <8 x float> @_Z5fractDv8_fPS_(<8 x float>, <8 x float>*)
 
 define spir_kernel void @fract_v1(float* %xptr, float* %outptr, float* %ioutptr) {
   %iouta = alloca float
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidx.x = getelementptr inbounds float, float* %xptr, i64 %idx
   %x = load float, float* %arrayidx.x, align 4
   %out = call spir_func float @_Z5fractfPf(float %x, float* %iouta)
@@ -49,7 +49,7 @@ define spir_kernel void @fract_v1(float* %xptr, float* %outptr, float* %ioutptr)
 
 define spir_kernel void @fract_v2(<2 x float>* %xptr, <2 x float>* %outptr, <2 x float>* %ioutptr) {
   %iouta = alloca <2 x float>
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidx.x = getelementptr inbounds <2 x float>, <2 x float>* %xptr, i64 %idx
   %x = load <2 x float>, <2 x float>* %arrayidx.x, align 8
   %out = call spir_func <2 x float> @_Z5fractDv2_fPS_(<2 x float> %x, <2 x float>* %iouta)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/control_flow_conversion_ptrs.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/control_flow_conversion_ptrs.ll
index c70bf0fad5504..689cf30575889 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/control_flow_conversion_ptrs.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/control_flow_conversion_ptrs.ll
@@ -19,12 +19,12 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 define spir_kernel void @test_varying_if_ptr(i32 %a, ptr %b, ptr %on_true, ptr %on_false) {
 entry:
   %conv = sext i32 %a to i64
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %cmp = icmp eq i64 %conv, %call
   br i1 %cmp, label %if.then, label %if.else
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/interleaved_load_ooo.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/interleaved_load_ooo.ll
index 66427a18fa693..81a08efe6f618 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/interleaved_load_ooo.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/interleaved_load_ooo.ll
@@ -23,9 +23,9 @@ target triple = "spir64-unknown-unknown"
 
 define dso_local spir_kernel void @interleaved_load_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %stride) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %conv = trunc i64 %call to i32
-  %call1 = tail call spir_func i64 @_Z13get_global_idj(i32 1)
+  %call1 = tail call i64 @__mux_get_global_id(i32 1)
   %conv2 = trunc i64 %call1 to i32
   %mul = mul nsw i32 %conv2, %stride
   %add = add nsw i32 %conv, %mul
@@ -53,5 +53,5 @@ entry:
 ; CHECK:  %deinterleave1 = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP2]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 ; CHECK:  %sub1 = sub nsw <4 x i32> %deinterleave1, %deinterleave
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 declare <4 x i32> @__vecz_b_interleaved_load4_2_Dv4_jPU3AS1j(i32 addrspace(1)*)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/load_add_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/load_add_store.ll
index d8c0981879967..abcbc465aae0e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/load_add_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/load_add_store.ll
@@ -19,11 +19,11 @@
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 define spir_kernel void @load_add_store(ptr %aptr, ptr %bptr, ptr %zptr) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidxa = getelementptr inbounds i32, ptr %aptr, i64 %idx
   %arrayidxb = getelementptr inbounds i32, ptr %bptr, i64 %idx
   %arrayidxz = getelementptr inbounds i32, ptr %zptr, i64 %idx
@@ -33,7 +33,7 @@ entry:
   store i32 %sum, ptr %arrayidxz, align 4
   ret void
 ; CHECK-LABEL: @__vecz_v4_load_add_store(ptr %aptr, ptr %bptr, ptr %zptr)
-; CHECK: %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: %idx = call i64 @__mux_get_global_id(i32 0)
 ; CHECK: %arrayidxa = getelementptr inbounds i32, ptr %aptr, i64 %idx
 ; CHECK: %arrayidxb = getelementptr inbounds i32, ptr %bptr, i64 %idx
 ; CHECK: %arrayidxz = getelementptr inbounds i32, ptr %zptr, i64 %idx
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/masked_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/masked_store.ll
index df32ce419d5e2..43e027f0bf8b8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/masked_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/masked_store.ll
@@ -19,12 +19,12 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 define spir_kernel void @test_varying_if(i32 %a, ptr %b, float %on_true, float %on_false) {
 entry:
   %conv = sext i32 %a to i64
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %cmp = icmp eq i64 %conv, %call
   br i1 %cmp, label %if.then, label %if.else
 
@@ -46,7 +46,7 @@ if.end:
 define spir_kernel void @test_varying_if_as3(i32 %a, ptr addrspace(3) %b, float %on_true, float %on_false) {
 entry:
   %conv = sext i32 %a to i64
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %cmp = icmp eq i64 %conv, %call
   br i1 %cmp, label %if.then, label %if.else
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/remove_intptr.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/remove_intptr.ll
index 4816edba71a2a..6872f1118377f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/remove_intptr.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/remove_intptr.ll
@@ -26,7 +26,7 @@ target triple = "spir64-unknown-unknown"
 ; CHECK: store i64 %remove_intptr1, ptr addrspace(1) %out, align 8
 define spir_kernel void @intptr_cast_i8(i8 addrspace(1)* %in, i64 addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %0 = ptrtoint i8 addrspace(1)* %in to i64
   %shl = shl i64 %call, 2
   %add = add i64 %shl, %0
@@ -43,7 +43,7 @@ entry:
 ; CHECK: store i64 %remove_intptr1, ptr addrspace(1) %out, align 8
 define spir_kernel void @intptr_cast_i16(i16 addrspace(1)* %in, i64 addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %0 = ptrtoint i16 addrspace(1)* %in to i64
   %shl = shl i64 %call, 2
   %add = add i64 %shl, %0
@@ -51,4 +51,4 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/ternary_transform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/ternary_transform.ll
index 82343981d5f40..8121eec00c7c7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/ternary_transform.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/ternary_transform.ll
@@ -21,7 +21,7 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @test_positive(i64 %a, i64 %b, i64* %c) {
 entry:
-  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %gid = call i64 @__mux_get_global_id(i32 0)
   %cond = icmp eq i64 %a, %gid
   %c0 = getelementptr i64, i64* %c, i64 %gid
   store i64 %b, i64* %c0, align 4
@@ -35,7 +35,7 @@ entry:
 
 define spir_kernel void @test_positive_gep_different_type(i64 %a, i64 %b, i8* %c) {
 entry:
-  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %gid = call i64 @__mux_get_global_id(i32 0)
   %cond = icmp eq i64 %a, %gid
   %c0 = getelementptr i64, i64* %c, i64 %gid
   store i64 %b, i64* %c0, align 4
@@ -49,7 +49,7 @@ entry:
 
 define spir_kernel void @test_negative(i64 %a, i64 %b, i64* %c) {
 entry:
-  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %gid = call i64 @__mux_get_global_id(i32 0)
   %cond = icmp eq i64 %a, %gid
   %c0 = getelementptr i64, i64* %c, i64 %gid
   %c1 = getelementptr i64, i64* %c, i64 0
@@ -61,7 +61,7 @@ entry:
 
 define spir_kernel void @test_vector_scalar_cond(i64 %a, <2 x i32> %b, <2 x i32>* %c) {
 entry:
-  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %gid = call i64 @__mux_get_global_id(i32 0)
   %cond = icmp eq i64 %a, %gid
   %c0 = getelementptr <2 x i32>, <2 x i32>* %c, i64 %gid
   %c1 = getelementptr <2 x i32>, <2 x i32>* %c, i64 0
@@ -71,10 +71,10 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; CHECK-LABEL: define spir_kernel void @__vecz_v4_test_positive(i64 %a, i64 %b, ptr %c)
-; CHECK: %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: %gid = call i64 @__mux_get_global_id(i32 0)
 ; CHECK: %cond = icmp eq i64 %a, %gid
 ; CHECK: %c0 = getelementptr i64, ptr %c, i64 %gid
 ; CHECK: store i64 %b, ptr %c0, align 4
@@ -87,7 +87,7 @@ declare spir_func i64 @_Z13get_global_idj(i32)
 ; CHECK: call void @__vecz_b_masked_store4_mu3ptrb(i64 1, ptr %[[GEP2]], i1 %[[XOR]])
 
 ; CHECK-LABEL: define spir_kernel void @__vecz_v4_test_positive_gep_different_type(i64 %a, i64 %b, ptr %c)
-; CHECK: %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: %gid = call i64 @__mux_get_global_id(i32 0)
 ; CHECK: %cond = icmp eq i64 %a, %gid
 ; CHECK: %c0 = getelementptr i64, ptr %c, i64 %gid
 ; CHECK: store i64 %b, ptr %c0, align 4
@@ -100,7 +100,7 @@ declare spir_func i64 @_Z13get_global_idj(i32)
 ; CHECK: call void @__vecz_b_masked_store4_hu3ptrb(i8 1, ptr %[[GEP2]], i1 %[[XOR]])
 
 ; CHECK-LABEL: define spir_kernel void @__vecz_v4_test_negative(i64 %a, i64 %b, ptr %c)
-; CHECK: %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: %gid = call i64 @__mux_get_global_id(i32 0)
 ; CHECK: %cond = icmp eq i64 %a, %gid
 ; CHECK: %c0 = getelementptr i64, ptr %c, i64 %gid
 ; CHECK: %c1 = getelementptr i64, ptr %c, i64 0
@@ -109,7 +109,7 @@ declare spir_func i64 @_Z13get_global_idj(i32)
 
 ; Note: we don't perform this transform on vector accesses - see CA-4337.
 ; CHECK: define spir_kernel void @__vecz_v4_test_vector_scalar_cond(i64 %a, <2 x i32> %b, ptr %c)
-; CHECK:   %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK:   %gid = call i64 @__mux_get_global_id(i32 0)
 ; CHECK:   %cond = icmp eq i64 %a, %gid
 ; CHECK:   %c0 = getelementptr <2 x i32>, ptr %c, i64 %gid
 ; CHECK:   %c1 = getelementptr <2 x i32>, ptr %c, i64 0
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store.ll
index 478fb3511df33..8f246189cd678 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store.ll
@@ -23,11 +23,11 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #3
+  %call = call i64 @__mux_get_global_id(i32 0) #3
   %add.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call
   %.cast = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %add.ptr, i64 0, i64 0
   %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
-  call spir_func void @_Z7barrierj(i32 2) #3
+  call void @__mux_work_group_barrier(i32 0, i32 2, i32 528) #3
   store double 1.600000e+01, double addrspace(1)* %.cast, align 8
   %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
   %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
@@ -47,9 +47,9 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
-declare spir_func void @_Z7barrierj(i32) #1
+declare void @__mux_work_group_barrier(i32, i32, i32) #1
 
 ; Function Attrs: nounwind readnone
 declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) #2
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store_as_masked.ll
index de8b28c16a9f5..1b49ee53f5bbe 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store_as_masked.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store_as_masked.ll
@@ -23,11 +23,11 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #3
+  %call = call i64 @__mux_get_global_id(i32 0) #3
   %add.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call
   %.cast = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %add.ptr, i64 0, i64 0
   %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
-  call spir_func void @_Z7barrierj(i32 2) #3
+  call void @__mux_work_group_barrier(i32 0, i32 2, i32 528) #3
   store double 1.600000e+01, double addrspace(1)* %.cast, align 8
   %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
   %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
@@ -47,9 +47,9 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
-declare spir_func void @_Z7barrierj(i32) #1
+declare void @__mux_work_group_barrier(i32, i32, i32) #1
 
 ; Function Attrs: nounwind readnone
 declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) #2
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_uniform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_uniform.ll
index c9e6e5c230429..dca8e8649bd00 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_uniform.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_uniform.ll
@@ -23,13 +23,13 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @vector_loop(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %cmp = icmp eq i64 %call, 0
   br i1 %cmp, label %for.end, label %for.cond
 
 for.cond:                                         ; preds = %entry, %for.body
   %storemerge = phi <4 x i32> [ %inc, %for.body ], [ zeroinitializer, %entry ]
-  %call1 = call spir_func i64 @_Z15get_global_sizej(i32 0)
+  %call1 = call i64 @__mux_get_global_size(i32 0)
   %conv = trunc i64 %call1 to i32
   %splat.splatinsert = insertelement <4 x i32> undef, i32 %conv, i32 0
   %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -77,8 +77,8 @@ for.end:                                          ; preds = %entry, %for.cond
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
-declare spir_func i64 @_Z15get_global_sizej(i32)
+declare i64 @__mux_get_global_id(i32)
+declare i64 @__mux_get_global_size(i32)
 
 ; This test checks if a uniform <4 x i32> phi is not scalarized
 ; CHECK: define spir_kernel void @__vecz_v4_vector_loop
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_varying.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_varying.ll
index d849558b6b390..010cafb0a3b70 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_varying.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_varying.ll
@@ -23,7 +23,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @vector_loop(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %in2, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %initaddr = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in2, i64 %call
   %init = load <4 x i32>, <4 x i32> addrspace(1)* %initaddr
   %cmp = icmp eq i64 %call, 0
@@ -31,7 +31,7 @@ entry:
 
 for.cond:                                         ; preds = %entry, %for.body
   %storemerge = phi <4 x i32> [ %inc, %for.body ], [ %init, %entry ]
-  %call1 = call spir_func i64 @_Z15get_global_sizej(i32 0)
+  %call1 = call i64 @__mux_get_global_size(i32 0)
   %conv = trunc i64 %call1 to i32
   %0 = extractelement <4 x i32> %storemerge, i64 0
   %cmp2 = icmp slt i32 %0, %conv
@@ -77,8 +77,8 @@ for.end:                                          ; preds = %entry, %for.cond
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
-declare spir_func i64 @_Z15get_global_sizej(i32)
+declare i64 @__mux_get_global_id(i32)
+declare i64 @__mux_get_global_size(i32)
 
 ; This test checks if a varying <4 x i32> phi gets scalarized
 ; if it is only accessed through individually extracted elements.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/broadcast_vector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/broadcast_vector.ll
index eebeeb800820d..eb72c2d7a195a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/broadcast_vector.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/broadcast_vector.ll
@@ -21,11 +21,11 @@
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 define dso_local spir_kernel void @vector_broadcast_const(<4 x float> addrspace(1)* nocapture readonly %in, <4 x float> addrspace(1)* nocapture %out) local_unnamed_addr #0 {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %call = tail call i64 @__mux_get_global_id(i32 0) #6
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x float> addrspace(1)*
   %1 = load <4 x float>, <4 x float> addrspace(1)* %0, align 16
@@ -37,7 +37,7 @@ entry:
 
 define dso_local spir_kernel void @vector_broadcast(<4 x float> addrspace(1)* nocapture readonly %in, <4 x float> %addend, <4 x float> addrspace(1)* nocapture %out) local_unnamed_addr #0 {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %call = tail call i64 @__mux_get_global_id(i32 0) #6
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x float> addrspace(1)*
   %1 = load <4 x float>, <4 x float> addrspace(1)* %0, align 16
@@ -49,7 +49,7 @@ entry:
 
 define dso_local spir_kernel void @vector_broadcast_illegal(<32 x float> addrspace(1)* nocapture readonly %in, <32 x float> %addend, <32 x float> addrspace(1)* nocapture %out) local_unnamed_addr #0 {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %call = tail call i64 @__mux_get_global_id(i32 0) #6
   %arrayidx = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %in, i64 %call
   %0 = bitcast <32 x float> addrspace(1)* %arrayidx to <32 x float> addrspace(1)*
   %1 = load <32 x float>, <32 x float> addrspace(1)* %0, align 64
@@ -61,7 +61,7 @@ entry:
 
 define dso_local spir_kernel void @vector_broadcast_regression(<4 x float> addrspace(1)* nocapture readonly %in, i32 %nancode, <4 x float> addrspace(1)* nocapture %out) local_unnamed_addr #0 {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %call = tail call i64 @__mux_get_global_id(i32 0) #6
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x i32> addrspace(1)*
   %1 = load <4 x i32>, <4 x i32> addrspace(1)* %0, align 16
@@ -81,7 +81,7 @@ entry:
 define dso_local spir_kernel void @vector_broadcast_insertpt(<4 x float> addrspace(1)* nocapture readonly %in, <4 x float> %addend, i32 %nancode, <4 x float> addrspace(1)* nocapture %out, <4 x i32> addrspace(1)* nocapture %out2) local_unnamed_addr #0 {
 entry:
   %existing.alloc = alloca <4 x i32>
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %call = tail call i64 @__mux_get_global_id(i32 0) #6
   store <4 x i32> zeroinitializer, <4 x i32>* %existing.alloc
   %scalar = bitcast <4 x i32>* %existing.alloc to i32*
   store i32 1, i32* %scalar
@@ -99,7 +99,7 @@ entry:
 
 define dso_local spir_kernel void @vector_mask_broadcast(<4 x float> addrspace(1)* nocapture readonly %in, <4 x i1> %input, <4 x float> %woof, <4 x float> addrspace(1)* nocapture %out) local_unnamed_addr #0 {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %call = tail call i64 @__mux_get_global_id(i32 0) #6
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x float> addrspace(1)*
   %1 = load <4 x float>, <4 x float> addrspace(1)* %0, align 16
@@ -112,7 +112,7 @@ entry:
 }
 ; CHECK-LABEL: @__vecz_nxv4_vector_broadcast_const(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CALL:%.*]] = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0)
 ; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]]
 ; CHECK-NEXT:    store <vscale x 16 x float> shufflevector (<vscale x 16 x float> insertelement (<vscale x 16 x float> {{(undef|poison)}}, float 0x7FF0000020000000, {{i32|i64}} 0), <vscale x 16 x float> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer), ptr addrspace(1) [[ARRAYIDX3]], align 16
 ; CHECK-NEXT:    ret void
@@ -125,7 +125,7 @@ entry:
 ; CHECK-NEXT:  [[XLEN:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:  [[TMP0:%.*]] = shl i64 [[XLEN]], 4
 ; CHECK-NEXT:  [[TMP1:%.*]] = call <vscale x 16 x float> @llvm.riscv.vrgather.vv.nxv16f32.i64(<vscale x 16 x float> undef, <vscale x 16 x float> [[VS2]], <vscale x 16 x i32> [[VS1]], i64 [[TMP0]])
-; CHECK-NEXT:  [[CALL:%.*]] = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK-NEXT:  [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0)
 ; CHECK-NEXT:  [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]]
 ; CHECK-NEXT:  [[TMP3:%.*]] = load <vscale x 16 x float>, ptr addrspace(1) [[ARRAYIDX]], align 4
 ; CHECK-NEXT:  [[TMP4:%.*]] = fadd <vscale x 16 x float> [[TMP3]], [[TMP1]]
@@ -142,7 +142,7 @@ entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = {{s|z}}ext <vscale x 128 x i32> [[IDX1]] to <vscale x 128 x i64>
 ; CHECK-NEXT:    [[VEC_ALLOC:%.*]] = getelementptr inbounds float, ptr [[FIXLEN_ALLOC]], <vscale x 128 x i64> [[TMP0]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 128 x float> @llvm.masked.gather.nxv128f32.nxv128p0(<vscale x 128 x ptr> [[VEC_ALLOC]], i32 4, <vscale x 128 x i1> shufflevector (<vscale x 128 x i1> insertelement (<vscale x 128 x i1> poison, i1 true, {{i32|i64}} 0), <vscale x 128 x i1> poison, <vscale x 128 x i32> zeroinitializer), <vscale x 128 x float> undef)
-; CHECK-NEXT:    [[CALL:%.*]] = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0)
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds <32 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <vscale x 128 x float>, ptr addrspace(1) [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = fadd <vscale x 128 x float> [[TMP3]], [[TMP1]]
@@ -152,7 +152,7 @@ entry:
 ;
 ; CHECK-LABEL: @__vecz_nxv4_vector_broadcast_regression(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CALL:%.*]] = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0)
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 16 x i32>, ptr addrspace(1) [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    [[AND1_I_I_I1_I1:%.*]] = and <vscale x 16 x i32> [[TMP1]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 2139095040, {{i32|i64}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
@@ -176,7 +176,7 @@ entry:
 ; CHECK-NEXT:  [[XLEN:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:  [[TMP0:%.*]] = shl i64 [[XLEN]], 4
 ; CHECK-NEXT:  [[TMP1:%.*]] = call <vscale x 16 x float> @llvm.riscv.vrgather.vv.nxv16f32.i64(<vscale x 16 x float> undef, <vscale x 16 x float> [[VS21]], <vscale x 16 x i32> [[VS13]], i64 [[TMP0]])
-; CHECK-NEXT:  [[CALL:%.*]] = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK-NEXT:  [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0)
 ; CHECK-NEXT:  store <4 x i32> zeroinitializer, ptr [[EXISTINGALLOC]], align 16
 ; CHECK-NEXT:  store i32 1, ptr [[EXISTINGALLOC]], align 16
 ; CHECK-NEXT:  [[V:%.*]] = load <4 x i32>, ptr [[EXISTINGALLOC]], align 16
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/extract_element.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/extract_element.ll
index 27519421f4c68..97c2c61f539c5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/extract_element.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/extract_element.ll
@@ -25,11 +25,11 @@
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 define spir_kernel void @extract_element(<4 x float> addrspace(1)* nocapture readonly %in, i32 %idx, float addrspace(1)* nocapture %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %call = tail call i64 @__mux_get_global_id(i32 0) #6
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x float> addrspace(1)*
   %1 = load <4 x float>, <4 x float> addrspace(1)* %0, align 16
@@ -43,7 +43,7 @@ entry:
 
 define spir_kernel void @extract_element_ilegal(<32 x float> addrspace(1)* nocapture readonly %in, i32 %idx, float addrspace(1)* nocapture %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %call = tail call i64 @__mux_get_global_id(i32 0) #6
   %arrayidx = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %in, i64 %call
   %0 = bitcast <32 x float> addrspace(1)* %arrayidx to <32 x float> addrspace(1)*
   %1 = load <32 x float>, <32 x float> addrspace(1)* %0, align 64
@@ -55,7 +55,7 @@ entry:
 
 define spir_kernel void @extract_element_uniform(<4 x float> %in, i32 %idx, float addrspace(1)* nocapture %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %call = tail call i64 @__mux_get_global_id(i32 0) #6
   %0 = extractelement <4 x float> %in, i32 %idx
   %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call
   store float %0, float addrspace(1)* %arrayidx3, align 4
@@ -64,7 +64,7 @@ entry:
 
 define spir_kernel void @extract_element_uniform_vec(<4 x float> %in, float addrspace(1)* nocapture %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %call = tail call i64 @__mux_get_global_id(i32 0) #6
   %i = urem i64 %call, 4
   %0 = extractelement <4 x float> %in, i64 %i
   %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call
@@ -74,7 +74,7 @@ entry:
 
 define spir_kernel void @extract_element_varying_indices(<4 x float> addrspace(1)* %in, i32 addrspace(1)* %idxs, float addrspace(1)* nocapture %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %arrayidxidx = getelementptr inbounds i32, i32 addrspace(1)* %idxs, i64 %call
   %idx = load i32, i32 addrspace(1)* %arrayidxidx
   %i = urem i32 %idx, 4
@@ -88,7 +88,7 @@ entry:
 
 define spir_kernel void @extract_element_bool(<4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b, i32 %idx, i32 addrspace(1)* nocapture %out, <4 x i32> addrspace(1)* nocapture %out2) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %call = tail call i64 @__mux_get_global_id(i32 0) #6
   %arrayidxa = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %a, i64 %call
   %arrayidxb = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %b, i64 %call
   %0 = load <4 x i32>, <4 x i32> addrspace(1)* %arrayidxa, align 4
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/insert_element.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/insert_element.ll
index 7622066dd9751..b8d714e5aa0ac 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/insert_element.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/insert_element.ll
@@ -24,11 +24,11 @@
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 define spir_kernel void @insert_element(<4 x float> addrspace(1)* nocapture readonly %in, float %val, i32 %idx, <4 x float> addrspace(1)* nocapture %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %call = tail call i64 @__mux_get_global_id(i32 0) #6
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x float> addrspace(1)*
   %1 = load <4 x float>, <4 x float> addrspace(1)* %0, align 16
@@ -40,7 +40,7 @@ entry:
 
 define spir_kernel void @insert_element_uniform(<4 x float> %in, float %val, i32 %idx, <4 x float> addrspace(1)* nocapture %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %call = tail call i64 @__mux_get_global_id(i32 0) #6
   %0 = insertelement <4 x float> %in, float %val, i32 %idx
   %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call
   store <4 x float> %0, <4 x float> addrspace(1)* %arrayidx3, align 4
@@ -49,7 +49,7 @@ entry:
 
 define spir_kernel void @insert_element_varying_indices(<4 x float> addrspace(1)* nocapture readonly %in, i32 addrspace(1)* %idxs, <4 x float> addrspace(1)* nocapture %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %call = tail call i64 @__mux_get_global_id(i32 0) #6
   %arrayidxidx = getelementptr inbounds i32, i32 addrspace(1)* %idxs, i64 %call
   %idx = load i32, i32 addrspace(1)* %arrayidxidx
   %i = urem i32 %idx, 4
@@ -65,7 +65,7 @@ entry:
 
 define spir_kernel void @insert_element_illegal(<32 x float> addrspace(1)* nocapture readonly %in, i32 addrspace(1)* %idxs, <32 x float> addrspace(1)* nocapture %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %call = tail call i64 @__mux_get_global_id(i32 0) #6
   %arrayidxidx = getelementptr inbounds i32, i32 addrspace(1)* %idxs, i64 %call
   %idx = load i32, i32 addrspace(1)* %arrayidxidx, align 4
   %i = urem i32 %idx, 32
@@ -81,7 +81,7 @@ entry:
 
 define spir_kernel void @insert_element_bool(<4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b, i32 %val, i32 %idx, <4 x i32> addrspace(1)* nocapture %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %call = tail call i64 @__mux_get_global_id(i32 0) #6
   %arrayidxa = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %a, i64 %call
   %arrayidxb = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %b, i64 %call
   %0 = load <4 x i32>, <4 x i32> addrspace(1)* %arrayidxa, align 4
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle.ll
index 9ec8dc44e0f7a..0de0d65443bf0 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle.ll
@@ -22,7 +22,7 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @f(<4 x i32> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
-  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %gid = call i64 @__mux_get_global_id(i32 0)
   %in.ptr = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in, i64 %gid
   %in.data = load <4 x i32>, <4 x i32> addrspace(1)* %in.ptr
   %out.data = shufflevector <4 x i32> %in.data, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -31,7 +31,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 ; It checks that a single-operand shuffle that doesn't change the length is packetized to a gather intrinsic.
 ; CHECK: define spir_kernel void @__vecz_nxv4_f({{.*}}) {{.*}} {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_bool.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_bool.ll
index 13e5ee7a26fe9..85d9a06556440 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_bool.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_bool.ll
@@ -22,7 +22,7 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @f(<4 x i32> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
-  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %gid = call i64 @__mux_get_global_id(i32 0)
   %in.ptr = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in, i64 %gid
   %in.data = load <4 x i32>, <4 x i32> addrspace(1)* %in.ptr
   %in.bool = icmp ne <4 x i32> %in.data, zeroinitializer
@@ -33,7 +33,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 ; It checks that a single-operand shuffle that doesn't change the length is packetized to a gather intrinsic,
 ; and that it works with a vector of i1 type by temporarily extending to i8.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_concat.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_concat.ll
index b991ee826745a..86d8e27d53d21 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_concat.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_concat.ll
@@ -22,7 +22,7 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @f(<2 x i32> addrspace(1)* %a, <2 x i32> addrspace(1)* %b, <4 x i32> addrspace(1)* %out) {
 entry:
-  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %gid = call i64 @__mux_get_global_id(i32 0)
   %a.ptr = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %a, i64 %gid
   %a.data = load <2 x i32>, <2 x i32> addrspace(1)* %a.ptr
   %b.ptr = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %b, i64 %gid
@@ -33,7 +33,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 ; It checks that a two-operand shuffle is packetized to a gather intrinsics and a select.
 ; CHECK: define spir_kernel void @__vecz_nxv4_f({{.*}}) {{.*}} {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_narrow.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_narrow.ll
index 88d113dba469e..b890c711027dc 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_narrow.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_narrow.ll
@@ -22,7 +22,7 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @f(<4 x i32> addrspace(1)* %in, <2 x i32> addrspace(1)* %out) {
 entry:
-  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %gid = call i64 @__mux_get_global_id(i32 0)
   %in.ptr = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in, i64 %gid
   %in.data = load <4 x i32>, <4 x i32> addrspace(1)* %in.ptr
   %out.data = shufflevector <4 x i32> %in.data, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
@@ -31,7 +31,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 ; It checks that a single-operand shuffle that narrows the vector is packetized to a gather intrinsic.
 ; CHECK: define spir_kernel void @__vecz_nxv4_f({{.*}}) {{.*}} {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_wider.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_wider.ll
index 66fdd408918a7..41e4e99f8f060 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_wider.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_wider.ll
@@ -22,7 +22,7 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @f(<2 x i32> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
-  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %gid = call i64 @__mux_get_global_id(i32 0)
   %in.ptr = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %in, i64 %gid
   %in.data = load <2 x i32>, <2 x i32> addrspace(1)* %in.ptr
   %out.data = shufflevector <2 x i32> %in.data, <2 x i32> undef, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
@@ -31,7 +31,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 ; It checks that a single-operand shuffle that widens the vector is packetized to a gather intrinsic.
 ; CHECK: define spir_kernel void @__vecz_nxv4_f({{.*}}) {{.*}} {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/select_scalar_vector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/select_scalar_vector.ll
index 6989dc26cf54d..8628498656dd6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/select_scalar_vector.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/select_scalar_vector.ll
@@ -20,11 +20,11 @@
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 define spir_kernel void @select_scalar_vector(i32* %aptr, i32* %bptr, <2 x i32>* %cptr, <2 x i32>* %zptr) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
   %arrayidxb = getelementptr inbounds i32, i32* %bptr, i64 %idx
   %arrayidxc = getelementptr inbounds <2 x i32>, <2 x i32>* %cptr, i64 %idx
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_memops.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_memops.ll
index 7586c9ef55d37..f24052cb60f26 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_memops.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_memops.ll
@@ -27,7 +27,7 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @store_element(i32 %0, i32 addrspace(1)* %b) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %cond = icmp ne i64 %call, 0
   br i1 %cond, label %do, label %ret
 
@@ -62,7 +62,7 @@ ret:
 
 define spir_kernel void @load_element(i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %cond = icmp ne i64 %call, 0
   br i1 %cond, label %do, label %ret
 
@@ -97,4 +97,4 @@ ret:
 ; CHECK-LOAD-16-NEXT: [[TMP7:%.*]] = call <vscale x 16 x i32> @llvm.masked.load.nxv16i32.p1(ptr addrspace(1) [[TMP0]], i32 4, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i32> {{undef|poison}})
 ; CHECK-LOAD-16-NEXT: ret <vscale x 16 x i32> [[TMP7]]
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_vsetvli.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_vsetvli.ll
index 1df7553104e6f..64b179504e2f2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_vsetvli.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_vsetvli.ll
@@ -21,11 +21,11 @@
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 define spir_kernel void @load_add_store(i32* %aptr, i32* %bptr, i32* %zptr) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
   %arrayidxb = getelementptr inbounds i32, i32* %bptr, i64 %idx
   %arrayidxz = getelementptr inbounds i32, i32* %zptr, i64 %idx
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll
index 41c9ec2762099..3eda351c2d3c2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll
@@ -20,11 +20,11 @@
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 define dso_local spir_kernel void @vector_broadcast_const(<4 x float> addrspace(1)* nocapture readonly %in, <4 x float> addrspace(1)* nocapture %out) local_unnamed_addr #0 {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %call = tail call i64 @__mux_get_global_id(i32 0) #6
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x float> addrspace(1)*
   %1 = load <4 x float>, <4 x float> addrspace(1)* %0, align 16
@@ -36,7 +36,7 @@ entry:
 
 define dso_local spir_kernel void @vector_broadcast(<4 x float> addrspace(1)* nocapture readonly %in, <4 x float> %addend, <4 x float> addrspace(1)* nocapture %out) local_unnamed_addr #0 {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %call = tail call i64 @__mux_get_global_id(i32 0) #6
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x float> addrspace(1)*
   %1 = load <4 x float>, <4 x float> addrspace(1)* %0, align 16
@@ -48,7 +48,7 @@ entry:
 
 define dso_local spir_kernel void @vector_broadcast_regression(<4 x float> addrspace(1)* nocapture readonly %in, i32 %nancode, <4 x float> addrspace(1)* nocapture %out) local_unnamed_addr #0 {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %call = tail call i64 @__mux_get_global_id(i32 0) #6
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x i32> addrspace(1)*
   %1 = load <4 x i32>, <4 x i32> addrspace(1)* %0, align 16
@@ -68,7 +68,7 @@ entry:
 define dso_local spir_kernel void @vector_broadcast_insertpt(<4 x float> addrspace(1)* nocapture readonly %in, <4 x float> %addend, i32 %nancode, <4 x float> addrspace(1)* nocapture %out, <4 x i32> addrspace(1)* nocapture %out2) local_unnamed_addr #0 {
 entry:
   %existing.alloc = alloca <4 x i32>
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %call = tail call i64 @__mux_get_global_id(i32 0) #6
   store <4 x i32> zeroinitializer, <4 x i32>* %existing.alloc
   %scalar = bitcast <4 x i32>* %existing.alloc to i32*
   store i32 1, i32* %scalar
@@ -86,7 +86,7 @@ entry:
 
 define dso_local spir_kernel void @vector_mask_broadcast(<4 x float> addrspace(1)* nocapture readonly %in, <4 x i1> %input, <4 x float> %woof, <4 x float> addrspace(1)* nocapture %out) local_unnamed_addr #0 {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %call = tail call i64 @__mux_get_global_id(i32 0) #6
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x float> addrspace(1)*
   %1 = load <4 x float>, <4 x float> addrspace(1)* %0, align 16
@@ -99,7 +99,7 @@ entry:
 }
 ; CHECK-LABEL: @__vecz_nxv4_vector_broadcast_const(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CALL:%.*]] = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0)
 ; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]]
 ; CHECK-NEXT:    store <vscale x 16 x float> shufflevector (<vscale x 16 x float> insertelement (<vscale x 16 x float> {{(undef|poison)}}, float 0x7FF8000020000000, {{(i32|i64)}} 0), <vscale x 16 x float> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer), ptr addrspace(1) [[ARRAYIDX3]], align 16
 ; CHECK-NEXT:    ret void
@@ -113,7 +113,7 @@ entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = {{s|z}}ext <vscale x 16 x i32> [[IDX1]] to <vscale x 16 x i64>
 ; CHECK-NEXT:    [[VEC_ALLOC:%.*]] = getelementptr inbounds float, ptr [[FIXLEN_ALLOC]], <vscale x 16 x i64> [[TMP0]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 16 x float> @llvm.masked.gather.nxv16f32.nxv16p0(<vscale x 16 x ptr> [[VEC_ALLOC]], i32 4, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, {{(i32|i64)}} 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x float> undef)
-; CHECK-NEXT:    [[CALL:%.*]] = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0)
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <vscale x 16 x float>, ptr addrspace(1) [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = fadd <vscale x 16 x float> [[TMP3]], [[TMP1]]
@@ -123,7 +123,7 @@ entry:
 
 ; CHECK-LABEL: @__vecz_nxv4_vector_broadcast_regression(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CALL:%.*]] = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0)
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 16 x i32>, ptr addrspace(1) [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    [[AND1_I_I_I1_I1:%.*]] = and <vscale x 16 x i32> [[TMP1]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 2139095040, {{i32|i64}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
@@ -149,7 +149,7 @@ entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = {{s|z}}ext <vscale x 16 x i32> [[IDX14]] to <vscale x 16 x i64>
 ; CHECK-NEXT:    [[VEC_ALLOC5:%.*]] = getelementptr inbounds float, ptr [[FIXLEN_ALLOC1]], <vscale x 16 x i64> [[TMP0]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 16 x float> @llvm.masked.gather.nxv16f32.nxv16p0(<vscale x 16 x ptr> [[VEC_ALLOC5]], i32 4, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, {{i32|i64}} 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x float> {{(undef|poison)}})
-; CHECK-NEXT:    [[CALL:%.*]] = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0)
 ; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr [[EXISTING_ALLOC]], align 16
 ; CHECK-NEXT:    store i32 1, ptr [[EXISTING_ALLOC]], align 16
 ; CHECK-NEXT:    [[V:%.*]] = load <4 x i32>, ptr [[EXISTING_ALLOC]], align 16
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/builtins.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/builtins.ll
index ef7170da1c771..0bfd6536581a6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/builtins.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/builtins.ll
@@ -21,7 +21,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @builtins(float* %aptr, float* %bptr, i32* %zptr) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidxa = getelementptr inbounds float, float* %aptr, i64 %idx
   %arrayidxb = getelementptr inbounds float, float* %bptr, i64 %idx
   %arrayidxz = getelementptr inbounds i32, i32* %zptr, i64 %idx
@@ -32,7 +32,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 declare spir_func i32 @_Z9isgreaterff(float, float)
 
 ; CHECK: void @__vecz_nxv4_builtins
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/cast.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/cast.ll
index 4d0dee32a4f31..66d1abccd24bc 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/cast.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/cast.ll
@@ -21,7 +21,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @cast(i32* %aptr, float* %zptr) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
   %arrayidxz = getelementptr inbounds float, float* %zptr, i64 %idx
   %a = load i32, i32* %arrayidxa, align 4
@@ -32,4 +32,4 @@ entry:
 
 ; CHECK: define spir_kernel void @__vecz_nxv8_cast
 ; CHECK: sitofp <vscale x 8 x i32> {{%[0-9]+}} to <vscale x 8 x float>
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store.ll
index db32d7c162f6e..1d8a25b9fc14f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store.ll
@@ -22,11 +22,11 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %add.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call
   %.cast = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %add.ptr, i64 0, i64 0
   %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
-  call spir_func void @_Z7barrierj(i32 2)
+  call void @__mux_work_group_barrier(i32 0, i32 2, i32 528)
   store double 1.600000e+01, double addrspace(1)* %.cast, align 8
   %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
   %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
@@ -46,9 +46,9 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
-declare spir_func void @_Z7barrierj(i32) #1
+declare void @__mux_work_group_barrier(i32, i32, i32) #1
 
 ; Function Attrs: nounwind readnone
 declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) #2
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store_as_masked.ll
index 25465d1dc70d7..8954814274853 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store_as_masked.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store_as_masked.ll
@@ -22,11 +22,11 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %add.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call
   %.cast = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %add.ptr, i64 0, i64 0
   %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
-  call spir_func void @_Z7barrierj(i32 2)
+  call void @__mux_work_group_barrier(i32 0, i32 2, i32 528)
   store double 1.600000e+01, double addrspace(1)* %.cast, align 8
   %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
   %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
@@ -46,9 +46,9 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
-declare spir_func void @_Z7barrierj(i32)
+declare void @__mux_work_group_barrier(i32, i32, i32)
 
 declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>)
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_load.ll
index ea9851f0a8d01..98b7a2580137d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_load.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_load.ll
@@ -22,13 +22,13 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @dont_mask_workitem_builtins(i32 addrspace(2)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z12get_local_idj(i32 0)
+  %call = call i64 @__mux_get_local_id(i32 0)
   %conv = trunc i64 %call to i32
   %cmp = icmp sgt i32 %conv, 0
   br i1 %cmp, label %if.then, label %if.else
 
 if.then:                                          ; preds = %entry
-  %call2 = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call2 = call i64 @__mux_get_global_id(i32 0)
   %conv3 = trunc i64 %call2 to i32
   %idxprom = sext i32 %conv3 to i64
   %arrayidx = getelementptr inbounds i32, i32 addrspace(2)* %in, i64 %idxprom
@@ -39,8 +39,8 @@ if.then:                                          ; preds = %entry
   br label %if.end
 
 if.else:                                          ; preds = %entry
-  %call8 = call spir_func i64 @_Z14get_local_sizej(i32 0)
-  %call9 = call spir_func i64 @_Z12get_group_idj(i32 0)
+  %call8 = call i64 @__mux_get_local_size(i32 0)
+  %call9 = call i64 @__mux_get_group_id(i32 0)
   %mul = mul i64 %call9, %call8
   %add = add i64 %mul, %call
   %sext = shl i64 %add, 32
@@ -53,15 +53,15 @@ if.end:                                           ; preds = %if.else, %if.then
   ret void
 }
 
-declare spir_func void @_Z7barrierj(i32)
+declare void @__mux_work_group_barrier(i32, i32, i32)
 
-declare spir_func i64 @_Z12get_local_idj(i32)
+declare i64 @__mux_get_local_id(i32)
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
-declare spir_func i64 @_Z14get_local_sizej(i32)
+declare i64 @__mux_get_local_size(i32)
 
-declare spir_func i64 @_Z12get_group_idj(i32)
+declare i64 @__mux_get_group_id(i32)
 
 ; Test if the masked load is defined correctly
 ; CHECK: define <vscale x 4 x i32> @__vecz_b_masked_load4_u5nxv4ju3ptrU3AS2u5nxv4b(ptr addrspace(2){{( %0)?}}, <vscale x 4 x i1>{{( %1)?}})
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_scatter_gather.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_scatter_gather.ll
index 13a5a35cc17af..bef3ee020dc97 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_scatter_gather.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_scatter_gather.ll
@@ -22,7 +22,7 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @masked_scatter(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %b_index) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %rem = urem i64 %call, 3
   %cmp = icmp eq i64 %rem, 0
   br i1 %cmp, label %if.else, label %if.then
@@ -51,7 +51,7 @@ if.end:                                           ; preds = %if.else, %if.then
 
 define spir_kernel void @masked_gather(i32 addrspace(1)* %a, i32 addrspace(1)* %a_index, i32 addrspace(1)* %b) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %rem = urem i64 %call, 3
   %cmp = icmp eq i64 %rem, 0
   br i1 %cmp, label %if.else, label %if.then
@@ -75,7 +75,7 @@ if.end:                                           ; preds = %if.else, %if.then
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; Test if the masked scatter store is defined correctly
 ; CHECK: define void @__vecz_b_masked_scatter_store4_u5nxv4ju14nxv4u3ptrU3AS1u5nxv4b(<vscale x 4 x i32>{{( %0)?}}, <vscale x 4 x ptr addrspace(1)>{{( %1)?}}, <vscale x 4 x i1>{{( %2)?}})
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/extract_element.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/extract_element.ll
index e64f8ef91b4b8..d90c4e788f336 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/extract_element.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/extract_element.ll
@@ -24,11 +24,11 @@
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 define spir_kernel void @extract_element(<4 x float> addrspace(1)* nocapture readonly %in, i32 %idx, float addrspace(1)* nocapture %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %call = tail call i64 @__mux_get_global_id(i32 0) #6
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x float> addrspace(1)*
   %1 = load <4 x float>, <4 x float> addrspace(1)* %0, align 16
@@ -40,7 +40,7 @@ entry:
 
 define spir_kernel void @extract_element_uniform(<4 x float> %in, i32 %idx, float addrspace(1)* nocapture %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %call = tail call i64 @__mux_get_global_id(i32 0) #6
   %0 = extractelement <4 x float> %in, i32 %idx
   %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call
   store float %0, float addrspace(1)* %arrayidx3, align 4
@@ -49,7 +49,7 @@ entry:
 
 define spir_kernel void @extract_element_uniform_vec(<4 x float> %in, float addrspace(1)* nocapture %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %call = tail call i64 @__mux_get_global_id(i32 0) #6
   %i = urem i64 %call, 4
   %0 = extractelement <4 x float> %in, i64 %i
   %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call
@@ -59,7 +59,7 @@ entry:
 
 define spir_kernel void @extract_element_varying_indices(<4 x float> addrspace(1)* %in, i32 addrspace(1)* %idxs, float addrspace(1)* nocapture %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %arrayidxidx = getelementptr inbounds i32, i32 addrspace(1)* %idxs, i64 %call
   %idx = load i32, i32 addrspace(1)* %arrayidxidx
   %i = urem i32 %idx, 4
@@ -73,7 +73,7 @@ entry:
 
 define spir_kernel void @extract_element_bool(<4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b, i32 %idx, i32 addrspace(1)* nocapture %out, <4 x i32> addrspace(1)* nocapture %out2) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %call = tail call i64 @__mux_get_global_id(i32 0) #6
   %arrayidxa = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %a, i64 %call
   %arrayidxb = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %b, i64 %call
   %0 = load <4 x i32>, <4 x i32> addrspace(1)* %arrayidxa, align 4
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/fadd.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/fadd.ll
index dd5ece32f6ba6..2c1c76a4c47b1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/fadd.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/fadd.ll
@@ -21,7 +21,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @fadd(float* %aptr, float* %bptr, float* %zptr) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidxa = getelementptr inbounds float, float* %aptr, i64 %idx
   %arrayidxb = getelementptr inbounds float, float* %bptr, i64 %idx
   %arrayidxz = getelementptr inbounds float, float* %zptr, i64 %idx
@@ -37,4 +37,4 @@ entry:
 ; CHECK: load <vscale x 4 x float>, ptr
 ; CHECK: fadd <vscale x 4 x float>
 ; CHECK: store <vscale x 4 x float>
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/fail_builtins.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/fail_builtins.ll
index adbc320138b63..78538bb0832e2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/fail_builtins.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/fail_builtins.ll
@@ -21,7 +21,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @fail_builtins(float* %aptr, float* %zptr) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidxa = getelementptr inbounds float, float* %aptr, i64 %idx
   %arrayidxz = getelementptr inbounds float, float* %zptr, i64 %idx
   %a = load float, float* %arrayidxa, align 4
@@ -30,7 +30,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 declare spir_func float @_Z4tanff(float)
 
 ; We can't scalarize this builtin call
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/insert_element.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/insert_element.ll
index cef7039e66069..8722870800f35 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/insert_element.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/insert_element.ll
@@ -23,11 +23,11 @@
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 define spir_kernel void @insert_element(<4 x float> addrspace(1)* nocapture readonly %in, float %val, i32 %idx, <4 x float> addrspace(1)* nocapture %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %call = tail call i64 @__mux_get_global_id(i32 0) #6
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x float> addrspace(1)*
   %1 = load <4 x float>, <4 x float> addrspace(1)* %0, align 16
@@ -39,7 +39,7 @@ entry:
 
 define spir_kernel void @insert_element_uniform(<4 x float> %in, float %val, i32 %idx, <4 x float> addrspace(1)* nocapture %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %call = tail call i64 @__mux_get_global_id(i32 0) #6
   %0 = insertelement <4 x float> %in, float %val, i32 %idx
   %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call
   store <4 x float> %0, <4 x float> addrspace(1)* %arrayidx3, align 4
@@ -48,7 +48,7 @@ entry:
 
 define spir_kernel void @insert_element_varying_indices(<4 x float> addrspace(1)* nocapture readonly %in, i32 addrspace(1)* %idxs, <4 x float> addrspace(1)* nocapture %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %call = tail call i64 @__mux_get_global_id(i32 0) #6
   %arrayidxidx = getelementptr inbounds i32, i32 addrspace(1)* %idxs, i64 %call
   %idx = load i32, i32 addrspace(1)* %arrayidxidx
   %i = urem i32 %idx, 4
@@ -64,7 +64,7 @@ entry:
 
 define spir_kernel void @insert_element_bool(<4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b, i32 %val, i32 %idx, <4 x i32> addrspace(1)* nocapture %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #6
+  %call = tail call i64 @__mux_get_global_id(i32 0) #6
   %arrayidxa = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %a, i64 %call
   %arrayidxb = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %b, i64 %call
   %0 = load <4 x i32>, <4 x i32> addrspace(1)* %arrayidxa, align 4
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/interleaved_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/interleaved_load.ll
index e92b693849c67..5b71067b8d4a0 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/interleaved_load.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/interleaved_load.ll
@@ -23,7 +23,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: convergent nounwind
 define spir_kernel void @load_interleaved(i32 addrspace(1)* nocapture readonly %input, i32 addrspace(1)* nocapture %output, i32 %stride) local_unnamed_addr {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = tail call i64 @__mux_get_global_id(i32 0) #2
   %0 = trunc i64 %call to i32
   %conv1 = mul i32 %0, %stride
   %idxprom = sext i32 %conv1 to i64
@@ -42,7 +42,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; CHECK: define void @__vecz_b_interleaved_store4_V_u5nxv4ju3ptrU3AS1(<vscale x 4 x i32> [[ARG0:%.*]], ptr addrspace(1) [[ARG1:%.*]], i64 [[ARG2:%.*]]) {
 ; CHECK-NEXT: entry:
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/intrinsics.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/intrinsics.ll
index b9adf794a036b..e9c1dfd32d483 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/intrinsics.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/intrinsics.ll
@@ -24,7 +24,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @ctpop(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
   %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx
   %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
@@ -40,7 +40,7 @@ entry:
 
 define spir_kernel void @ctlz(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
   %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx
   %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
@@ -56,7 +56,7 @@ entry:
 
 define spir_kernel void @cttz(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
   %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx
   %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
@@ -72,7 +72,7 @@ entry:
 
 define spir_kernel void @sadd_sat(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
   %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
   %a = load i32, i32* %arrayidxa, align 4
@@ -90,7 +90,7 @@ entry:
 
 define spir_kernel void @uadd_sat(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
   %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
   %a = load i32, i32* %arrayidxa, align 4
@@ -108,7 +108,7 @@ entry:
 
 define spir_kernel void @ssub_sat(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
   %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
   %a = load i32, i32* %arrayidxa, align 4
@@ -126,7 +126,7 @@ entry:
 
 define spir_kernel void @usub_sat(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
   %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
   %a = load i32, i32* %arrayidxa, align 4
@@ -163,7 +163,7 @@ declare <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8>, <2 x i8>)
 declare i32 @llvm.usub.sat.i32(i32, i32)
 declare <2 x i8> @llvm.usub.sat.v2i8(<2 x i8>, <2 x i8>)
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; CTPOP: void @__vecz_nxv2_ctpop
 ; CTPOP: = call <vscale x 2 x i32> @llvm.ctpop.nxv2i32(<vscale x 2 x i32> %{{.*}})
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/load_add_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/load_add_store.ll
index ccf6ee943da0c..eb9d20e6486ce 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/load_add_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/load_add_store.ll
@@ -21,7 +21,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @load_add_store(i32* %aptr, i32* %bptr, i32* %zptr) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
   %arrayidxb = getelementptr inbounds i32, i32* %bptr, i64 %idx
   %arrayidxz = getelementptr inbounds i32, i32* %zptr, i64 %idx
@@ -37,4 +37,4 @@ entry:
 ; CHECK: [[rhs:%[0-9a-z]+]] = load <vscale x 4 x i32>, ptr
 ; CHECK: [[sum:%[0-9a-z]+]] = add <vscale x 4 x i32> [[lhs]], [[rhs]]
 ; CHECK: store <vscale x 4 x i32> [[sum]],
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/load_binops_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/load_binops_store.ll
index 407fdb382db38..d1bc2db6f979b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/load_binops_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/load_binops_store.ll
@@ -21,7 +21,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @load_binops_store(i32* %aptr, i32* %bptr, i32* %cptr, i32* %zptr) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
   %arrayidxb = getelementptr inbounds i32, i32* %bptr, i64 %idx
   %arrayidxc = getelementptr inbounds i32, i32* %cptr, i64 %idx
@@ -44,4 +44,4 @@ entry:
 ; CHECK: mul <vscale x 4 x i32>
 ; CHECK: ashr <vscale x 4 x i32>
 ; CHECK: store <vscale x 4 x i32>
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/metadata.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/metadata.ll
index 2a2b243c4d2e0..d59356adddd4b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/metadata.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/metadata.ll
@@ -19,11 +19,11 @@
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
 target triple = "spir-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 define spir_kernel void @test(i32 addrspace(1)* %in) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %load = load i32, i32 addrspace(1)* %in
   %slot = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idx
   store i32 %load, i32 addrspace(1)* %slot
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/packetize_mask_varying.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/packetize_mask_varying.ll
index 8fa603f2c229f..df0d0db831545 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/packetize_mask_varying.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/packetize_mask_varying.ll
@@ -24,7 +24,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 ; a single varying splatted bit.
 define spir_kernel void @mask_varying(<4 x i32>* %aptr, <4 x i32>* %zptr) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %mod_idx = urem i64 %idx, 2
   %arrayidxa = getelementptr inbounds <4 x i32>, <4 x i32>* %aptr, i64 %idx
   %ins = insertelement <4 x i1> undef, i1 true, i32 0
@@ -52,5 +52,5 @@ if.end:
 
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 declare <4 x i32> @__vecz_b_masked_load4_Dv4_jPDv4_jDv4_b(<4 x i32>*, <4 x i1>)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/scalable_auto.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/scalable_auto.ll
index cf9183419ed7c..af4773e3fb058 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/scalable_auto.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/scalable_auto.ll
@@ -21,7 +21,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @cast(i32* %aptr, float* %zptr) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
   %arrayidxz = getelementptr inbounds float, float* %zptr, i64 %idx
   %a = load i32, i32* %arrayidxa, align 4
@@ -34,4 +34,4 @@ entry:
 ; appropriate scalable vectorization factor.
 ; CHECK: define spir_kernel void @__vecz_nxv[[VF:[0-9]+]]_cast
 ; CHECK: sitofp <vscale x [[VF]] x i32> {{%[0-9]+}} to <vscale x [[VF]] x float>
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select.ll
index 78467dd58f1bf..f6a8addb32062 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select.ll
@@ -21,7 +21,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @select_scalar_scalar(i32* %aptr, i32* %bptr, i32* %zptr) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
   %arrayidxb = getelementptr inbounds i32, i32* %bptr, i64 %idx
   %arrayidxz = getelementptr inbounds i32, i32* %zptr, i64 %idx
@@ -35,7 +35,7 @@ entry:
 
 define spir_kernel void @select_vector_vector(<2 x i32>* %aptr, <2 x i32>* %bptr, <2 x i32>* %cptr, <2 x i32>* %zptr) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidxa = getelementptr inbounds <2 x i32>, <2 x i32>* %aptr, i64 %idx
   %arrayidxb = getelementptr inbounds <2 x i32>, <2 x i32>* %bptr, i64 %idx
   %arrayidxc = getelementptr inbounds <2 x i32>, <2 x i32>* %cptr, i64 %idx
@@ -49,7 +49,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; CHECK: define spir_kernel void @__vecz_nxv4_select_scalar_scalar
 ; CHECK: [[lhs:%[0-9a-z]+]] = load <vscale x 4 x i32>, ptr
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select_scalar_vector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select_scalar_vector.ll
index c6e1be52f087e..85dd9e30fe409 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select_scalar_vector.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select_scalar_vector.ll
@@ -20,11 +20,11 @@
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 define spir_kernel void @select_scalar_vector(i32* %aptr, i32* %bptr, <2 x i32>* %cptr, <2 x i32>* %zptr) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
   %arrayidxb = getelementptr inbounds i32, i32* %bptr, i64 %idx
   %arrayidxc = getelementptr inbounds <2 x i32>, <2 x i32>* %cptr, i64 %idx
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/shuffle.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/shuffle.ll
index b3fdbd4fccc4d..3cdae586545ad 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/shuffle.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/shuffle.ll
@@ -21,7 +21,7 @@ target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @do_shuffle_splat(i32* %aptr, <4 x i32>* %bptr, <4 x i32>* %zptr) {
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
   %arrayidxb = getelementptr inbounds <4 x i32>, <4 x i32>* %bptr, i64 %idx
   %a = load i32, i32* %arrayidxa, align 4
@@ -45,7 +45,7 @@ define spir_kernel void @do_shuffle_splat(i32* %aptr, <4 x i32>* %bptr, <4 x i32
 }
 
 define spir_kernel void @do_shuffle_splat_uniform(i32 %a, <4 x i32>* %bptr, <4 x i32>* %zptr) {
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidxb = getelementptr inbounds <4 x i32>, <4 x i32>* %bptr, i64 %idx
   %b = load <4 x i32>, <4 x i32>* %arrayidxb, align 16
   %insert = insertelement <4 x i32> undef, i32 %a, i32 0
@@ -59,4 +59,4 @@ define spir_kernel void @do_shuffle_splat_uniform(i32 %a, <4 x i32>* %bptr, <4 x
 ; CHECK: store <vscale x 16 x i32> [[splat]], ptr
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/vectors.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/vectors.ll
index ea02a0334c436..9ce170d82a36c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/vectors.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/vectors.ll
@@ -21,7 +21,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @load_add_store(<4 x i32>* %aptr, <4 x i32>* %bptr, <4 x i32>* %zptr) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidxa = getelementptr inbounds <4 x i32>, <4 x i32>* %aptr, i64 %idx
   %arrayidxb = getelementptr inbounds <4 x i32>, <4 x i32>* %bptr, i64 %idx
   %arrayidxz = getelementptr inbounds <4 x i32>, <4 x i32>* %zptr, i64 %idx
@@ -32,7 +32,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; CHECK: define spir_kernel void @__vecz_nxv4_load_add_store
 ; CHECK: [[lhs:%[0-9a-z]+]] = load <vscale x 16 x i32>, ptr
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/verification_fail_phi.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/verification_fail_phi.ll
index e258b0200e3ea..15c13446b3762 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/verification_fail_phi.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/verification_fail_phi.ll
@@ -21,11 +21,11 @@
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 define spir_kernel void @regression_phis(i64 addrspace(1)* %xs, i64 addrspace(1)* %ys, i32 addrspace(1)* %out, i64 %lim) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx.x = getelementptr inbounds i64, i64 addrspace(1)* %xs, i64 %call
   %x = load i64, i64 addrspace(1)* %arrayidx.x, align 4
   %cond = icmp eq i64 %call, 0
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/widen_vload.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/widen_vload.ll
index 77ecb04b3605f..f223a95ddec14 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/widen_vload.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/widen_vload.ll
@@ -21,7 +21,7 @@ target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @widen_vload(<4 x i32>* %aptr, <4 x i32>* %zptr) {
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %mod_idx = urem i64 %idx, 2
   %arrayidxa = getelementptr inbounds <4 x i32>, <4 x i32>* %aptr, i64 %mod_idx
   %v = load <4 x i32>, <4 x i32>* %arrayidxa, align 16
@@ -32,4 +32,4 @@ define spir_kernel void @widen_vload(<4 x i32>* %aptr, <4 x i32>* %zptr) {
 ; CHECK: %v4 = call <vscale x 16 x i32> @__vecz_b_gather_load16_u6nxv16ju10nxv16u3ptr(<vscale x 16 x ptr> %{{.*}})
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/workitem_funcs.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/workitem_funcs.ll
index f83882767ee4a..30d440b5ea5e2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/workitem_funcs.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/workitem_funcs.ll
@@ -25,7 +25,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @store_ult(i32* %out, i64* %N) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = tail call i64 @__mux_get_global_id(i32 0) #2
   %0 = load i64, i64* %N, align 8
   %cmp = icmp ult i64 %call, %0
   %conv = zext i1 %cmp to i32
@@ -34,7 +34,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; CHECK: define spir_kernel void @__vecz_nxv4_store_ult
 ; CHECK:   [[step:%[0-9.a-z]+]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/boscc_reduction.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/boscc_reduction.ll
index ba9b82e6708e6..c8393af17a5f0 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/boscc_reduction.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/boscc_reduction.ll
@@ -20,11 +20,11 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 define spir_kernel void @foo(float addrspace(1)* nocapture readonly %a, i32 addrspace(1)* nocapture %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = tail call i64 @__mux_get_global_id(i32 0) #2
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
   %cmp = fcmp oeq float %0, 0.000000e+00
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/choice.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/choice.ll
index 0e183941a1736..6a26896527739 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/choice.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/choice.ll
@@ -21,11 +21,11 @@
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 define spir_kernel void @foo(float* %aptr, float* %zptr) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidxa = getelementptr inbounds float, float* %aptr, i64 %idx
   %arrayidxz = getelementptr inbounds float, float* %zptr, i64 %idx
   %a = load float, float* %arrayidxa, align 4
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_interleaved_load_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_interleaved_load_store.ll
index 6b9bf54fe81b6..fd80c369026c7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_interleaved_load_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_interleaved_load_store.ll
@@ -22,7 +22,7 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %add.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call
   %.cast = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %add.ptr, i64 0, i64 0
   %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
@@ -45,9 +45,9 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
-declare spir_func void @_Z7barrierj(i32) #1
+declare void @__mux_work_group_barrier(i32, i32, i32) #1
 
 ; Function Attrs: nounwind readnone
 declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) #2
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_load_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_load_store.ll
index 9fc81ca11e898..210a95872cdd5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_load_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_load_store.ll
@@ -24,13 +24,13 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @dont_mask_workitem_builtins(i32 addrspace(2)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z12get_local_idj(i32 0)
+  %call = call i64 @__mux_get_local_id(i32 0)
   %conv = trunc i64 %call to i32
   %cmp = icmp sgt i32 %conv, 0
   br i1 %cmp, label %if.then, label %if.else
 
 if.then:                                          ; preds = %entry
-  %call2 = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call2 = call i64 @__mux_get_global_id(i32 0)
   %conv3 = trunc i64 %call2 to i32
   %idxprom = sext i32 %conv3 to i64
   %arrayidx = getelementptr inbounds i32, i32 addrspace(2)* %in, i64 %idxprom
@@ -41,8 +41,8 @@ if.then:                                          ; preds = %entry
   br label %if.end
 
 if.else:                                          ; preds = %entry
-  %call8 = call spir_func i64 @_Z14get_local_sizej(i32 0)
-  %call9 = call spir_func i64 @_Z12get_group_idj(i32 0)
+  %call8 = call i64 @__mux_get_local_size(i32 0)
+  %call9 = call i64 @__mux_get_group_id(i32 0)
   %mul = mul i64 %call9, %call8
   %add = add i64 %mul, %call
   %sext = shl i64 %add, 32
@@ -55,13 +55,13 @@ if.end:                                           ; preds = %if.else, %if.then
   ret void
 }
 
-declare spir_func i64 @_Z12get_local_idj(i32)
+declare i64 @__mux_get_local_id(i32)
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
-declare spir_func i64 @_Z14get_local_sizej(i32)
+declare i64 @__mux_get_local_size(i32)
 
-declare spir_func i64 @_Z12get_group_idj(i32)
+declare i64 @__mux_get_group_id(i32)
 
 ; Test if the masked store is defined correctly
 ; CHECK: define void @__vecz_b_masked_store4_vp_Dv4_ju3ptrU3AS1Dv4_bj(<4 x i32>{{( %0)?}}, ptr addrspace(1){{( %1)?}}, <4 x i1>{{( %2)?}}, i32{{( %3)?}}) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_scatter_gather.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_scatter_gather.ll
index 7374dda89f394..f05b6106c5032 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_scatter_gather.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_scatter_gather.ll
@@ -22,7 +22,7 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @masked_scatter(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %b_index) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %rem = urem i64 %call, 3
   %cmp = icmp eq i64 %rem, 0
   br i1 %cmp, label %if.else, label %if.then
@@ -57,7 +57,7 @@ if.end:                                           ; preds = %if.else, %if.then
 
 define spir_kernel void @masked_gather(i32 addrspace(1)* %a, i32 addrspace(1)* %a_index, i32 addrspace(1)* %b) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %rem = urem i64 %call, 3
   %cmp = icmp eq i64 %rem, 0
   br i1 %cmp, label %if.else, label %if.then
@@ -81,7 +81,7 @@ if.end:                                           ; preds = %if.else, %if.then
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; Test if the vector-predicated gather load is defined correctly
 ; CHECK: define <vscale x 4 x i32> @__vecz_b_masked_gather_load4_vp_u5nxv4ju14nxv4u3ptrU3AS1u5nxv4bj(<vscale x 4 x ptr addrspace(1)>{{( %0)?}}, <vscale x 4 x i1>{{( %1)?}}, i32{{( %2)?}})
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/load_add_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/load_add_store.ll
index 5e6c428dafa99..f364b93b90bf1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/load_add_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/load_add_store.ll
@@ -24,11 +24,11 @@
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 define spir_kernel void @load_add_store_i32(i32* %aptr, i32* %bptr, i32* %zptr) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
   %arrayidxb = getelementptr inbounds i32, i32* %bptr, i64 %idx
   %arrayidxz = getelementptr inbounds i32, i32* %zptr, i64 %idx
@@ -65,7 +65,7 @@ entry:
 
 define spir_kernel void @load_add_store_v4i32(<4 x i32>* %aptr, <4 x i32>* %bptr, <4 x i32>* %zptr) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidxa = getelementptr inbounds <4 x i32>, <4 x i32>* %aptr, i64 %idx
   %arrayidxb = getelementptr inbounds <4 x i32>, <4 x i32>* %bptr, i64 %idx
   %arrayidxz = getelementptr inbounds <4 x i32>, <4 x i32>* %zptr, i64 %idx
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/packetize_mask_varying.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/packetize_mask_varying.ll
index 0a72f44e6e7df..4e7ba7db75a99 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/packetize_mask_varying.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/packetize_mask_varying.ll
@@ -24,7 +24,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 ; a single varying splatted bit.
 define spir_kernel void @mask_varying(<4 x i32>* %aptr, <4 x i32>* %zptr) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %mod_idx = urem i64 %idx, 2
   %arrayidxa = getelementptr inbounds <4 x i32>, <4 x i32>* %aptr, i64 %idx
   %ins = insertelement <4 x i1> undef, i1 true, i32 0
@@ -49,5 +49,5 @@ if.end:
 ; CHECK: [[RESPLAT:%.*]] = shufflevector <4 x i1> [[REINS]], <4 x i1> poison, <4 x i32> zeroinitializer
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 declare <4 x i32> @__vecz_b_masked_load4_Dv4_jPDv4_jDv4_b(<4 x i32>*, <4 x i1>)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/scatter_gather.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/scatter_gather.ll
index c8a928a0197f5..1c4fccb05352e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/scatter_gather.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/scatter_gather.ll
@@ -20,12 +20,12 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; With VP all gathers become masked ones.
 define spir_kernel void @unmasked_gather(i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %rem = urem i64 %call, 3
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %rem
   %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
@@ -42,7 +42,7 @@ entry:
 ; With VP all scatters become masked ones.
 define spir_kernel void @unmasked_scatter(i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %rem = urem i64 %call, 3
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %call
   %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/udiv.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/udiv.ll
index 2ea12a5723483..7cd87a3cd55a1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/udiv.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/udiv.ll
@@ -21,11 +21,11 @@
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 define spir_kernel void @udiv(i32* %aptr, i32* %bptr, i32* %zptr) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
   %arrayidxb = getelementptr inbounds i32, i32* %bptr, i64 %idx
   %arrayidxz = getelementptr inbounds i32, i32* %zptr, i64 %idx
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load.ll
index ea35619a37f78..d86cefcb3adb5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load.ll
@@ -23,11 +23,11 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #3
+  %call = call i64 @__mux_get_global_id(i32 0) #3
   %add.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call
   %.cast = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %add.ptr, i64 0, i64 0
   %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
-  call spir_func void @_Z7barrierj(i32 2) #3
+  call void @__mux_work_group_barrier(i32 0, i32 2, i32 528) #3
   store double 1.600000e+01, double addrspace(1)* %.cast, align 8
   %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
   %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
@@ -47,9 +47,9 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
-declare spir_func void @_Z7barrierj(i32) #1
+declare void @__mux_work_group_barrier(i32, i32, i32) #1
 
 ; Function Attrs: nounwind readnone
 declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) #2
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load_as_masked.ll
index ea35619a37f78..d86cefcb3adb5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load_as_masked.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load_as_masked.ll
@@ -23,11 +23,11 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #3
+  %call = call i64 @__mux_get_global_id(i32 0) #3
   %add.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call
   %.cast = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %add.ptr, i64 0, i64 0
   %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
-  call spir_func void @_Z7barrierj(i32 2) #3
+  call void @__mux_work_group_barrier(i32 0, i32 2, i32 528) #3
   store double 1.600000e+01, double addrspace(1)* %.cast, align 8
   %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
   %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
@@ -47,9 +47,9 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
-declare spir_func void @_Z7barrierj(i32) #1
+declare void @__mux_work_group_barrier(i32, i32, i32) #1
 
 ; Function Attrs: nounwind readnone
 declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) #2
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/delete_packetized_memop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/delete_packetized_memop.ll
index 2fb59dc4ae9c4..280e4a912cf0f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/delete_packetized_memop.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/delete_packetized_memop.ll
@@ -23,7 +23,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @memop_loop_dep(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i32 %i, i32 %e) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   br label %for.cond
 
 for.cond:                                         ; preds = %for.inc, %entry
@@ -52,7 +52,7 @@ for.end:                                          ; preds = %for.cond
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 declare spir_func <4 x i32> @_Z6vload4mPKU3AS1i(i64, i32 addrspace(1)*)
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_constant_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_constant_index.ll
index 9a923d3653b6f..5a2d34b7553d4 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_constant_index.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_constant_index.ll
@@ -22,7 +22,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @extract_constant_index(<4 x i64> addrspace(1)* %in, i32 %x, i64 addrspace(1)* %out) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %arrayidx = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %in, i64 %call
   %0 = load <4 x i64>, <4 x i64> addrspace(1)* %arrayidx, align 4
   %vecext = extractelement <4 x i64> %0, i32 0;
@@ -31,7 +31,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 ; CHECK: define spir_kernel void @__vecz_v4_extract_constant_index
 ; CHECK: %[[LD:.+]] = load <16 x i64>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index.ll
index 7b956317f5ef1..33286c6dd1ec0 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index.ll
@@ -19,12 +19,12 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 ; Function Attrs: nounwind
 define spir_kernel void @extract_runtime_index(<4 x float> addrspace(1)* %in, i32 %x, float addrspace(1)* %out) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 4
   %vecext = extractelement <4 x float> %0, i32 %x
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index2.ll
index 5c96c9e4e30d0..3b48683b42ccd 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index2.ll
@@ -19,12 +19,12 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 ; Function Attrs: nounwind
 define spir_kernel void @extract_runtime_index(i32 addrspace(1)* %in, <4 x i8> %x, i8 addrspace(1)* %out) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
   %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
   %vecext = extractelement <4 x i8> %x, i32 %0
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index3.ll
index 5b2cc9cf58570..703a012103674 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index3.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index3.ll
@@ -19,12 +19,12 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 ; Function Attrs: nounwind
 define spir_kernel void @extract_runtime_index(<4 x float> addrspace(1)* %in, i32 addrspace(1)* %x, float addrspace(1)* %out) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %x, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 4
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_constant_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_constant_index.ll
index 587b067945e65..097d862c9735e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_constant_index.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_constant_index.ll
@@ -19,11 +19,11 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 define spir_kernel void @constant_index(<4 x i32>* %in, i32* %inval, <4 x i32>* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x i32>, <4 x i32>* %in, i64 %call
   %0 = load <4 x i32>, <4 x i32>* %arrayidx
   %arrayidx2 = getelementptr inbounds i32, i32* %inval, i64 %call
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_constant_index_constant_value.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_constant_index_constant_value.ll
index 4d9fdb9bdd646..86cefa3d69d25 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_constant_index_constant_value.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_constant_index_constant_value.ll
@@ -19,11 +19,11 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 define spir_kernel void @constant_index(<4 x i32>* %in, <4 x i32>* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x i32>, <4 x i32>* %in, i64 %call
   %0 = load <4 x i32>, <4 x i32>* %arrayidx
   %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32>* %out, i64 %call
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_runtime_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_runtime_index.ll
index 10c1641c7bc90..095175f4331a1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_runtime_index.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_runtime_index.ll
@@ -19,11 +19,11 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 define spir_kernel void @runtime_index(<4 x i32>* %in, <4 x i32>* %out, i32* %index) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x i32>, <4 x i32>* %in, i64 %call
   %0 = load <4 x i32>, <4 x i32>* %arrayidx
   %arrayidx1 = getelementptr inbounds <4 x i32>, <4 x i32>* %out, i64 %call
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/interleaved_safety.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/interleaved_safety.ll
index 23619cd2be9f5..5d0789d9dd23e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/interleaved_safety.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/interleaved_safety.ll
@@ -23,11 +23,11 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #3
+  %call = call i64 @__mux_get_global_id(i32 0) #3
   %add.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call
   %.cast = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %add.ptr, i64 0, i64 0
   %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
-  call spir_func void @_Z7barrierj(i32 2) #3
+  call void @__mux_work_group_barrier(i32 0, i32 2, i32 528) #3
   store double 1.600000e+01, double addrspace(1)* %.cast, align 8
   %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
   %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
@@ -47,9 +47,9 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
-declare spir_func void @_Z7barrierj(i32) #1
+declare void @__mux_work_group_barrier(i32, i32, i32) #1
 
 ; Function Attrs: nounwind readnone
 declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) #2
@@ -72,13 +72,13 @@ attributes #3 = { nobuiltin nounwind }
 
 ; Function start
 ; CHECK: define spir_kernel void @__vecz_v4_f
-; CHECK: call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: call i64 @__mux_get_global_id(i32 0)
 
 ; There should be exactly six vector loads and one store in the code
 ; CHECK: load <16 x double>
 
 ; And in between them there should be a barrier call
-; CHECK: call spir_func void @_Z7barrierj
+; CHECK: call void @__mux_work_group_barrier
 ; CHECK: call void @__vecz_b_interleaved_store8_4_Dv4_du3ptrU3AS1(<4 x double> <double 1.600000e+01, double 1.600000e+01, double 1.600000e+01, double 1.600000e+01>
 ; CHECK: load <16 x double>
 ; CHECK: load <16 x double>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isfiniteDv4_d.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isfiniteDv4_d.ll
index 2293e5aa88f37..05d077c8ae6ec 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isfiniteDv4_d.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isfiniteDv4_d.ll
@@ -19,12 +19,12 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 declare spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double>)
 
 define spir_kernel void @test_isfiniteDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
   %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
   %call1 = call spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double> %0)
@@ -34,7 +34,7 @@ entry:
 }
 
 ; CHECK: define spir_kernel void @__vecz_v4_test_isfiniteDv4_d
-; CHECK: call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: call i64 @__mux_get_global_id(i32 0)
 ; CHECK: and <16 x i64>
 ; CHECK: icmp slt <16 x i64>
 ; CHECK: sext <16 x i1>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isfiniteDv4_f.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isfiniteDv4_f.ll
index f60573c4019b2..cdf064834d133 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isfiniteDv4_f.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isfiniteDv4_f.ll
@@ -19,12 +19,12 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 declare spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float>)
 
 define spir_kernel void @test_isfiniteDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
   %call1 = call spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float> %0)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isinfDv4_d.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isinfDv4_d.ll
index 6a69d748bee4e..f9f8f4b3f260d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isinfDv4_d.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isinfDv4_d.ll
@@ -19,12 +19,12 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 declare spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double>)
 
 define spir_kernel void @test_isinfDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
   %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
   %call1 = call spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double> %0)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isinfDv4_f.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isinfDv4_f.ll
index 3bf5b71d4f23c..adf50dd8880bf 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isinfDv4_f.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isinfDv4_f.ll
@@ -19,12 +19,12 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 declare spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float>)
 
 define spir_kernel void @test_isinfDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
   %call1 = call spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float> %0)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnanDv4_d.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnanDv4_d.ll
index 5fd3ea5e6069d..fb0bbc392bdbd 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnanDv4_d.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnanDv4_d.ll
@@ -19,12 +19,12 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 declare spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double>)
 
 define spir_kernel void @test_isnanDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
   %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
   %call1 = call spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double> %0)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnanDv4_f.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnanDv4_f.ll
index 60572ff89c439..b5c53224e4c94 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnanDv4_f.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnanDv4_f.ll
@@ -19,12 +19,12 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 declare spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float>)
 
 define spir_kernel void @test_isnanDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
   %call1 = call spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float> %0)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnormalDv4_d.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnormalDv4_d.ll
index fd73eb1aafc36..bf59e8d5b115f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnormalDv4_d.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnormalDv4_d.ll
@@ -19,12 +19,12 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 declare spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double>)
 
 define spir_kernel void @test_isnormalDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
   %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
   %call1 = call spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double> %0)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnormalDv4_f.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnormalDv4_f.ll
index f5e7253d563c8..08668c3d1df13 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnormalDv4_f.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnormalDv4_f.ll
@@ -19,12 +19,12 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 declare spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float>)
 
 define spir_kernel void @test_isnormalDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
   %call1 = call spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float> %0)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/scalar_vector_user.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/scalar_vector_user.ll
index 00a47e232ee35..ef4065e605744 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/scalar_vector_user.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/scalar_vector_user.ll
@@ -22,7 +22,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:1:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
 ; Function Attrs: nounwind readnone
-declare spir_func i64 @_Z12get_local_idj(i32) #0
+declare i64 @__mux_get_local_id(i32) #0
 
 ; Function Attrs: nounwind readnone
 declare spir_func <4 x float> @_Z3madDv4_fS_S_(<4 x float>, <4 x float>, <4 x float>) #0
@@ -35,7 +35,7 @@ declare spir_func float @_Z3madfff(float, float, float) local_unnamed_addr #2
 
 define spir_kernel void @scalar_vector_user(float addrspace(1)* %inout, i64 %n) {
 entry:
-  %lid = tail call spir_func i64 @_Z12get_local_idj(i32 0) #0
+  %lid = tail call i64 @__mux_get_local_id(i32 0) #0
   %inout.address = getelementptr inbounds float, float addrspace(1)* %inout, i64 %lid
   br label %loop
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/vector_copy.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/vector_copy.ll
index 94771a1202ac7..ce2aa29230092 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/vector_copy.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/vector_copy.ll
@@ -23,7 +23,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @vector_copy(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %arrayidx = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in, i64 %call
   %0 = load <4 x i32>, <4 x i32> addrspace(1)* %arrayidx, align 16
   %arrayidx1 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call
@@ -31,7 +31,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 ; It makes sure the vector load and store are preserved right through to packetization
 ; and then widened, instead of being scalarized across work-items first
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/vector_phi_varying.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/vector_phi_varying.ll
index 8ca3ced324a1f..dff334cc859f3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/vector_phi_varying.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/vector_phi_varying.ll
@@ -23,7 +23,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @vector_loop(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %call.trunc = trunc i64 %call to i32
   %call.splatinsert = insertelement <4 x i32> undef, i32 %call.trunc, i32 0
   %call.splat = shufflevector <4 x i32> %call.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -32,7 +32,7 @@ entry:
 
 for.cond:                                         ; preds = %entry, %for.body
   %storemerge = phi <4 x i32> [ %inc, %for.body ], [ zeroinitializer, %entry ]
-  %call1 = call spir_func i64 @_Z15get_global_sizej(i32 0)
+  %call1 = call i64 @__mux_get_global_size(i32 0)
   %conv = trunc i64 %call1 to i32
   %splat.splatinsert = insertelement <4 x i32> undef, i32 %conv, i32 0
   %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -80,8 +80,8 @@ for.end:                                          ; preds = %entry, %for.cond
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
-declare spir_func i64 @_Z15get_global_sizej(i32)
+declare i64 @__mux_get_global_id(i32)
+declare i64 @__mux_get_global_size(i32)
 
 ; This test checks if a varying <4 x i32> phi is scalarized into 4 i32 phis
 ; and then re-packetized
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_abs.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_abs.ll
index 7643a5f7e9edf..3b6f26e35e548 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_abs.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_abs.ll
@@ -20,14 +20,14 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 declare i32 @llvm.abs.i32(i32, i1)
 declare <2 x i32> @llvm.abs.v2i32(<2 x i32>, i1)
 
 define spir_kernel void @absff(i32* %pa, i32* %pb) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %a = getelementptr i32, i32* %pa, i64 %idx
   %b = getelementptr i32, i32* %pb, i64 %idx
   %la = load i32, i32* %a, align 16
@@ -38,7 +38,7 @@ entry:
 
 define spir_kernel void @absvf(<2 x i32>* %pa, <2 x i32>* %pb) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %a = getelementptr <2 x i32>, <2 x i32>* %pa, i64 %idx
   %b = getelementptr <2 x i32>, <2 x i32>* %pb, i64 %idx
   %la = load <2 x i32>, <2 x i32>* %a, align 16
@@ -49,7 +49,7 @@ entry:
 
 ; CHECK: define spir_kernel void @__vecz_v4_absff(ptr %pa, ptr %pb)
 ; CHECK: entry:
-; CHECK: %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: %idx = call i64 @__mux_get_global_id(i32 0)
 ; CHECK: %a = getelementptr i32, ptr %pa, i64 %idx
 ; CHECK: %b = getelementptr i32, ptr %pb, i64 %idx
 ; CHECK: %[[T0:.*]] = load <4 x i32>, ptr %a, align 4
@@ -59,7 +59,7 @@ entry:
 
 ; CHECK: define spir_kernel void @__vecz_v4_absvf(ptr %pa, ptr %pb)
 ; CHECK: entry:
-; CHECK: %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: %idx = call i64 @__mux_get_global_id(i32 0)
 ; CHECK: %a = getelementptr <2 x i32>, ptr %pa, i64 %idx
 ; CHECK: %b = getelementptr <2 x i32>, ptr %pb, i64 %idx
 ; CHECK: %[[T0:.*]] = load <8 x i32>, ptr %a, align 4
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_binops.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_binops.ll
index 32ee16cfe8ee4..401d6cf336fc6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_binops.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_binops.ll
@@ -19,11 +19,11 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 define spir_kernel void @widen_binops(<4 x i32>* %pa, <4 x i32>* %pb, <4 x i64>* %pd) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %a = getelementptr <4 x i32>, <4 x i32>* %pa, i64 %idx
   %b = getelementptr <4 x i32>, <4 x i32>* %pb, i64 %idx
   %d = getelementptr <4 x i64>, <4 x i64>* %pd, i64 %idx
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_copysign.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_copysign.ll
index 5ad8bcc51079d..33f9ac40e5465 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_copysign.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_copysign.ll
@@ -20,14 +20,14 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 declare float @llvm.copysign.f32(float, float)
 declare <2 x float> @llvm.copysign.v2f32(<2 x float>, <2 x float>)
 
 define spir_kernel void @copysignff(float* %pa, float* %pb, float* %pc) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %a = getelementptr float, float* %pa, i64 %idx
   %b = getelementptr float, float* %pb, i64 %idx
   %c = getelementptr float, float* %pc, i64 %idx
@@ -40,7 +40,7 @@ entry:
 
 define spir_kernel void @copysignvf(<2 x float>* %pa, <2 x float>* %pb, <2 x float>* %pc) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %a = getelementptr <2 x float>, <2 x float>* %pa, i64 %idx
   %b = getelementptr <2 x float>, <2 x float>* %pb, i64 %idx
   %c = getelementptr <2 x float>, <2 x float>* %pc, i64 %idx
@@ -53,7 +53,7 @@ entry:
 
 ; CHECK: define spir_kernel void @__vecz_v4_copysignff(ptr %pa, ptr %pb, ptr %pc)
 ; CHECK: entry:
-; CHECK: %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: %idx = call i64 @__mux_get_global_id(i32 0)
 ; CHECK: %a = getelementptr float, ptr %pa, i64 %idx
 ; CHECK: %b = getelementptr float, ptr %pb, i64 %idx
 ; CHECK: %c = getelementptr float, ptr %pc, i64 %idx
@@ -65,7 +65,7 @@ entry:
 
 ; CHECK: define spir_kernel void @__vecz_v4_copysignvf(ptr %pa, ptr %pb, ptr %pc)
 ; CHECK: entry:
-; CHECK: %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: %idx = call i64 @__mux_get_global_id(i32 0)
 ; CHECK: %a = getelementptr <2 x float>, ptr %pa, i64 %idx
 ; CHECK: %b = getelementptr <2 x float>, ptr %pb, i64 %idx
 ; CHECK: %c = getelementptr <2 x float>, ptr %pc, i64 %idx
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fma.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fma.ll
index 401afa8897df2..75f0fd0a8a6f3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fma.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fma.ll
@@ -19,11 +19,11 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 define spir_kernel void @test_calls(<4 x float>* %pa, <4 x float>* %pb, <4 x float>* %pc, <4 x float>* %pd) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %a = getelementptr <4 x float>, <4 x float>* %pa, i64 %idx
   %b = getelementptr <4 x float>, <4 x float>* %pb, i64 %idx
   %c = getelementptr <4 x float>, <4 x float>* %pc, i64 %idx
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmin_vector_scalar.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmin_vector_scalar.ll
index 1f301325cabcc..f0895d6165aec 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmin_vector_scalar.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmin_vector_scalar.ll
@@ -19,7 +19,7 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; Function Attrs: nounwind readnone
 declare spir_func <4 x float> @_Z4fminDv4_ff(<4 x float>, float)
@@ -38,7 +38,7 @@ declare spir_func <16 x float> @_Z4fminDv16_fS_(<16 x float>, <16 x float>)
 
 define spir_kernel void @fmin_vector_scalar(<4 x float>* %pa, float* %pb, <4 x float>* %pd) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %a = getelementptr <4 x float>, <4 x float>* %pa, i64 %idx
   %b = getelementptr float, float* %pb, i64 %idx
   %d = getelementptr <4 x float>, <4 x float>* %pd, i64 %idx
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd.ll
index 5eb62b4e49493..3460f67581d3d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd.ll
@@ -19,11 +19,11 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 define spir_kernel void @test_calls(<4 x float>* %pa, <4 x float>* %pb, <4 x float>* %pc, <4 x float>* %pd) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %a = getelementptr <4 x float>, <4 x float>* %pa, i64 %idx
   %b = getelementptr <4 x float>, <4 x float>* %pb, i64 %idx
   %c = getelementptr <4 x float>, <4 x float>* %pc, i64 %idx
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd2.ll
index 50a3a326b34a7..d5263bb5c2bfd 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd2.ll
@@ -19,11 +19,11 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 define spir_kernel void @test_calls(<4 x float>* %pa, <4 x float>* %pb, <4 x float>* %pc, <4 x float>* %pd) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %idx2 = shl i64 %idx, 1
   %a = getelementptr <4 x float>, <4 x float>* %pa, i64 %idx2
   %b = getelementptr <4 x float>, <4 x float>* %pb, i64 %idx2
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd_phi.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd_phi.ll
index 079cf4f47796c..12671866f8ab7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd_phi.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd_phi.ll
@@ -19,11 +19,11 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 define spir_kernel void @test_calls(<4 x float>* %pa, <4 x float>* %pb, <4 x float>* %pc, <4 x float>* %pd) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %a = getelementptr <4 x float>, <4 x float>* %pa, i64 %idx
   %b = getelementptr <4 x float>, <4 x float>* %pb, i64 %idx
   %c = getelementptr <4 x float>, <4 x float>* %pc, i64 %idx
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshl.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshl.ll
index 67a5e45bb1814..b0300297dd961 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshl.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshl.ll
@@ -19,11 +19,11 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 define spir_kernel void @test_calls(i8* %pa, i8* %pb, i8* %pd) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %a = getelementptr i8, i8* %pa, i64 %idx
   %b = getelementptr i8, i8* %pb, i64 %idx
   %d = getelementptr i8, i8* %pd, i64 %idx
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshr.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshr.ll
index dd6fcf52f7405..270a1c69545e7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshr.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshr.ll
@@ -19,11 +19,11 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 define spir_kernel void @test_calls(i8* %pa, i8* %pb, i8* %pd) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %a = getelementptr i8, i8* %pa, i64 %idx
   %b = getelementptr i8, i8* %pb, i64 %idx
   %d = getelementptr i8, i8* %pd, i64 %idx
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_shufflevector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_shufflevector.ll
index 4722ba4d10994..2bb278d1d0cdf 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_shufflevector.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_shufflevector.ll
@@ -19,12 +19,12 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 ; Function Attrs: nounwind
 define spir_kernel void @widen_shufflevector(<2 x float> addrspace(1)* %a, <2 x float> addrspace(1)* %b, <4 x float> addrspace(1)* %out) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %arrayidxa = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i64 %call
   %arrayidxb = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %b, i64 %call
   %la = load <2 x float>, <2 x float> addrspace(1)* %arrayidxa, align 4
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_sqrt.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_sqrt.ll
index 17061df108ef2..e68c4e6c2e757 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_sqrt.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_sqrt.ll
@@ -19,7 +19,7 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 declare spir_func float @_Z4sqrtf(float)
 declare spir_func <2 x float> @_Z4sqrtDv2_f(<2 x float>)
 declare spir_func <4 x float> @_Z4sqrtDv4_f(<4 x float>)
@@ -29,7 +29,7 @@ declare spir_func <16 x float> @_Z4sqrtDv16_f(<16 x float>)
 define spir_kernel void @test_sqrt(<2 x float> addrspace(1)* %in2, <2 x float> addrspace(1)* %out2,
                                    <4 x float> addrspace(1)* %in4, <4 x float> addrspace(1)* %out4) {
 entry:
-  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %gid = call i64 @__mux_get_global_id(i32 0)
   %arrayin2 = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %in2, i64 %gid
   %arrayin4 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in4, i64 %gid
   %arrayout2 = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %out2, i64 %gid
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/alloca_alias.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/alloca_alias.ll
index ef0beb022572d..b90360dc75b6e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/alloca_alias.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/alloca_alias.ll
@@ -24,7 +24,7 @@ target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 define spir_kernel void @alloca_alias(i32 addrspace(1)* %out, i32 %index) {
 entry:
   %myStructs = alloca [2 x %struct.testStruct], align 16
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %0 = bitcast [2 x %struct.testStruct]* %myStructs to i8*
   call void @llvm.lifetime.start.p0i8(i64 32, i8* nonnull %0)
   %1 = trunc i64 %call to i32
@@ -58,7 +58,7 @@ entry:
 
 declare void @llvm.lifetime.start.p0i8(i64 immarg, i8*)
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 declare spir_func void @_Z7vstore3Dv3_imPU3AS1i(<3 x i32>, i64, i32 addrspace(1)*)
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/arm_neon_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/arm_neon_store.ll
index 493e3398e0304..5ce84466aefd6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/arm_neon_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/arm_neon_store.ll
@@ -25,14 +25,14 @@ target triple = "armv7-unknown-linux-gnueabihf"
 ; Function Attrs: nounwind
 define spir_kernel void @short3_char3_codegen(i8 addrspace(1)* %src, i16 addrspace(1)* %dest) #0 !kernel_arg_addr_space !2 !kernel_arg_access_qual !3 !kernel_arg_type !4 !kernel_arg_base_type !4 !kernel_arg_type_qual !5 {
 entry:
-  %call = call spir_func i32 @_Z13get_global_idj(i32 0) #3
+  %call = call i32 @__mux_get_global_id(i32 0) #3
   %call1 = call spir_func <3 x i8> @_Z6vload3jPU3AS1Kc(i32 %call, i8 addrspace(1)* %src) #3
   %call3 = call spir_func <3 x i16> @_Z14convert_short3Dv3_c(<3 x i8> %call1) #3
   call spir_func void @_Z7vstore3Dv3_sjPU3AS1s(<3 x i16> %call3, i32 %call, i16 addrspace(1)* %dest) #3
   ret void
 }
 
-declare spir_func i32 @_Z13get_global_idj(i32) #1
+declare i32 @__mux_get_global_id(i32) #1
 
 declare spir_func <3 x i8> @_Z6vload3jPU3AS1Kc(i32, i8 addrspace(1)*) #1
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/async_workgroup_copy_uniform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/async_workgroup_copy_uniform.ll
index 4b851dbe19ded..02e39a996c33d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/async_workgroup_copy_uniform.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/async_workgroup_copy_uniform.ll
@@ -25,9 +25,9 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @test(i32 addrspace(1)* %input, i32 addrspace(3)* %output, i32 addrspace(1)* %elements) {
   %ev = alloca %opencl.event_t*, align 8
-  %1 = call spir_func i64 @_Z13get_global_idj(i32 0)
-  %2 = call spir_func i64 @_Z12get_group_idj(i32 0)
-  %3 = call spir_func i64 @_Z14get_local_sizej(i32 0)
+  %1 = call i64 @__mux_get_global_id(i32 0)
+  %2 = call i64 @__mux_get_group_id(i32 0)
+  %3 = call i64 @__mux_get_local_size(i32 0)
   %4 = mul i64 %3, %2
   %5 = getelementptr inbounds i32, i32 addrspace(1)* %input, i64 %4
   %6 = mul i64 %3, %2
@@ -42,9 +42,9 @@ define spir_kernel void @test(i32 addrspace(1)* %input, i32 addrspace(3)* %outpu
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
-declare spir_func i64 @_Z12get_group_idj(i32)
-declare spir_func i64 @_Z14get_local_sizej(i32)
+declare i64 @__mux_get_global_id(i32)
+declare i64 @__mux_get_group_id(i32)
+declare i64 @__mux_get_local_size(i32)
 declare spir_func %opencl.event_t* @_Z21async_work_group_copyPU3AS1iPKU3AS3im9ocl_event(i32 addrspace(1)*, i32 addrspace(3)*, i64, %opencl.event_t*)
 declare spir_func void @_Z17wait_group_eventsiP9ocl_event(i32, %opencl.event_t**)
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomic_cmpxchg.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomic_cmpxchg.ll
index 3796a3f03de37..2fbadb0e80f9d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomic_cmpxchg.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomic_cmpxchg.ll
@@ -21,7 +21,7 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @atomic_cmpxchg_builtin(i32 addrspace(1)* %counter, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %conv = trunc i64 %call to i32
   br label %do.body
 
@@ -42,7 +42,7 @@ do.end:                                           ; preds = %do.body
 
 define spir_kernel void @atomic_atomicrmw_builtin(i32 addrspace(1)* %counter, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %conv = trunc i64 %call to i32
   br label %do.body
 
@@ -62,7 +62,7 @@ do.end:                                           ; preds = %do.body
 
 define spir_kernel void @atomic_rmw(i32 addrspace(1)* %counter2, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %conv = trunc i64 %call to i32
   %0 = atomicrmw add i32 addrspace(1)* %counter2, i32 1 seq_cst
   %idxprom = sext i32 %0 to i64
@@ -71,7 +71,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; We no longer support instantiating atomic instructions in diverged blocks,
 ; since they require masking. FileCheck does not support comments, so the CHECKs
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomicrmw.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomicrmw.ll
index ad101c818863d..de5f90bbf3f9e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomicrmw.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomicrmw.ll
@@ -21,7 +21,7 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @atomic_cmpxchg_builtin(i32 addrspace(1)* %counter, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %conv = trunc i64 %call to i32
   br label %do.body
 
@@ -42,7 +42,7 @@ do.end:                                           ; preds = %do.body
 
 define spir_kernel void @atomic_atomicrmw_builtin(i32 addrspace(1)* %counter, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %conv = trunc i64 %call to i32
   br label %do.body
 
@@ -62,7 +62,7 @@ do.end:                                           ; preds = %do.body
 
 define spir_kernel void @atomic_rmw(i32 addrspace(1)* %counter2, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %conv = trunc i64 %call to i32
   %0 = atomicrmw add i32 addrspace(1)* %counter2, i32 1 seq_cst
   %idxprom = sext i32 %0 to i64
@@ -71,7 +71,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; We no longer support instantiating atomic instructions in diverged blocks,
 ; since they require masking. FileCheck does not support comments, so the CHECKs
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomicrmw_uniform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomicrmw_uniform.ll
index 9becb697dea93..d20f8408fb3d0 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomicrmw_uniform.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomicrmw_uniform.ll
@@ -21,7 +21,7 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @atomic_cmpxchg_builtin(i32 addrspace(1)* %counter, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %conv = trunc i64 %call to i32
   br label %do.body
 
@@ -42,7 +42,7 @@ do.end:                                           ; preds = %do.body
 
 define spir_kernel void @atomic_atomicrmw_builtin(i32 addrspace(1)* %counter, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %conv = trunc i64 %call to i32
   br label %do.body
 
@@ -62,7 +62,7 @@ do.end:                                           ; preds = %do.body
 
 define spir_kernel void @atomic_rmw(i32 addrspace(1)* %counter2, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %conv = trunc i64 %call to i32
   %0 = atomicrmw add i32 addrspace(1)* %counter2, i32 1 seq_cst
   %idxprom = sext i32 %0 to i64
@@ -71,7 +71,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; CHECK: define spir_kernel void @__vecz_v4_atomic_rmw
 ; CHECK: atomicrmw add ptr addrspace(1) %counter2, i32 1 seq_cst
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/basic_mem2reg.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/basic_mem2reg.ll
index 3f6659e76aeeb..e881f871544c8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/basic_mem2reg.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/basic_mem2reg.ll
@@ -24,7 +24,7 @@ entry:
   %d = alloca i32
   %e = alloca i32
   %f = alloca float
-  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %gid = call i64 @__mux_get_global_id(i32 0)
   %sum = add i32 %a, %b
   store i32 %sum, i32* %d, align 4
   store i32 %sum, i32* %e, align 4
@@ -43,13 +43,13 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 declare spir_func i32 @foo(i32*)
 
 ; CHECK: define spir_kernel void @__vecz_v4_test(i32 %a, i32 %b, ptr %c, float %rf)
 ; CHECK: entry:
 ; CHECK: %e = alloca i32
-; CHECK: %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: %gid = call i64 @__mux_get_global_id(i32 0)
 ; CHECK: %sum = add i32 %a, %b
 ; CHECK: store i32 %sum, ptr %e
 ; CHECK: %call = call spir_func i32 @foo(ptr{{.*}} %e)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/bitcast_function.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/bitcast_function.ll
index 3578ca32c4407..68ecafdf70027 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/bitcast_function.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/bitcast_function.ll
@@ -28,7 +28,7 @@ entry:
   %gid = alloca i64, align 8
   store i32* %in, i32** %in.addr, align 8
   store i32* %out, i32** %out.addr, align 8
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   store i64 %call, i64* %gid, align 8
   %0 = load i64, i64* %gid, align 8
   %rem = urem i64 %0, 16
@@ -65,7 +65,7 @@ if.end:                                           ; preds = %if.else, %if.then
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 declare spir_func i32 @foo(i32, i32 addrspace(1)*)
 
 ; CHECK: define spir_kernel void @__vecz_v4_test(
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/branch_splitting_and.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/branch_splitting_and.ll
index 2681f5cab1372..aebcc3952b715 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/branch_splitting_and.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/branch_splitting_and.ll
@@ -23,8 +23,8 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: convergent nounwind
 define spir_kernel void @split_branch(i32 addrspace(1)* noalias %a, i32 addrspace(1)* noalias %b, i32 addrspace(1)* noalias %d) #0 {
 entry:
-  %x = call spir_func i64 @_Z13get_global_idj(i32 0) #2
-  %y = call spir_func i64 @_Z13get_global_idj(i32 1) #2
+  %x = call i64 @__mux_get_global_id(i32 0) #2
+  %y = call i64 @__mux_get_global_id(i32 1) #2
   %a_gep = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %x
   %b_gep = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %y
   %varying = load i32, i32 addrspace(1)* %a_gep
@@ -45,7 +45,7 @@ if.end:                                           ; preds = %if.then, %entry
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; This test checks that a conditional branch based on an AND of both
 ; a uniform and a varying value gets split into two separate branches
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/branch_splitting_or.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/branch_splitting_or.ll
index a6999a1893b8f..f89467ce4b86a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/branch_splitting_or.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/branch_splitting_or.ll
@@ -23,8 +23,8 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: convergent nounwind
 define spir_kernel void @split_branch(i32 addrspace(1)* noalias %a, i32 addrspace(1)* noalias %b, i32 addrspace(1)* noalias %d) #0 {
 entry:
-  %x = call spir_func i64 @_Z13get_global_idj(i32 0) #2
-  %y = call spir_func i64 @_Z13get_global_idj(i32 1) #2
+  %x = call i64 @__mux_get_global_id(i32 0) #2
+  %y = call i64 @__mux_get_global_id(i32 1) #2
   %a_gep = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %x
   %b_gep = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %y
   %varying = load i32, i32 addrspace(1)* %a_gep
@@ -45,7 +45,7 @@ if.end:                                           ; preds = %if.then, %entry
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; This test checks that a conditional branch based on an OR of both
 ; a uniform and a varying value gets split into two separate branches
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_addsat.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_addsat.ll
index 0b7dd50641c82..dec5a3d4632a4 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_addsat.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_addsat.ll
@@ -21,7 +21,7 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @saddsatc(i8 addrspace(1)* %lhs, i8 addrspace(1)* %rhs) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i8, i8 addrspace(1)* %lhs, i64 %call
   %0 = load i8, i8 addrspace(1)* %arrayidx, align 1
   %arrayidx1 = getelementptr inbounds i8, i8 addrspace(1)* %rhs, i64 %call
@@ -33,7 +33,7 @@ entry:
 
 define spir_kernel void @uaddsatc(i8 addrspace(1)* %lhs, i8 addrspace(1)* %rhs) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i8, i8 addrspace(1)* %lhs, i64 %call
   %0 = load i8, i8 addrspace(1)* %arrayidx, align 1
   %arrayidx1 = getelementptr inbounds i8, i8 addrspace(1)* %rhs, i64 %call
@@ -45,7 +45,7 @@ entry:
 
 define spir_kernel void @saddsati(i32 addrspace(1)* %lhs, i32 addrspace(1)* %rhs) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %lhs, i64 %call
   %0 = load i32, i32 addrspace(1)* %arrayidx, align 1
   %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %rhs, i64 %call
@@ -57,7 +57,7 @@ entry:
 
 define spir_kernel void @uaddsati(i32 addrspace(1)* %lhs, i32 addrspace(1)* %rhs) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %lhs, i64 %call
   %0 = load i32, i32 addrspace(1)* %arrayidx, align 1
   %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %rhs, i64 %call
@@ -69,7 +69,7 @@ entry:
 
 define spir_kernel void @saddsati4(<4 x i32> addrspace(1)* %lhs, <4 x i32> addrspace(1)* %rhs) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %lhs, i64 %call
   %0 = load <4 x i32>, <4 x i32> addrspace(1)* %arrayidx, align 1
   %arrayidx1 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %rhs, i64 %call
@@ -81,7 +81,7 @@ entry:
 
 define spir_kernel void @uaddsati4(<4 x i32> addrspace(1)* %lhs, <4 x i32> addrspace(1)* %rhs) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %lhs, i64 %call
   %0 = load <4 x i32>, <4 x i32> addrspace(1)* %arrayidx, align 1
   %arrayidx1 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %rhs, i64 %call
@@ -91,7 +91,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 declare spir_func i8 @_Z7add_satcc(i8, i8)
 declare spir_func i8 @_Z7add_sathh(i8, i8)
 declare spir_func i32 @_Z7add_satii(i32, i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_fmax.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_fmax.ll
index 36e59f1a7601a..a3a5773be00bd 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_fmax.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_fmax.ll
@@ -33,7 +33,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 declare spir_func float @_Z4fmaxff(float, float)
 declare spir_func <2 x float> @_Z4fmaxDv2_ff(<2 x float>, float)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_fmin.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_fmin.ll
index 9d802215b4f9e..b1e0c7fc88366 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_fmin.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_fmin.ll
@@ -33,7 +33,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 declare spir_func float @_Z4fminff(float, float)
 declare spir_func <2 x float> @_Z4fminDv2_ff(<2 x float>, float)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_negative.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_negative.ll
index db2d59b33f807..3f8b5bd0d5ec1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_negative.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_negative.ll
@@ -21,7 +21,7 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @test_normalize(float %a, float %b, i32* %c) {
 entry:
-  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %gid = call i64 @__mux_get_global_id(i32 0)
   %norm = call spir_func float @_Z9normalizef(float %a)
   %normi = fptosi float %norm to i32
   %c0 = getelementptr i32, i32* %c, i64 %gid
@@ -31,14 +31,14 @@ entry:
 
 define spir_kernel void @test_rhadd(i32 %a, i32 %b, i32* %c) {
 entry:
-  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %gid = call i64 @__mux_get_global_id(i32 0)
   %add = call spir_func i32 @_Z5rhaddjj(i32 %a, i32 %b)
   %c0 = getelementptr i32, i32* %c, i64 %gid
   store i32 %add, i32* %c0, align 4
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 declare spir_func float @_Z9normalizef(float)
 declare spir_func i32 @_Z5rhaddjj(i32, i32)
 
@@ -46,7 +46,7 @@ declare spir_func i32 @_Z5rhaddjj(i32, i32)
 
 ; CHECK: define spir_kernel void @__vecz_v4_test_rhadd(i32 %a, i32 %b, ptr %c)
 ; CHECK: entry:
-; CHECK: %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: %gid = call i64 @__mux_get_global_id(i32 0)
 ; CHECK: %add = call spir_func i32 @_Z5rhaddjj(i32 %a, i32 %b)
 ; CHECK: %c0 = getelementptr i32, ptr %c, i64 %gid
 ; CHECK: store i32 %add, ptr %c0, align 4
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_positive.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_positive.ll
index d05e657280822..c2a393c1a9fe9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_positive.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_positive.ll
@@ -21,7 +21,7 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @test(float %a, float %b, i32* %c) {
 entry:
-  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %gid = call i64 @__mux_get_global_id(i32 0)
   %cmp = call spir_func i32 @_Z9isgreaterff(float %a, float %b)
   %c0 = getelementptr i32, i32* %c, i64 %gid
   store i32 %cmp, i32* %c0, align 4
@@ -37,7 +37,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 declare spir_func i32 @_Z9isgreaterff(float, float)
 declare spir_func i32 @_Z6islessff(float, float)
 declare spir_func i32 @_Z7isequalff(float, float)
@@ -49,7 +49,7 @@ define spir_func i32 @opt_Z7isequalff(float, float) {
 
 ; CHECK: define spir_kernel void @__vecz_v4_test(float %a, float %b, ptr %c)
 ; CHECK: entry:
-; CHECK: %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: %gid = call i64 @__mux_get_global_id(i32 0)
 ; CHECK: %relational = fcmp ogt float %a, %b
 ; CHECK: %relational[[R1:[0-9]+]] = zext i1 %relational to i32
 ; CHECK: %c0 = getelementptr i32, ptr %c, i64 %gid
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_pointer_return.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_pointer_return.ll
index 1c5e4b8f58fcc..18f0d818d694d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_pointer_return.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_pointer_return.ll
@@ -19,7 +19,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 ; RUN: veczc -vecz-simd-width=4 -S < %s | FileCheck %s
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 declare spir_func float @_Z5fractfPf(float, float*)
 declare spir_func <2 x float> @_Z5fractDv2_fPS_(<2 x float>, <2 x float>*)
@@ -31,7 +31,7 @@ declare spir_func <8 x float> @_Z5fractDv8_fPS_(<8 x float>, <8 x float>*)
 
 define spir_kernel void @fract_v1(float* %xptr, float* %outptr, float* %ioutptr) {
   %iouta = alloca float
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidx.x = getelementptr inbounds float, float* %xptr, i64 %idx
   %x = load float, float* %arrayidx.x, align 4
   %out = call spir_func float @_Z5fractfPf(float %x, float* %iouta)
@@ -49,7 +49,7 @@ define spir_kernel void @fract_v1(float* %xptr, float* %outptr, float* %ioutptr)
 
 define spir_kernel void @fract_v2(<2 x float>* %xptr, <2 x float>* %outptr, <2 x float>* %ioutptr) {
   %iouta = alloca <2 x float>
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidx.x = getelementptr inbounds <2 x float>, <2 x float>* %xptr, i64 %idx
   %x = load <2 x float>, <2 x float>* %arrayidx.x, align 8
   %out = call spir_func <2 x float> @_Z5fractDv2_fPS_(<2 x float> %x, <2 x float>* %iouta)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_cantduplicate.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_cantduplicate.ll
index e654d64689809..1874c37800a31 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_cantduplicate.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_cantduplicate.ll
@@ -23,7 +23,7 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @instrinsic(float* %in1, float* %in2, float* %in3, float* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float* %in1, i64 %call
   %0 = load float, float* %arrayidx, align 4
   %arrayidx1 = getelementptr inbounds float, float* %in2, i64 %call
@@ -38,7 +38,7 @@ entry:
 
 define spir_kernel void @builtin(i32* %in, i32* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
   %0 = load i32, i32* %arrayidx, align 4
   %call1 = tail call spir_func i32 @_Z3absi(i32 %0)
@@ -49,7 +49,7 @@ entry:
 
 define spir_kernel void @user_defined(i32* %in, i32* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
   %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
   call spir_func void @defined(i32* %add.ptr, i32* %add.ptr1)
@@ -58,7 +58,7 @@ entry:
 
 define spir_kernel void @user_undefined(i32* %in, i32* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
   %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
   call spir_func void @undefined(i32* %add.ptr, i32* %add.ptr1)
@@ -67,7 +67,7 @@ entry:
 
 define spir_kernel void @cantinline(i32* %in, i32* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
   %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
   call spir_func void @dontinline(i32* %add.ptr, i32* %add.ptr1)
@@ -76,7 +76,7 @@ entry:
 
 define spir_kernel void @cantduplicate(i32* %in, i32* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
   %0 = load i32, i32* %arrayidx, align 4
   %call1 = tail call spir_func i32 @_Z3clzi(i32 %0) #1
@@ -87,7 +87,7 @@ entry:
 
 define spir_kernel void @optnone(i32* %in, i32* %out) #2 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
   %0 = load i32, i32* %arrayidx, align 4
   %arrayidx1 = getelementptr inbounds i32, i32* %out, i64 %call
@@ -100,7 +100,7 @@ entry:
 declare float @llvm.fmuladd.f32(float, float, float)
 declare spir_func i32 @_Z3absi(i32)
 declare spir_func i32 @_Z3clzi(i32) #1
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 declare spir_func void @undefined(i32*, i32*)
 
 ; Functions with definitions
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_cantinline.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_cantinline.ll
index 3bce3301c22c8..eb9ffc770c4b9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_cantinline.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_cantinline.ll
@@ -23,7 +23,7 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @instrinsic(float* %in1, float* %in2, float* %in3, float* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float* %in1, i64 %call
   %0 = load float, float* %arrayidx, align 4
   %arrayidx1 = getelementptr inbounds float, float* %in2, i64 %call
@@ -38,7 +38,7 @@ entry:
 
 define spir_kernel void @builtin(i32* %in, i32* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
   %0 = load i32, i32* %arrayidx, align 4
   %call1 = tail call spir_func i32 @_Z3absi(i32 %0)
@@ -49,7 +49,7 @@ entry:
 
 define spir_kernel void @user_defined(i32* %in, i32* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
   %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
   call spir_func void @defined(i32* %add.ptr, i32* %add.ptr1)
@@ -58,7 +58,7 @@ entry:
 
 define spir_kernel void @user_undefined(i32* %in, i32* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
   %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
   call spir_func void @undefined(i32* %add.ptr, i32* %add.ptr1)
@@ -67,7 +67,7 @@ entry:
 
 define spir_kernel void @cantinline(i32* %in, i32* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
   %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
   call spir_func void @dontinline(i32* %add.ptr, i32* %add.ptr1)
@@ -76,7 +76,7 @@ entry:
 
 define spir_kernel void @cantduplicate(i32* %in, i32* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
   %0 = load i32, i32* %arrayidx, align 4
   %call1 = tail call spir_func i32 @_Z3clzi(i32 %0) #1
@@ -87,7 +87,7 @@ entry:
 
 define spir_kernel void @optnone(i32* %in, i32* %out) #2 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
   %0 = load i32, i32* %arrayidx, align 4
   %arrayidx1 = getelementptr inbounds i32, i32* %out, i64 %call
@@ -100,7 +100,7 @@ entry:
 declare float @llvm.fmuladd.f32(float, float, float)
 declare spir_func i32 @_Z3absi(i32)
 declare spir_func i32 @_Z3clzi(i32) #1
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 declare spir_func void @undefined(i32*, i32*)
 
 ; Functions with definitions
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_optnone.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_optnone.ll
index 034834a50417a..c287439fe810c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_optnone.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_optnone.ll
@@ -23,7 +23,7 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @instrinsic(float* %in1, float* %in2, float* %in3, float* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float* %in1, i64 %call
   %0 = load float, float* %arrayidx, align 4
   %arrayidx1 = getelementptr inbounds float, float* %in2, i64 %call
@@ -38,7 +38,7 @@ entry:
 
 define spir_kernel void @builtin(i32* %in, i32* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
   %0 = load i32, i32* %arrayidx, align 4
   %call1 = tail call spir_func i32 @_Z3absi(i32 %0)
@@ -49,7 +49,7 @@ entry:
 
 define spir_kernel void @user_defined(i32* %in, i32* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
   %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
   call spir_func void @defined(i32* %add.ptr, i32* %add.ptr1)
@@ -58,7 +58,7 @@ entry:
 
 define spir_kernel void @user_undefined(i32* %in, i32* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
   %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
   call spir_func void @undefined(i32* %add.ptr, i32* %add.ptr1)
@@ -67,7 +67,7 @@ entry:
 
 define spir_kernel void @cantinline(i32* %in, i32* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
   %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
   call spir_func void @dontinline(i32* %add.ptr, i32* %add.ptr1)
@@ -76,7 +76,7 @@ entry:
 
 define spir_kernel void @cantduplicate(i32* %in, i32* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
   %0 = load i32, i32* %arrayidx, align 4
   %call1 = tail call spir_func i32 @_Z3clzi(i32 %0) #1
@@ -87,7 +87,7 @@ entry:
 
 define spir_kernel void @optnone(i32* %in, i32* %out) #2 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
   %0 = load i32, i32* %arrayidx, align 4
   %arrayidx1 = getelementptr inbounds i32, i32* %out, i64 %call
@@ -100,7 +100,7 @@ entry:
 declare float @llvm.fmuladd.f32(float, float, float)
 declare spir_func i32 @_Z3absi(i32)
 declare spir_func i32 @_Z3clzi(i32) #1
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 declare spir_func void @undefined(i32*, i32*)
 
 ; Functions with definitions
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_user_undefined.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_user_undefined.ll
index 7c98863706be5..b87aa662bbe9c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_user_undefined.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_user_undefined.ll
@@ -23,7 +23,7 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @instrinsic(float* %in1, float* %in2, float* %in3, float* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float* %in1, i64 %call
   %0 = load float, float* %arrayidx, align 4
   %arrayidx1 = getelementptr inbounds float, float* %in2, i64 %call
@@ -38,7 +38,7 @@ entry:
 
 define spir_kernel void @builtin(i32* %in, i32* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
   %0 = load i32, i32* %arrayidx, align 4
   %call1 = tail call spir_func i32 @_Z3absi(i32 %0)
@@ -49,7 +49,7 @@ entry:
 
 define spir_kernel void @user_defined(i32* %in, i32* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
   %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
   call spir_func void @defined(i32* %add.ptr, i32* %add.ptr1)
@@ -58,7 +58,7 @@ entry:
 
 define spir_kernel void @user_undefined(i32* %in, i32* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
   %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
   call spir_func void @undefined(i32* %add.ptr, i32* %add.ptr1)
@@ -67,7 +67,7 @@ entry:
 
 define spir_kernel void @cantinline(i32* %in, i32* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
   %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
   call spir_func void @dontinline(i32* %add.ptr, i32* %add.ptr1)
@@ -76,7 +76,7 @@ entry:
 
 define spir_kernel void @cantduplicate(i32* %in, i32* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
   %0 = load i32, i32* %arrayidx, align 4
   %call1 = tail call spir_func i32 @_Z3clzi(i32 %0) #1
@@ -87,7 +87,7 @@ entry:
 
 define spir_kernel void @optnone(i32* %in, i32* %out) #2 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
   %0 = load i32, i32* %arrayidx, align 4
   %arrayidx1 = getelementptr inbounds i32, i32* %out, i64 %call
@@ -100,7 +100,7 @@ entry:
 declare float @llvm.fmuladd.f32(float, float, float)
 declare spir_func i32 @_Z3absi(i32)
 declare spir_func i32 @_Z3clzi(i32) #1
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 declare spir_func void @undefined(i32*, i32*)
 
 ; Functions with definitions
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_success_builtin.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_success_builtin.ll
index 1d20b9e604b6d..aee863ac6ff7c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_success_builtin.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_success_builtin.ll
@@ -23,7 +23,7 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @instrinsic(float* %in1, float* %in2, float* %in3, float* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float* %in1, i64 %call
   %0 = load float, float* %arrayidx, align 4
   %arrayidx1 = getelementptr inbounds float, float* %in2, i64 %call
@@ -38,7 +38,7 @@ entry:
 
 define spir_kernel void @builtin(i32* %in, i32* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
   %0 = load i32, i32* %arrayidx, align 4
   %call1 = tail call spir_func i32 @_Z3absi(i32 %0)
@@ -49,7 +49,7 @@ entry:
 
 define spir_kernel void @user_defined(i32* %in, i32* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
   %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
   call spir_func void @defined(i32* %add.ptr, i32* %add.ptr1)
@@ -58,7 +58,7 @@ entry:
 
 define spir_kernel void @user_undefined(i32* %in, i32* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
   %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
   call spir_func void @undefined(i32* %add.ptr, i32* %add.ptr1)
@@ -67,7 +67,7 @@ entry:
 
 define spir_kernel void @cantinline(i32* %in, i32* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
   %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
   call spir_func void @dontinline(i32* %add.ptr, i32* %add.ptr1)
@@ -76,7 +76,7 @@ entry:
 
 define spir_kernel void @cantduplicate(i32* %in, i32* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
   %0 = load i32, i32* %arrayidx, align 4
   %call1 = tail call spir_func i32 @_Z3clzi(i32 %0) #1
@@ -87,7 +87,7 @@ entry:
 
 define spir_kernel void @optnone(i32* %in, i32* %out) #2 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
   %0 = load i32, i32* %arrayidx, align 4
   %arrayidx1 = getelementptr inbounds i32, i32* %out, i64 %call
@@ -100,7 +100,7 @@ entry:
 declare float @llvm.fmuladd.f32(float, float, float)
 declare spir_func i32 @_Z3absi(i32)
 declare spir_func i32 @_Z3clzi(i32) #1
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 declare spir_func void @undefined(i32*, i32*)
 
 ; Functions with definitions
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_success_instrinsic.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_success_instrinsic.ll
index 95c291f4423ae..e8ef695bafbea 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_success_instrinsic.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_success_instrinsic.ll
@@ -23,7 +23,7 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @instrinsic(float* %in1, float* %in2, float* %in3, float* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float* %in1, i64 %call
   %0 = load float, float* %arrayidx, align 4
   %arrayidx1 = getelementptr inbounds float, float* %in2, i64 %call
@@ -38,7 +38,7 @@ entry:
 
 define spir_kernel void @builtin(i32* %in, i32* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
   %0 = load i32, i32* %arrayidx, align 4
   %call1 = tail call spir_func i32 @_Z3absi(i32 %0)
@@ -49,7 +49,7 @@ entry:
 
 define spir_kernel void @user_defined(i32* %in, i32* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
   %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
   call spir_func void @defined(i32* %add.ptr, i32* %add.ptr1)
@@ -58,7 +58,7 @@ entry:
 
 define spir_kernel void @user_undefined(i32* %in, i32* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
   %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
   call spir_func void @undefined(i32* %add.ptr, i32* %add.ptr1)
@@ -67,7 +67,7 @@ entry:
 
 define spir_kernel void @cantinline(i32* %in, i32* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
   %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
   call spir_func void @dontinline(i32* %add.ptr, i32* %add.ptr1)
@@ -76,7 +76,7 @@ entry:
 
 define spir_kernel void @cantduplicate(i32* %in, i32* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
   %0 = load i32, i32* %arrayidx, align 4
   %call1 = tail call spir_func i32 @_Z3clzi(i32 %0) #1
@@ -87,7 +87,7 @@ entry:
 
 define spir_kernel void @optnone(i32* %in, i32* %out) #2 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
   %0 = load i32, i32* %arrayidx, align 4
   %arrayidx1 = getelementptr inbounds i32, i32* %out, i64 %call
@@ -100,7 +100,7 @@ entry:
 declare float @llvm.fmuladd.f32(float, float, float)
 declare spir_func i32 @_Z3absi(i32)
 declare spir_func i32 @_Z3clzi(i32) #1
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 declare spir_func void @undefined(i32*, i32*)
 
 ; Functions with definitions
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_success_user_defined.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_success_user_defined.ll
index 2f1f5d044e9f5..f9169c7420165 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_success_user_defined.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_success_user_defined.ll
@@ -23,7 +23,7 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @instrinsic(float* %in1, float* %in2, float* %in3, float* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float* %in1, i64 %call
   %0 = load float, float* %arrayidx, align 4
   %arrayidx1 = getelementptr inbounds float, float* %in2, i64 %call
@@ -38,7 +38,7 @@ entry:
 
 define spir_kernel void @builtin(i32* %in, i32* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
   %0 = load i32, i32* %arrayidx, align 4
   %call1 = tail call spir_func i32 @_Z3absi(i32 %0)
@@ -49,7 +49,7 @@ entry:
 
 define spir_kernel void @user_defined(i32* %in, i32* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
   %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
   call spir_func void @defined(i32* %add.ptr, i32* %add.ptr1)
@@ -58,7 +58,7 @@ entry:
 
 define spir_kernel void @user_undefined(i32* %in, i32* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
   %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
   call spir_func void @undefined(i32* %add.ptr, i32* %add.ptr1)
@@ -67,7 +67,7 @@ entry:
 
 define spir_kernel void @cantinline(i32* %in, i32* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
   %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
   call spir_func void @dontinline(i32* %add.ptr, i32* %add.ptr1)
@@ -76,7 +76,7 @@ entry:
 
 define spir_kernel void @cantduplicate(i32* %in, i32* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
   %0 = load i32, i32* %arrayidx, align 4
   %call1 = tail call spir_func i32 @_Z3clzi(i32 %0) #1
@@ -87,7 +87,7 @@ entry:
 
 define spir_kernel void @optnone(i32* %in, i32* %out) #2 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
   %0 = load i32, i32* %arrayidx, align 4
   %arrayidx1 = getelementptr inbounds i32, i32* %out, i64 %call
@@ -100,7 +100,7 @@ entry:
 declare float @llvm.fmuladd.f32(float, float, float)
 declare spir_func i32 @_Z3absi(i32)
 declare spir_func i32 @_Z3clzi(i32) #1
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 declare spir_func void @undefined(i32*, i32*)
 
 ; Functions with definitions
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address.ll
index eff7950c29474..3c7935414409a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address.ll
@@ -21,14 +21,14 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @test(i32 addrspace(1)* %out) #0 {
 entry:
-  %gid = call spir_func i64 @_Z13get_global_idj(i32 0) #1
+  %gid = call i64 @__mux_get_global_id(i32 0) #1
   %conv = trunc i64 %gid to i32
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
   store i32 %conv, i32 addrspace(1)* %arrayidx, align 4
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
@@ -52,7 +52,7 @@ attributes #1 = { nounwind readnone }
 
 ; CHECK: define spir_kernel void @__vecz_v4_test
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK-NEXT: %gid = call i64 @__mux_get_global_id(i32 0)
 ; CHECK-NEXT: %conv = trunc i64 %gid to i32
 ; CHECK-NEXT: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %out, i64 3
 ; CHECK-NEXT: store i32 %conv, ptr addrspace(1) %arrayidx, align 4
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/contiguous_allocas.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/contiguous_allocas.ll
index 819c952ce071f..6110f78ad36e3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/contiguous_allocas.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/contiguous_allocas.ll
@@ -26,8 +26,8 @@ define spir_kernel void @test(<2 x float> addrspace(1)* nocapture readonly %in,
 entry:
   %a.sroa.0 = alloca <2 x float>, align 8
   %b.sroa.2 = alloca <2 x float>, align 8
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
-  %call1 = tail call spir_func i64 @_Z12get_local_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
+  %call1 = tail call i64 @__mux_get_local_id(i32 0)
   %a.sroa.0.0..sroa_cast = bitcast <2 x float>* %a.sroa.0 to i8*
   %b.sroa.2.0..sroa_cast = bitcast <2 x float>* %b.sroa.2 to i8*
   %arrayidx2 = getelementptr inbounds [16 x <2 x float>], [16 x <2 x float>] addrspace(3)* @entry_test_alloca.lm, i64 0, i64 %call1
@@ -64,8 +64,8 @@ for.body11:                                       ; preds = %for.body11, %for.bo
   br i1 %cmp8, label %for.body11, label %for.cond.cleanup10
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32) local_unnamed_addr
-declare spir_func i64 @_Z12get_local_idj(i32) local_unnamed_addr
+declare i64 @__mux_get_global_id(i32) local_unnamed_addr
+declare i64 @__mux_get_local_id(i32) local_unnamed_addr
 
 ; Check that all the allocas come before anything else
 ; CHECK: define spir_kernel void @__vecz_v4_test(
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_nested_loops.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_nested_loops.ll
index 2ec80677342e6..c276a9763abb4 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_nested_loops.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_nested_loops.ll
@@ -42,7 +42,7 @@ if.end:                                           ; preds = %if.else, %if.then
 define spir_kernel void @test_varying_if(i32 %a, i32* %b) {
 entry:
   %conv = sext i32 %a to i64
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %cmp = icmp eq i64 %conv, %call
   br i1 %cmp, label %if.then, label %if.else
 
@@ -63,7 +63,7 @@ if.end:                                           ; preds = %if.else, %if.then
 
 define spir_kernel void @test_uniform_loop(i32 %a, i32* %b)  {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %conv = trunc i64 %call to i32
   br label %for.cond
 
@@ -87,7 +87,7 @@ for.end:                                          ; preds = %for.cond
 
 define spir_kernel void @test_varying_loop(i32 %a, i32* %b) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %conv = trunc i64 %call to i32
   %sub = sub nsw i32 16, %conv
   br label %for.cond
@@ -112,7 +112,7 @@ for.end:                                          ; preds = %for.cond
 
 define spir_kernel void @test_nested_loops(i32* %a, i32* %b)  {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %conv = trunc i64 %call to i32
   %sub = sub nsw i32 16, %conv
   br label %for.cond
@@ -153,7 +153,7 @@ for.end14:                                        ; preds = %for.cond
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; A nested loop, in the form of
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_order_y.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_order_y.ll
index 069b6969e389a..8e7adf00fa033 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_order_y.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_order_y.ll
@@ -42,7 +42,7 @@ if.end:                                           ; preds = %if.else, %if.then
 define spir_kernel void @test_varying_if(i32 %a, i32* %b) {
 entry:
   %conv = sext i32 %a to i64
-  %call = call spir_func i64 @_Z13get_global_idj(i32 1)
+  %call = call i64 @__mux_get_global_id(i32 1)
   %cmp = icmp eq i64 %conv, %call
   br i1 %cmp, label %if.then, label %if.else
 
@@ -63,7 +63,7 @@ if.end:                                           ; preds = %if.else, %if.then
 
 define spir_kernel void @test_uniform_loop(i32 %a, i32* %b)  {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 1)
+  %call = call i64 @__mux_get_global_id(i32 1)
   %conv = trunc i64 %call to i32
   br label %for.cond
 
@@ -87,7 +87,7 @@ for.end:                                          ; preds = %for.cond
 
 define spir_kernel void @test_varying_loop(i32 %a, i32* %b) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 1)
+  %call = call i64 @__mux_get_global_id(i32 1)
   %conv = trunc i64 %call to i32
   %sub = sub nsw i32 16, %conv
   br label %for.cond
@@ -112,7 +112,7 @@ for.end:                                          ; preds = %for.cond
 
 define spir_kernel void @test_nested_loops(i32* %a, i32* %b)  {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 1)
+  %call = call i64 @__mux_get_global_id(i32 1)
   %conv = trunc i64 %call to i32
   %sub = sub nsw i32 16, %conv
   br label %for.cond
@@ -153,7 +153,7 @@ for.end14:                                        ; preds = %for.cond
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; A nested loop, in the form of
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_order_z.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_order_z.ll
index 5f4383d50adfe..0352df0afd216 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_order_z.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_order_z.ll
@@ -42,7 +42,7 @@ if.end:                                           ; preds = %if.else, %if.then
 define spir_kernel void @test_varying_if(i32 %a, i32* %b) {
 entry:
   %conv = sext i32 %a to i64
-  %call = call spir_func i64 @_Z13get_global_idj(i32 2)
+  %call = call i64 @__mux_get_global_id(i32 2)
   %cmp = icmp eq i64 %conv, %call
   br i1 %cmp, label %if.then, label %if.else
 
@@ -63,7 +63,7 @@ if.end:                                           ; preds = %if.else, %if.then
 
 define spir_kernel void @test_uniform_loop(i32 %a, i32* %b)  {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 2)
+  %call = call i64 @__mux_get_global_id(i32 2)
   %conv = trunc i64 %call to i32
   br label %for.cond
 
@@ -87,7 +87,7 @@ for.end:                                          ; preds = %for.cond
 
 define spir_kernel void @test_varying_loop(i32 %a, i32* %b) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 2)
+  %call = call i64 @__mux_get_global_id(i32 2)
   %conv = trunc i64 %call to i32
   %sub = sub nsw i32 16, %conv
   br label %for.cond
@@ -112,7 +112,7 @@ for.end:                                          ; preds = %for.cond
 
 define spir_kernel void @test_nested_loops(i32* %a, i32* %b)  {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 2)
+  %call = call i64 @__mux_get_global_id(i32 2)
   %conv = trunc i64 %call to i32
   %sub = sub nsw i32 16, %conv
   br label %for.cond
@@ -153,7 +153,7 @@ for.end14:                                        ; preds = %for.cond
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; A nested loop, in the form of
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_ptrs.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_ptrs.ll
index cb505d4e4a5ab..5232baa40e5eb 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_ptrs.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_ptrs.ll
@@ -19,12 +19,12 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 define spir_kernel void @test_varying_if_ptr(i32 %a, i32** %b, i32* %on_true, i32* %on_false) {
 entry:
   %conv = sext i32 %a to i64
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %cmp = icmp eq i64 %conv, %call
   br i1 %cmp, label %if.then, label %if.else
 
@@ -53,7 +53,7 @@ if.end:
 define spir_kernel void @test_varying_if_ptrptr(i32 %a, i32*** %b, i32** %on_true, i32** %on_false) {
 entry:
   %conv = sext i32 %a to i64
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %cmp = icmp eq i64 %conv, %call
   br i1 %cmp, label %if.then, label %if.else
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_uniform_if.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_uniform_if.ll
index bf1b541acb2da..4447d91905941 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_uniform_if.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_uniform_if.ll
@@ -42,7 +42,7 @@ if.end:                                           ; preds = %if.else, %if.then
 define spir_kernel void @test_varying_if(i32 %a, i32* %b) {
 entry:
   %conv = sext i32 %a to i64
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %cmp = icmp eq i64 %conv, %call
   br i1 %cmp, label %if.then, label %if.else
 
@@ -63,7 +63,7 @@ if.end:                                           ; preds = %if.else, %if.then
 
 define spir_kernel void @test_uniform_loop(i32 %a, i32* %b)  {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %conv = trunc i64 %call to i32
   br label %for.cond
 
@@ -87,7 +87,7 @@ for.end:                                          ; preds = %for.cond
 
 define spir_kernel void @test_varying_loop(i32 %a, i32* %b) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %conv = trunc i64 %call to i32
   %sub = sub nsw i32 16, %conv
   br label %for.cond
@@ -112,7 +112,7 @@ for.end:                                          ; preds = %for.cond
 
 define spir_kernel void @test_nested_loops(i32* %a, i32* %b)  {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %conv = trunc i64 %call to i32
   %sub = sub nsw i32 16, %conv
   br label %for.cond
@@ -153,7 +153,7 @@ for.end14:                                        ; preds = %for.cond
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; This tests a uniform if statement that shouldn't be touched by the CFC pass
 ; CHECK: define spir_kernel void @__vecz_v4_test_uniform_if(i32 %a, ptr %b)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_uniform_loop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_uniform_loop.ll
index a2b5802814f92..2e9d562436b99 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_uniform_loop.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_uniform_loop.ll
@@ -42,7 +42,7 @@ if.end:                                           ; preds = %if.else, %if.then
 define spir_kernel void @test_varying_if(i32 %a, i32* %b) {
 entry:
   %conv = sext i32 %a to i64
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %cmp = icmp eq i64 %conv, %call
   br i1 %cmp, label %if.then, label %if.else
 
@@ -63,7 +63,7 @@ if.end:                                           ; preds = %if.else, %if.then
 
 define spir_kernel void @test_uniform_loop(i32 %a, i32* %b)  {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %conv = trunc i64 %call to i32
   br label %for.cond
 
@@ -87,7 +87,7 @@ for.end:                                          ; preds = %for.cond
 
 define spir_kernel void @test_varying_loop(i32 %a, i32* %b) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %conv = trunc i64 %call to i32
   %sub = sub nsw i32 16, %conv
   br label %for.cond
@@ -112,7 +112,7 @@ for.end:                                          ; preds = %for.cond
 
 define spir_kernel void @test_nested_loops(i32* %a, i32* %b)  {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %conv = trunc i64 %call to i32
   %sub = sub nsw i32 16, %conv
   br label %for.cond
@@ -153,7 +153,7 @@ for.end14:                                        ; preds = %for.cond
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; This tests for a uniform loop that should remain untouched by the CFC pass
 ; CHECK: define spir_kernel void @__vecz_v4_test_uniform_loop(i32 %a, ptr %b)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_varying_if.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_varying_if.ll
index e296fda5c03f6..817e443ddbfc2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_varying_if.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_varying_if.ll
@@ -42,7 +42,7 @@ if.end:                                           ; preds = %if.else, %if.then
 define spir_kernel void @test_varying_if(i32 %a, i32* %b) {
 entry:
   %conv = sext i32 %a to i64
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %cmp = icmp eq i64 %conv, %call
   br i1 %cmp, label %if.then, label %if.else
 
@@ -63,7 +63,7 @@ if.end:                                           ; preds = %if.else, %if.then
 
 define spir_kernel void @test_uniform_loop(i32 %a, i32* %b)  {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %conv = trunc i64 %call to i32
   br label %for.cond
 
@@ -87,7 +87,7 @@ for.end:                                          ; preds = %for.cond
 
 define spir_kernel void @test_varying_loop(i32 %a, i32* %b) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %conv = trunc i64 %call to i32
   %sub = sub nsw i32 16, %conv
   br label %for.cond
@@ -112,7 +112,7 @@ for.end:                                          ; preds = %for.cond
 
 define spir_kernel void @test_nested_loops(i32* %a, i32* %b)  {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %conv = trunc i64 %call to i32
   %sub = sub nsw i32 16, %conv
   br label %for.cond
@@ -153,7 +153,7 @@ for.end14:                                        ; preds = %for.cond
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; Check for a varying that needs masked operations
 ; CHECK: define spir_kernel void @__vecz_v4_test_varying_if(i32 %a, ptr %b)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_varying_loop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_varying_loop.ll
index e9a2c8d3abb86..09f1f73e0c8e3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_varying_loop.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_varying_loop.ll
@@ -42,7 +42,7 @@ if.end:                                           ; preds = %if.else, %if.then
 define spir_kernel void @test_varying_if(i32 %a, i32* %b) {
 entry:
   %conv = sext i32 %a to i64
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %cmp = icmp eq i64 %conv, %call
   br i1 %cmp, label %if.then, label %if.else
 
@@ -63,7 +63,7 @@ if.end:                                           ; preds = %if.else, %if.then
 
 define spir_kernel void @test_uniform_loop(i32 %a, i32* %b)  {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %conv = trunc i64 %call to i32
   br label %for.cond
 
@@ -87,7 +87,7 @@ for.end:                                          ; preds = %for.cond
 
 define spir_kernel void @test_varying_loop(i32 %a, i32* %b) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %conv = trunc i64 %call to i32
   %sub = sub nsw i32 16, %conv
   br label %for.cond
@@ -112,7 +112,7 @@ for.end:                                          ; preds = %for.cond
 
 define spir_kernel void @test_nested_loops(i32* %a, i32* %b)  {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %conv = trunc i64 %call to i32
   %sub = sub nsw i32 16, %conv
   br label %for.cond
@@ -153,7 +153,7 @@ for.end14:                                        ; preds = %for.cond
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; The loop's start condition depends on the global ID
 ; Note that the mask names are hardcoded in vecz, if they change they need to be
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert3.ll
index f293ea4a29469..216f1e5ca00cf 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert3.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert3.ll
@@ -24,7 +24,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: convergent nounwind
 define spir_kernel void @convert3(i64 addrspace(1)* %src, float addrspace(1)* %dest) local_unnamed_addr {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %call1 = tail call spir_func <3 x i64> @_Z6vload3mPU3AS1Kl(i64 %call, i64 addrspace(1)* %src)
   %call2 = tail call spir_func <3 x float> @_Z14convert_float3Dv3_l(<3 x i64> %call1)
   tail call spir_func void @_Z7vstore3Dv3_fmPU3AS1f(<3 x float> %call2, i64 %call, float addrspace(1)* %dest)
@@ -32,7 +32,7 @@ entry:
 }
 
 ; Function Attrs: convergent nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) local_unnamed_addr
+declare i64 @__mux_get_global_id(i32) local_unnamed_addr
 
 ; Function Attrs: convergent nounwind
 declare spir_func void @_Z7vstore3Dv3_fmPU3AS1f(<3 x float>, i64, float addrspace(1)*) local_unnamed_addr
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert4.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert4.ll
index 486b1721f7aff..34e5d1449e10c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert4.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert4.ll
@@ -24,7 +24,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: convergent nofree nounwind
 define spir_kernel void @convert4(<4 x i64> addrspace(1)* nocapture readonly %in, <4 x float> addrspace(1)* nocapture %out) local_unnamed_addr {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %in, i64 %call
   %0 = load <4 x i64>, <4 x i64> addrspace(1)* %arrayidx, align 32
   %call1 = tail call spir_func <4 x float> @_Z14convert_float4Dv4_l(<4 x i64> %0)
@@ -34,7 +34,7 @@ entry:
 }
 
 ; Function Attrs: convergent nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) local_unnamed_addr
+declare i64 @__mux_get_global_id(i32) local_unnamed_addr
 
 ; Function Attrs: convergent nounwind readnone
 declare spir_func <4 x float> @_Z14convert_float4Dv4_l(<4 x i64>) local_unnamed_addr
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert_contiguity.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert_contiguity.ll
index d48d1da9a6f01..79952d45c1464 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert_contiguity.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert_contiguity.ll
@@ -21,7 +21,7 @@ target triple = "spir-unknown-unknown"
 
 ; Function Attrs: nounwind
 define spir_kernel void @convert_contiguity(float addrspace(1)* %m_ptr) {
-  %1 = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %1 = call i64 @__mux_get_global_id(i32 0)
   %2 = call spir_func i32 @_Z12convert_uintm(i64 %1)
   %3 = icmp slt i32 %2, 100
   %4 = select i1 %3, float 1.000000e+00, float 0.000000e+00
@@ -38,7 +38,7 @@ declare spir_func i32 @_Z12convert_uintm(i64)
 declare spir_func i64 @_Z12convert_longi(i32)
 
 ; Function Attrs: nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; It checks that the store address was identified as congituous through the
 ; OpenCL convert builtin function
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load.ll
index f7297e315c1f9..45a177ad18b3f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load.ll
@@ -21,7 +21,7 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @test(i64 %a, i64 %b, i64* %c) {
 entry:
-  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %gid = call i64 @__mux_get_global_id(i32 0)
   %cond = icmp eq i64 %a, %gid
   %c0 = getelementptr i64, i64* %c, i64 %gid
   store i64 %b, i64* %c0, align 4
@@ -35,7 +35,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; Test if the scatter store is defined correctly
 ; CHECK: define <4 x i64> @__vecz_b_gather_load4_Dv4_mDv4_u3ptr(<4 x ptr>{{( %0)?}}) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load_as_masked.ll
index 9f0002c0e2aec..b287804080553 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load_as_masked.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load_as_masked.ll
@@ -21,7 +21,7 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @test(i64 %a, i64 %b, i64* %c) {
 entry:
-  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %gid = call i64 @__mux_get_global_id(i32 0)
   %cond = icmp eq i64 %a, %gid
   %c0 = getelementptr i64, i64* %c, i64 %gid
   store i64 %b, i64* %c0, align 4
@@ -35,7 +35,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; Test if the scatter store is defined correctly
 ; CHECK: define <4 x i64> @__vecz_b_gather_load4_Dv4_mDv4_u3ptr(<4 x ptr>{{( %0)?}}) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load.ll
index baf957450de28..1e059c4a8525d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load.ll
@@ -22,11 +22,11 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #3
+  %call = call i64 @__mux_get_global_id(i32 0) #3
   %add.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call
   %.cast = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %add.ptr, i64 0, i64 0
   %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
-  call spir_func void @_Z7barrierj(i32 2) #3
+  call void @__mux_work_group_barrier(i32 0, i32 2, i32 528) #3
   store double 1.600000e+01, double addrspace(1)* %.cast, align 8
   %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
   %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
@@ -46,9 +46,9 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
-declare spir_func void @_Z7barrierj(i32)
+declare void @__mux_work_group_barrier(i32, i32, i32)
 
 ; Function Attrs: nounwind readnone
 declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load_as_masked.ll
index 654baf92be9c3..b1334c7aafb7d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load_as_masked.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load_as_masked.ll
@@ -23,11 +23,11 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #3
+  %call = call i64 @__mux_get_global_id(i32 0) #3
   %add.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call
   %.cast = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %add.ptr, i64 0, i64 0
   %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
-  call spir_func void @_Z7barrierj(i32 2) #3
+  call void @__mux_work_group_barrier(i32 0, i32 2, i32 528) #3
   store double 1.600000e+01, double addrspace(1)* %.cast, align 8
   %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
   %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
@@ -47,9 +47,9 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
-declare spir_func void @_Z7barrierj(i32) #1
+declare void @__mux_work_group_barrier(i32, i32, i32) #1
 
 ; Function Attrs: nounwind readnone
 declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) #2
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store.ll
index 9e2ff23dd69ab..e46e743c04a80 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store.ll
@@ -22,11 +22,11 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %add.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call
   %.cast = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %add.ptr, i64 0, i64 0
   %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
-  call spir_func void @_Z7barrierj(i32 2)
+  call void @__mux_work_group_barrier(i32 0, i32 2, i32 528)
   store double 1.600000e+01, double addrspace(1)* %.cast, align 8
   %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
   %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
@@ -46,9 +46,9 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
-declare spir_func void @_Z7barrierj(i32)
+declare void @__mux_work_group_barrier(i32, i32, i32)
 
 ; Function Attrs: nounwind readnone
 declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store_as_masked.ll
index 75acd7aec4198..60834e45633b8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store_as_masked.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store_as_masked.ll
@@ -23,11 +23,11 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #3
+  %call = call i64 @__mux_get_global_id(i32 0) #3
   %add.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call
   %.cast = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %add.ptr, i64 0, i64 0
   %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
-  call spir_func void @_Z7barrierj(i32 2) #3
+  call void @__mux_work_group_barrier(i32 0, i32 2, i32 528) #3
   store double 1.600000e+01, double addrspace(1)* %.cast, align 8
   %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
   %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
@@ -47,9 +47,9 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
-declare spir_func void @_Z7barrierj(i32) #1
+declare void @__mux_work_group_barrier(i32, i32, i32) #1
 
 ; Function Attrs: nounwind readnone
 declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) #2
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_gather_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_gather_load.ll
index d01b84639a4d0..ea009d8ab6739 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_gather_load.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_gather_load.ll
@@ -21,7 +21,7 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @masked_scatter(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %b_index) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %rem = urem i64 %call, 3
   %cmp = icmp eq i64 %rem, 0
   br i1 %cmp, label %if.else, label %if.then
@@ -50,7 +50,7 @@ if.end:                                           ; preds = %if.else, %if.then
 
 define spir_kernel void @masked_gather(i32 addrspace(1)* %a, i32 addrspace(1)* %a_index, i32 addrspace(1)* %b) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %rem = urem i64 %call, 3
   %cmp = icmp eq i64 %rem, 0
   br i1 %cmp, label %if.else, label %if.then
@@ -74,7 +74,7 @@ if.end:                                           ; preds = %if.else, %if.then
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; Test if the masked gather load is defined correctly
 ; CHECK: define <4 x i32> @__vecz_b_masked_gather_load4_Dv4_jDv4_u3ptrU3AS1Dv4_b(<4 x ptr addrspace(1)>{{( %0)?}}, <4 x i1>{{( %1)?}})
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_load.ll
index b4b1892965ab1..28767529ffa54 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_load.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_load.ll
@@ -23,13 +23,13 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @dont_mask_workitem_builtins(i32 addrspace(2)* %in, i32 addrspace(1)* %out) #0 {
 entry:
-  %call = call spir_func i64 @_Z12get_local_idj(i32 0) #5
+  %call = call i64 @__mux_get_local_id(i32 0) #5
   %conv = trunc i64 %call to i32
   %cmp = icmp sgt i32 %conv, 0
   br i1 %cmp, label %if.then, label %if.else
 
 if.then:                                          ; preds = %entry
-  %call2 = call spir_func i64 @_Z13get_global_idj(i32 0) #5
+  %call2 = call i64 @__mux_get_global_id(i32 0) #5
   %conv3 = trunc i64 %call2 to i32
   %idxprom = sext i32 %conv3 to i64
   %arrayidx = getelementptr inbounds i32, i32 addrspace(2)* %in, i64 %idxprom
@@ -40,8 +40,8 @@ if.then:                                          ; preds = %entry
   br label %if.end
 
 if.else:                                          ; preds = %entry
-  %call8 = call spir_func i64 @_Z14get_local_sizej(i32 0) #5
-  %call9 = call spir_func i64 @_Z12get_group_idj(i32 0) #5
+  %call8 = call i64 @__mux_get_local_size(i32 0) #5
+  %call9 = call i64 @__mux_get_group_id(i32 0) #5
   %mul = mul i64 %call9, %call8
   %add = add i64 %mul, %call
   %sext = shl i64 %add, 32
@@ -54,13 +54,13 @@ if.end:                                           ; preds = %if.else, %if.then
   ret void
 }
 
-declare spir_func i64 @_Z12get_local_idj(i32) #1
+declare i64 @__mux_get_local_id(i32) #1
 
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
-declare spir_func i64 @_Z14get_local_sizej(i32) #1
+declare i64 @__mux_get_local_size(i32) #1
 
-declare spir_func i64 @_Z12get_group_idj(i32) #1
+declare i64 @__mux_get_group_id(i32) #1
 
 attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_scatter_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_scatter_store.ll
index 54087d88be1bf..beab4dbd5c4e6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_scatter_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_scatter_store.ll
@@ -21,7 +21,7 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @masked_scatter(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %b_index) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %rem = urem i64 %call, 3
   %cmp = icmp eq i64 %rem, 0
   br i1 %cmp, label %if.else, label %if.then
@@ -50,7 +50,7 @@ if.end:                                           ; preds = %if.else, %if.then
 
 define spir_kernel void @masked_gather(i32 addrspace(1)* %a, i32 addrspace(1)* %a_index, i32 addrspace(1)* %b) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %rem = urem i64 %call, 3
   %cmp = icmp eq i64 %rem, 0
   br i1 %cmp, label %if.else, label %if.then
@@ -74,7 +74,7 @@ if.end:                                           ; preds = %if.else, %if.then
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; Test if the masked scatter store is defined correctly
 ; CHECK: define void @__vecz_b_masked_scatter_store4_Dv4_jDv4_u3ptrU3AS1Dv4_b(<4 x i32>{{( %0)?}}, <4 x ptr addrspace(1)>{{( %1)?}}, <4 x i1>{{( %2)?}})
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_store.ll
index 33164117b644c..e700c04555d97 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_store.ll
@@ -23,13 +23,13 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @dont_mask_workitem_builtins(i32 addrspace(2)* %in, i32 addrspace(1)* %out) #0 {
 entry:
-  %call = call spir_func i64 @_Z12get_local_idj(i32 0) #5
+  %call = call i64 @__mux_get_local_id(i32 0) #5
   %conv = trunc i64 %call to i32
   %cmp = icmp sgt i32 %conv, 0
   br i1 %cmp, label %if.then, label %if.else
 
 if.then:                                          ; preds = %entry
-  %call2 = call spir_func i64 @_Z13get_global_idj(i32 0) #5
+  %call2 = call i64 @__mux_get_global_id(i32 0) #5
   %conv3 = trunc i64 %call2 to i32
   %idxprom = sext i32 %conv3 to i64
   %arrayidx = getelementptr inbounds i32, i32 addrspace(2)* %in, i64 %idxprom
@@ -40,8 +40,8 @@ if.then:                                          ; preds = %entry
   br label %if.end
 
 if.else:                                          ; preds = %entry
-  %call8 = call spir_func i64 @_Z14get_local_sizej(i32 0) #5
-  %call9 = call spir_func i64 @_Z12get_group_idj(i32 0) #5
+  %call8 = call i64 @__mux_get_local_size(i32 0) #5
+  %call9 = call i64 @__mux_get_group_id(i32 0) #5
   %mul = mul i64 %call9, %call8
   %add = add i64 %mul, %call
   %sext = shl i64 %add, 32
@@ -54,13 +54,13 @@ if.end:                                           ; preds = %if.else, %if.then
   ret void
 }
 
-declare spir_func i64 @_Z12get_local_idj(i32) #1
+declare i64 @__mux_get_local_id(i32) #1
 
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
-declare spir_func i64 @_Z14get_local_sizej(i32) #1
+declare i64 @__mux_get_local_size(i32) #1
 
-declare spir_func i64 @_Z12get_group_idj(i32) #1
+declare i64 @__mux_get_group_id(i32) #1
 
 attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store.ll
index 312b02601413a..e41f41d52715d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store.ll
@@ -21,7 +21,7 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @test(i64 %a, i64 %b, i64* %c) {
 entry:
-  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %gid = call i64 @__mux_get_global_id(i32 0)
   %cond = icmp eq i64 %a, %gid
   %c0 = getelementptr i64, i64* %c, i64 %gid
   store i64 %b, i64* %c0, align 4
@@ -35,7 +35,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; Test if the scatter store is defined correctly
 ; CHECK: define void @__vecz_b_scatter_store4_Dv4_mDv4_u3ptr(<4 x i64>{{( %0)?}}, <4 x ptr>{{( %1)?}}) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store_as_masked.ll
index b2fd7d0beb74b..768599fcb72d1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store_as_masked.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store_as_masked.ll
@@ -21,7 +21,7 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @test(i64 %a, i64 %b, i64* %c) {
 entry:
-  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %gid = call i64 @__mux_get_global_id(i32 0)
   %cond = icmp eq i64 %a, %gid
   %c0 = getelementptr i64, i64* %c, i64 %gid
   store i64 %b, i64* %c0, align 4
@@ -35,7 +35,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; Test if the scatter store is defined correctly
 ; CHECK: define void @__vecz_b_scatter_store4_Dv4_mDv4_u3ptr(<4 x i64>{{( %0)?}}, <4 x ptr>{{( %1)?}}) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/delete_packetized_memop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/delete_packetized_memop.ll
index 1589a02710990..b3465aa688f19 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/delete_packetized_memop.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/delete_packetized_memop.ll
@@ -23,7 +23,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @memop_loop_dep(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i32 %i, i32 %e) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   br label %for.cond
 
 for.cond:                                         ; preds = %for.inc, %entry
@@ -52,7 +52,7 @@ for.end:                                          ; preds = %for.cond
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 declare spir_func <4 x i32> @_Z6vload4mPKU3AS1i(i64, i32 addrspace(1)*)
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/early-cse-mul-swap.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/early-cse-mul-swap.ll
index fa50eb13120b4..988d7926097e4 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/early-cse-mul-swap.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/early-cse-mul-swap.ll
@@ -22,23 +22,23 @@ target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:
 target triple = "spir64-unknown-unknown"
 
 ; Function Attrs: convergent nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 ; Function Attrs: convergent nounwind readonly
-declare spir_func i64 @_Z15get_global_sizej(i32) #1
+declare i64 @__mux_get_global_size(i32) #1
 
 ; Function Attrs: convergent nounwind
 define spir_kernel void @multiple_dimensions_0(i32 addrspace(1)* %output) #2 {
 entry:
-  %call.i = call spir_func i64 @_Z13get_global_idj(i32 0) #3
-  %call1.i = call spir_func i64 @_Z15get_global_sizej(i32 1) #3
+  %call.i = call i64 @__mux_get_global_id(i32 0) #3
+  %call1.i = call i64 @__mux_get_global_size(i32 1) #3
   %mul.i = mul i64 %call1.i, %call.i
-  %call2.i = call spir_func i64 @_Z15get_global_sizej(i32 2) #3
+  %call2.i = call i64 @__mux_get_global_size(i32 2) #3
   %mul3.i = mul i64 %mul.i, %call2.i
-  %call4.i = call spir_func i64 @_Z13get_global_idj(i32 1) #3
+  %call4.i = call i64 @__mux_get_global_id(i32 1) #3
   %mul6.i = mul i64 %call2.i, %call4.i
   %add.i = add i64 %mul6.i, %mul3.i
-  %call7.i = call spir_func i64 @_Z13get_global_idj(i32 2) #3
+  %call7.i = call i64 @__mux_get_global_id(i32 2) #3
   %add8.i = add i64 %add.i, %call7.i
   %conv = trunc i64 %add8.i to i32
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %output, i64 %add8.i
@@ -72,7 +72,7 @@ attributes #3 = { convergent nobuiltin nounwind readonly }
 ; CHECK: define spir_kernel void @__vecz_v4_multiple_dimensions_0
 
 ; make sure the stride calculation uses the correct operand of the multiply
-; CHECK: %[[CALL1:.+]] = call spir_func i64 @_Z15get_global_sizej(i32 1)
-; CHECK: %[[CALL2:.+]] = call spir_func i64 @_Z15get_global_sizej(i32 2)
+; CHECK: %[[CALL1:.+]] = call i64 @__mux_get_global_size(i32 1)
+; CHECK: %[[CALL2:.+]] = call i64 @__mux_get_global_size(i32 2)
 ; CHECK: %[[NEWMUL:.+]] = mul i64 %[[CALL1]], %[[CALL2]]
 ; CHECK: call void @__vecz_b_interleaved_store4_V_Dv4_ju3ptrU3AS1({{.+}} %[[NEWMUL]])
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_memintrinsics.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_memintrinsics.ll
index 7087195ebce69..170c3ae5fd090 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_memintrinsics.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_memintrinsics.ll
@@ -25,7 +25,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: norecurse nounwind
 define spir_kernel void @entry(i64 addrspace(1)* %result, %struct.S2* %result2) {
 entry:
-  %gid = call i64 @_Z12get_local_idj(i32 0)
+  %gid = call i64 @__mux_get_local_id(i32 0)
   %sa = alloca %struct.S2, align 16
   %sb = alloca %struct.S2, align 16
   %sa_i8 = bitcast %struct.S2* %sa to i8*
@@ -63,7 +63,7 @@ declare void @llvm.memset.p1i8.i64(i8 addrspace(1)* nocapture, i8, i64, i32, i1)
 declare void @llvm.memcpy.p1i8.p0i8.i64(i8 addrspace(1)* nocapture, i8* nocapture readonly, i64, i32, i1)
 declare void @llvm.memcpy.p0i8.p1i8.i64(i8* nocapture, i8 addrspace(1)* nocapture readonly, i64, i32, i1)
 
-declare i64 @_Z12get_local_idj(i32)
+declare i64 @__mux_get_local_id(i32)
 
 ; Sanity checks: Make sure the non-vecz entry function is still in place and
 ; contains memset and memcpy. This is done in order to prevent future bafflement
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_no_unaligned_memintrinsics.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_no_unaligned_memintrinsics.ll
index 4cb13295f14c6..2f8e542423592 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_no_unaligned_memintrinsics.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_no_unaligned_memintrinsics.ll
@@ -25,7 +25,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: norecurse nounwind
 define spir_kernel void @entry(i64 addrspace(1)* %result, %struct.S2* %result2) {
 entry:
-  %gid = call i64 @_Z12get_local_idj(i32 0)
+  %gid = call i64 @__mux_get_local_id(i32 0)
   %sa = alloca %struct.S2, align 16
   %sb = alloca %struct.S2, align 16
   %sa_i8 = bitcast %struct.S2* %sa to i8*
@@ -64,7 +64,7 @@ declare void @llvm.memset.p1i8.i64(i8 addrspace(1)* nocapture, i8, i64, i32, i1)
 declare void @llvm.memcpy.p1i8.p0i8.i64(i8 addrspace(1)* nocapture, i8* nocapture readonly, i64, i32, i1)
 declare void @llvm.memcpy.p0i8.p1i8.i64(i8* nocapture, i8 addrspace(1)* nocapture readonly, i64, i32, i1)
 
-declare i64 @_Z12get_local_idj(i32)
+declare i64 @__mux_get_local_id(i32)
 
 ; Sanity checks: Make sure the non-vecz entry function is still in place and
 ; contains memset and memcpy. This is done in order to prevent future bafflement
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/expect_assume.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/expect_assume.ll
index 1f9b36c4c6d2d..9969f607c5b7a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/expect_assume.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/expect_assume.ll
@@ -19,7 +19,7 @@
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 declare void @llvm.assume(i1)
 declare i32 @llvm.expect.i32(i32, i32)
@@ -40,7 +40,7 @@ declare i32 @llvm.expect.i32(i32, i32)
 ; CHECK: store <4 x i32> [[SUM]], ptr %arrayidxz, align 4
 define spir_kernel void @assume(ptr %aptr, ptr %bptr, ptr %zptr) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidxa = getelementptr inbounds i32, ptr %aptr, i64 %idx
   %arrayidxb = getelementptr inbounds i32, ptr %bptr, i64 %idx
   %arrayidxz = getelementptr inbounds i32, ptr %zptr, i64 %idx
@@ -73,7 +73,7 @@ entry:
 
 define spir_kernel void @expect(ptr %aptr, ptr %bptr, ptr %zptr) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidxa = getelementptr inbounds i32, ptr %aptr, i64 %idx
   %arrayidxb = getelementptr inbounds i32, ptr %bptr, i64 %idx
   %arrayidxz = getelementptr inbounds i32, ptr %zptr, i64 %idx
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/extractelement_constant_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/extractelement_constant_index.ll
index f1580d1663091..60efefe3b05bf 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/extractelement_constant_index.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/extractelement_constant_index.ll
@@ -22,7 +22,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @extract_constant_index(<4 x float> addrspace(1)* %in, i32 %x, float addrspace(1)* %out) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 4
   %vecext = extractelement <4 x float> %0, i32 0;
@@ -31,7 +31,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 ; CHECK: define spir_kernel void @__vecz_v4_extract_constant_index
 ; CHECK: call <4 x float> @__vecz_b_interleaved_load4_4_Dv4
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/extractelement_runtime_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/extractelement_runtime_index.ll
index 5e2867c7a3796..61927cde9dd53 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/extractelement_runtime_index.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/extractelement_runtime_index.ll
@@ -19,12 +19,12 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 ; Function Attrs: nounwind
 define spir_kernel void @extract_runtime_index(<4 x float> addrspace(1)* %in, i32 %x, float addrspace(1)* %out) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 4
   %vecext = extractelement <4 x float> %0, i32 %x
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_duplication.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_duplication.ll
index 3d41ea9e08c45..6cc4145e497e0 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_duplication.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_duplication.ll
@@ -27,7 +27,7 @@ entry:
   %global_id = alloca i32, align 4
   %myStruct = alloca %struct.testStruct, align 4
   store ptr addrspace(1) %out, ptr %out.addr, align 8
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   store i32 %conv, ptr %global_id, align 4
   %x = getelementptr inbounds %struct.testStruct, ptr %myStruct, i32 0, i32 0
@@ -67,7 +67,7 @@ if.end:                                           ; preds = %if.else, %if.then
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; CHECK: spir_kernel void @__vecz_v{{[0-9]+}}_gep_duplication
 ; CHECK: entry:
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_elim_opaque.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_elim_opaque.ll
index 003370383d2be..1e738db395c2e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_elim_opaque.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_elim_opaque.ll
@@ -18,8 +18,8 @@
 ; RUN: veczc -k test -vecz-simd-width=4 -vecz-passes=gep-elim -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i32:32-f80:128-n8:16:32:64-S128"
+target triple = "spir-unknown-unknown"
 
 %struct.mystruct = type { [2 x i32], ptr }
 
@@ -27,7 +27,7 @@ target triple = "spir64-unknown-unknown"
 define spir_kernel void @test(ptr addrspace(1) nocapture writeonly align 4 %output) {
 entry:
   %foo = alloca [4 x %struct.mystruct], align 4
-  %call = tail call spir_func i32 @_Z13get_global_idj(i32 0)
+  %call = tail call i32 @__mux_get_global_id(i32 0)
   store i32 20, ptr %foo, align 4
   %arrayidx4 = getelementptr inbounds [2 x i32], ptr %foo, i32 0, i32 1
   store i32 22, ptr %arrayidx4, align 4
@@ -43,7 +43,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i32 @__mux_get_global_id(i32)
 
 ; CHECK: define spir_kernel void @__vecz_v4_test(
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/inlined_function_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/inlined_function_debug_info.ll
index d0635ba750e1b..c6641f8aee1dc 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/inlined_function_debug_info.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/inlined_function_debug_info.ll
@@ -43,7 +43,7 @@ entry:
   call void @llvm.dbg.value(metadata float addrspace(1)* %in2f, i64 0, metadata !21, metadata !38), !dbg !41
   call void @llvm.dbg.value(metadata i32 addrspace(1)* %out1i, i64 0, metadata !22, metadata !38), !dbg !41
   call void @llvm.dbg.value(metadata float addrspace(1)* %out1f, i64 0, metadata !23, metadata !38), !dbg !41
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #4, !dbg !42
+  %call = call i64 @__mux_get_global_id(i32 0) #4, !dbg !42
   call void @llvm.dbg.value(metadata i64 %call, i64 0, metadata !24, metadata !38), !dbg !42
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in1i, i64 %call, !dbg !43
   %0 = load i32, i32 addrspace(1)* %arrayidx, align 4, !dbg !43
@@ -57,7 +57,7 @@ entry:
   ret void, !dbg !47
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32) #3
+declare i64 @__mux_get_global_id(i32) #3
 
 ; Function Attrs: nounwind readnone
 declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insert_element_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insert_element_debug_info.ll
index d1b9f68d77533..bd3430a9f8214 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insert_element_debug_info.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insert_element_debug_info.ll
@@ -39,7 +39,7 @@ entry:
   store i32 addrspace(1)* %out, i32 addrspace(1)** %out.addr, align 8
   call void @llvm.dbg.declare(metadata i32 addrspace(1)** %out.addr, metadata !13, metadata !29), !dbg !30
   call void @llvm.dbg.declare(metadata i32* %tid, metadata !14, metadata !29), !dbg !31
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #3, !dbg !31
+  %call = call i64 @__mux_get_global_id(i32 0) #3, !dbg !31
   %conv = trunc i64 %call to i32, !dbg !31
   store i32 %conv, i32* %tid, align 4, !dbg !31
   call void @llvm.dbg.declare(metadata <3 x i32>* %tmp, metadata !15, metadata !29), !dbg !32
@@ -84,7 +84,7 @@ entry:
 ; Function Attrs: nounwind readnone
 declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
-declare spir_func i64 @_Z13get_global_idj(i32) #2
+declare i64 @__mux_get_global_id(i32) #2
 
 declare spir_func <3 x i32> @_Z6vload3mPKU3AS1i(i64, i32 addrspace(1)*) #2
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insertelement_constant_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insertelement_constant_index.ll
index 72678bd74e68e..29f09991d0e7c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insertelement_constant_index.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insertelement_constant_index.ll
@@ -19,11 +19,11 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 define spir_kernel void @constant_index(<4 x i32>* %in, <4 x i32>* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x i32>, <4 x i32>* %in, i64 %call
   %0 = load <4 x i32>, <4 x i32>* %arrayidx
   %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32>* %out, i64 %call
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insertelement_runtime_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insertelement_runtime_index.ll
index dbac3ca4c45af..6ffafa29877cd 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insertelement_runtime_index.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insertelement_runtime_index.ll
@@ -19,11 +19,11 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 define spir_kernel void @runtime_index(<4 x i32>* %in, <4 x i32>* %out, i32* %index) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x i32>, <4 x i32>* %in, i64 %call
   %0 = load <4 x i32>, <4 x i32>* %arrayidx
   %arrayidx1 = getelementptr inbounds <4 x i32>, <4 x i32>* %out, i64 %call
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/instantiate_constants.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/instantiate_constants.ll
index 748862493eb56..06da56e483189 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/instantiate_constants.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/instantiate_constants.ll
@@ -28,7 +28,7 @@ entry:
   %0 = bitcast [1 x i16]* %data to i8*
   %arraydecay = getelementptr inbounds [1 x i16], [1 x i16]* %data, i64 0, i64 0
   %1 = bitcast [1 x i16]* %data to half*
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #5
+  %call = tail call i64 @__mux_get_global_id(i32 0) #5
   %arrayidx7 = getelementptr inbounds half, half addrspace(1)* %p, i64 %call
   %arrayidx = bitcast half addrspace(1)* %arrayidx7 to i16 addrspace(1)*
   %2 = load i16, i16 addrspace(1)* %arrayidx, align 2, !tbaa !9
@@ -40,7 +40,7 @@ entry:
 }
 
 ; Function Attrs: convergent nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) local_unnamed_addr #2
+declare i64 @__mux_get_global_id(i32) local_unnamed_addr #2
 
 ; Function Attrs: convergent nounwind
 declare spir_func float @_Z11vloada_halfmPKDh(i64, half*) local_unnamed_addr #3
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_defuse_instantiated.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_defuse_instantiated.ll
index 09be8955197d9..42e076af5e4c4 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_defuse_instantiated.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_defuse_instantiated.ll
@@ -26,7 +26,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @printf_kernel(i32 addrspace(1)* %in, i32 addrspace(1)* %stridesX, i32 addrspace(1)* %dst, i32 %width, i32 %height) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #3
+  %call = call i64 @__mux_get_global_id(i32 0) #3
   %cmp = icmp eq i32 %width, 13
   br i1 %cmp, label %if.then, label %if.end
 
@@ -43,7 +43,7 @@ if.end:                                           ; preds = %if.then, %entry
 
 define spir_kernel void @test_float(float* %in) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float* %in, i64 %call
   %0 = load float, float* %arrayidx, align 4
   %mul = fmul float %0, %0
@@ -54,7 +54,7 @@ entry:
 
 
 
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 declare extern_weak spir_func i32 @printf(i8 addrspace(2)*, ...) #1
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_load16.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_load16.ll
index b691315757940..d0b6cafc01a5f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_load16.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_load16.ll
@@ -24,8 +24,8 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: convergent nounwind
 define spir_kernel void @load16(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %stride) #0 !shave_original_kernel !10 {
 entry:
-  %call = call spir_func i32 @_Z13get_global_idj(i32 0) #2
-  %call1 = call spir_func i32 @_Z13get_global_idj(i32 1) #2
+  %call = call i32 @__mux_get_global_id(i32 0) #2
+  %call1 = call i32 @__mux_get_global_id(i32 1) #2
   %mul = mul nsw i32 %call1, %stride
   %add = add nsw i32 %mul, %call
   %mul2 = shl nsw i32 %add, 1
@@ -46,7 +46,7 @@ entry:
 }
 
 ; Function Attrs: convergent nounwind readonly
-declare spir_func i32 @_Z13get_global_idj(i32) #1
+declare i32 @__mux_get_global_id(i32) #1
 
 attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_load_ooo.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_load_ooo.ll
index bc9c052e38ae8..6f48a1a77ff74 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_load_ooo.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_load_ooo.ll
@@ -23,9 +23,9 @@ target triple = "spir64-unknown-unknown"
 
 define dso_local spir_kernel void @interleaved_load_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %stride) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %conv = trunc i64 %call to i32
-  %call1 = tail call spir_func i64 @_Z13get_global_idj(i32 1)
+  %call1 = tail call i64 @__mux_get_global_id(i32 1)
   %conv2 = trunc i64 %call1 to i32
   %mul = mul nsw i32 %conv2, %stride
   %add = add nsw i32 %conv, %mul
@@ -54,5 +54,5 @@ entry:
 ; CHECK:  %sub1 = sub nsw <4 x i32> %deinterleave1, %deinterleave
 
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 declare <4 x i32> @__vecz_b_interleaved_load4_2_Dv4_jPU3AS1j(i32 addrspace(1)*)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_safety.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_safety.ll
index a9c4dbbc4ad46..0269e453ce22e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_safety.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_safety.ll
@@ -23,11 +23,11 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #3
+  %call = call i64 @__mux_get_global_id(i32 0) #3
   %add.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call
   %.cast = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %add.ptr, i64 0, i64 0
   %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
-  call spir_func void @_Z7barrierj(i32 2) #3
+  call void @__mux_work_group_barrier(i32 0, i32 2, i32 528) #3
   store double 1.600000e+01, double addrspace(1)* %.cast, align 8
   %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
   %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
@@ -47,9 +47,9 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
-declare spir_func void @_Z7barrierj(i32) #1
+declare void @__mux_work_group_barrier(i32, i32, i32) #1
 
 ; Function Attrs: nounwind readnone
 declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) #2
@@ -72,14 +72,14 @@ attributes #3 = { nobuiltin nounwind }
 
 ; Function start
 ; CHECK: define spir_kernel void @__vecz_v4_f
-; CHECK: call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: call i64 @__mux_get_global_id(i32 0)
 
 ; There should be exactly 4 interleaved loads and one store in the code
 ; CHECK: call <4 x double> @__vecz_b_interleaved_load8_4_Dv4_du3ptrU3AS1
 ; CHECK: call <4 x double> @__vecz_b_interleaved_load8_4_Dv4_du3ptrU3AS1
 
 ; And in between them there should be a barrier call
-; CHECK: call spir_func void @_Z7barrierj
+; CHECK: call void @__mux_work_group_barrier
 ; CHECK: call void @__vecz_b_interleaved_store8_4_Dv4_du3ptrU3AS1(<4 x double> <double 1.600000e+01, double 1.600000e+01, double 1.600000e+01, double 1.600000e+01>
 ; CHECK: call <4 x double> @__vecz_b_interleaved_load8_4_Dv4_du3ptrU3AS1
 ; CHECK: call <4 x double> @__vecz_b_interleaved_load8_4_Dv4_du3ptrU3AS1
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics-scalarize.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics-scalarize.ll
index c22625e138831..12cbf0c96934b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics-scalarize.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics-scalarize.ll
@@ -30,7 +30,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @ctpop(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
   %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx
   %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
@@ -46,7 +46,7 @@ entry:
 
 define spir_kernel void @ctlz(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
   %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx
   %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
@@ -62,7 +62,7 @@ entry:
 
 define spir_kernel void @cttz(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
   %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx
   %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
@@ -78,7 +78,7 @@ entry:
 
 define spir_kernel void @sadd_sat(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
   %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
   %a = load i32, i32* %arrayidxa, align 4
@@ -96,7 +96,7 @@ entry:
 
 define spir_kernel void @uadd_sat(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
   %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
   %a = load i32, i32* %arrayidxa, align 4
@@ -114,7 +114,7 @@ entry:
 
 define spir_kernel void @ssub_sat(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
   %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
   %a = load i32, i32* %arrayidxa, align 4
@@ -132,7 +132,7 @@ entry:
 
 define spir_kernel void @usub_sat(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
   %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
   %a = load i32, i32* %arrayidxa, align 4
@@ -169,7 +169,7 @@ declare <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8>, <2 x i8>)
 declare i32 @llvm.usub.sat.i32(i32, i32)
 declare <2 x i8> @llvm.usub.sat.v2i8(<2 x i8>, <2 x i8>)
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; CTPOP: void @__vecz_v2_ctpop
 ; CTPOP: = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %{{.*}})
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics.ll
index ae05f0b8932cf..ccdccba5a3d6d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics.ll
@@ -30,7 +30,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @ctpop(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
   %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx
   %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
@@ -46,7 +46,7 @@ entry:
 
 define spir_kernel void @ctlz(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
   %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx
   %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
@@ -62,7 +62,7 @@ entry:
 
 define spir_kernel void @cttz(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
   %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx
   %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
@@ -78,7 +78,7 @@ entry:
 
 define spir_kernel void @sadd_sat(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
   %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
   %a = load i32, i32* %arrayidxa, align 4
@@ -96,7 +96,7 @@ entry:
 
 define spir_kernel void @uadd_sat(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
   %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
   %a = load i32, i32* %arrayidxa, align 4
@@ -114,7 +114,7 @@ entry:
 
 define spir_kernel void @ssub_sat(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
   %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
   %a = load i32, i32* %arrayidxa, align 4
@@ -132,7 +132,7 @@ entry:
 
 define spir_kernel void @usub_sat(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
   %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
   %a = load i32, i32* %arrayidxa, align 4
@@ -169,7 +169,7 @@ declare <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8>, <2 x i8>)
 declare i32 @llvm.usub.sat.i32(i32, i32)
 declare <2 x i8> @llvm.usub.sat.v2i8(<2 x i8>, <2 x i8>)
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; CTPOP: void @__vecz_v2_ctpop
 ; CTPOP: = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %{{.*}})
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/irreducible_loop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/irreducible_loop.ll
index 13f94ed77f102..0cbf1c85c0ee5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/irreducible_loop.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/irreducible_loop.ll
@@ -24,7 +24,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: convergent nounwind
 define spir_kernel void @irreducible_loop(i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %call
   %ld = load i32, i32 addrspace(1)* %arrayidx4, align 4
   %cmp = icmp sgt i32 %ld, -1
@@ -45,7 +45,7 @@ do.end:                                           ; preds = %label
 }
 
 ; Function Attrs: convergent nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; CHECK: define spir_kernel void @__vecz_v4_irreducible_loop
 ; CHECK: entry:
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/loop_call_instantiation.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/loop_call_instantiation.ll
index e11efdfe3e407..d36b9e0c1f350 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/loop_call_instantiation.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/loop_call_instantiation.ll
@@ -24,7 +24,7 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @test(i32 addrspace(1)* %in) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
   %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
   %call1 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([23 x i8], [23 x i8] addrspace(2)* @.str, i64 0, i64 0), i64 %call, i32 %0)
@@ -32,7 +32,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 declare extern_weak spir_func i32 @printf(i8 addrspace(2)*, ...)
 
 ; CHECK: define spir_kernel void @__vecz_v4_test(ptr addrspace(1) %in)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_calls_max_builtin.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_calls_max_builtin.ll
index 99cc1e48195c0..c08d3b682f972 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_calls_max_builtin.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_calls_max_builtin.ll
@@ -29,7 +29,7 @@ target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @entry(ptr addrspace(1) %input, ptr addrspace(1) %output) {
 entry:
-  %call = tail call spir_func i64 @_Z12get_local_idj(i32 0)
+  %call = tail call i64 @__mux_get_local_id(i32 0)
   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %input, i64 %call
   %0 = load i32, ptr addrspace(1) %arrayidx, align 4
   %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %output, i64 %call
@@ -74,7 +74,7 @@ if.end:
   ret void
 }
 
-declare spir_func i64 @_Z12get_local_idj(i32)
+declare i64 @__mux_get_local_id(i32)
 
 declare spir_func i32 @_Z3maxii(i32, i32)
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved.ll
index fc2adc4ef9a0d..8283037ef3059 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved.ll
@@ -26,7 +26,7 @@ entry:
   %results.addr = alloca i32 addrspace(1)*, align 8
   %tid = alloca i32, align 4
   store i32 addrspace(1)* %results, i32 addrspace(1)** %results.addr, align 8
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   store i32 %conv, i32* %tid, align 4
   %0 = load i32, i32* %tid, align 4
@@ -47,7 +47,7 @@ if.end:                                           ; preds = %if.then, %entry
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_as_scatter.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_as_scatter.ll
index 96bf189710a6e..5166cab218e80 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_as_scatter.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_as_scatter.ll
@@ -26,7 +26,7 @@ entry:
   %results.addr = alloca i32 addrspace(1)*, align 8
   %tid = alloca i32, align 4
   store i32 addrspace(1)* %results, i32 addrspace(1)** %results.addr, align 8
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   store i32 %conv, i32* %tid, align 4
   %0 = load i32, i32* %tid, align 4
@@ -47,7 +47,7 @@ if.end:                                           ; preds = %if.then, %entry
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group.ll
index cf90d52b80d59..6360d5b5313bb 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group.ll
@@ -24,7 +24,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: convergent nounwind
 define spir_kernel void @mask(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %call.tr = trunc i64 %call to i32
   %conv = shl i32 %call.tr, 1
   %idx.ext = sext i32 %conv to i64
@@ -53,7 +53,7 @@ if.end:                                           ; preds = %if.else, %if.then
 }
 
 ; Function Attrs: convergent nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group2.ll
index 3ab352a4c4ba8..ce530c9ad73e5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group2.ll
@@ -24,7 +24,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: convergent nounwind
 define spir_kernel void @mask(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(1)* %doit) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %call.tr = trunc i64 %call to i32
   %conv = shl i32 %call.tr, 1
   %idx.ext = sext i32 %conv to i64
@@ -59,7 +59,7 @@ if.end:                                           ; preds = %if.else, %if.then,
 }
 
 ; Function Attrs: convergent nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masking_exit_blocks.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masking_exit_blocks.ll
index d24ea6fcfc417..5b462f57ad479 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masking_exit_blocks.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masking_exit_blocks.ll
@@ -23,7 +23,7 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @test(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %add = add i64 %call, 1
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %add
   %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
@@ -51,7 +51,7 @@ if.end1:                                          ; preds = %if.end
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 declare extern_weak spir_func i32 @printf(i8 addrspace(2)*, ...)
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride.ll
index ed24ec0fcf73c..07f9f01299ba7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride.ll
@@ -21,14 +21,14 @@ target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %0 = load i32, i32 addrspace(1)* %src, align 4
   %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %call
   store i32 %0, i32 addrspace(1)* %arrayidx1, align 4
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; CHECK: spir_kernel void @test
 ; CHECK: store <
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride10.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride10.ll
index 766a514a7bd8a..4e87a4b86ed52 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride10.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride10.ll
@@ -21,7 +21,7 @@ target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %conv = trunc i64 %call to i32
   %0 = load i32, i32 addrspace(1)* %src, align 4
   %add = add nsw i32 %conv, %n
@@ -32,7 +32,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; CHECK: spir_kernel void @test
 ; CHECK: _scatter_
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride11.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride11.ll
index b09cc25da4d88..37804e47e003c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride11.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride11.ll
@@ -21,7 +21,7 @@ target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %conv = trunc i64 %call to i32
   %0 = load i32, i32 addrspace(1)* %src, align 4
   %add = add nsw i32 %conv, %n
@@ -32,7 +32,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; CHECK: spir_kernel void @test
 ; CHECK: _interleaved_
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride12.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride12.ll
index b2fa2956e4636..d3caf8e11a9af 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride12.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride12.ll
@@ -21,7 +21,7 @@ target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %conv = trunc i64 %call to i32
   %0 = load i32, i32 addrspace(1)* %src, align 4
   %add = add nsw i32 %conv, %n
@@ -32,7 +32,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; CHECK: spir_kernel void @test
 ; CHECK: _interleaved_
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride13.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride13.ll
index 766a514a7bd8a..4e87a4b86ed52 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride13.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride13.ll
@@ -21,7 +21,7 @@ target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %conv = trunc i64 %call to i32
   %0 = load i32, i32 addrspace(1)* %src, align 4
   %add = add nsw i32 %conv, %n
@@ -32,7 +32,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; CHECK: spir_kernel void @test
 ; CHECK: _scatter_
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride14.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride14.ll
index 02365aaaa5f0f..d74a7821cee0c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride14.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride14.ll
@@ -21,7 +21,7 @@ target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %0 = load i32, i32 addrspace(1)* %src, align 4
   %mul = mul nuw nsw i64 %call, 18
   %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %mul
@@ -29,7 +29,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; CHECK: spir_kernel void @test
 ; CHECK: _interleaved_
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride15.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride15.ll
index 750a7744df7cc..efc2d1b86ada0 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride15.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride15.ll
@@ -21,7 +21,7 @@ target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %conv = trunc i64 %call to i32
   %0 = load i32, i32 addrspace(1)* %src, align 4
   %add = shl i32 %n, 1
@@ -32,7 +32,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; CHECK: spir_kernel void @test
 ; CHECK: _interleaved_
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride16.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride16.ll
index 99aab80558349..89ddca506ba2c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride16.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride16.ll
@@ -21,7 +21,7 @@ target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %0 = load i32, i32 addrspace(1)* %src, align 4
   %add = shl nuw nsw i64 %call, 1
   %mul = mul nuw nsw i64 %add, %call
@@ -30,7 +30,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; CHECK: spir_kernel void @test
 ; CHECK: _scatter_
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride17.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride17.ll
index c3256fa47511b..84d223297a7f1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride17.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride17.ll
@@ -21,9 +21,9 @@ target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %conv = trunc i64 %call to i32
-  %call1 = tail call spir_func i64 @_Z13get_global_idj(i32 1)
+  %call1 = tail call i64 @__mux_get_global_id(i32 1)
   %conv2 = trunc i64 %call1 to i32
   %0 = load i32, i32 addrspace(1)* %src, align 4
   %mul = mul nsw i32 %conv2, %n
@@ -34,7 +34,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; CHECK: spir_kernel void @test
 ; CHECK: store <
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride18.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride18.ll
index 03a29cf5341d1..8876efb92b760 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride18.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride18.ll
@@ -21,7 +21,7 @@ target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 addrspace(1)* readnone %r) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %conv = add nuw nsw i64 %call, 255
   %idxprom = and i64 %conv, 255
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %src, i64 %idxprom
@@ -31,7 +31,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; CHECK: spir_kernel void @test
 ; CHECK: _gather_
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride2.ll
index e88630c2653c9..5b155cea4399b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride2.ll
@@ -21,7 +21,7 @@ target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %0 = load i32, i32 addrspace(1)* %src, align 4
   %add = add nuw nsw i64 %call, 9
   %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %add
@@ -29,7 +29,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; CHECK: spir_kernel void @test
 ; CHECK: store <
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride3.ll
index e0bd5d66b3db6..8b4de50d0fe52 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride3.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride3.ll
@@ -21,7 +21,7 @@ target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %0 = load i32, i32 addrspace(1)* %src, align 4
   %mul = mul nuw nsw i64 %call, 9
   %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %mul
@@ -29,7 +29,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; CHECK: spir_kernel void @test
 ; CHECK: _interleaved_
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride4.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride4.ll
index a75d1ba3b58d9..5deaed20f4b2e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride4.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride4.ll
@@ -21,7 +21,7 @@ target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %conv = trunc i64 %call to i32
   %0 = load i32, i32 addrspace(1)* %src, align 4
   %add = add nsw i32 %conv, %n
@@ -31,7 +31,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; CHECK: spir_kernel void @test
 ; CHECK: store <
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride5.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride5.ll
index f52a188f3d48e..80a27d7a77a19 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride5.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride5.ll
@@ -21,7 +21,7 @@ target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %0 = load i32, i32 addrspace(1)* %src, align 4
   %mul = mul nuw nsw i64 %call, 5
   %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %mul
@@ -29,7 +29,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; CHECK: spir_kernel void @test
 ; CHECK: _interleaved_
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride6.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride6.ll
index 2a41511698681..6f91e244d1c0c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride6.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride6.ll
@@ -21,7 +21,7 @@ target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %0 = load i32, i32 addrspace(1)* %src, align 4
   %add = shl nuw nsw i64 %call, 1
   %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %add
@@ -29,7 +29,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; CHECK: spir_kernel void @test
 ; CHECK: _interleaved_
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride7.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride7.ll
index 7ed4fced4a3f6..3d58276697b1b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride7.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride7.ll
@@ -21,7 +21,7 @@ target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %0 = load i32, i32 addrspace(1)* %src, align 4
   %mul = mul nuw nsw i64 %call, %call
   %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %mul
@@ -29,7 +29,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; CHECK: spir_kernel void @test
 ; CHECK: _scatter_
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride8.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride8.ll
index 4f5dd3091e553..cb833c98b3bc8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride8.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride8.ll
@@ -21,7 +21,7 @@ target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %0 = load i32, i32 addrspace(1)* %src, align 4
   %1 = mul nuw nsw i64 %call, 9
   %mul = add nuw nsw i64 %1, 81
@@ -30,7 +30,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; CHECK: spir_kernel void @test
 ; CHECK: _interleaved_
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride9.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride9.ll
index edc9780cdc00d..c3e4fa3945a59 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride9.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride9.ll
@@ -21,7 +21,7 @@ target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %conv = trunc i64 %call to i32
   %0 = load i32, i32 addrspace(1)* %src, align 4
   %add = add nuw nsw i32 %conv, 9
@@ -32,7 +32,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; CHECK: spir_kernel void @test
 ; CHECK: _interleaved_
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_exit_blocks.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_exit_blocks.ll
index 952192641fa2b..1f2db39dc3510 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_exit_blocks.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_exit_blocks.ll
@@ -22,13 +22,13 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:1:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
 ; Function Attrs: nounwind readnone
-declare spir_func i64 @_Z12get_local_idj(i32)
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_local_id(i32)
+declare i64 @__mux_get_global_id(i32)
 
 define spir_kernel void @multiple_exit_blocks(i64 %n) {
 entry:
-  %gid = tail call spir_func i64 @_Z13get_global_idj(i32 0)
-  %lid = tail call spir_func i64 @_Z12get_local_idj(i32 0)
+  %gid = tail call i64 @__mux_get_global_id(i32 0)
+  %lid = tail call i64 @__mux_get_local_id(i32 0)
   %cmp1 = icmp slt i64 %lid, %n
   %cmp2 = icmp slt i64 %gid, %n
   br i1 %cmp2, label %if.then, label %if.end
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_kernels_inlining.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_kernels_inlining.ll
index f1eef3daa2571..37b77bf8d00f5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_kernels_inlining.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_kernels_inlining.ll
@@ -21,7 +21,7 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @foo1(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
   %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
   %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
@@ -29,7 +29,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 define spir_kernel void @foo2(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
@@ -45,7 +45,7 @@ entry:
 
 ; CHECK: define spir_kernel void @__vecz_v4_foo3(ptr addrspace(1) %in, ptr addrspace(1) %out)
 ; CHECK-NOT: call spir_kernel
-; CHECK: call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: call i64 @__mux_get_global_id(i32 0)
 ; CHECK: load <4 x i32>, ptr addrspace(1) %{{.+}}, align 4
 ; CHECK: store <4 x i32> %{{.+}}, ptr addrspace(1) %{{.+}}, align 4
 ; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations.ll
index 67ed397dc63c4..ca48f1362b82c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations.ll
@@ -55,7 +55,7 @@ entry:
   store i32 addrspace(1)* %out, i32 addrspace(1)** %out.addr, align 8
   call void @llvm.dbg.declare(metadata i32 addrspace(1)** %out.addr, metadata !13, metadata !29), !dbg !30
   call void @llvm.dbg.declare(metadata i64* %tid, metadata !14, metadata !29), !dbg !31
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #3, !dbg !31
+  %call = call i64 @__mux_get_global_id(i32 0) #3, !dbg !31
   store i64 %call, i64* %tid, align 8, !dbg !31
   call void @llvm.dbg.declare(metadata i32* %a, metadata !19, metadata !29), !dbg !32
   %0 = load i64, i64* %tid, align 8, !dbg !32
@@ -82,7 +82,7 @@ entry:
 ; Function Attrs: nounwind readnone
 declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
-declare spir_func i64 @_Z13get_global_idj(i32) #2
+declare i64 @__mux_get_global_id(i32) #2
 
 attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations_nested.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations_nested.ll
index ae7019f31e923..e735d164bdbeb 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations_nested.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations_nested.ll
@@ -25,7 +25,7 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @add(i32 addrspace(1)* %in1, i32 addrspace(1)* %in2, i32 addrspace(1)* %out) {
 entry:
-  %tid = call spir_func i64 @_Z13get_global_idj(i32 0) #3
+  %tid = call i64 @__mux_get_global_id(i32 0) #3
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid
   %i1 = load i32, i32 addrspace(1)* %arrayidx, align 16
   %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %in2, i64 %tid
@@ -36,7 +36,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32) #2
+declare i64 @__mux_get_global_id(i32) #2
 
 ; CHECK: define spir_kernel void @add(ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %out){{.*}} !codeplay_ca_vecz.base ![[BASE_1:[0-9]+]]
 ; CHECK: define spir_kernel void @__vecz_v2_add(ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %out){{.*}} !codeplay_ca_vecz.base ![[BASE_2:[0-9]+]] !codeplay_ca_vecz.derived ![[DERIVED_1:[0-9]+]] {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations_vp.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations_vp.ll
index 0d92b325c4594..341beb52832af 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations_vp.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations_vp.ll
@@ -18,12 +18,12 @@
 ; equal width but with one enabling vector predication.
 ; RUN: veczc -k add:1s,1sp -S < %s | FileCheck %s
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; CHECK: define spir_kernel void @add(
 define spir_kernel void @add(ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %out) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidx.in1 = getelementptr inbounds i32, ptr addrspace(1) %in1, i64 %idx
   %arrayidx.in2 = getelementptr inbounds i32, ptr addrspace(1) %in1, i64 %idx
   %in1.v = load i32, ptr addrspace(1) %arrayidx.in1, align 4
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_instantiate_memop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_instantiate_memop.ll
index a1f075d4913ed..98206b26898d9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_instantiate_memop.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_instantiate_memop.ll
@@ -23,7 +23,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @priv(i32 addrspace(3)* %a) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   br label %for.cond
 
@@ -43,7 +43,7 @@ for.end:                                          ; preds = %for.cond
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_over_scalarization.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_over_scalarization.ll
index 33d160b80140e..172ba57d8d040 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_over_scalarization.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_over_scalarization.ll
@@ -23,7 +23,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @memop_loop_dep(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i32 %i, i32 %e) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   br label %for.cond
 
 for.cond:                                         ; preds = %for.inc, %entry
@@ -52,7 +52,7 @@ for.end:                                          ; preds = %for.cond
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 declare spir_func <4 x i32> @_Z6vload4mPKU3AS1i(i64, i32 addrspace(1)*)
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_redundant_bitcasts.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_redundant_bitcasts.ll
index 03959379ef109..eead4848591b3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_redundant_bitcasts.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_redundant_bitcasts.ll
@@ -20,11 +20,11 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-s128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 define spir_kernel void @memop_loop_dep(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i32 %i, i32 %e) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %cmp1 = icmp slt i32 %i, %e
   br i1 %cmp1, label %for.body.lr.ph, label %for.end
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_vecz1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_vecz1.ll
index 1add6d1c9dbec..082e4f15f7165 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_vecz1.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_vecz1.ll
@@ -21,7 +21,7 @@ target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @no_vecz1(i32 addrspace(1)* %out, i32 %n) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %cmp = icmp eq i64 %call, 0
   br i1 %cmp, label %for.cond.preheader, label %if.end
 
@@ -35,7 +35,7 @@ if.end:                                           ; preds = %for.cond.preheader,
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; CHECK-NOT: insertelement
 ; CHECK-NOT: shufflevector
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_vecz2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_vecz2.ll
index 77cb58fb597e0..248c2e879b9e1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_vecz2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_vecz2.ll
@@ -22,7 +22,7 @@ target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 define spir_kernel void @no_vecz2(i32 addrspace(1)* %out, i32 %n, i32 addrspace(1)* %m) {
 entry:
   %0 = load i32, i32 addrspace(1)* %m, align 4
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %cmp = icmp eq i64 %call, 0
   br i1 %cmp, label %for.cond.preheader, label %if.end
 
@@ -49,7 +49,7 @@ if.end:                                           ; preds = %for.cond.cleanup28,
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; CHECK: spir_kernel void @{{(__vecz_v16_)?}}no_vecz2
 ; CHECK-NOT: extractelement
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/offset_info_analysis.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/offset_info_analysis.ll
index 8ade7c8ae0038..3128c97e23b68 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/offset_info_analysis.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/offset_info_analysis.ll
@@ -23,9 +23,9 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: convergent nounwind
 define spir_kernel void @offset_info_analysis(i8 addrspace(1)* noalias %in, i8 addrspace(1)* noalias %out, i32 %width) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
-  %call1 = call spir_func i64 @_Z13get_global_idj(i32 1) #2
+  %call1 = call i64 @__mux_get_global_id(i32 1) #2
   %conv2 = trunc i64 %call1 to i32
   %mul = mul nsw i32 %conv2, %width
   %0 = xor i32 %width, -1
@@ -42,7 +42,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; This test checks that a 'xor' as a binop operand does correctly get analyzed.
 ; and masked properly
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfiniteDv4_d.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfiniteDv4_d.ll
index 045285825a237..afc72a8fd7df9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfiniteDv4_d.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfiniteDv4_d.ll
@@ -19,12 +19,12 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 declare spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double>)
 
 define spir_kernel void @test_isfiniteDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
   %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
   %call1 = call spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double> %0)
@@ -34,7 +34,7 @@ entry:
 }
 
 ; CHECK: define spir_kernel void @__vecz_v4_test_isfiniteDv4_d
-; CHECK: call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: call i64 @__mux_get_global_id(i32 0)
 ; CHECK: and <4 x i64>
 ; CHECK: and <4 x i64>
 ; CHECK: and <4 x i64>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfiniteDv4_f.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfiniteDv4_f.ll
index 7031037728f09..e4cdc728b8ad8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfiniteDv4_f.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfiniteDv4_f.ll
@@ -19,12 +19,12 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 declare spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float>)
 
 define spir_kernel void @test_isfiniteDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
   %call1 = call spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float> %0)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfinited.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfinited.ll
index 1722c0c9b5337..2a2a81848fe54 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfinited.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfinited.ll
@@ -19,7 +19,7 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 declare spir_func i32 @_Z5isinfd(double)
 declare spir_func i32 @_Z5isinff(float)
 declare spir_func i32 @_Z5isnand(double)
@@ -43,7 +43,7 @@ declare spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double>)
 
 define spir_kernel void @test_isfinitef(float addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
   %call1 = call spir_func i32 @_Z8isfinitef(float %0)
@@ -54,7 +54,7 @@ entry:
 
 define spir_kernel void @test_isfinited(double addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
   %0 = load double, double addrspace(1)* %arrayidx, align 8
   %call1 = call spir_func i32 @_Z8isfinited(double %0)
@@ -65,7 +65,7 @@ entry:
 
 define spir_kernel void @test_isfiniteDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
   %call1 = call spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float> %0)
@@ -76,7 +76,7 @@ entry:
 
 define spir_kernel void @test_isfiniteDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
   %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
   %call1 = call spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double> %0)
@@ -87,7 +87,7 @@ entry:
 
 define spir_kernel void @test_isinff(float addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
   %call1 = call spir_func i32 @_Z5isinff(float %0)
@@ -98,7 +98,7 @@ entry:
 
 define spir_kernel void @test_isinfd(double addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
   %0 = load double, double addrspace(1)* %arrayidx, align 8
   %call1 = call spir_func i32 @_Z5isinfd(double %0)
@@ -109,7 +109,7 @@ entry:
 
 define spir_kernel void @test_isinfDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
   %call1 = call spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float> %0)
@@ -120,7 +120,7 @@ entry:
 
 define spir_kernel void @test_isinfDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
   %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
   %call1 = call spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double> %0)
@@ -131,7 +131,7 @@ entry:
 
 define spir_kernel void @test_isnormalf(float addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
   %call1 = call spir_func i32 @_Z8isnormalf(float %0)
@@ -142,7 +142,7 @@ entry:
 
 define spir_kernel void @test_isnormald(double addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
   %0 = load double, double addrspace(1)* %arrayidx, align 8
   %call1 = call spir_func i32 @_Z8isnormald(double %0)
@@ -153,7 +153,7 @@ entry:
 
 define spir_kernel void @test_isnormalDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
   %call1 = call spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float> %0)
@@ -164,7 +164,7 @@ entry:
 
 define spir_kernel void @test_isnormalDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
   %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
   %call1 = call spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double> %0)
@@ -175,7 +175,7 @@ entry:
 
 define spir_kernel void @test_isnanf(float addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
   %call1 = call spir_func i32 @_Z5isnanf(float %0)
@@ -186,7 +186,7 @@ entry:
 
 define spir_kernel void @test_isnand(double addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
   %0 = load double, double addrspace(1)* %arrayidx, align 8
   %call1 = call spir_func i32 @_Z5isnand(double %0)
@@ -197,7 +197,7 @@ entry:
 
 define spir_kernel void @test_isnanDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
   %call1 = call spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float> %0)
@@ -208,7 +208,7 @@ entry:
 
 define spir_kernel void @test_isnanDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
   %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
   %call1 = call spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double> %0)
@@ -219,7 +219,7 @@ entry:
 
 define spir_kernel void @test_signbitf(float addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
   %call1 = call spir_func i32 @_Z7signbitf(float %0)
@@ -230,7 +230,7 @@ entry:
 
 define spir_kernel void @test_signbitd(double addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
   %0 = load double, double addrspace(1)* %arrayidx, align 8
   %call1 = call spir_func i32 @_Z7signbitd(double %0)
@@ -241,7 +241,7 @@ entry:
 
 define spir_kernel void @test_signbitDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
   %call1 = call spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float> %0)
@@ -252,7 +252,7 @@ entry:
 
 define spir_kernel void @test_signbitDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
   %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
   %call1 = call spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double> %0)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfinitef.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfinitef.ll
index 7d12cd2dedc88..951af5b9babd7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfinitef.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfinitef.ll
@@ -19,7 +19,7 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 declare spir_func i32 @_Z5isinfd(double)
 declare spir_func i32 @_Z5isinff(float)
 declare spir_func i32 @_Z5isnand(double)
@@ -43,7 +43,7 @@ declare spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double>)
 
 define spir_kernel void @test_isfinitef(float addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
   %call1 = call spir_func i32 @_Z8isfinitef(float %0)
@@ -54,7 +54,7 @@ entry:
 
 define spir_kernel void @test_isfinited(double addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
   %0 = load double, double addrspace(1)* %arrayidx, align 8
   %call1 = call spir_func i32 @_Z8isfinited(double %0)
@@ -65,7 +65,7 @@ entry:
 
 define spir_kernel void @test_isfiniteDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
   %call1 = call spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float> %0)
@@ -76,7 +76,7 @@ entry:
 
 define spir_kernel void @test_isfiniteDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
   %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
   %call1 = call spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double> %0)
@@ -87,7 +87,7 @@ entry:
 
 define spir_kernel void @test_isinff(float addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
   %call1 = call spir_func i32 @_Z5isinff(float %0)
@@ -98,7 +98,7 @@ entry:
 
 define spir_kernel void @test_isinfd(double addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
   %0 = load double, double addrspace(1)* %arrayidx, align 8
   %call1 = call spir_func i32 @_Z5isinfd(double %0)
@@ -109,7 +109,7 @@ entry:
 
 define spir_kernel void @test_isinfDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
   %call1 = call spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float> %0)
@@ -120,7 +120,7 @@ entry:
 
 define spir_kernel void @test_isinfDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
   %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
   %call1 = call spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double> %0)
@@ -131,7 +131,7 @@ entry:
 
 define spir_kernel void @test_isnormalf(float addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
   %call1 = call spir_func i32 @_Z8isnormalf(float %0)
@@ -142,7 +142,7 @@ entry:
 
 define spir_kernel void @test_isnormald(double addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
   %0 = load double, double addrspace(1)* %arrayidx, align 8
   %call1 = call spir_func i32 @_Z8isnormald(double %0)
@@ -153,7 +153,7 @@ entry:
 
 define spir_kernel void @test_isnormalDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
   %call1 = call spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float> %0)
@@ -164,7 +164,7 @@ entry:
 
 define spir_kernel void @test_isnormalDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
   %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
   %call1 = call spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double> %0)
@@ -175,7 +175,7 @@ entry:
 
 define spir_kernel void @test_isnanf(float addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
   %call1 = call spir_func i32 @_Z5isnanf(float %0)
@@ -186,7 +186,7 @@ entry:
 
 define spir_kernel void @test_isnand(double addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
   %0 = load double, double addrspace(1)* %arrayidx, align 8
   %call1 = call spir_func i32 @_Z5isnand(double %0)
@@ -197,7 +197,7 @@ entry:
 
 define spir_kernel void @test_isnanDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
   %call1 = call spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float> %0)
@@ -208,7 +208,7 @@ entry:
 
 define spir_kernel void @test_isnanDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
   %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
   %call1 = call spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double> %0)
@@ -219,7 +219,7 @@ entry:
 
 define spir_kernel void @test_signbitf(float addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
   %call1 = call spir_func i32 @_Z7signbitf(float %0)
@@ -230,7 +230,7 @@ entry:
 
 define spir_kernel void @test_signbitd(double addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
   %0 = load double, double addrspace(1)* %arrayidx, align 8
   %call1 = call spir_func i32 @_Z7signbitd(double %0)
@@ -241,7 +241,7 @@ entry:
 
 define spir_kernel void @test_signbitDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
   %call1 = call spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float> %0)
@@ -252,7 +252,7 @@ entry:
 
 define spir_kernel void @test_signbitDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
   %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
   %call1 = call spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double> %0)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfDv4_d.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfDv4_d.ll
index 22d8656b20f79..9081069d4d41a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfDv4_d.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfDv4_d.ll
@@ -19,12 +19,12 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 declare spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double>)
 
 define spir_kernel void @test_isinfDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
   %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
   %call1 = call spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double> %0)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfDv4_f.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfDv4_f.ll
index 41b3a682d3ac4..5bff531a0c4df 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfDv4_f.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfDv4_f.ll
@@ -19,12 +19,12 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 declare spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float>)
 
 define spir_kernel void @test_isinfDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
   %call1 = call spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float> %0)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfd.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfd.ll
index 241645030342c..3468b00dfc5e9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfd.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfd.ll
@@ -19,7 +19,7 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 declare spir_func i32 @_Z5isinfd(double)
 declare spir_func i32 @_Z5isinff(float)
 declare spir_func i32 @_Z5isnand(double)
@@ -43,7 +43,7 @@ declare spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double>)
 
 define spir_kernel void @test_isfinitef(float addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
   %call1 = call spir_func i32 @_Z8isfinitef(float %0)
@@ -54,7 +54,7 @@ entry:
 
 define spir_kernel void @test_isfinited(double addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
   %0 = load double, double addrspace(1)* %arrayidx, align 8
   %call1 = call spir_func i32 @_Z8isfinited(double %0)
@@ -65,7 +65,7 @@ entry:
 
 define spir_kernel void @test_isfiniteDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
   %call1 = call spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float> %0)
@@ -76,7 +76,7 @@ entry:
 
 define spir_kernel void @test_isfiniteDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
   %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
   %call1 = call spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double> %0)
@@ -87,7 +87,7 @@ entry:
 
 define spir_kernel void @test_isinff(float addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
   %call1 = call spir_func i32 @_Z5isinff(float %0)
@@ -98,7 +98,7 @@ entry:
 
 define spir_kernel void @test_isinfd(double addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
   %0 = load double, double addrspace(1)* %arrayidx, align 8
   %call1 = call spir_func i32 @_Z5isinfd(double %0)
@@ -109,7 +109,7 @@ entry:
 
 define spir_kernel void @test_isinfDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
   %call1 = call spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float> %0)
@@ -120,7 +120,7 @@ entry:
 
 define spir_kernel void @test_isinfDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
   %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
   %call1 = call spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double> %0)
@@ -131,7 +131,7 @@ entry:
 
 define spir_kernel void @test_isnormalf(float addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
   %call1 = call spir_func i32 @_Z8isnormalf(float %0)
@@ -142,7 +142,7 @@ entry:
 
 define spir_kernel void @test_isnormald(double addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
   %0 = load double, double addrspace(1)* %arrayidx, align 8
   %call1 = call spir_func i32 @_Z8isnormald(double %0)
@@ -153,7 +153,7 @@ entry:
 
 define spir_kernel void @test_isnormalDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
   %call1 = call spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float> %0)
@@ -164,7 +164,7 @@ entry:
 
 define spir_kernel void @test_isnormalDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
   %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
   %call1 = call spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double> %0)
@@ -175,7 +175,7 @@ entry:
 
 define spir_kernel void @test_isnanf(float addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
   %call1 = call spir_func i32 @_Z5isnanf(float %0)
@@ -186,7 +186,7 @@ entry:
 
 define spir_kernel void @test_isnand(double addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
   %0 = load double, double addrspace(1)* %arrayidx, align 8
   %call1 = call spir_func i32 @_Z5isnand(double %0)
@@ -197,7 +197,7 @@ entry:
 
 define spir_kernel void @test_isnanDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
   %call1 = call spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float> %0)
@@ -208,7 +208,7 @@ entry:
 
 define spir_kernel void @test_isnanDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
   %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
   %call1 = call spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double> %0)
@@ -219,7 +219,7 @@ entry:
 
 define spir_kernel void @test_signbitf(float addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
   %call1 = call spir_func i32 @_Z7signbitf(float %0)
@@ -230,7 +230,7 @@ entry:
 
 define spir_kernel void @test_signbitd(double addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
   %0 = load double, double addrspace(1)* %arrayidx, align 8
   %call1 = call spir_func i32 @_Z7signbitd(double %0)
@@ -241,7 +241,7 @@ entry:
 
 define spir_kernel void @test_signbitDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
   %call1 = call spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float> %0)
@@ -252,7 +252,7 @@ entry:
 
 define spir_kernel void @test_signbitDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
   %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
   %call1 = call spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double> %0)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinff.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinff.ll
index 0b014e6271f31..b522285e862c1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinff.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinff.ll
@@ -19,7 +19,7 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 declare spir_func i32 @_Z5isinfd(double)
 declare spir_func i32 @_Z5isinff(float)
 declare spir_func i32 @_Z5isnand(double)
@@ -43,7 +43,7 @@ declare spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double>)
 
 define spir_kernel void @test_isfinitef(float addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
   %call1 = call spir_func i32 @_Z8isfinitef(float %0)
@@ -54,7 +54,7 @@ entry:
 
 define spir_kernel void @test_isfinited(double addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
   %0 = load double, double addrspace(1)* %arrayidx, align 8
   %call1 = call spir_func i32 @_Z8isfinited(double %0)
@@ -65,7 +65,7 @@ entry:
 
 define spir_kernel void @test_isfiniteDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
   %call1 = call spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float> %0)
@@ -76,7 +76,7 @@ entry:
 
 define spir_kernel void @test_isfiniteDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
   %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
   %call1 = call spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double> %0)
@@ -87,7 +87,7 @@ entry:
 
 define spir_kernel void @test_isinff(float addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
   %call1 = call spir_func i32 @_Z5isinff(float %0)
@@ -98,7 +98,7 @@ entry:
 
 define spir_kernel void @test_isinfd(double addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
   %0 = load double, double addrspace(1)* %arrayidx, align 8
   %call1 = call spir_func i32 @_Z5isinfd(double %0)
@@ -109,7 +109,7 @@ entry:
 
 define spir_kernel void @test_isinfDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
   %call1 = call spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float> %0)
@@ -120,7 +120,7 @@ entry:
 
 define spir_kernel void @test_isinfDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
   %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
   %call1 = call spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double> %0)
@@ -131,7 +131,7 @@ entry:
 
 define spir_kernel void @test_isnormalf(float addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
   %call1 = call spir_func i32 @_Z8isnormalf(float %0)
@@ -142,7 +142,7 @@ entry:
 
 define spir_kernel void @test_isnormald(double addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
   %0 = load double, double addrspace(1)* %arrayidx, align 8
   %call1 = call spir_func i32 @_Z8isnormald(double %0)
@@ -153,7 +153,7 @@ entry:
 
 define spir_kernel void @test_isnormalDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
   %call1 = call spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float> %0)
@@ -164,7 +164,7 @@ entry:
 
 define spir_kernel void @test_isnormalDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
   %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
   %call1 = call spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double> %0)
@@ -175,7 +175,7 @@ entry:
 
 define spir_kernel void @test_isnanf(float addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
   %call1 = call spir_func i32 @_Z5isnanf(float %0)
@@ -186,7 +186,7 @@ entry:
 
 define spir_kernel void @test_isnand(double addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
   %0 = load double, double addrspace(1)* %arrayidx, align 8
   %call1 = call spir_func i32 @_Z5isnand(double %0)
@@ -197,7 +197,7 @@ entry:
 
 define spir_kernel void @test_isnanDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
   %call1 = call spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float> %0)
@@ -208,7 +208,7 @@ entry:
 
 define spir_kernel void @test_isnanDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
   %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
   %call1 = call spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double> %0)
@@ -219,7 +219,7 @@ entry:
 
 define spir_kernel void @test_signbitf(float addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
   %call1 = call spir_func i32 @_Z7signbitf(float %0)
@@ -230,7 +230,7 @@ entry:
 
 define spir_kernel void @test_signbitd(double addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
   %0 = load double, double addrspace(1)* %arrayidx, align 8
   %call1 = call spir_func i32 @_Z7signbitd(double %0)
@@ -241,7 +241,7 @@ entry:
 
 define spir_kernel void @test_signbitDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
   %call1 = call spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float> %0)
@@ -252,7 +252,7 @@ entry:
 
 define spir_kernel void @test_signbitDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
   %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
   %call1 = call spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double> %0)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanDv4_d.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanDv4_d.ll
index 66e9cfc2ccc2f..065a63c9161af 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanDv4_d.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanDv4_d.ll
@@ -19,12 +19,12 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 declare spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double>)
 
 define spir_kernel void @test_isnanDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
   %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
   %call1 = call spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double> %0)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanDv4_f.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanDv4_f.ll
index aad20e3bc2fae..e36282ee7bf4e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanDv4_f.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanDv4_f.ll
@@ -19,12 +19,12 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 declare spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float>)
 
 define spir_kernel void @test_isnanDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
   %call1 = call spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float> %0)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnand.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnand.ll
index 8a51ce4fe8cca..efcc8fd8ddff8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnand.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnand.ll
@@ -19,7 +19,7 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 declare spir_func i32 @_Z5isinfd(double)
 declare spir_func i32 @_Z5isinff(float)
 declare spir_func i32 @_Z5isnand(double)
@@ -43,7 +43,7 @@ declare spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double>)
 
 define spir_kernel void @test_isfinitef(float addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
   %call1 = call spir_func i32 @_Z8isfinitef(float %0)
@@ -54,7 +54,7 @@ entry:
 
 define spir_kernel void @test_isfinited(double addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
   %0 = load double, double addrspace(1)* %arrayidx, align 8
   %call1 = call spir_func i32 @_Z8isfinited(double %0)
@@ -65,7 +65,7 @@ entry:
 
 define spir_kernel void @test_isfiniteDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
   %call1 = call spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float> %0)
@@ -76,7 +76,7 @@ entry:
 
 define spir_kernel void @test_isfiniteDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
   %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
   %call1 = call spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double> %0)
@@ -87,7 +87,7 @@ entry:
 
 define spir_kernel void @test_isinff(float addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
   %call1 = call spir_func i32 @_Z5isinff(float %0)
@@ -98,7 +98,7 @@ entry:
 
 define spir_kernel void @test_isinfd(double addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
   %0 = load double, double addrspace(1)* %arrayidx, align 8
   %call1 = call spir_func i32 @_Z5isinfd(double %0)
@@ -109,7 +109,7 @@ entry:
 
 define spir_kernel void @test_isinfDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
   %call1 = call spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float> %0)
@@ -120,7 +120,7 @@ entry:
 
 define spir_kernel void @test_isinfDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
   %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
   %call1 = call spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double> %0)
@@ -131,7 +131,7 @@ entry:
 
 define spir_kernel void @test_isnormalf(float addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
   %call1 = call spir_func i32 @_Z8isnormalf(float %0)
@@ -142,7 +142,7 @@ entry:
 
 define spir_kernel void @test_isnormald(double addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
   %0 = load double, double addrspace(1)* %arrayidx, align 8
   %call1 = call spir_func i32 @_Z8isnormald(double %0)
@@ -153,7 +153,7 @@ entry:
 
 define spir_kernel void @test_isnormalDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
   %call1 = call spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float> %0)
@@ -164,7 +164,7 @@ entry:
 
 define spir_kernel void @test_isnormalDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
   %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
   %call1 = call spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double> %0)
@@ -175,7 +175,7 @@ entry:
 
 define spir_kernel void @test_isnanf(float addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
   %call1 = call spir_func i32 @_Z5isnanf(float %0)
@@ -186,7 +186,7 @@ entry:
 
 define spir_kernel void @test_isnand(double addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
   %0 = load double, double addrspace(1)* %arrayidx, align 8
   %call1 = call spir_func i32 @_Z5isnand(double %0)
@@ -197,7 +197,7 @@ entry:
 
 define spir_kernel void @test_isnanDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
   %call1 = call spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float> %0)
@@ -208,7 +208,7 @@ entry:
 
 define spir_kernel void @test_isnanDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
   %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
   %call1 = call spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double> %0)
@@ -219,7 +219,7 @@ entry:
 
 define spir_kernel void @test_signbitf(float addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
   %call1 = call spir_func i32 @_Z7signbitf(float %0)
@@ -230,7 +230,7 @@ entry:
 
 define spir_kernel void @test_signbitd(double addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
   %0 = load double, double addrspace(1)* %arrayidx, align 8
   %call1 = call spir_func i32 @_Z7signbitd(double %0)
@@ -241,7 +241,7 @@ entry:
 
 define spir_kernel void @test_signbitDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
   %call1 = call spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float> %0)
@@ -252,7 +252,7 @@ entry:
 
 define spir_kernel void @test_signbitDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
   %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
   %call1 = call spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double> %0)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanf.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanf.ll
index ba42441427eea..51763b4ab8286 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanf.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanf.ll
@@ -19,7 +19,7 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 declare spir_func i32 @_Z5isinfd(double)
 declare spir_func i32 @_Z5isinff(float)
 declare spir_func i32 @_Z5isnand(double)
@@ -43,7 +43,7 @@ declare spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double>)
 
 define spir_kernel void @test_isfinitef(float addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
   %call1 = call spir_func i32 @_Z8isfinitef(float %0)
@@ -54,7 +54,7 @@ entry:
 
 define spir_kernel void @test_isfinited(double addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
   %0 = load double, double addrspace(1)* %arrayidx, align 8
   %call1 = call spir_func i32 @_Z8isfinited(double %0)
@@ -65,7 +65,7 @@ entry:
 
 define spir_kernel void @test_isfiniteDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
   %call1 = call spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float> %0)
@@ -76,7 +76,7 @@ entry:
 
 define spir_kernel void @test_isfiniteDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
   %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
   %call1 = call spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double> %0)
@@ -87,7 +87,7 @@ entry:
 
 define spir_kernel void @test_isinff(float addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
   %call1 = call spir_func i32 @_Z5isinff(float %0)
@@ -98,7 +98,7 @@ entry:
 
 define spir_kernel void @test_isinfd(double addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
   %0 = load double, double addrspace(1)* %arrayidx, align 8
   %call1 = call spir_func i32 @_Z5isinfd(double %0)
@@ -109,7 +109,7 @@ entry:
 
 define spir_kernel void @test_isinfDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
   %call1 = call spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float> %0)
@@ -120,7 +120,7 @@ entry:
 
 define spir_kernel void @test_isinfDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
   %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
   %call1 = call spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double> %0)
@@ -131,7 +131,7 @@ entry:
 
 define spir_kernel void @test_isnormalf(float addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
   %call1 = call spir_func i32 @_Z8isnormalf(float %0)
@@ -142,7 +142,7 @@ entry:
 
 define spir_kernel void @test_isnormald(double addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
   %0 = load double, double addrspace(1)* %arrayidx, align 8
   %call1 = call spir_func i32 @_Z8isnormald(double %0)
@@ -153,7 +153,7 @@ entry:
 
 define spir_kernel void @test_isnormalDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
   %call1 = call spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float> %0)
@@ -164,7 +164,7 @@ entry:
 
 define spir_kernel void @test_isnormalDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
   %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
   %call1 = call spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double> %0)
@@ -175,7 +175,7 @@ entry:
 
 define spir_kernel void @test_isnanf(float addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
   %call1 = call spir_func i32 @_Z5isnanf(float %0)
@@ -186,7 +186,7 @@ entry:
 
 define spir_kernel void @test_isnand(double addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
   %0 = load double, double addrspace(1)* %arrayidx, align 8
   %call1 = call spir_func i32 @_Z5isnand(double %0)
@@ -197,7 +197,7 @@ entry:
 
 define spir_kernel void @test_isnanDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
   %call1 = call spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float> %0)
@@ -208,7 +208,7 @@ entry:
 
 define spir_kernel void @test_isnanDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
   %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
   %call1 = call spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double> %0)
@@ -219,7 +219,7 @@ entry:
 
 define spir_kernel void @test_signbitf(float addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
   %call1 = call spir_func i32 @_Z7signbitf(float %0)
@@ -230,7 +230,7 @@ entry:
 
 define spir_kernel void @test_signbitd(double addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
   %0 = load double, double addrspace(1)* %arrayidx, align 8
   %call1 = call spir_func i32 @_Z7signbitd(double %0)
@@ -241,7 +241,7 @@ entry:
 
 define spir_kernel void @test_signbitDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
   %call1 = call spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float> %0)
@@ -252,7 +252,7 @@ entry:
 
 define spir_kernel void @test_signbitDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
   %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
   %call1 = call spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double> %0)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalDv4_d.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalDv4_d.ll
index efbf4c5d8bd00..59a470ef3233e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalDv4_d.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalDv4_d.ll
@@ -19,12 +19,12 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 declare spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double>)
 
 define spir_kernel void @test_isnormalDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
   %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
   %call1 = call spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double> %0)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalDv4_f.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalDv4_f.ll
index 516ee88c522c6..7928f829155c9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalDv4_f.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalDv4_f.ll
@@ -19,12 +19,12 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 declare spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float>)
 
 define spir_kernel void @test_isnormalDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
   %call1 = call spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float> %0)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormald.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormald.ll
index 0d3f4a0f3a7eb..98a7d4bede10d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormald.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormald.ll
@@ -19,7 +19,7 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 declare spir_func i32 @_Z5isinfd(double)
 declare spir_func i32 @_Z5isinff(float)
 declare spir_func i32 @_Z5isnand(double)
@@ -43,7 +43,7 @@ declare spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double>)
 
 define spir_kernel void @test_isfinitef(float addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
   %call1 = call spir_func i32 @_Z8isfinitef(float %0)
@@ -54,7 +54,7 @@ entry:
 
 define spir_kernel void @test_isfinited(double addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
   %0 = load double, double addrspace(1)* %arrayidx, align 8
   %call1 = call spir_func i32 @_Z8isfinited(double %0)
@@ -65,7 +65,7 @@ entry:
 
 define spir_kernel void @test_isfiniteDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
   %call1 = call spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float> %0)
@@ -76,7 +76,7 @@ entry:
 
 define spir_kernel void @test_isfiniteDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
   %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
   %call1 = call spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double> %0)
@@ -87,7 +87,7 @@ entry:
 
 define spir_kernel void @test_isinff(float addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
   %call1 = call spir_func i32 @_Z5isinff(float %0)
@@ -98,7 +98,7 @@ entry:
 
 define spir_kernel void @test_isinfd(double addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
   %0 = load double, double addrspace(1)* %arrayidx, align 8
   %call1 = call spir_func i32 @_Z5isinfd(double %0)
@@ -109,7 +109,7 @@ entry:
 
 define spir_kernel void @test_isinfDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
   %call1 = call spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float> %0)
@@ -120,7 +120,7 @@ entry:
 
 define spir_kernel void @test_isinfDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
   %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
   %call1 = call spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double> %0)
@@ -131,7 +131,7 @@ entry:
 
 define spir_kernel void @test_isnormalf(float addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
   %call1 = call spir_func i32 @_Z8isnormalf(float %0)
@@ -142,7 +142,7 @@ entry:
 
 define spir_kernel void @test_isnormald(double addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
   %0 = load double, double addrspace(1)* %arrayidx, align 8
   %call1 = call spir_func i32 @_Z8isnormald(double %0)
@@ -153,7 +153,7 @@ entry:
 
 define spir_kernel void @test_isnormalDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
   %call1 = call spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float> %0)
@@ -164,7 +164,7 @@ entry:
 
 define spir_kernel void @test_isnormalDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
   %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
   %call1 = call spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double> %0)
@@ -175,7 +175,7 @@ entry:
 
 define spir_kernel void @test_isnanf(float addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
   %call1 = call spir_func i32 @_Z5isnanf(float %0)
@@ -186,7 +186,7 @@ entry:
 
 define spir_kernel void @test_isnand(double addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
   %0 = load double, double addrspace(1)* %arrayidx, align 8
   %call1 = call spir_func i32 @_Z5isnand(double %0)
@@ -197,7 +197,7 @@ entry:
 
 define spir_kernel void @test_isnanDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
   %call1 = call spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float> %0)
@@ -208,7 +208,7 @@ entry:
 
 define spir_kernel void @test_isnanDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
   %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
   %call1 = call spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double> %0)
@@ -219,7 +219,7 @@ entry:
 
 define spir_kernel void @test_signbitf(float addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
   %call1 = call spir_func i32 @_Z7signbitf(float %0)
@@ -230,7 +230,7 @@ entry:
 
 define spir_kernel void @test_signbitd(double addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
   %0 = load double, double addrspace(1)* %arrayidx, align 8
   %call1 = call spir_func i32 @_Z7signbitd(double %0)
@@ -241,7 +241,7 @@ entry:
 
 define spir_kernel void @test_signbitDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
   %call1 = call spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float> %0)
@@ -252,7 +252,7 @@ entry:
 
 define spir_kernel void @test_signbitDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
   %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
   %call1 = call spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double> %0)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalf.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalf.ll
index f7f8f21a789dc..cd05f11142c55 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalf.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalf.ll
@@ -19,7 +19,7 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 declare spir_func i32 @_Z5isinfd(double)
 declare spir_func i32 @_Z5isinff(float)
 declare spir_func i32 @_Z5isnand(double)
@@ -43,7 +43,7 @@ declare spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double>)
 
 define spir_kernel void @test_isfinitef(float addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
   %call1 = call spir_func i32 @_Z8isfinitef(float %0)
@@ -54,7 +54,7 @@ entry:
 
 define spir_kernel void @test_isfinited(double addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
   %0 = load double, double addrspace(1)* %arrayidx, align 8
   %call1 = call spir_func i32 @_Z8isfinited(double %0)
@@ -65,7 +65,7 @@ entry:
 
 define spir_kernel void @test_isfiniteDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
   %call1 = call spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float> %0)
@@ -76,7 +76,7 @@ entry:
 
 define spir_kernel void @test_isfiniteDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
   %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
   %call1 = call spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double> %0)
@@ -87,7 +87,7 @@ entry:
 
 define spir_kernel void @test_isinff(float addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
   %call1 = call spir_func i32 @_Z5isinff(float %0)
@@ -98,7 +98,7 @@ entry:
 
 define spir_kernel void @test_isinfd(double addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
   %0 = load double, double addrspace(1)* %arrayidx, align 8
   %call1 = call spir_func i32 @_Z5isinfd(double %0)
@@ -109,7 +109,7 @@ entry:
 
 define spir_kernel void @test_isinfDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
   %call1 = call spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float> %0)
@@ -120,7 +120,7 @@ entry:
 
 define spir_kernel void @test_isinfDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
   %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
   %call1 = call spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double> %0)
@@ -131,7 +131,7 @@ entry:
 
 define spir_kernel void @test_isnormalf(float addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
   %call1 = call spir_func i32 @_Z8isnormalf(float %0)
@@ -142,7 +142,7 @@ entry:
 
 define spir_kernel void @test_isnormald(double addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
   %0 = load double, double addrspace(1)* %arrayidx, align 8
   %call1 = call spir_func i32 @_Z8isnormald(double %0)
@@ -153,7 +153,7 @@ entry:
 
 define spir_kernel void @test_isnormalDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
   %call1 = call spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float> %0)
@@ -164,7 +164,7 @@ entry:
 
 define spir_kernel void @test_isnormalDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
   %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
   %call1 = call spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double> %0)
@@ -175,7 +175,7 @@ entry:
 
 define spir_kernel void @test_isnanf(float addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
   %call1 = call spir_func i32 @_Z5isnanf(float %0)
@@ -186,7 +186,7 @@ entry:
 
 define spir_kernel void @test_isnand(double addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
   %0 = load double, double addrspace(1)* %arrayidx, align 8
   %call1 = call spir_func i32 @_Z5isnand(double %0)
@@ -197,7 +197,7 @@ entry:
 
 define spir_kernel void @test_isnanDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
   %call1 = call spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float> %0)
@@ -208,7 +208,7 @@ entry:
 
 define spir_kernel void @test_isnanDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
   %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
   %call1 = call spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double> %0)
@@ -219,7 +219,7 @@ entry:
 
 define spir_kernel void @test_signbitf(float addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
   %call1 = call spir_func i32 @_Z7signbitf(float %0)
@@ -230,7 +230,7 @@ entry:
 
 define spir_kernel void @test_signbitd(double addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call
   %0 = load double, double addrspace(1)* %arrayidx, align 8
   %call1 = call spir_func i32 @_Z7signbitd(double %0)
@@ -241,7 +241,7 @@ entry:
 
 define spir_kernel void @test_signbitDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
   %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
   %call1 = call spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float> %0)
@@ -252,7 +252,7 @@ entry:
 
 define spir_kernel void @test_signbitDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call
   %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
   %call1 = call spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double> %0)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/opencl_metadata1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/opencl_metadata1.ll
index 5447a00118fa0..2cec2611d77f7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/opencl_metadata1.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/opencl_metadata1.ll
@@ -21,7 +21,7 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @test(i32 addrspace(2)* %in, i32 addrspace(1)* %out, i8 addrspace(2)* %text, double %f) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32 addrspace(2)* %in, i64 %call
   %0 = load i32, i32 addrspace(2)* %arrayidx, align 4
   %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
@@ -31,11 +31,11 @@ entry:
 
 define spir_kernel void @second_test(i32 %a, i32 %b) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 !opencl.kernels = !{!0, !6}
 !opencl.kernel_wg_size_info = !{!12}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/opencl_metadata2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/opencl_metadata2.ll
index 1e80cac1e1b7f..22595b3dfabee 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/opencl_metadata2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/opencl_metadata2.ll
@@ -21,7 +21,7 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @test(i32 addrspace(2)* %in, i32 addrspace(1)* %out, i8 addrspace(2)* %text, double %f) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32 addrspace(2)* %in, i64 %call
   %0 = load i32, i32 addrspace(2)* %arrayidx, align 4
   %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
@@ -31,11 +31,11 @@ entry:
 
 define spir_kernel void @second_test(i32 %a, i32 %b) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 !opencl.kernels = !{!0, !6}
 !opencl.kernel_wg_size_info = !{!12}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/overaligned_allocas.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/overaligned_allocas.ll
index 069a82c6f7449..2e67d46209a81 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/overaligned_allocas.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/overaligned_allocas.ll
@@ -26,8 +26,8 @@ define spir_kernel void @test(<2 x float> addrspace(1)* nocapture readonly %in,
 entry:
   %a.sroa.0 = alloca <2 x float>, align 16
   %b.sroa.2 = alloca <2 x float>, align 16
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
-  %call1 = tail call spir_func i64 @_Z12get_local_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
+  %call1 = tail call i64 @__mux_get_local_id(i32 0)
   %a.sroa.0.0..sroa_cast = bitcast <2 x float>* %a.sroa.0 to i8*
   %b.sroa.2.0..sroa_cast = bitcast <2 x float>* %b.sroa.2 to i8*
   %arrayidx2 = getelementptr inbounds [16 x <2 x float>], [16 x <2 x float>] addrspace(3)* @entry_test_alloca.lm, i64 0, i64 %call1
@@ -64,8 +64,8 @@ for.body11:                                       ; preds = %for.body11, %for.bo
   br i1 %cmp8, label %for.body11, label %for.cond.cleanup10
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32) local_unnamed_addr
-declare spir_func i64 @_Z12get_local_idj(i32) local_unnamed_addr
+declare i64 @__mux_get_global_id(i32) local_unnamed_addr
+declare i64 @__mux_get_local_id(i32) local_unnamed_addr
 
 ; Check that all the allocas come before anything else
 ; CHECK: define spir_kernel void @__vecz_v4_test(
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_branch.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_branch.ll
index 543302a1688fc..ddbcd7e1b220b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_branch.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_branch.ll
@@ -22,7 +22,7 @@ target triple = "spir64-unknown-unknown"
 define spir_kernel void @test_branch(i32 %a, i32* %b) {
 entry:
   %conv = sext i32 %a to i64
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %cmp = icmp eq i64 %conv, %call
   br i1 %cmp, label %if.then, label %if.else
 
@@ -41,7 +41,7 @@ if.end:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; This test checks if the branch conditions and the branch BBs are vectorized
 ; and masked properly
@@ -49,7 +49,7 @@ declare spir_func i64 @_Z13get_global_idj(i32)
 ; CHECK: %conv = sext i32 %a to i64
 ; CHECK: %[[A_SPLATINSERT:.+]] = insertelement <4 x i64> {{poison|undef}}, i64 %conv, {{i32|i64}} 0
 ; CHECK: %[[A_SPLAT:.+]] = shufflevector <4 x i64> %[[A_SPLATINSERT]], <4 x i64> {{poison|undef}}, <4 x i32> zeroinitializer
-; CHECK: %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: %call = call i64 @__mux_get_global_id(i32 0)
 ; CHECK: %[[GID_SPLATINSERT:.+]] = insertelement <4 x i64> {{poison|undef}}, i64 %call, {{i32|i64}} 0
 ; CHECK: %[[GID_SPLAT:.+]] = shufflevector <4 x i64> %[[GID_SPLATINSERT:.+]], <4 x i64> {{poison|undef}}, <4 x i32> zeroinitializer
 ; CHECK: %[[GID:.+]] = add <4 x i64> %[[GID_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_debug_info.ll
index adf0731cc0392..e2b653d52525b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_debug_info.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_debug_info.ll
@@ -39,7 +39,7 @@ entry:
   store i32 addrspace(1)* %out, i32 addrspace(1)** %out.addr, align 8
   call void @llvm.dbg.declare(metadata i32 addrspace(1)** %out.addr, metadata !13, metadata !29), !dbg !30
   call void @llvm.dbg.declare(metadata i64* %tid, metadata !14, metadata !29), !dbg !31
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #3, !dbg !31
+  %call = call i64 @__mux_get_global_id(i32 0) #3, !dbg !31
   store i64 %call, i64* %tid, align 8, !dbg !31
   call void @llvm.dbg.declare(metadata i32* %a, metadata !19, metadata !29), !dbg !32
   %0 = load i64, i64* %tid, align 8, !dbg !32
@@ -66,7 +66,7 @@ entry:
 ; Function Attrs: nounwind readnone
 declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
-declare spir_func i64 @_Z13get_global_idj(i32) #2
+declare i64 @__mux_get_global_id(i32) #2
 
 attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_nonvarying.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_nonvarying.ll
index fc50f154c6a31..d4f9e932c60b9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_nonvarying.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_nonvarying.ll
@@ -22,7 +22,7 @@ target triple = "spir64-unknown-unknown"
 define spir_kernel void @test_branch(i32 %a, i32* %b) {
 entry:
   %conv = sext i32 %a to i64
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %cmp = icmp eq i64 %conv, %call
   br i1 %cmp, label %if.then, label %if.else
 
@@ -43,7 +43,7 @@ if.end:
 
 define spir_kernel void @test_uniform_branch(i32 %a, i32* %b) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %cmp = icmp eq i32 %a, 42
   br i1 %cmp, label %if.then, label %if.else
 
@@ -67,7 +67,7 @@ if.end:
 }
 
 define spir_func void @test_nonvarying_loadstore(i32* %a, i32* %b, i32* %c) {
-  %index = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %index = call i64 @__mux_get_global_id(i32 0)
   %a.i = getelementptr i32, i32* %a, i64 %index
   %b.i = getelementptr i32, i32* %b, i64 %index
   %c.i = getelementptr i32, i32* %c, i64 %index
@@ -78,11 +78,11 @@ define spir_func void @test_nonvarying_loadstore(i32* %a, i32* %b, i32* %c) {
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; This test checks if a simple kernel is vectorized without any masks
 ; CHECK: define spir_func void @__vecz_v4_test_nonvarying_loadstore(ptr %a, ptr %b, ptr %c)
-; CHECK: %index = call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: %index = call i64 @__mux_get_global_id(i32 0)
 ; CHECK: %a.i = getelementptr i32, ptr %a, i64 %index
 ; CHECK: %b.i = getelementptr i32, ptr %b, i64 %index
 ; CHECK: %c.i = getelementptr i32, ptr %c, i64 %index
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_uniform_branch.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_uniform_branch.ll
index fccbb32843090..0b448e8f8eba8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_uniform_branch.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_uniform_branch.ll
@@ -22,7 +22,7 @@ target triple = "spir64-unknown-unknown"
 define spir_kernel void @test_branch(i32 %a, i32* %b) {
 entry:
   %conv = sext i32 %a to i64
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %cmp = icmp eq i64 %conv, %call
   br i1 %cmp, label %if.then, label %if.else
 
@@ -43,7 +43,7 @@ if.end:
 
 define spir_kernel void @test_uniform_branch(i32 %a, i32* %b) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %cmp = icmp eq i32 %a, 42
   br i1 %cmp, label %if.then, label %if.else
 
@@ -67,7 +67,7 @@ if.end:
 }
 
 define spir_func void @test_nonvarying_loadstore(i32* %a, i32* %b, i32* %c) {
-  %index = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %index = call i64 @__mux_get_global_id(i32 0)
   %a.i = getelementptr i32, i32* %a, i64 %index
   %b.i = getelementptr i32, i32* %b, i64 %index
   %c.i = getelementptr i32, i32* %c, i64 %index
@@ -78,12 +78,12 @@ define spir_func void @test_nonvarying_loadstore(i32* %a, i32* %b, i32* %c) {
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; This test checks if the if blocks are vectorized without masks and if the phi
 ; node is also vectorized properly
 ; CHECK: define spir_kernel void @__vecz_v4_test_uniform_branch(i32 %a, ptr %b)
-; CHECK: %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: %call = call i64 @__mux_get_global_id(i32 0)
 ; CHECK: %[[SPLATINSERT:.+]] = insertelement <4 x i64> {{poison|undef}}, i64 %call, {{i32|i64}} 0
 ; CHECK: %[[SPLAT:.+]] = shufflevector <4 x i64> %[[SPLATINSERT]], <4 x i64> {{poison|undef}}, <4 x i32> zeroinitializer
 ; CHECK: %[[GID:.+]] = add <4 x i64> %[[SPLAT]], <i64 0, i64 1, i64 2, i64 3>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_struct_gep.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_struct_gep.ll
index 64355afa3c865..49b67c2053235 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_struct_gep.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_struct_gep.ll
@@ -25,7 +25,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @test(%struct.T addrspace(1)* %in, %struct.T addrspace(1)* %out, i32 addrspace(1)* %offsets) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %offsets, i64 %call
   %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
   %conv = sext i32 %0 to i64
@@ -37,7 +37,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; Check if we can packetize GEPs on structs
 ; Note that we only need to packetize the non-uniform operands..
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_conditional.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_conditional.ll
index c22771258e189..13bbe9556fb3a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_conditional.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_conditional.ll
@@ -20,20 +20,20 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z12get_local_idj(i32)
-declare spir_func i64 @_Z13get_global_idj(i32)
-declare spir_func i64 @_Z14get_local_sizej(i32)
+declare i64 @__mux_get_local_id(i32)
+declare i64 @__mux_get_global_id(i32)
+declare i64 @__mux_get_local_size(i32)
 
 ; Function Attrs: nounwind
 define spir_kernel void @reduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
-  %call = call spir_func i64 @_Z12get_local_idj(i32 0)
+  %call = call i64 @__mux_get_local_id(i32 0)
   br label %for.cond
 
 for.cond:                                         ; preds = %for.inc, %entry
   %storemerge = phi i32 [ 1, %entry ], [ %mul6, %for.inc ]
   %conv = zext i32 %storemerge to i64
-  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)
+  %call1 = call i64 @__mux_get_local_size(i32 0)
   %cmp = icmp ult i64 %conv, %call1
   br i1 %cmp, label %for.body, label %for.end
 
@@ -62,13 +62,13 @@ for.end:                                          ; preds = %for.cond
 ; Function Attrs: nounwind
 define spir_kernel void @noreduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
-  %call = call spir_func i64 @_Z12get_local_idj(i32 0)
+  %call = call i64 @__mux_get_local_id(i32 0)
   br label %for.cond
 
 for.cond:                                         ; preds = %for.inc, %entry
   %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ]
   %conv = zext i32 %storemerge to i64
-  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)
+  %call1 = call i64 @__mux_get_local_size(i32 0)
   %cmp = icmp ult i64 %conv, %call1
   br i1 %cmp, label %for.body, label %for.end
 
@@ -95,13 +95,13 @@ for.end:                                          ; preds = %for.cond
 ; Function Attrs: nounwind
 define spir_kernel void @noreduce2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
-  %call = call spir_func i64 @_Z12get_local_idj(i32 0)
+  %call = call i64 @__mux_get_local_id(i32 0)
   br label %for.cond
 
 for.cond:                                         ; preds = %for.inc, %entry
   %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ]
   %conv = zext i32 %storemerge to i64
-  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)
+  %call1 = call i64 @__mux_get_local_size(i32 0)
   %cmp = icmp ult i64 %conv, %call1
   br i1 %cmp, label %for.body, label %for.end
 
@@ -129,7 +129,7 @@ for.end:                                          ; preds = %for.cond
 ; Function Attrs: nounwind
 define spir_kernel void @conditional(i32 addrspace(1)* %in, i32 addrspace(1)* %out) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #3
+  %call = call i64 @__mux_get_global_id(i32 0) #3
   %0 = load i32, i32 addrspace(1)* %in, align 4
   %rem1 = and i32 %0, 1
   %tobool = icmp eq i32 %rem1, 0
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_conditional.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_conditional.ll
index 327d93185eadb..1d6962386952e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_conditional.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_conditional.ll
@@ -20,20 +20,20 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z12get_local_idj(i32)
-declare spir_func i64 @_Z13get_global_idj(i32)
-declare spir_func i64 @_Z14get_local_sizej(i32)
+declare i64 @__mux_get_local_id(i32)
+declare i64 @__mux_get_global_id(i32)
+declare i64 @__mux_get_local_size(i32)
 
 ; Function Attrs: nounwind
 define spir_kernel void @reduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
-  %call = call spir_func i64 @_Z12get_local_idj(i32 0)
+  %call = call i64 @__mux_get_local_id(i32 0)
   br label %for.cond
 
 for.cond:                                         ; preds = %for.inc, %entry
   %storemerge = phi i32 [ 1, %entry ], [ %mul6, %for.inc ]
   %conv = zext i32 %storemerge to i64
-  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)
+  %call1 = call i64 @__mux_get_local_size(i32 0)
   %cmp = icmp ult i64 %conv, %call1
   br i1 %cmp, label %for.body, label %for.end
 
@@ -62,13 +62,13 @@ for.end:                                          ; preds = %for.cond
 ; Function Attrs: nounwind
 define spir_kernel void @noreduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
-  %call = call spir_func i64 @_Z12get_local_idj(i32 0)
+  %call = call i64 @__mux_get_local_id(i32 0)
   br label %for.cond
 
 for.cond:                                         ; preds = %for.inc, %entry
   %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ]
   %conv = zext i32 %storemerge to i64
-  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)
+  %call1 = call i64 @__mux_get_local_size(i32 0)
   %cmp = icmp ult i64 %conv, %call1
   br i1 %cmp, label %for.body, label %for.end
 
@@ -95,13 +95,13 @@ for.end:                                          ; preds = %for.cond
 ; Function Attrs: nounwind
 define spir_kernel void @noreduce2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
-  %call = call spir_func i64 @_Z12get_local_idj(i32 0)
+  %call = call i64 @__mux_get_local_id(i32 0)
   br label %for.cond
 
 for.cond:                                         ; preds = %for.inc, %entry
   %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ]
   %conv = zext i32 %storemerge to i64
-  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)
+  %call1 = call i64 @__mux_get_local_size(i32 0)
   %cmp = icmp ult i64 %conv, %call1
   br i1 %cmp, label %for.body, label %for.end
 
@@ -129,7 +129,7 @@ for.end:                                          ; preds = %for.cond
 ; Function Attrs: nounwind
 define spir_kernel void @conditional(i32 addrspace(1)* %in, i32 addrspace(1)* %out) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #3
+  %call = call i64 @__mux_get_global_id(i32 0) #3
   %0 = load i32, i32 addrspace(1)* %in, align 4
   %rem1 = and i32 %0, 1
   %tobool = icmp eq i32 %rem1, 0
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_noreduce.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_noreduce.ll
index 56069ada5aedb..85466be8a1527 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_noreduce.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_noreduce.ll
@@ -20,20 +20,20 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z12get_local_idj(i32)
-declare spir_func i64 @_Z13get_global_idj(i32)
-declare spir_func i64 @_Z14get_local_sizej(i32)
+declare i64 @__mux_get_local_id(i32)
+declare i64 @__mux_get_global_id(i32)
+declare i64 @__mux_get_local_size(i32)
 
 ; Function Attrs: nounwind
 define spir_kernel void @reduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
-  %call = call spir_func i64 @_Z12get_local_idj(i32 0)
+  %call = call i64 @__mux_get_local_id(i32 0)
   br label %for.cond
 
 for.cond:                                         ; preds = %for.inc, %entry
   %storemerge = phi i32 [ 1, %entry ], [ %mul6, %for.inc ]
   %conv = zext i32 %storemerge to i64
-  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)
+  %call1 = call i64 @__mux_get_local_size(i32 0)
   %cmp = icmp ult i64 %conv, %call1
   br i1 %cmp, label %for.body, label %for.end
 
@@ -62,13 +62,13 @@ for.end:                                          ; preds = %for.cond
 ; Function Attrs: nounwind
 define spir_kernel void @noreduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
-  %call = call spir_func i64 @_Z12get_local_idj(i32 0)
+  %call = call i64 @__mux_get_local_id(i32 0)
   br label %for.cond
 
 for.cond:                                         ; preds = %for.inc, %entry
   %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ]
   %conv = zext i32 %storemerge to i64
-  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)
+  %call1 = call i64 @__mux_get_local_size(i32 0)
   %cmp = icmp ult i64 %conv, %call1
   br i1 %cmp, label %for.body, label %for.end
 
@@ -95,13 +95,13 @@ for.end:                                          ; preds = %for.cond
 ; Function Attrs: nounwind
 define spir_kernel void @noreduce2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
-  %call = call spir_func i64 @_Z12get_local_idj(i32 0)
+  %call = call i64 @__mux_get_local_id(i32 0)
   br label %for.cond
 
 for.cond:                                         ; preds = %for.inc, %entry
   %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ]
   %conv = zext i32 %storemerge to i64
-  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)
+  %call1 = call i64 @__mux_get_local_size(i32 0)
   %cmp = icmp ult i64 %conv, %call1
   br i1 %cmp, label %for.body, label %for.end
 
@@ -129,7 +129,7 @@ for.end:                                          ; preds = %for.cond
 ; Function Attrs: nounwind
 define spir_kernel void @conditional(i32 addrspace(1)* %in, i32 addrspace(1)* %out) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #3
+  %call = call i64 @__mux_get_global_id(i32 0) #3
   %0 = load i32, i32 addrspace(1)* %in, align 4
   %rem1 = and i32 %0, 1
   %tobool = icmp eq i32 %rem1, 0
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_noreduce2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_noreduce2.ll
index c863858541e44..57ebb217c5ae6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_noreduce2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_noreduce2.ll
@@ -20,20 +20,20 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z12get_local_idj(i32)
-declare spir_func i64 @_Z13get_global_idj(i32)
-declare spir_func i64 @_Z14get_local_sizej(i32)
+declare i64 @__mux_get_local_id(i32)
+declare i64 @__mux_get_global_id(i32)
+declare i64 @__mux_get_local_size(i32)
 
 ; Function Attrs: nounwind
 define spir_kernel void @noreduce2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
-  %call = call spir_func i64 @_Z12get_local_idj(i32 0)
+  %call = call i64 @__mux_get_local_id(i32 0)
   br label %for.cond
 
 for.cond:                                         ; preds = %for.inc, %entry
   %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ]
   %conv = zext i32 %storemerge to i64
-  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)
+  %call1 = call i64 @__mux_get_local_size(i32 0)
   %cmp = icmp ult i64 %conv, %call1
   br i1 %cmp, label %for.body, label %for.end
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_reduce.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_reduce.ll
index 5c9f38f5546ca..c05abe83b1c59 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_reduce.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_reduce.ll
@@ -20,20 +20,20 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z12get_local_idj(i32)
-declare spir_func i64 @_Z13get_global_idj(i32)
-declare spir_func i64 @_Z14get_local_sizej(i32)
+declare i64 @__mux_get_local_id(i32)
+declare i64 @__mux_get_global_id(i32)
+declare i64 @__mux_get_local_size(i32)
 
 ; Function Attrs: nounwind
 define spir_kernel void @reduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
-  %call = call spir_func i64 @_Z12get_local_idj(i32 0)
+  %call = call i64 @__mux_get_local_id(i32 0)
   br label %for.cond
 
 for.cond:                                         ; preds = %for.inc, %entry
   %storemerge = phi i32 [ 1, %entry ], [ %mul6, %for.inc ]
   %conv = zext i32 %storemerge to i64
-  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)
+  %call1 = call i64 @__mux_get_local_size(i32 0)
   %cmp = icmp ult i64 %conv, %call1
   br i1 %cmp, label %for.body, label %for.end
 
@@ -62,13 +62,13 @@ for.end:                                          ; preds = %for.cond
 ; Function Attrs: nounwind
 define spir_kernel void @noreduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
-  %call = call spir_func i64 @_Z12get_local_idj(i32 0)
+  %call = call i64 @__mux_get_local_id(i32 0)
   br label %for.cond
 
 for.cond:                                         ; preds = %for.inc, %entry
   %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ]
   %conv = zext i32 %storemerge to i64
-  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)
+  %call1 = call i64 @__mux_get_local_size(i32 0)
   %cmp = icmp ult i64 %conv, %call1
   br i1 %cmp, label %for.body, label %for.end
 
@@ -95,13 +95,13 @@ for.end:                                          ; preds = %for.cond
 ; Function Attrs: nounwind
 define spir_kernel void @noreduce2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
-  %call = call spir_func i64 @_Z12get_local_idj(i32 0)
+  %call = call i64 @__mux_get_local_id(i32 0)
   br label %for.cond
 
 for.cond:                                         ; preds = %for.inc, %entry
   %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ]
   %conv = zext i32 %storemerge to i64
-  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)
+  %call1 = call i64 @__mux_get_local_size(i32 0)
   %cmp = icmp ult i64 %conv, %call1
   br i1 %cmp, label %for.body, label %for.end
 
@@ -129,7 +129,7 @@ for.end:                                          ; preds = %for.cond
 ; Function Attrs: nounwind
 define spir_kernel void @conditional(i32 addrspace(1)* %in, i32 addrspace(1)* %out) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #3
+  %call = call i64 @__mux_get_global_id(i32 0) #3
   %0 = load i32, i32 addrspace(1)* %in, align 4
   %rem1 = and i32 %0, 1
   %tobool = icmp eq i32 %rem1, 0
@@ -154,7 +154,7 @@ if.end:                                           ; preds = %entry, %if.then
 ; CHECK: define spir_kernel void @__vecz_v4_reduce(ptr addrspace(3) %in, ptr addrspace(3) %out)
 ; CHECK: insertelement <4 x i64> {{poison|undef}}, i64
 ; CHECK: shufflevector <4 x i64>
-; CHECK: %[[LOCAL_SIZE:[^ ]+]] = call spir_func i64 @_Z14get_local_sizej(i32 0)
+; CHECK: %[[LOCAL_SIZE:[^ ]+]] = call i64 @__mux_get_local_size(i32 0)
 ; CHECK: icmp {{(ugt|ult)}} i64 %[[LOCAL_SIZE]], {{(1|2)}}
 ; CHECK-NEXT: br
 ; CHECK: phi i32
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_conditional.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_conditional.ll
index 2f58fa76a8b0c..abb7b8efe8723 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_conditional.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_conditional.ll
@@ -20,20 +20,20 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z12get_local_idj(i32)
-declare spir_func i64 @_Z13get_global_idj(i32)
-declare spir_func i64 @_Z14get_local_sizej(i32)
+declare i64 @__mux_get_local_id(i32)
+declare i64 @__mux_get_global_id(i32)
+declare i64 @__mux_get_local_size(i32)
 
 ; Function Attrs: nounwind
 define spir_kernel void @reduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
-  %call = call spir_func i64 @_Z12get_local_idj(i32 0)
+  %call = call i64 @__mux_get_local_id(i32 0)
   br label %for.cond
 
 for.cond:                                         ; preds = %for.inc, %entry
   %storemerge = phi i32 [ 1, %entry ], [ %mul6, %for.inc ]
   %conv = zext i32 %storemerge to i64
-  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)
+  %call1 = call i64 @__mux_get_local_size(i32 0)
   %cmp = icmp ult i64 %conv, %call1
   br i1 %cmp, label %for.body, label %for.end
 
@@ -62,13 +62,13 @@ for.end:                                          ; preds = %for.cond
 ; Function Attrs: nounwind
 define spir_kernel void @noreduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
-  %call = call spir_func i64 @_Z12get_local_idj(i32 0)
+  %call = call i64 @__mux_get_local_id(i32 0)
   br label %for.cond
 
 for.cond:                                         ; preds = %for.inc, %entry
   %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ]
   %conv = zext i32 %storemerge to i64
-  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)
+  %call1 = call i64 @__mux_get_local_size(i32 0)
   %cmp = icmp ult i64 %conv, %call1
   br i1 %cmp, label %for.body, label %for.end
 
@@ -95,13 +95,13 @@ for.end:                                          ; preds = %for.cond
 ; Function Attrs: nounwind
 define spir_kernel void @noreduce2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
-  %call = call spir_func i64 @_Z12get_local_idj(i32 0)
+  %call = call i64 @__mux_get_local_id(i32 0)
   br label %for.cond
 
 for.cond:                                         ; preds = %for.inc, %entry
   %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ]
   %conv = zext i32 %storemerge to i64
-  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)
+  %call1 = call i64 @__mux_get_local_size(i32 0)
   %cmp = icmp ult i64 %conv, %call1
   br i1 %cmp, label %for.body, label %for.end
 
@@ -129,7 +129,7 @@ for.end:                                          ; preds = %for.cond
 ; Function Attrs: nounwind
 define spir_kernel void @conditional(i32 addrspace(1)* %in, i32 addrspace(1)* %out) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #3
+  %call = call i64 @__mux_get_global_id(i32 0) #3
   %0 = load i32, i32 addrspace(1)* %in, align 4
   %rem1 = and i32 %0, 1
   %tobool = icmp eq i32 %rem1, 0
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_noreduce.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_noreduce.ll
index 926d245564712..8d25758ef4869 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_noreduce.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_noreduce.ll
@@ -20,20 +20,20 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z12get_local_idj(i32)
-declare spir_func i64 @_Z13get_global_idj(i32)
-declare spir_func i64 @_Z14get_local_sizej(i32)
+declare i64 @__mux_get_local_id(i32)
+declare i64 @__mux_get_global_id(i32)
+declare i64 @__mux_get_local_size(i32)
 
 ; Function Attrs: nounwind
 define spir_kernel void @reduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
-  %call = call spir_func i64 @_Z12get_local_idj(i32 0)
+  %call = call i64 @__mux_get_local_id(i32 0)
   br label %for.cond
 
 for.cond:                                         ; preds = %for.inc, %entry
   %storemerge = phi i32 [ 1, %entry ], [ %mul6, %for.inc ]
   %conv = zext i32 %storemerge to i64
-  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)
+  %call1 = call i64 @__mux_get_local_size(i32 0)
   %cmp = icmp ult i64 %conv, %call1
   br i1 %cmp, label %for.body, label %for.end
 
@@ -62,13 +62,13 @@ for.end:                                          ; preds = %for.cond
 ; Function Attrs: nounwind
 define spir_kernel void @noreduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
-  %call = call spir_func i64 @_Z12get_local_idj(i32 0)
+  %call = call i64 @__mux_get_local_id(i32 0)
   br label %for.cond
 
 for.cond:                                         ; preds = %for.inc, %entry
   %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ]
   %conv = zext i32 %storemerge to i64
-  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)
+  %call1 = call i64 @__mux_get_local_size(i32 0)
   %cmp = icmp ult i64 %conv, %call1
   br i1 %cmp, label %for.body, label %for.end
 
@@ -95,13 +95,13 @@ for.end:                                          ; preds = %for.cond
 ; Function Attrs: nounwind
 define spir_kernel void @noreduce2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
-  %call = call spir_func i64 @_Z12get_local_idj(i32 0)
+  %call = call i64 @__mux_get_local_id(i32 0)
   br label %for.cond
 
 for.cond:                                         ; preds = %for.inc, %entry
   %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ]
   %conv = zext i32 %storemerge to i64
-  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)
+  %call1 = call i64 @__mux_get_local_size(i32 0)
   %cmp = icmp ult i64 %conv, %call1
   br i1 %cmp, label %for.body, label %for.end
 
@@ -129,7 +129,7 @@ for.end:                                          ; preds = %for.cond
 ; Function Attrs: nounwind
 define spir_kernel void @conditional(i32 addrspace(1)* %in, i32 addrspace(1)* %out) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #3
+  %call = call i64 @__mux_get_global_id(i32 0) #3
   %0 = load i32, i32 addrspace(1)* %in, align 4
   %rem1 = and i32 %0, 1
   %tobool = icmp eq i32 %rem1, 0
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_noreduce2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_noreduce2.ll
index 7f0782e9a3968..57101e9421d75 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_noreduce2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_noreduce2.ll
@@ -20,20 +20,20 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z12get_local_idj(i32)
-declare spir_func i64 @_Z13get_global_idj(i32)
-declare spir_func i64 @_Z14get_local_sizej(i32)
+declare i64 @__mux_get_local_id(i32)
+declare i64 @__mux_get_global_id(i32)
+declare i64 @__mux_get_local_size(i32)
 
 ; Function Attrs: nounwind
 define spir_kernel void @noreduce2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
-  %call = call spir_func i64 @_Z12get_local_idj(i32 0)
+  %call = call i64 @__mux_get_local_id(i32 0)
   br label %for.cond
 
 for.cond:                                         ; preds = %for.inc, %entry
   %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ]
   %conv = zext i32 %storemerge to i64
-  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)
+  %call1 = call i64 @__mux_get_local_size(i32 0)
   %cmp = icmp ult i64 %conv, %call1
   br i1 %cmp, label %for.body, label %for.end
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_reduce.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_reduce.ll
index c95f0d6f9fb02..0d216f4fbec55 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_reduce.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_reduce.ll
@@ -20,20 +20,20 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z12get_local_idj(i32)
-declare spir_func i64 @_Z13get_global_idj(i32)
-declare spir_func i64 @_Z14get_local_sizej(i32)
+declare i64 @__mux_get_local_id(i32)
+declare i64 @__mux_get_global_id(i32)
+declare i64 @__mux_get_local_size(i32)
 
 ; Function Attrs: nounwind
 define spir_kernel void @reduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
-  %call = call spir_func i64 @_Z12get_local_idj(i32 0)
+  %call = call i64 @__mux_get_local_id(i32 0)
   br label %for.cond
 
 for.cond:                                         ; preds = %for.inc, %entry
   %storemerge = phi i32 [ 1, %entry ], [ %mul6, %for.inc ]
   %conv = zext i32 %storemerge to i64
-  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)
+  %call1 = call i64 @__mux_get_local_size(i32 0)
   %cmp = icmp ult i64 %conv, %call1
   br i1 %cmp, label %for.body, label %for.end
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_noreduce.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_noreduce.ll
index e370be4748007..fc19214884173 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_noreduce.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_noreduce.ll
@@ -20,20 +20,20 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z12get_local_idj(i32)
-declare spir_func i64 @_Z13get_global_idj(i32)
-declare spir_func i64 @_Z14get_local_sizej(i32)
+declare i64 @__mux_get_local_id(i32)
+declare i64 @__mux_get_global_id(i32)
+declare i64 @__mux_get_local_size(i32)
 
 ; Function Attrs: nounwind
 define spir_kernel void @reduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
-  %call = call spir_func i64 @_Z12get_local_idj(i32 0)
+  %call = call i64 @__mux_get_local_id(i32 0)
   br label %for.cond
 
 for.cond:                                         ; preds = %for.inc, %entry
   %storemerge = phi i32 [ 1, %entry ], [ %mul6, %for.inc ]
   %conv = zext i32 %storemerge to i64
-  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)
+  %call1 = call i64 @__mux_get_local_size(i32 0)
   %cmp = icmp ult i64 %conv, %call1
   br i1 %cmp, label %for.body, label %for.end
 
@@ -62,13 +62,13 @@ for.end:                                          ; preds = %for.cond
 ; Function Attrs: nounwind
 define spir_kernel void @noreduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
-  %call = call spir_func i64 @_Z12get_local_idj(i32 0)
+  %call = call i64 @__mux_get_local_id(i32 0)
   br label %for.cond
 
 for.cond:                                         ; preds = %for.inc, %entry
   %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ]
   %conv = zext i32 %storemerge to i64
-  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)
+  %call1 = call i64 @__mux_get_local_size(i32 0)
   %cmp = icmp ult i64 %conv, %call1
   br i1 %cmp, label %for.body, label %for.end
 
@@ -95,13 +95,13 @@ for.end:                                          ; preds = %for.cond
 ; Function Attrs: nounwind
 define spir_kernel void @noreduce2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
-  %call = call spir_func i64 @_Z12get_local_idj(i32 0)
+  %call = call i64 @__mux_get_local_id(i32 0)
   br label %for.cond
 
 for.cond:                                         ; preds = %for.inc, %entry
   %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ]
   %conv = zext i32 %storemerge to i64
-  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)
+  %call1 = call i64 @__mux_get_local_size(i32 0)
   %cmp = icmp ult i64 %conv, %call1
   br i1 %cmp, label %for.body, label %for.end
 
@@ -129,7 +129,7 @@ for.end:                                          ; preds = %for.cond
 ; Function Attrs: nounwind
 define spir_kernel void @conditional(i32 addrspace(1)* %in, i32 addrspace(1)* %out) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #3
+  %call = call i64 @__mux_get_global_id(i32 0) #3
   %0 = load i32, i32 addrspace(1)* %in, align 4
   %rem1 = and i32 %0, 1
   %tobool = icmp eq i32 %rem1, 0
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_noreduce2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_noreduce2.ll
index 46baacb174abd..4b4ca63435695 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_noreduce2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_noreduce2.ll
@@ -20,20 +20,20 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z12get_local_idj(i32)
-declare spir_func i64 @_Z13get_global_idj(i32)
-declare spir_func i64 @_Z14get_local_sizej(i32)
+declare i64 @__mux_get_local_id(i32)
+declare i64 @__mux_get_global_id(i32)
+declare i64 @__mux_get_local_size(i32)
 
 ; Function Attrs: nounwind
 define spir_kernel void @noreduce2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
-  %call = call spir_func i64 @_Z12get_local_idj(i32 0)
+  %call = call i64 @__mux_get_local_id(i32 0)
   br label %for.cond
 
 for.cond:                                         ; preds = %for.inc, %entry
   %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ]
   %conv = zext i32 %storemerge to i64
-  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)
+  %call1 = call i64 @__mux_get_local_size(i32 0)
   %cmp = icmp ult i64 %conv, %call1
   br i1 %cmp, label %for.body, label %for.end
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_reduce.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_reduce.ll
index a1482f948478a..d29a4b8d2cc53 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_reduce.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_reduce.ll
@@ -20,20 +20,20 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z12get_local_idj(i32)
-declare spir_func i64 @_Z13get_global_idj(i32)
-declare spir_func i64 @_Z14get_local_sizej(i32)
+declare i64 @__mux_get_local_id(i32)
+declare i64 @__mux_get_global_id(i32)
+declare i64 @__mux_get_local_size(i32)
 
 ; Function Attrs: nounwind
 define spir_kernel void @reduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
-  %call = call spir_func i64 @_Z12get_local_idj(i32 0)
+  %call = call i64 @__mux_get_local_id(i32 0)
   br label %for.cond
 
 for.cond:                                         ; preds = %for.inc, %entry
   %storemerge = phi i32 [ 1, %entry ], [ %mul6, %for.inc ]
   %conv = zext i32 %storemerge to i64
-  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0)
+  %call1 = call i64 @__mux_get_local_size(i32 0)
   %cmp = icmp ult i64 %conv, %call1
   br i1 %cmp, label %for.body, label %for.end
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization0.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization0.ll
index 053a647ea32bf..e4a19066e79bb 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization0.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization0.ll
@@ -128,7 +128,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @partial_linearization0(i32 addrspace(1)* %out, i32 %n) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   %rem = srem i32 %conv, 5
   %cmp = icmp eq i32 %rem, 0
@@ -258,7 +258,7 @@ if.end73:                                         ; preds = %if.end70, %if.end41
 }
 
 ; Function Attrs: nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization1.ll
index 32244424ddabe..a68b3d9eda689 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization1.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization1.ll
@@ -89,7 +89,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @partial_linearization1(i32 addrspace(1)* %out, i32 %n) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   br label %while.body
 
@@ -180,7 +180,7 @@ early:                                            ; preds = %for.end34, %for.end
 }
 
 ; Function Attrs: nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization10.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization10.ll
index d11101c7ffd37..3318c9d1e797b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization10.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization10.ll
@@ -161,7 +161,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @partial_linearization10(i32 addrspace(1)* %out, i32 %n) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   br label %while.body
 
@@ -316,7 +316,7 @@ s:                                                ; preds = %for.cond68, %for.co
 }
 
 ; Function Attrs: nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization11.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization11.ll
index 9b335e8f3d02f..71d05eb3f69d3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization11.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization11.ll
@@ -146,7 +146,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @partial_linearization11(i32 addrspace(1)* %out, i32 %n) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   br label %while.body
 
@@ -261,7 +261,7 @@ n46:                                              ; preds = %i44, %for.cond35
 }
 
 ; Function Attrs: nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization12.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization12.ll
index b38aa0247374d..f226c3eb5bb8d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization12.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization12.ll
@@ -213,7 +213,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @partial_linearization12(i32 addrspace(1)* %out, i32 %n) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   br label %while.body
 
@@ -431,7 +431,7 @@ v:                                                ; preds = %for.cond107, %for.c
 }
 
 ; Function Attrs: nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization13.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization13.ll
index 035b584ab0b0e..5385a5ab95d69 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization13.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization13.ll
@@ -96,8 +96,8 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @partial_linearization13(i32 addrspace(1)* %out, i32 %n) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
-  %call1 = call spir_func i64 @_Z15get_global_sizej(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
+  %call1 = call i64 @__mux_get_global_size(i32 0) #2
   %add = add i64 %call, 1
   %cmp = icmp ult i64 %add, %call1
   br i1 %cmp, label %if.then, label %if.else
@@ -162,10 +162,10 @@ if.end17:                                         ; preds = %sw.bb14, %if.else,
 }
 
 ; Function Attrs: nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 ; Function Attrs: nounwind readonly
-declare spir_func i64 @_Z15get_global_sizej(i32) #1
+declare i64 @__mux_get_global_size(i32) #1
 
 attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization14.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization14.ll
index 76b96c54195fe..c7fc571b91837 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization14.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization14.ll
@@ -94,7 +94,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: convergent nounwind
 define spir_kernel void @partial_linearization14(i32 addrspace(1)* %out, i32 %n) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   %cmp = icmp slt i32 %n, 5
   br i1 %cmp, label %for.cond, label %while.body
@@ -193,7 +193,7 @@ early:                                            ; preds = %for.end49, %for.end
 }
 
 ; Function Attrs: convergent nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization15.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization15.ll
index 78578253482b7..bdbf0d5c88b6b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization15.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization15.ll
@@ -145,7 +145,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: convergent nounwind
 define spir_kernel void @partial_linearization15(i32 addrspace(1)* %out, i32 %n) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   br label %while.body
 
@@ -272,7 +272,7 @@ q:                                                ; preds = %for.cond59, %for.co
 }
 
 ; Function Attrs: convergent nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization16.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization16.ll
index 0da9fdfd55ef5..7bfeb0054f12b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization16.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization16.ll
@@ -100,7 +100,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: convergent nounwind
 define spir_kernel void @partial_linearization16(i32 addrspace(1)* %out, i32 %n) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   %cmp = icmp slt i32 %n, 5
   br i1 %cmp, label %for.cond, label %while.body
@@ -211,7 +211,7 @@ early:                                            ; preds = %for.cond52, %for.en
 }
 
 ; Function Attrs: convergent nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization17.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization17.ll
index 06e8e72eeba65..4dd7317c2df6e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization17.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization17.ll
@@ -130,7 +130,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: convergent nounwind
 define spir_kernel void @partial_linearization17(i32 addrspace(1)* %out, i32 %n) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   br label %while.body
 
@@ -249,7 +249,7 @@ p:                                                ; preds = %for.cond60, %for.en
 }
 
 ; Function Attrs: convergent nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization18.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization18.ll
index df7dcb93cbca8..3fc928055b86b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization18.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization18.ll
@@ -109,7 +109,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: convergent nounwind
 define spir_kernel void @partial_linearization18(i32 addrspace(1)* %out, i32 %n) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   br label %while.body
 
@@ -192,7 +192,7 @@ if.end42:                                         ; preds = %if.else40, %i38
 }
 
 ; Function Attrs: convergent nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization19.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization19.ll
index 7973e9ef46da2..b5c52ad8c3341 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization19.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization19.ll
@@ -118,7 +118,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: convergent nounwind
 define spir_kernel void @partial_linearization19(i32 addrspace(1)* %out, i32 %n) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   br label %while.body
 
@@ -205,7 +205,7 @@ j:                                                ; preds = %for.cond40, %for.co
 }
 
 ; Function Attrs: convergent nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization2.ll
index 7cefd8bad6526..93f135b7a073d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization2.ll
@@ -89,7 +89,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @partial_linearization2(i32 addrspace(1)* %out, i32 %n) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   %cmp = icmp sgt i32 %n, 10
   br i1 %cmp, label %if.then, label %if.else17
@@ -183,7 +183,7 @@ end:                                              ; preds = %i42, %h
 }
 
 ; Function Attrs: nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization20.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization20.ll
index fd2e031325485..bb155865e445d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization20.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization20.ll
@@ -100,7 +100,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: convergent nounwind
 define spir_kernel void @partial_linearization20(i32 addrspace(1)* %out, i32 %n) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   br label %while.body
 
@@ -153,7 +153,7 @@ g:                                                ; preds = %for.cond, %e, %whil
 }
 
 ; Function Attrs: convergent nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization21.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization21.ll
index 1cbe02fccfb77..c6b2608c7604f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization21.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization21.ll
@@ -96,7 +96,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: convergent nounwind
 define spir_kernel void @partial_linearization21(i32 addrspace(1)* %out, i32 %n) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   br label %while.body
 
@@ -135,7 +135,7 @@ f:                                                ; preds = %e, %if.else, %while
 }
 
 ; Function Attrs: convergent nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization22.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization22.ll
index 5f2f5c3146b18..5a8b3dc38a0c6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization22.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization22.ll
@@ -108,7 +108,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: convergent nounwind
 define spir_kernel void @partial_linearization22(i32 addrspace(1)* %out, i32 %n) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   br label %while.body
 
@@ -168,7 +168,7 @@ h:                                                ; preds = %for.cond, %f, %if.e
 }
 
 ; Function Attrs: convergent nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization23.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization23.ll
index ce1252b537ae9..4dfe9cf837e7b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization23.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization23.ll
@@ -97,7 +97,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: convergent nounwind
 define spir_kernel void @partial_linearization23(i32 addrspace(1)* %out, i32 %n) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %conv = trunc i64 %call to i32
   %cmp = icmp sgt i32 %n, 10
   br i1 %cmp, label %if.then, label %if.else7
@@ -236,7 +236,7 @@ end:                                              ; preds = %i24, %h
 }
 
 ; Function Attrs: nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; CHECK: spir_kernel void @__vecz_v4_partial_linearization23
 ; CHECK: i24:
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization3.ll
index c61c6baa9947e..aeacf9c8d3a98 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization3.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization3.ll
@@ -87,7 +87,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @partial_linearization3(i32 addrspace(1)* %out, i32 %n) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   %cmp = icmp sgt i32 %n, 10
   br i1 %cmp, label %if.then, label %if.else17
@@ -181,7 +181,7 @@ end:                                              ; preds = %i42, %for.cond
 }
 
 ; Function Attrs: nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization4.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization4.ll
index 655ae12b89510..f223cea33546b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization4.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization4.ll
@@ -80,7 +80,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @partial_linearization4(i32 addrspace(1)* %out, i32 %n) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   %0 = icmp eq i32 %conv, -2147483648
   %1 = icmp eq i32 %n, -1
@@ -139,7 +139,7 @@ g:                                                ; preds = %f, %e
 }
 
 ; Function Attrs: nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization5.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization5.ll
index 218d993ad36ae..520b069f53c85 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization5.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization5.ll
@@ -88,7 +88,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @partial_linearization5(i32 addrspace(1)* %out, i32 %n) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   %rem1 = and i32 %conv, 1
   %cmp = icmp eq i32 %rem1, 0
@@ -158,7 +158,7 @@ g:                                                ; preds = %f, %if.then
 }
 
 ; Function Attrs: nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization6.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization6.ll
index ad5918b645550..73dffa88a178c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization6.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization6.ll
@@ -85,7 +85,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @partial_linearization6(i32 addrspace(1)* %out, i32 %n) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   br label %while.body
 
@@ -140,7 +140,7 @@ early:                                            ; preds = %e, %while.end
 }
 
 ; Function Attrs: nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization7.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization7.ll
index 7976d741c2eaf..13bbb4131361a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization7.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization7.ll
@@ -95,7 +95,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @partial_linearization7(i32 addrspace(1)* %out, i32 %n) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   %cmp = icmp sgt i32 %n, 10
   br i1 %cmp, label %if.then, label %if.else5
@@ -158,7 +158,7 @@ i29:                                              ; preds = %h, %for.cond19
 }
 
 ; Function Attrs: nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization8.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization8.ll
index bdec2081eeb19..78a21ccea682a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization8.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization8.ll
@@ -82,7 +82,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @partial_linearization8(i32 addrspace(1)* %out, i32 %n) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   %0 = icmp eq i32 %conv, -2147483648
   %1 = icmp eq i32 %n, -1
@@ -142,7 +142,7 @@ g:                                                ; preds = %f, %e
 }
 
 ; Function Attrs: nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization9.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization9.ll
index cb1212315acc2..5ff8cfff49728 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization9.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization9.ll
@@ -70,7 +70,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @partial_linearization9(i32 addrspace(1)* %out, i32 %n) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   br label %while.body
 
@@ -105,7 +105,7 @@ while.end:                                        ; preds = %for.end
 }
 
 ; Function Attrs: nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization_exit_masks.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization_exit_masks.ll
index ad2f4f3d7dde6..fad2ee91be6cb 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization_exit_masks.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization_exit_masks.ll
@@ -25,7 +25,7 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @test(i32 addrspace(1)* %out, i32 %n) {
 entry:
-  %call = tail call spir_func i32 @_Z13get_global_idj(i32 0)
+  %call = tail call i32 @__mux_get_global_id(i32 0)
   %cmp = icmp sgt i32 %n, 0
   br i1 %cmp, label %for.body.preheader, label %if.end.thread
 
@@ -60,6 +60,6 @@ if.end2:
   ret void
 }
 
-declare spir_func i32 @_Z13get_global_idj(i32)
+declare i32 @__mux_get_global_id(i32)
 
 declare spir_func i32 @_Z3maxii(i32, i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline.ll
index ef109b67827a0..a3dbc7703c5fc 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline.ll
@@ -39,10 +39,10 @@ target triple = "spir64-unknown-unknown"
 ; PASSES2-NOT: Running pass:
 
 define spir_kernel void @foo(i32 addrspace(1)* %out) {
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idx
   store i32 0, i32 addrspace(1)* %arrayidx, align 4
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline_printafter.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline_printafter.ll
index efe037e65b3d9..43079dfa9e05f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline_printafter.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline_printafter.ll
@@ -20,25 +20,25 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; CHECK: IR Dump After Simplify masked memory operations{{( on __vecz_v2_foo)?}}
 ; CHECK-NEXT: define spir_kernel void @__vecz_v2_foo(ptr addrspace(1) %out) #0 {
-; CHECK-NEXT:   %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK-NEXT:   %idx = call i64 @__mux_get_global_id(i32 0)
 ; CHECK-NEXT:   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %idx
 ; CHECK-NEXT:   store i32 0, ptr addrspace(1) %arrayidx, align 4
 ; CHECK-NEXT:   ret void
 ; CHECK-NEXT: }
 
 ; CHECK: define spir_kernel void @__vecz_v2_foo(ptr addrspace(1) %out) {{.*}} {
-; CHECK-NEXT:   %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK-NEXT:   %idx = call i64 @__mux_get_global_id(i32 0)
 ; CHECK-NEXT:   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %idx
 ; CHECK-NEXT:   store <2 x i32> zeroinitializer, ptr addrspace(1) %arrayidx, align 4
 ; CHECK-NEXT:   ret void
 ; CHECK-NEXT: }
 
 define spir_kernel void @foo(i32 addrspace(1)* %out) {
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idx
   store i32 0, i32 addrspace(1)* %arrayidx, align 4
   ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_interleaved.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_interleaved.ll
index eb6550ae076b3..954e6fe8aaaef 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_interleaved.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_interleaved.ll
@@ -23,7 +23,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @codegen_2(i32 addrspace(1)* nocapture readonly %in, i32 addrspace(1)* nocapture %out, i32 %size, i32 %reps) local_unnamed_addr {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %conv = sext i32 %reps to i64
   %mul = mul i64 %call, %conv
   %add = add i64 %call, 1
@@ -60,7 +60,7 @@ for.inc:                                          ; preds = %if.then, %for.body
   br i1 %cmp, label %for.body, label %for.cond.cleanup
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32) local_unnamed_addr
+declare i64 @__mux_get_global_id(i32) local_unnamed_addr
 
 !llvm.module.flags = !{!0}
 !opencl.ocl.version = !{!1}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_node_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_node_debug_info.ll
index a603cf94b1c63..ab9e14c63b94e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_node_debug_info.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_node_debug_info.ll
@@ -35,7 +35,7 @@ entry:
   store i32 addrspace(3)* %b, i32 addrspace(3)** %b.addr, align 8
   call void @llvm.dbg.declare(metadata i32 addrspace(3)** %b.addr, metadata !13, metadata !30), !dbg !31
   call void @llvm.dbg.declare(metadata i64* %tid, metadata !14, metadata !30), !dbg !32
-  %call = call spir_func i64 @_Z12get_local_idj(i32 0) #3, !dbg !32
+  %call = call i64 @__mux_get_local_id(i32 0) #3, !dbg !32
   store i64 %call, i64* %tid, align 8, !dbg !32
   call void @llvm.dbg.declare(metadata i32* %i, metadata !19, metadata !30), !dbg !33
   %0 = load i64, i64* %tid, align 8, !dbg !33
@@ -74,7 +74,7 @@ for.end:                                          ; preds = %for.cond
 ; Function Attrs: nounwind readnone
 declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
-declare spir_func i64 @_Z12get_local_idj(i32) #2
+declare i64 @__mux_get_local_id(i32) #2
 
 attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_scatter_gather.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_scatter_gather.ll
index f35d1e5d7f068..bb6f070318c66 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_scatter_gather.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_scatter_gather.ll
@@ -23,7 +23,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @phi_memory(i32 addrspace(1)* %input, i32 addrspace(1)* %output, i32 %size) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   %idx.ext = sext i32 %conv to i64
   %add.ptr = getelementptr inbounds i32, i32 addrspace(1)* %output, i64 %idx.ext
@@ -49,7 +49,7 @@ for.end:                                          ; preds = %for.cond
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_scatter_gather_2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_scatter_gather_2.ll
index 4f780b3e97285..68b766a9b93aa 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_scatter_gather_2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_scatter_gather_2.ll
@@ -23,7 +23,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @phi_memory(i32 addrspace(1)* %input, i32 addrspace(1)* %output, i64 %size) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %add.ptr = getelementptr inbounds i32, i32 addrspace(1)* %output, i64 %call
   br label %for.cond
 
@@ -46,7 +46,7 @@ for.end:                                          ; preds = %for.cond
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/predicate_with_switch.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/predicate_with_switch.ll
index 9af4b5aae0e7e..a946ab0eaf5f7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/predicate_with_switch.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/predicate_with_switch.ll
@@ -19,16 +19,16 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z12get_local_idj(i32)
+declare i64 @__mux_get_local_id(i32)
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 @predicate_with_switch.tmpIn = internal addrspace(3) global [16 x i32] undef, align 4
 
 define spir_kernel void @predicate_with_switch(i32 addrspace(1)* %A, i32 addrspace(1)* %B) #0 {
 entry:
-  %call = call spir_func i64 @_Z12get_local_idj(i32 0) #2
-  %call1 = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_local_id(i32 0) #2
+  %call1 = call i64 @__mux_get_global_id(i32 0) #2
   switch i64 %call, label %if.end [
     i64 0, label %return
     i64 200, label %return
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/preserve-fast-math.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/preserve-fast-math.ll
index 5698c31901290..9b280eca43289 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/preserve-fast-math.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/preserve-fast-math.ll
@@ -20,7 +20,7 @@
 
 define spir_kernel void @fast_nan(float addrspace(1)* %src1, float addrspace(1)* %src2, i16 addrspace(1)* %dst, i32 %width) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %src1, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
   %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %src2, i64 %call
@@ -32,4 +32,4 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/printf_float.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/printf_float.ll
index c661d2c931d57..44bd970d07cad 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/printf_float.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/printf_float.ll
@@ -26,7 +26,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @printf_kernel(i32 addrspace(1)* %in, i32 addrspace(1)* %stridesX, i32 addrspace(1)* %dst, i32 %width, i32 %height) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #3
+  %call = call i64 @__mux_get_global_id(i32 0) #3
   %cmp = icmp eq i32 %width, 13
   br i1 %cmp, label %if.then, label %if.end
 
@@ -43,7 +43,7 @@ if.end:                                           ; preds = %if.then, %entry
 
 define spir_kernel void @test_float(float* %in) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds float, float* %in, i64 %call
   %0 = load float, float* %arrayidx, align 4
   %mul = fmul float %0, %0
@@ -54,7 +54,7 @@ entry:
 
 
 
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 declare extern_weak spir_func i32 @printf(i8 addrspace(2)*, ...) #1
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/regression_by_all.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/regression_by_all.ll
index 0cec9bb1fef51..39be7c37e5468 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/regression_by_all.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/regression_by_all.ll
@@ -79,7 +79,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: convergent nounwind
 define spir_kernel void @regression_by_all(i32 addrspace(1)* %out, i32 %n) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %conv = trunc i64 %call to i32
   %rem1 = and i32 %n, 1
   %cmp = icmp eq i32 %rem1, 0
@@ -116,7 +116,7 @@ e:                                                ; preds = %for.cond, %d
 }
 
 ; Function Attrs: convergent nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; CHECK: spir_kernel void @__vecz_v4_regression_by_all
 ; CHECK: br i1 %[[CMP:.+]], label %[[D:.+]], label %[[IFELSE:.+]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr.ll
index 5527d3ab9c78f..b04924ee08cc5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr.ll
@@ -26,7 +26,7 @@ target triple = "spir64-unknown-unknown"
 ; CHECK: store i64 %remove_intptr1, ptr addrspace(1) %out, align 8
 define spir_kernel void @intptr_cast_i8(i8 addrspace(1)* %in, i64 addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %0 = ptrtoint i8 addrspace(1)* %in to i64
   %shl = shl i64 %call, 2
   %add = add i64 %shl, %0
@@ -41,7 +41,7 @@ entry:
 ; CHECK: store i64 %remove_intptr1, ptr addrspace(1) %out, align 8
 define spir_kernel void @intptr_cast_i16(i16 addrspace(1)* %in, i64 addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %0 = ptrtoint i16 addrspace(1)* %in to i64
   %shl = shl i64 %call, 2
   %add = add i64 %shl, %0
@@ -49,4 +49,4 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr_2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr_2.ll
index 3776c68fd9da1..2ca52052cabe6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr_2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr_2.ll
@@ -21,7 +21,7 @@ target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @remove_intptr(i8 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %0 = ptrtoint i8 addrspace(1)* %in to i64
   %shl = shl nuw nsw i64 %call, 2
   %add = add i64 %shl, %0
@@ -32,7 +32,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; CHECK: spir_kernel void @__vecz_v4_remove_intptr
 ; CHECK-NOT: ptrtoint
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr_phi.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr_phi.ll
index 3411e43b95a64..c62058b3c90b9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr_phi.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr_phi.ll
@@ -21,7 +21,7 @@ target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @remove_intptr(i8 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %0 = ptrtoint i8 addrspace(1)* %in to i64
   %shl = shl nuw nsw i64 %call, 2
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %shl
@@ -42,7 +42,7 @@ for.body:                                         ; preds = %for.body, %entry
   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; CHECK: spir_kernel void @__vecz_v4_remove_intptr
 ; CHECK-NOT: ptrtoint
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/roscc_simplify.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/roscc_simplify.ll
index 43241ec257ff1..ebf66549adb38 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/roscc_simplify.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/roscc_simplify.ll
@@ -21,7 +21,7 @@ target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @add(i32 addrspace(1)* %in1, i32 addrspace(1)* %in2, i32 addrspace(1)* %out, i64 addrspace(1)* %N) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %0 = load i64, i64 addrspace(1)* %N, align 8
   %cmp = icmp ult i64 %call, %0
   br i1 %cmp, label %if.then, label %if.end
@@ -40,7 +40,7 @@ if.end:                                           ; preds = %if.then, %entry
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; CHECK: spir_kernel void @__vecz_v16_add
 ; CHECK: entry:
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_vector_user.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_vector_user.ll
index 4765e95d902ba..869b3a73e7840 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_vector_user.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_vector_user.ll
@@ -22,7 +22,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:1:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
 ; Function Attrs: nounwind readnone
-declare spir_func i64 @_Z12get_local_idj(i32) #0
+declare i64 @__mux_get_local_id(i32) #0
 
 ; Function Attrs: nounwind readnone
 declare spir_func <4 x float> @_Z3madDv4_fS_S_(<4 x float>, <4 x float>, <4 x float>) #0
@@ -35,7 +35,7 @@ declare spir_func float @_Z3madfff(float, float, float) local_unnamed_addr #2
 
 define spir_kernel void @scalar_vector_user(float addrspace(1)* %inout, i64 %n) {
 entry:
-  %lid = tail call spir_func i64 @_Z12get_local_idj(i32 0) #0
+  %lid = tail call i64 @__mux_get_local_id(i32 0) #0
   %inout.address = getelementptr inbounds float, float addrspace(1)* %inout, i64 %lid
   br label %loop
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_calls.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_calls.ll
index a98976bebcb4c..4bce1b7985b98 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_calls.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_calls.ll
@@ -19,11 +19,11 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 define spir_kernel void @test_calls(<4 x float>* %pa, <4 x float>* %pb, <4 x i32>* %pc, <4 x float>* %pd) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %a = getelementptr <4 x float>, <4 x float>* %pa, i64 %idx
   %b = getelementptr <4 x float>, <4 x float>* %pb, i64 %idx
   %c = getelementptr <4 x i32>, <4 x i32>* %pc, i64 %idx
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_debug_info.ll
index e4dbe5f61eb36..60febc22cb725 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_debug_info.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_debug_info.ll
@@ -42,7 +42,7 @@ entry:
   store <2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)** %out.addr, align 8
   call void @llvm.dbg.declare(metadata <2 x i32> addrspace(1)** %out.addr, metadata !18, metadata !34), !dbg !35
   call void @llvm.dbg.declare(metadata i64* %tid, metadata !19, metadata !34), !dbg !36
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #3, !dbg !36
+  %call = call i64 @__mux_get_global_id(i32 0) #3, !dbg !36
   store i64 %call, i64* %tid, align 8, !dbg !36
   call void @llvm.dbg.declare(metadata <2 x i32>* %a, metadata !23, metadata !34), !dbg !37
   %0 = load i64, i64* %tid, align 8, !dbg !37
@@ -72,7 +72,7 @@ entry:
 ; Function Attrs: nounwind readnone
 declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
-declare spir_func i64 @_Z13get_global_idj(i32) #2
+declare i64 @__mux_get_global_id(i32) #2
 
 attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_instructions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_instructions.ll
index dff0d17cc7d73..1940065a68d44 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_instructions.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_instructions.ll
@@ -19,11 +19,11 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 define spir_kernel void @test_instructions(<4 x i32>* %pa, <4 x i32>* %pb, <4 x i32>* %pc) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %a = getelementptr <4 x i32>, <4 x i32>* %pa, i64 %idx
   %b = getelementptr <4 x i32>, <4 x i32>* %pb, i64 %idx
   %c = getelementptr <4 x i32>, <4 x i32>* %pc, i64 %idx
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_masked_load_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_masked_load_store.ll
index 0435c80f1be5b..7380496abd278 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_masked_load_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_masked_load_store.ll
@@ -19,14 +19,14 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 declare <2 x float> @__vecz_b_masked_load4_Dv2_fPDv2_fDv2_b(<2 x float>*, <2 x i1>)
 declare void @__vecz_b_masked_store4_Dv2_fPDv2_fDv2_b(<2 x float>, <2 x float>*, <2 x i1>)
 
 define spir_kernel void @scalarize_masked_memops(<2 x float>* %pa, <2 x float>* %pz) {
 entry:
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %head = insertelement <2 x i64> undef, i64 %idx, i64 0
   %splat = shufflevector <2 x i64> %head, <2 x i64> undef, <2 x i32> zeroinitializer
   %idxs = add <2 x i64> %splat, <i64 0, i64 1>
@@ -36,7 +36,7 @@ entry:
   %zptr = getelementptr <2 x float>, <2 x float>* %pz, i64 %idx
   call void @__vecz_b_masked_store4_Dv2_fPDv2_fDv2_b(<2 x float> %ld, <2 x float>* %zptr, <2 x i1> %mask)
   ret void
- ; CHECK:  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+ ; CHECK:  %idx = call i64 @__mux_get_global_id(i32 0)
  ; CHECK:  %[[IDXS0:.*]] = add i64 %idx, 0
  ; CHECK:  %[[IDXS1:.*]] = add i64 %idx, 1
  ; CHECK:  %[[MASK0:.*]] = icmp slt i64 %[[IDXS0]], 8
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-gather.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-gather.ll
index c86a1c6843226..b57d0df435528 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-gather.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-gather.ll
@@ -22,7 +22,7 @@ target triple = "spir64-unknown-unknown"
 
 define dso_local spir_kernel void @splat(i32 addrspace(1)* %data, i32 addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 noundef 0)
+  %call = tail call i64 @__mux_get_global_id(i32 noundef 0)
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %data, i64 %call
   %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
   %splat.splatinsert = insertelement <4 x i32> poison, i32 %0, i64 0
@@ -34,7 +34,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32 noundef)
+declare i64 @__mux_get_global_id(i32 noundef)
 declare spir_func i32 @not_scalarizable(<4 x i32> noundef)
 
 ; It checks that the scalarizer scalarizes the add and reconstructs the vector
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-splat.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-splat.ll
index b6da6887d0384..be26c56824cd3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-splat.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-splat.ll
@@ -22,7 +22,7 @@ target triple = "spir64-unknown-unknown"
 
 define dso_local spir_kernel void @splat(float addrspace(1)* %data, float addrspace(1)* %out) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 noundef 0)
+  %call = tail call i64 @__mux_get_global_id(i32 noundef 0)
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %data, i64 %call
   %0 = load float, float addrspace(1)* %arrayidx, align 4
   %splat.splatinsert = insertelement <4 x float> poison, float %0, i64 0
@@ -33,7 +33,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32 noundef)
+declare i64 @__mux_get_global_id(i32 noundef)
 declare spir_func float @not_scalarizable(<4 x float> noundef)
 
 ; It checks that the scalarizer turns the original vector splat back into a vector splat,
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize_mixed_gep.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize_mixed_gep.ll
index 1586feb7c1ea8..d9bc298514967 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize_mixed_gep.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize_mixed_gep.ll
@@ -18,10 +18,10 @@
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 define void @bar(i64** %ptrptrs, i64 %val) {
-  %idx = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidxa = getelementptr inbounds i64*, i64** %ptrptrs, i64 %idx
   %ptrs = load i64*, i64** %arrayidxa, align 4
   %addr = getelementptr inbounds i64, i64* %ptrs, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scan_fact.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scan_fact.ll
index e4284b6aa7a32..33e9cf9970c6f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scan_fact.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scan_fact.ll
@@ -24,20 +24,20 @@ target triple = "spir64-unknown-unknown"
 @scan_fact.temp = internal addrspace(3) global [16 x i32] undef, align 4
 
 ; Function Attrs: convergent nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #0
+declare i64 @__mux_get_global_id(i32) #0
 
 ; Function Attrs: convergent nounwind readonly
-declare spir_func i64 @_Z12get_local_idj(i32) #0
+declare i64 @__mux_get_local_id(i32) #0
 
 ; Function Attrs: convergent nounwind readonly
-declare spir_func i64 @_Z14get_local_sizej(i32) #0
+declare i64 @__mux_get_local_size(i32) #0
 
 ; Function Attrs: convergent nounwind
 define spir_kernel void @scan_fact(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
 entry:
-  %call = call spir_func i64 @_Z12get_local_idj(i32 0) #3
-  %call1 = call spir_func i64 @_Z13get_global_idj(i32 0) #3
-  %call2 = call spir_func i64 @_Z14get_local_sizej(i32 0) #3
+  %call = call i64 @__mux_get_local_id(i32 0) #3
+  %call1 = call i64 @__mux_get_global_id(i32 0) #3
+  %call2 = call i64 @__mux_get_local_size(i32 0) #3
   %mul = shl i64 %call1, 1
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %mul
   %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/select-no-crash.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/select-no-crash.ll
index 6ab3427e9f117..4f68469ff25ee 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/select-no-crash.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/select-no-crash.ll
@@ -26,7 +26,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: convergent nounwind
 define spir_kernel void @test(i32 addrspace(1)* %out, i32 %n) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   br label %while.body
 
@@ -86,7 +86,7 @@ h:                                                ; preds = %for.cond, %f, %if.e
 }
 
 ; Function Attrs: convergent nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_1.ll
index 24cc2c542276f..fe988f643200f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_1.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_1.ll
@@ -21,9 +21,9 @@ target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @load16(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %stride) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %conv = trunc i64 %call to i32
-  %call1 = tail call spir_func i64 @_Z13get_global_idj(i32 1)
+  %call1 = tail call i64 @__mux_get_global_id(i32 1)
   %conv2 = trunc i64 %call1 to i32
   %mul = mul nsw i32 %conv2, %stride
   %add = add nsw i32 %mul, %conv
@@ -40,7 +40,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; CHECK: spir_kernel void @load16
 ; CHECK: load <4 x i8>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_2.ll
index 3060f924d4286..1593bdffa7087 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_2.ll
@@ -21,9 +21,9 @@ target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @load16(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %stride) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %conv = trunc i64 %call to i32
-  %call1 = tail call spir_func i64 @_Z13get_global_idj(i32 1)
+  %call1 = tail call i64 @__mux_get_global_id(i32 1)
   %conv2 = trunc i64 %call1 to i32
   %mul = mul nsw i32 %conv2, %stride
   %add = add nsw i32 %mul, %conv
@@ -41,7 +41,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; CHECK: spir_kernel void @load16
 ; CHECK: load <4 x i8>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_3.ll
index f7e7aed565711..048713d1c16e5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_3.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_3.ll
@@ -21,9 +21,9 @@ target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @load16(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %stride) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %conv = trunc i64 %call to i32
-  %call1 = tail call spir_func i64 @_Z13get_global_idj(i32 1)
+  %call1 = tail call i64 @__mux_get_global_id(i32 1)
   %conv2 = trunc i64 %call1 to i32
   %mul = mul nsw i32 %conv2, %stride
   %add = add nsw i32 %mul, %conv
@@ -42,7 +42,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; CHECK: spir_kernel void @load16
 ; CHECK: load <4 x i8>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_4.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_4.ll
index e3d06caf03fb1..1cd2d4bc4574e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_4.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_4.ll
@@ -21,9 +21,9 @@ target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @load16(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %stride) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %conv = trunc i64 %call to i32
-  %call1 = tail call spir_func i64 @_Z13get_global_idj(i32 1)
+  %call1 = tail call i64 @__mux_get_global_id(i32 1)
   %conv2 = trunc i64 %call1 to i32
   %mul = mul nsw i32 %conv2, %stride
   %add = add nsw i32 %mul, %conv
@@ -42,7 +42,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; CHECK: spir_kernel void @load16
 ; CHECK: load <4 x i8>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_5.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_5.ll
index 01607469b7cf2..e7d298b2b07a0 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_5.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_5.ll
@@ -21,9 +21,9 @@ target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @load16(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %stride) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %conv = trunc i64 %call to i32
-  %call1 = tail call spir_func i64 @_Z13get_global_idj(i32 1)
+  %call1 = tail call i64 @__mux_get_global_id(i32 1)
   %conv2 = trunc i64 %call1 to i32
   %mul = mul nsw i32 %conv2, %stride
   %add = add nsw i32 %mul, %conv
@@ -52,7 +52,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; CHECK: spir_kernel void @load16
 ; CHECK: load <4 x i8>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_6.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_6.ll
index f138056a51daa..6cd500dbafa72 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_6.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_6.ll
@@ -21,9 +21,9 @@ target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @load16(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %stride) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
   %conv = trunc i64 %call to i32
-  %call1 = tail call spir_func i64 @_Z13get_global_idj(i32 1)
+  %call1 = tail call i64 @__mux_get_global_id(i32 1)
   %conv2 = trunc i64 %call1 to i32
   %mul = mul nsw i32 %conv2, %stride
   %add = add nsw i32 %mul, %conv
@@ -44,7 +44,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; CHECK: spir_kernel void @load16
 ; CHECK: load <4 x i8>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext.ll
index 182ff8263ae8f..d5e661c641d58 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext.ll
@@ -23,7 +23,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @squash(<4 x i8> addrspace(1)* %data, i32 addrspace(1)* %output) #0 {
 entry:
-  %gid = call spir_func i64 @_Z13get_global_idj(i64 0) #2
+  %gid = call i64 @__mux_get_global_id(i64 0) #2
   %data.ptr = getelementptr inbounds <4 x i8>, <4 x i8> addrspace(1)* %data, i64 %gid
   %data.ld = load <4 x i8>, <4 x i8> addrspace(1)* %data.ptr, align 8
   %ele0 = extractelement <4 x i8> %data.ld, i32 0
@@ -42,7 +42,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i64) #1
+declare i64 @__mux_get_global_id(i64) #1
 
 attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext_bigendian.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext_bigendian.ll
index 26c5a233edf88..59da087e4699c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext_bigendian.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext_bigendian.ll
@@ -23,7 +23,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @squash(<4 x i8> addrspace(1)* %data, i32 addrspace(1)* %output) #0 {
 entry:
-  %gid = call spir_func i64 @_Z13get_global_idj(i64 0) #2
+  %gid = call i64 @__mux_get_global_id(i64 0) #2
   %data.ptr = getelementptr inbounds <4 x i8>, <4 x i8> addrspace(1)* %data, i64 %gid
   %data.ld = load <4 x i8>, <4 x i8> addrspace(1)* %data.ptr, align 8
   %ele0 = extractelement <4 x i8> %data.ld, i32 3
@@ -42,7 +42,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i64) #1
+declare i64 @__mux_get_global_id(i64) #1
 
 attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext.ll
index 491be71b88b7f..60aa309f7b0ec 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext.ll
@@ -23,7 +23,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @squash(<4 x i8> addrspace(1)* %data, i32 addrspace(1)* %output) #0 {
 entry:
-  %gid = call spir_func i64 @_Z13get_global_idj(i64 0) #2
+  %gid = call i64 @__mux_get_global_id(i64 0) #2
   %data.ptr = getelementptr inbounds <4 x i8>, <4 x i8> addrspace(1)* %data, i64 %gid
   %data.ld = load <4 x i8>, <4 x i8> addrspace(1)* %data.ptr, align 8
   %ele0 = extractelement <4 x i8> %data.ld, i32 0
@@ -42,7 +42,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i64) #1
+declare i64 @__mux_get_global_id(i64) #1
 
 attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext_bigendian.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext_bigendian.ll
index ae92e82b5f872..a695f54b37a53 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext_bigendian.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext_bigendian.ll
@@ -23,7 +23,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @squash(<4 x i8> addrspace(1)* %data, i32 addrspace(1)* %output) #0 {
 entry:
-  %gid = call spir_func i64 @_Z13get_global_idj(i64 0) #2
+  %gid = call i64 @__mux_get_global_id(i64 0) #2
   %data.ptr = getelementptr inbounds <4 x i8>, <4 x i8> addrspace(1)* %data, i64 %gid
   %data.ld = load <4 x i8>, <4 x i8> addrspace(1)* %data.ptr, align 8
   %ele0 = extractelement <4 x i8> %data.ld, i32 3
@@ -42,7 +42,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i64) #1
+declare i64 @__mux_get_global_id(i64) #1
 
 attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_float2_gather.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_float2_gather.ll
index 83e216f489d46..8c2b80f72ce50 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_float2_gather.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_float2_gather.ll
@@ -23,7 +23,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @squash(i64 addrspace(1)* %idx, <2 x float> addrspace(1)* %data, <2 x float> addrspace(1)* %output) #0 {
 entry:
-  %gid = call spir_func i64 @_Z13get_global_idj(i64 0) #2
+  %gid = call i64 @__mux_get_global_id(i64 0) #2
   %idx.ptr = getelementptr inbounds i64, i64 addrspace(1)* %idx, i64 %gid
   %idx.ld = load i64, i64 addrspace(1)* %idx.ptr, align 8
   %data.ptr = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %data, i64 %idx.ld
@@ -33,7 +33,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i64) #1
+declare i64 @__mux_get_global_id(i64) #1
 
 attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
@@ -43,7 +43,7 @@ attributes #2 = { nobuiltin nounwind }
 ; gather load
 ;
 ; CHECK: void @__vecz_v4_squash
-; CHECK:  %[[GID:.+]] = call spir_func i64 @_Z13get_global_idj(i64 0) #[[ATTRS:[0-9]+]]
+; CHECK:  %[[GID:.+]] = call i64 @__mux_get_global_id(i64 0) #[[ATTRS:[0-9]+]]
 ; CHECK:  %[[IDX_PTR:.+]] = getelementptr inbounds i64, ptr addrspace(1) %idx, i64 %[[GID]]
 ; CHECK:  %[[WIDE_LOAD:.+]] = load <4 x i64>, ptr addrspace(1) %[[IDX_PTR]], align 8
 ; CHECK:  %[[DATA_PTR:.+]] = getelementptr inbounds <2 x float>, ptr addrspace(1) %data, <4 x i64> %[[WIDE_LOAD]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned.ll
index 6af90e4bba69d..614e3d52c3202 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned.ll
@@ -43,34 +43,34 @@ target triple = "spir64-unknown-unknown"
 
 define dso_local spir_kernel void @foo(%struct.PerItemKernelInfo addrspace(1)* nocapture noundef writeonly %info) !reqd_work_group_size !11 {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 noundef 0)
-  %call1 = tail call spir_func i64 @_Z13get_global_idj(i32 noundef 1)
-  %call2 = tail call spir_func i64 @_Z13get_global_idj(i32 noundef 2)
-  %call3 = tail call spir_func i64 @_Z15get_global_sizej(i32 noundef 0)
-  %call5 = tail call spir_func i64 @_Z15get_global_sizej(i32 noundef 1)
+  %call = tail call i64 @__mux_get_global_id(i32 noundef 0)
+  %call1 = tail call i64 @__mux_get_global_id(i32 noundef 1)
+  %call2 = tail call i64 @__mux_get_global_id(i32 noundef 2)
+  %call3 = tail call i64 @__mux_get_global_size(i32 noundef 0)
+  %call5 = tail call i64 @__mux_get_global_size(i32 noundef 1)
   %mul7 = mul nuw nsw i64 %call5, %call2
   %reass.add = add nuw nsw i64 %mul7, %call1
   %reass.mul = mul nuw nsw i64 %reass.add, %call3
   %add8 = add nuw nsw i64 %reass.mul, %call
   %vecinit = insertelement <4 x i64> undef, i64 %call3, i64 0
   %vecinit11 = insertelement <4 x i64> %vecinit, i64 %call5, i64 1
-  %call12 = tail call spir_func i64 @_Z15get_global_sizej(i32 noundef 2)
+  %call12 = tail call i64 @__mux_get_global_size(i32 noundef 2)
   %vecinit13 = insertelement <4 x i64> %vecinit11, i64 %call12, i64 2
-  %call14 = tail call spir_func i64 @_Z15get_global_sizej(i32 noundef 3)
+  %call14 = tail call i64 @__mux_get_global_size(i32 noundef 3)
   %vecinit15 = insertelement <4 x i64> %vecinit13, i64 %call14, i64 3
   %global_size = getelementptr inbounds %struct.PerItemKernelInfo, %struct.PerItemKernelInfo addrspace(1)* %info, i64 %add8, i32 0
   store <4 x i64> %vecinit15, <4 x i64> addrspace(1)* %global_size, align 1
-  %call16 = tail call spir_func i32 @_Z12get_work_dimv()
+  %call16 = tail call i32 @__mux_get_work_dim()
   %work_dim = getelementptr inbounds %struct.PerItemKernelInfo, %struct.PerItemKernelInfo addrspace(1)* %info, i64 %add8, i32 1
   store i32 %call16, i32 addrspace(1)* %work_dim, align 1
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
-declare spir_func i64 @_Z15get_global_sizej(i32)
+declare i64 @__mux_get_global_size(i32)
 
-declare spir_func i32 @_Z12get_work_dimv()
+declare i32 @__mux_get_work_dim()
 
 !11 = !{i32 4, i32 1, i32 1}
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned_scalarized.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned_scalarized.ll
index 9ad94ffa958a9..7d0d5ca6b77aa 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned_scalarized.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned_scalarized.ll
@@ -23,34 +23,34 @@ target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @foo(%struct.PerItemKernelInfo addrspace(1)* %info) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
-  %call1 = tail call spir_func i64 @_Z13get_global_idj(i32 1)
-  %call2 = tail call spir_func i64 @_Z13get_global_idj(i32 2)
-  %call3 = tail call spir_func i64 @_Z15get_global_sizej(i32 0)
-  %call5 = tail call spir_func i64 @_Z15get_global_sizej(i32 1)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
+  %call1 = tail call i64 @__mux_get_global_id(i32 1)
+  %call2 = tail call i64 @__mux_get_global_id(i32 2)
+  %call3 = tail call i64 @__mux_get_global_size(i32 0)
+  %call5 = tail call i64 @__mux_get_global_size(i32 1)
   %mul7 = mul nuw nsw i64 %call5, %call2
   %reass.add = add nuw nsw i64 %mul7, %call1
   %reass.mul = mul nuw nsw i64 %reass.add, %call3
   %add8 = add nuw nsw i64 %reass.mul, %call
   %vecinit = insertelement <4 x i64> undef, i64 %call3, i64 0
   %vecinit11 = insertelement <4 x i64> %vecinit, i64 %call5, i64 1
-  %call12 = tail call spir_func i64 @_Z15get_global_sizej(i32 2)
+  %call12 = tail call i64 @__mux_get_global_size(i32 2)
   %vecinit13 = insertelement <4 x i64> %vecinit11, i64 %call12, i64 2
-  %call14 = tail call spir_func i64 @_Z15get_global_sizej(i32 3)
+  %call14 = tail call i64 @__mux_get_global_size(i32 3)
   %vecinit15 = insertelement <4 x i64> %vecinit13, i64 %call14, i64 3
   %global_size = getelementptr inbounds %struct.PerItemKernelInfo, %struct.PerItemKernelInfo addrspace(1)* %info, i64 %add8, i32 0
   store <4 x i64> %vecinit15, <4 x i64> addrspace(1)* %global_size, align 1
-  %call16 = tail call spir_func i32 @_Z12get_work_dimv()
+  %call16 = tail call i32 @__mux_get_work_dim()
   %work_dim = getelementptr inbounds %struct.PerItemKernelInfo, %struct.PerItemKernelInfo addrspace(1)* %info, i64 %add8, i32 1
   store i32 %call16, i32 addrspace(1)* %work_dim, align 1
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
-declare spir_func i64 @_Z15get_global_sizej(i32)
+declare i64 @__mux_get_global_size(i32)
 
-declare spir_func i32 @_Z12get_work_dimv()
+declare i32 @__mux_get_work_dim()
 
 ; CHECK: spir_kernel void @foo
 ; CHECK: call void @__vecz_b_interleaved_store1_5_Dv4_m{{(u3ptrU3AS1|PU3AS1m)}}(<4 x i64>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned.ll
index 1ae62a0924222..58ec273625333 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned.ll
@@ -23,34 +23,34 @@ target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @foo(%struct.PerItemKernelInfo addrspace(1)* %info) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
-  %call1 = tail call spir_func i64 @_Z13get_global_idj(i32 1)
-  %call2 = tail call spir_func i64 @_Z13get_global_idj(i32 2)
-  %call3 = tail call spir_func i64 @_Z15get_global_sizej(i32 0)
-  %call5 = tail call spir_func i64 @_Z15get_global_sizej(i32 1)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
+  %call1 = tail call i64 @__mux_get_global_id(i32 1)
+  %call2 = tail call i64 @__mux_get_global_id(i32 2)
+  %call3 = tail call i64 @__mux_get_global_size(i32 0)
+  %call5 = tail call i64 @__mux_get_global_size(i32 1)
   %mul7 = mul nuw nsw i64 %call5, %call2
   %reass.add = add nuw nsw i64 %mul7, %call1
   %reass.mul = mul nuw nsw i64 %reass.add, %call3
   %add8 = add nuw nsw i64 %reass.mul, %call
   %vecinit = insertelement <4 x i64> undef, i64 %call3, i64 0
   %vecinit11 = insertelement <4 x i64> %vecinit, i64 %call5, i64 1
-  %call12 = tail call spir_func i64 @_Z15get_global_sizej(i32 2)
+  %call12 = tail call i64 @__mux_get_global_size(i32 2)
   %vecinit13 = insertelement <4 x i64> %vecinit11, i64 %call12, i64 2
-  %call14 = tail call spir_func i64 @_Z15get_global_sizej(i32 3)
+  %call14 = tail call i64 @__mux_get_global_size(i32 3)
   %vecinit15 = insertelement <4 x i64> %vecinit13, i64 %call14, i64 3
   %global_size = getelementptr inbounds %struct.PerItemKernelInfo, %struct.PerItemKernelInfo addrspace(1)* %info, i64 %add8, i32 0
   store <4 x i64> %vecinit15, <4 x i64> addrspace(1)* %global_size, align 1
-  %call16 = tail call spir_func i32 @_Z12get_work_dimv()
+  %call16 = tail call i32 @__mux_get_work_dim()
   %work_dim = getelementptr inbounds %struct.PerItemKernelInfo, %struct.PerItemKernelInfo addrspace(1)* %info, i64 %add8, i32 1
   store i32 %call16, i32 addrspace(1)* %work_dim, align 1
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
-declare spir_func i64 @_Z15get_global_sizej(i32)
+declare i64 @__mux_get_global_size(i32)
 
-declare spir_func i32 @_Z12get_work_dimv()
+declare i32 @__mux_get_work_dim()
 
 ; CHECK: spir_kernel void @foo
 ; CHECK: store <4 x i64>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned_scalarized.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned_scalarized.ll
index 3d317f732a4be..24c8521652f68 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned_scalarized.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned_scalarized.ll
@@ -23,34 +23,34 @@ target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define spir_kernel void @foo(%struct.PerItemKernelInfo addrspace(1)* %info) {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0)
-  %call1 = tail call spir_func i64 @_Z13get_global_idj(i32 1)
-  %call2 = tail call spir_func i64 @_Z13get_global_idj(i32 2)
-  %call3 = tail call spir_func i64 @_Z15get_global_sizej(i32 0)
-  %call5 = tail call spir_func i64 @_Z15get_global_sizej(i32 1)
+  %call = tail call i64 @__mux_get_global_id(i32 0)
+  %call1 = tail call i64 @__mux_get_global_id(i32 1)
+  %call2 = tail call i64 @__mux_get_global_id(i32 2)
+  %call3 = tail call i64 @__mux_get_global_size(i32 0)
+  %call5 = tail call i64 @__mux_get_global_size(i32 1)
   %mul7 = mul nuw nsw i64 %call5, %call2
   %reass.add = add nuw nsw i64 %mul7, %call1
   %reass.mul = mul nuw nsw i64 %reass.add, %call3
   %add8 = add nuw nsw i64 %reass.mul, %call
   %vecinit = insertelement <4 x i64> undef, i64 %call3, i64 0
   %vecinit11 = insertelement <4 x i64> %vecinit, i64 %call5, i64 1
-  %call12 = tail call spir_func i64 @_Z15get_global_sizej(i32 2)
+  %call12 = tail call i64 @__mux_get_global_size(i32 2)
   %vecinit13 = insertelement <4 x i64> %vecinit11, i64 %call12, i64 2
-  %call14 = tail call spir_func i64 @_Z15get_global_sizej(i32 3)
+  %call14 = tail call i64 @__mux_get_global_size(i32 3)
   %vecinit15 = insertelement <4 x i64> %vecinit13, i64 %call14, i64 3
   %global_size = getelementptr inbounds %struct.PerItemKernelInfo, %struct.PerItemKernelInfo addrspace(1)* %info, i64 %add8, i32 0
   store <4 x i64> %vecinit15, <4 x i64> addrspace(1)* %global_size, align 1
-  %call16 = tail call spir_func i32 @_Z12get_work_dimv()
+  %call16 = tail call i32 @__mux_get_work_dim()
   %work_dim = getelementptr inbounds %struct.PerItemKernelInfo, %struct.PerItemKernelInfo addrspace(1)* %info, i64 %add8, i32 1
   store i32 %call16, i32 addrspace(1)* %work_dim, align 1
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
-declare spir_func i64 @_Z15get_global_sizej(i32)
+declare i64 @__mux_get_global_size(i32)
 
-declare spir_func i32 @_Z12get_work_dimv()
+declare i32 @__mux_get_work_dim()
 
 ; CHECK: spir_kernel void @foo
 ; CHECK: call void @__vecz_b_scatter_store1_Dv4_mDv4_{{(u3ptrU3AS1|PU3AS1m)}}(<4 x i64>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_phi.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_phi.ll
index 74276d0da284a..801a622d56e54 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_phi.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_phi.ll
@@ -23,7 +23,7 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @test(i32* %in, i32* %out, %struct_type* %sin) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %inp = getelementptr inbounds i32, i32* %in, i64 %call
   %oup = getelementptr inbounds i32, i32* %out, i64 %call
   %o = load i32, i32* %oup
@@ -71,7 +71,7 @@ for.end:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 declare void @llvm.memset.p0i8.i32(i8*,i8,i32,i32,i1)
 
 ; CHECK: define spir_kernel void @__vecz_v4_test
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_select.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_select.ll
index 9dc3a66c66a56..ae3dedf111200 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_select.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_select.ll
@@ -23,7 +23,7 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @test(%struct_type* %in1, %struct_type* %in2, %struct_type* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %in1p = getelementptr inbounds %struct_type, %struct_type* %in1, i64 %call
   %in2p = getelementptr inbounds %struct_type, %struct_type* %in2, i64 %call
   %outp = getelementptr inbounds %struct_type, %struct_type* %out, i64 %call
@@ -36,7 +36,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 declare void @llvm.memset.p0i8.i32(i8*,i8,i32,i32,i1)
 
 ; CHECK: define spir_kernel void @__vecz_v4_test
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_broadcast.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_broadcast.ll
index 59b2dee6b099b..8e297e4ee4134 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_broadcast.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_broadcast.ll
@@ -19,17 +19,17 @@
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-declare spir_func i32 @_Z16get_sub_group_idv()
-declare spir_func i32 @__mux_get_sub_group_local_id()
-declare spir_func i32 @__mux_sub_group_broadcast_i32(i32, i32)
+declare i32 @__mux_get_sub_group_id()
+declare i32 @__mux_get_sub_group_local_id()
+declare i32 @__mux_sub_group_broadcast_i32(i32, i32)
 
 ; It makes sure broadcast still works when its source operand is uniform
 define spir_kernel void @sub_group_broadcast(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
-  %call = tail call spir_func i32 @_Z16get_sub_group_idv()
+  %call = tail call i32 @__mux_get_sub_group_id()
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %call
   %v = load i32, i32 addrspace(1)* %arrayidx, align 4
-  %broadcast = call spir_func i32 @__mux_sub_group_broadcast_i32(i32 %v, i32 0)
-  %idx = tail call spir_func i32 @__mux_get_sub_group_local_id()
+  %broadcast = call i32 @__mux_sub_group_broadcast_i32(i32 %v, i32 0)
+  %idx = tail call i32 @__mux_get_sub_group_local_id()
   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %idx
   store i32 %broadcast, i32 addrspace(1)* %arrayidx2, align 4
   ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_different_strides.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_different_strides.ll
index e1582ba7f8c47..1fab059100023 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_different_strides.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_different_strides.ll
@@ -21,7 +21,7 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @test_ternary(i64 %a, i64 %b, i64* %c) {
 entry:
-  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %gid = call i64 @__mux_get_global_id(i32 0)
   %gid_shift = shl i64 %gid, 1
   %cond = icmp eq i64 %a, %gid
   %c0 = getelementptr i64, i64* %c, i64 %gid
@@ -34,13 +34,13 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; This checks that the ternary transform is applied when the source GEPs have
 ; constant strides, even though they are different.
 
 ; CHECK: define spir_kernel void @__vecz_v4_test_ternary(i64 %a, i64 %b, ptr %c)
-; CHECK: %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: %gid = call i64 @__mux_get_global_id(i32 0)
 ; CHECK: %gid_shift = shl i64 %gid, 1
 ; CHECK: %cond = icmp eq i64 %a, %gid
 ; CHECK: %c0 = getelementptr i64, ptr %c, i64 %gid
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_divergent_gep.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_divergent_gep.ll
index 725d747de1fc0..d92bbe80429be 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_divergent_gep.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_divergent_gep.ll
@@ -21,7 +21,7 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @test_ternary(i64 %a, i64 %b, i64* %c) {
 entry:
-  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %gid = call i64 @__mux_get_global_id(i32 0)
   %gid_offset = add i64 %gid, 16
   %gid_mashed = xor i64 %gid, 12462
   %cond = icmp eq i64 %a, %gid
@@ -35,7 +35,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; This checks that the ternary transform pass is not applied when the GEP index
 ; is divergent, which would result in a scatter store regardless.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_divergent_source.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_divergent_source.ll
index 16759e0bf5845..e26fcd8d807a2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_divergent_source.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_divergent_source.ll
@@ -21,7 +21,7 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @test_ternary(i64 %a, i64 %b, i64* %c) {
 entry:
-  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %gid = call i64 @__mux_get_global_id(i32 0)
   %gid_offset = add i64 %gid, 16
   %gid_mashed = xor i64 %gid, 12462
   %cond = icmp eq i64 %a, %gid
@@ -35,7 +35,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; This checks that the ternary transform pass is not applied when a source GEP
 ; is divergent, which would result in a scatter store regardless.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_negative.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_negative.ll
index 2de729f9a8a10..299dcf1978a90 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_negative.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_negative.ll
@@ -21,7 +21,7 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @test_negative(i64 %a, i64 %b, i64* %c) {
 entry:
-  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %gid = call i64 @__mux_get_global_id(i32 0)
   %cond = icmp eq i64 %a, %gid
   %c0 = getelementptr i64, i64* %c, i64 %gid
   %c1 = getelementptr i64, i64* %c, i64 0
@@ -30,13 +30,13 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; This checks that the ternary transform is not applied when the select is not
 ; accessed through an additional GEP.
 
 ; CHECK: define spir_kernel void @__vecz_v4_test_negative(i64 %a, i64 %b, ptr %c)
-; CHECK: %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: %gid = call i64 @__mux_get_global_id(i32 0)
 ; CHECK: %cond = icmp eq i64 %a, %gid
 ; CHECK: %c0 = getelementptr i64, ptr %c, i64 %gid
 ; CHECK: %c1 = getelementptr i64, ptr %c, i64 0
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_positive.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_positive.ll
index 36d2eb284c15c..70e34a3cbfb30 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_positive.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_positive.ll
@@ -21,7 +21,7 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @test_ternary(i64 %a, i64 %b, i64* %c) {
 entry:
-  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %gid = call i64 @__mux_get_global_id(i32 0)
   %gid_offset = add i64 %gid, 16
   %cond = icmp eq i64 %a, %gid
   %c0 = getelementptr i64, i64* %c, i64 %gid
@@ -34,13 +34,13 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; This checks that the ternary transform is applied when the source GEPs have
 ; equal constant strides.
 
 ; CHECK: define spir_kernel void @__vecz_v4_test_ternary(i64 %a, i64 %b, ptr %c)
-; CHECK: %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: %gid = call i64 @__mux_get_global_id(i32 0)
 ; CHECK: %gid_offset = add i64 %gid, 16
 ; CHECK: %cond = icmp eq i64 %a, %gid
 ; CHECK: %c0 = getelementptr i64, ptr %c, i64 %gid
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_cond_diff_strides.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_cond_diff_strides.ll
index 3fe020663d73b..2252c075b39e2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_cond_diff_strides.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_cond_diff_strides.ll
@@ -21,7 +21,7 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @test_ternary(i64 %a, i64 %b, i64* %c) {
 entry:
-  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %gid = call i64 @__mux_get_global_id(i32 0)
   %gid_shift = shl i64 %gid, 1
   %cond = icmp eq i64 %a, 0
   %c0 = getelementptr i64, i64* %c, i64 %gid
@@ -34,13 +34,13 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; This checks that the ternary transform is applied when the condition is
 ; uniform, and the source GEPs have different constant strides.
 
 ; CHECK: define spir_kernel void @__vecz_v4_test_ternary(i64 %a, i64 %b, ptr %c)
-; CHECK: %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: %gid = call i64 @__mux_get_global_id(i32 0)
 ; CHECK: %gid_shift = shl i64 %gid, 1
 ; CHECK: %cond = icmp eq i64 %a, 0
 ; CHECK: %c0 = getelementptr i64, ptr %c, i64 %gid
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_condition.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_condition.ll
index 4bd5d6e4d1c51..3bc49c589a3d5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_condition.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_condition.ll
@@ -21,7 +21,7 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @test_ternary(i64 %a, i64 %b, i64* %c) {
 entry:
-  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %gid = call i64 @__mux_get_global_id(i32 0)
   %gid_offset = add i64 %gid, 16
   %cond = icmp eq i64 %a, 0
   %c0 = getelementptr i64, i64* %c, i64 %gid
@@ -34,7 +34,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; This checks that the ternary transform is not applied when the condition is
 ; uniform, and the two strides are the same.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_condition_packetized.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_condition_packetized.ll
index e068dd704d13b..9dc3c74401b57 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_condition_packetized.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_condition_packetized.ll
@@ -21,7 +21,7 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @test_ternary(i64 %a, i64 %b, i64* %c) {
 entry:
-  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %gid = call i64 @__mux_get_global_id(i32 0)
   %gid_offset = add i64 %gid, 16
   %cond = icmp eq i64 %a, 0
   %c0 = getelementptr i64, i64* %c, i64 %gid
@@ -34,7 +34,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; This checks that the ternary transform is not applied when the condition is
 ; uniform and the two strides are equal, and that the result is a contiguous
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_source.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_source.ll
index 12b54cd4eb005..704a1a82d28b8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_source.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_source.ll
@@ -21,7 +21,7 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @test_ternary(i64 %a, i64 %b, i64* %c) {
 entry:
-  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %gid = call i64 @__mux_get_global_id(i32 0)
   %cond = icmp eq i64 %a, %gid
   %c0 = getelementptr i64, i64* %c, i64 %gid
   store i64 %b, i64* %c0, align 4
@@ -33,13 +33,13 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; This checks that the ternary transform is applied when one of the source GEPs
 ; is uniform
 
 ; CHECK: define spir_kernel void @__vecz_v4_test_ternary(i64 %a, i64 %b, ptr %c)
-; CHECK: %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: %gid = call i64 @__mux_get_global_id(i32 0)
 ; CHECK: %cond = icmp eq i64 %a, %gid
 ; CHECK: %c0 = getelementptr i64, ptr %c, i64 %gid
 ; CHECK: store i64 %b, ptr %c0, align 4
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_sources.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_sources.ll
index d27a4186ee311..9a95a5b85d92e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_sources.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_sources.ll
@@ -21,7 +21,7 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @test_ternary(i64 %a, i64 %b, i64* %c) {
 entry:
-  %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %gid = call i64 @__mux_get_global_id(i32 0)
   %cond = icmp eq i64 %a, %gid
   %c0 = getelementptr i64, i64* %c, i64 1
   store i64 %b, i64* %c0, align 4
@@ -33,13 +33,13 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; This checks that the ternary transform is applied when the source GEPs are
 ; both uniform.
 
 ; CHECK: define spir_kernel void @__vecz_v4_test_ternary(i64 %a, i64 %b, ptr %c)
-; CHECK: %gid = call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: %gid = call i64 @__mux_get_global_id(i32 0)
 ; CHECK: %cond = icmp eq i64 %a, %gid
 ; CHECK: %c0 = getelementptr i64, ptr %c, i64 1
 ; CHECK: store i64 %b, ptr %c0, align 4
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_packetization.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_packetization.ll
index 72d03f0395bf5..b2ec9fe8ef2ef 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_packetization.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_packetization.ll
@@ -37,7 +37,7 @@ entry:
   store i32 addrspace(1)* %out, i32 addrspace(1)** %out.addr, align 8
   call void @llvm.dbg.declare(metadata i32 addrspace(1)** %out.addr, metadata !13, metadata !29), !dbg !30
   call void @llvm.dbg.declare(metadata i64* %tid, metadata !14, metadata !29), !dbg !31
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #3, !dbg !31
+  %call = call i64 @__mux_get_global_id(i32 0) #3, !dbg !31
   store i64 %call, i64* %tid, align 8, !dbg !31
   call void @llvm.dbg.declare(metadata i32* %a, metadata !19, metadata !29), !dbg !32
   %0 = load i64, i64* %tid, align 8, !dbg !32
@@ -64,7 +64,7 @@ entry:
 ; Function Attrs: nounwind readnone
 declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
-declare spir_func i64 @_Z13get_global_idj(i32) #2
+declare i64 @__mux_get_global_id(i32) #2
 
 attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_scalarization.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_scalarization.ll
index c30ed8f1549d5..a509bc5563d8b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_scalarization.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_scalarization.ll
@@ -24,7 +24,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @add(<128 x i32>* %in1, <128 x i32>* %in2, <128 x i32>* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %in1p = getelementptr inbounds <128 x i32>, <128 x i32>* %in1, i64 %call
   %in1v = load <128 x i32>, <128 x i32>* %in1p, align 4
   %in2p = getelementptr inbounds <128 x i32>, <128 x i32>* %in2, i64 %call
@@ -35,7 +35,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32) #2
+declare i64 @__mux_get_global_id(i32) #2
 
 ; We do not expect this test to succeed
 ; XFAIL: *
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/undef_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/undef_debug_info.ll
index fdbbdaa70dfdf..eae533dab66c1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/undef_debug_info.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/undef_debug_info.ll
@@ -35,7 +35,7 @@ entry:
   store <4 x i16> addrspace(1)* %dst, <4 x i16> addrspace(1)** %dst.addr, align 8
   call void @llvm.dbg.declare(metadata <4 x i16> addrspace(1)** %dst.addr, metadata !19, metadata !32), !dbg !33
   call void @llvm.dbg.declare(metadata i32* %tid, metadata !20, metadata !32), !dbg !34
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #3, !dbg !34
+  %call = call i64 @__mux_get_global_id(i32 0) #3, !dbg !34
   %conv = trunc i64 %call to i32, !dbg !34
   store i32 %conv, i32* %tid, align 4, !dbg !34
   call void @llvm.dbg.declare(metadata <4 x i16>* %tmp, metadata !22, metadata !32), !dbg !35
@@ -57,7 +57,7 @@ entry:
 ; Function Attrs: nounwind readnone
 declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
-declare spir_func i64 @_Z13get_global_idj(i32) #2
+declare i64 @__mux_get_global_id(i32) #2
 
 declare spir_func <4 x i16> @_Z9as_short4Dv3_t(<3 x i16>) #2
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/undef_ub.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/undef_ub.ll
index c6b5c3aadbdb1..2ac20c8a249ec 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/undef_ub.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/undef_ub.ll
@@ -18,19 +18,16 @@
 
 ; ModuleID = 'Unknown buffer'
 source_filename = "Unknown buffer"
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i32:32-f80:128-n8:16:32:64-S128"
+target triple = "spir-unknown-unknown"
 
 ; Function Attrs: convergent nounwind readonly
-declare spir_func i32 @_Z14get_local_sizej(i32) #2
-
-; Function Attrs: convergent nounwind readonly
-declare spir_func i32 @_Z12get_local_idj(i32) #2
+declare i32 @__mux_get_local_id(i32) #2
 
 ; Function Attrs: convergent nounwind
 define spir_kernel void @test() #0 {
 entry:
-  %call8 = call spir_func i32 @_Z12get_local_idj(i32 0) #3
+  %call8 = call i32 @__mux_get_local_id(i32 0) #3
   %arrayidx = getelementptr inbounds i8, i8 addrspace(1)* undef, i32 %call8
   %0 = load i8, i8 addrspace(1)* %arrayidx, align 1
   %conv9 = uitofp i8 %0 to float
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_base.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_base.ll
index e59c91661f1eb..8f5e9a83edffd 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_base.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_base.ll
@@ -21,7 +21,7 @@ target triple = "spir-unknown-unknown"
 
 define spir_kernel void @uniform_address_index(i32 addrspace(1)* nocapture readonly %in, i32 addrspace(1)* nocapture %out, i32 %a, i32 %b) local_unnamed_addr #0 {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = tail call i64 @__mux_get_global_id(i32 0) #2
   %0 = icmp eq i32 %a, -2147483648
   %1 = icmp eq i32 %b, -1
   %2 = and i1 %0, %1
@@ -40,7 +40,7 @@ entry:
 }
 
 ; Function Attrs: convergent nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) local_unnamed_addr #1
+declare i64 @__mux_get_global_id(i32) local_unnamed_addr #1
 
 ; It tests to ensure that the array index is correctly identified
 ; as having a uniform stride and generates plain vector loads and not
@@ -48,7 +48,7 @@ declare spir_func i64 @_Z13get_global_idj(i32) local_unnamed_addr #1
 
 ; CHECK: define spir_kernel void @__vecz_v4_uniform_address_index
 ; CHECK: entry:
-; CHECK: call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: call i64 @__mux_get_global_id(i32 0)
 ; CHECK-DAG: %[[INA:.+]] = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %[[X:.+]]
 ; CHECK-DAG: %[[LOAD:.+]] = load <4 x i32>, ptr addrspace(1) %[[INA]]
 ; CHECK-DAG: %[[OUTA:.+]] = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %[[X:.+]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_index.ll
index e59c91661f1eb..8f5e9a83edffd 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_index.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_index.ll
@@ -21,7 +21,7 @@ target triple = "spir-unknown-unknown"
 
 define spir_kernel void @uniform_address_index(i32 addrspace(1)* nocapture readonly %in, i32 addrspace(1)* nocapture %out, i32 %a, i32 %b) local_unnamed_addr #0 {
 entry:
-  %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = tail call i64 @__mux_get_global_id(i32 0) #2
   %0 = icmp eq i32 %a, -2147483648
   %1 = icmp eq i32 %b, -1
   %2 = and i1 %0, %1
@@ -40,7 +40,7 @@ entry:
 }
 
 ; Function Attrs: convergent nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) local_unnamed_addr #1
+declare i64 @__mux_get_global_id(i32) local_unnamed_addr #1
 
 ; It tests to ensure that the array index is correctly identified
 ; as having a uniform stride and generates plain vector loads and not
@@ -48,7 +48,7 @@ declare spir_func i64 @_Z13get_global_idj(i32) local_unnamed_addr #1
 
 ; CHECK: define spir_kernel void @__vecz_v4_uniform_address_index
 ; CHECK: entry:
-; CHECK: call spir_func i64 @_Z13get_global_idj(i32 0)
+; CHECK: call i64 @__mux_get_global_id(i32 0)
 ; CHECK-DAG: %[[INA:.+]] = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %[[X:.+]]
 ; CHECK-DAG: %[[LOAD:.+]] = load <4 x i32>, ptr addrspace(1) %[[INA]]
 ; CHECK-DAG: %[[OUTA:.+]] = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %[[X:.+]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi1.ll
index b136be330fffd..1d34290dc3bb0 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi1.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi1.ll
@@ -21,7 +21,7 @@ target triple = "spir-unknown-unknown"
 
 define spir_kernel void @test(i32 addrspace(1)* %in) {
 entry:
-  %id = call spir_func i64 @_Z13get_global_idj(i64 0) #2
+  %id = call i64 @__mux_get_global_id(i64 0) #2
   %init_addr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id
   %load = load i32, i32 addrspace(1)* %init_addr
   br label %loop
@@ -39,7 +39,7 @@ merge:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i64)
+declare i64 @__mux_get_global_id(i64)
 
 ; It checks that the stride analysis can tell the store is contiguous through the PHI node.
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi2.ll
index 90b03a8169bb5..d970cc72f8b55 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi2.ll
@@ -21,7 +21,7 @@ target triple = "spir-unknown-unknown"
 
 define spir_kernel void @test(i32 addrspace(1)* %in) {
 entry:
-  %id = call spir_func i64 @_Z13get_global_idj(i64 0) #2
+  %id = call i64 @__mux_get_global_id(i64 0) #2
   %init_addr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id
   %load = load i32, i32 addrspace(1)* %init_addr
   br label %loop
@@ -39,7 +39,7 @@ merge:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i64)
+declare i64 @__mux_get_global_id(i64)
 
 ; It checks that the stride analysis can tell the store is contiguous through the PHI node.
 ; Same as uniform_loop_contiguous_phi1.ll except with the PHI node incoming values reversed.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi3.ll
index be591e2e1eb61..c1a92b1660c68 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi3.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi3.ll
@@ -21,7 +21,7 @@ target triple = "spir-unknown-unknown"
 
 define spir_kernel void @test(i32 addrspace(1)* %in) {
 entry:
-  %id = call spir_func i64 @_Z13get_global_idj(i64 0) #2
+  %id = call i64 @__mux_get_global_id(i64 0) #2
   %init_addr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id
   %load = load i32, i32 addrspace(1)* %init_addr
   br label %loop
@@ -40,7 +40,7 @@ merge:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i64)
+declare i64 @__mux_get_global_id(i64)
 
 ; It checks that the stride analysis can tell the store is contiguous through the PHI node.
 ; Same as uniform_loop_contiguous_phi1.ll except with the index GEP inside the loop.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi4.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi4.ll
index f1a86ad703fe2..a7515f3c71f73 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi4.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi4.ll
@@ -21,7 +21,7 @@ target triple = "spir-unknown-unknown"
 
 define spir_kernel void @test(i32 addrspace(1)* %in) {
 entry:
-  %id = call spir_func i64 @_Z13get_global_idj(i64 0) #2
+  %id = call i64 @__mux_get_global_id(i64 0) #2
   %init_addr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id
   %load = load i32, i32 addrspace(1)* %init_addr
   br label %loop
@@ -40,7 +40,7 @@ merge:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i64)
+declare i64 @__mux_get_global_id(i64)
 
 ; It checks that the stride analysis can tell the store is contiguous through the PHI node.
 ; Same as uniform_loop_contiguous_phi3.ll except with the PHI node incoming values reversed.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation1.ll
index 2278613d19ebb..fa60304b44fbe 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation1.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation1.ll
@@ -23,8 +23,8 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: convergent nounwind
 define spir_kernel void @uniform_reassociation(i32 addrspace(1)* noalias %a, i32 addrspace(1)* noalias %b, i32 addrspace(1)* noalias %d) #0 {
 entry:
-  %x = call spir_func i64 @_Z13get_global_idj(i32 0) #2
-  %y = call spir_func i64 @_Z13get_global_idj(i32 1) #2
+  %x = call i64 @__mux_get_global_id(i32 0) #2
+  %y = call i64 @__mux_get_global_id(i32 1) #2
   %a_gep = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %x
   %b_gep = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %y
   %c_gep = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %y
@@ -38,7 +38,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; This test checks that a sum of a varying value with two uniform values
 ; gets re-associated from (Varying + Uniform) + Uniform
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation2.ll
index 32396838c6fa3..0db9a2703400e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation2.ll
@@ -23,8 +23,8 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: convergent nounwind
 define spir_kernel void @uniform_reassociation(i32 addrspace(1)* noalias %a, i32 addrspace(1)* noalias %b, i32 addrspace(1)* noalias %d) #0 {
 entry:
-  %x = call spir_func i64 @_Z13get_global_idj(i32 0) #2
-  %y = call spir_func i64 @_Z13get_global_idj(i32 1) #2
+  %x = call i64 @__mux_get_global_id(i32 0) #2
+  %y = call i64 @__mux_get_global_id(i32 1) #2
   %a_gep = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %x
   %b_gep = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %x
   %c_gep = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %y
@@ -38,7 +38,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; This test checks that a sum of a varying value with two uniform values
 ; gets re-associated from (Varying + Uniform) + Varying
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation3.ll
index 5d31ca92ed2be..2873b8759b971 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation3.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation3.ll
@@ -23,8 +23,8 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: convergent nounwind
 define spir_kernel void @uniform_reassociation(i32 addrspace(1)* noalias %a, i32 addrspace(1)* noalias %b, i32 addrspace(1)* noalias %d) #0 {
 entry:
-  %x = call spir_func i64 @_Z13get_global_idj(i32 0) #2
-  %y = call spir_func i64 @_Z13get_global_idj(i32 1) #2
+  %x = call i64 @__mux_get_global_id(i32 0) #2
+  %y = call i64 @__mux_get_global_id(i32 1) #2
   %a_gep = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %x
   %b_gep = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %x
   %c_gep = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %y
@@ -38,7 +38,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 ; This test checks that a sum of a varying value with two uniform values
 ; gets re-associated from Varying + (Varying + Uniform)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/user_calls.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/user_calls.ll
index 8ebf588e8235f..6871734fa38b2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/user_calls.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/user_calls.ll
@@ -24,9 +24,9 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @entry(i64* %input, i64* %output) {
 entry:
-  %gid = call i64 @_Z12get_local_idj(i32 0)
+  %gid = call i64 @__mux_get_local_id(i32 0)
   %i1ptr = getelementptr i64, i64* %output, i64 %gid
-  call spir_func void @_Z9mem_fencej(i32 1)
+  call void @__mux_mem_barrier(i32 2, i32 264)
   %ii = call i64 @functionD(i64* %input)
   %ib = trunc i64 %ii to i1
   call void @functionA(i64* %i1ptr, i1 %ib)
@@ -63,18 +63,18 @@ entry:
   ret i64 %r
 }
 
-declare spir_func void @_Z9mem_fencej(i32)
+declare void @__mux_mem_barrier(i32, i32)
 
 declare extern_weak spir_func i32 @printf(i8 addrspace(2)*, ...)
 
-declare i64 @_Z12get_local_idj(i32)
+declare i64 @__mux_get_local_id(i32)
 
 ; CHECK: define spir_kernel void @__vecz_v[[WIDTH:[0-9]+]]_entry
 ; CHECK: entry:
-; Check that we didn't mask the get_local_id call
-; CHECK: %gid = call i64 @_Z12get_local_idj(i32 0)
+; Check that we didn't mask the __mux_get_local_id call
+; CHECK: %gid = call i64 @__mux_get_local_id(i32 0)
-; Check that we didn't mask the mem_fence call
+; Check that we didn't mask the __mux_mem_barrier call
-; CHECK: call spir_func void @_Z9mem_fencej(i32 1)
+; CHECK: call void @__mux_mem_barrier(i32 2, i32 264)
 ; Check that we instantiated functionA without a mask
 ; CHECK: call void @functionA(ptr {{.+}}, i1 %ib)
 ; CHECK: call void @functionA(ptr {{.+}}, i1 %ib)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/varying_load1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/varying_load1.ll
index 316ae3ab78c98..23bd57fb36b6b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/varying_load1.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/varying_load1.ll
@@ -24,7 +24,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: convergent nounwind
 define spir_kernel void @varying_load1(i32 addrspace(1)* %out, i32 %n, i32 addrspace(1)* %meta) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
   %cmp = icmp slt i32 %conv, 11
   br i1 %cmp, label %if.then, label %if.end16
@@ -70,7 +70,7 @@ if.end16:                                         ; preds = %if.end, %if.then12,
 }
 
 ; Function Attrs: convergent nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/varying_load2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/varying_load2.ll
index 6484400e45602..252af7d45ca4e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/varying_load2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/varying_load2.ll
@@ -24,8 +24,8 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: convergent nounwind
 define spir_kernel void @varying_load2(i32 addrspace(1)* %input, i32 addrspace(1)* %out) #0 {
 entry:
-  %call1 = call spir_func i64 @_Z14get_local_sizej(i32 0) #3
-  %call2 = call spir_func i64 @_Z12get_local_idj(i32 0) #3
+  %call1 = call i64 @__mux_get_local_size(i32 0) #3
+  %call2 = call i64 @__mux_get_local_id(i32 0) #3
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %input, i64 %call2
   %cmp = icmp ne i64 %call2, 0
   br i1 %cmp, label %for.cond.preheader, label %if.end14
@@ -36,7 +36,7 @@ for.cond.preheader:                               ; preds = %entry
 for.cond:                                         ; preds = %for.cond.preheader, %for.inc
   %max.0 = phi i32 [ %max.1, %for.inc ], [ 0, %for.cond.preheader ]
   %storemerge = phi i64 [ %inc, %for.inc ], [ 0, %for.cond.preheader ]
-  %call6 = call spir_func i64 @_Z14get_local_sizej(i32 0) #3
+  %call6 = call i64 @__mux_get_local_size(i32 0) #3
   %cmp7 = icmp ult i64 %storemerge, %call6
   br i1 %cmp7, label %for.body, label %for.end
 
@@ -65,11 +65,11 @@ if.end14:                                         ; preds = %for.end, %entry
 }
 
 ; Function Attrs: convergent nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 ; Function Attrs: convergent nounwind readonly
-declare spir_func i64 @_Z12get_local_idj(i32) #1
+declare i64 @__mux_get_local_id(i32) #1
 ; Function Attrs: convergent nounwind readonly
-declare spir_func i64 @_Z14get_local_sizej(i32) #1
+declare i64 @__mux_get_local_size(i32) #1
 
 attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_intrinsics_scalarization.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_intrinsics_scalarization.ll
index d9912a284866b..11392ed374520 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_intrinsics_scalarization.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_intrinsics_scalarization.ll
@@ -22,7 +22,7 @@ target triple = "spir64-unknown-unknown"
 
 define spir_kernel void @fmuladd(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call
   %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
   %arrayidx1 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call
@@ -49,7 +49,7 @@ entry:
 
 define spir_kernel void @fma(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call
   %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
   %arrayidx1 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call
@@ -74,7 +74,7 @@ entry:
 ; CHECK-NOT: call double @llvm.fma.v4f64(
 ; CHECK: ret void
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>)
 declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_uniform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_uniform.ll
index c9e6e5c230429..dca8e8649bd00 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_uniform.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_uniform.ll
@@ -23,13 +23,13 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @vector_loop(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %cmp = icmp eq i64 %call, 0
   br i1 %cmp, label %for.end, label %for.cond
 
 for.cond:                                         ; preds = %entry, %for.body
   %storemerge = phi <4 x i32> [ %inc, %for.body ], [ zeroinitializer, %entry ]
-  %call1 = call spir_func i64 @_Z15get_global_sizej(i32 0)
+  %call1 = call i64 @__mux_get_global_size(i32 0)
   %conv = trunc i64 %call1 to i32
   %splat.splatinsert = insertelement <4 x i32> undef, i32 %conv, i32 0
   %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -77,8 +77,8 @@ for.end:                                          ; preds = %entry, %for.cond
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
-declare spir_func i64 @_Z15get_global_sizej(i32)
+declare i64 @__mux_get_global_id(i32)
+declare i64 @__mux_get_global_size(i32)
 
 ; This test checks if a uniform <4 x i32> phi is not scalarized
 ; CHECK: define spir_kernel void @__vecz_v4_vector_loop
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_varying.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_varying.ll
index 6cccccaa70d7f..e5c8fe4263bc3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_varying.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_varying.ll
@@ -23,7 +23,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @vector_loop(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %call.trunc = trunc i64 %call to i32
   %call.splatinsert = insertelement <4 x i32> undef, i32 %call.trunc, i32 0
   %call.splat = shufflevector <4 x i32> %call.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -32,7 +32,7 @@ entry:
 
 for.cond:                                         ; preds = %entry, %for.body
   %storemerge = phi <4 x i32> [ %inc, %for.body ], [ zeroinitializer, %entry ]
-  %call1 = call spir_func i64 @_Z15get_global_sizej(i32 0)
+  %call1 = call i64 @__mux_get_global_size(i32 0)
   %conv = trunc i64 %call1 to i32
   %splat.splatinsert = insertelement <4 x i32> undef, i32 %conv, i32 0
   %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -80,8 +80,8 @@ for.end:                                          ; preds = %entry, %for.cond
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
-declare spir_func i64 @_Z15get_global_sizej(i32)
+declare i64 @__mux_get_global_id(i32)
+declare i64 @__mux_get_global_size(i32)
 
 ; This test checks if a varying <4 x i32> phi is scalarized into 4 i32 phis
 ; and then re-packetized
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf.ll
index 42841969cf2ba..082d251e544d1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf.ll
@@ -28,7 +28,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @test(<4 x i8>* %out, <4 x i8>* %in1, <4 x i8>* %in2) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x i8>, <4 x i8>* %in1, i64 %call
   %0 = load <4 x i8>, <4 x i8>* %arrayidx, align 4
   %arrayidx1 = getelementptr inbounds <4 x i8>, <4 x i8>* %in2, i64 %call
@@ -42,7 +42,7 @@ entry:
 
 define spir_kernel void @test32(<32 x i8>* %out, <32 x i8>* %in1, <32 x i8>* %in2) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <32 x i8>, <32 x i8>* %in1, i64 %call
   %0 = load <32 x i8>, <32 x i8>* %arrayidx, align 4
   %arrayidx1 = getelementptr inbounds <32 x i8>, <32 x i8>* %in2, i64 %call
@@ -56,7 +56,7 @@ entry:
 
 define spir_kernel void @test64(<64 x i8>* %out, <64 x i8>* %in1, <64 x i8>* %in2) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <64 x i8>, <64 x i8>* %in1, i64 %call
   %0 = load <64 x i8>, <64 x i8>* %arrayidx, align 4
   %arrayidx1 = getelementptr inbounds <64 x i8>, <64 x i8>* %in2, i64 %call
@@ -70,7 +70,7 @@ entry:
 
 define spir_kernel void @test_float_vectors(<2 x float>* %in) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <2 x float>, <2 x float>* %in, i64 %call
   %0 = load <2 x float>, <2 x float>* %arrayidx, align 8
   %mul = fmul <2 x float> %0, %0
@@ -78,7 +78,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 declare extern_weak spir_func i32 @printf(i8 addrspace(2)*, ...)
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf32.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf32.ll
index 09fa585880423..ad71e33a0eaa0 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf32.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf32.ll
@@ -28,7 +28,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @test(<4 x i8>* %out, <4 x i8>* %in1, <4 x i8>* %in2) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x i8>, <4 x i8>* %in1, i64 %call
   %0 = load <4 x i8>, <4 x i8>* %arrayidx, align 4
   %arrayidx1 = getelementptr inbounds <4 x i8>, <4 x i8>* %in2, i64 %call
@@ -42,7 +42,7 @@ entry:
 
 define spir_kernel void @test32(<32 x i8>* %out, <32 x i8>* %in1, <32 x i8>* %in2) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <32 x i8>, <32 x i8>* %in1, i64 %call
   %0 = load <32 x i8>, <32 x i8>* %arrayidx, align 4
   %arrayidx1 = getelementptr inbounds <32 x i8>, <32 x i8>* %in2, i64 %call
@@ -56,7 +56,7 @@ entry:
 
 define spir_kernel void @test64(<64 x i8>* %out, <64 x i8>* %in1, <64 x i8>* %in2) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <64 x i8>, <64 x i8>* %in1, i64 %call
   %0 = load <64 x i8>, <64 x i8>* %arrayidx, align 4
   %arrayidx1 = getelementptr inbounds <64 x i8>, <64 x i8>* %in2, i64 %call
@@ -70,7 +70,7 @@ entry:
 
 define spir_kernel void @test_float_vectors(<2 x float>* %in) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <2 x float>, <2 x float>* %in, i64 %call
   %0 = load <2 x float>, <2 x float>* %arrayidx, align 8
   %mul = fmul <2 x float> %0, %0
@@ -78,7 +78,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 declare extern_weak spir_func i32 @printf(i8 addrspace(2)*, ...)
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf64.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf64.ll
index 786d3867f31bb..39ca271f5539d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf64.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf64.ll
@@ -28,7 +28,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @test(<4 x i8>* %out, <4 x i8>* %in1, <4 x i8>* %in2) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x i8>, <4 x i8>* %in1, i64 %call
   %0 = load <4 x i8>, <4 x i8>* %arrayidx, align 4
   %arrayidx1 = getelementptr inbounds <4 x i8>, <4 x i8>* %in2, i64 %call
@@ -42,7 +42,7 @@ entry:
 
 define spir_kernel void @test32(<32 x i8>* %out, <32 x i8>* %in1, <32 x i8>* %in2) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <32 x i8>, <32 x i8>* %in1, i64 %call
   %0 = load <32 x i8>, <32 x i8>* %arrayidx, align 4
   %arrayidx1 = getelementptr inbounds <32 x i8>, <32 x i8>* %in2, i64 %call
@@ -56,7 +56,7 @@ entry:
 
 define spir_kernel void @test64(<64 x i8>* %out, <64 x i8>* %in1, <64 x i8>* %in2) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <64 x i8>, <64 x i8>* %in1, i64 %call
   %0 = load <64 x i8>, <64 x i8>* %arrayidx, align 4
   %arrayidx1 = getelementptr inbounds <64 x i8>, <64 x i8>* %in2, i64 %call
@@ -70,7 +70,7 @@ entry:
 
 define spir_kernel void @test_float_vectors(<2 x float>* %in) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <2 x float>, <2 x float>* %in, i64 %call
   %0 = load <2 x float>, <2 x float>* %arrayidx, align 8
   %mul = fmul <2 x float> %0, %0
@@ -78,7 +78,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 declare extern_weak spir_func i32 @printf(i8 addrspace(2)*, ...)
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_floats.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_floats.ll
index 0c26001dcef81..25b91286ace66 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_floats.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_floats.ll
@@ -28,7 +28,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @test(<4 x i8>* %out, <4 x i8>* %in1, <4 x i8>* %in2) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x i8>, <4 x i8>* %in1, i64 %call
   %0 = load <4 x i8>, <4 x i8>* %arrayidx, align 4
   %arrayidx1 = getelementptr inbounds <4 x i8>, <4 x i8>* %in2, i64 %call
@@ -42,7 +42,7 @@ entry:
 
 define spir_kernel void @test32(<32 x i8>* %out, <32 x i8>* %in1, <32 x i8>* %in2) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <32 x i8>, <32 x i8>* %in1, i64 %call
   %0 = load <32 x i8>, <32 x i8>* %arrayidx, align 4
   %arrayidx1 = getelementptr inbounds <32 x i8>, <32 x i8>* %in2, i64 %call
@@ -56,7 +56,7 @@ entry:
 
 define spir_kernel void @test64(<64 x i8>* %out, <64 x i8>* %in1, <64 x i8>* %in2) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <64 x i8>, <64 x i8>* %in1, i64 %call
   %0 = load <64 x i8>, <64 x i8>* %arrayidx, align 4
   %arrayidx1 = getelementptr inbounds <64 x i8>, <64 x i8>* %in2, i64 %call
@@ -70,7 +70,7 @@ entry:
 
 define spir_kernel void @test_float_vectors(<2 x float>* %in) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <2 x float>, <2 x float>* %in, i64 %call
   %0 = load <2 x float>, <2 x float>* %arrayidx, align 8
   %mul = fmul <2 x float> %0, %0
@@ -78,7 +78,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 declare extern_weak spir_func i32 @printf(i8 addrspace(2)*, ...)
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_floats_no_double_support.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_floats_no_double_support.ll
index 0098cb81fa0ff..18249480262c7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_floats_no_double_support.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_floats_no_double_support.ll
@@ -28,7 +28,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @test(<4 x i8>* %out, <4 x i8>* %in1, <4 x i8>* %in2) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <4 x i8>, <4 x i8>* %in1, i64 %call
   %0 = load <4 x i8>, <4 x i8>* %arrayidx, align 4
   %arrayidx1 = getelementptr inbounds <4 x i8>, <4 x i8>* %in2, i64 %call
@@ -42,7 +42,7 @@ entry:
 
 define spir_kernel void @test32(<32 x i8>* %out, <32 x i8>* %in1, <32 x i8>* %in2) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <32 x i8>, <32 x i8>* %in1, i64 %call
   %0 = load <32 x i8>, <32 x i8>* %arrayidx, align 4
   %arrayidx1 = getelementptr inbounds <32 x i8>, <32 x i8>* %in2, i64 %call
@@ -56,7 +56,7 @@ entry:
 
 define spir_kernel void @test64(<64 x i8>* %out, <64 x i8>* %in1, <64 x i8>* %in2) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <64 x i8>, <64 x i8>* %in1, i64 %call
   %0 = load <64 x i8>, <64 x i8>* %arrayidx, align 4
   %arrayidx1 = getelementptr inbounds <64 x i8>, <64 x i8>* %in2, i64 %call
@@ -70,7 +70,7 @@ entry:
 
 define spir_kernel void @test_float_vectors(<2 x float>* %in) {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0)
+  %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds <2 x float>, <2 x float>* %in, i64 %call
   %0 = load <2 x float>, <2 x float>* %arrayidx, align 8
   %mul = fmul <2 x float> %0, %0
@@ -78,7 +78,7 @@ entry:
   ret void
 }
 
-declare spir_func i64 @_Z13get_global_idj(i32)
+declare i64 @__mux_get_global_id(i32)
 
 declare extern_weak spir_func i32 @printf(i8 addrspace(2)*, ...)
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_blend_div_loop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_blend_div_loop.ll
index 6fff2283f5329..8a3f1de861de8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_blend_div_loop.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_blend_div_loop.ll
@@ -24,9 +24,9 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: convergent nounwind
 define spir_kernel void @blend_div_loop(i8 addrspace(1)* %src1ptr, i32 %src1_step, i32 %src1_offset, i8 addrspace(1)* %dstptr, i32 %dst_step, i32 %dst_offset, i32 %dst_rows, i32 %dst_cols, i8 addrspace(1)* %src2ptr, i32 %src2_step, i32 %src2_offset, i8 addrspace(1)* %src3ptr, i32 %src3_step, i32 %src3_offset, i32 %rowsPerWI) #0 {
 entry:
-  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #2
+  %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
-  %call1 = call spir_func i64 @_Z13get_global_idj(i32 1) #2
+  %call1 = call i64 @__mux_get_global_id(i32 1) #2
   %0 = trunc i64 %call1 to i32
   %conv3 = mul i32 %0, %rowsPerWI
   %cmp = icmp slt i32 %conv, %dst_cols
@@ -134,7 +134,7 @@ if.end62:                                         ; preds = %for.cond, %entry
 }
 
 ; Function Attrs: convergent nounwind readonly
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 ; Function Attrs: convergent nounwind readonly
 declare spir_func i32 @_Z5mad24iii(i32, i32, i32) #1
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_scalar_gather_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_scalar_gather_load.ll
index debd60a1627dd..71ab928440cb9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_scalar_gather_load.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_scalar_gather_load.ll
@@ -22,16 +22,16 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
 ; Function Attrs: convergent nounwind readonly
-declare spir_func i64 @_Z12get_group_idj(i32)
+declare i64 @__mux_get_group_id(i32)
 
 ; Function Attrs: convergent nounwind readonly
-declare spir_func i64 @_Z12get_local_idj(i32)
+declare i64 @__mux_get_local_id(i32)
 
 ; Function Attrs: convergent nounwind
 define spir_kernel void @vecz_scalar_gather_load(i32 addrspace(1)* %row_indices, i32 addrspace(1)* %row_blocks, float addrspace(1)* %result) {
 entry:
-  %call1 = call spir_func i64 @_Z12get_group_idj(i32 0)
-  %call2 = call spir_func i64 @_Z12get_local_idj(i32 0)
+  %call1 = call i64 @__mux_get_group_id(i32 0)
+  %call2 = call i64 @__mux_get_local_id(i32 0)
   %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %row_blocks, i64 %call1
   %load1 = load i32, i32 addrspace(1)* %arrayidx1, align 4
   %add1 = add i64 %call1, 1
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_scalar_interleaved_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_scalar_interleaved_load.ll
index 749c56abe8b96..73b4b679fa85b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_scalar_interleaved_load.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_scalar_interleaved_load.ll
@@ -22,12 +22,12 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
 ; Function Attrs: nounwind readnone
-declare spir_func i64 @_Z13get_global_idj(i32) #0
+declare i64 @__mux_get_global_id(i32) #0
 
 define spir_kernel void @vecz_scalar_interleaved_load(float addrspace(1)* %out, i64 %n, float %m) {
 entry:
-  %gid0 = tail call spir_func i64 @_Z13get_global_idj(i32 0) #0
-  %gid1 = tail call spir_func i64 @_Z13get_global_idj(i32 1) #0
+  %gid0 = tail call i64 @__mux_get_global_id(i32 0) #0
+  %gid1 = tail call i64 @__mux_get_global_id(i32 1) #0
   %cmp1 = icmp slt i64 %gid0, %n
   br i1 %cmp1, label %if.then1, label %end
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/workitem_builtins.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/workitem_builtins.ll
index f66905fcdb66a..856180d6c002c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/workitem_builtins.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/workitem_builtins.ll
@@ -23,14 +23,14 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind
 define spir_kernel void @dont_mask_workitem_builtins(i32 addrspace(2)* %in, i32 addrspace(1)* %out) #0 {
 entry:
-  %call = call spir_func i64 @_Z12get_local_idj(i32 0) #5
+  %call = call i64 @__mux_get_local_id(i32 0) #5
   %conv = trunc i64 %call to i32
   %cmp = icmp sgt i32 %conv, 0
   br i1 %cmp, label %if.then, label %if.else
 
 if.then:                                          ; preds = %entry
   fence syncscope("singlethread") acq_rel
-  %call2 = call spir_func i64 @_Z13get_global_idj(i32 0) #5
+  %call2 = call i64 @__mux_get_global_id(i32 0) #5
   %conv3 = trunc i64 %call2 to i32
   %idxprom = sext i32 %conv3 to i64
   %arrayidx = getelementptr inbounds i32, i32 addrspace(2)* %in, i64 %idxprom
@@ -41,8 +41,8 @@ if.then:                                          ; preds = %entry
   br label %if.end
 
 if.else:                                          ; preds = %entry
-  %call8 = call spir_func i64 @_Z14get_local_sizej(i32 0) #5
-  %call9 = call spir_func i64 @_Z12get_group_idj(i32 0) #5
+  %call8 = call i64 @__mux_get_local_size(i32 0) #5
+  %call9 = call i64 @__mux_get_group_id(i32 0) #5
   %mul = mul i64 %call9, %call8
   %add = add i64 %mul, %call
   %sext = shl i64 %add, 32
@@ -55,13 +55,13 @@ if.end:                                           ; preds = %if.else, %if.then
   ret void
 }
 
-declare spir_func i64 @_Z12get_local_idj(i32) #1
+declare i64 @__mux_get_local_id(i32) #1
 
-declare spir_func i64 @_Z13get_global_idj(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
-declare spir_func i64 @_Z14get_local_sizej(i32) #1
+declare i64 @__mux_get_local_size(i32) #1
 
-declare spir_func i64 @_Z12get_group_idj(i32) #1
+declare i64 @__mux_get_group_id(i32) #1
 
 attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
@@ -86,20 +86,19 @@ attributes #6 = { nounwind }
 ; CHECK: define spir_kernel void @__vecz_v[[WIDTH:[0-9]+]]_dont_mask_workitem_builtins(
 
 ; Check if the builtins are still here
-; CHECK: call spir_func i64 @_Z12get_local_idj(i32 0)
-; CHECK: call spir_func i64 @_Z14get_local_sizej(i32 0)
-; CHECK: call spir_func i64 @_Z12get_group_idj(i32 0)
+; CHECK: call i64 @__mux_get_local_id(i32 0)
+; CHECK: call i64 @__mux_get_local_size(i32 0)
+; CHECK: call i64 @__mux_get_group_id(i32 0)
 ; CHECK: fence syncscope("singlethread") acq_rel
-; CHECK: call spir_func i64 @_Z13get_global_idj(i32 0)
-; CHECK-NOT: call spir_func i64 @__vecz_b_masked__Z13get_global_idj(i32
-; CHECK-NOT: call spir_func i64 @__vecz_b_masked__Z14get_local_sizej(i32
-; CHECK-NOT: call spir_func i64 @__vecz_b_masked__Z12get_group_idj(i32
+; CHECK: call i64 @__mux_get_global_id(i32 0)
+; CHECK-NOT: call spir_func i64 @__vecz_b_masked___mux_get_global_id(i32
+; CHECK-NOT: call spir_func i64 @__vecz_b_masked___mux_get_local_size(i32
+; CHECK-NOT: call spir_func i64 @__vecz_b_masked___mux_get_group_id(i32
 
 ; Function end
 ; CHECK: ret void
 
 ; Also check that we haven't declared the masked functions
-; CHECK-NOT: define private spir_func void @__vecz_b_masked__Z7barrierj(i32)
-; CHECK-NOT: define private spir_func i64 @__vecz_b_masked__Z13get_global_idj(i32, i1)
-; CHECK-NOT: define private spir_func i64 @__vecz_b_masked__Z14get_local_sizej(i32, i1)
-; CHECK-NOT: define private spir_func i64 @__vecz_b_masked__Z12get_group_idj(i32, i1)
+; CHECK-NOT: define private spir_func i64 @__vecz_b_masked___mux_get_group_id(i32, i1)
+; CHECK-NOT: define private spir_func i64 @__vecz_b_masked___mux_get_local_size(i32, i1)
+; CHECK-NOT: define private spir_func i64 @__vecz_b_masked___mux_get_group_id(i32, i1)

From 91ef73cf25a2f74a08958d6515d3cf3e5b741880 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <frasercrmck@gmail.com>
Date: Tue, 15 Aug 2023 16:01:46 +0100
Subject: [PATCH 018/182] [compiler] Update to build with LLVM 17

Some headers have been moved, and `PointerType::isOpaque` warns that
it's always true, which is pretty noisy.
---
 .../include/multi_llvm/creation_apis_helper.h  |  1 -
 .../include/multi_llvm/opaque_pointers.h       | 18 +++++++-----------
 .../source/transform/builtin_inlining_pass.cpp | 11 +++++++----
 .../vecz/tools/source/veczc.cpp                |  2 ++
 4 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/creation_apis_helper.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/creation_apis_helper.h
index cf1e5a80e6f0b..815c763c88c12 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/creation_apis_helper.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/creation_apis_helper.h
@@ -16,7 +16,6 @@
 #ifndef MULTI_LLVM_CREATION_APIS_HELPER_H_INCLUDED
 #define MULTI_LLVM_CREATION_APIS_HELPER_H_INCLUDED
 
-#include <llvm/ADT/None.h>
 #include <llvm/IR/BasicBlock.h>
 #include <llvm/IR/IRBuilder.h>
 #include <llvm/IR/Instructions.h>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/opaque_pointers.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/opaque_pointers.h
index 4a3dc772ca944..91e9de1d488a9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/opaque_pointers.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/opaque_pointers.h
@@ -18,26 +18,22 @@
 
 #include <llvm/IR/DerivedTypes.h>
 #include <llvm/IR/Type.h>
+#include <multi_llvm/llvm_version.h>
 
 namespace multi_llvm {
-inline bool isOpaquePointerTy(llvm::Type *Ty) {
-  if (auto *PTy = llvm::dyn_cast<llvm::PointerType>(Ty)) {
-    return PTy->isOpaque();
-  }
-  return false;
-}
-
 inline bool isOpaqueOrPointeeTypeMatches(llvm::PointerType *PTy, llvm::Type *) {
   (void)PTy;
+#if LLVM_VERSION_LESS(17, 0)
   assert(PTy->isOpaque() && "No support for typed pointers in LLVM 15+");
+#endif
   return true;
 }
 
 inline llvm::Type *getPtrElementType(llvm::PointerType *PTy) {
-  if (PTy->isOpaque()) {
-    return nullptr;
-  }
-  assert(false && "No support for typed pointers");
+  (void)PTy;
+#if LLVM_VERSION_LESS(17, 0)
+  assert(PTy->isOpaque() && "No support for typed pointers in LLVM 15+");
+#endif
   return nullptr;
 }
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/builtin_inlining_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/builtin_inlining_pass.cpp
index e81e2a0f32615..b3583dc42a26b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/builtin_inlining_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/builtin_inlining_pass.cpp
@@ -107,15 +107,17 @@ static Value *emitBuiltinMemSet(Function *F, IRBuilder<> &B,
   }
 
   Value *DstPtr = Args[0];
-  auto *DstPtrTy = cast<PointerType>(DstPtr->getType());
-
   Type *Int8Ty = B.getInt8Ty();
+
+#if LLVM_VERSION_LESS(17, 0)
+  auto *DstPtrTy = cast<PointerType>(DstPtr->getType());
   // FIXME: We implicitly assume pointers to i8 by doing byte-wise stores,
   // below. See CA-4331.
   if (!DstPtrTy->isOpaque() &&
       multi_llvm::getPtrElementType(DstPtrTy) != Int8Ty) {
     return nullptr;
   }
+#endif
 
   Value *StoredValue = Args[1];
   bool IsVolatile = (Args.back() == ConstantInt::getTrue(Context));
@@ -210,11 +212,11 @@ static Value *emitBuiltinMemCpy(Function *F, IRBuilder<> &B,
 
   Value *DstPtr = Args[0];
   Value *SrcPtr = Args[1];
+  Type *Int8Ty = B.getInt8Ty();
 
+#if LLVM_VERSION_LESS(17, 0)
   auto *DstPtrTy = cast<PointerType>(DstPtr->getType());
   auto *SrcPtrTy = cast<PointerType>(DstPtr->getType());
-
-  Type *Int8Ty = B.getInt8Ty();
   // FIXME: We implicitly assume pointers to i8 by doing byte-wise loads and
   // stores, below. See CA-4331.
   if ((!DstPtrTy->isOpaque() &&
@@ -223,6 +225,7 @@ static Value *emitBuiltinMemCpy(Function *F, IRBuilder<> &B,
         multi_llvm::getPtrElementType(SrcPtrTy) != Int8Ty))) {
     return nullptr;
   }
+#endif
 
   bool IsVolatile = (Args.back() == ConstantInt::getTrue(Context));
   llvm::StoreInst *MC = nullptr;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp
index 3cc04f29a25bd..09a5ee237d486 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp
@@ -286,7 +286,9 @@ int main(const int argc, const char *const argv[]) {
 
   llvm::SMDiagnostic err;
   llvm::LLVMContext context;
+#if LLVM_VERSION_LESS(17, 0)
   context.setOpaquePointers(true);
+#endif
 
   std::unique_ptr<llvm::Module> module =
       llvm::parseIRFile(InputFilename, err, context);

From b3d2276217e86302865f35d15fd265bc73a488d9 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <frasercrmck@gmail.com>
Date: Wed, 16 Aug 2023 15:29:33 +0100
Subject: [PATCH 019/182] [compiler] Fix a couple of lit tests with LLVM 17

1. LLVM 17 re-introduced an unreachable where we haven't seen one since
   LLVM 13.
2. LLVM 17 changes how debug DW_OP_deref is handled when converting
   debug expressions. See https://reviews.llvm.org/D142160. This is
   apparently an improvement in debug info, but it's hard to tell. It's
   also unclear what we're meant to be doing with debug info in this
   pass as our support for debug info is generally lacking, so for now
   we'll just accept LLVM 17's output without questioning it too much.
---
 .../compiler_passes/vecz/test/lit/llvm/undef_ub.ll        | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/undef_ub.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/undef_ub.ll
index 2ac20c8a249ec..a3db9d0350186 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/undef_ub.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/undef_ub.ll
@@ -14,7 +14,8 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: veczc -k test -w 4 -S < %s | FileCheck %s
+; RUN: %pp-llvm-ver -o %t < %s --llvm-ver %LLVMVER
+; RUN: veczc -k test -w 4 -S < %s | FileCheck %t
 
 ; ModuleID = 'Unknown buffer'
 source_filename = "Unknown buffer"
@@ -40,5 +41,6 @@ entry:
 ; The "undefs" in the above IR should "optimize" to a trap call and an unreachable
 ; terminator instruction.
 ; CHECK: define spir_kernel void @__vecz_v4_test
-; On LLVM 13+ there's no such trap: the UB is just that the function returns early.
-; CHECK: ret void
+; Before LLVM 17 there's no such trap: the UB is just that the function returns early.
+; CHECK-LT17: ret void
+; CHECK-GE17: unreachable

From e5904ee6f3d8c7c6a622351ca67bbf57ca45fa4f Mon Sep 17 00:00:00 2001
From: Amy <135044214+AmyCodeplay@users.noreply.github.com>
Date: Wed, 23 Aug 2023 15:22:06 +0100
Subject: [PATCH 020/182] Amy/refactor work group collectives 2 (#98)

Handle Work Group Reductions in the barrier pass
---
 .../analysis/uniform_value_analysis.cpp       | 14 ++--
 .../vecz/source/transform/packetizer.cpp      | 66 ++++++++++++++-----
 2 files changed, 58 insertions(+), 22 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp
index 10fd49cce220e..3f4f495c13dde 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp
@@ -106,13 +106,13 @@ bool UniformValueResult::isValueOrMaskVarying(const Value *V) const {
 }
 
 /// @brief Utility function to check whether an instruction is a call to a
-/// subgroup reduction or subgroup broadcast operaton.
+/// reduction or broadcast operaton.
 ///
 /// @param[in] I Instruction to check
 /// @param[in] BI BuiltinInfo for platform-specific builtin IDs
-/// @return true if the instruction is a call to a subgroup reduction or
+/// @return true if the instruction is a call to a reduction or broadcast
 /// builtin.
-static bool isSubgroupBroadcastOrReduction(
+static bool isGroupBroadcastOrReduction(
     const Instruction &I, const compiler::utils::BuiltinInfo &BI) {
   if (!isa<CallInst>(&I)) {
     return false;
@@ -123,7 +123,7 @@ static bool isSubgroupBroadcastOrReduction(
     return false;
   }
   auto Info = BI.isMuxGroupCollective(BI.analyzeBuiltin(*Callee).ID);
-  return Info && Info->isSubGroupScope() &&
+  return Info && (Info->isSubGroupScope() || Info->isWorkGroupScope()) &&
          (Info->isAnyAll() || Info->isReduction() || Info->isBroadcast());
 }
 
@@ -132,9 +132,9 @@ void UniformValueResult::findVectorLeaves(
   compiler::utils::BuiltinInfo &BI = Ctx.builtins();
   for (BasicBlock &BB : F) {
     for (Instruction &I : BB) {
-      // Subgroup reductions and broadcasts are always vector leaves regardless
-      // of uniformity.
-      if (isSubgroupBroadcastOrReduction(I, BI)) {
+      // Reductions and broadcasts are always vector leaves regardless of
+      // uniformity.
+      if (isGroupBroadcastOrReduction(I, BI)) {
         Leaves.push_back(&I);
         continue;
       }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
index 32f7803492522..d50b223139e99 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -1086,14 +1086,17 @@ Value *Packetizer::Impl::packetizeSubgroupReduction(Instruction *I) {
   auto const Builtin = BI.analyzeBuiltin(*callee);
   auto const Info = BI.isMuxGroupCollective(Builtin.ID);
 
-  if (!Info || !Info->isSubGroupScope() ||
+  if (!Info || (!Info->isSubGroupScope() && !Info->isWorkGroupScope()) ||
       (!Info->isAnyAll() && !Info->isReduction())) {
     return nullptr;
   }
 
+  bool isWorkGroup = Info->isWorkGroupScope();
+  unsigned argIdx = isWorkGroup ? 1 : 0;
+
   SmallVector<Value *, 16> opPackets;
-  IRBuilder<> B(buildAfter(CI, F));
-  auto *const argTy = CI->getArgOperand(0)->getType();
+  IRBuilder<> B(CI);
+  auto *const argTy = CI->getArgOperand(argIdx)->getType();
   auto packetWidth = getPacketWidthForType(argTy);
 
   // Don't vector predicate if we have to split into multiple packets. The
@@ -1105,7 +1108,7 @@ Value *Packetizer::Impl::packetizeSubgroupReduction(Instruction *I) {
     return nullptr;
   }
 
-  auto op = packetize(CI->getArgOperand(0));
+  auto op = packetize(CI->getArgOperand(argIdx));
 
   // Reduce the packet values in-place.
   // TODO: can we add 'reassoc' to the floating-point reductions to absolve
@@ -1147,9 +1150,16 @@ Value *Packetizer::Impl::packetizeSubgroupReduction(Instruction *I) {
   Value *v =
       createSimpleTargetReduction(B, &TTI, opPackets.front(), Info->Recurrence);
 
-  IC.deleteInstructionLater(CI);
-
-  CI->replaceAllUsesWith(v);
+  if (isWorkGroup) {
+    // For a work group operation, we leave the original reduction function and
+    // divert the subgroup reduction through it, giving us a work group
+    // reduction over subgroup reductions.
+    CI->setOperand(argIdx, v);
+    v = CI;
+  } else {
+    IC.deleteInstructionLater(CI);
+    CI->replaceAllUsesWith(v);
+  }
 
   return v;
 }
@@ -1163,14 +1173,21 @@ Value *Packetizer::Impl::packetizeSubgroupBroadcast(Instruction *I) {
   Function *callee = CI->getCalledFunction();
   auto const Builtin = BI.analyzeBuiltin(*callee);
 
-  if (auto Info = BI.isMuxGroupCollective(Builtin.ID);
-      !Info || !Info->isSubGroupScope() || !Info->isBroadcast()) {
+  bool isWorkGroup = false;
+  if (auto Info = BI.isMuxGroupCollective(Builtin.ID)) {
+    if (!Info->isBroadcast() ||
+        (!Info->isSubGroupScope() && !Info->isWorkGroupScope())) {
+      return nullptr;
+    }
+    isWorkGroup = Info->isWorkGroupScope();
+  } else {
     return nullptr;
   }
 
-  IRBuilder<> B(buildAfter(CI, F));
+  IRBuilder<> B(CI);
 
-  auto *const src = CI->getArgOperand(0);
+  unsigned argIdx = isWorkGroup ? 1 : 0;
+  auto *const src = CI->getArgOperand(argIdx);
 
   auto op = packetize(src);
   PACK_FAIL_IF(!op);
@@ -1183,7 +1200,19 @@ Value *Packetizer::Impl::packetizeSubgroupBroadcast(Instruction *I) {
     return src;
   }
 
-  auto *const idx = CI->getArgOperand(1);
+  auto *idx = CI->getArgOperand(argIdx + 1);
+  if (isWorkGroup) {
+    // When it's a work group broadcast, we need to sanitize the input index so
+    // that it stays within the range of one subgroup.
+    auto *const minVal =
+        ConstantInt::get(idx->getType(), SimdWidth.getKnownMinValue());
+    Value *idxFactor = minVal;
+    if (SimdWidth.isScalable()) {
+      idxFactor = B.CreateVScale(minVal);
+    }
+    idx = B.CreateURem(idx, idxFactor);
+  }
+
   Value *val = nullptr;
   // Optimize the constant fixed-vector case, where we can choose the exact
   // subpacket to extract from directly.
@@ -1207,9 +1236,16 @@ Value *Packetizer::Impl::packetizeSubgroupBroadcast(Instruction *I) {
     val = B.CreateExtractElement(op.getAsValue(), idx);
   }
 
-  IC.deleteInstructionLater(CI);
-
-  CI->replaceAllUsesWith(val);
+  if (isWorkGroup) {
+    // For a work group operation, we leave the origial broadcast function and
+    // divert the subgroup reduction through it, giving us a work group
+    // reduction over subgroup reductions.
+    CI->setOperand(argIdx, val);
+    val = CI;
+  } else {
+    IC.deleteInstructionLater(CI);
+    CI->replaceAllUsesWith(val);
+  }
 
   return val;
 }

From 10b9d78f24bf624205b40a6c0dda5f4c887852b4 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Mon, 28 Aug 2023 16:14:32 +0100
Subject: [PATCH 021/182] [vecz] Do not vectorize llvm.debug.value intrinsics

We were trying to update dbg.value intrinsics with the vectorized value
and by updating the DIType to a constructed vector type.

Unfortunately we weren't always getting this right, and sometimes
generating invalid debug expressions (noticed on LLVM 17).

I have evaluated the state of our debug info (with LLVM and lldb 16) and
found that, while I was able to step through kernels with a source view,
the experience is lacking overall and I was unable to inspect any
variables at all, regardless of whether we vectorized or not.

Given the fact that LLVM 17 errors about some invalid debug locations
and the fact we have no clear pathway towards fixing this properly, the
best thing to do is to remove our support for vectorized dbg.value
expressions for now.

We can revisit this with a broader view to improving our debugging
support later on.
---
 .../vecz/source/transform/packetizer.cpp      | 97 ++-----------------
 .../lit/llvm/insert_element_debug_info.ll     | 14 +--
 .../test/lit/llvm/packetization_debug_info.ll | 52 ++++------
 .../vecz/test/lit/llvm/phi_node_debug_info.ll | 18 ++--
 4 files changed, 45 insertions(+), 136 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
index d50b223139e99..8fd763e4ff105 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -205,7 +205,7 @@ class Packetizer::Impl : public Packetizer {
   Value *packetizeSubgroupBroadcast(Instruction *I);
   /// @brief Packetize PHI node.
   ///
-  /// @param[in] PHI PHI Node to packetize.
+  /// @param[in] Phi PHI Node to packetize.
   ///
   /// @return Packetized values.
   ValuePacket packetizePHI(PHINode *Phi);
@@ -218,7 +218,7 @@ class Packetizer::Impl : public Packetizer {
   /// @brief Packetize a subgroup scan.
   ///
   /// @param[in] CI CallInst to packetize.
-  /// @param[in] SubgroupScanKind type of subgroup scan to packetized.
+  /// @param[in] Scan type of subgroup scan to packetized.
   ///
   /// @return Packetized values.
   ValuePacket packetizeSubgroupScan(CallInst *CI,
@@ -230,9 +230,9 @@ class Packetizer::Impl : public Packetizer {
   ///
   /// @return Packetized values.
   Result assign(Value *Scalar, Value *Vectorized);
-  /// @brief Packetize a load instruction.
+  /// @brief Vectorize an instruction.
   ///
-  /// @param[in] Load Instruction to packetize.
+  /// @param[in] Ins Instruction to packetize.
   ///
   /// @return Packetized instruction.
   Value *vectorizeInstruction(Instruction *Ins);
@@ -274,7 +274,7 @@ class Packetizer::Impl : public Packetizer {
   ValuePacket packetizeBinaryOp(BinaryOperator *BinOp);
   /// @brief Packetize a freeze instruction.
   ///
-  /// @param[in] FreezeInst Instruction to packetize.
+  /// @param[in] FreezeI Instruction to packetize.
   ///
   /// @return Packetized instruction.
   ValuePacket packetizeFreeze(FreezeInst *FreezeI);
@@ -340,9 +340,9 @@ class Packetizer::Impl : public Packetizer {
   ///
   /// @return Packetized instruction.
   ValuePacket packetizeInsertElement(InsertElementInst *InsertElement);
-  /// @brief Packetize an insert element instruction.
+  /// @brief Packetize an extract element instruction.
   ///
-  /// @param[in] InsertElement Instruction to packetize.
+  /// @param[in] ExtractElement Instruction to packetize.
   ///
   /// @return Packetized instruction.
   ValuePacket packetizeExtractElement(ExtractElementInst *ExtractElement);
@@ -2104,87 +2104,8 @@ ValuePacket Packetizer::Impl::packetizeMemOp(MemOp &op) {
   return results;
 }
 
-void Packetizer::Impl::vectorizeDI(Instruction *Scalar, Value *Packet) {
-  auto *const LAM = LocalAsMetadata::getIfExists(Scalar);
-  if (!LAM) {
-    return;
-  }
-
-  auto *const MDV = MetadataAsValue::getIfExists(Scalar->getContext(), LAM);
-  if (!MDV) {
-    return;
-  }
-
-  DIBuilder DIB(*Scalar->getModule(), false);
-
-  // Find all the debug value intrinsics attached to scalar instruction
-  for (User *U : MDV->users()) {
-    DbgValueInst *const DVI = dyn_cast<DbgValueInst>(U);
-    if (!DVI) {
-      continue;
-    }
-
-    DILocalVariable *const DILocal = DVI->getVariable();
-    DIType *LocalType = dyn_cast<DIType>(DILocal->getType());
-
-    // Vector types need to be of a integral base type
-    while (!isa<DIBasicType>(LocalType)) {
-      if (DIDerivedType *DerivedType = dyn_cast<DIDerivedType>(LocalType)) {
-        LocalType = dyn_cast_or_null<DIType>(DerivedType->getBaseType());
-      } else if (DICompositeType *CompositeType =
-                     dyn_cast<DICompositeType>(LocalType)) {
-        auto baseType = CompositeType->getBaseType();
-        LocalType = dyn_cast_or_null<DIType>(baseType);
-      } else {
-        // Error case:
-        // No other valid derived classes of DIType,
-        // however some might be added to LLVM in the future.
-        break;
-      }
-
-      if (!LocalType) {
-        break;
-      }
-    }
-
-    // Type is something complex like a struct which we can't handle
-    if (!LocalType) {
-      continue;
-    }
-
-    if (SimdWidth.isScalable()) {
-      continue;
-    }
-    // Create a new DI vector type with simd width
-    const unsigned int Width = SimdWidth.getFixedValue();
-    Metadata *const Subscript = DIB.getOrCreateSubrange(0, Width);
-    DINodeArray SubscriptArray = DIB.getOrCreateArray(Subscript);
-
-    const uint64_t Size = LocalType->getSizeInBits() * Width;
-    const uint64_t Align = LocalType->getAlignInBits() * Width;
-
-    DICompositeType *const VectorType =
-        DIB.createVectorType(Size, Align, LocalType, SubscriptArray);
-
-    // Replace DILocalVariable type with our new vectorized type
-    DILocal->replaceOperandWith(3, VectorType);
-
-    // New packetized instruction will point to the base of our vector type
-    auto DIExpr = DIB.createExpression();
-
-    // Create llvm.dbg.value() intrinsic for packetized instruction,
-    // but can't insert it before a phi node.
-    if (isa<PHINode>(Scalar)) {
-      DIB.insertDbgValueIntrinsic(Packet, DILocal, DIExpr, DVI->getDebugLoc(),
-                                  Scalar->getParent()->getFirstNonPHI());
-    } else {
-      DIB.insertDbgValueIntrinsic(Packet, DILocal, DIExpr, DVI->getDebugLoc(),
-                                  Scalar);
-    }
-    // Delete the old scalar debug intrinsic since the instruction
-    // it references will also be deleted.
-    IC.deleteInstructionLater(DVI);
-  }
+void Packetizer::Impl::vectorizeDI(Instruction *, Value *) {
+  // FIXME: Reinstate support for vectorizing debug info
   return;
 }
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insert_element_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insert_element_debug_info.ll
index bd3430a9f8214..a23678ae51a4e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insert_element_debug_info.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insert_element_debug_info.ll
@@ -24,12 +24,13 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-; Function Attrs: nounwind
+; CHECK: define spir_kernel void @__vecz_v4_unaligned_load
 define spir_kernel void @unaligned_load(i32 addrspace(1)* %in, i32 addrspace(1)* %offsets, i32 addrspace(1)* %out) #0 !dbg !7 {
 entry:
   %in.addr = alloca i32 addrspace(1)*, align 8
   %offsets.addr = alloca i32 addrspace(1)*, align 8
   %out.addr = alloca i32 addrspace(1)*, align 8
+; CHECK: %tmp = alloca <16 x i32>, align 16
   %tid = alloca i32, align 4
   %tmp = alloca <3 x i32>, align 16
   store i32 addrspace(1)* %in, i32 addrspace(1)** %in.addr, align 8
@@ -44,6 +45,12 @@ entry:
   store i32 %conv, i32* %tid, align 4, !dbg !31
   call void @llvm.dbg.declare(metadata <3 x i32>* %tmp, metadata !15, metadata !29), !dbg !32
   %0 = load i32 addrspace(1)*, i32 addrspace(1)** %in.addr, align 8, !dbg !32
+; CHECK: %[[TMP_LD:.+]] = call <4 x i32> @__vecz_b_interleaved_load4_4_Dv4_ju3ptr(ptr nonnull %tmp)
+; FIXME: This llvm.dbg.value marks a 'kill location' and denotes the
+; termination of the previous value assigned to %tmp - we could probably do
+; better here by manifesting a vectorized value?
+; CHECK: call void @llvm.dbg.value(metadata i32 {{(poison|undef)}}, metadata !{{[0-9]+}},
+; CHECK-SAME:   metadata !DIExpression({{.*}})), !dbg !{{[0-9]+}}
   %1 = load i32, i32* %tid, align 4, !dbg !32
   %mul = mul nsw i32 3, %1, !dbg !32
   %idx.ext = sext i32 %mul to i64, !dbg !32
@@ -135,8 +142,3 @@ attributes #3 = { nobuiltin }
 !34 = !DILocation(line: 5, scope: !7)
 !35 = !DILocation(line: 6, scope: !7)
 !36 = !DILocation(line: 7, scope: !7)
-
-; CHECK: define spir_kernel void @__vecz_v4_unaligned_load
-; CHECK: %tmp = alloca <16 x i32>, align 16
-; CHECK: %[[TMP_LD:.+]] = call <4 x i32> @__vecz_b_interleaved_load4_4_Dv4_ju3ptr(ptr nonnull %tmp)
-; CHECK: call void @llvm.dbg.value(metadata <4 x i32> %[[TMP_LD]], metadata !{{[0-9]+}}, metadata !DIExpression()), !dbg !{{[0-9]+}}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_debug_info.ll
index e2b653d52525b..bcd40ab98077f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_debug_info.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_debug_info.ll
@@ -23,7 +23,9 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-; Function Attrs: nounwind
+; Vectorized kernel function
+; CHECK: @__vecz_v[[WIDTH:[0-9]+]]_add({{.*}} !dbg [[VECZ_SUBPROG:![0-9]+]]
+; Check that intrinsics for user variable locations are still present
 define spir_kernel void @add(i32 addrspace(1)* %in1, i32 addrspace(1)* %in2, i32 addrspace(1)* %out) #0 !dbg !4 {
 entry:
   %in1.addr = alloca i32 addrspace(1)*, align 8
@@ -33,20 +35,34 @@ entry:
   %a = alloca i32, align 4
   %b = alloca i32, align 4
   store i32 addrspace(1)* %in1, i32 addrspace(1)** %in1.addr, align 8
+; CHECK: call void @llvm.dbg.value(metadata ptr addrspace(1) %in1, metadata [[DI_IN1:![0-9]+]], metadata [[EXPR:!DIExpression()]]
+; CHECK-SAME: !dbg [[PARAM_LOC:![0-9]+]]
   call void @llvm.dbg.declare(metadata i32 addrspace(1)** %in1.addr, metadata !11, metadata !29), !dbg !30
   store i32 addrspace(1)* %in2, i32 addrspace(1)** %in2.addr, align 8
+; CHECK: call void @llvm.dbg.value(metadata ptr addrspace(1) %in2, metadata [[DI_IN2:![0-9]+]], metadata [[EXPR]]
+; CHECK-SAME: !dbg [[PARAM_LOC]]
   call void @llvm.dbg.declare(metadata i32 addrspace(1)** %in2.addr, metadata !12, metadata !29), !dbg !30
   store i32 addrspace(1)* %out, i32 addrspace(1)** %out.addr, align 8
+; CHECK: call void @llvm.dbg.value(metadata ptr addrspace(1) %out, metadata [[DI_OUT:![0-9]+]], metadata [[EXPR]]
+; CHECK-SAME: !dbg [[PARAM_LOC]]
   call void @llvm.dbg.declare(metadata i32 addrspace(1)** %out.addr, metadata !13, metadata !29), !dbg !30
+; CHECK: call void @llvm.dbg.value(metadata i64 %call, metadata [[DI_TID:![0-9]+]], metadata [[EXPR]]
+; CHECK-SAME: !dbg [[TID_LOC:![0-9]+]]
   call void @llvm.dbg.declare(metadata i64* %tid, metadata !14, metadata !29), !dbg !31
   %call = call i64 @__mux_get_global_id(i32 0) #3, !dbg !31
   store i64 %call, i64* %tid, align 8, !dbg !31
+; FIXME: We're dropping the llvm.dbg.declare/llvm.dbg.value for %a here - we
+; could probably preserve it.
+; CHECK-NOT: call void @llvm.dbg.value(
   call void @llvm.dbg.declare(metadata i32* %a, metadata !19, metadata !29), !dbg !32
   %0 = load i64, i64* %tid, align 8, !dbg !32
   %1 = load i32 addrspace(1)*, i32 addrspace(1)** %in1.addr, align 8, !dbg !32
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %1, i64 %0, !dbg !32
   %2 = load i32, i32 addrspace(1)* %arrayidx, align 4, !dbg !32
   store i32 %2, i32* %a, align 4, !dbg !32
+; FIXME: We're dropping the llvm.dbg.declare/llvm.dbg.value for %a here - we
+; could probably preserve it.
+; CHECK-NOT: call void @llvm.dbg.value(
   call void @llvm.dbg.declare(metadata i32* %b, metadata !20, metadata !29), !dbg !33
   %3 = load i64, i64* %tid, align 8, !dbg !33
   %4 = load i32 addrspace(1)*, i32 addrspace(1)** %in2.addr, align 8, !dbg !33
@@ -115,27 +131,6 @@ attributes #3 = { nobuiltin }
 !34 = !DILocation(line: 7, scope: !4)
 !35 = !DILocation(line: 8, scope: !4)
 
-; Vectorized kernel function
-; CHECK: @__vecz_v[[WIDTH:[0-9]+]]_add({{.*}} !dbg [[VECZ_SUBPROG:![0-9]+]]
-
-; Check that intrinsics for user variable locations are still present
-; CHECK: call void @llvm.dbg.value(metadata ptr addrspace(1) %in1, metadata [[DI_IN1:![0-9]+]], metadata [[EXPR:!DIExpression()]]
-; CHECK-SAME: !dbg [[PARAM_LOC:![0-9]+]]
-
-; CHECK: call void @llvm.dbg.value(metadata ptr addrspace(1) %in2, metadata [[DI_IN2:![0-9]+]], metadata [[EXPR]]
-; CHECK-SAME: !dbg [[PARAM_LOC]]
-
-; CHECK: call void @llvm.dbg.value(metadata ptr addrspace(1) %out, metadata [[DI_OUT:![0-9]+]], metadata [[EXPR]]
-; CHECK-SAME: !dbg [[PARAM_LOC]]
-
-; CHECK: call void @llvm.dbg.value(metadata i64 %call, metadata [[DI_TID:![0-9]+]], metadata [[EXPR]]
-; CHECK-SAME: !dbg [[TID_LOC:![0-9]+]]
-
-; CHECK: call void @llvm.dbg.value(metadata {{.*}}, metadata [[DI_A:![0-9]+]], metadata [[EXPR]]
-; CHECK-SAME:!dbg [[A_LOC:![0-9]+]]
-
-; CHECK: call void @llvm.dbg.value(metadata {{.*}}, metadata [[DI_B:![0-9]+]], metadata [[EXPR]]
-; CHECK-SAME:!dbg [[B_LOC:![0-9]+]]
 
 ; Debug info metadata entries
 ; CHECK:[[PTR_TYPE:![0-9]+]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: [[DI_BASE:![0-9]+]], size: 64, align: 64)
@@ -144,7 +139,7 @@ attributes #3 = { nobuiltin }
 ; CHECK: [[VECZ_SUBPROG]] = distinct !DISubprogram(name: "add",
 ; CHECK-SAME: retainedNodes: [[VECZ_VARS:![0-9]+]]
 
-; CHECK: [[VECZ_VARS]] = !{[[DI_IN1]], [[DI_IN2]], [[DI_OUT]], [[DI_TID]], [[DI_A]], [[DI_B]]}
+; CHECK: [[VECZ_VARS]] = !{[[DI_IN1]], [[DI_IN2]], [[DI_OUT]], [[DI_TID]], [[DI_A:![0-9]+]], [[DI_B:![0-9]+]]}
 ; CHECK: [[DI_IN1]] = !DILocalVariable(name: "in1", arg: 1, scope: [[VECZ_SUBPROG]],
 ; CHECK-SAME:line: 1, type: [[PTR_TYPE]]
 ; CHECK: [[DI_IN2]] = !DILocalVariable(name: "in2", arg: 2, scope: [[VECZ_SUBPROG]],
@@ -154,14 +149,3 @@ attributes #3 = { nobuiltin }
 
 ; CHECK: [[DI_TID]] = !DILocalVariable(name: "tid", scope: [[VECZ_SUBPROG]]
 ; CHECK: [[DI_A]] = !DILocalVariable(name: "a", scope: [[VECZ_SUBPROG]],
-; CHECK-SAME:line: 5, type: [[VECTOR_TYPE:![0-9]+]])
-
-; Vectorized debug info type create in packetization pass
-; CHECK: [[VECTOR_TYPE]] = !DICompositeType(tag: DW_TAG_array_type, baseType: [[DI_BASE]], size: {{[0-9]+}}, align: {{[0-9]+}}
-; CHECK-SAME:flags: DIFlagVector, elements: ![[DI_ELEMS:[0-9]+]])
-; CHECK:[[DI_ELEMS]] = !{[[DI_SUBRANGE:![0-9]+]]}
-; LLVM 11 adds a lowerBound argument to DISubrange, so the optional check below
-; CHECK: [[DI_SUBRANGE]] = !DISubrange(count: [[WIDTH]]{{(, lowerBound: [0-9])?}})
-
-; CHECK: [[DI_B]] = !DILocalVariable(name: "b", scope: [[VECZ_SUBPROG]],
-; CHECK-SAME: line: 6, type: [[VECTOR_TYPE]])
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_node_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_node_debug_info.ll
index ab9e14c63b94e..cbd7444ed6b8d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_node_debug_info.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_node_debug_info.ll
@@ -17,13 +17,14 @@
 ; Check that debug info intrinsics are correctly placed after
 ; phi nodes.
 
-; RUN: veczc -k loop_phi -vecz-simd-width=4 -S < %s | FileCheck %s
+; RUN: veczc -vecz-simd-width=4 -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
 ; Function Attrs: nounwind
+; CHECK: define spir_kernel void @__vecz_v4_loop_phi(
 define spir_kernel void @loop_phi(i32 addrspace(3)* %a, i32 addrspace(3)* %b) #0 !dbg !4 {
 entry:
   %a.addr = alloca i32 addrspace(3)*, align 8
@@ -43,6 +44,13 @@ entry:
   store i32 %conv, i32* %i, align 4, !dbg !33
   br label %for.cond, !dbg !33
 
+
+; CHECK: for.cond:
+; CHECK: %[[PHI1:.+]] = phi {{i[0-9]+}} [ %{{.+}}, %entry ], [ %{{.+}}, %for.cond ]
+; CHECK: call void @llvm.dbg.value(metadata i64 %[[PHI1]], metadata !{{[0-9]+}},
+; CHECK-SAME: metadata !DIExpression({{.*}})), !dbg !{{[0-9]+}}
+; Check we haven't inserted a llvm.dbg.value intrinsic before the last of the PHIs.
+; CHECK-NOT: phi
 for.cond:                                         ; preds = %for.inc, %entry
   %1 = load i32, i32* %i, align 4, !dbg !34
   %cmp = icmp slt i32 %1, 128, !dbg !34
@@ -68,6 +76,7 @@ for.inc:                                          ; preds = %for.body
   br label %for.cond, !dbg !34
 
 for.end:                                          ; preds = %for.cond
+; CHECK: ret void
   ret void, !dbg !39
 }
 
@@ -126,10 +135,3 @@ attributes #3 = { nobuiltin }
 !37 = distinct !DILexicalBlock(scope: !35, file: !1, line: 4)
 !38 = !DILocation(line: 6, scope: !37)
 !39 = !DILocation(line: 7, scope: !4)
-
-; CHECK: for.cond:
-; CHECK: %[[PHI1:.+]] = phi <4 x [[TYPE:i[0-9]+]]> [ %{{.+}}, %entry ], [ %{{.+}}, %for.cond ]
-; CHECK: call void @llvm.dbg.value(metadata <4 x [[TYPE]]> %[[PHI1]], metadata !{{[0-9]+}}, metadata !DIExpression()), !dbg !{{[0-9]+}}
-; CHECK-NOT: phi
-
-; CHECK: ret void

From c93b31fe7bfa3fdbfe6be1a1b6beb423af4d4f92 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Mon, 28 Aug 2023 17:35:35 +0100
Subject: [PATCH 022/182] [compiler] Delete dead CLBuiltinInfo code

These builtins have all been converted to mux builtins and, as such, the
code is dead.

This also updates some vecz tests which were testing the old CL builtins
rather than the generic mux ones.
---
 .../test/lit/llvm/constant_address_with_uniform.ll   |  8 ++++----
 .../vecz/test/lit/llvm/diverging_loop.ll             | 10 +++++-----
 .../vecz/test/lit/llvm/diverging_nested_loop.ll      | 10 +++++-----
 .../lit/llvm/scalar_load_store_in_varying_branch.ll  | 10 +++++-----
 .../vecz/test/lit/llvm/scalar_splat.ll               |  8 +++-----
 ...calar_splat_after_load_store_in_varying_branch.ll | 11 +++++------
 .../lit/llvm/scalar_splat_after_varying_branch.ll    | 11 +++++------
 .../test/lit/llvm/scalar_splat_in_varying_branch.ll  | 12 ++++++------
 .../vecz/test/lit/llvm/secretly_scalar_load_store.ll |  8 ++++----
 .../vecz/test/lit/llvm/uniform_loop.ll               |  6 +++---
 .../vecz/test/lit/llvm/uniform_loop_metadata.ll      |  6 +++---
 .../vecz/test/lit/llvm/unmangled_builtin_call.ll     |  8 ++++----
 12 files changed, 52 insertions(+), 56 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address_with_uniform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address_with_uniform.ll
index 74e35d42163ba..d2ff89e2e6aab 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address_with_uniform.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address_with_uniform.ll
@@ -14,16 +14,16 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: veczc -k test -w 4 -S < %s | FileCheck %s
+; RUN: veczc -w 4 -S < %s | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
 target triple = "spir-unknown-unknown"
 
-declare spir_func i32 @get_global_id(i32);
+declare spir_func i32 @__mux_get_global_id(i32);
 
 define spir_kernel void @test(i32 addrspace(1)* %out, i32 addrspace(1)* addrspace(1)* %out2) {
 entry:
-  %gid = call i32 @get_global_id(i32 0)
+  %gid = call i32 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 3
   store i32 %gid, i32 addrspace(1)* %arrayidx, align 4
 
@@ -35,7 +35,7 @@ entry:
 
 ; CHECK: define spir_kernel void @__vecz_v4_test
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: %gid = call i32 @get_global_id(i32 0)
+; CHECK-NEXT: %gid = call i32 @__mux_get_global_id(i32 0)
 ; CHECK-NEXT: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %out, i32 3
 ; CHECK: store i32 %gid, ptr addrspace(1) %arrayidx, align 4
 ; CHECK: store <4 x ptr addrspace(1)> %{{.+}}, ptr addrspace(1) %{{.+}}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_loop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_loop.ll
index a8c7d4edf8ee8..157e28cb1261c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_loop.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_loop.ll
@@ -14,18 +14,18 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: veczc -k test -w 4 -S < %s | FileCheck %s
+; RUN: veczc -w 4 -S < %s | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
 target triple = "spir-unknown-unknown"
 
-declare spir_func i32 @get_local_id(i32);
-declare spir_func i32 @get_local_size(i32);
+declare i32 @__mux_get_local_id(i32);
+declare i32 @__mux_get_local_size(i32);
 
 define spir_kernel void @test(i32 addrspace(1)* %in) {
 entry:
-  %id = call i32 @get_local_id(i32 0)
-  %size = call i32 @get_local_size(i32 0)
+  %id = call i32 @__mux_get_local_id(i32 0)
+  %size = call i32 @__mux_get_local_size(i32 0)
   br label %loop
 
 loop:
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_nested_loop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_nested_loop.ll
index 486d6c200857e..5abfe81e27bf4 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_nested_loop.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_nested_loop.ll
@@ -14,18 +14,18 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: veczc -k test -w 4 -S < %s | FileCheck %s
+; RUN: veczc -w 4 -S < %s | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
 target triple = "spir-unknown-unknown"
 
-declare spir_func i32 @get_local_id(i32);
-declare spir_func i32 @get_local_size(i32);
+declare i32 @__mux_get_local_id(i32);
+declare i32 @__mux_get_local_size(i32);
 
 define spir_kernel void @test(i32 addrspace(1)* %in) {
 entry:
-  %id = call i32 @get_local_id(i32 0)
-  %size = call i32 @get_local_size(i32 0)
+  %id = call i32 @__mux_get_local_id(i32 0)
+  %size = call i32 @__mux_get_local_size(i32 0)
   br label %loop
 
 loop:
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_load_store_in_varying_branch.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_load_store_in_varying_branch.ll
index ec972edbc80d5..a850d6f99ac6d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_load_store_in_varying_branch.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_load_store_in_varying_branch.ll
@@ -14,17 +14,17 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: veczc -k test -w 4 -S < %s | FileCheck %s
+; RUN: veczc -w 4 -S < %s | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
 target triple = "spir-unknown-unknown"
 
-declare spir_func i32 @get_local_id(i32);
-declare spir_func i32 @get_global_id(i32);
+declare spir_func i32 @__mux_get_local_id(i32);
+declare spir_func i32 @__mux_get_global_id(i32);
 
 define spir_kernel void @test(i32 addrspace(1)* %in) {
 entry:
-  %lid = call i32 @get_local_id(i32 0)
+  %lid = call i32 @__mux_get_local_id(i32 0)
   %cmp = icmp eq i32 %lid, 0
   br i1 %cmp, label %if, label %merge
 
@@ -37,7 +37,7 @@ if:
 merge:
   %multi_load = load i32, i32 addrspace(1)* %in
   %multi_add = add i32 %multi_load, 42
-  %gid = call i32 @get_global_id(i32 0)
+  %gid = call i32 @__mux_get_global_id(i32 0)
   %slot = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %gid
   store i32 %multi_add, i32 addrspace(1)* %slot
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat.ll
index c13008f425340..39792aee4f089 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat.ll
@@ -14,19 +14,17 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: veczc -k test -w 4 -S < %s | FileCheck %s
+; RUN: veczc -w 4 -S < %s | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
 target triple = "spir-unknown-unknown"
 
-declare spir_func void @barrier(i32);
-declare spir_func i32 @get_local_id(i32);
-declare spir_func i32 @get_global_id(i32);
+declare i32 @__mux_get_global_id(i32);
 
 define spir_kernel void @test(i32 addrspace(1)* %in) {
 entry:
   %load = load i32, i32 addrspace(1)* %in
-  %gid = call i32 @get_global_id(i32 0)
+  %gid = call i32 @__mux_get_global_id(i32 0)
   %slot = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %gid
   store i32 %load, i32 addrspace(1)* %slot
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_after_load_store_in_varying_branch.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_after_load_store_in_varying_branch.ll
index ba1f94c452384..89ddc9091c3a6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_after_load_store_in_varying_branch.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_after_load_store_in_varying_branch.ll
@@ -14,18 +14,17 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: veczc -k test -w 4 -S < %s | FileCheck %s
+; RUN: veczc -w 4 -S < %s | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
 target triple = "spir-unknown-unknown"
 
-declare spir_func void @barrier(i32);
-declare spir_func i32 @get_local_id(i32);
-declare spir_func i32 @get_global_id(i32);
+declare i32 @__mux_get_local_id(i32);
+declare i32 @__mux_get_global_id(i32);
 
 define spir_kernel void @test(i32 addrspace(1)* %in) {
 entry:
-  %lid = call i32 @get_local_id(i32 0)
+  %lid = call i32 @__mux_get_local_id(i32 0)
   %cmp = icmp eq i32 %lid, 0
   br i1 %cmp, label %if, label %merge
 
@@ -37,7 +36,7 @@ if:
 
 merge:
   %load = load i32, i32 addrspace(1)* %in
-  %gid = call i32 @get_global_id(i32 0)
+  %gid = call i32 @__mux_get_global_id(i32 0)
   %slot = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %gid
   store i32 %load, i32 addrspace(1)* %slot
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_after_varying_branch.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_after_varying_branch.ll
index 09c2e0b3680bb..be8fd26dae033 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_after_varying_branch.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_after_varying_branch.ll
@@ -14,18 +14,17 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: veczc -k test -w 4 -S < %s | FileCheck %s
+; RUN: veczc -w 4 -S < %s | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
 target triple = "spir-unknown-unknown"
 
-declare spir_func void @barrier(i32);
-declare spir_func i32 @get_local_id(i32);
-declare spir_func i32 @get_global_id(i32);
+declare i32 @__mux_get_local_id(i32);
+declare i32 @__mux_get_global_id(i32);
 
 define spir_kernel void @test(i32 addrspace(1)* %in) {
 entry:
-  %lid = call i32 @get_local_id(i32 0)
+  %lid = call i32 @__mux_get_local_id(i32 0)
   %cmp = icmp eq i32 %lid, 0
   br i1 %cmp, label %if, label %merge
 
@@ -34,7 +33,7 @@ if:
 
 merge:
   %load = load i32, i32 addrspace(1)* %in
-  %gid = call i32 @get_global_id(i32 0)
+  %gid = call i32 @__mux_get_global_id(i32 0)
   %slot = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %gid
   store i32 %load, i32 addrspace(1)* %slot
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_in_varying_branch.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_in_varying_branch.ll
index c8c424e746203..43bf13e839152 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_in_varying_branch.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_in_varying_branch.ll
@@ -14,23 +14,23 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: veczc -k test -w 4 -S < %s | FileCheck %s
+; RUN: veczc -w 4 -S < %s | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
 target triple = "spir-unknown-unknown"
 
-declare spir_func i32 @get_local_id(i32);
-declare spir_func i32 @get_global_id(i32);
+declare i32 @__mux_get_local_id(i32);
+declare i32 @__mux_get_global_id(i32);
 
 define spir_kernel void @test(i32 addrspace(1)* %in) {
 entry:
-  %lid = call i32 @get_local_id(i32 0)
+  %lid = call i32 @__mux_get_local_id(i32 0)
   %and = and i32 %lid, 1
   %cmp = icmp eq i32 %and, 0
   br i1 %cmp, label %if, label %merge
 
 if:
-  %lid1 = call i32 @get_local_id(i32 1)
+  %lid1 = call i32 @__mux_get_local_id(i32 1)
   %cmp1 = icmp eq i32 %lid1, 0
   br i1 %cmp1, label %deeper_if, label %deeper_merge
 
@@ -39,7 +39,7 @@ deeper_if:
 
 deeper_merge:
   %load = load i32, i32 addrspace(1)* %in
-  %gid = call i32 @get_global_id(i32 0)
+  %gid = call i32 @__mux_get_global_id(i32 0)
   %slot = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %gid
   store i32 %load, i32 addrspace(1)* %slot
   br label %merge
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/secretly_scalar_load_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/secretly_scalar_load_store.ll
index 0986e47372bd9..09135be13353a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/secretly_scalar_load_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/secretly_scalar_load_store.ll
@@ -14,23 +14,23 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: veczc -k test -w 4 -S < %s | FileCheck %s
+; RUN: veczc -w 4 -S < %s | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
 target triple = "spir-unknown-unknown"
 
-declare spir_func i32 @get_global_id(i32);
+declare i32 @__mux_get_global_id(i32);
 
 define spir_kernel void @test(i32 addrspace(1)* %in) {
 entry:
-  %gid = call i32 @get_global_id(i32 0)
+  %gid = call i32 @__mux_get_global_id(i32 0)
   %and = and i32 %gid, 1
   %cmp = icmp eq i32 %and, 0
   br i1 %cmp, label %if, label %early_ret
 
 early_ret:
 ; just to prevent ROSCC from sticking its oar in
-  %gid1 = call i32 @get_global_id(i32 1)
+  %gid1 = call i32 @__mux_get_global_id(i32 1)
   ret void
 
 if:
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop.ll
index 0b35a0f28a69c..43c60eb882ea7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop.ll
@@ -14,16 +14,16 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: veczc -k test -w 4 -S < %s | FileCheck %s
+; RUN: veczc -w 4 -S < %s | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
 target triple = "spir-unknown-unknown"
 
-declare spir_func i32 @get_local_size(i32);
+declare i32 @__mux_get_local_size(i32);
 
 define spir_kernel void @test(i32 addrspace(1)* %in) {
 entry:
-  %size = call i32 @get_local_size(i32 0)
+  %size = call i32 @__mux_get_local_size(i32 0)
   br label %loop
 
 loop:
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_metadata.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_metadata.ll
index 075ec2ea1ac71..9b7640aa94c88 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_metadata.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_metadata.ll
@@ -14,16 +14,16 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: veczc -k test -w 4 -S < %s | FileCheck %s
+; RUN: veczc -w 4 -S < %s | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
 target triple = "spir-unknown-unknown"
 
-declare spir_func i32 @get_local_size(i32);
+declare spir_func i32 @__mux_get_local_size(i32);
 
 define spir_kernel void @test(i32 addrspace(1)* %in) {
 entry:
-  %size = call i32 @get_local_size(i32 0)
+  %size = call i32 @__mux_get_local_size(i32 0)
   br label %loop
 
 loop:
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/unmangled_builtin_call.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/unmangled_builtin_call.ll
index f694914f58e62..a0973a053e579 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/unmangled_builtin_call.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/unmangled_builtin_call.ll
@@ -23,7 +23,7 @@ target triple = "spir64-unknown-unknown"
 ; Function Attrs: nounwind uwtable
 define void @k_controlflow_loop_if(float* nocapture %out, float* nocapture readonly %in1, i32* nocapture readnone %in2) #0 {
 entry:
-  %call = tail call i64 @get_global_id(i32 0) #2
+  %call = tail call i64 @__mux_get_global_id(i32 0) #2
   %sext = shl i64 %call, 32
   %idxprom = ashr exact i64 %sext, 32
   %arrayidx = getelementptr inbounds float, float* %in1, i64 %idxprom
@@ -35,7 +35,7 @@ entry:
   ret void
 }
 
-declare i64 @get_global_id(i32) #1
+declare i64 @__mux_get_global_id(i32) #1
 
 attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
@@ -59,8 +59,8 @@ attributes #2 = { nobuiltin nounwind }
 ; The vectorized function
 ; CHECK: define void @__vecz_v[[WIDTH:[0-9]+]]_k_controlflow_loop_if(
 
-; The unmangled get_global_id call
-; CHECK: tail call i64 @get_global_id(i32 0)
+; The unmangled __mux_get_global_id call
+; CHECK: tail call i64 @__mux_get_global_id(i32 0)
 
 ; The vectorized loads and stores
 ; CHECK: load <4 x i32>, ptr %arrayidx, align 4

From 73bd8bd76cbbfc15d02414378e6be1e71207ee21 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Tue, 29 Aug 2023 16:44:51 +0100
Subject: [PATCH 023/182] [vecz] Remove checks for non-i8 memcpy/memsets

We always work with opaque pointers so these checks are dead code.
---
 .../transform/builtin_inlining_pass.cpp       | 23 ------
 .../test/lit/llvm/builtin_inlining_mem.ll     | 77 -------------------
 2 files changed, 100 deletions(-)
 delete mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_mem.ll

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/builtin_inlining_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/builtin_inlining_pass.cpp
index b3583dc42a26b..8448d1fde243a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/builtin_inlining_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/builtin_inlining_pass.cpp
@@ -109,16 +109,6 @@ static Value *emitBuiltinMemSet(Function *F, IRBuilder<> &B,
   Value *DstPtr = Args[0];
   Type *Int8Ty = B.getInt8Ty();
 
-#if LLVM_VERSION_LESS(17, 0)
-  auto *DstPtrTy = cast<PointerType>(DstPtr->getType());
-  // FIXME: We implicitly assume pointers to i8 by doing byte-wise stores,
-  // below. See CA-4331.
-  if (!DstPtrTy->isOpaque() &&
-      multi_llvm::getPtrElementType(DstPtrTy) != Int8Ty) {
-    return nullptr;
-  }
-#endif
-
   Value *StoredValue = Args[1];
   bool IsVolatile = (Args.back() == ConstantInt::getTrue(Context));
   llvm::StoreInst *MS = nullptr;
@@ -214,19 +204,6 @@ static Value *emitBuiltinMemCpy(Function *F, IRBuilder<> &B,
   Value *SrcPtr = Args[1];
   Type *Int8Ty = B.getInt8Ty();
 
-#if LLVM_VERSION_LESS(17, 0)
-  auto *DstPtrTy = cast<PointerType>(DstPtr->getType());
-  auto *SrcPtrTy = cast<PointerType>(DstPtr->getType());
-  // FIXME: We implicitly assume pointers to i8 by doing byte-wise loads and
-  // stores, below. See CA-4331.
-  if ((!DstPtrTy->isOpaque() &&
-       multi_llvm::getPtrElementType(DstPtrTy) != Int8Ty) ||
-      ((!SrcPtrTy->isOpaque() &&
-        multi_llvm::getPtrElementType(SrcPtrTy) != Int8Ty))) {
-    return nullptr;
-  }
-#endif
-
   bool IsVolatile = (Args.back() == ConstantInt::getTrue(Context));
   llvm::StoreInst *MC = nullptr;
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_mem.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_mem.ll
deleted file mode 100644
index 23099611e2e78..0000000000000
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_mem.ll
+++ /dev/null
@@ -1,77 +0,0 @@
-
-; Copyright (C) Codeplay Software Limited
-;
-; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
-; Exceptions; you may not use this file except in compliance with the License.
-; You may obtain a copy of the License at
-;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
-;
-; Unless required by applicable law or agreed to in writing, software
-; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-; License for the specific language governing permissions and limitations
-; under the License.
-;
-; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-; RUN: veczc -vecz-passes=builtin-inlining,verify -vecz-simd-width=4 -S < %s | FileCheck %s
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "spir64-unknown-unknown"
-
-; FIXME: CA-4331 - we can't inline non-i8 memcpy/memset
-
-define spir_kernel void @test_memset_i16(i64* %z) {
-  %dst = bitcast i64* %z to i16*
-  call void @llvm.memset.p0i16.i64(i16* %dst, i8 42, i64 18, i32 8, i1 false)
-  ret void
-}
-
-
-; CHECK-LABEL: define spir_kernel void @__vecz_v4_test_memset_i16(ptr %z)
-; CHECK: [[D1:%.*]] = getelementptr inbounds i8, ptr %dst, i64 0
-; CHECK: store i64 3038287259199220266, ptr [[D1]], align 8
-
-; CHECK: [[D2:%.*]] = getelementptr inbounds i8, ptr %dst, i64 8
-; CHECK: store i64 3038287259199220266, ptr [[D2]], align 8
-
-; CHECK: [[D3:%.*]] = getelementptr inbounds i8, ptr %dst, i64 16
-; CHECK: store i8 42, ptr [[D3]], align 1
-
-; CHECK: [[D4:%.*]] = getelementptr inbounds i8, ptr %dst, i64 17
-; CHECK: store i8 42, ptr [[D4]], align 1
-; CHECK: }
-
-define spir_kernel void @test_memcpy_i16(i64* %a, i64* %z) {
-  %src = bitcast i64* %a to i16*
-  %dst = bitcast i64* %z to i16*
-  call void @llvm.memcpy.p0i16.p0i16.i64(i16* %dst, i16* %src, i64 18, i32 8, i1 false)
-  ret void
-}
-
-
-; CHECK-LABEL: define spir_kernel void @__vecz_v4_test_memcpy_i16(ptr %a, ptr %z)
-; CHECK: [[S1:%.*]] = getelementptr inbounds i8, ptr %src, i64 0
-; CHECK: [[D1:%.*]] = getelementptr inbounds i8, ptr %dst, i64 0
-; CHECK: [[SRC1:%.*]] = load i64, ptr [[S1]], align 8
-; CHECK: store i64 [[SRC1]], ptr [[D1]], align 8
-
-; CHECK: [[S2:%.*]] = getelementptr inbounds i8, ptr %src, i64 8
-; CHECK: [[D2:%.*]] = getelementptr inbounds i8, ptr %dst, i64 8
-; CHECK: [[SRC2:%.*]] = load i64, ptr [[S2]], align 8
-; CHECK: store i64 [[SRC2]], ptr [[D2]], align 8
-
-; CHECK: [[S3:%.*]] = getelementptr inbounds i8, ptr %src, i64 16
-; CHECK: [[D3:%.*]] = getelementptr inbounds i8, ptr %dst, i64 16
-; CHECK: [[SRC3:%.*]] = load i8, ptr [[S3]], align 1
-; CHECK: store i8 [[SRC3]], ptr [[D3]], align 1
-
-; CHECK: [[S4:%.*]] = getelementptr inbounds i8, ptr %src, i64 17
-; CHECK: [[D4:%.*]] = getelementptr inbounds i8, ptr %dst, i64 17
-; CHECK: [[SRC4:%.*]] = load i8, ptr [[S4]], align 1
-; CHECK: store i8 [[SRC4]], ptr [[D4]], align 1
-; CHECK: }
-
-declare void @llvm.memset.p0i16.i64(i16*, i8, i64, i32, i1)
-declare void @llvm.memcpy.p0i16.p0i16.i64(i16*, i16*, i64, i32, i1)

From 427b2ef6f02d2ed56b775e8379ef79397073643f Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Wed, 30 Aug 2023 17:45:56 +0100
Subject: [PATCH 024/182] [vecz] Remove dead/discarded function demangle

I'm not entirely sure what's going on here or why the compiler didn't
warn about an unused variable, but this is obviously unnecessary.
---
 .../compiler_passes/vecz/source/vectorization_context.cpp      | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
index 5e27bf0632efc..39cdf21813016 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
@@ -143,9 +143,6 @@ VectorizationResult &VectorizationContext::getOrCreateBuiltin(
     return result;
   }
 
-  compiler::utils::NameMangler Mangler(&F.getContext());
-  auto const BuiltinName = Mangler.demangleName(F.getName()).str();
-
   result.func = VectorCallee;
 
   // Gather information about the function's arguments.

From 0ed8ebd6deb3d47a34fb6d62a1a35fdc28ace716 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Thu, 31 Aug 2023 10:25:50 +0100
Subject: [PATCH 025/182] [vecz] Switch from AssertingVH to PoisoningVH

When using asserting value handles, if the vectorizer quits unexpectedly
then the cache of vectorization units remains active. When the
LLVMContext finally comes to clear its memory for functions, the value
handles would assert that there are still uses of the functions we'd
cached. Since this happens after the time our error handlers have been
detached we'd end up bringing down the OpenCL process, unhelpfully to
users.

The intended use of the handle was to catch pointers to Functions being
used as keys in the map, where if we deleted that Function LLVM may
allocate a new one in its place, and we'd incorrectly end up using a
cached vectorization unit for the new function.

Instead we can use a poisoning value handle, which poisons itself if
the value is destroyed while the handle is still live. If looking up a
new value in the map finds a handle from the old value, an assert is
triggered. This should still be caught by our error handlers and safely
dealt with, as it would happen during the compilation process.

Perhaps another solution would be to install a vecz error handler to
clear up the cache so we can keep using asserting value handles.
However, this seems like the intended use for the poisoning handles.
---
 .../compiler_passes/vecz/source/include/vectorization_context.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_context.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_context.h
index 9fc3c35a21d09..aa8b0654733ec 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_context.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_context.h
@@ -47,7 +47,7 @@ class VectorizationChoices;
 struct VectorizationResult;
 class VectorizationUnit;
 
-using ActiveUnitMap = llvm::DenseMap<llvm::AssertingVH<const llvm::Function>,
+using ActiveUnitMap = llvm::DenseMap<llvm::PoisoningVH<const llvm::Function>,
                                      VectorizationUnit *>;
 
 /// @brief Holds global (per-module) vectorization state.

From fdbb6b1aa5ae4c276dad4d6241c973d1ab06494b Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Wed, 30 Aug 2023 16:17:22 +0100
Subject: [PATCH 026/182] [multi_llvm] Remove opaque pointer helpers

All pointers are opaque as of LLVM 15 and, as a result, these checks are
useless and just a maintenance burden. They arguably serve as
documentation of programmer intent but I don't think that's a good enough
reason to keep them.
---
 .../include/multi_llvm/opaque_pointers.h      | 42 -------------------
 .../analysis/instantiation_analysis.cpp       |  6 +--
 .../analysis/packetization_analysis.cpp       |  4 --
 .../vecz/source/memory_operations.cpp         | 18 --------
 .../transform/builtin_inlining_pass.cpp       |  1 -
 .../interleaved_group_combine_pass.cpp        |  6 ---
 .../vecz/source/transform/packetizer.cpp      |  3 --
 .../vecz/source/vector_target_info.cpp        |  7 ----
 .../vecz/source/vectorization_context.cpp     |  3 --
 9 files changed, 1 insertion(+), 89 deletions(-)
 delete mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/opaque_pointers.h

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/opaque_pointers.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/opaque_pointers.h
deleted file mode 100644
index 91e9de1d488a9..0000000000000
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/opaque_pointers.h
+++ /dev/null
@@ -1,42 +0,0 @@
-// Copyright (C) Codeplay Software Limited
-//
-// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
-// Exceptions; you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-// License for the specific language governing permissions and limitations
-// under the License.
-//
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#ifndef MULTI_LLVM_OPAQUE_POINTERS_H_INCLUDED
-#define MULTI_LLVM_OPAQUE_POINTERS_H_INCLUDED
-
-#include <llvm/IR/DerivedTypes.h>
-#include <llvm/IR/Type.h>
-#include <multi_llvm/llvm_version.h>
-
-namespace multi_llvm {
-inline bool isOpaqueOrPointeeTypeMatches(llvm::PointerType *PTy, llvm::Type *) {
-  (void)PTy;
-#if LLVM_VERSION_LESS(17, 0)
-  assert(PTy->isOpaque() && "No support for typed pointers in LLVM 15+");
-#endif
-  return true;
-}
-
-inline llvm::Type *getPtrElementType(llvm::PointerType *PTy) {
-  (void)PTy;
-#if LLVM_VERSION_LESS(17, 0)
-  assert(PTy->isOpaque() && "No support for typed pointers in LLVM 15+");
-#endif
-  return nullptr;
-}
-
-};  // namespace multi_llvm
-
-#endif  // MULTI_LLVM_OPAQUE_POINTERS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/instantiation_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/instantiation_analysis.cpp
index 434d427ddde03..3458dd504f956 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/instantiation_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/instantiation_analysis.cpp
@@ -17,7 +17,6 @@
 #include "analysis/instantiation_analysis.h"
 
 #include <compiler/utils/builtin_info.h>
-#include <multi_llvm/opaque_pointers.h>
 #include <multi_llvm/vector_type_helper.h>
 
 #include "analysis/uniform_value_analysis.h"
@@ -37,10 +36,7 @@ bool analyzeType(Type *Ty) {
 }
 
 bool analyzeMemOp(MemOp &Op) {
-  assert(isa<PointerType>(Op.getPointerType()) &&
-         multi_llvm::isOpaqueOrPointeeTypeMatches(
-             cast<PointerType>(Op.getPointerType()), Op.getDataType()) &&
-         "MemOp inconsistency");
+  assert(Op.getPointerType()->isPointerTy() && "MemOp inconsistency");
   return analyzeType(Op.getDataType());
 }
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/packetization_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/packetization_analysis.cpp
index 9613422675414..fc840022d31ba 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/packetization_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/packetization_analysis.cpp
@@ -21,7 +21,6 @@
 #include <llvm/IR/IntrinsicInst.h>
 #include <llvm/IR/Module.h>
 #include <llvm/Support/Debug.h>
-#include <multi_llvm/opaque_pointers.h>
 
 #include "analysis/stride_analysis.h"
 #include "analysis/uniform_value_analysis.h"
@@ -120,9 +119,6 @@ void PacketizationAnalysisResult::markForPacketization(Value *V) {
       if (hasValidStride) {
         // Get the pointer stride as a number of elements
         auto *const eltTy = mo->getDataType();
-        assert(multi_llvm::isOpaqueOrPointeeTypeMatches(
-                   cast<PointerType>(ptr->getType()), eltTy) &&
-               "MemOp assumption broken");
         if (eltTy->isVectorTy() || eltTy->isPointerTy()) {
           // No interleaved memops exist for vector element types or pointer
           // types. We can only vectorize pointer loads/stores or widen vector
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/memory_operations.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/memory_operations.cpp
index b4788067249f7..1c0b7ca79ce61 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/memory_operations.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/memory_operations.cpp
@@ -20,7 +20,6 @@
 #include <llvm/IR/Instructions.h>
 #include <llvm/IR/Module.h>
 #include <llvm/Support/raw_ostream.h>
-#include <multi_llvm/opaque_pointers.h>
 #include <multi_llvm/optional_helper.h>
 #include <multi_llvm/vector_type_helper.h>
 
@@ -36,13 +35,10 @@ using namespace llvm;
 static std::string getMaskedMemOpName(Type *DataTy, PointerType *PtrTy,
                                       Type *MaskTy, unsigned Alignment,
                                       bool IsLoad, bool IsVP) {
-  assert(multi_llvm::isOpaqueOrPointeeTypeMatches(PtrTy, DataTy) &&
-         "Invalid masked memory operation");
   if (!DataTy) {
     return std::string();
   }
   compiler::utils::NameMangler Mangler(&DataTy->getContext());
-  assert(multi_llvm::isOpaqueOrPointeeTypeMatches(PtrTy, DataTy));
   const char *BaseName = IsLoad ? "masked_load" : "masked_store";
   compiler::utils::TypeQualifiers DataQuals(compiler::utils::eTypeQualNone);
   compiler::utils::TypeQualifiers PtrQuals(compiler::utils::eTypeQualNone,
@@ -75,8 +71,6 @@ Function *vecz::getOrCreateMaskedMemOpFn(VectorizationContext &Ctx,
                                          Type *DataTy, PointerType *PtrTy,
                                          unsigned Alignment, bool IsLoad,
                                          bool IsVP) {
-  assert(multi_llvm::isOpaqueOrPointeeTypeMatches(PtrTy, DataTy) &&
-         "Invalid masked memory operation");
   Module &M = Ctx.module();
   LLVMContext &LLVMCtx = M.getContext();
   Type *MaskTy = IntegerType::getInt1Ty(LLVMCtx);
@@ -121,8 +115,6 @@ static CallInst *createMaskedMemOp(VectorizationContext &Ctx, Value *Data,
   if (Ptr->getType() != PtrTy) {
     Ptr = BitCastInst::CreatePointerCast(Ptr, PtrTy, "", InsertBefore);
   }
-  assert(multi_llvm::isOpaqueOrPointeeTypeMatches(cast<PointerType>(PtrTy),
-                                                  DataTy));
   Function *F =
       getOrCreateMaskedMemOpFn(Ctx, DataTy, PtrTy, Alignment,
                                /*IsLoad*/ Data == nullptr, EVL != nullptr);
@@ -159,9 +151,6 @@ static std::string getInterleavedMemOpName(Type *DataTy, PointerType *PtrTy,
                                            Value *Stride, Type *MaskTy,
                                            unsigned Alignment, bool IsLoad,
                                            bool IsVP) {
-  assert(multi_llvm::isOpaqueOrPointeeTypeMatches(PtrTy,
-                                                  DataTy->getScalarType()) &&
-         "Invalid masked memory operation");
   if (!DataTy) {
     return std::string();
   }
@@ -213,8 +202,6 @@ Function *vecz::getOrCreateInterleavedMemOpFn(VectorizationContext &Ctx,
                                               Value *Stride, Type *MaskTy,
                                               unsigned Alignment, bool IsLoad,
                                               bool IsVP) {
-  assert(
-      multi_llvm::isOpaqueOrPointeeTypeMatches(PtrTy, DataTy->getScalarType()));
   Module &M = Ctx.module();
   LLVMContext &LLVMCtx = M.getContext();
 
@@ -261,8 +248,6 @@ static CallInst *createInterleavedMemOp(VectorizationContext &Ctx, Value *Data,
   if (Ptr->getType() != PtrTy) {
     Ptr = BitCastInst::CreatePointerCast(Ptr, PtrTy, "", InsertBefore);
   }
-  assert(multi_llvm::isOpaqueOrPointeeTypeMatches(cast<PointerType>(PtrTy),
-                                                  DataTy->getScalarType()));
   Type *MaskTy = Mask ? Mask->getType() : nullptr;
   Function *F = getOrCreateInterleavedMemOpFn(
       Ctx, DataTy, PtrTy, Stride, MaskTy, Alignment,
@@ -391,9 +376,6 @@ static CallInst *createScatterGatherMemOp(VectorizationContext &Ctx,
   VECZ_FAIL_IF(!DataTy);
   VECZ_FAIL_IF(!VecPtr || !VecPtr->getType()->isVectorTy() ||
                !VecPtr->getType()->getScalarType()->isPointerTy());
-  assert(multi_llvm::isOpaqueOrPointeeTypeMatches(
-      cast<PointerType>(VecPtr->getType()->getScalarType()),
-      DataTy->getScalarType()));
   Type *MaskTy = Mask ? Mask->getType() : nullptr;
   Function *F = getOrCreateScatterGatherMemOpFn(
       Ctx, DataTy, cast<VectorType>(VecPtr->getType()), MaskTy, Alignment,
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/builtin_inlining_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/builtin_inlining_pass.cpp
index 8448d1fde243a..308cc64f677bd 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/builtin_inlining_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/builtin_inlining_pass.cpp
@@ -22,7 +22,6 @@
 #include <llvm/IR/Module.h>
 #include <llvm/Transforms/IPO.h>
 #include <llvm/Transforms/IPO/AlwaysInliner.h>
-#include <multi_llvm/opaque_pointers.h>
 
 #include "analysis/vectorization_unit_analysis.h"
 #include "debugging.h"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/interleaved_group_combine_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/interleaved_group_combine_pass.cpp
index a10f26b8ca2d6..2cca07c57c088 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/interleaved_group_combine_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/interleaved_group_combine_pass.cpp
@@ -22,7 +22,6 @@
 #include <llvm/IR/Instructions.h>
 #include <llvm/Support/Debug.h>
 #include <llvm/Transforms/Utils/Local.h>
-#include <multi_llvm/opaque_pointers.h>
 
 #include "analysis/uniform_value_analysis.h"
 #include "analysis/vectorization_unit_analysis.h"
@@ -303,9 +302,6 @@ PreservedAnalyses InterleavedGroupCombinePass::run(
         Value *Base = Group.Base;
         if (Kind == eInterleavedLoad && Group.Offset != 0) {
           auto *EltTy = Group.Info.front().DataTy->getScalarType();
-          assert(multi_llvm::isOpaqueOrPointeeTypeMatches(
-                     cast<PointerType>(Base->getType()), EltTy) &&
-                 "Unhandled interleaved access");
           // if it's a Load group that was out of order, we have to use the
           // sequentially first GEP in order to preserve use-def ordering,
           // which means we have to offset it with an additional GEP and
@@ -397,8 +393,6 @@ bool InterleavedGroupCombinePass::findGroup(
     }
 
     Type *EleTy = DataType0->getScalarType();
-    assert(multi_llvm::isOpaqueOrPointeeTypeMatches(PtrTy, EleTy) &&
-           "Unhandled interleaved accesses");
     unsigned Align = EleTy->getScalarSizeInBits() / 8;
     assert(Align != 0 &&
            "interleaved memory operation with zero-sized elements");
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
index 8fd763e4ff105..8786308f999e5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -37,7 +37,6 @@
 #include <multi_llvm/creation_apis_helper.h>
 #include <multi_llvm/llvm_version.h>
 #include <multi_llvm/multi_llvm.h>
-#include <multi_llvm/opaque_pointers.h>
 #include <multi_llvm/vector_type_helper.h>
 
 #include <memory>
@@ -2123,8 +2122,6 @@ ValuePacket Packetizer::Impl::packetizeGEP(GetElementPtrInst *GEP) {
 
   // Work out the packet width from the pointed to type, rather than the
   // pointer type itself, because this is the width the memops will be using.
-  assert(multi_llvm::isOpaqueOrPointeeTypeMatches(
-      cast<PointerType>(pointer->getType()), GEP->getSourceElementType()));
   auto *const ty = GEP->getSourceElementType();
   auto const packetWidth = getPacketWidthForType(ty);
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
index 2d22f5d5d3d5f..81db2d7be2f49 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
@@ -20,7 +20,6 @@
 #include <llvm/MC/TargetRegistry.h>
 #include <llvm/Target/TargetMachine.h>
 #include <multi_llvm/creation_apis_helper.h>
-#include <multi_llvm/opaque_pointers.h>
 #include <multi_llvm/triple.h>
 #include <multi_llvm/vector_type_helper.h>
 
@@ -433,8 +432,6 @@ Value *TargetInfo::createMaskedInterleavedLoad(IRBuilder<> &B, Type *Ty,
                                                unsigned Alignment) const {
   // We only support scalar pointer types
   assert(!Ptr->getType()->isVectorTy() && "Unsupported interleaved load");
-  assert(multi_llvm::isOpaqueOrPointeeTypeMatches(
-      cast<PointerType>(Ptr->getType()), Ty->getScalarType()));
 
   auto EC = multi_llvm::getVectorElementCount(Ty);
   Value *BroadcastAddr = B.CreateVectorSplat(EC, Ptr, "BroadcastAddr");
@@ -457,8 +454,6 @@ Value *TargetInfo::createMaskedInterleavedStore(IRBuilder<> &B, Value *Data,
                                                 unsigned Alignment) const {
   // We only support scalar pointer types
   assert(!Ptr->getType()->isVectorTy() && "Unsupported interleaved store");
-  assert(multi_llvm::isOpaqueOrPointeeTypeMatches(
-      cast<PointerType>(Ptr->getType()), Data->getType()->getScalarType()));
   auto EC = multi_llvm::getVectorElementCount(Data->getType());
   Value *BroadcastAddr = B.CreateVectorSplat(EC, Ptr, "BroadcastAddr");
   Value *StrideSplat = B.CreateVectorSplat(EC, Stride);
@@ -1097,8 +1092,6 @@ bool TargetInfo::optimizeInterleavedGroup(IRBuilder<> &B,
   unsigned SimdWidth = VecWidth.getFixedValue();
 
   Type *EleTy = VecTy->getElementType();
-  assert(multi_llvm::isOpaqueOrPointeeTypeMatches(PtrTy, EleTy) &&
-         "Unhandled interleaved accesses");
   unsigned Align = EleTy->getScalarSizeInBits() / 8;
 
   bool HasMask =
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
index 39cdf21813016..c6aa7c7969caa 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
@@ -23,7 +23,6 @@
 #include <llvm/IR/Attributes.h>
 #include <llvm/Target/TargetMachine.h>
 #include <multi_llvm/creation_apis_helper.h>
-#include <multi_llvm/opaque_pointers.h>
 #include <multi_llvm/vector_type_helper.h>
 
 #include <algorithm>
@@ -472,8 +471,6 @@ bool VectorizationContext::emitMaskedMemOpBody(Function &F,
   Value *Mask = Desc.getMaskOperand(&F);
   Value *VL = Desc.isVLOp() ? Desc.getVLOperand(&F) : nullptr;
   Type *DataTy = Desc.isLoad() ? F.getReturnType() : Data->getType();
-  assert(multi_llvm::isOpaqueOrPointeeTypeMatches(
-      cast<PointerType>(Ptr->getType()), DataTy));
 
   BasicBlock *Entry = BasicBlock::Create(F.getContext(), "entry", &F);
   IRBuilder<> B(Entry);

From 5858572965f984a6057598436076a22087900a88 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Wed, 30 Aug 2023 16:44:38 +0100
Subject: [PATCH 027/182] [compiler] Remove all but one use of
 multi_llvm::Optional

Most of these are internal to the toolkit and so it's best just to move
to std::optional. There's one remaining case where we use an LLVM API
which takes an llvm::Optional on LLVM 15 but that will shortly be
deletable too.
---
 .../vecz/source/include/debugging.h           |   6 +-
 .../vecz/source/include/memory_operations.h   |  36 +++--
 .../vecz/source/memory_operations.cpp         | 132 +++++++++---------
 .../source/transform/basic_mem2reg_pass.cpp   |   1 +
 .../control_flow_conversion_pass.cpp          |   3 +-
 .../interleaved_group_combine_pass.cpp        |   8 +-
 .../vecz/source/transform/passes.cpp          |   2 +-
 .../vecz/source/vectorization_context.cpp     |  13 +-
 .../vecz/tools/source/veczc.cpp               |   1 +
 9 files changed, 99 insertions(+), 103 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/debugging.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/debugging.h
index efe67adef667c..55d063bfabc84 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/debugging.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/debugging.h
@@ -29,10 +29,10 @@
 #include <llvm/IR/PassManager.h>
 #include <llvm/IR/Value.h>
 #include <llvm/Support/raw_ostream.h>
-#include <multi_llvm/optional_helper.h>
 
 #include <cstdlib>
 #include <memory>
+#include <optional>
 
 namespace vecz {
 
@@ -61,8 +61,8 @@ struct VeczFailResult {
   }
   /// @brief For functions that return an llvm::Optional
   template <typename T>
-  operator multi_llvm::Optional<T>() const {
-    return multi_llvm::None;
+  operator std::optional<T>() const {
+    return std::nullopt;
   }
 };
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/memory_operations.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/memory_operations.h
index 8ddf484a3c0a6..ddb2bfcb9915b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/memory_operations.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/memory_operations.h
@@ -24,9 +24,8 @@
 #include <inttypes.h>
 #include <llvm/IR/IRBuilder.h>
 #include <llvm/IR/LLVMContext.h>
-#include <multi_llvm/optional_helper.h>
 
-#include <string>
+#include <optional>
 
 namespace llvm {
 class CallInst;
@@ -386,9 +385,8 @@ class MemOpDesc {
   /// @param[in] F Function to analyze.
   ///
   /// @return A MemOpDesc if the given function is a memory operation.
-  /// llvm::None otherwise.
-  static multi_llvm::Optional<MemOpDesc> analyzeMemOpFunction(
-      llvm::Function &F);
+  /// std::nullopt otherwise.
+  static std::optional<MemOpDesc> analyzeMemOpFunction(llvm::Function &F);
 
   /// @brief Determine whether the given function is a masked memory operation.
   /// If that's the case, the descriptor is populated and returned.
@@ -396,8 +394,8 @@ class MemOpDesc {
   /// @param[in] F Function to analyze.
   ///
   /// @return A MemOpDesc if the given function is a masked memory operation.
-  /// llvm::None otherwise.
-  static multi_llvm::Optional<MemOpDesc> analyzeMaskedMemOp(llvm::Function &F);
+  /// std::nullopt otherwise.
+  static std::optional<MemOpDesc> analyzeMaskedMemOp(llvm::Function &F);
 
   /// @brief Determine whether the given function is an interleaved memory
   /// operation or not. If that's the case, the descriptor is populated and
@@ -406,9 +404,8 @@ class MemOpDesc {
   /// @param[in] F Function to analyze.
   ///
   /// @return A MemOpDesc if the given function is an interleaved memory
-  /// operation. llvm::None otherwise.
-  static multi_llvm::Optional<MemOpDesc> analyzeInterleavedMemOp(
-      llvm::Function &F);
+  /// operation. std::nullopt otherwise.
+  static std::optional<MemOpDesc> analyzeInterleavedMemOp(llvm::Function &F);
 
   /// @brief Determine whether the given function is a masked interleaved memory
   /// operation or not. If that's the case, the descriptor is populated and
@@ -417,8 +414,8 @@ class MemOpDesc {
   /// @param[in] F Function to analyze.
   ///
   /// @return A MemOpDesc if the given function is a masked interleaved memory
-  /// operation. llvm::None otherwise.
-  static multi_llvm::Optional<MemOpDesc> analyzeMaskedInterleavedMemOp(
+  /// operation. std::nullopt otherwise.
+  static std::optional<MemOpDesc> analyzeMaskedInterleavedMemOp(
       llvm::Function &F);
 
   /// @brief Determine whether the given function is a scatter/gather memory
@@ -428,9 +425,8 @@ class MemOpDesc {
   /// @param[in] F Function to analyze.
   ///
   /// @return A MemOpDesc if the given function is a scatter/gather operation.
-  /// llvm::None otherwise.
-  static multi_llvm::Optional<MemOpDesc> analyzeScatterGatherMemOp(
-      llvm::Function &F);
+  /// std::nullopt otherwise.
+  static std::optional<MemOpDesc> analyzeScatterGatherMemOp(llvm::Function &F);
 
   /// @brief Determine whether the given function is a scatter/gather memory
   /// operation or not. If that's the case, the descriptor is populated and
@@ -439,8 +435,8 @@ class MemOpDesc {
   /// @param[in] F Function to analyze.
   ///
   /// @return A MemOpDesc if the given function is a masked scatter/gather
-  /// operation. llvm::None otherwise.
-  static multi_llvm::Optional<MemOpDesc> analyzeMaskedScatterGatherMemOp(
+  /// operation. std::nullopt otherwise.
+  static std::optional<MemOpDesc> analyzeMaskedScatterGatherMemOp(
       llvm::Function &F);
 
   /// @brief Determine whether the operation is a load or not.
@@ -493,14 +489,14 @@ struct MemOp {
   MemOp(llvm::Instruction *I, const MemOpDesc &Desc);
   /// @brief Create a memory operation from an instruction.
   /// @param[in] I Instruction that may be a memory operation.
-  static multi_llvm::Optional<MemOp> get(llvm::Instruction *I);
+  static std::optional<MemOp> get(llvm::Instruction *I);
   /// @brief Create a memory operation from an instruction and an existing
   /// memory operation descriptor.
   ///
   /// @param[in] CI Memory builtin call instruction.
   /// @param[in] AccessKind the kind of access to consider
-  static multi_llvm::Optional<MemOp> get(llvm::CallInst *CI,
-                                         MemOpAccessKind AccessKind);
+  static std::optional<MemOp> get(llvm::CallInst *CI,
+                                  MemOpAccessKind AccessKind);
 
   /// @brief Access the memory operation descriptor.
   const MemOpDesc &getDesc() const { return Desc; }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/memory_operations.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/memory_operations.cpp
index 1c0b7ca79ce61..e99117f87ac17 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/memory_operations.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/memory_operations.cpp
@@ -20,9 +20,10 @@
 #include <llvm/IR/Instructions.h>
 #include <llvm/IR/Module.h>
 #include <llvm/Support/raw_ostream.h>
-#include <multi_llvm/optional_helper.h>
 #include <multi_llvm/vector_type_helper.h>
 
+#include <string>
+
 #include "analysis/instantiation_analysis.h"
 #include "analysis/uniform_value_analysis.h"
 #include "debugging.h"
@@ -440,7 +441,7 @@ Argument *MemOpDesc::getOperand(Function *F, int OpIdx) const {
   return F->getArg(OpIdx);
 }
 
-multi_llvm::Optional<MemOpDesc> MemOpDesc::analyzeMemOpFunction(Function &F) {
+std::optional<MemOpDesc> MemOpDesc::analyzeMemOpFunction(Function &F) {
   if (auto Op = MemOpDesc::analyzeMaskedMemOp(F)) {
     return Op;
   }
@@ -456,27 +457,27 @@ multi_llvm::Optional<MemOpDesc> MemOpDesc::analyzeMemOpFunction(Function &F) {
   if (auto Op = MemOpDesc::analyzeMaskedScatterGatherMemOp(F)) {
     return Op;
   }
-  return multi_llvm::None;
+  return std::nullopt;
 }
 
-multi_llvm::Optional<MemOpDesc> MemOpDesc::analyzeMaskedMemOp(Function &F) {
+std::optional<MemOpDesc> MemOpDesc::analyzeMaskedMemOp(Function &F) {
   StringRef MangledName = F.getName();
   compiler::utils::Lexer L(MangledName);
   if (!L.Consume(VectorizationContext::InternalBuiltinPrefix)) {
-    return multi_llvm::None;
+    return std::nullopt;
   }
 
   MemOpDesc Desc;
   if (L.Consume("masked_store")) {
     if (!L.ConsumeInteger(Desc.Alignment)) {
-      return multi_llvm::None;
+      return std::nullopt;
     }
     if (!L.Consume("_")) {
-      return multi_llvm::None;
+      return std::nullopt;
     }
     Desc.IsVLOp = L.Consume("vp_");
     if (F.arg_size() != 3 + (unsigned)Desc.IsVLOp) {
-      return multi_llvm::None;
+      return std::nullopt;
     }
 
     Function::arg_iterator Arg = F.arg_begin();
@@ -495,14 +496,14 @@ multi_llvm::Optional<MemOpDesc> MemOpDesc::analyzeMaskedMemOp(Function &F) {
 
   if (L.Consume("masked_load")) {
     if (!L.ConsumeInteger(Desc.Alignment)) {
-      return multi_llvm::None;
+      return std::nullopt;
     }
     if (!L.Consume("_")) {
-      return multi_llvm::None;
+      return std::nullopt;
     }
     Desc.IsVLOp = L.Consume("vp_");
     if (F.arg_size() != 2 + (unsigned)Desc.IsVLOp) {
-      return multi_llvm::None;
+      return std::nullopt;
     }
 
     Function::arg_iterator Arg = F.arg_begin();
@@ -517,24 +518,23 @@ multi_llvm::Optional<MemOpDesc> MemOpDesc::analyzeMaskedMemOp(Function &F) {
     Desc.AccessKind = MemOpAccessKind::Masked;
     return Desc;
   }
-  return multi_llvm::None;
+  return std::nullopt;
 }
 
-multi_llvm::Optional<MemOpDesc> MemOpDesc::analyzeInterleavedMemOp(
-    Function &F) {
+std::optional<MemOpDesc> MemOpDesc::analyzeInterleavedMemOp(Function &F) {
   StringRef MangledName = F.getName();
   compiler::utils::Lexer L(MangledName);
   if (!L.Consume(VectorizationContext::InternalBuiltinPrefix)) {
-    return multi_llvm::None;
+    return std::nullopt;
   }
   MemOpDesc Desc;
   int ConstantStride{};
   if (L.Consume("interleaved_store")) {
     if (!L.ConsumeInteger(Desc.Alignment)) {
-      return multi_llvm::None;
+      return std::nullopt;
     }
     if (!L.Consume("_")) {
-      return multi_llvm::None;
+      return std::nullopt;
     }
     if (L.ConsumeSignedInteger(ConstantStride)) {
       VECZ_ERROR_IF(F.arg_size() != 2,
@@ -547,10 +547,10 @@ multi_llvm::Optional<MemOpDesc> MemOpDesc::analyzeInterleavedMemOp(
       std::advance(ArgIt, 2);
       Desc.Stride = &*ArgIt;
     } else {
-      return multi_llvm::None;
+      return std::nullopt;
     }
     if (!L.Consume("_")) {
-      return multi_llvm::None;
+      return std::nullopt;
     }
 
     Function::arg_iterator Arg = F.arg_begin();
@@ -566,10 +566,10 @@ multi_llvm::Optional<MemOpDesc> MemOpDesc::analyzeInterleavedMemOp(
 
   if (L.Consume("interleaved_load")) {
     if (!L.ConsumeInteger(Desc.Alignment)) {
-      return multi_llvm::None;
+      return std::nullopt;
     }
     if (!L.Consume("_")) {
-      return multi_llvm::None;
+      return std::nullopt;
     }
     if (L.ConsumeSignedInteger(ConstantStride)) {
       VECZ_ERROR_IF(F.arg_size() != 1,
@@ -582,10 +582,10 @@ multi_llvm::Optional<MemOpDesc> MemOpDesc::analyzeInterleavedMemOp(
       std::advance(ArgIt, 1);
       Desc.Stride = &*ArgIt;
     } else {
-      return multi_llvm::None;
+      return std::nullopt;
     }
     if (!L.Consume("_")) {
-      return multi_llvm::None;
+      return std::nullopt;
     }
 
     Function::arg_iterator Arg = F.arg_begin();
@@ -598,23 +598,22 @@ multi_llvm::Optional<MemOpDesc> MemOpDesc::analyzeInterleavedMemOp(
     return Desc;
   }
 
-  return multi_llvm::None;
+  return std::nullopt;
 }
 
-multi_llvm::Optional<MemOpDesc> MemOpDesc::analyzeMaskedInterleavedMemOp(
-    Function &F) {
+std::optional<MemOpDesc> MemOpDesc::analyzeMaskedInterleavedMemOp(Function &F) {
   StringRef MangledName = F.getName();
   compiler::utils::Lexer L(MangledName);
   if (!L.Consume(VectorizationContext::InternalBuiltinPrefix)) {
-    return multi_llvm::None;
+    return std::nullopt;
   }
   MemOpDesc Desc;
   if (L.Consume("masked_interleaved_store")) {
     if (!L.ConsumeInteger(Desc.Alignment)) {
-      return multi_llvm::None;
+      return std::nullopt;
     }
     if (!L.Consume("_")) {
-      return multi_llvm::None;
+      return std::nullopt;
     }
     Desc.IsVLOp = L.Consume("vp_");
     // KLOCWORK "UNINIT.STACK.MUST" possible false positive
@@ -623,21 +622,21 @@ multi_llvm::Optional<MemOpDesc> MemOpDesc::analyzeMaskedInterleavedMemOp(
     int ConstantStride;
     if (L.ConsumeSignedInteger(ConstantStride)) {
       if (F.arg_size() != 3 + (unsigned)Desc.IsVLOp) {
-        return multi_llvm::None;
+        return std::nullopt;
       }
       Desc.Stride = ConstantInt::get(getSizeTy(*F.getParent()), ConstantStride);
     } else if (L.Consume("V")) {
       if (F.arg_size() != 4 + (unsigned)Desc.IsVLOp) {
-        return multi_llvm::None;
+        return std::nullopt;
       }
       auto ArgIt = F.arg_begin();
       std::advance(ArgIt, 3 + Desc.IsVLOp);
       Desc.Stride = &*ArgIt;
     } else {
-      return multi_llvm::None;
+      return std::nullopt;
     }
     if (!L.Consume("_")) {
-      return multi_llvm::None;
+      return std::nullopt;
     }
 
     Function::arg_iterator Arg = F.arg_begin();
@@ -655,10 +654,10 @@ multi_llvm::Optional<MemOpDesc> MemOpDesc::analyzeMaskedInterleavedMemOp(
   }
   if (L.Consume("masked_interleaved_load")) {
     if (!L.ConsumeInteger(Desc.Alignment)) {
-      return multi_llvm::None;
+      return std::nullopt;
     }
     if (!L.Consume("_")) {
-      return multi_llvm::None;
+      return std::nullopt;
     }
     Desc.IsVLOp = L.Consume("vp_");
     // KLOCWORK "UNINIT.STACK.MUST" possible false positive
@@ -667,21 +666,21 @@ multi_llvm::Optional<MemOpDesc> MemOpDesc::analyzeMaskedInterleavedMemOp(
     int ConstantStride;
     if (L.ConsumeSignedInteger(ConstantStride)) {
       if (F.arg_size() != 2 + (unsigned)Desc.IsVLOp) {
-        return multi_llvm::None;
+        return std::nullopt;
       }
       Desc.Stride = ConstantInt::get(getSizeTy(*F.getParent()), ConstantStride);
     } else if (L.Consume("V")) {
       if (F.arg_size() != 3 + (unsigned)Desc.IsVLOp) {
-        return multi_llvm::None;
+        return std::nullopt;
       }
       auto ArgIt = F.arg_begin();
       std::advance(ArgIt, 2 + Desc.IsVLOp);
       Desc.Stride = &*ArgIt;
     } else {
-      return multi_llvm::None;
+      return std::nullopt;
     }
     if (!L.Consume("_")) {
-      return multi_llvm::None;
+      return std::nullopt;
     }
 
     Function::arg_iterator Arg = F.arg_begin();
@@ -697,26 +696,25 @@ multi_llvm::Optional<MemOpDesc> MemOpDesc::analyzeMaskedInterleavedMemOp(
     return Desc;
   }
 
-  return multi_llvm::None;
+  return std::nullopt;
 }
 
-multi_llvm::Optional<MemOpDesc> MemOpDesc::analyzeScatterGatherMemOp(
-    Function &F) {
+std::optional<MemOpDesc> MemOpDesc::analyzeScatterGatherMemOp(Function &F) {
   StringRef MangledName = F.getName();
   compiler::utils::Lexer L(MangledName);
   if (!L.Consume(VectorizationContext::InternalBuiltinPrefix)) {
-    return multi_llvm::None;
+    return std::nullopt;
   }
   MemOpDesc Desc;
   if (L.Consume("scatter_store")) {
     if (!L.ConsumeInteger(Desc.Alignment)) {
-      return multi_llvm::None;
+      return std::nullopt;
     }
     if (!L.Consume("_")) {
-      return multi_llvm::None;
+      return std::nullopt;
     }
     if (F.arg_size() != 2) {
-      return multi_llvm::None;
+      return std::nullopt;
     }
 
     Function::arg_iterator Arg = F.arg_begin();
@@ -732,13 +730,13 @@ multi_llvm::Optional<MemOpDesc> MemOpDesc::analyzeScatterGatherMemOp(
 
   if (L.Consume("gather_load")) {
     if (!L.ConsumeInteger(Desc.Alignment)) {
-      return multi_llvm::None;
+      return std::nullopt;
     }
     if (!L.Consume("_")) {
-      return multi_llvm::None;
+      return std::nullopt;
     }
     if (F.arg_size() != 1) {
-      return multi_llvm::None;
+      return std::nullopt;
     }
 
     Function::arg_iterator Arg = F.arg_begin();
@@ -751,28 +749,28 @@ multi_llvm::Optional<MemOpDesc> MemOpDesc::analyzeScatterGatherMemOp(
     return Desc;
   }
 
-  return multi_llvm::None;
+  return std::nullopt;
 }
 
-multi_llvm::Optional<MemOpDesc> MemOpDesc::analyzeMaskedScatterGatherMemOp(
+std::optional<MemOpDesc> MemOpDesc::analyzeMaskedScatterGatherMemOp(
     Function &F) {
   StringRef MangledName = F.getName();
   compiler::utils::Lexer L(MangledName);
   if (!L.Consume(VectorizationContext::InternalBuiltinPrefix)) {
-    return multi_llvm::None;
+    return std::nullopt;
   }
 
   MemOpDesc Desc;
   if (L.Consume("masked_scatter_store")) {
     if (!L.ConsumeInteger(Desc.Alignment)) {
-      return multi_llvm::None;
+      return std::nullopt;
     }
     if (!L.Consume("_")) {
-      return multi_llvm::None;
+      return std::nullopt;
     }
     Desc.IsVLOp = L.Consume("vp_");
     if (F.arg_size() != 3 + (unsigned)Desc.IsVLOp) {
-      return multi_llvm::None;
+      return std::nullopt;
     }
 
     Function::arg_iterator Arg = F.arg_begin();
@@ -791,14 +789,14 @@ multi_llvm::Optional<MemOpDesc> MemOpDesc::analyzeMaskedScatterGatherMemOp(
 
   if (L.Consume("masked_gather_load")) {
     if (!L.ConsumeInteger(Desc.Alignment)) {
-      return multi_llvm::None;
+      return std::nullopt;
     }
     if (!L.Consume("_")) {
-      return multi_llvm::None;
+      return std::nullopt;
     }
     Desc.IsVLOp = L.Consume("vp_");
     if (F.arg_size() != 2 + (unsigned)Desc.IsVLOp) {
-      return multi_llvm::None;
+      return std::nullopt;
     }
 
     Function::arg_iterator Arg = F.arg_begin();
@@ -814,12 +812,12 @@ multi_llvm::Optional<MemOpDesc> MemOpDesc::analyzeMaskedScatterGatherMemOp(
     return Desc;
   }
 
-  return multi_llvm::None;
+  return std::nullopt;
 }
 
 ////////////////////////////////////////////////////////////////////////////////
 
-multi_llvm::Optional<MemOp> MemOp::get(llvm::Instruction *I) {
+std::optional<MemOp> MemOp::get(llvm::Instruction *I) {
   if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
     MemOpDesc Desc;
     Desc.Kind = MemOpKind::LoadInstruction;
@@ -848,19 +846,19 @@ multi_llvm::Optional<MemOp> MemOp::get(llvm::Instruction *I) {
       }
     }
   }
-  return multi_llvm::None;
+  return std::nullopt;
 }
 
-multi_llvm::Optional<MemOp> MemOp::get(llvm::CallInst *CI,
-                                       MemOpAccessKind AccessKind) {
+std::optional<MemOp> MemOp::get(llvm::CallInst *CI,
+                                MemOpAccessKind AccessKind) {
   if (!CI->getCalledFunction()) {
-    return multi_llvm::None;
+    return std::nullopt;
   }
-  multi_llvm::Optional<MemOpDesc> Desc;
+  std::optional<MemOpDesc> Desc;
   if (Function *Caller = CI->getCalledFunction()) {
     switch (AccessKind) {
       default:
-        return multi_llvm::None;
+        return std::nullopt;
       case MemOpAccessKind::Masked:
         Desc = MemOpDesc::analyzeMaskedMemOp(*Caller);
         break;
@@ -879,7 +877,7 @@ multi_llvm::Optional<MemOp> MemOp::get(llvm::CallInst *CI,
     }
   }
   if (!Desc) {
-    return multi_llvm::None;
+    return std::nullopt;
   }
   return MemOp(CI, *Desc);
 }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp
index 15a85bff5b5b8..1f662ffe7862a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp
@@ -22,6 +22,7 @@
 #include <llvm/Support/Debug.h>
 #include <llvm/Support/raw_ostream.h>
 #include <llvm/Transforms/Utils/Local.h>
+#include <multi_llvm/llvm_version.h>
 
 #include "debugging.h"
 #include "transform/passes.h"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
index 9b60d4dc64467..b49ccc842760c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
@@ -31,7 +31,6 @@
 #include <llvm/IR/IRBuilder.h>
 #include <llvm/Support/Debug.h>
 #include <llvm/Support/raw_ostream.h>
-#include <multi_llvm/optional_helper.h>
 
 #include <queue>
 #include <utility>
@@ -1099,7 +1098,7 @@ bool ControlFlowConversionState::Impl::applyMask(BasicBlock &BB, Value *mask) {
     if (tryApplyMaskToBinOp(I, mask, toDelete, safeDivisors)) {
       continue;
     }
-    multi_llvm::Optional<MemOp> memOp = MemOp::get(&I);
+    std::optional<MemOp> memOp = MemOp::get(&I);
     // Turn loads and stores into masked loads and stores.
     if (memOp && (memOp->isLoad() || memOp->isStore())) {
       if (!tryApplyMaskToMemOp(*memOp, mask, toDelete)) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/interleaved_group_combine_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/interleaved_group_combine_pass.cpp
index 2cca07c57c088..d1cbc37410c5d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/interleaved_group_combine_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/interleaved_group_combine_pass.cpp
@@ -23,6 +23,8 @@
 #include <llvm/Support/Debug.h>
 #include <llvm/Transforms/Utils/Local.h>
 
+#include <optional>
+
 #include "analysis/uniform_value_analysis.h"
 #include "analysis/vectorization_unit_analysis.h"
 #include "debugging.h"
@@ -259,7 +261,7 @@ PreservedAnalyses InterleavedGroupCombinePass::run(
         continue;
       }
 
-      multi_llvm::Optional<MemOp> Op = MemOp::get(CI);
+      std::optional<MemOp> Op = MemOp::get(CI);
       // We can't optimize interleaved memops if we don't know the stride at
       // runtime, since we need to check if the stride and the group size match.
       if (!Op || !Op->isStrideConstantInt()) {
@@ -320,7 +322,7 @@ PreservedAnalyses InterleavedGroupCombinePass::run(
             Group.Kind == eMaskedInterleavedLoad) {
           Masks.reserve(Group.Data.size());
           for (auto *V : Group.Data) {
-            multi_llvm::Optional<MemOp> Op = MemOp::get(cast<Instruction>(V));
+            std::optional<MemOp> Op = MemOp::get(cast<Instruction>(V));
             assert(Op && "Unanalyzable interleaved access?");
             Masks.push_back(Op->getMaskOperand());
           }
@@ -449,7 +451,7 @@ bool InterleavedGroupCombinePass::findGroup(
           CanMove = canMoveUp(Group.Data, cast<Instruction>(InfoN.Op));
 
           if (InfoN.Kind == eMaskedInterleavedLoad) {
-            multi_llvm::Optional<MemOp> Op = MemOp::get(InfoN.Op);
+            std::optional<MemOp> Op = MemOp::get(InfoN.Op);
             assert(Op && "Unanalyzable load?");
             if (auto *MaskInst = dyn_cast<Instruction>(Op->getMaskOperand())) {
               CanMove &= Group.canDeinterleaveMask(*MaskInst);
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/passes.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/passes.cpp
index bab0daefa5bea..684082f7a2411 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/passes.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/passes.cpp
@@ -86,7 +86,7 @@ PreservedAnalyses SimplifyMaskedMemOpsPass::run(Function &F,
   TargetInfo &VTI = Ctx.targetInfo();
   std::vector<Instruction *> ToDelete;
   for (Function &Builtin : F.getParent()->functions()) {
-    multi_llvm::Optional<MemOpDesc> BuiltinDesc =
+    std::optional<MemOpDesc> BuiltinDesc =
         MemOpDesc::analyzeMaskedMemOp(Builtin);
     if (!BuiltinDesc) {
       continue;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
index c6aa7c7969caa..e6fda0c73117c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
@@ -369,14 +369,14 @@ Function *VectorizationContext::getOrCreateMaskedFunction(CallInst *CI) {
 }
 
 namespace {
-multi_llvm::Optional<std::tuple<bool, RecurKind, bool>> isSubgroupScan(
+std::optional<std::tuple<bool, RecurKind, bool>> isSubgroupScan(
     StringRef fnName, Type *const ty) {
   compiler::utils::Lexer L(fnName);
   if (!L.Consume(VectorizationContext::InternalBuiltinPrefix)) {
-    return multi_llvm::None;
+    return std::nullopt;
   }
   if (!L.Consume("sub_group_scan_")) {
-    return multi_llvm::None;
+    return std::nullopt;
   }
   bool isInt = ty->isIntOrIntVectorTy();
   bool isInclusive = L.Consume("inclusive_");
@@ -412,13 +412,13 @@ multi_llvm::Optional<std::tuple<bool, RecurKind, bool>> isSubgroupScan(
         opKind = RecurKind::Xor;
         assert(isInt && "unexpected internal scan builtin");
       } else {
-        return multi_llvm::None;
+        return std::nullopt;
       }
       bool isVP = L.Consume("_vp");
       return std::make_tuple(isInclusive, opKind, isVP);
     }
   }
-  return multi_llvm::None;
+  return std::nullopt;
 }
 };  // namespace
 
@@ -426,8 +426,7 @@ bool VectorizationContext::defineInternalBuiltin(Function *F) {
   assert(F->isDeclaration() && "builtin is already defined");
 
   // Handle masked memory loads and stores.
-  if (multi_llvm::Optional<MemOpDesc> Desc =
-          MemOpDesc::analyzeMemOpFunction(*F)) {
+  if (std::optional<MemOpDesc> Desc = MemOpDesc::analyzeMemOpFunction(*F)) {
     if (Desc->isMaskedMemOp()) {
       return emitMaskedMemOpBody(*F, *Desc);
     }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp
index 09a5ee237d486..d64675142518c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp
@@ -43,6 +43,7 @@
 #include <llvm/Target/TargetLoweringObjectFile.h>
 #include <llvm/Target/TargetMachine.h>
 #include <llvm/Target/TargetOptions.h>
+#include <multi_llvm/llvm_version.h>
 
 #include <string>
 

From c0e10f56f228119512504ddb7f85ef62feaabe8c Mon Sep 17 00:00:00 2001
From: Fraser Cormack <frasercrmck@gmail.com>
Date: Tue, 15 Aug 2023 17:21:24 +0100
Subject: [PATCH 028/182] [vecz] Vectorize sub-group local ID on top of mux
 sub-groups

Mux sub-groups represent hardware sub-groups (groups of invocations of
the kernel working together), whereas vecz vectorizes *on top* of those
(groups of work-items working together at a software level on *one*
invocation). This should essentially multiply the apparent sub-group
size by the vectorization factor.

The vectorizer is not currently obeying this paradigm, instead replacing
the mux sub-groups with its own and confusing the concept of a mux
sub-group with a vectorized operation. Since we only support a mux
sub-group size of 1 in all our targets, we can just about get away with
this.

We aim to bring the separate concepts of mux sub-groups and vectorized
sub-groups together over a series of commits, and this is the first.

The sub-group local ID is now vectorized over mux sub-groups, by
multiplying the mux sub-group local ID by the vectorization factor, and
adding the step vector to that.
---
 .../inline_post_vectorization_pass.cpp        |  9 -------
 .../vecz/source/transform/packetizer.cpp      | 25 ++++++++++++-------
 .../llvm/ScalableVectors/subgroup_builtins.ll | 13 ++++++++--
 .../vecz/test/lit/llvm/subgroup_broadcast.ll  |  5 +++-
 .../vecz/test/lit/llvm/subgroup_builtins.ll   |  9 ++++++-
 5 files changed, 39 insertions(+), 22 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/inline_post_vectorization_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/inline_post_vectorization_pass.cpp
index 65c4c120242b6..0d93f99c1cf40 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/inline_post_vectorization_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/inline_post_vectorization_pass.cpp
@@ -73,15 +73,6 @@ Value *processCallSite(CallInst *CI, bool &NeedLLVMInline,
     }
   }
 
-  // Vectorized uses of the subgroup local id will have been replaced with step
-  // vectors starting from zero. Uniform uses should be replaced with zero in
-  // order to maintain equivalence between the scalar/vector forms. Do this
-  // here due to a tight coupling between the vectorized version and these
-  // remaining scalar versions.
-  if (Builtin.ID == compiler::utils::eMuxBuiltinGetSubGroupLocalId) {
-    return ConstantInt::getNullValue(CI->getType());
-  }
-
   return CI;
 }
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
index 8786308f999e5..2b04011f9daca 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -2443,14 +2443,6 @@ Value *Packetizer::Impl::vectorizeCall(CallInst *CI) {
     return nullptr;
   }
   if (Builtin.properties & compiler::utils::eBuiltinPropertyWorkItem) {
-    // The subgroup ID is just a simple index sequence. There is no dimension
-    // to it, and we only support 1D workgroups.
-    if (Builtin.ID == compiler::utils::eMuxBuiltinGetSubGroupLocalId) {
-      IRBuilder<> B(buildAfter(CI, F));
-      return multi_llvm::createIndexSequence(
-          B, VectorType::get(CI->getType(), SimdWidth), SimdWidth,
-          "subgroup.local.id");
-    }
     return vectorizeWorkGroupCall(CI, Builtin);
   }
 
@@ -2561,8 +2553,23 @@ Value *Packetizer::Impl::vectorizeWorkGroupCall(
   // Do not vectorize ranks equal to vectorization dimension. The value of
   // get_global_id with other ranks is uniform.
 
+  Value *IDToSplat = CI;
+  // Multiply the sub-group local ID by the vectorization factor, to vectorize
+  // across the entire sub-group size.
+  // For example, with a vector width of 4 and a mux sub-group size of 2, the
+  // apparent sub-group size is 8 and the sub-group IDs are:
+  // | mux sub group 0 | mux sub group 1 |
+  // |-----------------|-----------------|
+  // |  0   1   2   3  |  4   5   6   7  |
+  if (Builtin.ID == compiler::utils::eMuxBuiltinGetSubGroupLocalId) {
+    auto SimdWithAsVal = B.getInt32(SimdWidth.getKnownMinValue());
+    IDToSplat = B.CreateMul(IDToSplat, !SimdWidth.isScalable()
+                                           ? SimdWithAsVal
+                                           : B.CreateVScale(SimdWithAsVal));
+  }
+
   // Broadcast the builtin's return value.
-  Value *Splat = B.CreateVectorSplat(SimdWidth, CI);
+  Value *Splat = B.CreateVectorSplat(SimdWidth, IDToSplat);
 
   // Add an index sequence [0, 1, 2, ...] to the value unless uniform.
   auto const Uniformity = Builtin.uniformity;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll
index 1f14ab3d42c23..7f3876335077f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll
@@ -44,8 +44,17 @@ define spir_kernel void @get_sub_group_local_id(i32 addrspace(1)* %in, i32 addrs
   store i32 %call, i32 addrspace(1)* %arrayidx, align 4
   ret void
 ; CHECK-LABEL: define spir_kernel void @__vecz_nxv4_get_sub_group_local_id(
-; CHECK: [[LID:%.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
-; CHECK: store <vscale x 4 x i32> [[LID]], ptr addrspace(1) %out
+; CHECK: %call = tail call spir_func i32 @__mux_get_sub_group_local_id()
+; CHECK: [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK: [[SHL:%.*]] = shl i32 %1, 2
+; CHECK: [[MUL:%.*]] = mul i32 %call, [[SHL]]
+; CHECK: [[SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[MUL]], i64 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK: [[STEPVEC:%.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; CHECK: [[LID:%.*]] = add <vscale x 4 x i32> [[SPLAT]], [[STEPVEC]]
+; CHECK: [[EXT:%.*]] = sext i32 %call to i64
+; CHECK: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %out, i64 [[EXT]]
+; CHECK: store <vscale x 4 x i32> [[LID]], ptr addrspace(1) %arrayidx
 }
 
 define spir_kernel void @sub_group_broadcast(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_broadcast.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_broadcast.ll
index 8e297e4ee4134..2344f68691f2f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_broadcast.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_broadcast.ll
@@ -39,4 +39,7 @@ define spir_kernel void @sub_group_broadcast(i32 addrspace(1)* %in, i32 addrspac
 ; CHECK: [[LD:%.+]] = load i32, ptr addrspace(1) %{{.+}}, align 4
 ; CHECK: [[INS:%.+]] = insertelement <4 x i32> poison, i32 [[LD]], i64 0
 ; CHECK: [[BCAST:%.+]] = shufflevector <4 x i32> [[INS]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK: store <4 x i32> [[BCAST]], ptr addrspace(1) %out, align 4
+; CHECK: %idx = tail call i32 @__mux_get_sub_group_local_id()
+; CHECK: [[EXT:%.*]] = sext i32 %idx to i64
+; CHECK: %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 [[EXT]]
+; CHECK: store <4 x i32> [[BCAST]], ptr addrspace(1) %arrayidx2, align 4
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll
index 5dfcca1e82f05..c4a74592815f9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll
@@ -43,7 +43,14 @@ define spir_kernel void @get_sub_group_local_id(i32 addrspace(1)* %in, i32 addrs
   store i32 %call, i32 addrspace(1)* %arrayidx, align 4
   ret void
 ; CHECK-LABEL: define spir_kernel void @__vecz_v4_get_sub_group_local_id(
-; CHECK: store <4 x i32> <i32 0, i32 1, i32 2, i32 3>, ptr addrspace(1) %out
+; CHECK: %call = tail call spir_func i32 @__mux_get_sub_group_local_id()
+; CHECK: [[MUL:%.*]] = shl i32 %call, 2
+; CHECK: [[SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[MUL]], i64 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK: [[ID:%.*]] = or <4 x i32> [[SPLAT]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK: [[EXT:%.*]] = sext i32 %call to i64
+; CHECK: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %out, i64 [[EXT]]
+; CHECK: store <4 x i32> [[ID]], ptr addrspace(1) %arrayidx
 }
 
 define spir_kernel void @sub_group_broadcast(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {

From 5c08537b947b1c43d8f3f7da4d8b67f828014a8f Mon Sep 17 00:00:00 2001
From: Fraser Cormack <frasercrmck@gmail.com>
Date: Tue, 15 Aug 2023 18:01:38 +0100
Subject: [PATCH 029/182] [vecz] Vectorize sub-group reductions on top of mux
 sub-groups

A reduction across the full apparent sub-group is a reduction over the
vectorization factor followed by a reduction of that value across the
mux sub-group.
---
 .../vecz/source/transform/packetizer.cpp      | 41 ++++++++-----------
 .../VectorPredication/subgroup_reductions.ll  | 33 ++++++++++-----
 ...ions_spv_khr_uniform_group_instructions.ll | 28 +++++++++----
 .../vecz/test/lit/llvm/subgroup_reductions.ll | 40 +++++++++++-------
 ...ions_spv_khr_uniform_group_instructions.ll | 33 +++++++++------
 5 files changed, 105 insertions(+), 70 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
index 2b04011f9daca..56d152e8ac9ee 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -190,13 +190,13 @@ class Packetizer::Impl : public Packetizer {
   ///
   /// @return Packetized instruction.
   Value *packetizeMaskVarying(Instruction *I);
-  /// @brief Packetize a mask-varying subgroup reduction.
+  /// @brief Packetize a mask-varying subgroup/workgroup reduction.
   ///
   /// @param[in] I Instruction to packetize.
   ///
   /// @return Packetized instruction.
-  Value *packetizeSubgroupReduction(Instruction *I);
-  /// @brief Packetize a subgroup broadcast.
+  Value *packetizeGroupReduction(Instruction *I);
+  /// @brief Packetize a subgroup/workgroup broadcast.
   ///
   /// @param[in] I Instruction to packetize.
   ///
@@ -881,7 +881,7 @@ Packetizer::Result Packetizer::Impl::packetize(Value *V) {
     return getPacketized(Ins);
   }
 
-  if (auto *reduction = packetizeSubgroupReduction(Ins)) {
+  if (auto *reduction = packetizeGroupReduction(Ins)) {
     return broadcast(reduction);
   }
 
@@ -1074,7 +1074,7 @@ Packetizer::Result Packetizer::Impl::packetizeInstruction(Instruction *Ins) {
   return Packetizer::Result(*this, Ins, nullptr);
 }
 
-Value *Packetizer::Impl::packetizeSubgroupReduction(Instruction *I) {
+Value *Packetizer::Impl::packetizeGroupReduction(Instruction *I) {
   auto *const CI = dyn_cast<CallInst>(I);
   if (!CI || !CI->getCalledFunction()) {
     return nullptr;
@@ -1122,18 +1122,19 @@ Value *Packetizer::Impl::packetizeSubgroupReduction(Instruction *I) {
     Value *&val = opPackets.front();
     val = sanitizeVPReductionInput(B, val, VL, Info->Recurrence);
     if (!val) {
-      emitVeczRemarkMissed(&F, CI,
-                           "Can not vector-predicate subgroup reduction");
+      emitVeczRemarkMissed(
+          &F, CI, "Can not vector-predicate workgroup/subgroup reduction");
       return nullptr;
     }
   }
 
   // According to the OpenCL Spec, we are allowed to rearrange the operation
-  // order of a subgroup reduction any way we like (even though floating point
-  // addition is not associative so might not produce exactly the same result),
-  // so we reduce to a single vector first, if necessary, and then do a single
-  // reduction to scalar. This is more efficient than doing multiple reductions
-  // to scalar and then BinOp'ing multiple scalars together.
+  // order of a workgroup/subgroup reduction any way we like (even though
+  // floating point addition is not associative so might not produce exactly
+  // the same result), so we reduce to a single vector first, if necessary, and
+  // then do a single reduction to scalar. This is more efficient than doing
+  // multiple reductions to scalar and then BinOp'ing multiple scalars
+  // together.
   //
   // Reduce to a single vector.
   while ((packetWidth >>= 1)) {
@@ -1149,18 +1150,12 @@ Value *Packetizer::Impl::packetizeSubgroupReduction(Instruction *I) {
   Value *v =
       createSimpleTargetReduction(B, &TTI, opPackets.front(), Info->Recurrence);
 
-  if (isWorkGroup) {
-    // For a work group operation, we leave the original reduction function and
-    // divert the subgroup reduction through it, giving us a work group
-    // reduction over subgroup reductions.
-    CI->setOperand(argIdx, v);
-    v = CI;
-  } else {
-    IC.deleteInstructionLater(CI);
-    CI->replaceAllUsesWith(v);
-  }
+  // We leave the original reduction function and divert the vectorized
+  // reduction through it, giving us a reduction over the full apparent
+  // sub-group or work-group size (vecz * mux).
+  CI->setOperand(argIdx, v);
 
-  return v;
+  return CI;
 }
 
 Value *Packetizer::Impl::packetizeSubgroupBroadcast(Instruction *I) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions.ll
index 7cc1b0ec0c5c9..e4f95885c58b8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions.ll
@@ -56,7 +56,8 @@ entry:
 ; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i1> [[T2]]
 ; CHECK: [[T3:%.*]] = bitcast <4 x i1> [[I]] to i4
 ; CHECK: [[R:%.*]] = icmp eq i4 [[T3]], -1
-; CHECK: [[EXT:%.*]] = sext i1 [[R]] to i32
+; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_all_i1(i1 [[R]])
+; CHECK: [[EXT:%.*]] = sext i1 %call2 to i32
 ; CHECK: store i32 [[EXT]], ptr addrspace(1) {{%.*}}, align 4
 }
 
@@ -81,7 +82,8 @@ entry:
 ; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i1> [[T2]], <4 x i1> zeroinitializer
 ; CHECK: [[T3:%.*]] = bitcast <4 x i1> [[I]] to i4
 ; CHECK: [[R:%.*]] = icmp ne i4 [[T3]], 0
-; CHECK: [[EXT:%.*]] = sext i1 [[R]] to i32
+; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_any_i1(i1 [[R]])
+; CHECK: [[EXT:%.*]] = sext i1 %call2 to i32
 ; CHECK: store i32 [[EXT]], ptr addrspace(1) {{%.*}}, align 4
 }
 
@@ -102,7 +104,8 @@ entry:
 ; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], <i32 0, i32 1, i32 2, i32 3>
 ; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i32> {{%.*}}, <4 x i32> zeroinitializer
 ; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[I]])
-; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
+; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_add_i32(i32 [[R]])
+; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4
 }
 
 define spir_kernel void @reduce_add_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %out) {
@@ -122,7 +125,8 @@ entry:
 ; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], <i32 0, i32 1, i32 2, i32 3>
 ; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i64> {{%.*}}, <4 x i64> zeroinitializer
 ; CHECK: [[R:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[I]])
-; CHECK: store i64 [[R]], ptr addrspace(1) {{%.*}}, align 4
+; CHECK: %call2 = tail call spir_func i64 @__mux_sub_group_reduce_add_i64(i64 [[R]])
+; CHECK: store i64 %call2, ptr addrspace(1) {{%.*}}, align 4
 }
 
 define spir_kernel void @reduce_add_f32(float addrspace(1)* %in, float addrspace(1)* %out) {
@@ -142,7 +146,8 @@ entry:
 ; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], <i32 0, i32 1, i32 2, i32 3>
 ; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x float> {{%.*}}, <4 x float> <float -0.000000e+00, float -0.000000e+00,
 ; CHECK: [[R:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[I]])
-; CHECK: store float [[R]], ptr addrspace(1) {{%.*}}, align 4
+; CHECK: %call2 = tail call spir_func float @__mux_sub_group_reduce_fadd_f32(float [[R]])
+; CHECK: store float %call2, ptr addrspace(1) {{%.*}}, align 4
 }
 
 define spir_kernel void @reduce_smin_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
@@ -162,7 +167,8 @@ entry:
 ; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], <i32 0, i32 1, i32 2, i32 3>
 ; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i32> {{%.*}}, <4 x i32> <i32 2147483647, i32 2147483647, 
 ; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[I]])
-; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
+; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_smin_i32(i32 [[R]])
+; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4
 }
 
 define spir_kernel void @reduce_umin_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
@@ -182,7 +188,8 @@ entry:
 ; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], <i32 0, i32 1, i32 2, i32 3>
 ; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i32> {{%.*}}, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
 ; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[I]])
-; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
+; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_umin_i32(i32 [[R]])
+; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4
 }
 
 define spir_kernel void @reduce_smax_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
@@ -202,7 +209,8 @@ entry:
 ; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], <i32 0, i32 1, i32 2, i32 3>
 ; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i32> {{%.*}}, <4 x i32> <i32 -2147483648, i32 -2147483648, 
 ; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[I]])
-; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
+; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_smax_i32(i32 [[R]])
+; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4
 }
 
 define spir_kernel void @reduce_umax_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
@@ -222,7 +230,8 @@ entry:
 ; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], <i32 0, i32 1, i32 2, i32 3>
 ; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i32> {{%.*}}, <4 x i32> zeroinitializer
 ; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[I]])
-; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
+; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_umax_i32(i32 [[R]])
+; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4
 }
 
 define spir_kernel void @reduce_fmin_f32(float addrspace(1)* %in, float addrspace(1)* %out) {
@@ -242,7 +251,8 @@ entry:
 ; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], <i32 0, i32 1, i32 2, i32 3>
 ; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x float> {{%.*}}, <4 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000,
 ; CHECK: [[R:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[I]])
-; CHECK: store float [[R]], ptr addrspace(1) {{%.*}}, align 4
+; CHECK: %call2 = tail call spir_func float @__mux_sub_group_reduce_fmin_f32(float [[R]])
+; CHECK: store float %call2, ptr addrspace(1) {{%.*}}, align 4
 }
 
 define spir_kernel void @reduce_fmax_f32(float addrspace(1)* %in, float addrspace(1)* %out) {
@@ -262,5 +272,6 @@ entry:
 ; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], <i32 0, i32 1, i32 2, i32 3>
 ; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x float> {{%.*}}, <4 x float> <float 0xFFF8000000000000, float 0xFFF8000000000000,
 ; CHECK: [[R:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[I]])
-; CHECK: store float [[R]], ptr addrspace(1) {{%.*}}, align 4
+; CHECK: %call2 = tail call spir_func float @__mux_sub_group_reduce_fmax_f32(float [[R]])
+; CHECK: store float %call2, ptr addrspace(1) {{%.*}}, align 4
 }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions_spv_khr_uniform_group_instructions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions_spv_khr_uniform_group_instructions.ll
index 032074917d73f..357d6bd0d0143 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions_spv_khr_uniform_group_instructions.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions_spv_khr_uniform_group_instructions.ll
@@ -40,7 +40,8 @@ declare spir_func i1 @__mux_sub_group_reduce_logical_xor_i1(i1)
 ; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], <i32 0, i32 1, i32 2, i32 3>
 ; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i32> {{%.*}}, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
 ; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[I]])
-; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
+; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_mul_i32(i32 [[R]])
+; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_mul_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
   %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
@@ -60,7 +61,8 @@ entry:
 ; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], <i32 0, i32 1, i32 2, i32 3>
 ; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i64> {{%.*}}, <4 x i64> <i64 1, i64 1, i64 1, i64 1>
 ; CHECK: [[R:%.*]] = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> [[I]])
-; CHECK: store i64 [[R]], ptr addrspace(1) {{%.*}}, align 4
+; CHECK: %call2 = tail call spir_func i64 @__mux_sub_group_reduce_mul_i64(i64 [[R]])
+; CHECK: store i64 %call2, ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_mul_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
   %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
@@ -80,7 +82,8 @@ entry:
 ; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], <i32 0, i32 1, i32 2, i32 3>
 ; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x float> {{%.*}}, <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
 ; CHECK: [[R:%.*]] = call float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[I]])
-; CHECK: store float [[R]], ptr addrspace(1) {{%.*}}, align 4
+; CHECK: %call2 = tail call spir_func float @__mux_sub_group_reduce_fmul_f32(float [[R]])
+; CHECK: store float %call2, ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_mul_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
   %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
@@ -100,7 +103,8 @@ entry:
 ; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], <i32 0, i32 1, i32 2, i32 3>
 ; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i32> {{%.*}}, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1> 
 ; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[I]])
-; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
+; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_and_i32(i32 [[R]])
+; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_and_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
   %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
@@ -120,7 +124,8 @@ entry:
 ; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], <i32 0, i32 1, i32 2, i32 3>
 ; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i32> {{%.*}}, <4 x i32> zeroinitializer
 ; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[I]])
-; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
+; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_or_i32(i32 [[R]])
+; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_or_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
   %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
@@ -140,7 +145,8 @@ entry:
 ; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], <i32 0, i32 1, i32 2, i32 3>
 ; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i64> {{%.*}}, <4 x i64> zeroinitializer
 ; CHECK: [[R:%.*]] = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> [[I]])
-; CHECK: store i64 [[R]], ptr addrspace(1) {{%.*}}, align 4
+; CHECK: %call2 = tail call spir_func i64 @__mux_sub_group_reduce_xor_i64(i64 [[R]])
+; CHECK: store i64 %call2, ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_xor_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
   %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
@@ -157,7 +163,8 @@ entry:
 ; CHECK-LABEL: @__vecz_v4_vp_reduce_logical_and(
 ; This doesn't generate a reduction intrinsic...
 ; CHECK: [[T:%.*]] = icmp eq i4 {{%.*}}, -1
-; CHECK: [[R:%.*]] = zext i1 [[T]] to i32
+; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_and_i1(i1 [[T]])
+; CHECK: [[R:%.*]] = zext i1 %call2 to i32
 ; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_logical_and(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
@@ -176,7 +183,8 @@ entry:
 
 ; CHECK-LABEL: @__vecz_v4_vp_reduce_logical_or(
 ; CHECK: [[T:%.*]] = icmp ne i4 {{%.*}}, 0
-; CHECK: [[R:%.*]] = zext i1 [[T]] to i32
+; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_or_i1(i1 [[T]])
+; CHECK: [[R:%.*]] = zext i1 %call2 to i32
 ; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_logical_or(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
@@ -196,7 +204,9 @@ entry:
 ; CHECK-LABEL: @__vecz_v4_vp_reduce_logical_xor(
 ; CHECK: [[X:%.*]] = call i4 @llvm.ctpop.i4(i4 {{%.*}})
 ; CHECK: [[T:%.*]] = and i4 [[X]], 1
-; CHECK: [[R:%.*]] = zext i4 [[T]] to i32
+; CHECK: [[C:%.*]] = icmp ne i4 [[T]], 0
+; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_xor_i1(i1 [[C]])
+; CHECK: [[R:%.*]] = zext i1 %call2 to i32
 ; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_logical_xor(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions.ll
index 7d27f2dc1cce8..0bcb01081a3fb 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions.ll
@@ -53,8 +53,8 @@ entry:
 
 ; CHECK: [[T3:%.*]] = bitcast <4 x i1> [[T2]] to i4
 ; CHECK: [[R:%.*]] = icmp eq i4 [[T3]], 0
-
-; CHECK: [[EXT:%.*]] = sext i1 [[R]] to i32
+; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_all_i1(i1 [[R]])
+; CHECK: [[EXT:%.*]] = sext i1 %call2 to i32
 ; CHECK: store i32 [[EXT]], ptr addrspace(1) {{%.*}}, align 4
 }
 
@@ -76,8 +76,8 @@ entry:
 
 ; CHECK: [[T3:%.*]] = bitcast <4 x i1> [[T2]] to i4
 ; CHECK: [[R:%.*]] = icmp ne i4 [[T3]], 0
-
-; CHECK: [[EXT:%.*]] = sext i1 [[R]] to i32
+; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_any_i1(i1 [[R]])
+; CHECK: [[EXT:%.*]] = sext i1 %call2 to i32
 ; CHECK: store i32 [[EXT]], ptr addrspace(1) {{%.*}}, align 4
 }
 
@@ -94,7 +94,8 @@ entry:
   ret void
 ; CHECK-LABEL: @__vecz_v4_reduce_add_i32(
 ; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %{{.*}})
-; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
+; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_add_i32(i32 [[R]])
+; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4
 }
 
 ; Given we've checked a "full" expanded reduction sequence above for LLVM < 13,
@@ -112,8 +113,9 @@ entry:
 ; intrinsic. LLVM 10 does the shift-left in a vector, LLVMs 11 and 12 do it in
 ; scalar.
 ; CHECK: [[CALL:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> {{%.*}})
-; CHECK: [[INS:%.*]] = insertelement <4 x i32> {{(undef|poison)}}, i32 [[CALL]], {{(i32|i64)}} 0
-; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[INS]], <4 x i32> {{(undef|poison)}}, <4 x i32> zeroinitializer
+; CHECK: %call1 = tail call spir_func i32 @__mux_sub_group_reduce_add_i32(i32 [[CALL]])
+; CHECK: [[INS:%.*]] = insertelement <4 x i32> {{(undef|poison)}}, i32 %call1, {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[INS]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK: store <4 x i32> [[SPLAT]],
 }
 
@@ -130,7 +132,8 @@ entry:
   ret void
 ; CHECK-LABEL: @__vecz_v4_reduce_add_i64(
 ; CHECK: [[R:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %{{.*}})
-; CHECK: store i64 [[R]], ptr addrspace(1) {{%.*}}, align 4
+; CHECK: %call2 = tail call spir_func i64 @__mux_sub_group_reduce_add_i64(i64 [[R]])
+; CHECK: store i64 %call2, ptr addrspace(1) {{%.*}}, align 4
 }
 
 define spir_kernel void @reduce_add_f32(float addrspace(1)* %in, float addrspace(1)* %out) {
@@ -146,7 +149,8 @@ entry:
   ret void
 ; CHECK-LABEL: @__vecz_v4_reduce_add_f32(
 ; CHECK: [[R:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> %{{.*}})
-; CHECK: store float [[R]], ptr addrspace(1) {{%.*}}, align 4
+; CHECK: %call2 = tail call spir_func float @__mux_sub_group_reduce_fadd_f32(float [[R]])
+; CHECK: store float %call2, ptr addrspace(1) {{%.*}}, align 4
 }
 
 define spir_kernel void @reduce_smin_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
@@ -162,7 +166,8 @@ entry:
   ret void
 ; CHECK-LABEL: @__vecz_v4_reduce_smin_i32(
 ; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %{{.*}})
-; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
+; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_smin_i32(i32 [[R]])
+; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4
 }
 
 define spir_kernel void @reduce_umin_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
@@ -178,7 +183,8 @@ entry:
   ret void
 ; CHECK-LABEL: @__vecz_v4_reduce_umin_i32(
 ; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %{{.*}})
-; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
+; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_umin_i32(i32 [[R]])
+; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4
 }
 
 define spir_kernel void @reduce_smax_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
@@ -194,7 +200,8 @@ entry:
   ret void
 ; CHECK-LABEL: @__vecz_v4_reduce_smax_i32(
 ; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %{{.*}})
-; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
+; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_smax_i32(i32 [[R]])
+; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4
 }
 
 define spir_kernel void @reduce_umax_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
@@ -210,7 +217,8 @@ entry:
   ret void
 ; CHECK-LABEL: @__vecz_v4_reduce_umax_i32(
 ; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %{{.*}})
-; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
+; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_umax_i32(i32 [[R]])
+; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4
 }
 
 define spir_kernel void @reduce_fmin_f32(float addrspace(1)* %in, float addrspace(1)* %out) {
@@ -226,7 +234,8 @@ entry:
   ret void
 ; CHECK-LABEL: @__vecz_v4_reduce_fmin_f32(
 ; CHECK: [[R:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %{{.*}})
-; CHECK: store float [[R]], ptr addrspace(1) {{%.*}}, align 4
+; CHECK: %call2 = tail call spir_func float @__mux_sub_group_reduce_fmin_f32(float [[R]])
+; CHECK: store float %call2, ptr addrspace(1) {{%.*}}, align 4
 }
 
 define spir_kernel void @reduce_fmax_f32(float addrspace(1)* %in, float addrspace(1)* %out) {
@@ -242,7 +251,8 @@ entry:
   ret void
 ; CHECK-LABEL: @__vecz_v4_reduce_fmax_f32(
 ; CHECK: [[R:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %{{.*}})
-; CHECK: store float [[R]], ptr addrspace(1) {{%.*}}, align 4
+; CHECK: %call2 = tail call spir_func float @__mux_sub_group_reduce_fmax_f32(float [[R]])
+; CHECK: store float %call2, ptr addrspace(1) {{%.*}}, align 4
 }
 
 !opencl.ocl.version = !{!0}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions_spv_khr_uniform_group_instructions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions_spv_khr_uniform_group_instructions.ll
index 067dd7ac7983d..2439e1b8bd854 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions_spv_khr_uniform_group_instructions.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions_spv_khr_uniform_group_instructions.ll
@@ -36,7 +36,8 @@ declare spir_func i1 @__mux_sub_group_reduce_logical_xor_i1(i1)
 
 ; CHECK-LABEL: @__vecz_v4_reduce_mul_i32(
 ; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %{{.*}})
-; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
+; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_mul_i32(i32 [[R]])
+; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_mul_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
   %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
@@ -52,7 +53,8 @@ entry:
 
 ; CHECK-LABEL: @__vecz_v4_reduce_mul_i64(
 ; CHECK: [[R:%.*]] = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> %{{.*}})
-; CHECK: store i64 [[R]], ptr addrspace(1) {{%.*}}, align 4
+; CHECK: %call2 = tail call spir_func i64 @__mux_sub_group_reduce_mul_i64(i64 [[R]])
+; CHECK: store i64 %call2, ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_mul_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
   %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
@@ -68,7 +70,8 @@ entry:
 
 ; CHECK-LABEL: @__vecz_v4_reduce_mul_f32(
 ; CHECK: [[R:%.*]] = call float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> %{{.*}})
-; CHECK: store float [[R]], ptr addrspace(1) {{%.*}}, align 4
+; CHECK: %call2 = tail call spir_func float @__mux_sub_group_reduce_fmul_f32(float [[R]])
+; CHECK: store float %call2, ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_mul_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
   %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
@@ -84,7 +87,8 @@ entry:
 
 ; CHECK-LABEL: @__vecz_v4_reduce_and_i32(
 ; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %{{.*}})
-; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
+; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_and_i32(i32 [[R]])
+; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_and_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
   %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
@@ -100,7 +104,7 @@ entry:
 
 ; CHECK-LABEL: @__vecz_v4_reduce_or_i32(
 ; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %{{.*}})
-; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
+; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_or_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
   %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
@@ -116,7 +120,8 @@ entry:
 
 ; CHECK-LABEL: @__vecz_v4_reduce_xor_i32(
 ; CHECK: [[R:%.*]] = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %{{.*}})
-; CHECK: store i64 [[R]], ptr addrspace(1) {{%.*}}, align 4
+; CHECK: %call2 = tail call spir_func i64 @__mux_sub_group_reduce_xor_i64(i64 [[R]])
+; CHECK: store i64 %call2, ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_xor_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
   %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
@@ -133,8 +138,9 @@ entry:
 ; CHECK-LABEL: @__vecz_v4_reduce_logical_and(
 ; This doesn't generate a reduction intrinsic...
 ; CHECK: [[T:%.*]] = icmp eq i4 {{%.*}}, -1
-; CHECK: [[R:%.*]] = zext i1 [[T]] to i32
-; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
+; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_and_i1(i1 [[T]])
+; CHECK: [[E:%.*]] = zext i1 %call2 to i32
+; CHECK: store i32 [[E]], ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_logical_and(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
   %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
@@ -152,8 +158,9 @@ entry:
 
 ; CHECK-LABEL: @__vecz_v4_reduce_logical_or(
 ; CHECK: [[T:%.*]] = icmp ne i4 {{%.*}}, 0
-; CHECK: [[R:%.*]] = zext i1 [[T]] to i32
-; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
+; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_or_i1(i1 [[T]])
+; CHECK: [[E:%.*]] = zext i1 %call2 to i32
+; CHECK: store i32 [[E]], ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_logical_or(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
   %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
@@ -172,8 +179,10 @@ entry:
 ; CHECK-LABEL: @__vecz_v4_reduce_logical_xor(
 ; CHECK: [[X:%.*]] = call i4 @llvm.ctpop.i4(i4 {{%.*}})
 ; CHECK: [[T:%.*]] = and i4 [[X]], 1
-; CHECK: [[R:%.*]] = zext i4 [[T]] to i32
-; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
+; CHECK: [[T0:%.*]] = icmp ne i4 [[T]], 0
+; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_xor_i1(i1 [[T0]])
+; CHECK: [[E:%.*]] = zext i1 %call2 to i32
+; CHECK: store i32 [[E]], ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_logical_xor(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
   %call = tail call spir_func i64 @__mux_get_global_id(i32 0)

From f54110b76753a7f8997d1f2712f5d36844298b32 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Tue, 29 Aug 2023 11:25:19 +0100
Subject: [PATCH 030/182] [vecz] Vectorize sub-group broadcasts on top of mux
 sub-groups

A broadcast across the full apparent sub-group is a broadcast of the
vector element selected by the sub-group local ID modulo the vector size.
---
 .../vecz/source/transform/packetizer.cpp      | 40 ++++++++-----------
 .../llvm/ScalableVectors/subgroup_builtins.ll |  3 +-
 .../vecz/test/lit/llvm/subgroup_builtins.ll   | 23 ++++++++++-
 3 files changed, 40 insertions(+), 26 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
index 56d152e8ac9ee..1b9bf68b95625 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -201,7 +201,7 @@ class Packetizer::Impl : public Packetizer {
   /// @param[in] I Instruction to packetize.
   ///
   /// @return Packetized instruction.
-  Value *packetizeSubgroupBroadcast(Instruction *I);
+  Value *packetizeGroupBroadcast(Instruction *I);
   /// @brief Packetize PHI node.
   ///
   /// @param[in] Phi PHI Node to packetize.
@@ -885,7 +885,7 @@ Packetizer::Result Packetizer::Impl::packetize(Value *V) {
     return broadcast(reduction);
   }
 
-  if (auto *brdcast = packetizeSubgroupBroadcast(Ins)) {
+  if (auto *brdcast = packetizeGroupBroadcast(Ins)) {
     return broadcast(brdcast);
   }
 
@@ -1158,7 +1158,7 @@ Value *Packetizer::Impl::packetizeGroupReduction(Instruction *I) {
   return CI;
 }
 
-Value *Packetizer::Impl::packetizeSubgroupBroadcast(Instruction *I) {
+Value *Packetizer::Impl::packetizeGroupBroadcast(Instruction *I) {
   auto *const CI = dyn_cast<CallInst>(I);
   if (!CI || !CI->getCalledFunction()) {
     return nullptr;
@@ -1195,17 +1195,15 @@ Value *Packetizer::Impl::packetizeSubgroupBroadcast(Instruction *I) {
   }
 
   auto *idx = CI->getArgOperand(argIdx + 1);
-  if (isWorkGroup) {
-    // When it's a work group broadcast, we need to sanitize the input index so
-    // that it stays within the range of one subgroup.
-    auto *const minVal =
-        ConstantInt::get(idx->getType(), SimdWidth.getKnownMinValue());
-    Value *idxFactor = minVal;
-    if (SimdWidth.isScalable()) {
-      idxFactor = B.CreateVScale(minVal);
-    }
-    idx = B.CreateURem(idx, idxFactor);
+  // We need to sanitize the input index so that it stays within the range of
+  // one vectorized group.
+  auto *const minVal =
+      ConstantInt::get(idx->getType(), SimdWidth.getKnownMinValue());
+  Value *idxFactor = minVal;
+  if (SimdWidth.isScalable()) {
+    idxFactor = B.CreateVScale(minVal);
   }
+  idx = B.CreateURem(idx, idxFactor);
 
   Value *val = nullptr;
   // Optimize the constant fixed-vector case, where we can choose the exact
@@ -1230,18 +1228,12 @@ Value *Packetizer::Impl::packetizeSubgroupBroadcast(Instruction *I) {
     val = B.CreateExtractElement(op.getAsValue(), idx);
   }
 
-  if (isWorkGroup) {
-    // For a work group operation, we leave the origial broadcast function and
-    // divert the subgroup reduction through it, giving us a work group
-    // reduction over subgroup reductions.
-    CI->setOperand(argIdx, val);
-    val = CI;
-  } else {
-    IC.deleteInstructionLater(CI);
-    CI->replaceAllUsesWith(val);
-  }
+  // We leave the origial broadcast function and divert the vectorized
+  // broadcast through it, giving us a broadcast over the full apparent
+  // sub-group or work-group size (vecz * mux).
+  CI->setOperand(argIdx, val);
 
-  return val;
+  return CI;
 }
 
 Value *Packetizer::Impl::packetizeMaskVarying(Instruction *I) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll
index 7f3876335077f..bafc54f5fa19a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll
@@ -68,7 +68,8 @@ define spir_kernel void @sub_group_broadcast(i32 addrspace(1)* %in, i32 addrspac
 ; CHECK-LABEL: define spir_kernel void @__vecz_nxv4_sub_group_broadcast(
 ; CHECK: [[LD:%.*]] = load <vscale x 4 x i32>, ptr addrspace(1) {{%.*}}, align 4
 ; CHECK: [[EXT:%.*]] = extractelement <vscale x 4 x i32> [[LD]], {{(i32|i64)}} 0
-; CHECK: [[INS:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[EXT]], {{(i32|i64)}} 0
+; CHECK: [[BDCAST:%.*]] = call spir_func i32 @__mux_sub_group_broadcast_i32(i32 [[EXT]], i32 0)
+; CHECK: [[INS:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[BDCAST]], {{(i32|i64)}} 0
 ; CHECK: [[SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[INS]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK: store <vscale x 4 x i32> [[SPLAT]], ptr addrspace(1)
 }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll
index c4a74592815f9..10bac3390e3c6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll
@@ -63,7 +63,28 @@ define spir_kernel void @sub_group_broadcast(i32 addrspace(1)* %in, i32 addrspac
   ret void
 ; CHECK-LABEL: define spir_kernel void @__vecz_v4_sub_group_broadcast(
 ; CHECK: [[LD:%.*]] = load <4 x i32>, ptr addrspace(1) {{%.*}}, align 4
-; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[LD]], <4 x i32> {{(undef|poison)}}, <4 x i32> zeroinitializer
+; CHECK: [[EXT:%.*]] = extractelement <4 x i32> [[LD]], i64 0
+; CHECK: [[BDCAST:%.*]] = call spir_func i32 @__mux_sub_group_broadcast_i32(i32 [[EXT]], i32 0)
+; CHECK: [[HEAD:%.*]] = insertelement <4 x i32> poison, i32 [[BDCAST]], i64 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[HEAD]], <4 x i32> {{(undef|poison)}}, <4 x i32> zeroinitializer
+; CHECK: store <4 x i32> [[SPLAT]], ptr addrspace(1)
+}
+
+define spir_kernel void @sub_group_broadcast_wider_than_vf(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+  %call = tail call spir_func i32 @__mux_get_sub_group_local_id()
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %call
+  %v = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %broadcast = call spir_func i32 @__mux_sub_group_broadcast_i32(i32 %v, i32 6)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %call
+  store i32 %broadcast, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_sub_group_broadcast_wider_than_vf(
+; CHECK: [[LD:%.*]] = load <4 x i32>, ptr addrspace(1) {{%.*}}, align 4
+; The sixth sub-group member is the (6 % 4 ==) 2nd vector group member
+; CHECK: [[EXT:%.*]] = extractelement <4 x i32> [[LD]], i64 2
+; CHECK: [[BDCAST:%.*]] = call spir_func i32 @__mux_sub_group_broadcast_i32(i32 [[EXT]], i32 6)
+; CHECK: [[HEAD:%.*]] = insertelement <4 x i32> poison, i32 [[BDCAST]], i64 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[HEAD]], <4 x i32> {{(undef|poison)}}, <4 x i32> zeroinitializer
 ; CHECK: store <4 x i32> [[SPLAT]], ptr addrspace(1)
 }
 

From b839b764c7c40ad0ae2325e519ddcd6f1eeed40b Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Wed, 30 Aug 2023 11:01:15 +0100
Subject: [PATCH 031/182] [vecz] Vectorize sub-group scans on top of mux
 sub-groups

Vectorizing a sub-group scan is a little more complicated than
vectorizing the other sub-group operations.

A scan over the vector group operates in one invocation of the kernel,
whereas the full mux sub-group spans more than one of these vectorized
invocations. As such, each invocation needs access to the previous
invocations' scan results. The vectorized kernel must therefore reduce
its input vector and pass the result to an exclusive mux sub-group scan,
which propagates the scan 'forward' across members of the sub-group.
---
 .../vecz/source/transform/packetizer.cpp      | 57 ++++++++++++++-
 .../llvm/ScalableVectors/subgroup_scans.ll    | 72 ++++++++++++++++---
 .../vecz/test/lit/llvm/subgroup_scans.ll      | 70 +++++++++++++++---
 3 files changed, 180 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
index 1b9bf68b95625..cd32abf6134c3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -1686,9 +1686,62 @@ ValuePacket Packetizer::Impl::packetizeSubgroupScan(
 
   IRBuilder<> B(CI);
 
-  auto *c = B.CreateCall(SubgroupFn, Ops);
+  auto *VectorScan = B.CreateCall(SubgroupFn, Ops);
+
+  // We've currently got a scan over each vector group, but the full sub-group
+  // is further multiplied by the mux sub-group size. For example, we may have
+  // a vectorization factor sized group of 4 and a mux sub-group size of 2.
+  // Together the full sub-group size to the user is 4*2 = 8.
+  // In terms of invocations, we've essentially currently got:
+  //   <a0, a0+a1, a0+a1+a2, a0+a1+a2+a3> (invocation 0)
+  //   <a4, a4+a5, a4+a5+a6, a4+a5+a6+a7> (invocation 1)
+  // These two iterations need to be further scanned over the mux sub-group
+  // size. We do this by adding the identity to the first invocation, the
+  // result of the scan over the first invocation to the second, etc. This is
+  // an exclusive scan over the *reduction* of the input vector:
+  //   <a0, a1, a2, a3> (invocation 0)
+  //   <a4, a5, a6, a7> (invocation 1)
+  // -> reduction
+  //   (a0+a1+a2+a3) (invocation 0)
+  //   (a4+a5+a6+a7) (invocation 1)
+  // -> exclusive mux sub-group scan
+  //               I (invocation 0)
+  //   (a0+a1+a2+a3) (invocation 1)
+  // -> adding that to the result of the vector scan:
+  //   <I+a0, I+a0+a1, I+a0+a1+a2, I+a0+a1+a2+a3>          (invocation 0)
+  //   <(a0+a1+a2+a3)+a4, (a0+a1+a2+a3)+a4+a5,             (invocation 1)
+  //    (a0+a1+a2+a3)+a4+a5+a6, (a0+a1+a2+a3)+a4+a5+a6+a7>
+  // When viewed as a full 8-element vector, this is our final scan.
+  // Thus we essentially keep the original mux sub-group scan, but change it to
+  // be an exclusive one.
+  auto *Reduction = Ops.front();
+  if (VL) {
+    Reduction = sanitizeVPReductionInput(B, Reduction, VL, Scan.Recurrence);
+    if (!Reduction) {
+      return results;
+    }
+  }
+  Reduction = createSimpleTargetReduction(B, &TTI, Reduction, Scan.Recurrence);
+
+  // Now we defer to an *exclusive* scan over the mux sub-group.
+  auto ExclScan = Scan;
+  ExclScan.Op = compiler::utils::GroupCollective::OpKind::ScanExclusive;
+
+  auto ExclScanID = Ctx.builtins().getMuxGroupCollective(ExclScan);
+  assert(ExclScanID != compiler::utils::eBuiltinInvalid);
 
-  results.push_back(c);
+  auto *const ExclScanFn = Ctx.builtins().getOrDeclareMuxBuiltin(
+      ExclScanID, *F.getParent(), {CI->getType()});
+  assert(ExclScanFn);
+
+  auto *const ExclScanCI = B.CreateCall(ExclScanFn, {Reduction});
+
+  Value *const Splat = B.CreateVectorSplat(SimdWidth, ExclScanCI);
+
+  auto *const Result = multi_llvm::createBinOpForRecurKind(B, VectorScan, Splat,
+                                                           Scan.Recurrence);
+
+  results.push_back(Result);
   return results;
 }
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans.ll
index c6c8a4cf5ce47..0161057de1534 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans.ll
@@ -42,7 +42,13 @@ entry:
   store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
   ret void
 ; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_add_i32(
-; CHECK: call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_add_u5nxv4j(<vscale x 4 x i32> %{{.*}})
+; CHECK: [[SCAN:%.*]] = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_add_u5nxv4j(<vscale x 4 x i32> [[INPUT:%.*]])
+; CHECK: [[SUM:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[INPUT]])
+; CHECK: [[EXCL_SCAN:%.*]] = call i32 @__mux_sub_group_scan_exclusive_add_i32(i32 [[SUM]])
+; CHECK: [[HEAD:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[EXCL_SCAN]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[HEAD]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK: [[FINAL:%.*]] = add <vscale x 4 x i32> [[SCAN]], [[SPLAT]]
+; CHECK: store <vscale x 4 x i32> [[FINAL]],
 }
 
 define spir_kernel void @reduce_scan_incl_add_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %out) {
@@ -55,7 +61,13 @@ entry:
   store i64 %call1, i64 addrspace(1)* %arrayidx2, align 4
   ret void
 ; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_add_i64(
-; CHECK: call <vscale x 4 x i64> @__vecz_b_sub_group_scan_inclusive_add_u5nxv4m(<vscale x 4 x i64> %{{.*}})
+; CHECK: [[SCAN:%.*]] = call <vscale x 4 x i64> @__vecz_b_sub_group_scan_inclusive_add_u5nxv4m(<vscale x 4 x i64> [[INPUT:%.*]])
+; CHECK: [[SUM:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[INPUT]])
+; CHECK: [[EXCL_SCAN:%.*]] = call i64 @__mux_sub_group_scan_exclusive_add_i64(i64 [[SUM]])
+; CHECK: [[HEAD:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EXCL_SCAN]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[HEAD]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK: [[FINAL:%.*]] = add <vscale x 4 x i64> [[SCAN]], [[SPLAT]]
+; CHECK: store <vscale x 4 x i64> [[FINAL]],
 }
 
 define spir_kernel void @reduce_scan_incl_add_f32(float addrspace(1)* %in, float addrspace(1)* %out) {
@@ -68,7 +80,13 @@ entry:
   store float %call1, float addrspace(1)* %arrayidx2, align 4
   ret void
 ; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_add_f32(
-; CHECK: call <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_add_u5nxv4f(<vscale x 4 x float> %{{.*}})
+; CHECK: [[SCAN:%.*]] = call <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_add_u5nxv4f(<vscale x 4 x float> [[INPUT:%.*]])
+; CHECK: [[SUM:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.0{{.*}}, <vscale x 4 x float> [[INPUT]])
+; CHECK: [[EXCL_SCAN:%.*]] = call float @__mux_sub_group_scan_exclusive_fadd_f32(float [[SUM]])
+; CHECK: [[HEAD:%.*]] = insertelement <vscale x 4 x float> poison, float [[EXCL_SCAN]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <vscale x 4 x float> [[HEAD]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK: [[FINAL:%.*]] = fadd <vscale x 4 x float> [[SCAN]], [[SPLAT]]
+; CHECK: store <vscale x 4 x float> [[FINAL]],
 }
 
 define spir_kernel void @reduce_scan_incl_smin_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
@@ -81,7 +99,13 @@ entry:
   store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
   ret void
 ; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_smin_i32(
-; CHECK: call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_u5nxv4i(<vscale x 4 x i32> %{{.*}})
+; CHECK: [[SCAN:%.*]] = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_u5nxv4i(<vscale x 4 x i32> [[INPUT:%.*]])
+; CHECK: [[SUM:%.*]] = call i32 @llvm.vector.reduce.smin.nxv4i32(<vscale x 4 x i32> [[INPUT]])
+; CHECK: [[EXCL_SCAN:%.*]] = call i32 @__mux_sub_group_scan_exclusive_smin_i32(i32 [[SUM]])
+; CHECK: [[HEAD:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[EXCL_SCAN]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[HEAD]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK: [[FINAL:%.*]] = call <vscale x 4 x i32> @llvm.smin.nxv4i32(<vscale x 4 x i32> [[SCAN]], <vscale x 4 x i32> [[SPLAT]])
+; CHECK: store <vscale x 4 x i32> [[FINAL]],
 }
 
 define spir_kernel void @reduce_scan_incl_umin_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
@@ -94,7 +118,13 @@ entry:
   store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
   ret void
 ; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_umin_i32(
-; CHECK: call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umin_u5nxv4j(<vscale x 4 x i32> %{{.*}})
+; CHECK: [[SCAN:%.*]] = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umin_u5nxv4j(<vscale x 4 x i32> [[INPUT:%.*]])
+; CHECK: [[SUM:%.*]] = call i32 @llvm.vector.reduce.umin.nxv4i32(<vscale x 4 x i32> [[INPUT]])
+; CHECK: [[EXCL_SCAN:%.*]] = call i32 @__mux_sub_group_scan_exclusive_umin_i32(i32 [[SUM]])
+; CHECK: [[HEAD:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[EXCL_SCAN]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[HEAD]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK: [[FINAL:%.*]] = call <vscale x 4 x i32> @llvm.umin.nxv4i32(<vscale x 4 x i32> [[SCAN]], <vscale x 4 x i32> [[SPLAT]])
+; CHECK: store <vscale x 4 x i32> [[FINAL]],
 }
 
 define spir_kernel void @reduce_scan_incl_smax_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
@@ -107,7 +137,13 @@ entry:
   store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
   ret void
 ; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_smax_i32(
-; CHECK: call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smax_u5nxv4i(<vscale x 4 x i32> %{{.*}})
+; CHECK: [[SCAN:%.*]] = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smax_u5nxv4i(<vscale x 4 x i32> [[INPUT:%.*]])
+; CHECK: [[SUM:%.*]] = call i32 @llvm.vector.reduce.smax.nxv4i32(<vscale x 4 x i32> [[INPUT]])
+; CHECK: [[EXCL_SCAN:%.*]] = call i32 @__mux_sub_group_scan_exclusive_smax_i32(i32 [[SUM]])
+; CHECK: [[HEAD:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[EXCL_SCAN]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[HEAD]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK: [[FINAL:%.*]] = call <vscale x 4 x i32> @llvm.smax.nxv4i32(<vscale x 4 x i32> [[SCAN]], <vscale x 4 x i32> [[SPLAT]])
+; CHECK: store <vscale x 4 x i32> [[FINAL]],
 }
 
 define spir_kernel void @reduce_scan_incl_umax_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
@@ -120,7 +156,13 @@ entry:
   store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
   ret void
 ; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_umax_i32(
-; CHECK: call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umax_u5nxv4j(<vscale x 4 x i32> %{{.*}})
+; CHECK: [[SCAN:%.*]] = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umax_u5nxv4j(<vscale x 4 x i32> [[INPUT:%.*]])
+; CHECK: [[SUM:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[INPUT]])
+; CHECK: [[EXCL_SCAN:%.*]] = call i32 @__mux_sub_group_scan_exclusive_umax_i32(i32 [[SUM]])
+; CHECK: [[HEAD:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[EXCL_SCAN]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[HEAD]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK: [[FINAL:%.*]] = call <vscale x 4 x i32> @llvm.umax.nxv4i32(<vscale x 4 x i32> [[SCAN]], <vscale x 4 x i32> [[SPLAT]])
+; CHECK: store <vscale x 4 x i32> [[FINAL]],
 }
 
 define spir_kernel void @reduce_scan_incl_fmin_f32(float addrspace(1)* %in, float addrspace(1)* %out) {
@@ -133,7 +175,13 @@ entry:
   store float %call1, float addrspace(1)* %arrayidx2, align 4
   ret void
 ; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_fmin_f32(
-; CHECK: call <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_min_u5nxv4f(<vscale x 4 x float> %{{.*}})
+; CHECK: [[SCAN:%.*]] = call <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_min_u5nxv4f(<vscale x 4 x float> [[INPUT:%.*]])
+; CHECK: [[SUM:%.*]] = call float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> [[INPUT]])
+; CHECK: [[EXCL_SCAN:%.*]] = call float @__mux_sub_group_scan_exclusive_fmin_f32(float [[SUM]])
+; CHECK: [[HEAD:%.*]] = insertelement <vscale x 4 x float> poison, float [[EXCL_SCAN]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <vscale x 4 x float> [[HEAD]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK: [[FINAL:%.*]] = call <vscale x 4 x float> @llvm.minnum.nxv4f32(<vscale x 4 x float> [[SCAN]], <vscale x 4 x float> [[SPLAT]])
+; CHECK: store <vscale x 4 x float> [[FINAL]],
 }
 
 define spir_kernel void @reduce_scan_incl_fmax_f32(float addrspace(1)* %in, float addrspace(1)* %out) {
@@ -146,5 +194,11 @@ entry:
   store float %call1, float addrspace(1)* %arrayidx2, align 4
   ret void
 ; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_fmax_f32(
-; CHECK: call <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_max_u5nxv4f(<vscale x 4 x float> %{{.*}})
+; CHECK: [[SCAN:%.*]] = call <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_max_u5nxv4f(<vscale x 4 x float> [[INPUT:%.*]])
+; CHECK: [[SUM:%.*]] = call float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float> [[INPUT]])
+; CHECK: [[EXCL_SCAN:%.*]] = call float @__mux_sub_group_scan_exclusive_fmax_f32(float [[SUM]])
+; CHECK: [[HEAD:%.*]] = insertelement <vscale x 4 x float> poison, float [[EXCL_SCAN]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <vscale x 4 x float> [[HEAD]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK: [[FINAL:%.*]] = call <vscale x 4 x float> @llvm.maxnum.nxv4f32(<vscale x 4 x float> [[SCAN]], <vscale x 4 x float> [[SPLAT]])
+; CHECK: store <vscale x 4 x float> [[FINAL]],
 }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_scans.ll
index 837726ecee774..8722e6f13edcf 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_scans.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_scans.ll
@@ -42,7 +42,13 @@ entry:
   store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
   ret void
 ; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_add_i32(
-; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_add_Dv4_j(<4 x i32> %{{.*}})
+; CHECK: [[SCAN:%.*]] = call <4 x i32> @__vecz_b_sub_group_scan_inclusive_add_Dv4_j(<4 x i32> [[INPUT:%.*]])
+; CHECK: [[SUM:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[INPUT]])
+; CHECK: [[EXCL_SCAN:%.*]] = call i32 @__mux_sub_group_scan_exclusive_add_i32(i32 [[SUM]])
+; CHECK: [[HEAD:%.*]] = insertelement <4 x i32> poison, i32 [[EXCL_SCAN]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[HEAD]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK: [[FINAL:%.*]] = add <4 x i32> [[SCAN]], [[SPLAT]]
+; CHECK: store <4 x i32> [[FINAL]],
 }
 
 define spir_kernel void @reduce_scan_incl_add_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %out) {
@@ -55,7 +61,13 @@ entry:
   store i64 %call1, i64 addrspace(1)* %arrayidx2, align 4
   ret void
 ; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_add_i64(
-; CHECK: call <4 x i64> @__vecz_b_sub_group_scan_inclusive_add_Dv4_m(<4 x i64> %{{.*}})
+; CHECK: [[SCAN:%.*]] = call <4 x i64> @__vecz_b_sub_group_scan_inclusive_add_Dv4_m(<4 x i64> [[INPUT:%.*]])
+; CHECK: [[SUM:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[INPUT]])
+; CHECK: [[EXCL_SCAN:%.*]] = call i64 @__mux_sub_group_scan_exclusive_add_i64(i64 [[SUM]])
+; CHECK: [[HEAD:%.*]] = insertelement <4 x i64> poison, i64 [[EXCL_SCAN]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i64> [[HEAD]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK: [[FINAL:%.*]] = add <4 x i64> [[SCAN]], [[SPLAT]]
+; CHECK: store <4 x i64> [[FINAL]],
 }
 
 define spir_kernel void @reduce_scan_incl_add_f32(float addrspace(1)* %in, float addrspace(1)* %out) {
@@ -68,7 +80,13 @@ entry:
   store float %call1, float addrspace(1)* %arrayidx2, align 4
   ret void
 ; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_add_f32(
-; CHECK: call <4 x float> @__vecz_b_sub_group_scan_inclusive_add_Dv4_f(<4 x float> %{{.*}})
+; CHECK: [[SCAN:%.*]] = call <4 x float> @__vecz_b_sub_group_scan_inclusive_add_Dv4_f(<4 x float> [[INPUT:%.*]])
+; CHECK: [[SUM:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.0{{.*}}, <4 x float> [[INPUT]])
+; CHECK: [[EXCL_SCAN:%.*]] = call float @__mux_sub_group_scan_exclusive_fadd_f32(float [[SUM]])
+; CHECK: [[HEAD:%.*]] = insertelement <4 x float> poison, float [[EXCL_SCAN]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <4 x float> [[HEAD]], <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK: [[FINAL:%.*]] = fadd <4 x float> [[SCAN]], [[SPLAT]]
+; CHECK: store <4 x float> [[FINAL]],
 }
 
 define spir_kernel void @reduce_scan_incl_smin_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
@@ -82,6 +100,12 @@ entry:
   ret void
 ; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_smin_i32(
 ; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_Dv4_i(<4 x i32> %{{.*}})
+; CHECK: [[SUM:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[INPUT]])
+; CHECK: [[EXCL_SCAN:%.*]] = call i32 @__mux_sub_group_scan_exclusive_smin_i32(i32 [[SUM]])
+; CHECK: [[HEAD:%.*]] = insertelement <4 x i32> poison, i32 [[EXCL_SCAN]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[HEAD]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK: [[FINAL:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[SCAN]], <4 x i32> [[SPLAT]])
+; CHECK: store <4 x i32> [[FINAL]],
 }
 
 define spir_kernel void @reduce_scan_incl_umin_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
@@ -94,7 +118,13 @@ entry:
   store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
   ret void
 ; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_umin_i32(
-; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_umin_Dv4_j(<4 x i32> %{{.*}})
+; CHECK: [[SCAN:%.*]] = call <4 x i32> @__vecz_b_sub_group_scan_inclusive_umin_Dv4_j(<4 x i32> [[INPUT:%.*]])
+; CHECK: [[SUM:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[INPUT]])
+; CHECK: [[EXCL_SCAN:%.*]] = call i32 @__mux_sub_group_scan_exclusive_umin_i32(i32 [[SUM]])
+; CHECK: [[HEAD:%.*]] = insertelement <4 x i32> poison, i32 [[EXCL_SCAN]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[HEAD]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK: [[FINAL:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[SCAN]], <4 x i32> [[SPLAT]])
+; CHECK: store <4 x i32> [[FINAL]],
 }
 
 define spir_kernel void @reduce_scan_incl_smax_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
@@ -107,7 +137,13 @@ entry:
   store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
   ret void
 ; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_smax_i32(
-; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_smax_Dv4_i(<4 x i32> %{{.*}})
+; CHECK: [[SCAN:%.*]] = call <4 x i32> @__vecz_b_sub_group_scan_inclusive_smax_Dv4_i(<4 x i32> [[INPUT:%.*]])
+; CHECK: [[SUM:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[INPUT]])
+; CHECK: [[EXCL_SCAN:%.*]] = call i32 @__mux_sub_group_scan_exclusive_smax_i32(i32 [[SUM]])
+; CHECK: [[HEAD:%.*]] = insertelement <4 x i32> poison, i32 [[EXCL_SCAN]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[HEAD]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK: [[FINAL:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[SCAN]], <4 x i32> [[SPLAT]])
+; CHECK: store <4 x i32> [[FINAL]],
 }
 
 define spir_kernel void @reduce_scan_incl_umax_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
@@ -120,7 +156,13 @@ entry:
   store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
   ret void
 ; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_umax_i32(
-; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_umax_Dv4_j(<4 x i32> %{{.*}})
+; CHECK: [[SCAN:%.*]] = call <4 x i32> @__vecz_b_sub_group_scan_inclusive_umax_Dv4_j(<4 x i32> [[INPUT:%.*]])
+; CHECK: [[SUM:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[INPUT]])
+; CHECK: [[EXCL_SCAN:%.*]] = call i32 @__mux_sub_group_scan_exclusive_umax_i32(i32 [[SUM]])
+; CHECK: [[HEAD:%.*]] = insertelement <4 x i32> poison, i32 [[EXCL_SCAN]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[HEAD]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK: [[FINAL:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[SCAN]], <4 x i32> [[SPLAT]])
+; CHECK: store <4 x i32> [[FINAL]],
 }
 
 define spir_kernel void @reduce_scan_incl_fmin_f32(float addrspace(1)* %in, float addrspace(1)* %out) {
@@ -133,7 +175,13 @@ entry:
   store float %call1, float addrspace(1)* %arrayidx2, align 4
   ret void
 ; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_fmin_f32(
-; CHECK: call <4 x float> @__vecz_b_sub_group_scan_inclusive_min_Dv4_f(<4 x float> %{{.*}})
+; CHECK: [[SCAN:%.*]] = call <4 x float> @__vecz_b_sub_group_scan_inclusive_min_Dv4_f(<4 x float> [[INPUT:%.*]])
+; CHECK: [[SUM:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[INPUT]])
+; CHECK: [[EXCL_SCAN:%.*]] = call float @__mux_sub_group_scan_exclusive_fmin_f32(float [[SUM]])
+; CHECK: [[HEAD:%.*]] = insertelement <4 x float> poison, float [[EXCL_SCAN]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <4 x float> [[HEAD]], <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK: [[FINAL:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[SCAN]], <4 x float> [[SPLAT]])
+; CHECK: store <4 x float> [[FINAL]],
 }
 
 define spir_kernel void @reduce_scan_incl_fmax_f32(float addrspace(1)* %in, float addrspace(1)* %out) {
@@ -146,5 +194,11 @@ entry:
   store float %call1, float addrspace(1)* %arrayidx2, align 4
   ret void
 ; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_fmax_f32(
-; CHECK: call <4 x float> @__vecz_b_sub_group_scan_inclusive_max_Dv4_f(<4 x float> %{{.*}})
+; CHECK: [[SCAN:%.*]] = call <4 x float> @__vecz_b_sub_group_scan_inclusive_max_Dv4_f(<4 x float> [[INPUT:%.*]])
+; CHECK: [[SUM:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[INPUT]])
+; CHECK: [[EXCL_SCAN:%.*]] = call float @__mux_sub_group_scan_exclusive_fmax_f32(float [[SUM]])
+; CHECK: [[HEAD:%.*]] = insertelement <4 x float> poison, float [[EXCL_SCAN]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <4 x float> [[HEAD]], <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK: [[FINAL:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[SCAN]], <4 x float> [[SPLAT]])
+; CHECK: store <4 x float> [[FINAL]],
 }

From cc242ab610695f97d60fc8765dcbd57de393a54b Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Wed, 30 Aug 2023 14:50:42 +0100
Subject: [PATCH 032/182] [vecz] Packetize the sub-group size on top of mux
 sub-groups

The total vectorized sub-group size is the mux sub-group reduction (sum)
of every invocation's vector group size (i.e., the vectorization factor,
or the vector length when one is in use).
---
 .../vecz/source/transform/packetizer.cpp      | 39 +++++++++++++------
 .../llvm/ScalableVectors/subgroup_builtins.ll |  3 +-
 .../compute_vector_length.ll                  |  6 ++-
 .../vecz/test/lit/llvm/subgroup_builtins.ll   |  3 +-
 4 files changed, 35 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
index cd32abf6134c3..a917da773f951 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -693,23 +693,38 @@ bool Packetizer::Impl::packetize() {
         continue;
       }
 
-      auto *const Callee = CI->getCalledFunction();
-      if (Callee && Ctx.builtins().analyzeBuiltin(*Callee).ID ==
+      if (auto *const Callee = CI->getCalledFunction();
+          Callee && Ctx.builtins().analyzeBuiltin(*Callee).ID ==
                         compiler::utils::eMuxBuiltinGetSubGroupSize) {
         auto *const replacement = [this](CallInst *CI) -> Value * {
+          // The vectorized sub-group size is the mux sub-group reduction sum
+          // of all of the vectorized sub-group sizes:
+          // |   mux 0     |      mux 1       |
+          // | < a,b,c,d > | < e,f,g > (vl=3) |
+          // The total sub-group size above is 4 + 3 => 7.
+          // Note that this expects that the mux sub-group consists entirely of
+          // equivalently vectorized kernels.
+          Value *VecgroupSize;
+          IRBuilder<> B(CI);
+          auto *const I32Ty = B.getInt32Ty();
           if (VL) {
-            return VL;
-          }
-
-          auto *const I32Ty = Type::getInt32Ty(F.getContext());
-          auto *const VFVal =
-              ConstantInt::get(I32Ty, SimdWidth.getKnownMinValue());
-          if (!SimdWidth.isScalable()) {
-            return VFVal;
+            VecgroupSize = VL;
           } else {
-            IRBuilder<> B(CI);
-            return B.CreateVScale(VFVal);
+            auto *const VFVal = B.getInt32(SimdWidth.getKnownMinValue());
+            if (!SimdWidth.isScalable()) {
+              VecgroupSize = VFVal;
+            } else {
+              VecgroupSize = B.CreateVScale(VFVal);
+            }
           }
+          assert(VecgroupSize && "Could not determine vector group size");
+
+          auto *ReduceFn = Ctx.builtins().getOrDeclareMuxBuiltin(
+              compiler::utils::eMuxBuiltinSubgroupReduceAdd, *F.getParent(),
+              {I32Ty});
+          assert(ReduceFn && "Could not get reduction builtin");
+
+          return B.CreateCall(ReduceFn, VecgroupSize, "subgroup.size");
         }(CI);
         CI->replaceAllUsesWith(replacement);
         IC.deleteInstructionLater(CI);
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll
index bafc54f5fa19a..55ff8ee65aaa1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll
@@ -35,7 +35,8 @@ define spir_kernel void @get_sub_group_size(i32 addrspace(1)* %in, i32 addrspace
 ; CHECK-LABEL: define spir_kernel void @__vecz_nxv4_get_sub_group_size(
 ; CHECK: [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
 ; CHECK: [[W:%.*]] = shl i32 [[VSCALE]], 2
-; CHECK: store i32 [[W]], ptr addrspace(1) {{.*}}
+; CHECK: [[RED:%.*]] = call i32 @__mux_sub_group_reduce_add_i32(i32 [[W]])
+; CHECK: store i32 [[RED]], ptr addrspace(1) {{.*}}
 }
 
 define spir_kernel void @get_sub_group_local_id(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/compute_vector_length.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/compute_vector_length.ll
index 92b783c208490..f0c0335724115 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/compute_vector_length.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/compute_vector_length.ll
@@ -41,7 +41,8 @@ define spir_kernel void @get_sub_group_size(i32 addrspace(1)* %in, i32 addrspace
 ; CHECK-F2: [[WL:%.*]] = sub {{.*}} i64 [[SZ]], [[ID]]
 ; CHECK-F2: [[VL0:%.*]] = call i64 @llvm.umin.i64(i64 [[WL]], i64 2)
 ; CHECK-F2: [[VL1:%.*]] = trunc i64 [[VL0]] to i32
-; CHECK-F2: store i32 [[VL1]], ptr addrspace(1) {{.*}}
+; CHECK-F2: [[RED:%.*]] = call i32 @__mux_sub_group_reduce_add_i32(i32 [[VL1]])
+; CHECK-F2: store i32 [[RED]], ptr addrspace(1) {{.*}}
 
 ; CHECK-S4-LABEL: define spir_kernel void @__vecz_nxv4_vp_get_sub_group_size(
 ; CHECK-S4: [[ID:%.*]] = call i64 @__mux_get_local_id(i32 0)
@@ -51,4 +52,5 @@ define spir_kernel void @get_sub_group_size(i32 addrspace(1)* %in, i32 addrspace
 ; CHECK-S4: [[VF1:%.*]] = shl i64 [[VF0]], 2
 ; CHECK-S4: [[VL0:%.*]] = call i64 @llvm.umin.i64(i64 [[WL]], i64 [[VF1]])
 ; CHECK-S4: [[VL1:%.*]] = trunc i64 [[VL0]] to i32
-; CHECK-S4: store i32 [[VL1]], ptr addrspace(1) {{.*}}
+; CHECK-S4: [[RED:%.*]] = call i32 @__mux_sub_group_reduce_add_i32(i32 [[VL1]])
+; CHECK-S4: store i32 [[RED]], ptr addrspace(1) {{.*}}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll
index 10bac3390e3c6..ed45025e3b512 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll
@@ -34,7 +34,8 @@ define spir_kernel void @get_sub_group_size(i32 addrspace(1)* %in, i32 addrspace
   store i32 %call2, i32 addrspace(1)* %arrayidx, align 4
   ret void
 ; CHECK-LABEL: define spir_kernel void @__vecz_v4_get_sub_group_size(
-; CHECK: store i32 4, ptr addrspace(1) {{.*}}
+; CHECK: [[RED:%.*]] = call i32 @__mux_sub_group_reduce_add_i32(i32 4)
+; CHECK: store i32 [[RED]], ptr addrspace(1) {{.*}}
 }
 
 define spir_kernel void @get_sub_group_local_id(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {

From 1005ff9e148c0d8a12d9e6950ee5442af0cab343 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Tue, 5 Sep 2023 19:56:13 +0100
Subject: [PATCH 033/182] [compiler] Add mux sub-group shuffle builtins

These mirror the corresponding builtins in Intel's SPV_INTEL_subgroups
SPIR-V extension.

This commit also provides support for lowering to these via the SPIR-V
SubgroupShuffleINTEL capability, as part of the SPV_INTEL_subgroups
extension.

These builtins are currently not vectorized and thus only implemented
for 'trivial' sub-groups of size 1. Support for wider sub-groups will
come in later commits.
---
 .../vecz/source/transform/packetizer.cpp      | 28 +++++++
 .../vecz/test/lit/llvm/subgroup_shuffles.ll   | 79 +++++++++++++++++++
 2 files changed, 107 insertions(+)
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffles.ll

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
index a917da773f951..ad69304848964 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -202,6 +202,13 @@ class Packetizer::Impl : public Packetizer {
   ///
   /// @return Packetized instruction.
   Value *packetizeGroupBroadcast(Instruction *I);
+  /// @brief Returns true if the instruction is a subgroup shuffle.
+  ///
+  /// @param[in] I Instruction to query.
+  ///
+  /// @return True if the instruction is a call to a mux subgroup shuffle
+  /// builtin.
+  bool isSubgroupShuffle(Instruction *I);
   /// @brief Packetize PHI node.
   ///
   /// @param[in] Phi PHI Node to packetize.
@@ -853,6 +860,13 @@ Packetizer::Result Packetizer::Impl::packetize(Value *V) {
   }
 
   auto *const Ins = cast<Instruction>(V);
+
+  // FIXME: Add support for vectorizing sub-group shuffles
+  if (isSubgroupShuffle(Ins)) {
+    emitVeczRemarkMissed(&F, Ins, "Could not packetize sub-group shuffle");
+    return Packetizer::Result(*this);
+  }
+
   if (auto *const Branch = dyn_cast<BranchInst>(Ins)) {
     if (Branch->isConditional()) {
       // varying reductions need to be packetized
@@ -1251,6 +1265,20 @@ Value *Packetizer::Impl::packetizeGroupBroadcast(Instruction *I) {
   return CI;
 }
 
+bool Packetizer::Impl::isSubgroupShuffle(Instruction *I) {
+  auto *const CI = dyn_cast<CallInst>(I);
+  if (!CI || !CI->getCalledFunction()) {
+    return false;
+  }
+  compiler::utils::BuiltinInfo &BI = Ctx.builtins();
+  Function *callee = CI->getCalledFunction();
+
+  auto const Builtin = BI.analyzeBuiltin(*callee);
+  auto const Info = BI.isMuxGroupCollective(Builtin.ID);
+
+  return Info && Info->isSubGroupScope() && Info->isShuffleLike();
+}
+
 Value *Packetizer::Impl::packetizeMaskVarying(Instruction *I) {
   if (auto memop = MemOp::get(I)) {
     auto *const mask = memop->getMaskOperand();
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffles.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffles.ll
new file mode 100644
index 0000000000000..7c109f4b48e75
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffles.ll
@@ -0,0 +1,79 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -w 4 -vecz-passes=packetizer -S \
+; RUN:   --pass-remarks-missed=vecz < %s 2>&1 | FileCheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; CHECK: Could not packetize sub-group shuffle %shuffle
+define spir_kernel void @kernel1(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+  %gid = tail call i64 @__mux_get_global_id(i32 0)
+  %size = call i32 @__mux_get_sub_group_size()
+  %size_minus_1 = sub i32 %size, 1
+  %arrayidx.in = getelementptr inbounds i64, ptr addrspace(1) %in, i64 %gid
+  %val = load i64, ptr addrspace(1) %arrayidx.in, align 8
+  %shuffle = call i64 @__mux_sub_group_shuffle_i64(i64 %val, i32 %size_minus_1)
+  %arrayidx.out = getelementptr inbounds i64, ptr addrspace(1) %out, i64 %gid
+  store i64 %shuffle, ptr addrspace(1) %arrayidx.out, align 8
+  ret void
+}
+
+; CHECK: Could not packetize sub-group shuffle %shuffle_up
+define spir_kernel void @kernel2(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+  %gid = tail call i64 @__mux_get_global_id(i32 0)
+  %arrayidx.in = getelementptr inbounds float, ptr addrspace(1) %in, i64 %gid
+  %val = load float, ptr addrspace(1) %arrayidx.in, align 8
+  %shuffle_up = call float @__mux_sub_group_shuffle_up_f32(float %val, float %val, i32 1)
+  %arrayidx.out = getelementptr inbounds float, ptr addrspace(1) %out, i64 %gid
+  store float %shuffle_up, ptr addrspace(1) %arrayidx.out, align 8
+  ret void
+}
+
+; CHECK: Could not packetize sub-group shuffle %shuffle_down
+define spir_kernel void @kernel3(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+  %gid = tail call i64 @__mux_get_global_id(i32 0)
+  %arrayidx.in = getelementptr inbounds i8, ptr addrspace(1) %in, i64 %gid
+  %val = load i8, ptr addrspace(1) %arrayidx.in, align 8
+  %shuffle_down = call i8 @__mux_sub_group_shuffle_down_i8(i8 %val, i8 %val, i32 1)
+  %arrayidx.out = getelementptr inbounds i8, ptr addrspace(1) %out, i64 %gid
+  store i8 %shuffle_down, ptr addrspace(1) %arrayidx.out, align 8
+  ret void
+}
+
+; CHECK: Could not packetize sub-group shuffle %shuffle_xor
+define spir_kernel void @kernel4(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+  %gid = tail call i64 @__mux_get_global_id(i32 0)
+  %arrayidx.in = getelementptr inbounds half, ptr addrspace(1) %in, i64 %gid
+  %val = load half, ptr addrspace(1) %arrayidx.in, align 8
+  %shuffle_xor = call half @__mux_sub_group_shuffle_xor_f16(half %val, i32 -1)
+  %arrayidx.out = getelementptr inbounds half, ptr addrspace(1) %out, i64 %gid
+  store half %shuffle_xor, ptr addrspace(1) %arrayidx.out, align 8
+  ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+
+declare i32 @__mux_get_sub_group_size()
+
+declare i64 @__mux_sub_group_shuffle_i64(i64 %val, i32 %lid)
+
+declare half @__mux_sub_group_shuffle_xor_f16(half %val, i32 %xor_val)
+
+declare i8 @__mux_sub_group_shuffle_down_i8(i8 %curr, i8 %next, i32 %delta)
+
+declare float @__mux_sub_group_shuffle_up_f32(float %prev, float %curr, i32 %delta)

From 0c377894bcb0f26a3af4664479eb31ab927d1db6 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Tue, 12 Sep 2023 15:37:24 +0100
Subject: [PATCH 034/182] [compiler] Vectorize to any required sub-group size

This updates users of the vectorizer to set the desired vectorization
factor according to the kernel's required sub-group size, if present. It
does so using a common method, to ensure consistent behaviour. This
helper method assumes that the current underlying 'mux' sub-group size
is 1, as we haven't got any in-tree targets that make use of anything
larger than that.

All targets give the required sub-group size lower priority than the
`optnone` or `-cl-wfv=never` options. This can be revisited in the
future if users desire different behaviour.
---
 .../compiler_passes/vecz/include/vecz/pass.h  |  3 +++
 .../compiler_passes/vecz/source/pass.cpp      | 21 +++++++++++++++++++
 2 files changed, 24 insertions(+)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/pass.h
index 17c429b9b0c7c..5b29762b15b6a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/pass.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/pass.h
@@ -25,6 +25,7 @@
 #include <llvm/IR/PassManager.h>
 
 #include <cstdint>
+#include <optional>
 
 #include "vecz/vecz_choices.h"
 
@@ -65,6 +66,8 @@ struct VeczPassOptions {
   uint64_t local_size;
 };
 
+std::optional<VeczPassOptions> getReqdSubgroupSizeOpts(llvm::Function &);
+
 /// @brief Analysis pass which determines on which functions @ref RunVeczPass
 /// should operate.
 class VeczPassOptionsAnalysis
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp
index 4f532876c1ad1..654d56d89b8ff 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp
@@ -30,6 +30,7 @@
 
 #include <cstdlib>
 #include <functional>
+#include <optional>
 #include <tuple>
 
 #include "vectorization_context.h"
@@ -250,4 +251,24 @@ PreservedAnalyses VeczPassOptionsPrinterPass::run(Module &M,
   return PreservedAnalyses::all();
 }
 
+std::optional<VeczPassOptions> getReqdSubgroupSizeOpts(Function &F) {
+  if (auto reqd_sg_size = compiler::utils::getReqdSubgroupSize(F)) {
+    vecz::VeczPassOptions vecz_opts;
+    // Disable auto - we want a specific width
+    vecz_opts.vecz_auto = false;
+    vecz_opts.vec_dim_idx = 0;
+    // If we can't vectorize to the required sub-group size then we must bail.
+    if (*reqd_sg_size % compiler::utils::getMuxSubgroupSize(F)) {
+      return std::nullopt;
+    }
+    // Else we must vectorize such that we multiply the existing mux sub-group
+    // size up to the required one.
+    vecz_opts.factor = compiler::utils::VectorizationFactor::getFixedWidth(
+        *reqd_sg_size / compiler::utils::getMuxSubgroupSize(F));
+    vecz_opts.choices.enable(vecz::VectorizationChoices::eDivisionExceptions);
+    return vecz_opts;
+  }
+  return std::nullopt;
+}
+
 }  // namespace vecz

From 5c5476657f807a05e8511b025c017adbf24ecd71 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Wed, 13 Sep 2023 18:45:54 +0100
Subject: [PATCH 035/182] [compiler] Preserve 'entry point' on vectorized
 kernels

This fixes a long-standing hack in the vectorizer where it would strip
the vectorized kernels of their 'kernel' status.

This wasn't anything to do with vectorization but was a legacy behaviour
owing to the fact that the vectorizer used to replace the old functions.
The `WorkItemLoopsPass` would in turn wrap every kernel it came across,
and we didn't want it wrapping up the old 'scalar' kernel since we'd
never call it: this would negatively impact compile time for no reason.

However, with the revised sub-groups model and to handle the case where
the target is not using degenerate sub-groups, we *do* want to wrap up
the unvectorized scalar kernel as it is useful as a fallback in the same
way the degenerate sub-group kernel is.

To avoid compiling unnecessary kernel wrappers, the WorkItemLoopsPass
has been taught to avoid wrapping scalar kernels if they don't use
sub-group builtins, or if they're degenerate, or if the kernel has a
required sub-group size. This should preserve the old behaviour in the
vast majority of cases.

The `VerifyReqdSubGroupSizeSatisfiedPass` must now be run later in the
pipeline, after the `WorkItemLoopsPass` has run. This is because
immediately after vectorization the old 'scalar' kernel appears as
though the required sub-group size has not been met. It is only once the
`WorkItemLoopsPass` has run and vectorized and unvectorized kernels have
been merged that we have our 'final' kernel forms, and we can glean
whether the compiler correctly satisfied the constraints.

We could alternatively make the pass look at all the vectorized forms of
each kernel and verify that *at least one* form of each kernel has met
the requirement, but this ties its behaviour more closely with the
vectorizer's output, whereas currently it's fairly agnostic: it runs
over all kernel entry points.
---
 .../compiler_passes/vecz/source/pass.cpp              | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp
index 654d56d89b8ff..df1323414f524 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp
@@ -168,24 +168,15 @@ PreservedAnalyses RunVeczPass::run(Module &M, ModuleAnalysisManager &MAM) {
 
   // Fix up the metadata and clean out any dead kernels
   for (auto &P : Results) {
-    Function *Fn = P.first;
     auto &Result = P.second;
-    bool const IsKernel = compiler::utils::isKernel(*Fn);
-    bool DropScalarMDs = IsKernel && !Result.empty();
     for (auto &R : Result) {
       VectorizationUnit *VU = R.first;
       trackVeczSuccessFailure(*VU);
       if (!createVectorizedFunctionMetadata(*VU)) {
-        // We only drop the metadata from the scalar kernel when the number of
-        // Results is non-zero and they all succeeded
-        DropScalarMDs = false;
-        LLVM_DEBUG(dbgs() << Fn->getName() << " failed to vectorize\n");
+        LLVM_DEBUG(dbgs() << P.first->getName() << " failed to vectorize\n");
         eraseFailed(VU);
       }
     }
-    if (DropScalarMDs) {
-      compiler::utils::dropIsKernel(*Fn);
-    }
   }
   return PreservedAnalyses::none();
 }

From a6b43d5897dddbf464dabd1fb4aecbaee34bddab Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Thu, 14 Sep 2023 13:11:39 +0100
Subject: [PATCH 036/182] [vecz] Vectorize to the reqd sub-group size under
 'auto'

---
 .../compiler_passes/vecz/source/pass.cpp      |  8 +++
 .../vecz/test/lit/llvm/reqd-sg-size-auto.ll   | 55 +++++++++++++++++++
 2 files changed, 63 insertions(+)
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/reqd-sg-size-auto.ll

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp
index df1323414f524..d84a4e6980817 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp
@@ -128,6 +128,14 @@ PreservedAnalyses RunVeczPass::run(Module &M, ModuleAnalysisManager &MAM) {
     ResultTy T;
     Results.insert(std::make_pair(Fn, std::move(T)));
     for (auto &Opts : P.second) {
+      // If we've been given an auto width, try and fit it to any requirements
+      // that the kernel places on its sub-groups.
+      if (Opts.vecz_auto) {
+        if (auto ReqdSGOpts = getReqdSubgroupSizeOpts(*Fn)) {
+          Opts = *ReqdSGOpts;
+        }
+      }
+
       auto *const VU =
           createVectorizationUnit(Ctx, Fn, Opts, Mach.getFAM(), Check);
       if (!VU) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/reqd-sg-size-auto.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/reqd-sg-size-auto.ll
new file mode 100644
index 0000000000000..b9d95d33360e1
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/reqd-sg-size-auto.ll
@@ -0,0 +1,55 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; Let vecz pick the right vectorization factor for this kernel
+; RUN: veczc --vecz-auto -k bar_sg8 -k foo_sg13 -S < %s | FileCheck %s
+; RUN: veczc --vecz-auto -k bar_sg8:4 -k foo_sg13:8 -S < %s | FileCheck %s
+
+; Check we auto-vectorize to 8, despite any other options telling us a
+; different vectorization factor.
+; CHECK: define void @__vecz_v8_bar_sg8
+define void @bar_sg8(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 !intel_reqd_sub_group_size !0 {
+  %id = call i64 @__mux_get_global_id(i32 0)
+  %in.addr = getelementptr i32, ptr addrspace(1) %in, i64 %id
+  %x = load i32, ptr addrspace(1) %in.addr
+; CHECK: = add <8 x i32>
+  %y = add i32 %x, 1
+  %out.addr = getelementptr i32, ptr addrspace(1) %out, i64 %id
+  store i32 %y, ptr addrspace(1) %out.addr
+  ret void
+}
+
+; Check we auto-vectorize to 13, despite any other options telling us a
+; different vectorization factor. This is a silly number but it if we're told
+; to do it we must obey.
+; CHECK: define void @__vecz_v13_foo_sg13
+define void @foo_sg13(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 !intel_reqd_sub_group_size !1 {
+  %id = call i64 @__mux_get_global_id(i32 0)
+  %in.addr = getelementptr i32, ptr addrspace(1) %in, i64 %id
+  %x = load i32, ptr addrspace(1) %in.addr
+; CHECK: = add <13 x i32>
+  %y = add i32 %x, 1
+  %out.addr = getelementptr i32, ptr addrspace(1) %out, i64 %id
+  store i32 %y, ptr addrspace(1) %out.addr
+  ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+
+attributes #0 = { "mux-kernel"="entry-point" }
+
+!0 = !{i32 8}
+!1 = !{i32 13}

From 3bae73be19b6b255ff54cede919c900670a78d3f Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Thu, 14 Sep 2023 14:55:38 +0100
Subject: [PATCH 037/182] [riscv] Vectorize sub-group functions to a device
 size

This changes how the riscv/refsi targets handle sub-group sizes. The
riscv/refsi targets now automatically vectorize kernels/functions
containing explicit use of sub-groups to one of the device's advertised
sub-group widths.

These targets no longer run the `DegenerateSubGroupPass`!

The rationale for this is as follows: The SYCL 2020 spec has
`info::device::sub_group_sizes` which returns "the" list of sub-group
sizes reported by the device. The CTS expects that if it runs a kernel
which (for example) returns the kernel's sub-group size to the host,
then that size is found in that same device list.

This rules out the use of degenerate sub-groups. We'd have to list all
the sub-group (read: work-group) sizes from 1 to 1024, which is useless
for users. All of those sub-group sizes could then, in turn, be required
by a user, and the compiler would effectively be unable to satisfy them.
A user requiring a sub-group size of 244 would lead to a very strange
situation where the compiler must create a degenerate sub-groups kernel
and would then just hope that the user uses a work-group size of 244.
Anything else would be invalid. This is another bad user experience.

The vectorizer can be asked to return a suitable vectorization factor
for a kernel, given its use of sub-groups and the device's list of
sub-group sizes. It is quite rudimentary in its heuristic and doesn't
choose between legal powers of two: the first one in the device's list
is taken as the preferred one.

I've filed a ticket for clarification/questions about sub-groups and
degenerate sub-groups against the SYCL spec, but I think this is a
suitable "workaround" for now, or even for the long term. The behaviour
is arguably more intuitive for users.
---
 .../compiler_passes/vecz/include/vecz/pass.h  | 16 +++
 .../compiler_passes/vecz/source/pass.cpp      | 99 ++++++++++++++++++-
 .../vecz/test/lit/llvm/device-sg-size-auto.ll | 58 +++++++++++
 .../vecz/tools/source/veczc.cpp               | 11 +++
 4 files changed, 181 insertions(+), 3 deletions(-)
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/device-sg-size-auto.ll

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/pass.h
index 5b29762b15b6a..7d439fa8c4f06 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/pass.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/pass.h
@@ -66,8 +66,24 @@ struct VeczPassOptions {
   uint64_t local_size;
 };
 
+/// @brief Returns the vectorization options that would vectorize the provided
+/// function to its required sub-group size.
 std::optional<VeczPassOptions> getReqdSubgroupSizeOpts(llvm::Function &);
 
+/// @brief Returns the vectorization options that would vectorize the provided
+/// function to its required sub-group size (if set) or one of the device's
+/// sub-group sizes.
+///
+/// Only returns options if the function uses sub-group operations, as
+/// determined by the SubGroupAnalysis pass.
+///
+/// Tries to find a good fit that produces one of the device's sub-group sizes,
+/// preferring ones which fit the known local work-group size and powers of
+/// two. The device's sub-group sizes can be sorted such that preferable sizes
+/// are placed towards the front.
+std::optional<VeczPassOptions> getAutoSubgroupSizeOpts(
+    llvm::Function &, llvm::ModuleAnalysisManager &);
+
 /// @brief Analysis pass which determines on which functions @ref RunVeczPass
 /// should operate.
 class VeczPassOptionsAnalysis
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp
index d84a4e6980817..a9265876a6c0a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp
@@ -20,6 +20,7 @@
 #include <compiler/utils/builtin_info.h>
 #include <compiler/utils/device_info.h>
 #include <compiler/utils/metadata.h>
+#include <compiler/utils/sub_group_analysis.h>
 #include <compiler/utils/vectorization_factor.h>
 #include <llvm/IR/DebugInfoMetadata.h>
 #include <llvm/IR/LLVMContext.h>
@@ -129,10 +130,10 @@ PreservedAnalyses RunVeczPass::run(Module &M, ModuleAnalysisManager &MAM) {
     Results.insert(std::make_pair(Fn, std::move(T)));
     for (auto &Opts : P.second) {
       // If we've been given an auto width, try and fit it to any requirements
-      // that the kernel places on its sub-groups.
+      // that the kernel/device places on its sub-groups.
       if (Opts.vecz_auto) {
-        if (auto ReqdSGOpts = getReqdSubgroupSizeOpts(*Fn)) {
-          Opts = *ReqdSGOpts;
+        if (auto AutoSGOpts = getAutoSubgroupSizeOpts(*Fn, MAM)) {
+          Opts = *AutoSGOpts;
         }
       }
 
@@ -270,4 +271,96 @@ std::optional<VeczPassOptions> getReqdSubgroupSizeOpts(Function &F) {
   return std::nullopt;
 }
 
+std::optional<VeczPassOptions> getAutoSubgroupSizeOpts(
+    Function &F, ModuleAnalysisManager &AM) {
+  // If there's a required sub-group size, we must return a vectorization
+  // factor that gets us there.
+  if (auto opts = getReqdSubgroupSizeOpts(F)) {
+    return opts;
+  }
+
+  auto &M = *F.getParent();
+  const auto &GSGI = AM.getResult<compiler::utils::SubgroupAnalysis>(M);
+
+  // If the function doesn't use sub-groups (from the user's perspective) then
+  // we don't need to adhere to a specific sub-group size.
+  if (!GSGI.usesSubgroups(F)) {
+    return std::nullopt;
+  }
+
+  // Use the device's sub-group sizes to determine which to vectorize to.
+  auto &DI = AM.getResult<compiler::utils::DeviceInfoAnalysis>(M);
+
+  // We don't force devices to support any sub-group sizes.
+  if (DI.reqd_sub_group_sizes.empty()) {
+    return std::nullopt;
+  }
+
+  vecz::VeczPassOptions vecz_opts;
+  vecz_opts.vec_dim_idx = 0;
+  // Disable auto - we want a specific width
+  vecz_opts.vecz_auto = false;
+  // Enable some default choices
+  vecz_opts.choices.enable(vecz::VectorizationChoices::eDivisionExceptions);
+
+  // Now try and choose the best width.
+  std::optional<unsigned> best_width;
+  auto const mux_sub_group_size = compiler::utils::getMuxSubgroupSize(F);
+
+  auto can_produce_legal_width = [&mux_sub_group_size](unsigned size) {
+    // We only support vectorization widths where there's a clean multiple, and
+    // we can vectorize *up* to the desired size - we can't shrink the
+    // sub-group size by vectorizing.
+    return size >= mux_sub_group_size && (size % mux_sub_group_size) == 0;
+  };
+
+  for (auto size : DI.reqd_sub_group_sizes) {
+    if (!can_produce_legal_width(size)) {
+      continue;
+    }
+    unsigned const candidate_width = size / mux_sub_group_size;
+    // Try and choose at least one width.
+    if (!best_width) {
+      best_width = candidate_width;
+      continue;
+    }
+
+    // Prefer non-scalar widths.
+    if (best_width == 1 && candidate_width > 1) {
+      best_width = candidate_width;
+      continue;
+    }
+
+    // If we have a required work-group size, prefer one that will fit well
+    // with that.
+    if (auto wgs = compiler::utils::parseRequiredWGSMetadata(F)) {
+      uint64_t local_size_x = wgs.value()[0];
+      bool const best_fits = !(local_size_x % *best_width);
+      bool const cand_fits = !(local_size_x % candidate_width);
+      if (!best_fits && cand_fits) {
+        best_width = candidate_width;
+        continue;
+      } else if (best_fits && !cand_fits) {
+        continue;
+      }
+    }
+
+    // Else, prefer powers of two.
+    if (!isPowerOf2_32(*best_width) && isPowerOf2_32(candidate_width)) {
+      best_width = candidate_width;
+      continue;
+    }
+  }
+
+  // Return nothing if we couldn't find a good, legal, width.
+  if (!best_width) {
+    return std::nullopt;
+  }
+
+  vecz_opts.factor =
+      compiler::utils::VectorizationFactor::getFixedWidth(*best_width);
+
+  return vecz_opts;
+}
+
 }  // namespace vecz
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/device-sg-size-auto.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/device-sg-size-auto.ll
new file mode 100644
index 0000000000000..48f17235bccf7
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/device-sg-size-auto.ll
@@ -0,0 +1,58 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; Let vecz pick the right vectorization factor for this kernel
+; RUN: veczc --vecz-auto -k foo -k bar --device-sg-sizes 6,7,8,9 -S < %s | FileCheck %s
+; RUN: veczc --vecz-auto -k foo:4 -k bar:4 --device-sg-sizes 6,7,8,9 -S < %s | FileCheck %s
+
+; Check we auto-vectorize to 8, despite any other options telling us a
+; different vectorization factor. A factor of 8 is 'best' here because it's a
+; power of two.
+; CHECK: define void @__vecz_v8_foo(
+define void @foo(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+  %id = call i64 @__mux_get_global_id(i32 0)
+  %in.addr = getelementptr i32, ptr addrspace(1) %in, i64 %id
+  %x = load i32, ptr addrspace(1) %in.addr
+  %sglid = call i32 @__mux_get_sub_group_local_id()
+; CHECK: = add <8 x i32>
+  %y = add i32 %x, %sglid
+  %out.addr = getelementptr i32, ptr addrspace(1) %out, i64 %id
+  store i32 %y, ptr addrspace(1) %out.addr
+  ret void
+}
+
+; Check we auto-vectorize to 7, despite any other options telling us a
+; different vectorization factor. A factor of 8 is 'best' here because it's a
+; power of two, but a factor of 7 works well because it won't need a tail.
+; CHECK: define void @__vecz_v7_bar(
+define void @bar(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 !reqd_work_group_size !0 {
+  %id = call i64 @__mux_get_global_id(i32 0)
+  %in.addr = getelementptr i32, ptr addrspace(1) %in, i64 %id
+  %x = load i32, ptr addrspace(1) %in.addr
+  %sglid = call i32 @__mux_get_sub_group_local_id()
+; CHECK: = add <7 x i32>
+  %y = add i32 %x, %sglid
+  %out.addr = getelementptr i32, ptr addrspace(1) %out, i64 %id
+  store i32 %y, ptr addrspace(1) %out.addr
+  ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+declare i32 @__mux_get_sub_group_local_id()
+
+attributes #0 = { "mux-kernel"="entry-point" }
+
+!0 = !{i64 14, i64 1, i64 1}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp
index d64675142518c..59e17c72c35ce 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp
@@ -19,6 +19,7 @@
 #include <compiler/utils/metadata.h>
 #include <compiler/utils/optimal_builtin_replacement_pass.h>
 #include <compiler/utils/pass_machinery.h>
+#include <compiler/utils/sub_group_analysis.h>
 #include <compiler/utils/vectorization_factor.h>
 #include <llvm/ADT/Statistic.h>
 #include <llvm/ADT/StringSwitch.h>
@@ -111,6 +112,11 @@ static llvm::cl::opt<bool> DoubleSupport(
     llvm::cl::desc(
         "Assume the target has double-precision floating point support"));
 
+static llvm::cl::list<unsigned> SGSizes(
+    "device-sg-sizes",
+    llvm::cl::desc("Comma-separated list of supported sub-group sizes"),
+    llvm::cl::CommaSeparated);
+
 static llvm::TargetMachine *initLLVMTarget(llvm::StringRef triple_string,
                                            llvm::StringRef cpu_model,
                                            llvm::StringRef target_features) {
@@ -372,10 +378,15 @@ int main(const int argc, const char *const argv[]) {
       [&] { return vecz::TargetInfoAnalysis(TICallback); });
   passMach.getMAM().registerPass(
       [&] { return compiler::utils::BuiltinInfoAnalysis(); });
+  passMach.getMAM().registerPass(
+      [&] { return compiler::utils::SubgroupAnalysis(); });
   passMach.getFAM().registerPass([] { return llvm::TargetIRAnalysis(); });
   passMach.getMAM().registerPass([] {
     compiler::utils::DeviceInfo Info{/*half*/ 0, /*float*/ 0, DoubleSupport,
                                      /*MaxWorthWidth*/ 64};
+    for (const auto S : SGSizes) {
+      Info.reqd_sub_group_sizes.push_back(S);
+    }
     return compiler::utils::DeviceInfoAnalysis(Info);
   });
   passMach.getMAM().registerPass([&kernelOpts] {

From 449c0fe58b0d7b7bb955c866f87b8f875a51c5a6 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Wed, 20 Sep 2023 13:39:29 +0100
Subject: [PATCH 038/182] [vecz] Fix alignment of load operations

We were alternately under- and over-setting the alignment on
vectorized loads. Under-setting the alignment is still correct, if
sub-optimal.

However, the `SimplifyMaskedMemOpsPass` was ignoring the alignment of
the scalar masked mem load/store builtins when optimizing to regular
LLVM load and store instructions. This would over-set the alignment,
meaning the compiler could assume properties about the address which
were not true.

This led to riscv reporting a misaligned store exception on some SYCL
CTS tests.
---
 .../vecz/include/vecz/vecz_target_info.h      |  2 +
 .../vecz/source/transform/packetizer.cpp      |  4 +-
 .../vecz/source/transform/passes.cpp          | 11 +++--
 .../vecz/source/vector_target_info.cpp        | 10 ++---
 .../llvm/ScalableVectors/broadcast_vector.ll  |  6 +--
 .../insertelement_runtime_index.ll            |  2 +-
 .../test/lit/llvm/VectorWidening/widen_abs.ll |  2 +-
 .../lit/llvm/VectorWidening/widen_binops.ll   |  8 ++--
 .../lit/llvm/VectorWidening/widen_copysign.ll |  4 +-
 .../test/lit/llvm/VectorWidening/widen_fma.ll | 12 +++---
 .../lit/llvm/VectorWidening/widen_fmuladd.ll  | 12 +++---
 .../llvm/VectorWidening/widen_fmuladd_phi.ll  | 12 +++---
 .../test/lit/llvm/simplify-masked-memops.ll   | 42 +++++++++++++++++++
 13 files changed, 87 insertions(+), 40 deletions(-)
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/simplify-masked-memops.ll

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_target_info.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_target_info.h
index 74d22c2391b53..3b66d21951ac4 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_target_info.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_target_info.h
@@ -79,6 +79,7 @@ class TargetInfo {
   /// @param[in] ptr Memory address to load a vector value from.
   /// @param[in] stride Distance in elements between two lanes in memory.
   ///                     A stride of one represents a contiguous load.
+  /// @param[in] alignment The alignment of the load, in bytes
   /// @param[in] evl 'effective vector length' of the operation. Must be
   /// pre-scaled for vector operations. If null, the operation is unpredicated:
   /// it is executed on all lanes.
@@ -86,6 +87,7 @@ class TargetInfo {
   /// @return IR value that results from the vector load.
   virtual llvm::Value *createLoad(llvm::IRBuilder<> &builder, llvm::Type *ty,
                                   llvm::Value *ptr, llvm::Value *stride,
+                                  unsigned alignment,
                                   llvm::Value *evl = nullptr) const;
 
   /// @brief Create a vector store. If a stride greater than one is used, the
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
index ad69304848964..64d945887074f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -2158,8 +2158,8 @@ ValuePacket Packetizer::Impl::packetizeMemOp(MemOp &op) {
             ptr = B.CreateInBoundsGEP(dataTy, ptr, packetStride,
                                       Twine(name, ".incr"));
           }
-          results.push_back(
-              VTI.createLoad(B, getWideType(dataTy, factor), ptr, one, EVL));
+          results.push_back(VTI.createLoad(B, getWideType(dataTy, factor), ptr,
+                                           one, alignment, EVL));
         }
       } else {
         auto *const one = B.getInt64(1);
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/passes.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/passes.cpp
index 684082f7a2411..2c99f5e6af665 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/passes.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/passes.cpp
@@ -123,6 +123,7 @@ PreservedAnalyses SimplifyMaskedMemOpsPass::run(Function &F,
         Value *Data = MaskedOp->getDataOperand();
         Value *Ptr = MaskedOp->getPointerOperand();
         Type *DataTy = MaskedOp->getDataType();
+        auto Alignment = BuiltinDesc->getAlignment();
         if (MaskedOp->isLoad()) {
           Value *Load = nullptr;
           if (DataTy->isVectorTy()) {
@@ -133,11 +134,13 @@ PreservedAnalyses SimplifyMaskedMemOpsPass::run(Function &F,
             if (isa<ScalableVectorType>(DataTy)) {
               continue;
             }
-            Load = VTI.createLoad(B, CI->getType(), Ptr, B.getInt64(1));
+            Load =
+                VTI.createLoad(B, CI->getType(), Ptr, B.getInt64(1), Alignment);
           } else {
-            Load = B.CreateLoad(CI->getType(), Ptr, /*isVolatile*/ false,
-                                CI->getName());
+            Load = B.CreateAlignedLoad(CI->getType(), Ptr, Align(Alignment),
+                                       /*isVolatile*/ false, CI->getName());
           }
+          Load->takeName(CI);
           CI->replaceAllUsesWith(Load);
         } else {
           if (DataTy->isVectorTy()) {
@@ -151,7 +154,7 @@ PreservedAnalyses SimplifyMaskedMemOpsPass::run(Function &F,
             VTI.createStore(B, Data, Ptr, B.getInt64(1),
                             BuiltinDesc->getAlignment());
           } else {
-            B.CreateStore(Data, Ptr);
+            B.CreateAlignedStore(Data, Ptr, Align(Alignment));
           }
         }
         ToDelete.push_back(CI);
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
index 81db2d7be2f49..7a3d3d74dd8eb 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
@@ -73,7 +73,8 @@ bool isLegalMaskedScatter(const TargetTransformInfo &TTI, Type *Ty,
 TargetInfo::TargetInfo(TargetMachine *tm) : TM_(tm) {}
 
 Value *TargetInfo::createLoad(IRBuilder<> &B, Type *Ty, Value *Ptr,
-                              Value *Stride, Value *EVL) const {
+                              Value *Stride, unsigned Alignment,
+                              Value *EVL) const {
   if (!Ptr || !Stride || !Ty->isVectorTy()) {
     return nullptr;
   }
@@ -90,10 +91,9 @@ Value *TargetInfo::createLoad(IRBuilder<> &B, Type *Ty, Value *Ptr,
   PointerType *VecPtrTy = Ty->getPointerTo(PtrTy->getAddressSpace());
   Value *VecPtr = B.CreateBitCast(Ptr, VecPtrTy);
   if (CIntStride && CIntStride->getSExtValue() == 1) {
-    unsigned Align = EleTy->getScalarSizeInBits() / 8;
     if (EVL) {
       const Function *F = B.GetInsertBlock()->getParent();
-      auto const Legality = isVPLoadLegal(F, Ty, Align);
+      auto const Legality = isVPLoadLegal(F, Ty, Alignment);
       if (!Legality.isVPLegal()) {
         emitVeczRemarkMissed(F,
                              "Could not create a VP load as the target "
@@ -106,7 +106,7 @@ Value *TargetInfo::createLoad(IRBuilder<> &B, Type *Ty, Value *Ptr,
       SmallVector<llvm::Type *, 2> Tys = {Ty, VecPtr->getType()};
       return B.CreateIntrinsic(llvm::Intrinsic::vp_load, Tys, Args);
     }
-    return B.CreateAlignedLoad(Ty, VecPtr, MaybeAlign(Align));
+    return B.CreateAlignedLoad(Ty, VecPtr, MaybeAlign(Alignment));
   }
 
   if (EVL) {
@@ -1112,7 +1112,7 @@ bool TargetInfo::optimizeInterleavedGroup(IRBuilder<> &B,
       }
       Value *Load = nullptr;
       if (!HasMask) {
-        Load = createLoad(B, VecTy, AddressN, getSizeInt(B, 1));
+        Load = createLoad(B, VecTy, AddressN, getSizeInt(B, 1), Align);
       } else {
         Value *Mask = VecMasks[i];
         Load =
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll
index 3eda351c2d3c2..9c1eee69775f4 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll
@@ -115,7 +115,7 @@ entry:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 16 x float> @llvm.masked.gather.nxv16f32.nxv16p0(<vscale x 16 x ptr> [[VEC_ALLOC]], i32 4, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, {{(i32|i64)}} 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x float> undef)
 ; CHECK-NEXT:    [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0)
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]]
-; CHECK-NEXT:    [[TMP3:%.*]] = load <vscale x 16 x float>, ptr addrspace(1) [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <vscale x 16 x float>, ptr addrspace(1) [[ARRAYIDX]], align 16
 ; CHECK-NEXT:    [[TMP4:%.*]] = fadd <vscale x 16 x float> [[TMP3]], [[TMP1]]
 ; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]]
 ; CHECK-NEXT:    store <vscale x 16 x float> [[TMP4]], ptr addrspace(1) [[ARRAYIDX3]], align 16
@@ -125,7 +125,7 @@ entry:
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0)
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 16 x i32>, ptr addrspace(1) [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 16 x i32>, ptr addrspace(1) [[ARRAYIDX]], align 16
 ; CHECK-NEXT:    [[AND1_I_I_I1_I1:%.*]] = and <vscale x 16 x i32> [[TMP1]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 2139095040, {{i32|i64}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
 ; CHECK-NEXT:    [[CMP_I_I_I2_I2:%.*]] = icmp ne <vscale x 16 x i32> [[AND1_I_I_I1_I1]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 2139095040, {{i32|i64}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
 ; CHECK-NEXT:    [[AND2_I_I_I3_I3:%.*]] = and <vscale x 16 x i32> [[TMP1]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 8388607, {{i32|i64}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
@@ -160,7 +160,7 @@ entry:
 ; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[OUT2:%.*]], i64 [[CALL]]
 ; CHECK-NEXT:    store <vscale x 16 x i32> [[TMP3]], ptr addrspace(1) [[ARRAYIDX4]], align 16
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]]
-; CHECK-NEXT:    [[TMP6:%.*]] = load <vscale x 16 x float>, ptr addrspace(1) [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = load <vscale x 16 x float>, ptr addrspace(1) [[ARRAYIDX]], align 16
 ; CHECK-NEXT:    [[V46:%.*]] = fadd <vscale x 16 x float> [[TMP6]], [[TMP1]]
 ; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]]
 ; CHECK-NEXT:    store <vscale x 16 x float> [[V46]], ptr addrspace(1) [[ARRAYIDX3]], align 16
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_runtime_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_runtime_index.ll
index 095175f4331a1..89ea36ef5057a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_runtime_index.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_runtime_index.ll
@@ -38,7 +38,7 @@ entry:
 
 ; CHECK: define spir_kernel void @__vecz_v4_runtime_index
 
-; CHECK: %[[INTO:.+]]  = load <16 x i32>, ptr %arrayidx, align 4
+; CHECK: %[[INTO:.+]]  = load <16 x i32>, ptr %arrayidx, align 16
 ; CHECK: %[[LD:.+]] = load <4 x i32>, ptr
 ; CHECK: %[[ADD:.+]] = add <4 x i32> %[[LD]], <i32 0, i32 4, i32 8, i32 12>
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_abs.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_abs.ll
index 3b6f26e35e548..99119d074b41a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_abs.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_abs.ll
@@ -62,7 +62,7 @@ entry:
 ; CHECK: %idx = call i64 @__mux_get_global_id(i32 0)
 ; CHECK: %a = getelementptr <2 x i32>, ptr %pa, i64 %idx
 ; CHECK: %b = getelementptr <2 x i32>, ptr %pb, i64 %idx
-; CHECK: %[[T0:.*]] = load <8 x i32>, ptr %a, align 4
+; CHECK: %[[T0:.*]] = load <8 x i32>, ptr %a, align 8
 ; CHECK: %[[RES2:.+]] = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %[[T0]], i1 true)
 ; CHECK: store <8 x i32> %[[RES2]], ptr %b, align 8
 ; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_binops.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_binops.ll
index 401d6cf336fc6..66dc8c706779a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_binops.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_binops.ll
@@ -41,10 +41,10 @@ entry:
 
 ; It checks that the zexts and add of <4 x i32> gets widened by a factor of 8,
 ; to produce PAIRs of <16 x i32>s.
-; CHECK: %[[LDA0:.+]] = load <16 x i32>, ptr %{{.+}}, align 4
-; CHECK: %[[LDA1:.+]] = load <16 x i32>, ptr %{{.+}}, align 4
-; CHECK: %[[LDB0:.+]] = load <16 x i32>, ptr %{{.+}}, align 4
-; CHECK: %[[LDB1:.+]] = load <16 x i32>, ptr %{{.+}}, align 4
+; CHECK: %[[LDA0:.+]] = load <16 x i32>, ptr %{{.+}}, align 16
+; CHECK: %[[LDA1:.+]] = load <16 x i32>, ptr %{{.+}}, align 16
+; CHECK: %[[LDB0:.+]] = load <16 x i32>, ptr %{{.+}}, align 16
+; CHECK: %[[LDB1:.+]] = load <16 x i32>, ptr %{{.+}}, align 16
 ; CHECK: %[[XA0:.+]] = zext <16 x i32> %[[LDA0]] to <16 x i64>
 ; CHECK: %[[XA1:.+]] = zext <16 x i32> %[[LDA1]] to <16 x i64>
 ; CHECK: %[[XB0:.+]] = zext <16 x i32> %[[LDB0]] to <16 x i64>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_copysign.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_copysign.ll
index 33f9ac40e5465..4eccec152d9e2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_copysign.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_copysign.ll
@@ -69,8 +69,8 @@ entry:
 ; CHECK: %a = getelementptr <2 x float>, ptr %pa, i64 %idx
 ; CHECK: %b = getelementptr <2 x float>, ptr %pb, i64 %idx
 ; CHECK: %c = getelementptr <2 x float>, ptr %pc, i64 %idx
-; CHECK: [[T0:%.*]] = load <8 x float>, ptr %a, align 4
-; CHECK: [[T1:%.*]] = load <8 x float>, ptr %b, align 4
+; CHECK: [[T0:%.*]] = load <8 x float>, ptr %a, align 8
+; CHECK: [[T1:%.*]] = load <8 x float>, ptr %b, align 8
 ; CHECK: %res1 = call <8 x float> @llvm.copysign.v8f32(<8 x float> [[T0]], <8 x float> [[T1]])
 ; CHECK: store <8 x float> %res1, ptr %c, align 8
 ; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fma.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fma.ll
index 75f0fd0a8a6f3..14d681721b6a2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fma.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fma.ll
@@ -43,12 +43,12 @@ declare <4x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
 
 ; It checks that the fma intrinsic of <4 x float> gets widened by a factor of 8,
 ; to produce a PAIR of <16 x float>s.
-; CHECK: %[[LDA0:.+]] = load <16 x float>, ptr %{{.+}}, align 4
-; CHECK: %[[LDA1:.+]] = load <16 x float>, ptr %{{.+}}, align 4
-; CHECK: %[[LDB0:.+]] = load <16 x float>, ptr %{{.+}}, align 4
-; CHECK: %[[LDB1:.+]] = load <16 x float>, ptr %{{.+}}, align 4
-; CHECK: %[[LDC0:.+]] = load <16 x float>, ptr %{{.+}}, align 4
-; CHECK: %[[LDC1:.+]] = load <16 x float>, ptr %{{.+}}, align 4
+; CHECK: %[[LDA0:.+]] = load <16 x float>, ptr %{{.+}}, align 16
+; CHECK: %[[LDA1:.+]] = load <16 x float>, ptr %{{.+}}, align 16
+; CHECK: %[[LDB0:.+]] = load <16 x float>, ptr %{{.+}}, align 16
+; CHECK: %[[LDB1:.+]] = load <16 x float>, ptr %{{.+}}, align 16
+; CHECK: %[[LDC0:.+]] = load <16 x float>, ptr %{{.+}}, align 16
+; CHECK: %[[LDC1:.+]] = load <16 x float>, ptr %{{.+}}, align 16
 ; CHECK: %[[FMA0:.+]] = call <16 x float> @llvm.fma.v16f32(<16 x float> %[[LDA0]], <16 x float> %[[LDB0]], <16 x float> %[[LDC0]])
 ; CHECK: %[[FMA1:.+]] = call <16 x float> @llvm.fma.v16f32(<16 x float> %[[LDA1]], <16 x float> %[[LDB1]], <16 x float> %[[LDC1]])
 ; CHECK: store <16 x float> %[[FMA0]], ptr %{{.+}}, align 16
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd.ll
index 3460f67581d3d..1f60ef3fd04d2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd.ll
@@ -43,12 +43,12 @@ declare <4x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>)
 
 ; It checks that the fmuladd intrinsic of <4 x float> gets widened by a factor of 8,
 ; to produce a PAIR of <16 x float>s.
-; CHECK: %[[LDA0:.+]] = load <16 x float>, ptr %{{.+}}, align 4
-; CHECK: %[[LDA1:.+]] = load <16 x float>, ptr %{{.+}}, align 4
-; CHECK: %[[LDB0:.+]] = load <16 x float>, ptr %{{.+}}, align 4
-; CHECK: %[[LDB1:.+]] = load <16 x float>, ptr %{{.+}}, align 4
-; CHECK: %[[LDC0:.+]] = load <16 x float>, ptr %{{.+}}, align 4
-; CHECK: %[[LDC1:.+]] = load <16 x float>, ptr %{{.+}}, align 4
+; CHECK: %[[LDA0:.+]] = load <16 x float>, ptr %{{.+}}, align 16
+; CHECK: %[[LDA1:.+]] = load <16 x float>, ptr %{{.+}}, align 16
+; CHECK: %[[LDB0:.+]] = load <16 x float>, ptr %{{.+}}, align 16
+; CHECK: %[[LDB1:.+]] = load <16 x float>, ptr %{{.+}}, align 16
+; CHECK: %[[LDC0:.+]] = load <16 x float>, ptr %{{.+}}, align 16
+; CHECK: %[[LDC1:.+]] = load <16 x float>, ptr %{{.+}}, align 16
 ; CHECK: %[[FMA0:.+]] = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %[[LDA0]], <16 x float> %[[LDB0]], <16 x float> %[[LDC0]])
 ; CHECK: %[[FMA1:.+]] = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %[[LDA1]], <16 x float> %[[LDB1]], <16 x float> %[[LDC1]])
 ; CHECK: store <16 x float> %[[FMA0]], ptr %{{.+}}, align 16
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd_phi.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd_phi.ll
index 12671866f8ab7..739ef93b8e334 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd_phi.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd_phi.ll
@@ -53,12 +53,12 @@ declare <4x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>)
 
 ; It checks that the fmuladd intrinsic of <4 x float> gets widened by a factor of 8,
 ; to produce a PAIR of <16 x float>s.
-; CHECK: %[[LDA0:.+]] = load <16 x float>, ptr %{{.+}}, align 4
-; CHECK: %[[LDA1:.+]] = load <16 x float>, ptr %{{.+}}, align 4
-; CHECK: %[[LDB0:.+]] = load <16 x float>, ptr %{{.+}}, align 4
-; CHECK: %[[LDB1:.+]] = load <16 x float>, ptr %{{.+}}, align 4
-; CHECK: %[[LDC0:.+]] = load <16 x float>, ptr %{{.+}}, align 4
-; CHECK: %[[LDC1:.+]] = load <16 x float>, ptr %{{.+}}, align 4
+; CHECK: %[[LDA0:.+]] = load <16 x float>, ptr %{{.+}}, align 16
+; CHECK: %[[LDA1:.+]] = load <16 x float>, ptr %{{.+}}, align 16
+; CHECK: %[[LDB0:.+]] = load <16 x float>, ptr %{{.+}}, align 16
+; CHECK: %[[LDB1:.+]] = load <16 x float>, ptr %{{.+}}, align 16
+; CHECK: %[[LDC0:.+]] = load <16 x float>, ptr %{{.+}}, align 16
+; CHECK: %[[LDC1:.+]] = load <16 x float>, ptr %{{.+}}, align 16
 
 ; CHECK: loop:
 ; CHECK: %[[ACC0:.+]] = phi <16 x float> [ %[[FMA0:.+]], %loop ], [ %[[LDA0]], %entry ]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/simplify-masked-memops.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/simplify-masked-memops.ll
new file mode 100644
index 0000000000000..ef84853e03bba
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/simplify-masked-memops.ll
@@ -0,0 +1,42 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k foo -vecz-passes=mask-memops -S < %s | FileCheck %s
+
+define void @foo(i16 %x, i32 %y, ptr addrspace(1) %p) {
+entry:
+  call void @__vecz_b_masked_store2_tu3ptrU3AS1b(i16 %x, ptr addrspace(1) %p, i1 true)
+  call void @__vecz_b_masked_store2_ju3ptrU3AS1b(i32 %y, ptr addrspace(1) %p, i1 true)
+  %f = call float @__vecz_b_masked_load2_fu3ptrU3AS1b(ptr addrspace(1) %p, i1 true)
+  %v4f = call <4 x float> @__vecz_b_masked_load2_Dv4_fu3ptrU3AS1Dv4_b(ptr addrspace(1) %p, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  ret void
+}
+
+; Check we correctly set the alignment on the optimized loads and stores. The
+; alignment must come from the builtin, not from the natural/preferred
+; alignment for that type.
+; CHECK: define void @__vecz_v4_foo(i16 %x, i32 %y, ptr addrspace(1) %p)
+; CHECK: entry:
+; CHECK:      store i16 %x, ptr addrspace(1) %p, align 2
+; CHECK-NEXT: store i32 %y, ptr addrspace(1) %p, align 2
+; CHECK-NEXT: %f = load float, ptr addrspace(1) %p, align 2
+; CHECK-NEXT: %v4f = load <4 x float>, ptr addrspace(1) %p, align 2
+; CHECK-NEXT: ret void
+
+declare void @__vecz_b_masked_store2_tu3ptrU3AS1b(i16, ptr addrspace(1), i1)
+declare void @__vecz_b_masked_store2_ju3ptrU3AS1b(i32, ptr addrspace(1), i1)
+declare float @__vecz_b_masked_load2_fu3ptrU3AS1b(ptr addrspace(1), i1)
+declare <4 x float> @__vecz_b_masked_load2_Dv4_fu3ptrU3AS1Dv4_b(ptr addrspace(1), <4 x i1>)

From 1887b3f3014054297335237809d68af93191dc6c Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Thu, 21 Sep 2023 17:10:08 +0100
Subject: [PATCH 039/182] [vecz] Fix invalid dangling AssumptionCaches

The StrideAnalysisResult had its own local copy of an AssumptionCache.

These were still live at the time we deleted instructions at the end of
packetization, and/or tore down the vectorization process upon
packetization failure.

This meant that some llvm Values could have callback handles in two
assumption caches, and when one was deleted the other would hold onto
garbage memory.

The fix is to use the same assumption cache in StrideAnalysisResult as
is obtained from the FunctionAnalysisManager.

An additional bug was that we were ostensibly invalidating all function
analyses on deleted vectorized functions, but were accidentally
inverting the set of preserved analyses such that all analyses were
being *preserved*, rather than invalidated.
---
 .../vecz/source/analysis/stride_analysis.cpp  | 10 +++--
 .../source/include/analysis/stride_analysis.h |  5 ++-
 .../vecz/source/offset_info.cpp               |  4 +-
 .../compiler_passes/vecz/source/pass.cpp      |  2 +-
 .../invalid_cached_assumption_regression.ll   | 44 +++++++++++++++++++
 .../lit/llvm/invalid_cached_vu_regression.ll  | 36 +++++++++++++++
 6 files changed, 92 insertions(+), 9 deletions(-)
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/invalid_cached_assumption_regression.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/invalid_cached_vu_regression.ll

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/stride_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/stride_analysis.cpp
index e4c61dd941ebc..126558e518f38 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/stride_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/stride_analysis.cpp
@@ -49,8 +49,9 @@ OffsetInfo &StrideAnalysisResult::analyze(Value *V) {
 }
 
 StrideAnalysisResult::StrideAnalysisResult(llvm::Function &f,
-                                           UniformValueResult &uvr)
-    : F(f), UVR(uvr), assumptions(F) {
+                                           UniformValueResult &uvr,
+                                           AssumptionCache &AC)
+    : F(f), UVR(uvr), AC(AC) {
   for (auto &BB : F) {
     for (auto &I : BB) {
       if (!UVR.isVarying(&I)) {
@@ -83,6 +84,7 @@ Value *StrideAnalysisResult::buildMemoryStride(IRBuilder<> &B, llvm::Value *Ptr,
 
 StrideAnalysisResult StrideAnalysis::run(llvm::Function &F,
                                          llvm::FunctionAnalysisManager &AM) {
-  UniformValueResult &UVR = AM.getResult<UniformValueAnalysis>(F);
-  return Result(F, UVR);
+  auto &AC = AM.getResult<AssumptionAnalysis>(F);
+  auto &UVR = AM.getResult<UniformValueAnalysis>(F);
+  return Result(F, UVR, AC);
 }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/stride_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/stride_analysis.h
index f013fd1259c21..45561ea9285ec 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/stride_analysis.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/stride_analysis.h
@@ -46,9 +46,10 @@ class StrideAnalysisResult {
   /// @brief The Uniform Value Result to use during analysis
   UniformValueResult &UVR;
   /// @brief AssumptionCache for computing live bits of uniform values
-  llvm::AssumptionCache assumptions;
+  llvm::AssumptionCache &AC;
 
-  StrideAnalysisResult(llvm::Function &f, UniformValueResult &uvr);
+  StrideAnalysisResult(llvm::Function &f, UniformValueResult &uvr,
+                       llvm::AssumptionCache &AC);
 
   /// @brief generate stride `ConstantInt`s or `Instruction`s for all analyzed
   /// values.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp
index 1dca4a6f72cd9..a73ed219d7941 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp
@@ -176,8 +176,8 @@ OffsetInfo &OffsetInfo::analyze(Value *Offset, StrideAnalysisResult &SAR) {
 
   // If we have a uniform value here we don't need to analyse any further.
   if (!SAR.UVR.isVarying(Ins)) {
-    auto const &KB = computeKnownBits(Ins, SAR.F.getParent()->getDataLayout(),
-                                      0, &SAR.assumptions);
+    auto const &KB =
+        computeKnownBits(Ins, SAR.F.getParent()->getDataLayout(), 0, &SAR.AC);
     auto const bitWidth = OffsetTy->getIntegerBitWidth();
 
     // We are interested in the bits that are not known to be zero.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp
index a9265876a6c0a..b7d316b6d301a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp
@@ -166,7 +166,7 @@ PreservedAnalyses RunVeczPass::run(Module &M, ModuleAnalysisManager &MAM) {
       // If we fail to vectorize a function, we still cloned and then
       // deleted it which affects internal addresses. The module has changed
       // and we can't cache any analyses.
-      Mach.getFAM().invalidate(*VectorizedFn, llvm::PreservedAnalyses::all());
+      Mach.getFAM().invalidate(*VectorizedFn, llvm::PreservedAnalyses::none());
       // Remove the partially-vectorized function if something went wrong.
       Ctx.clearActiveVU(VectorizedFn);
       VU->setVectorizedFunction(nullptr);
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/invalid_cached_assumption_regression.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/invalid_cached_assumption_regression.ll
new file mode 100644
index 0000000000000..5ccbd9e0f6a25
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/invalid_cached_assumption_regression.ll
@@ -0,0 +1,44 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; Just check that we correctly clean up the assumption cache when vectorizing
+; this function.
+; RUN: veczc -k foo -w 2 -S < %s
+; RUN: not veczc -k foo -w 2 -vecz-scalable -S < %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @foo(ptr addrspace(1) nocapture readonly %_arg_v_acc) #0 {
+entry:
+  %v4 = tail call i64 @__mux_get_global_id(i32 0) #2
+  %v5 = tail call i64 @__mux_get_global_offset(i32 0) #2
+  %v6 = sub i64 %v4, %v5
+  %v7 = icmp ult i64 %v6, 2147483648
+  tail call void @llvm.assume(i1 %v7)
+  %arrayidx.i.i = getelementptr inbounds i32, ptr addrspace(1) %_arg_v_acc, i64 %v6
+  %v8 = load i32, ptr addrspace(1) %arrayidx.i.i, align 4
+  ret void
+}
+
+declare void @llvm.assume(i1 noundef) #1
+
+declare i64 @__mux_get_global_id(i32) #2
+declare i64 @__mux_get_global_offset(i32) #2
+
+attributes #0 = { convergent nounwind "mux-kernel"="entry-point" "mux-orig-fn"="foo" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn inaccessiblememonly }
+attributes #2 = { alwaysinline norecurse nounwind readonly }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/invalid_cached_vu_regression.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/invalid_cached_vu_regression.ll
new file mode 100644
index 0000000000000..9af73ae1d1d82
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/invalid_cached_vu_regression.ll
@@ -0,0 +1,36 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: not veczc -k noduplicate:4,8 -S < %s 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @noduplicate(i32 addrspace(1)* %in1, i32 addrspace(1)* %out) {
+entry:
+  %tid = call i64 @__mux_get_global_id(i32 0) #3
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid
+  %i1 = load i32, i32 addrspace(1)* %arrayidx, align 16
+  %dec = call i32 @llvm.loop.decrement.reg.i32(i32 %i1, i32 4)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid
+  store i32 %dec, i32 addrspace(1)* %arrayidx2, align 16
+  ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+declare i32 @llvm.loop.decrement.reg.i32(i32, i32)
+
+;CHECK: Failed to vectorize function 'noduplicate'

From 753793ed0e37652b25410cbe1f009ff8adf3b611 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Tue, 26 Sep 2023 15:45:25 +0100
Subject: [PATCH 040/182] [compiler] Add work-group scan support in
 vecz/work-item-loops

This completes the set of work-group collective functions we can support
via the 'alternative' pathway through the vectorizer and the
work-item-loops pass. They join broadcasts and reductions, which were
added a while back.

The scans are implemented using a linear work-item loop, accumulating
with each iteration of the 'X' loop. The vectorizer vectorizes them like
the sub-group scans, so the accumulator used is just the
current/previous value of the scan operation so far. The vectorized
work-group merges this with the partial scan over the vector group.

The 'scans-only' parameter to the replace-wgc pass has been removed. The
pass is either run or it isn't run, depending on what the target wants
to do to support the work-group collective operations. Only the RefSi G1
WIPT example uses the old pass, though it should still be tested in LIT.

In unofficial local testing, these scans appear to be around 10% faster
to complete the associated SYCL-CTS tests. This requires more
investigation, however.
---
 .../vecz/source/transform/packetizer.cpp      |  55 +++--
 .../vecz/test/lit/llvm/workgroup_scans.ll     | 204 ++++++++++++++++++
 2 files changed, 236 insertions(+), 23 deletions(-)
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/workgroup_scans.ll

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
index 64d945887074f..3ac35bdacc1bb 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -221,14 +221,14 @@ class Packetizer::Impl : public Packetizer {
   ///
   /// @return Packetized values.
   ValuePacket packetizeCall(CallInst *CI);
-  /// @brief Packetize a subgroup scan.
+  /// @brief Packetize a subgroup/workgroup scan.
   ///
   /// @param[in] CI CallInst to packetize.
-  /// @param[in] Scan type of subgroup scan to packetized.
+  /// @param[in] Scan type of scan to packetize.
   ///
   /// @return Packetized values.
-  ValuePacket packetizeSubgroupScan(CallInst *CI,
-                                    compiler::utils::GroupCollective Scan);
+  ValuePacket packetizeGroupScan(CallInst *CI,
+                                 compiler::utils::GroupCollective Scan);
   /// @brief Perform post-packetization tasks for the given scalar value.
   ///
   /// @param[in] Scalar Scalar value to assign a vectorized value.
@@ -1520,10 +1520,10 @@ ValuePacket Packetizer::Impl::packetizeCall(CallInst *CI) {
 
   auto const Builtin = Ctx.builtins().analyzeBuiltin(*Callee);
 
-  // Handle subgroup scans, which defer to internal builtins.
+  // Handle scans, which defer to internal builtins.
   if (auto Info = Ctx.builtins().isMuxGroupCollective(Builtin.ID)) {
-    if (Info->isSubGroupScope() && Info->isScan()) {
-      return packetizeSubgroupScan(CI, *Info);
+    if (Info->isScan()) {
+      return packetizeGroupScan(CI, *Info);
     }
   }
 
@@ -1626,7 +1626,7 @@ ValuePacket Packetizer::Impl::packetizeCall(CallInst *CI) {
   return results;
 }
 
-ValuePacket Packetizer::Impl::packetizeSubgroupScan(
+ValuePacket Packetizer::Impl::packetizeGroupScan(
     CallInst *CI, compiler::utils::GroupCollective Scan) {
   ValuePacket results;
 
@@ -1637,8 +1637,11 @@ ValuePacket Packetizer::Impl::packetizeSubgroupScan(
 
   compiler::utils::NameMangler mangler(&CI->getContext());
 
+  unsigned ArgOffset = Scan.isWorkGroupScope() ? 1 : 0;
+
   // The operands and types for the internal builtin
-  SmallVector<Value *, 2> Ops = {packetize(CI->getArgOperand(0)).getAsValue()};
+  SmallVector<Value *, 2> Ops = {
+      packetize(CI->getArgOperand(ArgOffset)).getAsValue()};
   SmallVector<Type *, 2> Tys = {getWideType(CI->getType(), SimdWidth)};
 
   bool isInclusive =
@@ -1723,22 +1726,23 @@ ValuePacket Packetizer::Impl::packetizeSubgroupScan(
     }
   }
 
-  auto *SubgroupScanFnTy = FunctionType::get(Tys[0], Tys, /*isVarArg*/ false);
-  auto *const SubgroupFn =
-      Ctx.getOrCreateInternalBuiltin(NameSV, SubgroupScanFnTy);
+  auto *VecgroupScanFnTy = FunctionType::get(Tys[0], Tys, /*isVarArg*/ false);
+  auto *const VecgroupFn =
+      Ctx.getOrCreateInternalBuiltin(NameSV, VecgroupScanFnTy);
 
   IRBuilder<> B(CI);
 
-  auto *VectorScan = B.CreateCall(SubgroupFn, Ops);
+  auto *VectorScan = B.CreateCall(VecgroupFn, Ops);
 
-  // We've currently got a scan over each vector group, but the full sub-group
-  // is further multiplied by the mux sub-group size. For example, we may have
-  // a vectorization factor sized group of 4 and a mux sub-group size of 2.
-  // Together the full sub-group size to the user is 4*2 = 8.
+  // We've currently got a scan over each vector group, but the full group scan
+  // is further multiplied by the group size (either the work-group size or the
+  // 'mux' hardware sub-group size). For example, we may have a vectorization
+  // factor sized group of 4 and a group size of 2. Together the full group
+  // size to the user is 4*2 = 8.
   // In terms of invocations, we've essentially currently got:
   //   <a0, a0+a1, a0+a1+a2, a0+a1+a2+a3> (invocation 0)
   //   <a4, a4+a5, a4+a5+a6, a4+a5+a6+a7> (invocation 1)
-  // These two iterations need to be further scanned over the mux sub-group
+  // These two iterations need to be further scanned over the group
   // size. We do this by adding the identity to the first invocation, the
   // result of the scan over the first invocation to the second, etc. This is
   // an exclusive scan over the *reduction* of the input vector:
@@ -1747,7 +1751,7 @@ ValuePacket Packetizer::Impl::packetizeSubgroupScan(
   // -> reduction
   //   (a0+a1+a2+a3) (invocation 0)
   //   (a4+a5+a6+a7) (invocation 1)
-  // -> exclusive mux sub-group scan
+  // -> exclusive group scan
   //               I (invocation 0)
   //   (a0+a1+a2+a3) (invocation 1)
   // -> adding that to the result of the vector scan:
@@ -1755,8 +1759,8 @@ ValuePacket Packetizer::Impl::packetizeSubgroupScan(
   //   <(a0+a1+a2+a3)+a4, (a0+a1+a2+a3)+a4+a5,             (invocation 1)
   //    (a0+a1+a2+a3)+a4+a5+a6, (a0+a1+a2+a3)+a4+a5+a6+a7>
   // When viewed as a full 8-element vector, this is our final scan.
-  // Thus we essentially keep the original mux sub-group scan, but change it to
-  // be an exclusive one.
+  // Thus we essentially keep the original group scan, but change it to be an
+  // exclusive one.
   auto *Reduction = Ops.front();
   if (VL) {
     Reduction = sanitizeVPReductionInput(B, Reduction, VL, Scan.Recurrence);
@@ -1766,7 +1770,7 @@ ValuePacket Packetizer::Impl::packetizeSubgroupScan(
   }
   Reduction = createSimpleTargetReduction(B, &TTI, Reduction, Scan.Recurrence);
 
-  // Now we defer to an *exclusive* scan over the mux sub-group.
+  // Now we defer to an *exclusive* scan over the group.
   auto ExclScan = Scan;
   ExclScan.Op = compiler::utils::GroupCollective::OpKind::ScanExclusive;
 
@@ -1777,7 +1781,12 @@ ValuePacket Packetizer::Impl::packetizeSubgroupScan(
       ExclScanID, *F.getParent(), {CI->getType()});
   assert(ExclScanFn);
 
-  auto *const ExclScanCI = B.CreateCall(ExclScanFn, {Reduction});
+  SmallVector<Value *, 2> ExclScanOps = {Reduction};
+  if (Scan.isWorkGroupScope()) {
+    // Forward on the current barrier ID.
+    ExclScanOps.insert(ExclScanOps.begin(), CI->getArgOperand(0));
+  }
+  auto *const ExclScanCI = B.CreateCall(ExclScanFn, ExclScanOps);
 
   Value *const Splat = B.CreateVectorSplat(SimdWidth, ExclScanCI);
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/workgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/workgroup_scans.ll
new file mode 100644
index 0000000000000..0fbba9c59df96
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/workgroup_scans.ll
@@ -0,0 +1,204 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -w 4 -S -vecz-passes=packetizer < %s | FileCheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare i64 @__mux_get_global_id(i32)
+
+declare i32 @__mux_work_group_scan_inclusive_add_i32(i32, i32)
+declare i64 @__mux_work_group_scan_inclusive_add_i64(i32, i64)
+declare float @__mux_work_group_scan_inclusive_fadd_f32(i32, float)
+
+declare i32 @__mux_work_group_scan_inclusive_smin_i32(i32, i32)
+declare i32 @__mux_work_group_scan_inclusive_umin_i32(i32, i32)
+declare i32 @__mux_work_group_scan_inclusive_smax_i32(i32, i32)
+declare i32 @__mux_work_group_scan_inclusive_umax_i32(i32, i32)
+declare float @__mux_work_group_scan_inclusive_fmin_f32(i32, float)
+declare float @__mux_work_group_scan_inclusive_fmax_f32(i32, float)
+
+define spir_kernel void @reduce_scan_incl_add_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 0)
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %call1 = tail call i32 @__mux_work_group_scan_inclusive_add_i32(i32 0, i32 %0)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_add_i32(
+; CHECK: [[SCAN:%.*]] = call <4 x i32> @__vecz_b_sub_group_scan_inclusive_add_Dv4_j(<4 x i32> [[INPUT:%.*]])
+; CHECK: [[SUM:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[INPUT]])
+; CHECK: [[EXCL_SCAN:%.*]] = call i32 @__mux_work_group_scan_exclusive_add_i32(i32 0, i32 [[SUM]])
+; CHECK: [[HEAD:%.*]] = insertelement <4 x i32> poison, i32 [[EXCL_SCAN]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[HEAD]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK: [[FINAL:%.*]] = add <4 x i32> [[SCAN]], [[SPLAT]]
+; CHECK: store <4 x i32> [[FINAL]],
+}
+
+define spir_kernel void @reduce_scan_incl_add_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %out) {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 0)
+  %arrayidx = getelementptr inbounds i64, i64 addrspace(1)* %in, i64 %call
+  %0 = load i64, i64 addrspace(1)* %arrayidx, align 4
+  %call1 = tail call i64 @__mux_work_group_scan_inclusive_add_i64(i32 0, i64 %0)
+  %arrayidx2 = getelementptr inbounds i64, i64 addrspace(1)* %out, i64 %call
+  store i64 %call1, i64 addrspace(1)* %arrayidx2, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_add_i64(
+; CHECK: [[SCAN:%.*]] = call <4 x i64> @__vecz_b_sub_group_scan_inclusive_add_Dv4_m(<4 x i64> [[INPUT:%.*]])
+; CHECK: [[SUM:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[INPUT]])
+; CHECK: [[EXCL_SCAN:%.*]] = call i64 @__mux_work_group_scan_exclusive_add_i64(i32 0, i64 [[SUM]])
+; CHECK: [[HEAD:%.*]] = insertelement <4 x i64> poison, i64 [[EXCL_SCAN]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i64> [[HEAD]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK: [[FINAL:%.*]] = add <4 x i64> [[SCAN]], [[SPLAT]]
+; CHECK: store <4 x i64> [[FINAL]],
+}
+
+define spir_kernel void @reduce_scan_incl_add_f32(float addrspace(1)* %in, float addrspace(1)* %out) {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 0)
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
+  %0 = load float, float addrspace(1)* %arrayidx, align 4
+  %call1 = tail call float @__mux_work_group_scan_inclusive_fadd_f32(i32 0, float %0)
+  %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call
+  store float %call1, float addrspace(1)* %arrayidx2, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_add_f32(
+; CHECK: [[SCAN:%.*]] = call <4 x float> @__vecz_b_sub_group_scan_inclusive_add_Dv4_f(<4 x float> [[INPUT:%.*]])
+; CHECK: [[SUM:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.0{{.*}}, <4 x float> [[INPUT]])
+; CHECK: [[EXCL_SCAN:%.*]] = call float @__mux_work_group_scan_exclusive_fadd_f32(i32 0, float [[SUM]])
+; CHECK: [[HEAD:%.*]] = insertelement <4 x float> poison, float [[EXCL_SCAN]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <4 x float> [[HEAD]], <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK: [[FINAL:%.*]] = fadd <4 x float> [[SCAN]], [[SPLAT]]
+; CHECK: store <4 x float> [[FINAL]],
+}
+
+define spir_kernel void @reduce_scan_incl_smin_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 0)
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %call1 = tail call i32 @__mux_work_group_scan_inclusive_smin_i32(i32 0, i32 %0)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_smin_i32(
+; CHECK: [[SCAN:%.*]] = call <4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_Dv4_i(<4 x i32> [[INPUT:%.*]])
+; CHECK: [[SUM:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[INPUT]])
+; CHECK: [[EXCL_SCAN:%.*]] = call i32 @__mux_work_group_scan_exclusive_smin_i32(i32 0, i32 [[SUM]])
+; CHECK: [[HEAD:%.*]] = insertelement <4 x i32> poison, i32 [[EXCL_SCAN]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[HEAD]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK: [[FINAL:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[SCAN]], <4 x i32> [[SPLAT]])
+; CHECK: store <4 x i32> [[FINAL]],
+}
+
+define spir_kernel void @reduce_scan_incl_umin_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 0)
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %call1 = tail call i32 @__mux_work_group_scan_inclusive_umin_i32(i32 0, i32 %0)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_umin_i32(
+; CHECK: [[SCAN:%.*]] = call <4 x i32> @__vecz_b_sub_group_scan_inclusive_umin_Dv4_j(<4 x i32> [[INPUT:%.*]])
+; CHECK: [[SUM:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[INPUT]])
+; CHECK: [[EXCL_SCAN:%.*]] = call i32 @__mux_work_group_scan_exclusive_umin_i32(i32 0, i32 [[SUM]])
+; CHECK: [[HEAD:%.*]] = insertelement <4 x i32> poison, i32 [[EXCL_SCAN]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[HEAD]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK: [[FINAL:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[SCAN]], <4 x i32> [[SPLAT]])
+; CHECK: store <4 x i32> [[FINAL]],
+}
+
+define spir_kernel void @reduce_scan_incl_smax_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 0)
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %call1 = tail call i32 @__mux_work_group_scan_inclusive_smax_i32(i32 0, i32 %0)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_smax_i32(
+; CHECK: [[SCAN:%.*]] = call <4 x i32> @__vecz_b_sub_group_scan_inclusive_smax_Dv4_i(<4 x i32> [[INPUT:%.*]])
+; CHECK: [[SUM:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[INPUT]])
+; CHECK: [[EXCL_SCAN:%.*]] = call i32 @__mux_work_group_scan_exclusive_smax_i32(i32 0, i32 [[SUM]])
+; CHECK: [[HEAD:%.*]] = insertelement <4 x i32> poison, i32 [[EXCL_SCAN]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[HEAD]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK: [[FINAL:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[SCAN]], <4 x i32> [[SPLAT]])
+; CHECK: store <4 x i32> [[FINAL]],
+}
+
+define spir_kernel void @reduce_scan_incl_umax_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 0)
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %call1 = tail call i32 @__mux_work_group_scan_inclusive_umax_i32(i32 0, i32 %0)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_umax_i32(
+; CHECK: [[SCAN:%.*]] = call <4 x i32> @__vecz_b_sub_group_scan_inclusive_umax_Dv4_j(<4 x i32> [[INPUT:%.*]])
+; CHECK: [[SUM:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[INPUT]])
+; CHECK: [[EXCL_SCAN:%.*]] = call i32 @__mux_work_group_scan_exclusive_umax_i32(i32 0, i32 [[SUM]])
+; CHECK: [[HEAD:%.*]] = insertelement <4 x i32> poison, i32 [[EXCL_SCAN]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[HEAD]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK: [[FINAL:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[SCAN]], <4 x i32> [[SPLAT]])
+; CHECK: store <4 x i32> [[FINAL]],
+}
+
+define spir_kernel void @reduce_scan_incl_fmin_f32(float addrspace(1)* %in, float addrspace(1)* %out) {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 0)
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
+  %0 = load float, float addrspace(1)* %arrayidx, align 4
+  %call1 = tail call float @__mux_work_group_scan_inclusive_fmin_f32(i32 0, float %0)
+  %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call
+  store float %call1, float addrspace(1)* %arrayidx2, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_fmin_f32(
+; CHECK: [[SCAN:%.*]] = call <4 x float> @__vecz_b_sub_group_scan_inclusive_min_Dv4_f(<4 x float> [[INPUT:%.*]])
+; CHECK: [[SUM:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[INPUT]])
+; CHECK: [[EXCL_SCAN:%.*]] = call float @__mux_work_group_scan_exclusive_fmin_f32(i32 0, float [[SUM]])
+; CHECK: [[HEAD:%.*]] = insertelement <4 x float> poison, float [[EXCL_SCAN]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <4 x float> [[HEAD]], <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK: [[FINAL:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[SCAN]], <4 x float> [[SPLAT]])
+; CHECK: store <4 x float> [[FINAL]],
+}
+
+define spir_kernel void @reduce_scan_incl_fmax_f32(float addrspace(1)* %in, float addrspace(1)* %out) {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 0)
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
+  %0 = load float, float addrspace(1)* %arrayidx, align 4
+  %call1 = tail call float @__mux_work_group_scan_inclusive_fmax_f32(i32 0, float %0)
+  %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call
+  store float %call1, float addrspace(1)* %arrayidx2, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_fmax_f32(
+; CHECK: [[SCAN:%.*]] = call <4 x float> @__vecz_b_sub_group_scan_inclusive_max_Dv4_f(<4 x float> [[INPUT:%.*]])
+; CHECK: [[SUM:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[INPUT]])
+; CHECK: [[EXCL_SCAN:%.*]] = call float @__mux_work_group_scan_exclusive_fmax_f32(i32 0, float [[SUM]])
+; CHECK: [[HEAD:%.*]] = insertelement <4 x float> poison, float [[EXCL_SCAN]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <4 x float> [[HEAD]], <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK: [[FINAL:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[SCAN]], <4 x float> [[SPLAT]])
+; CHECK: store <4 x float> [[FINAL]],
+}

From a21a7afa80583e7a2320900e678e1cda51b07f94 Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross@codeplay.com>
Date: Thu, 12 Oct 2023 18:40:37 +0100
Subject: [PATCH 041/182] [vecz] Use correct alignment for memcpy source

The inlining for memcpy was incorrectly using the destination
parameter's alignment rather than the source parameter, resulting
in miscompiles.

As well as fixing this issue, a test has been added.
---
 .../transform/builtin_inlining_pass.cpp       |  2 +-
 .../test/lit/llvm/builtin_inlining_memcpy.ll  | 37 +++++++++++++++++++
 2 files changed, 38 insertions(+), 1 deletion(-)
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_memcpy.ll

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/builtin_inlining_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/builtin_inlining_pass.cpp
index 308cc64f677bd..26d243724d842 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/builtin_inlining_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/builtin_inlining_pass.cpp
@@ -245,7 +245,7 @@ static Value *emitBuiltinMemCpy(Function *F, IRBuilder<> &B,
         byte == 0 ? DestAlignment : std::min(Align(8u), DestAlignment);
     MC->setAlignment(StoreAlign);
     Align LoadAlign =
-        byte == 0 ? DestAlignment : std::min(Align(8u), SourceAlignment);
+        byte == 0 ? SourceAlignment : std::min(Align(8u), SourceAlignment);
     LoadValue->setAlignment(LoadAlign);
   }
   // ...and then we fill in the remaining with 8bit stores.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_memcpy.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_memcpy.ll
new file mode 100644
index 0000000000000..08a65fc010dd4
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_memcpy.ll
@@ -0,0 +1,37 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k memcpy_align -vecz-passes=builtin-inlining -vecz-simd-width=4 -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @memcpy_align(ptr align(16) %out, ptr align(8) %in) {
+entry:
+; CHECK:  %[[A:.*]] = getelementptr inbounds i8, ptr %in, i64 0
+; CHECK:  %[[B:.*]] = getelementptr inbounds i8, ptr %out, i64 0
+; CHECK:  %[[C:.*]] = load i64, ptr %[[A]], align 8
+; CHECK:  store i64 %[[C]], ptr %[[B]], align 16
+
+; CHECK:  %[[D:.*]] = getelementptr inbounds i8, ptr %in, i64 8
+; CHECK:  %[[E:.*]] = getelementptr inbounds i8, ptr %out, i64 8
+; CHECK:  %[[F:.*]] = load i64, ptr %[[D]], align 8
+; CHECK:  store i64 %[[F]], ptr %[[E]], align 8
+  call void @llvm.memcpy.p0.p0.i32(ptr noundef align(16) %out, ptr noundef align(8) %in, i32 16, i1 false)
+  ret void
+}
+
+declare void @llvm.memcpy.p0.p0.i32(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i32, i1 immarg)

From 69289946678e46ba71673054c4962092d8a0ec75 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Mon, 16 Oct 2023 12:50:11 +0100
Subject: [PATCH 042/182] [vecz] Don't mask work-group collective operations

These builtins function as barriers, so masking them produces
undesirable results when laying out barrier regions. For this reason, we
don't mask 'regular' barriers.

These builtins are, in any case, uniform/convergent, so either all
work-items or no work-items should reach that point of execution.

Also ensure the spir-v reduction wrapper is marked as convergent, for
completeness' sake.
---
 .../control_flow_conversion_pass.cpp          | 15 ++++++-
 .../test/lit/llvm/masked_group_collective.ll  | 45 +++++++++++++++++++
 2 files changed, 59 insertions(+), 1 deletion(-)
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_group_collective.ll

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
index b49ccc842760c..9287e51311f68 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
@@ -1292,7 +1292,8 @@ bool ControlFlowConversionState::Impl::applyMaskToCall(CallInst *CI,
   }
 
   // Builtins without side effects do not need to be masked.
-  auto const props = Ctx.builtins().analyzeBuiltin(*callee).properties;
+  auto const builtin = Ctx.builtins().analyzeBuiltin(*callee);
+  auto const props = builtin.properties;
   if (props & compiler::utils::eBuiltinPropertyNoSideEffects) {
     LLVM_DEBUG(dbgs() << "vecz-cf: Called function is an pure builtin\n");
     return true;
@@ -1313,6 +1314,18 @@ bool ControlFlowConversionState::Impl::applyMaskToCall(CallInst *CI,
         dbgs() << "vecz-cf: Called function does not have any side-effects\n");
     return true;
   }
+  // We don't want to mask work-group collective builtins, because they are
+  // barriers (see above). This should actually be a rare situation, as these
+  // builtins are required to be uniform/convergent and so either all
+  // work-items or no work-items should hit them. Most of the time, this
+  // situation relies on the vectorizer failing to trace the branch flow and
+  // failing to realize the conditions are in fact uniform.
+  if (auto info = Ctx.builtins().isMuxGroupCollective(builtin.ID);
+      info && info->isWorkGroupScope()) {
+    LLVM_DEBUG(
+        dbgs() << "vecz-cf: Called function is a work-group collective\n");
+    return true;
+  }
 
   // Create the new function and replace the old one with it
   CallInst *newCI = emitMaskedVersion(CI, mask);
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_group_collective.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_group_collective.ll
new file mode 100644
index 0000000000000..74c4ada6bb1c2
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_group_collective.ll
@@ -0,0 +1,45 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -vecz-passes="cfg-convert" -vecz-simd-width=4 -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare i64 @__mux_get_local_id()
+declare i32 @__mux_work_group_scan_inclusive_smax_i32(i32, i32)
+
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_foo()
+; CHECK-NOT: @__vecz_b_masked___mux_work_group_scan_inclusive_smax_i32
+define spir_kernel void @foo() {
+entry:
+  %0 = call i64 @__mux_get_local_id()
+  br i1 false, label %for.body.i11, label %if.end.i105.i
+
+for.body.i11:
+  %1 = icmp slt i64 %0, 0
+  br i1 %1, label %if.end.i13, label %if.end.i13
+
+if.end.i13:
+  br i1 false, label %exit, label %if.end.i105.i
+
+if.end.i105.i:
+  %2 = call i32 @__mux_work_group_scan_inclusive_smax_i32(i32 0, i32 0)
+  br label %exit
+
+exit:
+  ret void
+}

From 0d38f0b321e131d27db6bc0385432ee4f7278a43 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Thu, 26 Oct 2023 12:17:35 +0100
Subject: [PATCH 043/182] [vecz] Packetize sub-group shuffles with uniform
 indices

This extends fixed-width vectorization capabilities to
`__mux_sub_group_shuffle` builtins, but only those with uniform indices
(where the shuffle index is the same for all invocations in the
sub-group). This accounts for the majority of those tested by the
SYCL-CTS. Support for varying indices will come down the line, once the
other shuffles are covered under similar conditions.

The existing sub-group LIT tests have been split by operation, as they
are expected to grow significantly to cover all of the different
conditions we can vectorize under.
---
 .../vecz/source/transform/packetizer.cpp      | 159 +++++++++++++--
 .../vecz/test/lit/llvm/subgroup_shuffle.ll    | 190 ++++++++++++++++++
 .../test/lit/llvm/subgroup_shuffle_down.ll    |  36 ++++
 .../vecz/test/lit/llvm/subgroup_shuffle_up.ll |  36 ++++
 .../test/lit/llvm/subgroup_shuffle_xor.ll     |  36 ++++
 .../vecz/test/lit/llvm/subgroup_shuffles.ll   |  79 --------
 6 files changed, 445 insertions(+), 91 deletions(-)
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_down.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_up.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_xor.ll
 delete mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffles.ll

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
index 3ac35bdacc1bb..26c1a56f23ed3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -17,6 +17,7 @@
 #include "transform/packetizer.h"
 
 #include <compiler/utils/builtin_info.h>
+#include <compiler/utils/group_collective_helpers.h>
 #include <compiler/utils/mangling.h>
 #include <llvm/ADT/DepthFirstIterator.h>
 #include <llvm/ADT/SmallPtrSet.h>
@@ -24,6 +25,7 @@
 #include <llvm/ADT/Twine.h>
 #include <llvm/Analysis/LoopInfo.h>
 #include <llvm/Analysis/VectorUtils.h>
+#include <llvm/IR/Constants.h>
 #include <llvm/IR/DIBuilder.h>
 #include <llvm/IR/DebugInfoMetadata.h>
 #include <llvm/IR/IRBuilder.h>
@@ -202,13 +204,24 @@ class Packetizer::Impl : public Packetizer {
   ///
   /// @return Packetized instruction.
   Value *packetizeGroupBroadcast(Instruction *I);
-  /// @brief Returns true if the instruction is a subgroup shuffle.
+  /// @brief Determines whether the instruction is any subgroup shuffle.
   ///
   /// @param[in] I Instruction to query.
   ///
-  /// @return True if the instruction is a call to a mux subgroup shuffle
+  /// @return The group collective data if the instruction is a call to any of
+  /// the mux subgroup shuffle builtins; std::nullopt otherwise.
+  std::optional<compiler::utils::GroupCollective> isSubgroupShuffleLike(
+      Instruction *I);
+  /// @brief Packetize a sub-group shuffle builtin
+  ///
+  /// Note - not any shuffle-like operation, but specifically the 'shuffle'
   /// builtin.
-  bool isSubgroupShuffle(Instruction *I);
+  ///
+  /// @param[in] Ins Instruction to packetize.
+  ///
+  /// @return Packetized instructions.
+  Value *packetizeSubgroupShuffle(Instruction *Ins);
+
   /// @brief Packetize PHI node.
   ///
   /// @param[in] Phi PHI Node to packetize.
@@ -861,12 +874,6 @@ Packetizer::Result Packetizer::Impl::packetize(Value *V) {
 
   auto *const Ins = cast<Instruction>(V);
 
-  // FIXME: Add support for vectorizing sub-group shuffles
-  if (isSubgroupShuffle(Ins)) {
-    emitVeczRemarkMissed(&F, Ins, "Could not packetize sub-group shuffle");
-    return Packetizer::Result(*this);
-  }
-
   if (auto *const Branch = dyn_cast<BranchInst>(Ins)) {
     if (Branch->isConditional()) {
       // varying reductions need to be packetized
@@ -918,6 +925,19 @@ Packetizer::Result Packetizer::Impl::packetize(Value *V) {
     return broadcast(brdcast);
   }
 
+  if (auto shuffle = isSubgroupShuffleLike(Ins)) {
+    if (shuffle->Op == compiler::utils::GroupCollective::OpKind::Shuffle) {
+      if (auto *s = packetizeSubgroupShuffle(Ins)) {
+        return broadcast(s);
+      }
+    }
+    // We can't packetize all sub-group shuffle-like operations, but we also
+    // can't vectorize or instantiate them - so provide a diagnostic saying as
+    // much.
+    emitVeczRemarkMissed(&F, Ins, "Could not packetize sub-group shuffle");
+    return Packetizer::Result(*this);
+  }
+
   // Check if we should broadcast the instruction.
   // Broadcast uniform instructions, unless we want to packetize uniform
   // instructions as well. We can assume that isMaskVarying is false at this
@@ -1265,10 +1285,11 @@ Value *Packetizer::Impl::packetizeGroupBroadcast(Instruction *I) {
   return CI;
 }
 
-bool Packetizer::Impl::isSubgroupShuffle(Instruction *I) {
+std::optional<compiler::utils::GroupCollective>
+Packetizer::Impl::isSubgroupShuffleLike(Instruction *I) {
   auto *const CI = dyn_cast<CallInst>(I);
   if (!CI || !CI->getCalledFunction()) {
-    return false;
+    return std::nullopt;
   }
   compiler::utils::BuiltinInfo &BI = Ctx.builtins();
   Function *callee = CI->getCalledFunction();
@@ -1276,7 +1297,121 @@ bool Packetizer::Impl::isSubgroupShuffle(Instruction *I) {
   auto const Builtin = BI.analyzeBuiltin(*callee);
   auto const Info = BI.isMuxGroupCollective(Builtin.ID);
 
-  return Info && Info->isSubGroupScope() && Info->isShuffleLike();
+  if (Info && Info->isSubGroupScope() && Info->isShuffleLike()) {
+    return Info;
+  }
+
+  return std::nullopt;
+}
+
+Value *Packetizer::Impl::packetizeSubgroupShuffle(Instruction *I) {
+  auto *const CI = cast<CallInst>(I);
+
+  // We don't support scalable vectorization of sub-group shuffles.
+  if (SimdWidth.isScalable()) {
+    return nullptr;
+  }
+
+  auto *const Data = CI->getArgOperand(0);
+  auto *const Idx = CI->getArgOperand(1);
+
+  auto PackData = packetize(Data);
+  if (!PackData) {
+    return nullptr;
+  }
+
+  // If the data operand happened to be a broadcast (uniform) value already,
+  // the index is irrelevant and we can use the data directly.
+  if (PackData.info->numInstances == 0) {
+    IC.deleteInstructionLater(CI);
+    CI->replaceAllUsesWith(Data);
+    return Data;
+  }
+
+  // We can't packetize varying shuffle indices yet.
+  if (UVR.isVarying(Idx)) {
+    return nullptr;
+  }
+
+  IRBuilder<> B(CI);
+
+  // We need to sanitize the input index so that it stays within the range of
+  // one vectorized group.
+  unsigned const VF = SimdWidth.getFixedValue();
+  auto *const VecIdxFactor = ConstantInt::get(Idx->getType(), VF);
+  // This index is the element of the vector-group which holds the desired
+  // data, per mux sub-group.
+  // <x, y>, <z, w>: idx 1 -> vector element 1, idx 2 -> vector element 0.
+  auto *const VecIdx = B.CreateURem(Idx, VecIdxFactor);
+  // This index is the mux sub-group in which the desired data resides.
+  // <x, y>, <z, w>: idx 1 -> mux sub-group 0, idx 3 -> mux sub-group 1.
+  auto *const MuxIdx = B.CreateUDiv(Idx, VecIdxFactor);
+
+  Value *VecData = PackData.getAsValue();
+
+  // Note: in each illustrative example, imagine two invocations across a
+  // single mux sub-group, each being vectorized by 4; in other words, 8
+  // 'original' invocations to a sub-group, running in two vectorized
+  // invocations.
+  if (auto *const DataVecTy = dyn_cast<VectorType>(Data->getType());
+      !DataVecTy) {
+    // The vectorized shuffle is producing a scalar (assuming uniform indices,
+    // see above). Imagine i=6 (6 % 4 = 2 and 6 / 4 = 1):
+    //       |  shuffle(X, 6)  |  shuffle(A, 6)  |
+    // VF=4  |-----------------|-----------------|
+    //       | s(<X,Y,Z,W>, 2) | s(<A,B,C,D>, 2) |
+    // elt 2 |        Z        |        C        |
+    // shuff | shuffle(Z, 1)   | shuffle(C, 1)   |
+    //       |        C        |        C        |
+    // bcast |   <C,C,C,C>     |   <C,C,C,C>     |
+    // This way we can see how each of the 8 invocations end up with the 6th
+    // element of the total sub-group.
+    VecData = B.CreateExtractElement(VecData, VecIdx, "vec.extract");
+  } else if (auto *const CIdx = dyn_cast<ConstantInt>(VecIdx)) {
+    // The shuffle produces a vector, and we have a constant shuffle index - we
+    // can extract a subvector easily.
+    // Imagine i=6 (6 % 4 = 2 and 6 / 4 = 1):
+    //       |     shuffle(<X,Y>, 6)   |     shuffle(<A,B>, 6)   |
+    // VF=4  |-------------------------|-------------------------|
+    //       | s(<X,Y,Z,W,P,Q,-,->, 2) | s(<A,B,C,D,E,F,-,->, 2) |
+    // vec 2 |           <P,Q>         |           <E,F>         |
+    // shuff |     shuffle(<P,Q>, 1)   |     shuffle(<E,F>, 1)   |
+    //       |           <E,F>         |           <E,F>         |
+    // bcast |   <E,F,E,F,E,F,E,F>     |   <E,F,E,F,E,F,E,F>     |
+    // This way we can see how each of the 8 invocations end up with the 6th
+    // element of the total sub-group, which is a two-element vector.
+
+    // Note: the subvector extraction index has to be i64. Scale it up by the
+    // size of the vector we're extracting: the index is the *element* from
+    // which to extract - it is not implicitly scaled by the vector size.
+    auto *const ExtractIdx = B.getInt64(
+        CIdx->getZExtValue() * DataVecTy->getElementCount().getFixedValue());
+    VecData = B.CreateExtractVector(Data->getType(), VecData, ExtractIdx,
+                                    "vec.extract");
+  } else {
+    // This is as above, but the process of extracting the initial vector is
+    // more complicated - we have to manually extract and insert each element.
+    // It's possible that for some targets and for some combinations of vector
+    // width and vectorization factor, that going through memory would be
+    // faster.
+    Value *ExtractedVec = UndefValue::get(DataVecTy);
+    unsigned const DataNumElts = DataVecTy->getElementCount().getFixedValue();
+    auto *const BaseIdx = B.CreateMul(VecIdx, B.getInt32(DataNumElts));
+    for (unsigned i = 0; i < DataNumElts; i++) {
+      auto *const SubIdx = B.CreateAdd(BaseIdx, B.getInt32(i));
+      auto *const Elt = B.CreateExtractElement(VecData, SubIdx);
+      ExtractedVec = B.CreateInsertElement(ExtractedVec, Elt, B.getInt32(i));
+    }
+    VecData = ExtractedVec;
+  }
+
+  // We leave the original shuffle call in place and divert the vectorized
+  // shuffle through it (rewriting its operands), giving us a shuffle over the
+  // full apparent sub-group size (vecz * mux).
+  CI->setOperand(0, VecData);
+  CI->setOperand(1, MuxIdx);
+
+  return CI;
 }
 
 Value *Packetizer::Impl::packetizeMaskVarying(Instruction *I) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle.ll
new file mode 100644
index 0000000000000..eae0ed336e8e1
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle.ll
@@ -0,0 +1,190 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -w 4 -vecz-passes=packetizer,verify -S \
+; RUN:   --pass-remarks-missed=vecz < %s 2>&1 | FileCheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; See @kernel_varying_idx, below
+; CHECK: Could not packetize sub-group shuffle %shuffle9
+
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel(ptr %in, ptr %out)
+; CHECK: [[VECIDX:%.*]] = urem i32 %size_minus_1, 4
+; CHECK: [[MUXIDX:%.*]] = udiv i32 %size_minus_1, 4
+; CHECK: [[VEC:%.*]] = extractelement <4 x i64> {{%.*}}, i32 [[VECIDX]]
+; CHECK: [[SHUFFLE:%.*]] = call i64 @__mux_sub_group_shuffle_i64(i64 [[VEC]], i32 [[MUXIDX]])
+; CHECK: [[SPLATINS:%.*]] = insertelement <4 x i64> poison, i64 [[SHUFFLE]], i64 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i64> [[SPLATINS]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK: store <4 x i64> [[SPLAT]]
+define spir_kernel void @kernel(ptr %in, ptr %out) {
+  %gid = tail call i64 @__mux_get_global_id(i32 0)
+  %size = call i32 @__mux_get_sub_group_size()
+  %size_minus_1 = sub i32 %size, 1
+  %arrayidx.in = getelementptr inbounds i64, ptr %in, i64 %gid
+  %val = load i64, ptr %arrayidx.in, align 8
+  %shuffle1 = call i64 @__mux_sub_group_shuffle_i64(i64 %val, i32 %size_minus_1)
+  %arrayidx.out = getelementptr inbounds i64, ptr %out, i64 %gid
+  store i64 %shuffle1, ptr %arrayidx.out, align 8
+  ret void
+}
+
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_vec_data(ptr %in, ptr %out)
+; CHECK: [[VECIDX:%.*]] = urem i32 %size_minus_1, 4
+; CHECK: [[MUXIDX:%.*]] = udiv i32 %size_minus_1, 4
+; CHECK: [[BASE:%.*]] = mul i32 [[VECIDX]], 2
+; CHECK: [[IDX0:%.*]] = add i32 [[BASE]], 0
+; CHECK: [[ELT0:%.*]] = extractelement <8 x float> [[DATA:%.*]], i32 [[IDX0]]
+; CHECK: [[TVEC:%.*]] = insertelement <2 x float> undef, float [[ELT0]], i32 0
+; CHECK: [[IDX1:%.*]] = add i32 [[BASE]], 1
+; CHECK: [[ELT1:%.*]] = extractelement <8 x float> [[DATA]], i32 [[IDX1]]
+; CHECK: [[VEC:%.*]] = insertelement <2 x float> [[TVEC]], float [[ELT1]], i32 1
+; CHECK: [[SHUFFLE:%.*]] = call <2 x float> @__mux_sub_group_shuffle_v2f32(<2 x float> [[VEC]], i32 [[MUXIDX]])
+; CHECK: [[SPLAT:%.*]] = shufflevector <2 x float> [[SHUFFLE]], <2 x float> undef,
+; CHECK-SAME:                          <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+define spir_kernel void @kernel_vec_data(ptr %in, ptr %out) {
+  %gid = tail call i64 @__mux_get_global_id(i32 0)
+  %size = call i32 @__mux_get_sub_group_size()
+  %size_minus_1 = sub i32 %size, 1
+  %arrayidx.in = getelementptr inbounds <2 x float>, ptr %in, i64 %gid
+  %val = load <2 x float>, ptr %arrayidx.in, align 8
+  %shuffle2 = call <2 x float> @__mux_sub_group_shuffle_v2f32(<2 x float> %val, i32 %size_minus_1)
+  %arrayidx.out = getelementptr inbounds <2 x float>, ptr %out, i64 %gid
+  store <2 x float> %shuffle2, ptr %arrayidx.out, align 8
+  ret void
+}
+
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_const_idx(ptr %in, ptr %out)
+; CHECK: [[VEC:%.*]] = extractelement <4 x i64> {{%.*}}, i32 1
+; CHECK: [[SHUFFLE:%.*]] = call i64 @__mux_sub_group_shuffle_i64(i64 [[VEC]], i32 0)
+; CHECK: [[SPLATINS:%.*]] = insertelement <4 x i64> poison, i64 [[SHUFFLE]], i64 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i64> [[SPLATINS]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK: store <4 x i64> [[SPLAT]]
+define spir_kernel void @kernel_const_idx(ptr %in, ptr %out) {
+  %gid = tail call i64 @__mux_get_global_id(i32 0)
+  %arrayidx.in = getelementptr inbounds i64, ptr %in, i64 %gid
+  %val = load i64, ptr %arrayidx.in, align 8
+  %shuffle3 = call i64 @__mux_sub_group_shuffle_i64(i64 %val, i32 1)
+  %arrayidx.out = getelementptr inbounds i64, ptr %out, i64 %gid
+  store i64 %shuffle3, ptr %arrayidx.out, align 8
+  ret void
+}
+
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_vec_data_const_idx(ptr %in, ptr %out)
+; We're wanting the "1th" sub-group member, which becomes the 2-element vector
+; at element index 2
+; CHECK: [[VEC:%.*]] = call <2 x float> @llvm.vector.extract.v2f32.v8f32(<8 x float> {{%.*}}, i64 2)
+; CHECK: [[SHUFFLE:%.*]] = call <2 x float> @__mux_sub_group_shuffle_v2f32(<2 x float> [[VEC]], i32 0)
+; CHECK: [[SPLAT:%.*]] = shufflevector <2 x float> [[SHUFFLE]], <2 x float> undef,
+; CHECK-SAME:                          <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+; CHECK: store <8 x float> [[SPLAT]]
+define spir_kernel void @kernel_vec_data_const_idx(ptr %in, ptr %out) {
+  %gid = tail call i64 @__mux_get_global_id(i32 0)
+  %arrayidx.in = getelementptr inbounds <2 x float>, ptr %in, i64 %gid
+  %val = load <2 x float>, ptr %arrayidx.in, align 8
+  %shuffle4 = call <2 x float> @__mux_sub_group_shuffle_v2f32(<2 x float> %val, i32 1)
+  %arrayidx.out = getelementptr inbounds <2 x float>, ptr %out, i64 %gid
+  store <2 x float> %shuffle4, ptr %arrayidx.out, align 8
+  ret void
+}
+
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_uniform_data(i64 %val, ptr %out)
+; It doesn't matter what sub-group index we choose because the data is uniform.
+; Just splat it.
+; CHECK: [[SPLATINS:%.*]] = insertelement <4 x i64> poison, i64 %val, i64 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i64> [[SPLATINS]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK: store <4 x i64> [[SPLAT]]
+define spir_kernel void @kernel_uniform_data(i64 %val, ptr %out) {
+  %gid = tail call i64 @__mux_get_global_id(i32 0)
+  %size = call i32 @__mux_get_sub_group_size()
+  %size_minus_1 = sub i32 %size, 1
+  %shuffle5 = call i64 @__mux_sub_group_shuffle_i64(i64 %val, i32 %size_minus_1)
+  %arrayidx.out = getelementptr inbounds i64, ptr %out, i64 %gid
+  store i64 %shuffle5, ptr %arrayidx.out, align 8
+  ret void
+}
+
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_uniform_data_varying_idx(i64 %val, ptr %idxs, ptr %out)
+; It doesn't matter what sub-group index we choose because the data is uniform.
+; Just splat it.
+; CHECK: [[SPLATINS:%.*]] = insertelement <4 x i64> poison, i64 %val, i64 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i64> [[SPLATINS]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK: store <4 x i64> [[SPLAT]]
+define spir_kernel void @kernel_uniform_data_varying_idx(i64 %val, ptr %idxs, ptr %out) {
+  %gid = tail call i64 @__mux_get_global_id(i32 0)
+  %arrayidx.idxs = getelementptr inbounds i32, ptr %idxs, i64 %gid
+  %idx = load i32, ptr %arrayidx.idxs, align 4
+  %shuffle6 = call i64 @__mux_sub_group_shuffle_i64(i64 %val, i32 %idx)
+  %arrayidx.out = getelementptr inbounds i64, ptr %out, i64 %gid
+  store i64 %shuffle6, ptr %arrayidx.out, align 8
+  ret void
+}
+
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_uniform_vec_data(<2 x float> %val, ptr %out)
+; It doesn't matter what sub-group index we choose because the data is uniform.
+; Just splat it.
+; CHECK: [[SPLAT:%.*]] = shufflevector <2 x float> %val, <2 x float> undef,
+; CHECK-SAME:                          <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+; CHECK: store <8 x float> [[SPLAT]]
+define spir_kernel void @kernel_uniform_vec_data(<2 x float> %val, ptr %out) {
+  %gid = tail call i64 @__mux_get_global_id(i32 0)
+  %size = call i32 @__mux_get_sub_group_size()
+  %size_minus_1 = sub i32 %size, 1
+  %shuffle7 = call <2 x float> @__mux_sub_group_shuffle_v2f32(<2 x float> %val, i32 %size_minus_1)
+  %arrayidx.out = getelementptr inbounds <2 x float>, ptr %out, i64 %gid
+  store <2 x float> %shuffle7, ptr %arrayidx.out, align 8
+  ret void
+}
+
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_uniform_vec_data_varying_idx(<2 x float> %val, ptr %idxs, ptr %out)
+; It doesn't matter what sub-group index we choose because the data is uniform.
+; Just splat it.
+; CHECK: [[SPLAT:%.*]] = shufflevector <2 x float> %val, <2 x float> undef,
+; CHECK-SAME:                          <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+; CHECK: store <8 x float> [[SPLAT]]
+define spir_kernel void @kernel_uniform_vec_data_varying_idx(<2 x float> %val, ptr %idxs, ptr %out) {
+  %gid = tail call i64 @__mux_get_global_id(i32 0)
+  %arrayidx.idxs = getelementptr inbounds i32, ptr %idxs, i64 %gid
+  %idx = load i32, ptr %arrayidx.idxs, align 4
+  %shuffle8 = call <2 x float> @__mux_sub_group_shuffle_v2f32(<2 x float> %val, i32 %idx)
+  %arrayidx.out = getelementptr inbounds <2 x float>, ptr %out, i64 %gid
+  store <2 x float> %shuffle8, ptr %arrayidx.out, align 8
+  ret void
+}
+
+; We don't support vectorization of varying indices (for now) - see the check
+; above (which is printed before the final IR)
+define spir_kernel void @kernel_varying_idx(ptr %in, ptr %idxs, ptr %out) {
+  %gid = tail call i64 @__mux_get_global_id(i32 0)
+  %size = call i32 @__mux_get_sub_group_size()
+  %size_minus_1 = sub i32 %size, 1
+  %arrayidx.in = getelementptr inbounds i64, ptr %in, i64 %gid
+  %val = load i64, ptr %arrayidx.in, align 8
+  %arrayidx.idxs = getelementptr inbounds i32, ptr %idxs, i64 %gid
+  %idx = load i32, ptr %arrayidx.idxs, align 4
+  %shuffle9 = call i64 @__mux_sub_group_shuffle_i64(i64 %val, i32 %idx)
+  %arrayidx.out = getelementptr inbounds i64, ptr %out, i64 %gid
+  store i64 %shuffle9, ptr %arrayidx.out, align 8
+  ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+
+declare i32 @__mux_get_sub_group_size()
+
+declare i64 @__mux_sub_group_shuffle_i64(i64 %val, i32 %lid)
+declare <2 x float> @__mux_sub_group_shuffle_v2f32(<2 x float> %val, i32 %lid)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_down.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_down.ll
new file mode 100644
index 0000000000000..a3566c8e61dd7
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_down.ll
@@ -0,0 +1,36 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -w 4 -vecz-passes=packetizer,verify -S \
+; RUN:   --pass-remarks-missed=vecz < %s 2>&1 | FileCheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; CHECK: Could not packetize sub-group shuffle %shuffle_down
+define spir_kernel void @kernel(ptr %in, ptr %out) {
+  %gid = tail call i64 @__mux_get_global_id(i32 0)
+  %arrayidx.in = getelementptr inbounds i8, ptr %in, i64 %gid
+  %val = load i8, ptr %arrayidx.in, align 8
+  %shuffle_down = call i8 @__mux_sub_group_shuffle_down_i8(i8 %val, i8 %val, i32 1)
+  %arrayidx.out = getelementptr inbounds i8, ptr %out, i64 %gid
+  store i8 %shuffle_down, ptr %arrayidx.out, align 8
+  ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+
+declare i8 @__mux_sub_group_shuffle_down_i8(i8 %curr, i8 %next, i32 %delta)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_up.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_up.ll
new file mode 100644
index 0000000000000..e27365e85c946
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_up.ll
@@ -0,0 +1,36 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -w 4 -vecz-passes=packetizer,verify -S \
+; RUN:   --pass-remarks-missed=vecz < %s 2>&1 | FileCheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; CHECK: Could not packetize sub-group shuffle %shuffle_up
+define spir_kernel void @kernel(ptr %in, ptr %out) {
+  %gid = tail call i64 @__mux_get_global_id(i32 0)
+  %arrayidx.in = getelementptr inbounds float, ptr %in, i64 %gid
+  %val = load float, ptr %arrayidx.in, align 8
+  %shuffle_up = call float @__mux_sub_group_shuffle_up_f32(float %val, float %val, i32 1)
+  %arrayidx.out = getelementptr inbounds float, ptr %out, i64 %gid
+  store float %shuffle_up, ptr %arrayidx.out, align 8
+  ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+
+declare float @__mux_sub_group_shuffle_up_f32(float %prev, float %curr, i32 %delta)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_xor.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_xor.ll
new file mode 100644
index 0000000000000..eb8d3820b8ba9
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_xor.ll
@@ -0,0 +1,36 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -w 4 -vecz-passes=packetizer,verify -S \
+; RUN:   --pass-remarks-missed=vecz < %s 2>&1 | FileCheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; CHECK: Could not packetize sub-group shuffle %shuffle_xor
+define spir_kernel void @kernel(ptr %in, ptr %out) {
+  %gid = tail call i64 @__mux_get_global_id(i32 0)
+  %arrayidx.in = getelementptr inbounds half, ptr %in, i64 %gid
+  %val = load half, ptr %arrayidx.in, align 8
+  %shuffle_xor = call half @__mux_sub_group_shuffle_xor_f16(half %val, i32 -1)
+  %arrayidx.out = getelementptr inbounds half, ptr %out, i64 %gid
+  store half %shuffle_xor, ptr %arrayidx.out, align 8
+  ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+
+declare half @__mux_sub_group_shuffle_xor_f16(half %val, i32 %xor_val)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffles.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffles.ll
deleted file mode 100644
index 7c109f4b48e75..0000000000000
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffles.ll
+++ /dev/null
@@ -1,79 +0,0 @@
-; Copyright (C) Codeplay Software Limited
-;
-; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
-; Exceptions; you may not use this file except in compliance with the License.
-; You may obtain a copy of the License at
-;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
-;
-; Unless required by applicable law or agreed to in writing, software
-; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-; License for the specific language governing permissions and limitations
-; under the License.
-;
-; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-; RUN: veczc -w 4 -vecz-passes=packetizer -S \
-; RUN:   --pass-remarks-missed=vecz < %s 2>&1 | FileCheck %s
-
-target triple = "spir64-unknown-unknown"
-target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-; CHECK: Could not packetize sub-group shuffle %shuffle
-define spir_kernel void @kernel1(ptr addrspace(1) %in, ptr addrspace(1) %out) {
-  %gid = tail call i64 @__mux_get_global_id(i32 0)
-  %size = call i32 @__mux_get_sub_group_size()
-  %size_minus_1 = sub i32 %size, 1
-  %arrayidx.in = getelementptr inbounds i64, ptr addrspace(1) %in, i64 %gid
-  %val = load i64, ptr addrspace(1) %arrayidx.in, align 8
-  %shuffle = call i64 @__mux_sub_group_shuffle_i64(i64 %val, i32 %size_minus_1)
-  %arrayidx.out = getelementptr inbounds i64, ptr addrspace(1) %out, i64 %gid
-  store i64 %shuffle, ptr addrspace(1) %arrayidx.out, align 8
-  ret void
-}
-
-; CHECK: Could not packetize sub-group shuffle %shuffle_up
-define spir_kernel void @kernel2(ptr addrspace(1) %in, ptr addrspace(1) %out) {
-  %gid = tail call i64 @__mux_get_global_id(i32 0)
-  %arrayidx.in = getelementptr inbounds float, ptr addrspace(1) %in, i64 %gid
-  %val = load float, ptr addrspace(1) %arrayidx.in, align 8
-  %shuffle_up = call float @__mux_sub_group_shuffle_up_f32(float %val, float %val, i32 1)
-  %arrayidx.out = getelementptr inbounds float, ptr addrspace(1) %out, i64 %gid
-  store float %shuffle_up, ptr addrspace(1) %arrayidx.out, align 8
-  ret void
-}
-
-; CHECK: Could not packetize sub-group shuffle %shuffle_down
-define spir_kernel void @kernel3(ptr addrspace(1) %in, ptr addrspace(1) %out) {
-  %gid = tail call i64 @__mux_get_global_id(i32 0)
-  %arrayidx.in = getelementptr inbounds i8, ptr addrspace(1) %in, i64 %gid
-  %val = load i8, ptr addrspace(1) %arrayidx.in, align 8
-  %shuffle_down = call i8 @__mux_sub_group_shuffle_down_i8(i8 %val, i8 %val, i32 1)
-  %arrayidx.out = getelementptr inbounds i8, ptr addrspace(1) %out, i64 %gid
-  store i8 %shuffle_down, ptr addrspace(1) %arrayidx.out, align 8
-  ret void
-}
-
-; CHECK: Could not packetize sub-group shuffle %shuffle_xor
-define spir_kernel void @kernel4(ptr addrspace(1) %in, ptr addrspace(1) %out) {
-  %gid = tail call i64 @__mux_get_global_id(i32 0)
-  %arrayidx.in = getelementptr inbounds half, ptr addrspace(1) %in, i64 %gid
-  %val = load half, ptr addrspace(1) %arrayidx.in, align 8
-  %shuffle_xor = call half @__mux_sub_group_shuffle_xor_f16(half %val, i32 -1)
-  %arrayidx.out = getelementptr inbounds half, ptr addrspace(1) %out, i64 %gid
-  store half %shuffle_xor, ptr addrspace(1) %arrayidx.out, align 8
-  ret void
-}
-
-declare i64 @__mux_get_global_id(i32)
-
-declare i32 @__mux_get_sub_group_size()
-
-declare i64 @__mux_sub_group_shuffle_i64(i64 %val, i32 %lid)
-
-declare half @__mux_sub_group_shuffle_xor_f16(half %val, i32 %xor_val)
-
-declare i8 @__mux_sub_group_shuffle_down_i8(i8 %curr, i8 %next, i32 %delta)
-
-declare float @__mux_sub_group_shuffle_up_f32(float %prev, float %curr, i32 %delta)

From 14225d93373860fe6981312e32aaab06817c430f Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Thu, 26 Oct 2023 18:08:36 +0100
Subject: [PATCH 044/182] [vecz] Packetize sub-group shuffle_xor builtins

This extends fixed-width vectorization capabilities to
`__mux_sub_group_shuffle_xor` builtins. This isn't something that is
very efficiently vectorized, because of all of the runtime indexing,
which no built-in LLVM instructions/intrinsics can really make use of.

It might be preferable for some targets to go through memory. We might
want to make that a codegen option in a future update.
---
 .../vecz/source/transform/packetizer.cpp      | 183 +++++++++++++++-
 .../test/lit/llvm/subgroup_shuffle_xor.ll     | 207 +++++++++++++++++-
 2 files changed, 380 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
index 26c1a56f23ed3..456197020dd25 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -221,6 +221,17 @@ class Packetizer::Impl : public Packetizer {
   ///
   /// @return Packetized instructions.
   Value *packetizeSubgroupShuffle(Instruction *Ins);
+  /// @brief Packetize a sub-group shuffle-xor builtin
+  ///
+  /// Note - not any shuffle-like operation, but specifically the 'shuffle_xor'
+  /// builtin.
+  ///
+  /// @param[in] Ins Instruction to packetize.
+  /// @param[in] ShuffleXor Shuffle to packetize.
+  ///
+  /// @return Packetized instructions.
+  Result packetizeSubgroupShuffleXor(
+      Instruction *Ins, compiler::utils::GroupCollective ShuffleXor);
 
   /// @brief Packetize PHI node.
   ///
@@ -926,10 +937,19 @@ Packetizer::Result Packetizer::Impl::packetize(Value *V) {
   }
 
   if (auto shuffle = isSubgroupShuffleLike(Ins)) {
-    if (shuffle->Op == compiler::utils::GroupCollective::OpKind::Shuffle) {
-      if (auto *s = packetizeSubgroupShuffle(Ins)) {
-        return broadcast(s);
-      }
+    switch (shuffle->Op) {
+      default:
+        break;
+      case compiler::utils::GroupCollective::OpKind::Shuffle:
+        if (auto *s = packetizeSubgroupShuffle(Ins)) {
+          return broadcast(s);
+        }
+        break;
+      case compiler::utils::GroupCollective::OpKind::ShuffleXor:
+        if (auto s = packetizeSubgroupShuffleXor(Ins, *shuffle)) {
+          return s;
+        }
+        break;
     }
     // We can't packetize all sub-group shuffle-like operations, but we also
     // can't vectorize or instantiate them - so provide a diagnostic saying as
@@ -1414,6 +1434,161 @@ Value *Packetizer::Impl::packetizeSubgroupShuffle(Instruction *I) {
   return CI;
 }
 
+Packetizer::Result Packetizer::Impl::packetizeSubgroupShuffleXor(
+    Instruction *I, compiler::utils::GroupCollective ShuffleXor) {
+  auto *const CI = cast<CallInst>(I);
+
+  // We don't support scalable vectorization of sub-group shuffles.
+  if (SimdWidth.isScalable()) {
+    return Packetizer::Result(*this);
+  }
+  unsigned const VF = SimdWidth.getFixedValue();
+
+  auto *const Data = CI->getArgOperand(0);
+  auto *const Val = CI->getArgOperand(1);
+
+  auto PackData = packetize(Data);
+  if (!PackData) {
+    return Packetizer::Result(*this);
+  }
+
+  // If the data operand happened to be a broadcast value already, we can use
+  // it directly.
+  if (PackData.info->numInstances == 0) {
+    IC.deleteInstructionLater(CI);
+    CI->replaceAllUsesWith(Data);
+    return PackData;
+  }
+
+  auto PackVal = packetize(Val);
+  if (!PackVal) {
+    return Packetizer::Result(*this);
+  }
+
+  // With the packetized operands in place, we have to perform the actual
+  // shuffling operation. Since we are one layer higher than the mux
+  // sub-groups, our IDs do not easily translate to the mux level. Therefore we
+  // perform each shuffle using the regular 'shuffle' and do the XOR of the IDs
+  // ourselves.
+
+  // Note: in this illustrative example, imagine two invocations across a
+  // single mux sub-group, each being vectorized by 4; in other words, 8
+  // 'original' invocations to a sub-group, running in two vectorized
+  // invocations. Imagine value = 5:
+  //                |  shuffle_xor(X, 5)   |  shuffle_xor(A, 5)   |
+  // VF=4           |----------------------|----------------------|
+  //                |    s(<X,Y,Z,W>, 5)   |    s(<A,B,C,D>, 5)   |
+  // SG IDs         |       0,1,2,3        |       4,5,6,7        |
+  // SG IDs^5       |       5,4,7,6        |       1,0,3,2        |
+  // I=(SG IDs^5)/4 |       1,1,1,1        |       0,0,0,0        |
+  // J=(SG IDs^5)%4 |       1,0,3,2        |       1,0,3,2        |
+  // <X,Y,Z,W>[J]   |       Y,X,W,Z        |       B,A,D,C        |
+  // Mux-shuffle[I] | [Y,B][1],[X,A][1],.. | [Y,B][0],[X,A][0],.. |
+  //                |       B,A,D,C        |       Y,X,W,Z        |
+  IRBuilder<> B(CI);
+
+  auto *const SubgroupLocalIDFn = Ctx.builtins().getOrDeclareMuxBuiltin(
+      compiler::utils::eMuxBuiltinGetSubGroupLocalId, *F.getParent(),
+      {CI->getType()});
+  assert(SubgroupLocalIDFn);
+
+  auto *const SubgroupLocalID =
+      B.CreateCall(SubgroupLocalIDFn, {}, "sg.local.id");
+  auto const Builtin =
+      Ctx.builtins().analyzeBuiltinCall(*SubgroupLocalID, Dimension);
+
+  // Vectorize the sub-group local ID
+  auto *const VecSubgroupLocalID =
+      vectorizeWorkGroupCall(SubgroupLocalID, Builtin);
+  if (!VecSubgroupLocalID) {
+    return Packetizer::Result(*this);
+  }
+  VecSubgroupLocalID->setName("vec.sg.local.id");
+
+  // The value is always i32, as is the sub-group local ID. Vectorizing both of
+  // them should result in the same vector type, with as many elements as the
+  // vectorization factor.
+  auto *const VecVal = PackVal.getAsValue();
+
+  assert(VecVal->getType() == VecSubgroupLocalID->getType() &&
+         VecVal->getType()->isVectorTy() &&
+         cast<VectorType>(VecVal->getType())
+                 ->getElementCount()
+                 .getKnownMinValue() == VF &&
+         "Unexpected vectorization of sub-group shuffle xor");
+
+  // Perform the XOR of the sub-group IDs with the 'value', as per the
+  // semantics of the builtin.
+  auto *const XoredID = B.CreateXor(VecSubgroupLocalID, VecVal);
+
+  // The vectorization factor as a constant: the XOR'd IDs are split below into
+  // a mux sub-group index and an index within one vectorized group.
+  auto *const VecIdxFactor = ConstantInt::get(SubgroupLocalID->getType(), VF);
+
+  // Bring this ID into the range of 'mux' sub-groups by dividing it by the
+  // vector size.
+  auto *const MuxXoredID =
+      B.CreateUDiv(XoredID, B.CreateVectorSplat(VF, VecIdxFactor));
+  // And into the range of the vector group
+  auto *const VecXoredID =
+      B.CreateURem(XoredID, B.CreateVectorSplat(VF, VecIdxFactor));
+
+  // Now we defer to the regular mux sub-group 'shuffle' builtin.
+  auto RegularShuffle = ShuffleXor;
+  RegularShuffle.Op = compiler::utils::GroupCollective::OpKind::Shuffle;
+
+  auto RegularShuffleID = Ctx.builtins().getMuxGroupCollective(RegularShuffle);
+  assert(RegularShuffleID != compiler::utils::eBuiltinInvalid);
+
+  auto *const RegularShuffleFn = Ctx.builtins().getOrDeclareMuxBuiltin(
+      RegularShuffleID, *F.getParent(), {CI->getType()});
+  assert(RegularShuffleFn);
+
+  auto *const VecData = PackData.getAsValue();
+  Value *CombinedShuffle = UndefValue::get(VecData->getType());
+
+  for (unsigned i = 0; i < VF; i++) {
+    auto *Idx = B.getInt32(i);
+    // Get the XORd index local to the vector group that this vector group
+    // element wants to shuffle with.
+    auto *const VecGroupIdx = B.CreateExtractElement(VecXoredID, Idx);
+    // Grab that element. It may be a vector, in which case we must extract
+    // each element individually.
+    Value *DataElt = nullptr;
+    if (auto *DataVecTy = dyn_cast<VectorType>(Data->getType()); !DataVecTy) {
+      DataElt = B.CreateExtractElement(VecData, VecGroupIdx);
+    } else {
+      DataElt = UndefValue::get(DataVecTy);
+      auto VecWidth = DataVecTy->getElementCount().getFixedValue();
+      // VecGroupIdx is the 'base' of the subvector, whose elements are stored
+      // sequentially from that point.
+      auto *const VecVecGroupIdx =
+          B.CreateMul(VecGroupIdx, B.getInt32(VecWidth));
+      for (unsigned j = 0; j != VecWidth; j++) {
+        auto *const Elt = B.CreateExtractElement(
+            VecData, B.CreateAdd(VecVecGroupIdx, B.getInt32(j)));
+        DataElt = B.CreateInsertElement(DataElt, Elt, B.getInt32(j));
+      }
+    }
+    assert(DataElt);
+    // Shuffle it across the mux sub-group.
+    auto *const MuxID = B.CreateExtractElement(MuxXoredID, Idx);
+    auto *const Shuff = B.CreateCall(RegularShuffleFn, {DataElt, MuxID});
+    // Combine that back into the final shuffled vector.
+    if (auto *DataVecTy = dyn_cast<VectorType>(Data->getType()); !DataVecTy) {
+      CombinedShuffle = B.CreateInsertElement(CombinedShuffle, Shuff, Idx);
+    } else {
+      auto VecWidth = DataVecTy->getElementCount().getFixedValue();
+      CombinedShuffle = B.CreateInsertVector(
+          CombinedShuffle->getType(), CombinedShuffle, Shuff,
+          B.getInt64(static_cast<uint64_t>(i) * VecWidth));
+    }
+  }
+
+  IC.deleteInstructionLater(CI);
+  return assign(CI, CombinedShuffle);
+}
+
 Value *Packetizer::Impl::packetizeMaskVarying(Instruction *I) {
   if (auto memop = MemOp::get(I)) {
     auto *const mask = memop->getMaskOperand();
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_xor.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_xor.ll
index eb8d3820b8ba9..e84ec6ba216d0 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_xor.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_xor.ll
@@ -20,17 +20,212 @@
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-; CHECK: Could not packetize sub-group shuffle %shuffle_xor
-define spir_kernel void @kernel(ptr %in, ptr %out) {
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_varying_data_const_value(ptr %in, ptr %out)
+; The XOR'd sub-group local IDs
+; CHECK: [[XORIDS:%.*]] = xor <4 x i32>
+; Which mux sub-group each of the XOR'd sub-group local IDs correspond to
+; CHECK-DAG: [[MUXXORIDS:%.*]] = udiv <4 x i32> [[XORIDS]], <i32 4, i32 4, i32 4, i32 4>
+; Which vector group element each of the XOR'd sub-group local IDs correspond to
+; CHECK-DAG: [[VECXORIDS:%.*]] = urem <4 x i32> [[XORIDS]], <i32 4, i32 4, i32 4, i32 4>
+
+; Extract the first XOR'd vector-local sub-group local ID from the vector of vector indices
+; CHECK: [[IDXELT0:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 0
+; Extract the data element that this XOR'd local ID corresponds to
+; CHECK: [[ELT0:%.*]] = extractelement <4 x half> [[DATA:%.*]], i32 [[IDXELT0]]
+; Extract the first XOR'd mux-local sub-group local ID from the vector of mux indices
+; CHECK: [[ID0:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], i32 0
+; Shuffle across any hardware sub-group
+; CHECK: [[SHUFF_ELT0:%.*]] = call half @__mux_sub_group_shuffle_f16(half [[ELT0]], i32 [[ID0]])
+; Put that result into the final vector
+; CHECK: [[SHUFF_VEC0:%.*]] = insertelement <4 x half> undef, half [[SHUFF_ELT0]], i32 0
+
+; And so on for the other shuffle values
+; CHECK: [[IDXELT1:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 1
+; CHECK: [[ELT1:%.*]] = extractelement <4 x half> [[DATA]], i32 [[IDXELT1]]
+; CHECK: [[ID1:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], i32 1
+; CHECK: [[SHUFF_ELT1:%.*]] = call half @__mux_sub_group_shuffle_f16(half [[ELT1]], i32 [[ID1]])
+; CHECK: [[SHUFF_VEC1:%.*]] = insertelement <4 x half> [[SHUFF_VEC0]], half [[SHUFF_ELT1]], i32 1
+
+; CHECK: [[IDXELT2:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 2
+; CHECK: [[ELT2:%.*]] = extractelement <4 x half> [[DATA]], i32 [[IDXELT2]]
+; CHECK: [[ID2:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], i32 2
+; CHECK: [[SHUFF_ELT2:%.*]] = call half @__mux_sub_group_shuffle_f16(half [[ELT2]], i32 [[ID2]])
+; CHECK: [[SHUFF_VEC2:%.*]] = insertelement <4 x half> [[SHUFF_VEC1]], half [[SHUFF_ELT2]], i32 2
+
+; CHECK: [[IDXELT3:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 3
+; CHECK: [[ELT3:%.*]] = extractelement <4 x half> [[DATA]], i32 [[IDXELT3]]
+; CHECK: [[ID3:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], i32 3
+; CHECK: [[SHUFF_ELT3:%.*]] = call half @__mux_sub_group_shuffle_f16(half [[ELT3]], i32 [[ID3]])
+; CHECK: [[SHUFF_VEC3:%.*]] = insertelement <4 x half> [[SHUFF_VEC2]], half [[SHUFF_ELT3]], i32 3
+
+; CHECK: store <4 x half> [[SHUFF_VEC3]],
+define spir_kernel void @kernel_varying_data_const_value(ptr %in, ptr %out) {
+  %gid = tail call i64 @__mux_get_global_id(i32 0)
+  %arrayidx.in = getelementptr inbounds half, ptr %in, i64 %gid
+  %data = load half, ptr %arrayidx.in, align 2
+  %shuffle1 = call half @__mux_sub_group_shuffle_xor_f16(half %data, i32 4)
+  %arrayidx.out = getelementptr inbounds half, ptr %out, i64 %gid
+  store half %shuffle1, ptr %arrayidx.out, align 2
+  ret void
+}
+
+; This should just be the same as the previous kernel. The uniform value doesn't change anything.
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_varying_data_uniform_value(ptr %in, i32 %val, ptr %out)
+; CHECK: [[XORIDS:%.*]] = xor <4 x i32>
+; CHECK-DAG: [[MUXXORIDS:%.*]] = udiv <4 x i32> [[XORIDS]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-DAG: [[VECXORIDS:%.*]] = urem <4 x i32> [[XORIDS]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK: [[IDXELT0:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 0
+; CHECK: [[ELT0:%.*]] = extractelement <4 x half> [[DATA:%.*]], i32 [[IDXELT0]]
+; CHECK: [[ID0:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], i32 0
+; CHECK: [[SHUFF_ELT0:%.*]] = call half @__mux_sub_group_shuffle_f16(half [[ELT0]], i32 [[ID0]])
+; CHECK: [[SHUFF_VEC0:%.*]] = insertelement <4 x half> undef, half [[SHUFF_ELT0]], i32 0
+; CHECK: [[IDXELT1:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 1
+; CHECK: [[ELT1:%.*]] = extractelement <4 x half> [[DATA]], i32 [[IDXELT1]]
+; CHECK: [[ID1:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], i32 1
+; CHECK: [[SHUFF_ELT1:%.*]] = call half @__mux_sub_group_shuffle_f16(half [[ELT1]], i32 [[ID1]])
+; CHECK: [[SHUFF_VEC1:%.*]] = insertelement <4 x half> [[SHUFF_VEC0]], half [[SHUFF_ELT1]], i32 1
+; CHECK: [[IDXELT2:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 2
+; CHECK: [[ELT2:%.*]] = extractelement <4 x half> [[DATA]], i32 [[IDXELT2]]
+; CHECK: [[ID2:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], i32 2
+; CHECK: [[SHUFF_ELT2:%.*]] = call half @__mux_sub_group_shuffle_f16(half [[ELT2]], i32 [[ID2]])
+; CHECK: [[SHUFF_VEC2:%.*]] = insertelement <4 x half> [[SHUFF_VEC1]], half [[SHUFF_ELT2]], i32 2
+; CHECK: [[IDXELT3:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 3
+; CHECK: [[ELT3:%.*]] = extractelement <4 x half> [[DATA]], i32 [[IDXELT3]]
+; CHECK: [[ID3:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], i32 3
+; CHECK: [[SHUFF_ELT3:%.*]] = call half @__mux_sub_group_shuffle_f16(half [[ELT3]], i32 [[ID3]])
+; CHECK: [[SHUFF_VEC3:%.*]] = insertelement <4 x half> [[SHUFF_VEC2]], half [[SHUFF_ELT3]], i32 3
+; CHECK: store <4 x half> [[SHUFF_VEC3]],
+define spir_kernel void @kernel_varying_data_uniform_value(ptr %in, i32 %val, ptr %out) {
   %gid = tail call i64 @__mux_get_global_id(i32 0)
   %arrayidx.in = getelementptr inbounds half, ptr %in, i64 %gid
-  %val = load half, ptr %arrayidx.in, align 8
-  %shuffle_xor = call half @__mux_sub_group_shuffle_xor_f16(half %val, i32 -1)
+  %data = load half, ptr %arrayidx.in, align 2
+  %shuffle2 = call half @__mux_sub_group_shuffle_xor_f16(half %data, i32 %val)
+  %arrayidx.out = getelementptr inbounds half, ptr %out, i64 %gid
+  store half %shuffle2, ptr %arrayidx.out, align 2
+  ret void
+}
+
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_uniform_data_uniform_value(half %data, i32 %val, ptr %out)
+; CHECK: [[SPLATINS:%.*]] = insertelement <4 x half> poison, half %data, i64 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <4 x half> [[SPLATINS]], <4 x half> poison, <4 x i32> zeroinitializer
+; CHECK: store <4 x half> [[SPLAT]]
+define spir_kernel void @kernel_uniform_data_uniform_value(half %data, i32 %val, ptr %out) {
+  %gid = tail call i64 @__mux_get_global_id(i32 0)
+  %shuffle3 = call half @__mux_sub_group_shuffle_xor_f16(half %data, i32 %val)
   %arrayidx.out = getelementptr inbounds half, ptr %out, i64 %gid
-  store half %shuffle_xor, ptr %arrayidx.out, align 8
+  store half %shuffle3, ptr %arrayidx.out, align 2
+  ret void
+}
+
+; This should just be the same as the first two kernels. The varying value doesn't change anything.
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_varying_data_varying_value(ptr %in, ptr %vals, ptr %out)
+; CHECK: [[XORIDS:%.*]] = xor <4 x i32>
+; CHECK-DAG: [[MUXXORIDS:%.*]] = udiv <4 x i32> [[XORIDS]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-DAG: [[VECXORIDS:%.*]] = urem <4 x i32> [[XORIDS]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK: [[IDXELT0:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 0
+; CHECK: [[ELT0:%.*]] = extractelement <4 x half> [[DATA:%.*]], i32 [[IDXELT0]]
+; CHECK: [[ID0:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], i32 0
+; CHECK: [[SHUFF_ELT0:%.*]] = call half @__mux_sub_group_shuffle_f16(half [[ELT0]], i32 [[ID0]])
+; CHECK: [[SHUFF_VEC0:%.*]] = insertelement <4 x half> undef, half [[SHUFF_ELT0]], i32 0
+; CHECK: [[IDXELT1:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 1
+; CHECK: [[ELT1:%.*]] = extractelement <4 x half> [[DATA]], i32 [[IDXELT1]]
+; CHECK: [[ID1:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], i32 1
+; CHECK: [[SHUFF_ELT1:%.*]] = call half @__mux_sub_group_shuffle_f16(half [[ELT1]], i32 [[ID1]])
+; CHECK: [[SHUFF_VEC1:%.*]] = insertelement <4 x half> [[SHUFF_VEC0]], half [[SHUFF_ELT1]], i32 1
+; CHECK: [[IDXELT2:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 2
+; CHECK: [[ELT2:%.*]] = extractelement <4 x half> [[DATA]], i32 [[IDXELT2]]
+; CHECK: [[ID2:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], i32 2
+; CHECK: [[SHUFF_ELT2:%.*]] = call half @__mux_sub_group_shuffle_f16(half [[ELT2]], i32 [[ID2]])
+; CHECK: [[SHUFF_VEC2:%.*]] = insertelement <4 x half> [[SHUFF_VEC1]], half [[SHUFF_ELT2]], i32 2
+; CHECK: [[IDXELT3:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 3
+; CHECK: [[ELT3:%.*]] = extractelement <4 x half> [[DATA]], i32 [[IDXELT3]]
+; CHECK: [[ID3:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], i32 3
+; CHECK: [[SHUFF_ELT3:%.*]] = call half @__mux_sub_group_shuffle_f16(half [[ELT3]], i32 [[ID3]])
+; CHECK: [[SHUFF_VEC3:%.*]] = insertelement <4 x half> [[SHUFF_VEC2]], half [[SHUFF_ELT3]], i32 3
+; CHECK: store <4 x half> [[SHUFF_VEC3]],
+define spir_kernel void @kernel_varying_data_varying_value(ptr %in, ptr %vals, ptr %out) {
+  %gid = tail call i64 @__mux_get_global_id(i32 0)
+  %arrayidx.in = getelementptr inbounds half, ptr %in, i64 %gid
+  %data = load half, ptr %arrayidx.in, align 2
+  %arrayidx.vals = getelementptr inbounds i32, ptr %vals, i64 %gid ; XOR values come from %vals, not %in
+  %val = load i32, ptr %arrayidx.vals, align 4
+  %shuffle4 = call half @__mux_sub_group_shuffle_xor_f16(half %data, i32 %val)
+  %arrayidx.out = getelementptr inbounds half, ptr %out, i64 %gid
+  store half %shuffle4, ptr %arrayidx.out, align 2
+  ret void
+}
+
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_varying_vec_data_varying_value(ptr %in, ptr %vals, ptr %out)
+; CHECK: [[XORIDS:%.*]] = xor <4 x i32>
+; CHECK-DAG: [[MUXXORIDS:%.*]] = udiv <4 x i32> [[XORIDS]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-DAG: [[VECXORIDS:%.*]] = urem <4 x i32> [[XORIDS]], <i32 4, i32 4, i32 4, i32 4>
+
+; CHECK: [[IDXELT0:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 0
+; CHECK: [[MULIDXELT0:%.*]] = mul i32 [[IDXELT0]], 2
+; CHECK: [[MADIDXELT00:%.*]] = add i32 [[MULIDXELT0]], 0
+; CHECK: [[ELT00:%.*]] = extractelement <8 x float> [[DATA:%.*]], i32 [[MADIDXELT00]]
+; CHECK: [[DATAELT00:%.*]] = insertelement <2 x float> undef, float [[ELT00]], i32 0
+; CHECK: [[MADIDXELT01:%.*]] = add i32 [[MULIDXELT0]], 1
+; CHECK: [[ELT01:%.*]] = extractelement <8 x float> [[DATA:%.*]], i32 [[MADIDXELT01]]
+; CHECK: [[DATAELT01:%.*]] = insertelement <2 x float> [[DATAELT00]], float [[ELT01]], i32 1
+; CHECK: [[ID0:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], i32 0
+; CHECK: [[SHUFF_ELT0:%.*]] = call <2 x float> @__mux_sub_group_shuffle_v2f32(<2 x float> [[DATAELT01]], i32 [[ID0]])
+; CHECK: [[SHUFF_RES0:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v2f32(
+; CHECK-SAME:                                      <8 x float> undef, <2 x float> [[SHUFF_ELT0]], i64 0)
+
+; CHECK: [[IDXELT1:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 1
+; CHECK: [[MULIDXELT1:%.*]] = mul i32 [[IDXELT1]], 2
+; CHECK: [[MADIDXELT10:%.*]] = add i32 [[MULIDXELT1]], 0
+; CHECK: [[ELT10:%.*]] = extractelement <8 x float> [[DATA:%.*]], i32 [[MADIDXELT10]]
+; CHECK: [[DATAELT10:%.*]] = insertelement <2 x float> undef, float [[ELT10]], i32 0
+; CHECK: [[MADIDXELT11:%.*]] = add i32 [[MULIDXELT1]], 1
+; CHECK: [[ELT11:%.*]] = extractelement <8 x float> [[DATA:%.*]], i32 [[MADIDXELT11]]
+; CHECK: [[DATAELT11:%.*]] = insertelement <2 x float> [[DATAELT10]], float [[ELT11]], i32 1
+; CHECK: [[ID1:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], i32 1
+; CHECK: [[SHUFF_ELT1:%.*]] = call <2 x float> @__mux_sub_group_shuffle_v2f32(<2 x float> [[DATAELT11]], i32 [[ID1]])
+; CHECK: [[SHUFF_RES1:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v2f32(
+; CHECK-SAME:                                      <8 x float> [[SHUFF_RES0]], <2 x float> [[SHUFF_ELT1]], i64 2)
+
+; CHECK: [[IDXELT2:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 2
+; CHECK: [[MULIDXELT2:%.*]] = mul i32 [[IDXELT2]], 2
+; CHECK: [[MADIDXELT20:%.*]] = add i32 [[MULIDXELT2]], 0
+; CHECK: [[ELT20:%.*]] = extractelement <8 x float> [[DATA:%.*]], i32 [[MADIDXELT20]]
+; CHECK: [[DATAELT20:%.*]] = insertelement <2 x float> undef, float [[ELT20]], i32 0
+; CHECK: [[MADIDXELT21:%.*]] = add i32 [[MULIDXELT2]], 1
+; CHECK: [[ELT21:%.*]] = extractelement <8 x float> [[DATA:%.*]], i32 [[MADIDXELT21]]
+; CHECK: [[DATAELT21:%.*]] = insertelement <2 x float> [[DATAELT20]], float [[ELT21]], i32 1
+; CHECK: [[ID2:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], i32 2
+; CHECK: [[SHUFF_ELT2:%.*]] = call <2 x float> @__mux_sub_group_shuffle_v2f32(<2 x float> [[DATAELT21]], i32 [[ID2]])
+; CHECK: [[SHUFF_RES2:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v2f32(
+; CHECK-SAME:                                      <8 x float> [[SHUFF_RES1]], <2 x float> [[SHUFF_ELT2]], i64 4)
+
+; CHECK: [[IDXELT3:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 3
+; CHECK: [[MULIDXELT3:%.*]] = mul i32 [[IDXELT3]], 2
+; CHECK: [[MADIDXELT30:%.*]] = add i32 [[MULIDXELT3]], 0
+; CHECK: [[ELT30:%.*]] = extractelement <8 x float> [[DATA:%.*]], i32 [[MADIDXELT30]]
+; CHECK: [[DATAELT30:%.*]] = insertelement <2 x float> undef, float [[ELT30]], i32 0
+; CHECK: [[MADIDXELT31:%.*]] = add i32 [[MULIDXELT3]], 1
+; CHECK: [[ELT31:%.*]] = extractelement <8 x float> [[DATA:%.*]], i32 [[MADIDXELT31]]
+; CHECK: [[DATAELT31:%.*]] = insertelement <2 x float> [[DATAELT30]], float [[ELT31]], i32 1
+; CHECK: [[ID3:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], i32 3
+; CHECK: [[SHUFF_ELT3:%.*]] = call <2 x float> @__mux_sub_group_shuffle_v2f32(<2 x float> [[DATAELT31]], i32 [[ID3]])
+; CHECK: [[SHUFF_RES3:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v2f32(
+; CHECK-SAME:                                      <8 x float> [[SHUFF_RES2]], <2 x float> [[SHUFF_ELT3]], i64 6)
+
+; CHECK: store <8 x float> [[SHUFF_RES3]]
+define spir_kernel void @kernel_varying_vec_data_varying_value(ptr %in, ptr %vals, ptr %out) {
+  %gid = tail call i64 @__mux_get_global_id(i32 0)
+  %arrayidx.in = getelementptr inbounds <2 x float>, ptr %in, i64 %gid
+  %data = load <2 x float>, ptr %arrayidx.in, align 8
+  %arrayidx.vals = getelementptr inbounds i32, ptr %vals, i64 %gid ; XOR values come from %vals, not %in
+  %val = load i32, ptr %arrayidx.vals, align 4
+  %shuffle5 = call <2 x float> @__mux_sub_group_shuffle_xor_v2f32(<2 x float> %data, i32 %val)
+  %arrayidx.out = getelementptr inbounds <2 x float>, ptr %out, i64 %gid
+  store <2 x float> %shuffle5, ptr %arrayidx.out, align 8
   ret void
 }
 
 declare i64 @__mux_get_global_id(i32)
 
-declare half @__mux_sub_group_shuffle_xor_f16(half %val, i32 %xor_val)
+declare half @__mux_sub_group_shuffle_xor_f16(half, i32)
+declare <2 x float> @__mux_sub_group_shuffle_xor_v2f32(<2 x float>, i32)

From 3871885e8fa971bb38ef34a57abc2ce72c93263e Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Thu, 2 Nov 2023 17:11:20 +0000
Subject: [PATCH 045/182] [multi_llvm] Remove ArrayRef helpers

---
 .../include/multi_llvm/multi_llvm.h            | 18 ------------------
 1 file changed, 18 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
index a9c99fcb3f61c..d32dfab405ddd 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
@@ -29,24 +29,6 @@
 
 namespace multi_llvm {
 
-template <typename T>
-llvm::ArrayRef<T> ArrayRef(T *data, size_t size) {
-#if LLVM_VERSION_MAJOR >= 16
-  return llvm::ArrayRef<T>(data, size);
-#else
-  return llvm::makeArrayRef<T>(data, size);
-#endif
-}
-
-template <typename T>
-llvm::ArrayRef<T> ArrayRef(llvm::SmallVectorImpl<T> &data) {
-#if LLVM_VERSION_MAJOR >= 16
-  return llvm::ArrayRef<T>(data.data(), data.size());
-#else
-  return llvm::makeArrayRef<T>(data.data(), data.size());
-#endif
-}
-
 // LLVM 11 changes the InlineFunction API so it takes the CallBase argument as
 // a reference now. Therefore, we need a generic helper that will also work for
 // prior LLVM versions.

From c22c69f19a88b1c3f91bf81a64d7b2490abe5b16 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Thu, 2 Nov 2023 17:14:48 +0000
Subject: [PATCH 046/182] [multi_llvm] Remove TypeSize helpers

---
 .../include/multi_llvm/multi_llvm.h            | 18 ------------------
 .../source/analysis/simd_width_analysis.cpp    |  5 +++--
 .../vecz/source/offset_info.cpp                |  2 +-
 .../vecz/source/transform/packetizer.cpp       | 11 +++++------
 .../vecz/source/vector_target_info.cpp         | 10 ++++++----
 5 files changed, 15 insertions(+), 31 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
index d32dfab405ddd..23805e2748de4 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
@@ -67,24 +67,6 @@ inline void insertAtEnd(llvm::BasicBlock *bb, llvm::Instruction *newInst) {
 #endif
 }
 
-template <typename T>
-inline typename std::remove_reference_t<T>::ScalarTy getFixedValue(T &&V) {
-#if LLVM_VERSION_MAJOR >= 16
-  return V.getFixedValue();
-#else
-  return V.getFixedSize();
-#endif
-}
-
-template <typename T>
-inline typename std::remove_reference_t<T>::ScalarTy getKnownMinValue(T &&M) {
-#if LLVM_VERSION_MAJOR >= 16
-  return M.getKnownMinValue();
-#else
-  return M.getKnownMinSize();
-#endif
-}
-
 /// @brief Create a binary operation corresponding to the given
 /// `llvm::RecurKind` with the two provided arguments. It may not
 /// necessarily return one of LLVM's in-built `BinaryOperator`s, or even one
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/simd_width_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/simd_width_analysis.cpp
index ba8eeaa77a09b..fa5c64bb78480 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/simd_width_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/simd_width_analysis.cpp
@@ -171,8 +171,9 @@ SimdWidthAnalysis::Result SimdWidthAnalysis::run(
   VectorizationUnit &VU = AM.getResult<VectorizationUnitAnalysis>(F).getVU();
 
   // If the target does not provide vector registers, return 0.
-  MaxVecRegBitWidth = multi_llvm::getFixedValue(
-      TTI.getRegisterBitWidth(llvm::TargetTransformInfo::RGK_FixedWidthVector));
+  MaxVecRegBitWidth =
+      TTI.getRegisterBitWidth(llvm::TargetTransformInfo::RGK_FixedWidthVector)
+          .getFixedValue();
 
   if (MaxVecRegBitWidth == 0) {
     return 0;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp
index a73ed219d7941..41bc88eefe6bd 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp
@@ -36,7 +36,7 @@ using namespace llvm;
 
 namespace {
 inline uint64_t SizeOrZero(TypeSize &&T) {
-  return T.isScalable() ? 0 : multi_llvm::getFixedValue(T);
+  return T.isScalable() ? 0 : T.getFixedValue();
 }
 
 uint8_t highbit(const uint32_t x) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
index 456197020dd25..da8eb1a6c1572 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -1737,10 +1737,10 @@ ValuePacket Packetizer::Impl::packetizeCall(CallInst *CI) {
           // If it's an alloca we can widen, we can just change the size
           llvm::TypeSize const allocSize =
               Ctx.dataLayout()->getTypeAllocSize(alloca->getAllocatedType());
-          auto const lifeSize = allocSize.isScalable() || SimdWidth.isScalable()
-                                    ? -1
-                                    : multi_llvm::getKnownMinValue(allocSize) *
-                                          SimdWidth.getKnownMinValue();
+          auto const lifeSize =
+              allocSize.isScalable() || SimdWidth.isScalable()
+                  ? -1
+                  : allocSize.getKnownMinValue() * SimdWidth.getKnownMinValue();
           CI->setOperand(
               0, ConstantInt::get(CI->getOperand(0)->getType(), lifeSize));
           results.push_back(CI);
@@ -2421,8 +2421,7 @@ ValuePacket Packetizer::Impl::packetizeMemOp(MemOp &op) {
     // alignment, but may be overaligned. After vectorization it can't be
     // larger than the pointee element type.
     unsigned alignment = op.getAlignment();
-    unsigned sizeInBits =
-        multi_llvm::getKnownMinValue(dataTy->getPrimitiveSizeInBits());
+    unsigned sizeInBits = dataTy->getPrimitiveSizeInBits().getKnownMinValue();
     alignment = std::min(alignment, std::max(sizeInBits, 8u) / 8u);
 
     // Regular load or store.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
index 7a3d3d74dd8eb..4b4b99069253b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
@@ -1278,8 +1278,9 @@ bool TargetInfo::interleaveVectors(IRBuilder<> &B,
 unsigned TargetInfo::estimateSimdWidth(const TargetTransformInfo &TTI,
                                        const ArrayRef<const Value *> vals,
                                        unsigned width) const {
-  unsigned MaxVecRegBitWidth = multi_llvm::getFixedValue(
-      TTI.getRegisterBitWidth(llvm::TargetTransformInfo::RGK_FixedWidthVector));
+  unsigned MaxVecRegBitWidth =
+      TTI.getRegisterBitWidth(llvm::TargetTransformInfo::RGK_FixedWidthVector)
+          .getFixedValue();
 
   unsigned NumVecRegs =
       TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true));
@@ -1302,8 +1303,9 @@ unsigned TargetInfo::estimateSimdWidth(const TargetTransformInfo &TTI,
 
 unsigned TargetInfo::getVectorWidthForType(const llvm::TargetTransformInfo &TTI,
                                            const llvm::Type &Ty) const {
-  unsigned MaxVecRegBitWidth = multi_llvm::getFixedValue(
-      TTI.getRegisterBitWidth(llvm::TargetTransformInfo::RGK_FixedWidthVector));
+  unsigned MaxVecRegBitWidth =
+      TTI.getRegisterBitWidth(llvm::TargetTransformInfo::RGK_FixedWidthVector)
+          .getFixedValue();
 
   if (MaxVecRegBitWidth == 0) {
     return 0;

From 6b4ec57082324858e433d0bb52dee643d0695890 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Thu, 2 Nov 2023 17:16:48 +0000
Subject: [PATCH 047/182] [multi_llvm] Remove insertInto helpers

---
 .../compiler_pipeline/include/multi_llvm/multi_llvm.h     | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
index 23805e2748de4..39c6f557bd298 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
@@ -59,14 +59,6 @@ inline llvm::DILocation *getDILocation(unsigned Line, unsigned Column,
                                InlinedAt, /*ImplicitCode*/ false);
 }
 
-inline void insertAtEnd(llvm::BasicBlock *bb, llvm::Instruction *newInst) {
-#if LLVM_VERSION_MAJOR >= 16
-  newInst->insertInto(bb, bb->end());
-#else
-  bb->getInstList().push_back(newInst);
-#endif
-}
-
 /// @brief Create a binary operation corresponding to the given
 /// `llvm::RecurKind` with the two provided arguments. It may not
 /// necessarily return one of LLVM's in-built `BinaryOperator`s, or even one

From e4dc6c4fd1ecc416d5dd681ca253b5c405ac1bc9 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Thu, 2 Nov 2023 17:20:22 +0000
Subject: [PATCH 048/182] [multi_llvm] Remove InlineFunction helper

---
 .../include/multi_llvm/multi_llvm.h              | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
index 39c6f557bd298..bd1e1d73f18fe 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
@@ -29,22 +29,6 @@
 
 namespace multi_llvm {
 
-// LLVM 11 changes the InlineFunction API so it takes the CallBase argument as
-// a reference now. Therefore, we need a generic helper that will also work for
-// prior LLVM versions.
-inline llvm::InlineResult InlineFunction(llvm::CallInst *CI,
-                                         llvm::InlineFunctionInfo &IFI,
-                                         llvm::AAResults *CalleeAAR = nullptr,
-                                         bool InsertLifetime = true) {
-#if LLVM_VERSION_MAJOR >= 16
-  return llvm::InlineFunction(*CI, IFI, /* MergeAttributes */ false, CalleeAAR,
-                              InsertLifetime,
-                              /* *ForwardVarArgsTo */ nullptr);
-#else
-  return llvm::InlineFunction(*CI, IFI, CalleeAAR, InsertLifetime);
-#endif
-}
-
 inline llvm::StructType *getStructTypeByName(llvm::LLVMContext &ctx,
                                              llvm::StringRef name) {
   return llvm::StructType::getTypeByName(ctx, name);

From b88164f0729e737ed5f9dcef47bf0c5f213e4a5d Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Thu, 2 Nov 2023 17:21:48 +0000
Subject: [PATCH 049/182] [multi_llvm] Remove
 addVectorizableFunctionsFromVecLib helper

---
 .../compiler_pipeline/include/multi_llvm/multi_llvm.h | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
index bd1e1d73f18fe..7bdf35348f015 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
@@ -95,17 +95,6 @@ inline llvm::Value *createBinOpForRecurKind(llvm::IRBuilder<> &B,
             : (isSigned ? llvm::Intrinsic::smax : llvm::Intrinsic::umax);
   return B.CreateBinaryIntrinsic(intrOpc, lhs, rhs);
 }
-
-inline void addVectorizableFunctionsFromVecLib(
-    llvm::TargetLibraryInfoImpl &TLII,
-    llvm::TargetLibraryInfoImpl::VectorLibrary VecLib, llvm::Triple TT) {
-#if LLVM_VERSION_MAJOR >= 16
-  TLII.addVectorizableFunctionsFromVecLib(VecLib, TT);
-#else
-  (void)TT;
-  TLII.addVectorizableFunctionsFromVecLib(VecLib);
-#endif
-}
 }  // namespace multi_llvm
 
 #endif  // MULTI_LLVM_MULTI_LLVM_H_INCLUDED

From 41dad0f9b8ea0bc515f66613647fc67d3689a821 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Thu, 2 Nov 2023 17:24:01 +0000
Subject: [PATCH 050/182] [multi_llvm] Clean up unused includes

---
 .../compiler_pipeline/include/multi_llvm/multi_llvm.h         | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
index 7bdf35348f015..620e7bc96eb71 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
@@ -16,11 +16,7 @@
 #ifndef MULTI_LLVM_MULTI_LLVM_H_INCLUDED
 #define MULTI_LLVM_MULTI_LLVM_H_INCLUDED
 
-#include <llvm/ADT/ArrayRef.h>
-#include <llvm/ADT/SmallVector.h>
 #include <llvm/Analysis/IVDescriptors.h>
-#include <llvm/Analysis/TargetLibraryInfo.h>
-#include <llvm/IR/BasicBlock.h>
 #include <llvm/IR/IRBuilder.h>
 #include <llvm/IR/Instructions.h>
 #include <llvm/Transforms/Utils/Cloning.h>

From f1132928c1ce78cdbe5dbad3a22e367c4f8fb06c Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Thu, 2 Nov 2023 17:26:01 +0000
Subject: [PATCH 051/182] [multi_llvm] Remove Optional helper

---
 .../include/multi_llvm/optional_helper.h      | 78 -------------------
 1 file changed, 78 deletions(-)
 delete mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/optional_helper.h

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/optional_helper.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/optional_helper.h
deleted file mode 100644
index 410606bc736ff..0000000000000
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/optional_helper.h
+++ /dev/null
@@ -1,78 +0,0 @@
-// Copyright (C) Codeplay Software Limited
-//
-// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
-// Exceptions; you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-// License for the specific language governing permissions and limitations
-// under the License.
-//
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-#ifndef MULTI_LLVM_OPTIONAL_HELPER_H_INCLUDED
-#define MULTI_LLVM_OPTIONAL_HELPER_H_INCLUDED
-
-#include <multi_llvm/llvm_version.h>
-
-#if (LLVM_VERSION_MAJOR < 17)
-#include <llvm/ADT/None.h>
-#include <llvm/ADT/Optional.h>
-#endif
-
-#include <optional>
-
-namespace multi_llvm {
-
-#if (LLVM_VERSION_MAJOR >= 16)
-
-template <typename T>
-using Optional = std::optional<T>;
-static constexpr std::nullopt_t None = std::nullopt;
-
-#else
-
-using llvm::None;
-using llvm::NoneType;
-template <typename T>
-class Optional : public llvm::Optional<T> {
- public:
-  constexpr Optional() = default;
-  constexpr Optional(llvm::NoneType) {}
-
-  constexpr Optional(const T &value) : llvm::Optional<T>(value) {}
-  constexpr Optional(T &&value) : llvm::Optional<T>(std::move(value)) {}
-
-  Optional &operator=(const T &y) {
-    llvm::Optional<T>::operator=(y);
-    return *this;
-  }
-  Optional &operator=(T &&y) {
-    llvm::Optional<T>::operator=(std::forward<T>(y));
-    return *this;
-  }
-
-  constexpr Optional(llvm::Optional<T> &&value)
-      : llvm::Optional<T>(std::move(value)) {}
-
-  inline constexpr bool has_value() const {
-    return llvm::Optional<T>::hasValue();
-  }
-
-  // Provide implicit conversions to the future proof std::optional.
-  inline constexpr operator std::optional<T>() const {
-    return llvm::Optional<T>::hasValue()
-               ? std::optional<T>(llvm::Optional<T>::getValue())
-               : std::nullopt;
-  }
-};
-
-#endif
-
-}  // namespace multi_llvm
-
-#endif  // MULTI_LLVM_OPTIONAL_HELPER_H_INCLUDED

From 7009d6a6f98b402174a5679f43f7b25b093c565c Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Thu, 2 Nov 2023 17:26:51 +0000
Subject: [PATCH 052/182] Remove old LLVM 15 code

---
 .../compiler_passes/vecz/source/control_flow_boscc.cpp      | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_boscc.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_boscc.cpp
index 0eb13795b4b51..4c24097961066 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_boscc.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_boscc.cpp
@@ -185,14 +185,8 @@ bool ControlFlowConversionState::BOSCCGadget::duplicateUniformRegions() {
     // first div_causing block.
     if (!sortedNewRegionBlocks.empty() &&
         entry->getNextNode() != sortedNewRegionBlocks[0]) {
-#if LLVM_VERSION_MAJOR >= 16
       F.splice(entry->getNextNode()->getIterator(), &F,
                sortedNewRegionBlocks[0]->getIterator(), F.end());
-#else
-      F.getBasicBlockList().splice(
-          entry->getNextNode()->getIterator(), F.getBasicBlockList(),
-          sortedNewRegionBlocks[0]->getIterator(), F.end());
-#endif
     }
   }
 

From 48113797d29a3472f5b42530a265d307af7b64ca Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Thu, 2 Nov 2023 17:33:17 +0000
Subject: [PATCH 053/182] [multi_llvm] Remove getStructTypeByName helper

---
 .../compiler_pipeline/include/multi_llvm/multi_llvm.h        | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
index 620e7bc96eb71..544b9e75f9c41 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
@@ -25,11 +25,6 @@
 
 namespace multi_llvm {
 
-inline llvm::StructType *getStructTypeByName(llvm::LLVMContext &ctx,
-                                             llvm::StringRef name) {
-  return llvm::StructType::getTypeByName(ctx, name);
-}
-
 inline llvm::DILocation *getDILocation(unsigned Line, unsigned Column,
                                        llvm::MDNode *Scope,
                                        llvm::MDNode *InlinedAt = nullptr) {

From 85623271122806797559b9c7654cc8e51e63c967 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Thu, 2 Nov 2023 17:42:46 +0000
Subject: [PATCH 054/182] [multi_llvm] Remove getDILocation helper

I've moved this into the vectorizer where it's arguably useful in a
localized context. The two other uses outside of the vectorizer don't
need the helper at all, because their scope is always non-null.
---
 .../include/multi_llvm/multi_llvm.h           |  9 ------
 .../vecz/source/vectorization_helpers.cpp     | 32 +++++++++++--------
 2 files changed, 18 insertions(+), 23 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
index 544b9e75f9c41..e23bf6dfa22b9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
@@ -25,15 +25,6 @@
 
 namespace multi_llvm {
 
-inline llvm::DILocation *getDILocation(unsigned Line, unsigned Column,
-                                       llvm::MDNode *Scope,
-                                       llvm::MDNode *InlinedAt = nullptr) {
-  // If no scope is available, this is an unknown location.
-  if (!Scope) return llvm::DebugLoc();
-  return llvm::DILocation::get(Scope->getContext(), Line, Column, Scope,
-                               InlinedAt, /*ImplicitCode*/ false);
-}
-
 /// @brief Create a binary operation corresponding to the given
 /// `llvm::RecurKind` with the two provided arguments. It may not
 /// necessarily return one of LLVM's in-built `BinaryOperator`s, or even one
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_helpers.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_helpers.cpp
index e8b7dc9360f7f..2d0c98fc340c9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_helpers.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_helpers.cpp
@@ -22,8 +22,6 @@
 #include <llvm/IR/DIBuilder.h>
 #include <llvm/IR/IntrinsicInst.h>
 #include <llvm/Support/Debug.h>
-#include <multi_llvm/multi_llvm.h>
-#include <multi_llvm/vector_type_helper.h>
 
 #include "debugging.h"
 #include "vectorization_context.h"
@@ -221,6 +219,14 @@ Function *cloneFunctionToVector(VectorizationUnit const &VU) {
   return VectorizedFn;
 }
 
+static DILocation *getDILocation(unsigned Line, unsigned Column, MDNode *Scope,
+                                 MDNode *InlinedAt = nullptr) {
+  // If no scope is available, this is an unknown location.
+  if (!Scope) return DebugLoc();
+  return DILocation::get(Scope->getContext(), Line, Column, Scope, InlinedAt,
+                         /*ImplicitCode*/ false);
+}
+
 void cloneDebugInfo(VectorizationUnit const &VU) {
   DISubprogram *const ScalarDI = VU.scalarFunction()->getSubprogram();
   // We don't have debug info
@@ -279,14 +285,13 @@ void cloneDebugInfo(VectorizationUnit const &VU) {
             continue;
           }
 
-          const DebugLoc InlinedAtLoc = multi_llvm::getDILocation(
+          const DebugLoc InlinedAtLoc = getDILocation(
               InlinedLoc->getLine(), InlinedLoc->getColumn(), VectorDI);
-          VectorLoc =
-              multi_llvm::getDILocation(ScalarLoc.getLine(), ScalarLoc.getCol(),
-                                        ScalarLoc.getScope(), InlinedAtLoc);
+          VectorLoc = getDILocation(ScalarLoc.getLine(), ScalarLoc.getCol(),
+                                    ScalarLoc.getScope(), InlinedAtLoc);
         } else {
-          VectorLoc = multi_llvm::getDILocation(ScalarLoc.getLine(),
-                                                ScalarLoc.getCol(), VectorDI);
+          VectorLoc =
+              getDILocation(ScalarLoc.getLine(), ScalarLoc.getCol(), VectorDI);
         }
 
         // New DILocalVariable in the scope of vectorized function
@@ -353,15 +358,14 @@ void cloneDebugInfo(VectorizationUnit const &VU) {
         if (DILocation *const InlinedLoc = ScalarLoc.getInlinedAt()) {
           // Don't support nested inlined locations for now
           if (!InlinedLoc->getInlinedAt()) {
-            const DebugLoc VectorKernel = multi_llvm::getDILocation(
+            const DebugLoc VectorKernel = getDILocation(
                 InlinedLoc->getLine(), InlinedLoc->getColumn(), VectorDI);
-            VectorLoc = multi_llvm::getDILocation(
-                ScalarLoc.getLine(), ScalarLoc.getCol(), ScalarLoc.getScope(),
-                VectorKernel);
+            VectorLoc = getDILocation(ScalarLoc.getLine(), ScalarLoc.getCol(),
+                                      ScalarLoc.getScope(), VectorKernel);
           }
         } else {
-          VectorLoc = multi_llvm::getDILocation(ScalarLoc.getLine(),
-                                                ScalarLoc.getCol(), VectorDI);
+          VectorLoc =
+              getDILocation(ScalarLoc.getLine(), ScalarLoc.getCol(), VectorDI);
         }
         InstItr.setDebugLoc(VectorLoc);
       }

From 3da0921bba7a07964d59aac70ecdf09b7f61d2c6 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Thu, 2 Nov 2023 17:55:30 +0000
Subject: [PATCH 055/182] [multi_llvm] Move some vectorization helpers into the
 vectorizer

---
 .../include/multi_llvm/creation_apis_helper.h | 25 -----------------
 .../include/transform/packetization_helpers.h |  8 ++++++
 .../transform/packetization_helpers.cpp       | 26 ++++++++++++++++--
 .../vecz/source/transform/packetizer.cpp      | 27 +++++++++----------
 .../vecz/source/vector_target_info.cpp        | 11 ++++----
 5 files changed, 50 insertions(+), 47 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/creation_apis_helper.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/creation_apis_helper.h
index 815c763c88c12..2cb2d69928e08 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/creation_apis_helper.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/creation_apis_helper.h
@@ -21,34 +21,9 @@
 #include <llvm/IR/Instructions.h>
 #include <llvm/IR/Value.h>
 #include <llvm/Support/TypeSize.h>
-#include <multi_llvm/vector_type_helper.h>
 
 namespace multi_llvm {
 
-inline llvm::Value *createAllTrueMask(llvm::IRBuilder<> &B,
-                                      llvm::ElementCount EC) {
-  return llvm::ConstantInt::getTrue(llvm::VectorType::get(B.getInt1Ty(), EC));
-}
-
-inline llvm::Value *createIndexSequence(llvm::IRBuilder<> &Builder,
-                                        llvm::Type *Ty, llvm::ElementCount EC,
-                                        const llvm::Twine &Name = "") {
-  (void)Builder;
-  (void)Name;
-  if (EC.isScalable()) {
-    // FIXME: This intrinsic works on fixed-length types too: should we migrate
-    // to using it starting from LLVM 13?
-    return Builder.CreateStepVector(Ty, Name);
-  }
-
-  llvm::SmallVector<llvm::Constant *, 16> Indices;
-  unsigned SimdWidth = EC.getFixedValue();
-  for (unsigned i = 0; i < SimdWidth; i++) {
-    Indices.push_back(llvm::ConstantInt::get(getVectorElementType(Ty), i));
-  }
-  return llvm::ConstantVector::get(Indices);
-}
-
 inline llvm::CallInst *createRISCVMaskedIntrinsic(
     llvm::IRBuilder<> &B, llvm::Intrinsic::ID ID,
     llvm::ArrayRef<llvm::Type *> Types, llvm::ArrayRef<llvm::Value *> Args,
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_helpers.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_helpers.h
index 1dac719f28eee..cd8409ecb36d1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_helpers.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_helpers.h
@@ -117,6 +117,14 @@ llvm::Value *getGatherIndicesVector(llvm::IRBuilder<> &B, llvm::Value *Indices,
                                     llvm::Type *Ty, unsigned FixedVecElts,
                                     const llvm::Twine &N = "");
 
+/// @brief Returns a boolean vector with all elements set to 'true'.
+llvm::Value *createAllTrueMask(llvm::IRBuilder<> &B, llvm::ElementCount EC);
+
+/// @brief Returns an integer step vector, representing the sequence 0 ... N-1.
+llvm::Value *createIndexSequence(llvm::IRBuilder<> &Builder,
+                                 llvm::VectorType *VecTy,
+                                 const llvm::Twine &Name = "");
+
 /// @brief Class that represents a range in a vector of Value pointers.
 /// The range is represented by its integer starting index and length, so that
 /// it remains valid if the vector re-allocates its storage.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
index 6213225d42170..a6b91a3035cbe 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
@@ -253,8 +253,8 @@ Value *sanitizeVPReductionInput(IRBuilder<> &B, Value *Val, Value *VL,
   Type *const ValTy = Val->getType();
   ElementCount const EC = multi_llvm::getVectorElementCount(ValTy);
   Value *const VLSplat = B.CreateVectorSplat(EC, VL);
-  Value *const IdxVec = multi_llvm::createIndexSequence(
-      B, VectorType::get(VL->getType(), EC), EC);
+  Value *const IdxVec =
+      createIndexSequence(B, VectorType::get(VL->getType(), EC));
   Value *const ActiveMask = B.CreateICmp(CmpInst::ICMP_ULT, IdxVec, VLSplat);
   auto *const NeutralVal = compiler::utils::getNeutralVal(Kind, ValTy);
   return B.CreateSelect(ActiveMask, Val, NeutralVal);
@@ -272,6 +272,28 @@ Value *getGatherIndicesVector(IRBuilder<> &B, Value *Indices, Type *Ty,
   auto *const StepsMul = B.CreateMul(Steps, FixedVecEltsSplat);
   return B.CreateAdd(StepsMul, Indices, N);
 }
+
+Value *createAllTrueMask(IRBuilder<> &B, ElementCount EC) {
+  return ConstantInt::getTrue(VectorType::get(B.getInt1Ty(), EC));
+}
+
+Value *createIndexSequence(IRBuilder<> &Builder, VectorType *VecTy,
+                           const Twine &Name) {
+  auto EC = VecTy->getElementCount();
+  if (EC.isScalable()) {
+    // FIXME: This intrinsic works on fixed-length types too: should we migrate
+    // to using it starting from LLVM 13?
+    return Builder.CreateStepVector(VecTy, Name);
+  }
+
+  SmallVector<Constant *, 16> Indices;
+  auto *EltTy = VecTy->getElementType();
+  for (unsigned i = 0, e = EC.getFixedValue(); i != e; i++) {
+    Indices.push_back(ConstantInt::get(EltTy, i));
+  }
+  return ConstantVector::get(Indices);
+}
+
 }  // namespace vecz
 
 PacketRange PacketInfo::getRange(std::vector<llvm::Value *> &d,
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
index da8eb1a6c1572..562d4431b0831 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -496,9 +496,8 @@ bool Packetizer::Impl::packetize() {
       } else if (TargetArg.PointerRetPointeeTy &&
                  PAR.needsPacketization(TargetArg.NewArg)) {
         if (!idxVector) {
-          idxVector = multi_llvm::createIndexSequence(
-              B, VectorType::get(B.getInt32Ty(), SimdWidth), SimdWidth,
-              "index.vec");
+          idxVector = createIndexSequence(
+              B, VectorType::get(B.getInt32Ty(), SimdWidth), "index.vec");
         }
 
         // CA-3943 this implementation looks unlikely to be correct, but for
@@ -2259,8 +2258,8 @@ ValuePacket Packetizer::Impl::packetizeMemOp(MemOp &op) {
       // Bitcast the above sub-splat to purely scalar pointers
       vecPtr = B.CreateBitCast(vecPtr, newPtrTy);
       // Create an index sequence to start the offseting process
-      Value *idxVector = multi_llvm::createIndexSequence(
-          B, VectorType::get(B.getInt32Ty(), wideEC), wideEC, "index.vec");
+      Value *idxVector = createIndexSequence(
+          B, VectorType::get(B.getInt32Ty(), wideEC), "index.vec");
       PACK_FAIL_IF(!idxVector);
       // Modulo the indices 0,1,2,.. with the original vector type, producing,
       // e.g., for the above: <0,1,2,3,0,1,2,3>
@@ -2313,7 +2312,7 @@ ValuePacket Packetizer::Impl::packetizeMemOp(MemOp &op) {
     if (mask || EVL) {
       if (!mask) {
         // If there's no mask then just splat a trivial one.
-        auto *const trueMask = multi_llvm::createAllTrueMask(
+        auto *const trueMask = createAllTrueMask(
             B, multi_llvm::getVectorElementCount(packetVecTy));
         std::fill(maskPacket.begin(), maskPacket.end(), trueMask);
       } else {
@@ -2374,7 +2373,7 @@ ValuePacket Packetizer::Impl::packetizeMemOp(MemOp &op) {
     if (mask || EVL) {
       if (!mask) {
         // If there's no mask then just splat a trivial one.
-        auto *const trueMask = multi_llvm::createAllTrueMask(
+        auto *const trueMask = createAllTrueMask(
             B, multi_llvm::getVectorElementCount(packetVecTy));
         std::fill(maskPacket.begin(), maskPacket.end(), trueMask);
       } else {
@@ -2602,7 +2601,7 @@ ValuePacket Packetizer::Impl::packetizeBinaryOp(BinaryOperator *BinOp) {
       PACK_FAIL_IF(packetWidth != 1);
       auto VPId = VPIntrinsic::getForOpcode(opcode);
       PACK_FAIL_IF(VPId == Intrinsic::not_intrinsic);
-      auto *const Mask = multi_llvm::createAllTrueMask(
+      auto *const Mask = createAllTrueMask(
           B, multi_llvm::getVectorElementCount(LHS[0]->getType()));
       // Scale the base length by the number of vector elements, where
       // appropriate.
@@ -2976,8 +2975,8 @@ Value *Packetizer::Impl::vectorizeWorkGroupCall(
   auto const Uniformity = Builtin.uniformity;
   if (Uniformity == compiler::utils::eBuiltinUniformityInstanceID ||
       Uniformity == compiler::utils::eBuiltinUniformityMaybeInstanceID) {
-    Value *StepVector = multi_llvm::createIndexSequence(B, Splat->getType(),
-                                                        SimdWidth, "index.vec");
+    Value *StepVector =
+        createIndexSequence(B, cast<VectorType>(Splat->getType()), "index.vec");
     VECZ_FAIL_IF(!StepVector);
 
     Value *Result = B.CreateAdd(Splat, StepVector);
@@ -3038,8 +3037,8 @@ Value *Packetizer::Impl::vectorizeAlloca(AllocaInst *alloca) {
   deleteInstructionLater(alloca);
 
   auto *const idxTy = Ctx.dataLayout()->getIndexType(wideAlloca->getType());
-  Value *const indices = multi_llvm::createIndexSequence(
-      B, VectorType::get(idxTy, SimdWidth), SimdWidth);
+  Value *const indices =
+      createIndexSequence(B, VectorType::get(idxTy, SimdWidth));
 
   return B.CreateInBoundsGEP(ty, wideAlloca, ArrayRef<Value *>{indices},
                              Twine(alloca->getName(), ".lanes"));
@@ -3421,8 +3420,8 @@ ValuePacket Packetizer::Impl::packetizeShuffleVector(
       auto *const vecMask =
           VTI.createOuterScalableBroadcast(B, mask, EVL, SimdWidth);
 
-      auto *const idxVector = multi_llvm::createIndexSequence(
-          B, VectorType::get(B.getInt32Ty(), fullWidth), fullWidth);
+      auto *const idxVector =
+          createIndexSequence(B, VectorType::get(B.getInt32Ty(), fullWidth));
 
       // We need to create offsets into the source operand subvectors, to add
       // onto the broadcast shuffle mask, so that each subvector of the
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
index 4b4b99069253b..e158a42eaab74 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
@@ -100,8 +100,7 @@ Value *TargetInfo::createLoad(IRBuilder<> &B, Type *Ty, Value *Ptr,
                              "reported it would be illegal");
         VECZ_FAIL();
       }
-      auto *Mask = multi_llvm::createAllTrueMask(
-          B, multi_llvm::getVectorElementCount(Ty));
+      auto *Mask = createAllTrueMask(B, multi_llvm::getVectorElementCount(Ty));
       SmallVector<llvm::Value *, 2> Args = {VecPtr, Mask, EVL};
       SmallVector<llvm::Type *, 2> Tys = {Ty, VecPtr->getType()};
       return B.CreateIntrinsic(llvm::Intrinsic::vp_load, Tys, Args);
@@ -169,8 +168,8 @@ Value *TargetInfo::createStore(IRBuilder<> &B, Value *Data, Value *Ptr,
                              "reported it would be illegal");
         VECZ_FAIL();
       }
-      auto *Mask = multi_llvm::createAllTrueMask(
-          B, multi_llvm::getVectorElementCount(VecTy));
+      auto *Mask =
+          createAllTrueMask(B, multi_llvm::getVectorElementCount(VecTy));
       SmallVector<llvm::Value *, 3> Args = {Data, VecPtr, Mask, EVL};
       SmallVector<llvm::Type *, 2> Tys = {Data->getType(), VecPtr->getType()};
       return B.CreateIntrinsic(llvm::Intrinsic::vp_store, Tys, Args);
@@ -438,7 +437,7 @@ Value *TargetInfo::createMaskedInterleavedLoad(IRBuilder<> &B, Type *Ty,
   Value *StrideSplat = B.CreateVectorSplat(EC, Stride);
 
   Value *IndicesVector =
-      multi_llvm::createIndexSequence(B, StrideSplat->getType(), EC);
+      createIndexSequence(B, cast<VectorType>(StrideSplat->getType()));
   VECZ_FAIL_IF(!IndicesVector);
   IndicesVector = B.CreateMul(StrideSplat, IndicesVector);
 
@@ -459,7 +458,7 @@ Value *TargetInfo::createMaskedInterleavedStore(IRBuilder<> &B, Value *Data,
   Value *StrideSplat = B.CreateVectorSplat(EC, Stride);
 
   Value *IndicesVector =
-      multi_llvm::createIndexSequence(B, StrideSplat->getType(), EC);
+      createIndexSequence(B, cast<VectorType>(StrideSplat->getType()));
   VECZ_FAIL_IF(!IndicesVector);
   IndicesVector = B.CreateMul(StrideSplat, IndicesVector);
 

From d47f463d58d66bc24f418d60f533f4a2e3c6ab1f Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Thu, 2 Nov 2023 18:02:14 +0000
Subject: [PATCH 056/182] [multi_llvm] Remove creation_apis_helper.h

The only remaining helper function was local to one file in the
vectorizer.
---
 .../include/multi_llvm/creation_apis_helper.h | 40 -------------------
 .../transform/packetization_helpers.cpp       |  1 -
 .../vecz/source/transform/packetizer.cpp      |  1 -
 .../vecz/source/vector_target_info.cpp        |  1 -
 .../vecz/source/vector_target_info_riscv.cpp  | 20 +++++++---
 .../vecz/source/vectorization_context.cpp     |  1 -
 6 files changed, 15 insertions(+), 49 deletions(-)
 delete mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/creation_apis_helper.h

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/creation_apis_helper.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/creation_apis_helper.h
deleted file mode 100644
index 2cb2d69928e08..0000000000000
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/creation_apis_helper.h
+++ /dev/null
@@ -1,40 +0,0 @@
-// Copyright (C) Codeplay Software Limited
-//
-// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
-// Exceptions; you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-// License for the specific language governing permissions and limitations
-// under the License.
-//
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#ifndef MULTI_LLVM_CREATION_APIS_HELPER_H_INCLUDED
-#define MULTI_LLVM_CREATION_APIS_HELPER_H_INCLUDED
-
-#include <llvm/IR/BasicBlock.h>
-#include <llvm/IR/IRBuilder.h>
-#include <llvm/IR/Instructions.h>
-#include <llvm/IR/Value.h>
-#include <llvm/Support/TypeSize.h>
-
-namespace multi_llvm {
-
-inline llvm::CallInst *createRISCVMaskedIntrinsic(
-    llvm::IRBuilder<> &B, llvm::Intrinsic::ID ID,
-    llvm::ArrayRef<llvm::Type *> Types, llvm::ArrayRef<llvm::Value *> Args,
-    unsigned TailPolicy, llvm::Instruction *FMFSource = nullptr,
-    const llvm::Twine &Name = "") {
-  llvm::SmallVector<llvm::Value *> InArgs(Args.begin(), Args.end());
-  InArgs.push_back(
-      B.getIntN(Args.back()->getType()->getIntegerBitWidth(), TailPolicy));
-  return B.CreateIntrinsic(ID, Types, InArgs, FMFSource, Name);
-}
-
-}  // namespace multi_llvm
-
-#endif  // MULTI_LLVM_CREATION_APIS_HELPER_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
index a6b91a3035cbe..a76651a6412f8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
@@ -27,7 +27,6 @@
 #include <llvm/Analysis/VectorUtils.h>
 #include <llvm/IR/IRBuilder.h>
 #include <llvm/IR/Instructions.h>
-#include <multi_llvm/creation_apis_helper.h>
 #include <multi_llvm/multi_llvm.h>
 #include <multi_llvm/vector_type_helper.h>
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
index 562d4431b0831..b63211def0e30 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -36,7 +36,6 @@
 #include <llvm/Support/Debug.h>
 #include <llvm/Support/raw_ostream.h>
 #include <llvm/Transforms/Utils/LoopUtils.h>
-#include <multi_llvm/creation_apis_helper.h>
 #include <multi_llvm/llvm_version.h>
 #include <multi_llvm/multi_llvm.h>
 #include <multi_llvm/vector_type_helper.h>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
index e158a42eaab74..4069f46e14b1b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
@@ -19,7 +19,6 @@
 #include <llvm/IR/Instructions.h>
 #include <llvm/MC/TargetRegistry.h>
 #include <llvm/Target/TargetMachine.h>
-#include <multi_llvm/creation_apis_helper.h>
 #include <multi_llvm/triple.h>
 #include <multi_llvm/vector_type_helper.h>
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp
index 5907b1aa14feb..e26cbd895cf4c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp
@@ -18,7 +18,6 @@
 #include <llvm/IR/IntrinsicsRISCV.h>
 #include <llvm/Support/MathExtras.h>
 #include <llvm/Target/TargetMachine.h>
-#include <multi_llvm/creation_apis_helper.h>
 #include <multi_llvm/multi_llvm.h>
 #include <multi_llvm/vector_type_helper.h>
 
@@ -444,6 +443,18 @@ llvm::Value *TargetInfoRISCV::createScalableBroadcast(llvm::IRBuilder<> &B,
   return gather;
 }
 
+static CallInst *createRISCVMaskedIntrinsic(IRBuilder<> &B, Intrinsic::ID ID,
+                                            ArrayRef<Type *> Types,
+                                            ArrayRef<Value *> Args,
+                                            unsigned TailPolicy,
+                                            Instruction *FMFSource = nullptr,
+                                            const Twine &Name = "") {
+  SmallVector<Value *> InArgs(Args.begin(), Args.end());
+  InArgs.push_back(
+      B.getIntN(Args.back()->getType()->getIntegerBitWidth(), TailPolicy));
+  return B.CreateIntrinsic(ID, Types, InArgs, FMFSource, Name);
+}
+
 llvm::Value *TargetInfoRISCV::createScalableInsertElement(
     llvm::IRBuilder<> &B, vecz::VectorizationContext &Ctx,
     llvm::Instruction *origInsert, llvm::Value *elt, llvm::Value *into,
@@ -546,10 +557,9 @@ llvm::Value *TargetInfoRISCV::createScalableInsertElement(
   // -> <0,1,0,0, 0,0,0,1, 1,0,0,0, ...>
   auto *const mask = B.CreateICmpEQ(index, innerIndices, "vm");
 
-  return multi_llvm::createRISCVMaskedIntrinsic(
-      B, intrinsicID, {intoTy, avl->getType()},
-      {into, elt, outerIndices, mask, avl},
-      /*TailUndisturbed*/ 1);
+  return createRISCVMaskedIntrinsic(B, intrinsicID, {intoTy, avl->getType()},
+                                    {into, elt, outerIndices, mask, avl},
+                                    /*TailUndisturbed*/ 1);
 }
 
 llvm::Value *TargetInfoRISCV::createVectorShuffle(llvm::IRBuilder<> &B,
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
index e6fda0c73117c..99fd244cbe0bf 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
@@ -22,7 +22,6 @@
 #include <llvm/ADT/Statistic.h>
 #include <llvm/IR/Attributes.h>
 #include <llvm/Target/TargetMachine.h>
-#include <multi_llvm/creation_apis_helper.h>
 #include <multi_llvm/vector_type_helper.h>
 
 #include <algorithm>

From a8ae40fe260d3f027e003563b1577174e2f5558e Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Thu, 2 Nov 2023 18:14:07 +0000
Subject: [PATCH 057/182] [multi_llvm] Move createBinOpForRecurKind helper

This hasn't belonged in multi_llvm for a long time. It's now just a
reusable compiler helper function.

This commit moves it into pass_functions.h, which itself is a bit
problematic as it's such a generic concept for a file. But it's better
than living in multi_llvm and we'll need to sort out 'pass_functions.h'
at some point in the future anyway.
---
 .../include/multi_llvm/multi_llvm.h           | 61 +------------------
 .../include/transform/packetization_helpers.h |  1 +
 .../source/include/vectorization_context.h    |  3 +
 .../vecz/source/transform/packetizer.cpp      | 10 +--
 .../vecz/source/vectorization_context.cpp     |  6 +-
 .../vecz/source/vectorization_helpers.cpp     |  1 +
 6 files changed, 16 insertions(+), 66 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
index e23bf6dfa22b9..b20e29b63c90c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
@@ -13,70 +13,11 @@
 // under the License.
 //
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
 #ifndef MULTI_LLVM_MULTI_LLVM_H_INCLUDED
 #define MULTI_LLVM_MULTI_LLVM_H_INCLUDED
 
-#include <llvm/Analysis/IVDescriptors.h>
-#include <llvm/IR/IRBuilder.h>
-#include <llvm/IR/Instructions.h>
-#include <llvm/Transforms/Utils/Cloning.h>
 #include <multi_llvm/llvm_version.h>
 #include <multi_llvm/triple.h>
 
-namespace multi_llvm {
-
-/// @brief Create a binary operation corresponding to the given
-/// `llvm::RecurKind` with the two provided arguments. It may not
-/// necessarily return one of LLVM's in-built `BinaryOperator`s, or even one
-/// operation: integer min/max operations may defer to multiple instructions or
-/// intrinsics depending on the LLVM version.
-///
-/// @param[in] B the IRBuilder to build new instructions
-/// @param[in] lhs the left-hand value for the operation
-/// @param[in] rhs the right-hand value for the operation
-/// @param[in] kind the kind of operation to create
-/// @param[out] The binary operation.
-inline llvm::Value *createBinOpForRecurKind(llvm::IRBuilder<> &B,
-                                            llvm::Value *lhs, llvm::Value *rhs,
-                                            llvm::RecurKind kind) {
-  switch (kind) {
-    default:
-      break;
-    case llvm::RecurKind::None:
-      return nullptr;
-    case llvm::RecurKind::Add:
-      return B.CreateAdd(lhs, rhs);
-    case llvm::RecurKind::Mul:
-      return B.CreateMul(lhs, rhs);
-    case llvm::RecurKind::Or:
-      return B.CreateOr(lhs, rhs);
-    case llvm::RecurKind::And:
-      return B.CreateAnd(lhs, rhs);
-    case llvm::RecurKind::Xor:
-      return B.CreateXor(lhs, rhs);
-    case llvm::RecurKind::FAdd:
-      return B.CreateFAdd(lhs, rhs);
-    case llvm::RecurKind::FMul:
-      return B.CreateFMul(lhs, rhs);
-  }
-  assert((kind == llvm::RecurKind::FMin || kind == llvm::RecurKind::FMax ||
-          kind == llvm::RecurKind::SMin || kind == llvm::RecurKind::SMax ||
-          kind == llvm::RecurKind::UMin || kind == llvm::RecurKind::UMax) &&
-         "Unexpected min/max kind");
-  if (kind == llvm::RecurKind::FMin || kind == llvm::RecurKind::FMax) {
-    return B.CreateBinaryIntrinsic(kind == llvm::RecurKind::FMin
-                                       ? llvm::Intrinsic::minnum
-                                       : llvm::Intrinsic::maxnum,
-                                   lhs, rhs);
-  }
-  bool isMin = kind == llvm::RecurKind::SMin || kind == llvm::RecurKind::UMin;
-  bool isSigned =
-      kind == llvm::RecurKind::SMin || kind == llvm::RecurKind::SMax;
-  llvm::Intrinsic::ID intrOpc =
-      isMin ? (isSigned ? llvm::Intrinsic::smin : llvm::Intrinsic::umin)
-            : (isSigned ? llvm::Intrinsic::smax : llvm::Intrinsic::umax);
-  return B.CreateBinaryIntrinsic(intrOpc, lhs, rhs);
-}
-}  // namespace multi_llvm
-
 #endif  // MULTI_LLVM_MULTI_LLVM_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_helpers.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_helpers.h
index cd8409ecb36d1..6366145fd2572 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_helpers.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_helpers.h
@@ -23,6 +23,7 @@
 
 #include <llvm/ADT/DenseMap.h>
 #include <llvm/ADT/SmallVector.h>
+#include <llvm/Analysis/IVDescriptors.h>
 #include <llvm/IR/IRBuilder.h>
 #include <multi_llvm/llvm_version.h>
 #include <multi_llvm/multi_llvm.h>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_context.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_context.h
index aa8b0654733ec..8c119eee3c8da 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_context.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_context.h
@@ -22,8 +22,11 @@
 #define VECZ_VECTORIZATION_CONTEXT_H_INCLUDED
 
 #include <llvm/ADT/DenseMap.h>
+#include <llvm/Analysis/IVDescriptors.h>
+#include <llvm/IR/PassManager.h>
 #include <llvm/IR/ValueHandle.h>
 #include <llvm/Support/TypeSize.h>
+#include <llvm/Transforms/Utils/ValueMapper.h>
 #include <multi_llvm/multi_llvm.h>
 
 #include <map>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
index b63211def0e30..7600f96e6c3db 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -19,8 +19,10 @@
 #include <compiler/utils/builtin_info.h>
 #include <compiler/utils/group_collective_helpers.h>
 #include <compiler/utils/mangling.h>
+#include <compiler/utils/pass_functions.h>
 #include <llvm/ADT/DepthFirstIterator.h>
 #include <llvm/ADT/SmallPtrSet.h>
+#include <llvm/ADT/SmallString.h>
 #include <llvm/ADT/Statistic.h>
 #include <llvm/ADT/Twine.h>
 #include <llvm/Analysis/LoopInfo.h>
@@ -1208,8 +1210,8 @@ Value *Packetizer::Impl::packetizeGroupReduction(Instruction *I) {
     for (decltype(packetWidth) i = 0; i < packetWidth; ++i) {
       Value *const lhs = opPackets[i];
       Value *const rhs = opPackets[i + packetWidth];
-      opPackets[i] =
-          multi_llvm::createBinOpForRecurKind(B, lhs, rhs, Info->Recurrence);
+      opPackets[i] = compiler::utils::createBinOpForRecurKind(B, lhs, rhs,
+                                                              Info->Recurrence);
     }
   }
 
@@ -2098,8 +2100,8 @@ ValuePacket Packetizer::Impl::packetizeGroupScan(
 
   Value *const Splat = B.CreateVectorSplat(SimdWidth, ExclScanCI);
 
-  auto *const Result = multi_llvm::createBinOpForRecurKind(B, VectorScan, Splat,
-                                                           Scan.Recurrence);
+  auto *const Result = compiler::utils::createBinOpForRecurKind(
+      B, VectorScan, Splat, Scan.Recurrence);
 
   results.push_back(Result);
   return results;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
index 99fd244cbe0bf..e4fbbf4d67d4a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
@@ -19,6 +19,7 @@
 #include <compiler/utils/builtin_info.h>
 #include <compiler/utils/group_collective_helpers.h>
 #include <compiler/utils/mangling.h>
+#include <compiler/utils/pass_functions.h>
 #include <llvm/ADT/Statistic.h>
 #include <llvm/IR/Attributes.h>
 #include <llvm/Target/TargetMachine.h>
@@ -661,7 +662,8 @@ bool VectorizationContext::emitSubgroupScanBody(Function &F, bool IsInclusive,
       N = N2;
       auto *const Shuffle =
           createOptimalShuffle(B, Result, NeutralVec, mask, Twine("scan_impl"));
-      Result = multi_llvm::createBinOpForRecurKind(B, Result, Shuffle, OpKind);
+      Result =
+          compiler::utils::createBinOpForRecurKind(B, Result, Shuffle, OpKind);
     }
 
     if (!IsInclusive) {
@@ -738,7 +740,7 @@ bool VectorizationContext::emitSubgroupScanBody(Function &F, bool IsInclusive,
   auto *const Mask = B.CreateXor(MaskPhi, SplatN, "mask");
   auto *const Shuffle = VTI.createVectorShuffle(B, VecPhi, Mask, VL);
   auto *const Accum =
-      multi_llvm::createBinOpForRecurKind(B, VecPhi, Shuffle, OpKind);
+      compiler::utils::createBinOpForRecurKind(B, VecPhi, Shuffle, OpKind);
 
   auto *const NBit = B.CreateAnd(MaskPhi, SplatN, "isolate");
   auto *const Which = B.CreateICmpNE(NBit, VZero, "which");
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_helpers.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_helpers.cpp
index 2d0c98fc340c9..0385dec201531 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_helpers.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_helpers.cpp
@@ -22,6 +22,7 @@
 #include <llvm/IR/DIBuilder.h>
 #include <llvm/IR/IntrinsicInst.h>
 #include <llvm/Support/Debug.h>
+#include <llvm/Transforms/Utils/Cloning.h>
 
 #include "debugging.h"
 #include "vectorization_context.h"

From 23584d525529fcb00d9c80fb8cbafe134b551007 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Thu, 2 Nov 2023 18:25:18 +0000
Subject: [PATCH 058/182] [multi_llvm] Clean up assorted 'inline' versioning

---
 .../vecz/source/vecz_pass_builder.cpp                | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vecz_pass_builder.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vecz_pass_builder.cpp
index fa9d5cb1373cd..39f4910bcb184 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vecz_pass_builder.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vecz_pass_builder.cpp
@@ -119,7 +119,6 @@ void VeczPassMachinery::addClassToPassNames() {
   // Register a callback which skips all passes once we've failed to vectorize
   // a function.
   PIC.registerShouldRunOptionalPassCallback([&](StringRef, llvm::Any IR) {
-#if LLVM_VERSION_GREATER_EQUAL(16, 0)
     const Function **FPtr = any_cast<const Function *>(&IR);
     const Function *F = FPtr ? *FPtr : nullptr;
     if (!F) {
@@ -130,17 +129,6 @@ void VeczPassMachinery::addClassToPassNames() {
         return true;
       }
     }
-#else
-    const Function *F = nullptr;
-    if (any_isa<const Function *>(IR)) {
-      F = any_cast<const Function *>(IR);
-    } else if (any_isa<const Loop *>(IR)) {
-      F = any_cast<const Loop *>(IR)->getHeader()->getParent();
-    } else {
-      // Always run module passes
-      return true;
-    }
-#endif
     // FIXME: This is repeating the job of the VectorizationUnitAnalysis.
     // We should track 'failure' more directly in the
     // Function/VectorizationContext?

From 6b5b08de1b111da5ba151fe9e477a31485612e9f Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Thu, 2 Nov 2023 19:16:54 +0000
Subject: [PATCH 059/182] [vecz] Support vector-predicated reductions natively

The initial vecz support for vector-predication was implemented around
LLVM 12, before there were intrinsics available for reduction
operations. This meant that we had to work around the lack of intrinsics by
using regular reduction intrinsics and 'sanitizing' the input by masking
out the unwanted vector elements with the neutral value.

Vector-predicated reduction intrinsics have been around since LLVM 14 so
it's high time we accommodate them natively. This should lead to better
code generation when vector-predicating kernels.
---
 .../include/transform/packetization_helpers.h | 26 +++---
 .../transform/packetization_helpers.cpp       | 78 ++++++++++++++---
 .../vecz/source/transform/packetizer.cpp      | 52 ++----------
 .../llvm/VectorPredication/boscc_reduction.ll |  7 +-
 .../packetize_mask_varying.ll                 |  7 +-
 .../VectorPredication/subgroup_reductions.ll  | 83 +++++--------------
 ...ions_spv_khr_uniform_group_instructions.ll | 72 ++++++----------
 7 files changed, 143 insertions(+), 182 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_helpers.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_helpers.h
index 6366145fd2572..c5da96058e219 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_helpers.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_helpers.h
@@ -24,6 +24,7 @@
 #include <llvm/ADT/DenseMap.h>
 #include <llvm/ADT/SmallVector.h>
 #include <llvm/Analysis/IVDescriptors.h>
+#include <llvm/Analysis/TargetTransformInfo.h>
 #include <llvm/IR/IRBuilder.h>
 #include <multi_llvm/llvm_version.h>
 #include <multi_llvm/multi_llvm.h>
@@ -87,18 +88,21 @@ bool createSubSplats(const vecz::TargetInfo &TI, llvm::IRBuilder<> &B,
                      llvm::SmallVectorImpl<llvm::Value *> &srcs,
                      unsigned subWidth);
 
-/// @brief Utility function for sanitizing the input to a reduction when
-/// vector-predicating. Since VP reduction intrinsics didn't land in LLVM 13,
-/// reductions must ensure that elements past VL don't affect the result.
+/// @brief Utility function for creating a reduction operation.
 ///
-/// Only works on RecurKind::And, Or, Add, SMin, SMax, UMin, UMax, FAdd.
+/// The value must be a vector.
 ///
-/// @param[in] B IRBuilder to build any new instructions created
-/// @param[in] Val The value to sanitize
-/// @param[in] VL The vector length
-/// @param[in] Kind The kind of reduction to sanitize for
-llvm::Value *sanitizeVPReductionInput(llvm::IRBuilder<> &B, llvm::Value *Val,
-                                      llvm::Value *VL, llvm::RecurKind Kind);
+/// If VL is passed and is non-null, it is assumed to be the i32 value
+/// representing the active vector length. The reduction will be
+/// vector-predicated according to this length.
+///
+/// Only works on RecurKind::And, Or, Xor, Add, Mul, FAdd, FMul, {S,U,F}Min,
+/// {S,U,F}Max.
+llvm::Value *createMaybeVPTargetReduction(llvm::IRBuilderBase &B,
+                                          const llvm::TargetTransformInfo &TTI,
+                                          llvm::Value *Val,
+                                          llvm::RecurKind Kind,
+                                          llvm::Value *VL = nullptr);
 
 /// @brief Utility function to obtain an indices vector to be used in a gather
 /// operation.
@@ -119,7 +123,7 @@ llvm::Value *getGatherIndicesVector(llvm::IRBuilder<> &B, llvm::Value *Indices,
                                     const llvm::Twine &N = "");
 
 /// @brief Returns a boolean vector with all elements set to 'true'.
-llvm::Value *createAllTrueMask(llvm::IRBuilder<> &B, llvm::ElementCount EC);
+llvm::Value *createAllTrueMask(llvm::IRBuilderBase &B, llvm::ElementCount EC);
 
 /// @brief Returns an integer step vector, representing the sequence 0 ... N-1.
 llvm::Value *createIndexSequence(llvm::IRBuilder<> &Builder,
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
index a76651a6412f8..9c5c5c8530792 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
@@ -27,6 +27,8 @@
 #include <llvm/Analysis/VectorUtils.h>
 #include <llvm/IR/IRBuilder.h>
 #include <llvm/IR/Instructions.h>
+#include <llvm/IR/Intrinsics.h>
+#include <llvm/Transforms/Utils/LoopUtils.h>
 #include <multi_llvm/multi_llvm.h>
 #include <multi_llvm/vector_type_helper.h>
 
@@ -247,16 +249,70 @@ bool createSubSplats(const vecz::TargetInfo &TI, IRBuilder<> &B,
   return true;
 }
 
-Value *sanitizeVPReductionInput(IRBuilder<> &B, Value *Val, Value *VL,
-                                RecurKind Kind) {
-  Type *const ValTy = Val->getType();
-  ElementCount const EC = multi_llvm::getVectorElementCount(ValTy);
-  Value *const VLSplat = B.CreateVectorSplat(EC, VL);
-  Value *const IdxVec =
-      createIndexSequence(B, VectorType::get(VL->getType(), EC));
-  Value *const ActiveMask = B.CreateICmp(CmpInst::ICMP_ULT, IdxVec, VLSplat);
-  auto *const NeutralVal = compiler::utils::getNeutralVal(Kind, ValTy);
-  return B.CreateSelect(ActiveMask, Val, NeutralVal);
+Value *createMaybeVPTargetReduction(IRBuilderBase &B,
+                                    const TargetTransformInfo &TTI, Value *Val,
+                                    RecurKind Kind, Value *VL) {
+  assert(isa<VectorType>(Val->getType()) && "Must be vector type");
+  // If VL is null, it's not a vector-predicated reduction.
+  if (!VL) {
+    return createSimpleTargetReduction(B, &TTI, Val, Kind);
+  }
+  auto IntrinsicOp = Intrinsic::not_intrinsic;
+  switch (Kind) {
+    default:
+      break;
+    case RecurKind::None:
+      return nullptr;
+    case RecurKind::Add:
+      IntrinsicOp = Intrinsic::vp_reduce_add;
+      break;
+    case RecurKind::Mul:
+      IntrinsicOp = Intrinsic::vp_reduce_mul;
+      break;
+    case RecurKind::Or:
+      IntrinsicOp = Intrinsic::vp_reduce_or;
+      break;
+    case RecurKind::And:
+      IntrinsicOp = Intrinsic::vp_reduce_and;
+      break;
+    case RecurKind::Xor:
+      IntrinsicOp = Intrinsic::vp_reduce_xor;
+      break;
+    case RecurKind::FAdd:
+      IntrinsicOp = Intrinsic::vp_reduce_fadd;
+      break;
+    case RecurKind::FMul:
+      IntrinsicOp = Intrinsic::vp_reduce_fmul;
+      break;
+    case RecurKind::SMin:
+      IntrinsicOp = Intrinsic::vp_reduce_smin;
+      break;
+    case RecurKind::SMax:
+      IntrinsicOp = Intrinsic::vp_reduce_smax;
+      break;
+    case RecurKind::UMin:
+      IntrinsicOp = Intrinsic::vp_reduce_umin;
+      break;
+    case RecurKind::UMax:
+      IntrinsicOp = Intrinsic::vp_reduce_umax;
+      break;
+    case RecurKind::FMin:
+      IntrinsicOp = Intrinsic::vp_reduce_fmin;
+      break;
+    case RecurKind::FMax:
+      IntrinsicOp = Intrinsic::vp_reduce_fmax;
+      break;
+  }
+
+  auto *const F = Intrinsic::getDeclaration(B.GetInsertBlock()->getModule(),
+                                            IntrinsicOp, Val->getType());
+  assert(F && "Could not declare vector-predicated reduction intrinsic");
+
+  auto *const VecTy = cast<VectorType>(Val->getType());
+  auto *const NeutralVal =
+      compiler::utils::getNeutralVal(Kind, VecTy->getElementType());
+  auto *const Mask = createAllTrueMask(B, VecTy->getElementCount());
+  return B.CreateCall(F, {NeutralVal, Val, Mask, VL});
 }
 
 Value *getGatherIndicesVector(IRBuilder<> &B, Value *Indices, Type *Ty,
@@ -272,7 +328,7 @@ Value *getGatherIndicesVector(IRBuilder<> &B, Value *Indices, Type *Ty,
   return B.CreateAdd(StepsMul, Indices, N);
 }
 
-Value *createAllTrueMask(IRBuilder<> &B, ElementCount EC) {
+Value *createAllTrueMask(IRBuilderBase &B, ElementCount EC) {
   return ConstantInt::getTrue(VectorType::get(B.getInt1Ty(), EC));
 }
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
index 7600f96e6c3db..f1deff2a27ee2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -839,12 +839,7 @@ Value *Packetizer::Impl::reduceBranchCond(Value *cond, Instruction *terminator,
   // value.
   Value *&f = conds.front();
 
-  if (VL) {
-    f = sanitizeVPReductionInput(B, f, VL, kind);
-    VECZ_FAIL_IF(!f);
-  }
-
-  return createSimpleTargetReduction(B, &TTI, f, kind);
+  return createMaybeVPTargetReduction(B, TTI, f, kind, VL);
 }
 
 Packetizer::Result Packetizer::Impl::assign(Value *Scalar, Value *Vectorized) {
@@ -899,14 +894,7 @@ Packetizer::Result Packetizer::Impl::packetize(Value *V) {
       if (newCond->getType()->isVectorTy()) {
         IRBuilder<> B(Branch);
         RecurKind kind = RecurKind::Or;
-        // Sanitize VP reduction inputs, if required.
-        if (VL) {
-          newCond = sanitizeVPReductionInput(B, newCond, VL, kind);
-          if (!newCond) {
-            return Packetizer::Result(*this);
-          }
-        }
-        newCond = createSimpleTargetReduction(B, &TTI, newCond, kind);
+        newCond = createMaybeVPTargetReduction(B, TTI, newCond, kind, VL);
       }
 
       Branch->setCondition(newCond);
@@ -1183,19 +1171,8 @@ Value *Packetizer::Impl::packetizeGroupReduction(Instruction *I) {
   // them of ordering? See CA-3969.
   op.getPacketValues(packetWidth, opPackets);
 
-  // When in VP mode, pre-sanitize the reduction input (before VP reduction
-  // intrinsics, introduced in LLVM 14)
-  if (VL) {
-    assert(opPackets.size() == 1 &&
-           "Should have bailed if dealing with more than one packet");
-    Value *&val = opPackets.front();
-    val = sanitizeVPReductionInput(B, val, VL, Info->Recurrence);
-    if (!val) {
-      emitVeczRemarkMissed(
-          &F, CI, "Can not vector-predicate workgroup/subgroup reduction");
-      return nullptr;
-    }
-  }
+  assert((!VL || packetWidth) &&
+         "Should have bailed if dealing with more than one VP packet");
 
   // According to the OpenCL Spec, we are allowed to rearrange the operation
   // order of a workgroup/subgroup reduction any way we like (even though
@@ -1216,8 +1193,8 @@ Value *Packetizer::Impl::packetizeGroupReduction(Instruction *I) {
   }
 
   // Reduce to a scalar.
-  Value *v =
-      createSimpleTargetReduction(B, &TTI, opPackets.front(), Info->Recurrence);
+  Value *v = createMaybeVPTargetReduction(B, TTI, opPackets.front(),
+                                          Info->Recurrence, VL);
 
   // We leave the original reduction function and divert the vectorized
   // reduction through it, giving us a reduction over the full apparent
@@ -1624,14 +1601,8 @@ Value *Packetizer::Impl::packetizeMaskVarying(Instruction *I) {
     auto *maskInst = dyn_cast<Instruction>(vecMask);
     IRBuilder<> B(maskInst ? buildAfter(maskInst, F) : I);
 
-    // Sanitize any vector-predicated inputs.
-    if (VL) {
-      vecMask = sanitizeVPReductionInput(B, vecMask, VL, RecurKind::Or);
-      VECZ_FAIL_IF(!vecMask);
-    }
-
     Value *anyOfMask =
-        createSimpleTargetReduction(B, &TTI, vecMask, RecurKind::Or);
+        createMaybeVPTargetReduction(B, TTI, vecMask, RecurKind::Or, VL);
     anyOfMask->setName("any_of_mask");
 
     if (isVector) {
@@ -2072,13 +2043,8 @@ ValuePacket Packetizer::Impl::packetizeGroupScan(
   // Thus we essentially keep the original group scan, but change it to be an
   // exclusive one.
   auto *Reduction = Ops.front();
-  if (VL) {
-    Reduction = sanitizeVPReductionInput(B, Reduction, VL, Scan.Recurrence);
-    if (!Reduction) {
-      return results;
-    }
-  }
-  Reduction = createSimpleTargetReduction(B, &TTI, Reduction, Scan.Recurrence);
+  Reduction =
+      createMaybeVPTargetReduction(B, TTI, Reduction, Scan.Recurrence, VL);
 
   // Now we defer to an *exclusive* scan over the group.
   auto ExclScan = Scan;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/boscc_reduction.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/boscc_reduction.ll
index c8393af17a5f0..d23ac4380018b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/boscc_reduction.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/boscc_reduction.ll
@@ -43,9 +43,4 @@ if.end:                                           ; preds = %if.then, %entry
 
 ; CHECK: define spir_kernel void @__vecz_nxv2_vp_foo(ptr addrspace(1) nocapture readonly %a, ptr addrspace(1) nocapture %out)
 ; CHECK:  [[CMP:%.*]] = fcmp oeq <vscale x 2 x float> %{{.*}}, zeroinitializer
-; CHECK:  [[INS:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[VL:%.*]], {{(i32|i64)}} 0
-; CHECK:  [[SPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[INS]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK:  [[IDX:%.*]] = call <vscale x 2 x i32> @llvm.experimental.stepvector.nxv2i32()
-; CHECK:  [[MASK:%.*]] = icmp ult <vscale x 2 x i32> [[IDX]], [[SPLAT]]
-; CHECK:  [[INP:%.*]] = select <vscale x 2 x i1> [[MASK]], <vscale x 2 x i1> [[CMP]], <vscale x 2 x i1> zeroinitializer
-; CHECK:  %{{.*}} = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[INP]])
+; CHECK:  %{{.*}} = call i1 @llvm.vp.reduce.or.nxv2i1(i1 false, <vscale x 2 x i1> [[CMP]], {{.*}}, i32 {{.*}})
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/packetize_mask_varying.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/packetize_mask_varying.ll
index 4e7ba7db75a99..6fc5db369a69f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/packetize_mask_varying.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/packetize_mask_varying.ll
@@ -39,12 +39,7 @@ if.end:
   ret void
 ; CHECK: define spir_kernel void @__vecz_nxv4_vp_mask_varying
 ; CHECK: [[CMP:%.*]] = icmp slt <vscale x 4 x i64> %{{.*}},
-; CHECK: [[INS:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VL:%.*]], {{(i32|i64)}} 0
-; CHECK: [[SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[INS]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK: [[IDX:%.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
-; CHECK: [[MASK:%.*]] = icmp ult <vscale x 4 x i32> [[IDX]], [[SPLAT]]
-; CHECK: [[INP:%.*]] = select <vscale x 4 x i1> [[MASK]], <vscale x 4 x i1> [[CMP]], <vscale x 4 x i1> zeroinitializer
-; CHECK: [[RED:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[INP]])
+; CHECK: [[RED:%.*]] = call i1 @llvm.vp.reduce.or.nxv4i1(i1 false, <vscale x 4 x i1> [[CMP]], {{.*}}, i32 {{.*}})
 ; CHECK: [[REINS:%.*]] = insertelement <4 x i1> poison, i1 [[RED]], {{(i32|i64)}} 0
 ; CHECK: [[RESPLAT:%.*]] = shufflevector <4 x i1> [[REINS]], <4 x i1> poison, <4 x i32> zeroinitializer
 }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions.ll
index e4f95885c58b8..9e35021ec6536 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions.ll
@@ -49,13 +49,8 @@ entry:
   store i32 %2, i32 addrspace(1)* %arrayidx3, align 4
   ret void
 ; CHECK-LABEL: @__vecz_v4_vp_reduce_all_i32(
-; CHECK: [[T2:%.*]] = icmp ne <4 x i32> {{%.*}}, zeroinitializer
-; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{.*}}, {{(i32|i64)}} 0
-; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK: [[C:%.*]] = icmp ult <4 x i32> [[S]], <i32 1, i32 2, i32 3, i32 4>
-; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i1> [[T2]]
-; CHECK: [[T3:%.*]] = bitcast <4 x i1> [[I]] to i4
-; CHECK: [[R:%.*]] = icmp eq i4 [[T3]], -1
+; CHECK: [[C:%.*]] = icmp ne <4 x i32> {{%.*}}, zeroinitializer
+; CHECK: [[R:%.*]] = call i1 @llvm.vp.reduce.and.v4i1(i1 true, <4 x i1> [[C]], {{.*}})
 ; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_all_i1(i1 [[R]])
 ; CHECK: [[EXT:%.*]] = sext i1 %call2 to i32
 ; CHECK: store i32 [[EXT]], ptr addrspace(1) {{%.*}}, align 4
@@ -75,13 +70,8 @@ entry:
   store i32 %2, i32 addrspace(1)* %arrayidx3, align 4
   ret void
 ; CHECK-LABEL: @__vecz_v4_vp_reduce_any_i32(
-; CHECK: [[T2:%.*]] = icmp ne <4 x i32> {{%.*}}, zeroinitializer
-; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{.*}}, {{(i32|i64)}} 0
-; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], <i32 0, i32 1, i32 2, i32 3>
-; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i1> [[T2]], <4 x i1> zeroinitializer
-; CHECK: [[T3:%.*]] = bitcast <4 x i1> [[I]] to i4
-; CHECK: [[R:%.*]] = icmp ne i4 [[T3]], 0
+; CHECK: [[C:%.*]] = icmp ne <4 x i32> {{%.*}}, zeroinitializer
+; CHECK: [[R:%.*]] = call i1 @llvm.vp.reduce.or.v4i1(i1 false, <4 x i1> [[C]], {{.*}})
 ; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_any_i1(i1 [[R]])
 ; CHECK: [[EXT:%.*]] = sext i1 %call2 to i32
 ; CHECK: store i32 [[EXT]], ptr addrspace(1) {{%.*}}, align 4
@@ -99,11 +89,8 @@ entry:
   store i32 %call2, i32 addrspace(1)* %arrayidx3, align 4
   ret void
 ; CHECK-LABEL: @__vecz_v4_vp_reduce_add_i32(
-; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0
-; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], <i32 0, i32 1, i32 2, i32 3>
-; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i32> {{%.*}}, <4 x i32> zeroinitializer
-; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[I]])
+; CHECK: [[C:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p1(
+; CHECK: [[R:%.*]] = call i32 @llvm.vp.reduce.add.v4i32(i32 0, <4 x i32> [[C]], {{.*}})
 ; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_add_i32(i32 [[R]])
 ; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4
 }
@@ -120,11 +107,8 @@ entry:
   store i64 %call2, i64 addrspace(1)* %arrayidx3, align 4
   ret void
 ; CHECK-LABEL: @__vecz_v4_vp_reduce_add_i64(
-; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0
-; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], <i32 0, i32 1, i32 2, i32 3>
-; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i64> {{%.*}}, <4 x i64> zeroinitializer
-; CHECK: [[R:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[I]])
+; CHECK: [[C:%.*]] = call <4 x i64> @llvm.vp.load.v4i64.p1(
+; CHECK: [[R:%.*]] = call i64 @llvm.vp.reduce.add.v4i64(i64 0, <4 x i64> [[C]], {{.*}})
 ; CHECK: %call2 = tail call spir_func i64 @__mux_sub_group_reduce_add_i64(i64 [[R]])
 ; CHECK: store i64 %call2, ptr addrspace(1) {{%.*}}, align 4
 }
@@ -141,11 +125,8 @@ entry:
   store float %call2, float addrspace(1)* %arrayidx3, align 4
   ret void
 ; CHECK-LABEL: @__vecz_v4_vp_reduce_add_f32(
-; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0
-; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], <i32 0, i32 1, i32 2, i32 3>
-; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x float> {{%.*}}, <4 x float> <float -0.000000e+00, float -0.000000e+00,
-; CHECK: [[R:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[I]])
+; CHECK: [[C:%.*]] = call <4 x float> @llvm.vp.load.v4f32.p1(
+; CHECK: [[R:%.*]] = call float @llvm.vp.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[C]], {{.*}})
 ; CHECK: %call2 = tail call spir_func float @__mux_sub_group_reduce_fadd_f32(float [[R]])
 ; CHECK: store float %call2, ptr addrspace(1) {{%.*}}, align 4
 }
@@ -162,11 +143,8 @@ entry:
   store i32 %call2, i32 addrspace(1)* %arrayidx3, align 4
   ret void
 ; CHECK-LABEL: @__vecz_v4_vp_reduce_smin_i32(
-; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0
-; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], <i32 0, i32 1, i32 2, i32 3>
-; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i32> {{%.*}}, <4 x i32> <i32 2147483647, i32 2147483647, 
-; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[I]])
+; CHECK: [[C:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p1(
+; CHECK: [[R:%.*]] = call i32 @llvm.vp.reduce.smin.v4i32(i32 2147483647, <4 x i32> [[C]], {{.*}})
 ; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_smin_i32(i32 [[R]])
 ; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4
 }
@@ -183,11 +161,8 @@ entry:
   store i32 %call2, i32 addrspace(1)* %arrayidx3, align 4
   ret void
 ; CHECK-LABEL: @__vecz_v4_vp_reduce_umin_i32(
-; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0
-; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], <i32 0, i32 1, i32 2, i32 3>
-; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i32> {{%.*}}, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
-; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[I]])
+; CHECK: [[C:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p1(
+; CHECK: [[R:%.*]] = call i32 @llvm.vp.reduce.umin.v4i32(i32 -1, <4 x i32> [[C]], {{.*}})
 ; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_umin_i32(i32 [[R]])
 ; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4
 }
@@ -204,11 +179,8 @@ entry:
   store i32 %call2, i32 addrspace(1)* %arrayidx3, align 4
   ret void
 ; CHECK-LABEL: @__vecz_v4_vp_reduce_smax_i32(
-; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0
-; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], <i32 0, i32 1, i32 2, i32 3>
-; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i32> {{%.*}}, <4 x i32> <i32 -2147483648, i32 -2147483648, 
-; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[I]])
+; CHECK: [[C:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p1(
+; CHECK: [[R:%.*]] = call i32 @llvm.vp.reduce.smax.v4i32(i32 -2147483648, <4 x i32> [[C]], {{.*}})
 ; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_smax_i32(i32 [[R]])
 ; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4
 }
@@ -225,11 +197,8 @@ entry:
   store i32 %call2, i32 addrspace(1)* %arrayidx3, align 4
   ret void
 ; CHECK-LABEL: @__vecz_v4_vp_reduce_umax_i32(
-; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0
-; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], <i32 0, i32 1, i32 2, i32 3>
-; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i32> {{%.*}}, <4 x i32> zeroinitializer
-; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[I]])
+; CHECK: [[C:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p1(
+; CHECK: [[R:%.*]] = call i32 @llvm.vp.reduce.umax.v4i32(i32 0, <4 x i32> [[C]], {{.*}})
 ; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_umax_i32(i32 [[R]])
 ; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4
 }
@@ -246,12 +215,9 @@ entry:
   store float %call2, float addrspace(1)* %arrayidx3, align 4
   ret void
 ; CHECK-LABEL: @__vecz_v4_vp_reduce_fmin_f32(
-; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0
-; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], <i32 0, i32 1, i32 2, i32 3>
-; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x float> {{%.*}}, <4 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000,
-; CHECK: [[R:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[I]])
-; CHEKC: %call2 = tail call spir_func float @__mux_sub_group_reduce_fmin_f32(float [[R]])
+; CHECK: [[C:%.*]] = call <4 x float> @llvm.vp.load.v4f32.p1(
+; CHECK: [[R:%.*]] = call float @llvm.vp.reduce.fmin.v4f32(float 0x7FF8000000000000, <4 x float> [[C]], {{.*}})
+; CHECK: %call2 = tail call spir_func float @__mux_sub_group_reduce_fmin_f32(float [[R]])
 ; CHECK: store float %call2, ptr addrspace(1) {{%.*}}, align 4
 }
 
@@ -267,11 +233,8 @@ entry:
   store float %call2, float addrspace(1)* %arrayidx3, align 4
   ret void
 ; CHECK-LABEL: @__vecz_v4_vp_reduce_fmax_f32(
-; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0
-; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], <i32 0, i32 1, i32 2, i32 3>
-; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x float> {{%.*}}, <4 x float> <float 0xFFF8000000000000, float 0xFFF8000000000000,
-; CHECK: [[R:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[I]])
+; CHECK: [[C:%.*]] = call <4 x float> @llvm.vp.load.v4f32.p1(
+; CHECK: [[R:%.*]] = call float @llvm.vp.reduce.fmax.v4f32(float 0xFFF8000000000000, <4 x float> [[C]], {{.*}})
 ; CHECK: %call2 = tail call spir_func float @__mux_sub_group_reduce_fmax_f32(float [[R]])
 ; CHECK: store float %call2, ptr addrspace(1) {{%.*}}, align 4
 }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions_spv_khr_uniform_group_instructions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions_spv_khr_uniform_group_instructions.ll
index 357d6bd0d0143..5ee579906c23e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions_spv_khr_uniform_group_instructions.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions_spv_khr_uniform_group_instructions.ll
@@ -35,11 +35,8 @@ declare spir_func i1 @__mux_sub_group_reduce_logical_or_i1(i1)
 declare spir_func i1 @__mux_sub_group_reduce_logical_xor_i1(i1)
 
 ; CHECK-LABEL: @__vecz_v4_vp_reduce_mul_i32(
-; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0
-; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], <i32 0, i32 1, i32 2, i32 3>
-; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i32> {{%.*}}, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[I]])
+; CHECK: [[C:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p1(
+; CHECK: [[R:%.*]] = call i32 @llvm.vp.reduce.mul.v4i32(i32 1, <4 x i32> [[C]], {{.*}})
 ; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_mul_i32(i32 [[R]])
 ; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_mul_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
@@ -56,11 +53,8 @@ entry:
 }
 
 ; CHECK-LABEL: @__vecz_v4_vp_reduce_mul_i64(
-; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0
-; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], <i32 0, i32 1, i32 2, i32 3>
-; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i64> {{%.*}}, <4 x i64> <i64 1, i64 1, i64 1, i64 1>
-; CHECK: [[R:%.*]] = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> [[I]])
+; CHECK: [[C:%.*]] = call <4 x i64> @llvm.vp.load.v4i64.p1(
+; CHECK: [[R:%.*]] = call i64 @llvm.vp.reduce.mul.v4i64(i64 1, <4 x i64> [[C]], {{.*}})
 ; CHECK: %call2 = tail call spir_func i64 @__mux_sub_group_reduce_mul_i64(i64 [[R]])
 ; CHECK: store i64 %call2, ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_mul_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) {
@@ -77,11 +71,8 @@ entry:
 }
 
 ; CHECK-LABEL: @__vecz_v4_vp_reduce_mul_f32(
-; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0
-; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], <i32 0, i32 1, i32 2, i32 3>
-; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x float> {{%.*}}, <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; CHECK: [[R:%.*]] = call float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[I]])
+; CHECK: [[C:%.*]] = call <4 x float> @llvm.vp.load.v4f32.p1(
+; CHECK: [[R:%.*]] = call float @llvm.vp.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[C]], {{.*}})
 ; CHECK: %call2 = tail call spir_func float @__mux_sub_group_reduce_fmul_f32(float [[R]])
 ; CHECK: store float %call2, ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_mul_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
@@ -98,11 +89,8 @@ entry:
 }
 
 ; CHECK-LABEL: @__vecz_v4_vp_reduce_and_i32(
-; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0
-; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], <i32 0, i32 1, i32 2, i32 3>
-; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i32> {{%.*}}, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1> 
-; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[I]])
+; CHECK: [[C:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p1(
+; CHECK: [[R:%.*]] = call i32 @llvm.vp.reduce.and.v4i32(i32 -1, <4 x i32> [[C]], {{.*}})
 ; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_and_i32(i32 [[R]])
 ; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_and_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
@@ -119,11 +107,8 @@ entry:
 }
 
 ; CHECK-LABEL: @__vecz_v4_vp_reduce_or_i32(
-; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0
-; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], <i32 0, i32 1, i32 2, i32 3>
-; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i32> {{%.*}}, <4 x i32> zeroinitializer
-; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[I]])
+; CHECK: [[C:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p1(
+; CHECK: [[R:%.*]] = call i32 @llvm.vp.reduce.or.v4i32(i32 0, <4 x i32> [[C]], {{.*}})
 ; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_or_i32(i32 [[R]])
 ; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_or_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
@@ -139,31 +124,28 @@ entry:
   ret void
 }
 
-; CHECK-LABEL: @__vecz_v4_vp_reduce_xor_i32(
-; CHECK: [[SI:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, {{(i32|i64)}} 0
-; CHECK: [[S:%.*]] = shufflevector <4 x i32> [[SI]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK: [[C:%.*]] = icmp ugt <4 x i32> [[S]], <i32 0, i32 1, i32 2, i32 3>
-; CHECK: [[I:%.*]] = select <4 x i1> [[C]], <4 x i64> {{%.*}}, <4 x i64> zeroinitializer
-; CHECK: [[R:%.*]] = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> [[I]])
+; CHECK-LABEL: @__vecz_v4_vp_reduce_xor_i64(
+; CHECK: [[C:%.*]] = call <4 x i64> @llvm.vp.load.v4i64.p1(
+; CHECK: [[R:%.*]] = call i64 @llvm.vp.reduce.xor.v4i64(i64 0, <4 x i64> [[C]], {{.*}})
 ; CHECK: %call2 = tail call spir_func i64 @__mux_sub_group_reduce_xor_i64(i64 [[R]])
-; CHECK: store i64 %call2, ptr addrspace(1) {{%.*}}, align 4
-define spir_kernel void @reduce_xor_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+; CHECK: store i64 %call2, ptr addrspace(1) {{%.*}}, align 8
+define spir_kernel void @reduce_xor_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
   %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
   %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6
   %conv = zext i32 %call1 to i64
-  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call
-  %0 = load i64, ptr addrspace(1) %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i64, ptr addrspace(1) %in, i64 %call
+  %0 = load i64, ptr addrspace(1) %arrayidx, align 8
   %call2 = tail call spir_func i64 @__mux_sub_group_reduce_xor_i64(i64 %0)
   %arrayidx3 = getelementptr inbounds i64, ptr addrspace(1) %out, i64 %conv
-  store i64 %call2, ptr addrspace(1) %arrayidx3, align 4
+  store i64 %call2, ptr addrspace(1) %arrayidx3, align 8
   ret void
 }
 
 ; CHECK-LABEL: @__vecz_v4_vp_reduce_logical_and(
-; This doesn't generate a reduction intrinsic...
-; CHECK: [[T:%.*]] = icmp eq i4 {{%.*}}, -1
-; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_and_i1(i1 [[T]])
+; CHECK: [[T:%.*]] = icmp ne <4 x i32> {{%.*}}, zeroinitializer
+; CHECK: [[R:%.*]] = call i1 @llvm.vp.reduce.and.v4i1(i1 true, <4 x i1> [[T]], {{.*}})
+; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_and_i1(i1 [[R]])
 ; CHECK: [[R:%.*]] = zext i1 %call2 to i32
 ; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_logical_and(ptr addrspace(1) %in, ptr addrspace(1) %out) {
@@ -182,8 +164,9 @@ entry:
 }
 
 ; CHECK-LABEL: @__vecz_v4_vp_reduce_logical_or(
-; CHECK: [[T:%.*]] = icmp ne i4 {{%.*}}, 0
-; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_or_i1(i1 [[T]])
+; CHECK: [[T:%.*]] = icmp ne <4 x i32> {{%.*}}, zeroinitializer
+; CHECK: [[R:%.*]] = call i1 @llvm.vp.reduce.or.v4i1(i1 false, <4 x i1> [[T]], {{.*}})
+; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_or_i1(i1 [[R]])
 ; CHECK: [[R:%.*]] = zext i1 %call2 to i32
 ; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_logical_or(ptr addrspace(1) %in, ptr addrspace(1) %out) {
@@ -202,10 +185,9 @@ entry:
 }
 
 ; CHECK-LABEL: @__vecz_v4_vp_reduce_logical_xor(
-; CHECK: [[X:%.*]] = call i4 @llvm.ctpop.i4(i4 {{%.*}})
-; CHECK: [[T:%.*]] = and i4 [[X]], 1
-; CHECK: [[C:%.*]] = icmp ne i4 [[T]], 0
-; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_xor_i1(i1 [[C]])
+; CHECK: [[T:%.*]] = icmp ne <4 x i32> {{%.*}}, zeroinitializer
+; CHECK: [[R:%.*]] = call i1 @llvm.vp.reduce.xor.v4i1(i1 false, <4 x i1> [[T]], {{.*}})
+; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_xor_i1(i1 [[R]])
 ; CHECK: [[R:%.*]] = zext i1 %call2 to i32
 ; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_logical_xor(ptr addrspace(1) %in, ptr addrspace(1) %out) {

From e2eddfb4a6e001159b9722d9c46ce4aa4d4e8944 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Wed, 1 Nov 2023 15:27:02 +0000
Subject: [PATCH 060/182] [vecz] Fix vectorization of sub-group broadcasts

Forgetting to normalize the broadcast ID back into the 'mux' range would
produce incorrect results on an implementation where the mux sub-groups
were non-trivial, or where bounds were checked. This is, as such, not
currently a visible problem in the default implementation of these
builtins, but rather a theoretical one.
---
 .../vecz/source/transform/packetizer.cpp      | 33 +++++++++++++++----
 .../vecz/test/lit/llvm/subgroup_builtins.ll   |  2 +-
 2 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
index f1deff2a27ee2..de76544755b87 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -1249,17 +1249,17 @@ Value *Packetizer::Impl::packetizeGroupBroadcast(Instruction *I) {
   if (SimdWidth.isScalable()) {
     idxFactor = B.CreateVScale(minVal);
   }
-  idx = B.CreateURem(idx, idxFactor);
+  auto *const vecIdx = B.CreateURem(idx, idxFactor);
 
   Value *val = nullptr;
   // Optimize the constant fixed-vector case, where we can choose the exact
   // subpacket to extract from directly.
-  if (isa<ConstantInt>(idx) && !SimdWidth.isScalable()) {
+  if (isa<ConstantInt>(vecIdx) && !SimdWidth.isScalable()) {
     ValuePacket opPackets;
     op.getPacketValues(opPackets);
     auto factor = SimdWidth.divideCoefficientBy(opPackets.size());
     const unsigned subvecSize = factor.getFixedValue();
-    const unsigned idxVal = cast<ConstantInt>(idx)->getZExtValue();
+    const unsigned idxVal = cast<ConstantInt>(vecIdx)->getZExtValue();
     // If individual elements are scalar (through instantiation, say) then just
     // use the desired packet directly.
     if (subvecSize == 1) {
@@ -1268,16 +1268,37 @@ Value *Packetizer::Impl::packetizeGroupBroadcast(Instruction *I) {
       // Else extract from the correct packet, adjusting the index as we go.
       val = B.CreateExtractElement(
           opPackets[idxVal / subvecSize],
-          ConstantInt::get(idx->getType(), idxVal % subvecSize));
+          ConstantInt::get(vecIdx->getType(), idxVal % subvecSize));
     }
   } else {
-    val = B.CreateExtractElement(op.getAsValue(), idx);
+    val = B.CreateExtractElement(op.getAsValue(), vecIdx);
   }
 
-  // We leave the origial broadcast function and divert the vectorized
+  // We leave the original broadcast function and divert the vectorized
   // broadcast through it, giving us a broadcast over the full apparent
   // sub-group or work-group size (vecz * mux).
   CI->setOperand(argIdx, val);
+  if (!isWorkGroup) {
+    // For sub-groups, we need to normalize the broadcast index (a sub-group
+    // local ID) into the range of the mux sub-group.
+    //       |-----------------|-----------------|
+    //       | broadcast(X, 6) | broadcast(A, 6) |
+    // VF=4  |-----------------|-----------------|
+    //       | b(<X,Y,Z,W>, 6) | b(<A,B,C,D>, 6) |
+    //       |-----------------|-----------------|
+    // M=I/4 |        1        |        1        |
+    // V=I%4 |        2        |        2        |
+    //       |-----------------|-----------------|
+    //       |   <X,Y,Z,W>[V]  |   <A,B,C,D>[V]  |
+    //       |       Z         |       C         |
+    //       |-----------------|-----------------|
+    //       | broadcast(Z, M) | broadcast(C, M) |
+    // res   |       C         |       C         |
+    // splat |    <C,C,C,C>    |    <C,C,C,C>    |
+    //       |-----------------|-----------------|
+    auto *const muxIdx = B.CreateUDiv(idx, idxFactor);
+    CI->setOperand(argIdx + 1, muxIdx);
+  }
 
   return CI;
 }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll
index ed45025e3b512..d118893c3f1d6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll
@@ -83,7 +83,7 @@ define spir_kernel void @sub_group_broadcast_wider_than_vf(i32 addrspace(1)* %in
 ; CHECK: [[LD:%.*]] = load <4 x i32>, ptr addrspace(1) {{%.*}}, align 4
 ; The sixth sub-group member is the (6 % 4 ==) 2nd vector group member
 ; CHECK: [[EXT:%.*]] = extractelement <4 x i32> [[LD]], i64 2
-; CHECK: [[BDCAST:%.*]] = call spir_func i32 @__mux_sub_group_broadcast_i32(i32 [[EXT]], i32 6)
+; CHECK: [[BDCAST:%.*]] = call spir_func i32 @__mux_sub_group_broadcast_i32(i32 [[EXT]], i32 1)
 ; CHECK: [[HEAD:%.*]] = insertelement <4 x i32> poison, i32 [[BDCAST]], i64 0
 ; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[HEAD]], <4 x i32> {{(undef|poison)}}, <4 x i32> zeroinitializer
 ; CHECK: store <4 x i32> [[SPLAT]], ptr addrspace(1)

From 437baf78a8d6efea39914d41fe7859dc373ab55a Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Mon, 30 Oct 2023 16:58:27 +0000
Subject: [PATCH 061/182] [vecz] Packetize sub-group shuffle_(up|down) builtins

This extends fixed-width vectorization capabilities to
`__mux_sub_group_shuffle_up` and `__mux_sub_group_shuffle_down`
builtins. Again, these aren't very efficiently vectorized as we have to
perform the shuffle for each work-item in a pseudo-scalarized fashion.
---
 .../vecz/source/transform/packetizer.cpp      | 320 ++++++++++++++++--
 .../test/lit/llvm/subgroup_shuffle_down.ll    | 186 +++++++++-
 .../vecz/test/lit/llvm/subgroup_shuffle_up.ll | 216 +++++++++++-
 3 files changed, 686 insertions(+), 36 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
index de76544755b87..071d72773f5bc 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -152,6 +152,11 @@ class Packetizer::Impl : public Packetizer {
   /// @return Packetized values.
   ValuePacket packetizeAndGet(Value *V, unsigned Width);
 
+  /// @brief Helper to produce a Result from a Packet
+  Packetizer::Result getPacketizationResult(
+      Instruction *I, const SmallVectorImpl<Value *> &Packet,
+      bool UpdateStats = false);
+
   /// @brief Packetize the given value from the function, only if it is a
   /// varying value. Ensures Mask Varying values are handled correctly.
   ///
@@ -233,6 +238,17 @@ class Packetizer::Impl : public Packetizer {
   /// @return Packetized instructions.
   Result packetizeSubgroupShuffleXor(
       Instruction *Ins, compiler::utils::GroupCollective ShuffleXor);
+  /// @brief Packetize a sub-group shuffle-up or shuffle-down builtin
+  ///
+  /// Note - not any shuffle-like operation, but specifically the 'shuffle_up'
+  /// and 'shuffle_down' builtins.
+  ///
+  /// @param[in] Ins Instruction to packetize.
+  /// @param[in] ShuffleUpDown Shuffle to packetize.
+  ///
+  /// @return Packetized instructions.
+  Result packetizeSubgroupShuffleUpDown(
+      Instruction *Ins, compiler::utils::GroupCollective ShuffleUpDown);
 
   /// @brief Packetize PHI node.
   ///
@@ -803,6 +819,41 @@ PacketRange Packetizer::createPacket(Value *V, unsigned width) {
   return Result(*this, V, &info).createPacket(width);
 }
 
+Packetizer::Result Packetizer::Impl::getPacketizationResult(
+    Instruction *I, const SmallVectorImpl<Value *> &Packet, bool UpdateStats) {
+  if (Packet.empty()) {
+    return Result(*this);
+  }
+  auto PacketWidth = Packet.size();
+
+  // If there's only one value in the packet, we can assign the new packetized
+  // value to the old instruction directly.
+  if (PacketWidth == 1) {
+    Value *Vec = Packet.front();
+    if (Vec != I) {
+      // Only delete if the vectorized value is different from the scalar.
+      IC.deleteInstructionLater(I);
+    }
+    vectorizeDI(I, Vec);
+    return assign(I, Vec);
+  }
+
+  // Otherwise we have to create a 'Result' out of the packetized values.
+  IC.deleteInstructionLater(I);
+  auto &Info = packets[I];
+  auto Res = Result(*this, I, &Info);
+  auto P = Res.createPacket(PacketWidth);
+  for (unsigned i = 0; i < PacketWidth; ++i) {
+    P[i] = Packet[i];
+  }
+
+  if (UpdateStats) {
+    ++VeczPacketized;
+  }
+  Info.numInstances = PacketWidth;
+  return Res;
+}
+
 Value *Packetizer::Impl::reduceBranchCond(Value *cond, Instruction *terminator,
                                           bool allOf) {
   // Get the branch condition at its natural packet width
@@ -938,6 +989,12 @@ Packetizer::Result Packetizer::Impl::packetize(Value *V) {
           return s;
         }
         break;
+      case compiler::utils::GroupCollective::OpKind::ShuffleUp:
+      case compiler::utils::GroupCollective::OpKind::ShuffleDown:
+        if (auto s = packetizeSubgroupShuffleUpDown(Ins, *shuffle)) {
+          return s;
+        }
+        break;
     }
     // We can't packetize all sub-group shuffle-like operations, but we also
     // can't vectorize or instantiate them - so provide a diagnostic saying as
@@ -1099,29 +1156,8 @@ Packetizer::Result Packetizer::Impl::packetizeInstruction(Instruction *Ins) {
       break;
   }
 
-  if (!results.empty()) {
-    auto packetWidth = results.size();
-    if (packetWidth == 1) {
-      Value *vec = results.front();
-      if (vec != Ins) {
-        // Only delete if the vectorized value is different from the scalar.
-        IC.deleteInstructionLater(Ins);
-      }
-      vectorizeDI(Ins, vec);
-      return assign(Ins, vec);
-    } else {
-      IC.deleteInstructionLater(Ins);
-      auto &info = packets[Ins];
-      auto res = Result(*this, Ins, &info);
-      auto P = res.createPacket(packetWidth);
-      for (unsigned i = 0; i < packetWidth; ++i) {
-        P[i] = results[i];
-        // TODO CA-3376: vectorize the debug instructions
-      }
-      info.numInstances = packetWidth;
-      ++VeczPacketized;
-      return res;
-    }
+  if (auto res = getPacketizationResult(Ins, results, /*update stats*/ true)) {
+    return res;
   }
 
   if (auto *vec = vectorizeInstruction(Ins)) {
@@ -1587,6 +1623,244 @@ Packetizer::Result Packetizer::Impl::packetizeSubgroupShuffleXor(
   return assign(CI, CombinedShuffle);
 }
 
+// Packetizes a mux sub-group shuffle up/down call: emits one vector-wide mux
+// shuffle per lane and extracts that lane's data. Bails on scalable factors.
+Packetizer::Result Packetizer::Impl::packetizeSubgroupShuffleUpDown(
+    Instruction *I, compiler::utils::GroupCollective ShuffleUpDown) {
+  bool IsDown =
+      ShuffleUpDown.Op == compiler::utils::GroupCollective::OpKind::ShuffleDown;
+  assert((IsDown || ShuffleUpDown.Op ==
+                        compiler::utils::GroupCollective::OpKind::ShuffleUp) &&
+         "Invalid shuffle kind");
+
+  auto *const CI = cast<CallInst>(I);
+
+  // We don't support scalable vectorization of sub-group shuffles.
+  if (SimdWidth.isScalable()) {
+    return Packetizer::Result(*this);
+  }
+  unsigned const VF = SimdWidth.getFixedValue();
+
+  // LHS is 'current' for a down-shuffle, and 'previous' for an up-shuffle.
+  auto *const LHSOp = CI->getArgOperand(0);
+  // RHS is 'next' for a down-shuffle, and 'current' for an up-shuffle.
+  auto *const RHSOp = CI->getArgOperand(1);
+  auto *const DeltaOp = CI->getArgOperand(2);
+
+  auto PackDelta = packetize(DeltaOp);
+  if (!PackDelta) {
+    return Packetizer::Result(*this);
+  }
+
+  auto PackLHS = packetize(LHSOp);
+  if (!PackLHS) {
+    return Packetizer::Result(*this);
+  }
+
+  auto PackRHS = packetize(RHSOp);
+  if (!PackRHS) {
+    return Packetizer::Result(*this);
+  }
+
+  auto *const LHSPackVal = PackLHS.getAsValue();
+  auto *const RHSPackVal = PackRHS.getAsValue();
+  assert(LHSPackVal && RHSPackVal &&
+         LHSPackVal->getType() == RHSPackVal->getType());
+
+  // Remember in the example below that the builtins take *deltas* which add
+  // onto the mux sub-group local ID. Therefore a delta of 2 returns different
+  // data for each of the mux sub-group elements.
+  //                |----------------------------|----------------------------|
+  //                |   shuffle_down(A, X, 2)    |   shuffle_down(E, I, 2)    |
+  // VF=4           |----------------------------|----------------------------|
+  //                | s(<A,B,C,D>, <X,Y,Z,W>, 2) | s(<E,F,G,H>, <I,J,K,L>, 2) |
+  // SGIds          |          0,1,2,3           |          4,5,6,7           |
+  // SGIds+D        |          2,3,4,5           |          6,7,8,9           |
+  // MuxSGIds       |          0,0,0,0           |          1,1,1,1           |
+  //                |----------------------------|----------------------------|
+  // M=(SGIds+D)/VF |          0,0,1,1           |          1,1,2,2           |
+  // V=(SGIds+D)%VF |          2,3,0,1           |          2,3,0,1           |
+  //                |----------------------------|----------------------------|
+  // M - MuxSGIds   |          0,0,1,1           |          0,0,1,1           |
+  //                |----------------------------|----------------------------|
+  // Shuff[0]       | s(<A,B,C,D>, <X,Y,Z,W>, 0) | s(<E,F,G,H>, <I,J,K,L>, 0) |
+  // Data returned  | 0+0 => 0 => <A,B,C,D>      | 1+0 => 1 => <E,F,G,H>      |
+  // Shuff[0][V[0]] |     <A,B,C,D>[2] = C       |     <E,F,G,H>[2] = G       |
+  //                |----------------------------|----------------------------|
+  // Shuff[1]       | s(<A,B,C,D>, <X,Y,Z,W>, 0) | s(<E,F,G,H>, <I,J,K,L>, 0) |
+  // Data returned  | 0+0 => 0 => <A,B,C,D>      | 1+0 => 1 => <E,F,G,H>      |
+  // Shuff[1][V[1]] |     <A,B,C,D>[3] = D       |     <E,F,G,H>[3] = H       |
+  //                |----------------------------|----------------------------|
+  // Shuff[2]       | s(<A,B,C,D>, <X,Y,Z,W>, 1) | s(<E,F,G,H>, <I,J,K,L>, 1) |
+  // Data returned  | 0+1 => 1 => <E,F,G,H>      | 1+1 => 2 => 0 => <X,Y,Z,W> |
+  // Shuff[2][V[2]] |     <E,F,G,H>[0] = E       |     <X,Y,Z,W>[0] = X       |
+  //                |----------------------------|----------------------------|
+  // Shuff[3]       | s(<A,B,C,D>, <X,Y,Z,W>, 1) | s(<E,F,G,H>, <I,J,K,L>, 1) |
+  // Data returned  | 0+1 => 1 => <E,F,G,H>      | 1+1 => 2 => 0 => <X,Y,Z,W> |
+  // Shuff[3][V[3]] |     <E,F,G,H>[1] = F       |     <X,Y,Z,W>[1] = Y       |
+  //                |----------------------------|----------------------------|
+  // Result         |          C,D,E,F           |          G,H,X,Y           |
+  IRBuilder<> B(CI);
+
+  // Grab the packetized/vectorized sub-group local IDs
+  auto *const SubgroupLocalIDFn = Ctx.builtins().getOrDeclareMuxBuiltin(
+      compiler::utils::eMuxBuiltinGetSubGroupLocalId, *F.getParent(),
+      {CI->getType()});
+  assert(SubgroupLocalIDFn);
+
+  auto *const SubgroupLocalID =
+      B.CreateCall(SubgroupLocalIDFn, {}, "sg.local.id");
+  auto const Builtin =
+      Ctx.builtins().analyzeBuiltinCall(*SubgroupLocalID, Dimension);
+
+  // Vectorize the sub-group local ID
+  auto *const VecSubgroupLocalID =
+      vectorizeWorkGroupCall(SubgroupLocalID, Builtin);
+  if (!VecSubgroupLocalID) {
+    return Packetizer::Result(*this);
+  }
+  VecSubgroupLocalID->setName("vec.sg.local.id");
+
+  auto *const DeltaVal = PackDelta.getAsValue();
+
+  // The delta is always i32, as is the sub-group local ID. Vectorizing both of
+  // them should result in the same vector type, with as many elements as the
+  // vectorization factor.
+  assert(DeltaVal->getType() == VecSubgroupLocalID->getType() &&
+         DeltaVal->getType()->isVectorTy() &&
+         cast<VectorType>(DeltaVal->getType())
+                 ->getElementCount()
+                 .getKnownMinValue() == VF &&
+         "Unexpected vectorization of sub-group shuffle up/down");
+
+  // Offset the sub-group IDs by the 'delta': added for a down-shuffle and
+  // subtracted for an up-shuffle, as per the semantics of the builtin.
+  auto *const IDPlusDelta = IsDown ? B.CreateAdd(VecSubgroupLocalID, DeltaVal)
+                                   : B.CreateSub(VecSubgroupLocalID, DeltaVal);
+
+  // We need to sanitize the input indices so that they stay within the range
+  // of one vectorized group.
+  auto *const VecIdxFactor = ConstantInt::get(SubgroupLocalID->getType(), VF);
+
+  // Bring this ID into the range of 'mux' sub-groups by dividing it by the
+  // vector size. We have to do this differently for 'up' and 'down' shuffles
+  // because the 'up' shuffles use signed indexing, and we need to round
+  // towards negative infinity to get the right sub-group delta.
+  Value *MuxAbsoluteIDs = nullptr;
+  Value *VecEltIDs = nullptr;
+  if (IsDown) {
+    MuxAbsoluteIDs =
+        B.CreateUDiv(IDPlusDelta, B.CreateVectorSplat(VF, VecIdxFactor));
+    // And into the range of the vector group
+    VecEltIDs =
+        B.CreateURem(IDPlusDelta, B.CreateVectorSplat(VF, VecIdxFactor));
+  } else {
+    // Note that shuffling up is more complicated, owing to the signed
+    // sub-group local IDs.
+    // The steps are identical to the example outlined above, except both the
+    // division and modulo operations performed on the sub-group IDs have to
+    // floor towards negative infinity. That is, we want to see:
+    //                |----------------------------|---------------------------|
+    //                |    shuffle_up(A, X, 2)     |    shuffle_up(E, I, 2)    |
+    // VF=4           |----------------------------|---------------------------|
+    //                | s(<A,B,C,D>, <X,Y,Z,W>, 2) | s(<E,F,G,H>, <I,J,K,L>, 2)|
+    // SGIds          |          0,1,2,3           |          4,5,6,7          |
+    // SGIds-D        |        -2,-1,0,1           |          2,3,4,5          |
+    // MuxSGIds       |          0,0,0,0           |          1,1,1,1          |
+    //                |----------------------------|---------------------------|
+    // both flooring: |                            |                           |
+    // M=(SGIds-D)/VF |        -1,-1,0,0           |          0,0,1,1          |
+    // V=(SGIds-D)%VF |          2,3,0,1           |          2,3,0,1          |
+    //                |----------------------------|---------------------------|
+    // MuxSGIds - M   |          1,1,0,0           |          1,1,0,0          |
+    //                |----------------------------|---------------------------|
+    //
+    // We use the following formulae for division and modulo:
+    // int div_floor(int x, int y) {
+    //   int q = x/y;
+    //   int r = x%y;
+    //   if ((r!=0) && ((r<0) != (y<0))) --q;
+    //   return q;
+    // }
+    // int mod_floor(int x, int y) {
+    //   int r = x%y;
+    //   if ((r!=0) && ((r<0) != (y<0))) { r += y; }
+    //   return r;
+    // }
+    // We note also that the conditions are equal between the two operations,
+    // and that the condition is equivalent to:
+    //   if ((r!=0) && ((x ^ y) < 0)) { ... }
+    // (see https://alive2.llvm.org/ce/z/ebGrdL)
+    auto *X = IDPlusDelta;
+    auto *Y = B.CreateVectorSplat(VF, VecIdxFactor);
+    auto *const Quotient = B.CreateSDiv(X, Y, "quotient");
+    auto *const Remainder = B.CreateSRem(X, Y, "remainder");
+
+    auto *const ArgXor = B.CreateXor(X, Y, "arg.xor");
+    auto *const One = ConstantInt::get(ArgXor->getType(), 1);
+    auto *const Zero = ConstantInt::get(ArgXor->getType(), 0);
+    auto *const ArgSignDifferent =
+        B.CreateICmpSLT(ArgXor, Zero, "signs.different");
+    auto *const RemainderIsNotZero =
+        B.CreateICmpNE(Remainder, Zero, "remainder.nonzero");
+    auto *const ConditionHolds =
+        B.CreateAnd(RemainderIsNotZero, ArgSignDifferent, "condition.holds");
+    auto *const QuotientMinus1 = B.CreateSub(Quotient, One, "quotient.minus.1");
+    auto *const RemainderPlusY = B.CreateAdd(Remainder, Y, "remainder.plus.y");
+
+    MuxAbsoluteIDs = B.CreateSelect(ConditionHolds, QuotientMinus1, Quotient);
+    VecEltIDs = B.CreateSelect(ConditionHolds, RemainderPlusY, Remainder);
+  }
+
+  // We've produced the 'absolute' mux sub-group local IDs for the data we want
+  // to access in each shuffle, but we want to get back to 'relative' IDs in
+  // the form of deltas. Splat the mux sub-group local ID.
+  auto *const SplatSubgroupLocalID =
+      B.CreateVectorSplat(VF, SubgroupLocalID, "splat.sg.local.id");
+  auto *DeltaLHS = MuxAbsoluteIDs;
+  auto *DeltaRHS = SplatSubgroupLocalID;
+  if (!IsDown) {
+    // For 'up' shuffles, we invert the operation as the deltas are implicitly
+    // negative. See above.
+    std::swap(DeltaLHS, DeltaRHS);
+  }
+  auto *const MuxDeltas =
+      B.CreateSub(DeltaLHS, DeltaRHS, "mux.sg.local.id.deltas");
+
+  auto ShuffleID = Ctx.builtins().getMuxGroupCollective(ShuffleUpDown);
+  auto *const ShuffleFn = Ctx.builtins().getOrDeclareMuxBuiltin(
+      ShuffleID, *F.getParent(), {LHSPackVal->getType()});
+  assert(ShuffleFn);
+
+  SmallVector<Value *, 16> Results(VF);
+  for (unsigned i = 0; i != VF; i++) {
+    auto *const MuxDelta = B.CreateExtractElement(MuxDeltas, B.getInt32(i));
+    auto *const Shuffle =
+        B.CreateCall(ShuffleFn, {LHSPackVal, RHSPackVal, MuxDelta});
+
+    Value *Elt = nullptr;
+    auto *const Idx = B.CreateExtractElement(VecEltIDs, B.getInt32(i));
+    if (auto *DataVecTy = dyn_cast<VectorType>(LHSOp->getType()); !DataVecTy) {
+      Elt = B.CreateExtractElement(Shuffle, Idx);
+    } else {
+      // For vector data types, Idx is the 'base' of the sub-vector: extract
+      // its VecWidth elements, stored sequentially from Idx * VecWidth.
+      Elt = UndefValue::get(DataVecTy);
+      auto VecWidth = DataVecTy->getElementCount().getFixedValue();
+      auto *const VecVecGroupIdx = B.CreateMul(Idx, B.getInt32(VecWidth));
+      for (unsigned j = 0; j != VecWidth; j++) {
+        auto *const E = B.CreateExtractElement(
+            Shuffle, B.CreateAdd(VecVecGroupIdx, B.getInt32(j)));
+        Elt = B.CreateInsertElement(Elt, E, B.getInt32(j));
+      }
+    }
+    Results[i] = Elt;
+  }
+
+  IC.deleteInstructionLater(CI);
+  return getPacketizationResult(I, Results);
+}
+
 Value *Packetizer::Impl::packetizeMaskVarying(Instruction *I) {
   if (auto memop = MemOp::get(I)) {
     auto *const mask = memop->getMaskOperand();
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_down.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_down.ll
index a3566c8e61dd7..3e90d729f5b77 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_down.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_down.ll
@@ -20,17 +20,187 @@
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-; CHECK: Could not packetize sub-group shuffle %shuffle_down
-define spir_kernel void @kernel(ptr %in, ptr %out) {
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel(ptr %lhsptr, ptr %rhsptr, ptr %out)
+; CHECK: [[LHS:%.*]] = load <4 x float>, ptr %arrayidx.lhs, align 4
+; CHECK: [[RHS:%.*]] = load <4 x float>, ptr %arrayidx.rhs, align 4
+
+; CHECK: [[DELTAS:%.*]] = add <4 x i32> {{%.*}}, <i32 1, i32 1, i32 1, i32 1>
+; CHECK: [[MUXIDS:%.*]] = udiv <4 x i32> [[DELTAS]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK: [[VECELTS:%.*]] = urem <4 x i32> [[DELTAS]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK: [[MUXDELTAS:%.*]] = sub <4 x i32> [[MUXIDS]], {{%.*}}
+
+; CHECK: [[DELTA0:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 0
+; CHECK: [[SHUFF0:%.*]] = call <4 x float> @__mux_sub_group_shuffle_down_v4f32(
+; CHECK-SAME:                      <4 x float> [[LHS]], <4 x float> [[RHS]], i32 [[DELTA0]])
+; CHECK: [[VECIDX0:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 0
+; CHECK: [[ELT0:%.*]] = extractelement <4 x float> [[SHUFF0]], i32 [[VECIDX0]]
+
+; CHECK: [[DELTA1:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 1
+; CHECK: [[SHUFF1:%.*]] = call <4 x float> @__mux_sub_group_shuffle_down_v4f32(
+; CHECK-SAME:                      <4 x float> [[LHS]], <4 x float> [[RHS]], i32 [[DELTA1]])
+; CHECK: [[VECIDX1:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 1
+; CHECK: [[ELT1:%.*]] = extractelement <4 x float> [[SHUFF1]], i32 [[VECIDX1]]
+
+; CHECK: [[DELTA2:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 2
+; CHECK: [[SHUFF2:%.*]] = call <4 x float> @__mux_sub_group_shuffle_down_v4f32(
+; CHECK-SAME:                      <4 x float> [[LHS]], <4 x float> [[RHS]], i32 [[DELTA2]])
+; CHECK: [[VECIDX2:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 2
+; CHECK: [[ELT2:%.*]] = extractelement <4 x float> [[SHUFF2]], i32 [[VECIDX2]]
+
+; CHECK: [[DELTA3:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 3
+; CHECK: [[SHUFF3:%.*]] = call <4 x float> @__mux_sub_group_shuffle_down_v4f32(
+; CHECK-SAME:                      <4 x float> [[LHS]], <4 x float> [[RHS]], i32 [[DELTA3]])
+; CHECK: [[VECIDX3:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 3
+; CHECK: [[ELT3:%.*]] = extractelement <4 x float> [[SHUFF3]], i32 [[VECIDX3]]
+define spir_kernel void @kernel(ptr %lhsptr, ptr %rhsptr, ptr %out) {
+  %gid = tail call i64 @__mux_get_global_id(i32 0)
+  %arrayidx.lhs = getelementptr inbounds float, ptr %lhsptr, i64 %gid
+  %lhs = load float, ptr %arrayidx.lhs, align 4
+  %arrayidx.rhs = getelementptr inbounds float, ptr %rhsptr, i64 %gid
+  %rhs = load float, ptr %arrayidx.rhs, align 4
+  %shuffle_down = call float @__mux_sub_group_shuffle_down_f32(float %lhs, float %rhs, i32 1)
+  %arrayidx.out = getelementptr inbounds float, ptr %out, i64 %gid
+  store float %shuffle_down, ptr %arrayidx.out, align 4
+  ret void
+}
+
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_vec_data(ptr %lhsptr, ptr %rhsptr, ptr %out)
+; CHECK: [[DELTAS:%.*]] = add <4 x i32> {{%.*}}, <i32 2, i32 2, i32 2, i32 2>
+; CHECK: [[MUXIDS:%.*]] = udiv <4 x i32> [[DELTAS]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK: [[VECELTS:%.*]] = urem <4 x i32> [[DELTAS]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK: [[MUXDELTAS:%.*]] = sub <4 x i32> [[MUXIDS]], {{%.*}}
+
+; CHECK: [[DELTA0:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 0
+; CHECK: [[SHUFF0:%.*]] = call <16 x i8> @__mux_sub_group_shuffle_down_v16i8(
+; CHECK-SAME:                      <16 x i8> [[LHS:%.*]], <16 x i8> [[RHS:%.*]], i32 [[DELTA0]])
+; CHECK: [[SUBVECIDX0:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 0
+; CHECK: [[ELTBASE0:%.*]] = mul i32 [[SUBVECIDX0]], 4
+; CHECK: [[VECIDX00:%.*]] = add i32 [[ELTBASE0]], 0
+; CHECK: [[ELT00:%.*]] = extractelement <16 x i8> [[SHUFF0]], i32 [[VECIDX00]]
+; CHECK: [[VEC00:%.*]] = insertelement <4 x i8> undef, i8 [[ELT00]], i32 0
+; CHECK: [[VECIDX01:%.*]] = add i32 [[ELTBASE0]], 1
+; CHECK: [[ELT01:%.*]] = extractelement <16 x i8> [[SHUFF0]], i32 [[VECIDX01]]
+; CHECK: [[VEC01:%.*]] = insertelement <4 x i8> [[VEC00]], i8 [[ELT01]], i32 1
+; CHECK: [[VECIDX02:%.*]] = add i32 [[ELTBASE0]], 2
+; CHECK: [[ELT02:%.*]] = extractelement <16 x i8> [[SHUFF0]], i32 [[VECIDX02]]
+; CHECK: [[VEC02:%.*]] = insertelement <4 x i8> [[VEC01]], i8 [[ELT02]], i32 2
+; CHECK: [[VECIDX03:%.*]] = add i32 [[ELTBASE0]], 3
+; CHECK: [[ELT03:%.*]] = extractelement <16 x i8> [[SHUFF0]], i32 [[VECIDX03]]
+; CHECK: [[VEC03:%.*]] = insertelement <4 x i8> [[VEC02]], i8 [[ELT03]], i32 3
+
+; CHECK: [[DELTA1:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 1
+; CHECK: [[SHUFF1:%.*]] = call <16 x i8> @__mux_sub_group_shuffle_down_v16i8(
+; CHECK-SAME:                      <16 x i8> [[LHS]], <16 x i8> [[RHS]], i32 [[DELTA1]])
+; CHECK: [[SUBVECIDX1:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 1
+; CHECK: [[ELTBASE1:%.*]] = mul i32 [[SUBVECIDX1]], 4
+; CHECK: [[VECIDX10:%.*]] = add i32 [[ELTBASE1]], 0
+; CHECK: [[ELT10:%.*]] = extractelement <16 x i8> [[SHUFF1]], i32 [[VECIDX10]]
+; CHECK: [[VEC10:%.*]] = insertelement <4 x i8> undef, i8 [[ELT10]], i32 0
+; CHECK: [[VECIDX11:%.*]] = add i32 [[ELTBASE1]], 1
+; CHECK: [[ELT11:%.*]] = extractelement <16 x i8> [[SHUFF1]], i32 [[VECIDX11]]
+; CHECK: [[VEC11:%.*]] = insertelement <4 x i8> [[VEC10]], i8 [[ELT11]], i32 1
+; CHECK: [[VECIDX12:%.*]] = add i32 [[ELTBASE1]], 2
+; CHECK: [[ELT12:%.*]] = extractelement <16 x i8> [[SHUFF1]], i32 [[VECIDX12]]
+; CHECK: [[VEC12:%.*]] = insertelement <4 x i8> [[VEC11]], i8 [[ELT12]], i32 2
+; CHECK: [[VECIDX13:%.*]] = add i32 [[ELTBASE1]], 3
+; CHECK: [[ELT13:%.*]] = extractelement <16 x i8> [[SHUFF1]], i32 [[VECIDX13]]
+; CHECK: [[VEC13:%.*]] = insertelement <4 x i8> [[VEC12]], i8 [[ELT13]], i32 3
+
+; CHECK: [[DELTA2:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 2
+; CHECK: [[SHUFF2:%.*]] = call <16 x i8> @__mux_sub_group_shuffle_down_v16i8(
+; CHECK-SAME:                      <16 x i8> [[LHS]], <16 x i8> [[RHS]], i32 [[DELTA2]])
+; CHECK: [[SUBVECIDX2:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 2
+; CHECK: [[ELTBASE2:%.*]] = mul i32 [[SUBVECIDX2]], 4
+; CHECK: [[VECIDX20:%.*]] = add i32 [[ELTBASE2]], 0
+; CHECK: [[ELT20:%.*]] = extractelement <16 x i8> [[SHUFF2]], i32 [[VECIDX20]]
+; CHECK: [[VEC20:%.*]] = insertelement <4 x i8> undef, i8 [[ELT20]], i32 0
+; CHECK: [[VECIDX21:%.*]] = add i32 [[ELTBASE2]], 1
+; CHECK: [[ELT21:%.*]] = extractelement <16 x i8> [[SHUFF2]], i32 [[VECIDX21]]
+; CHECK: [[VEC21:%.*]] = insertelement <4 x i8> [[VEC20]], i8 [[ELT21]], i32 1
+; CHECK: [[VECIDX22:%.*]] = add i32 [[ELTBASE2]], 2
+; CHECK: [[ELT22:%.*]] = extractelement <16 x i8> [[SHUFF2]], i32 [[VECIDX22]]
+; CHECK: [[VEC22:%.*]] = insertelement <4 x i8> [[VEC21]], i8 [[ELT22]], i32 2
+; CHECK: [[VECIDX23:%.*]] = add i32 [[ELTBASE2]], 3
+; CHECK: [[ELT23:%.*]] = extractelement <16 x i8> [[SHUFF2]], i32 [[VECIDX23]]
+; CHECK: [[VEC23:%.*]] = insertelement <4 x i8> [[VEC22]], i8 [[ELT23]], i32 3
+
+; CHECK: [[DELTA3:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 3
+; CHECK: [[SHUFF3:%.*]] = call <16 x i8> @__mux_sub_group_shuffle_down_v16i8(
+; CHECK-SAME:                      <16 x i8> [[LHS]], <16 x i8> [[RHS]], i32 [[DELTA3]])
+; CHECK: [[SUBVECIDX3:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 3
+; CHECK: [[ELTBASE3:%.*]] = mul i32 [[SUBVECIDX3]], 4
+; CHECK: [[VECIDX30:%.*]] = add i32 [[ELTBASE3]], 0
+; CHECK: [[ELT30:%.*]] = extractelement <16 x i8> [[SHUFF3]], i32 [[VECIDX30]]
+; CHECK: [[VEC30:%.*]] = insertelement <4 x i8> undef, i8 [[ELT30]], i32 0
+; CHECK: [[VECIDX31:%.*]] = add i32 [[ELTBASE3]], 1
+; CHECK: [[ELT31:%.*]] = extractelement <16 x i8> [[SHUFF3]], i32 [[VECIDX31]]
+; CHECK: [[VEC31:%.*]] = insertelement <4 x i8> [[VEC30]], i8 [[ELT31]], i32 1
+; CHECK: [[VECIDX32:%.*]] = add i32 [[ELTBASE3]], 2
+; CHECK: [[ELT32:%.*]] = extractelement <16 x i8> [[SHUFF3]], i32 [[VECIDX32]]
+; CHECK: [[VEC32:%.*]] = insertelement <4 x i8> [[VEC31]], i8 [[ELT32]], i32 2
+; CHECK: [[VECIDX33:%.*]] = add i32 [[ELTBASE3]], 3
+; CHECK: [[ELT33:%.*]] = extractelement <16 x i8> [[SHUFF3]], i32 [[VECIDX33]]
+; CHECK: [[VEC33:%.*]] = insertelement <4 x i8> [[VEC32]], i8 [[ELT33]], i32 3
+define spir_kernel void @kernel_vec_data(ptr %lhsptr, ptr %rhsptr, ptr %out) {
+  %gid = tail call i64 @__mux_get_global_id(i32 0)
+  %arrayidx.lhs = getelementptr inbounds <4 x i8>, ptr %lhsptr, i64 %gid
+  %lhs = load <4 x i8>, ptr %arrayidx.lhs, align 4
+  %arrayidx.rhs = getelementptr inbounds <4 x i8>, ptr %rhsptr, i64 %gid
+  %rhs = load <4 x i8>, ptr %arrayidx.rhs, align 4
+  %shuffle_down = call <4 x i8> @__mux_sub_group_shuffle_down_v4i8(<4 x i8> %lhs, <4 x i8> %rhs, i32 2)
+  %arrayidx.out = getelementptr inbounds <4 x i8>, ptr %out, i64 %gid
+  store <4 x i8> %shuffle_down, ptr %arrayidx.out, align 4
+  ret void
+}
+
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_varying_delta(ptr %lhsptr, ptr %rhsptr, ptr %deltaptr, ptr %out)
+; CHECK: [[LHS:%.*]] = load <4 x float>, ptr %arrayidx.lhs, align 4
+; CHECK: [[RHS:%.*]] = load <4 x float>, ptr %arrayidx.rhs, align 4
+; CHECK: [[DELTALD:%.*]] = load <4 x i32>, ptr %arrayidx.deltas, align 4
+
+; CHECK: [[DELTAS:%.*]] = add <4 x i32> {{%.*}}, [[DELTALD]]
+; CHECK: [[MUXIDS:%.*]] = udiv <4 x i32> [[DELTAS]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK: [[VECELTS:%.*]] = urem <4 x i32> [[DELTAS]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK: [[MUXDELTAS:%.*]] = sub <4 x i32> [[MUXIDS]], {{%.*}}
+
+; CHECK: [[DELTA0:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 0
+; CHECK: [[SHUFF0:%.*]] = call <4 x float> @__mux_sub_group_shuffle_down_v4f32(
+; CHECK-SAME:                      <4 x float> [[LHS]], <4 x float> [[RHS]], i32 [[DELTA0]])
+; CHECK: [[VECIDX0:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 0
+; CHECK: [[ELT0:%.*]] = extractelement <4 x float> [[SHUFF0]], i32 [[VECIDX0]]
+
+; CHECK: [[DELTA1:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 1
+; CHECK: [[SHUFF1:%.*]] = call <4 x float> @__mux_sub_group_shuffle_down_v4f32(
+; CHECK-SAME:                      <4 x float> [[LHS]], <4 x float> [[RHS]], i32 [[DELTA1]])
+; CHECK: [[VECIDX1:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 1
+; CHECK: [[ELT1:%.*]] = extractelement <4 x float> [[SHUFF1]], i32 [[VECIDX1]]
+
+; CHECK: [[DELTA2:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 2
+; CHECK: [[SHUFF2:%.*]] = call <4 x float> @__mux_sub_group_shuffle_down_v4f32(
+; CHECK-SAME:                      <4 x float> [[LHS]], <4 x float> [[RHS]], i32 [[DELTA2]])
+; CHECK: [[VECIDX2:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 2
+; CHECK: [[ELT2:%.*]] = extractelement <4 x float> [[SHUFF2]], i32 [[VECIDX2]]
+
+; CHECK: [[DELTA3:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 3
+; CHECK: [[SHUFF3:%.*]] = call <4 x float> @__mux_sub_group_shuffle_down_v4f32(
+; CHECK-SAME:                      <4 x float> [[LHS]], <4 x float> [[RHS]], i32 [[DELTA3]])
+; CHECK: [[VECIDX3:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 3
+; CHECK: [[ELT3:%.*]] = extractelement <4 x float> [[SHUFF3]], i32 [[VECIDX3]]
+define spir_kernel void @kernel_varying_delta(ptr %lhsptr, ptr %rhsptr, ptr %deltaptr, ptr %out) {
   %gid = tail call i64 @__mux_get_global_id(i32 0)
-  %arrayidx.in = getelementptr inbounds i8, ptr %in, i64 %gid
-  %val = load i8, ptr %arrayidx.in, align 8
-  %shuffle_down = call i8 @__mux_sub_group_shuffle_down_i8(i8 %val, i8 %val, i32 1)
-  %arrayidx.out = getelementptr inbounds i8, ptr %out, i64 %gid
-  store i8 %shuffle_down, ptr %arrayidx.out, align 8
+  %arrayidx.lhs = getelementptr inbounds float, ptr %lhsptr, i64 %gid
+  %lhs = load float, ptr %arrayidx.lhs, align 4
+  %arrayidx.rhs = getelementptr inbounds float, ptr %rhsptr, i64 %gid
+  %rhs = load float, ptr %arrayidx.rhs, align 4
+  %arrayidx.deltas = getelementptr inbounds i32, ptr %deltaptr, i64 %gid
+  %delta = load i32, ptr %arrayidx.deltas, align 4
+  %shuffle_down = call float @__mux_sub_group_shuffle_down_f32(float %lhs, float %rhs, i32 %delta)
+  %arrayidx.out = getelementptr inbounds float, ptr %out, i64 %gid
+  store float %shuffle_down, ptr %arrayidx.out, align 4
   ret void
 }
 
 declare i64 @__mux_get_global_id(i32)
 
-declare i8 @__mux_sub_group_shuffle_down_i8(i8 %curr, i8 %next, i32 %delta)
+declare float @__mux_sub_group_shuffle_down_f32(float %curr, float %next, i32 %delta)
+declare <4 x i8> @__mux_sub_group_shuffle_down_v4i8(<4 x i8> %curr, <4 x i8> %next, i32 %delta)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_up.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_up.ll
index e27365e85c946..a3e645e88ac17 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_up.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_up.ll
@@ -20,12 +20,217 @@
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-; CHECK: Could not packetize sub-group shuffle %shuffle_up
-define spir_kernel void @kernel(ptr %in, ptr %out) {
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel(ptr %lhsptr, ptr %rhsptr, ptr %out)
+; CHECK: [[LHS:%.*]] = load <4 x float>, ptr %arrayidx.lhs, align 4
+; CHECK: [[RHS:%.*]] = load <4 x float>, ptr %arrayidx.rhs, align 4
+
+; CHECK: [[DELTAS:%.*]] = sub <4 x i32> {{%.*}}, <i32 1, i32 1, i32 1, i32 1>
+; CHECK: [[QUOTIENT:%.*]] = sdiv <4 x i32> [[DELTAS]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK: [[REMAINDER:%.*]] = srem <4 x i32> [[DELTAS]], <i32 4, i32 4, i32 4, i32 4>
+
+; CHECK: [[ARGXOR:%.*]] = xor <4 x i32> [[DELTAS]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK: [[SIGNDIFF:%.*]] = icmp slt <4 x i32> [[ARGXOR]], zeroinitializer
+; CHECK: [[REMNONZERO:%.*]] = icmp ne <4 x i32> [[REMAINDER]], zeroinitializer
+; CHECK: [[CONDITION:%.*]] = and <4 x i1> [[REMNONZERO]], [[SIGNDIFF]]
+
+; CHECK: [[MIN1:%.*]] = sub <4 x i32> [[QUOTIENT]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK: [[PLUSR:%.*]] = add <4 x i32> [[REMAINDER]], <i32 4, i32 4, i32 4, i32 4> 
+
+; CHECK: [[MUXIDS:%.*]] = select <4 x i1> [[CONDITION]], <4 x i32> [[MIN1]], <4 x i32> [[QUOTIENT]]
+; CHECK: [[VECELTS:%.*]] = select <4 x i1> [[CONDITION]], <4 x i32> [[PLUSR]], <4 x i32> [[REMAINDER]]
+
+; CHECK: [[MUXDELTAS:%.*]] = sub <4 x i32> {{%.*}}, [[MUXIDS]]
+
+; CHECK: [[DELTA0:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 0
+; CHECK: [[SHUFF0:%.*]] = call <4 x float> @__mux_sub_group_shuffle_up_v4f32(
+; CHECK-SAME:                      <4 x float> [[LHS]], <4 x float> [[RHS]], i32 [[DELTA0]])
+; CHECK: [[VECIDX0:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 0
+; CHECK: [[ELT0:%.*]] = extractelement <4 x float> [[SHUFF0]], i32 [[VECIDX0]]
+
+; CHECK: [[DELTA1:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 1
+; CHECK: [[SHUFF1:%.*]] = call <4 x float> @__mux_sub_group_shuffle_up_v4f32(
+; CHECK-SAME:                      <4 x float> [[LHS]], <4 x float> [[RHS]], i32 [[DELTA1]])
+; CHECK: [[VECIDX1:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 1
+; CHECK: [[ELT1:%.*]] = extractelement <4 x float> [[SHUFF1]], i32 [[VECIDX1]]
+
+; CHECK: [[DELTA2:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 2
+; CHECK: [[SHUFF2:%.*]] = call <4 x float> @__mux_sub_group_shuffle_up_v4f32(
+; CHECK-SAME:                      <4 x float> [[LHS]], <4 x float> [[RHS]], i32 [[DELTA2]])
+; CHECK: [[VECIDX2:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 2
+; CHECK: [[ELT2:%.*]] = extractelement <4 x float> [[SHUFF2]], i32 [[VECIDX2]]
+
+; CHECK: [[DELTA3:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 3
+; CHECK: [[SHUFF3:%.*]] = call <4 x float> @__mux_sub_group_shuffle_up_v4f32(
+; CHECK-SAME:                      <4 x float> [[LHS]], <4 x float> [[RHS]], i32 [[DELTA3]])
+; CHECK: [[VECIDX3:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 3
+; CHECK: [[ELT3:%.*]] = extractelement <4 x float> [[SHUFF3]], i32 [[VECIDX3]]
+define spir_kernel void @kernel(ptr %lhsptr, ptr %rhsptr, ptr %out) {
+  %gid = tail call i64 @__mux_get_global_id(i32 0)
+  %arrayidx.lhs = getelementptr inbounds float, ptr %lhsptr, i64 %gid
+  %lhs = load float, ptr %arrayidx.lhs, align 4
+  %arrayidx.rhs = getelementptr inbounds float, ptr %rhsptr, i64 %gid
+  %rhs = load float, ptr %arrayidx.rhs, align 4
+  %shuffle_up = call float @__mux_sub_group_shuffle_up_f32(float %lhs, float %rhs, i32 1)
+  %arrayidx.out = getelementptr inbounds float, ptr %out, i64 %gid
+  store float %shuffle_up, ptr %arrayidx.out, align 8
+  ret void
+}
+
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_vec_data(ptr %lhsptr, ptr %rhsptr, ptr %out)
+; CHECK: [[DELTAS:%.*]] = sub <4 x i32> {{%.*}}, <i32 2, i32 2, i32 2, i32 2>
+; CHECK: [[QUOTIENT:%.*]] = sdiv <4 x i32> [[DELTAS]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK: [[REMAINDER:%.*]] = srem <4 x i32> [[DELTAS]], <i32 4, i32 4, i32 4, i32 4>
+
+; CHECK: [[ARGXOR:%.*]] = xor <4 x i32> [[DELTAS]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK: [[SIGNDIFF:%.*]] = icmp slt <4 x i32> [[ARGXOR]], zeroinitializer
+; CHECK: [[REMNONZERO:%.*]] = icmp ne <4 x i32> [[REMAINDER]], zeroinitializer
+; CHECK: [[CONDITION:%.*]] = and <4 x i1> [[REMNONZERO]], [[SIGNDIFF]]
+
+; CHECK: [[MIN1:%.*]] = sub <4 x i32> [[QUOTIENT]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK: [[PLUSR:%.*]] = add <4 x i32> [[REMAINDER]], <i32 4, i32 4, i32 4, i32 4> 
+
+; CHECK: [[MUXIDS:%.*]] = select <4 x i1> [[CONDITION]], <4 x i32> [[MIN1]], <4 x i32> [[QUOTIENT]]
+; CHECK: [[VECELTS:%.*]] = select <4 x i1> [[CONDITION]], <4 x i32> [[PLUSR]], <4 x i32> [[REMAINDER]]
+
+; CHECK: [[MUXDELTAS:%.*]] = sub <4 x i32> {{%.*}}, [[MUXIDS]]
+
+; CHECK: [[DELTA0:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 0
+; CHECK: [[SHUFF0:%.*]] = call <16 x i8> @__mux_sub_group_shuffle_up_v16i8(
+; CHECK-SAME:                      <16 x i8> [[LHS:%.*]], <16 x i8> [[RHS:%.*]], i32 [[DELTA0]])
+; CHECK: [[SUBVECIDX0:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 0
+; CHECK: [[ELTBASE0:%.*]] = mul i32 [[SUBVECIDX0]], 4
+; CHECK: [[VECIDX00:%.*]] = add i32 [[ELTBASE0]], 0
+; CHECK: [[ELT00:%.*]] = extractelement <16 x i8> [[SHUFF0]], i32 [[VECIDX00]]
+; CHECK: [[VEC00:%.*]] = insertelement <4 x i8> undef, i8 [[ELT00]], i32 0
+; CHECK: [[VECIDX01:%.*]] = add i32 [[ELTBASE0]], 1
+; CHECK: [[ELT01:%.*]] = extractelement <16 x i8> [[SHUFF0]], i32 [[VECIDX01]]
+; CHECK: [[VEC01:%.*]] = insertelement <4 x i8> [[VEC00]], i8 [[ELT01]], i32 1
+; CHECK: [[VECIDX02:%.*]] = add i32 [[ELTBASE0]], 2
+; CHECK: [[ELT02:%.*]] = extractelement <16 x i8> [[SHUFF0]], i32 [[VECIDX02]]
+; CHECK: [[VEC02:%.*]] = insertelement <4 x i8> [[VEC01]], i8 [[ELT02]], i32 2
+; CHECK: [[VECIDX03:%.*]] = add i32 [[ELTBASE0]], 3
+; CHECK: [[ELT03:%.*]] = extractelement <16 x i8> [[SHUFF0]], i32 [[VECIDX03]]
+; CHECK: [[VEC03:%.*]] = insertelement <4 x i8> [[VEC02]], i8 [[ELT03]], i32 3
+
+; CHECK: [[DELTA1:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 1
+; CHECK: [[SHUFF1:%.*]] = call <16 x i8> @__mux_sub_group_shuffle_up_v16i8(
+; CHECK-SAME:                      <16 x i8> [[LHS]], <16 x i8> [[RHS]], i32 [[DELTA1]])
+; CHECK: [[SUBVECIDX1:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 1
+; CHECK: [[ELTBASE1:%.*]] = mul i32 [[SUBVECIDX1]], 4
+; CHECK: [[VECIDX10:%.*]] = add i32 [[ELTBASE1]], 0
+; CHECK: [[ELT10:%.*]] = extractelement <16 x i8> [[SHUFF1]], i32 [[VECIDX10]]
+; CHECK: [[VEC10:%.*]] = insertelement <4 x i8> undef, i8 [[ELT10]], i32 0
+; CHECK: [[VECIDX11:%.*]] = add i32 [[ELTBASE1]], 1
+; CHECK: [[ELT11:%.*]] = extractelement <16 x i8> [[SHUFF1]], i32 [[VECIDX11]]
+; CHECK: [[VEC11:%.*]] = insertelement <4 x i8> [[VEC10]], i8 [[ELT11]], i32 1
+; CHECK: [[VECIDX12:%.*]] = add i32 [[ELTBASE1]], 2
+; CHECK: [[ELT12:%.*]] = extractelement <16 x i8> [[SHUFF1]], i32 [[VECIDX12]]
+; CHECK: [[VEC12:%.*]] = insertelement <4 x i8> [[VEC11]], i8 [[ELT12]], i32 2
+; CHECK: [[VECIDX13:%.*]] = add i32 [[ELTBASE1]], 3
+; CHECK: [[ELT13:%.*]] = extractelement <16 x i8> [[SHUFF1]], i32 [[VECIDX13]]
+; CHECK: [[VEC13:%.*]] = insertelement <4 x i8> [[VEC12]], i8 [[ELT13]], i32 3
+
+; CHECK: [[DELTA2:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 2
+; CHECK: [[SHUFF2:%.*]] = call <16 x i8> @__mux_sub_group_shuffle_up_v16i8(
+; CHECK-SAME:                      <16 x i8> [[LHS]], <16 x i8> [[RHS]], i32 [[DELTA2]])
+; CHECK: [[SUBVECIDX2:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 2
+; CHECK: [[ELTBASE2:%.*]] = mul i32 [[SUBVECIDX2]], 4
+; CHECK: [[VECIDX20:%.*]] = add i32 [[ELTBASE2]], 0
+; CHECK: [[ELT20:%.*]] = extractelement <16 x i8> [[SHUFF2]], i32 [[VECIDX20]]
+; CHECK: [[VEC20:%.*]] = insertelement <4 x i8> undef, i8 [[ELT20]], i32 0
+; CHECK: [[VECIDX21:%.*]] = add i32 [[ELTBASE2]], 1
+; CHECK: [[ELT21:%.*]] = extractelement <16 x i8> [[SHUFF2]], i32 [[VECIDX21]]
+; CHECK: [[VEC21:%.*]] = insertelement <4 x i8> [[VEC20]], i8 [[ELT21]], i32 1
+; CHECK: [[VECIDX22:%.*]] = add i32 [[ELTBASE2]], 2
+; CHECK: [[ELT22:%.*]] = extractelement <16 x i8> [[SHUFF2]], i32 [[VECIDX22]]
+; CHECK: [[VEC22:%.*]] = insertelement <4 x i8> [[VEC21]], i8 [[ELT22]], i32 2
+; CHECK: [[VECIDX23:%.*]] = add i32 [[ELTBASE2]], 3
+; CHECK: [[ELT23:%.*]] = extractelement <16 x i8> [[SHUFF2]], i32 [[VECIDX23]]
+; CHECK: [[VEC23:%.*]] = insertelement <4 x i8> [[VEC22]], i8 [[ELT23]], i32 3
+
+; CHECK: [[DELTA3:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 3
+; CHECK: [[SHUFF3:%.*]] = call <16 x i8> @__mux_sub_group_shuffle_up_v16i8(
+; CHECK-SAME:                      <16 x i8> [[LHS]], <16 x i8> [[RHS]], i32 [[DELTA3]])
+; CHECK: [[SUBVECIDX3:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 3
+; CHECK: [[ELTBASE3:%.*]] = mul i32 [[SUBVECIDX3]], 4
+; CHECK: [[VECIDX30:%.*]] = add i32 [[ELTBASE3]], 0
+; CHECK: [[ELT30:%.*]] = extractelement <16 x i8> [[SHUFF3]], i32 [[VECIDX30]]
+; CHECK: [[VEC30:%.*]] = insertelement <4 x i8> undef, i8 [[ELT30]], i32 0
+; CHECK: [[VECIDX31:%.*]] = add i32 [[ELTBASE3]], 1
+; CHECK: [[ELT31:%.*]] = extractelement <16 x i8> [[SHUFF3]], i32 [[VECIDX31]]
+; CHECK: [[VEC31:%.*]] = insertelement <4 x i8> [[VEC30]], i8 [[ELT31]], i32 1
+; CHECK: [[VECIDX32:%.*]] = add i32 [[ELTBASE3]], 2
+; CHECK: [[ELT32:%.*]] = extractelement <16 x i8> [[SHUFF3]], i32 [[VECIDX32]]
+; CHECK: [[VEC32:%.*]] = insertelement <4 x i8> [[VEC31]], i8 [[ELT32]], i32 2
+; CHECK: [[VECIDX33:%.*]] = add i32 [[ELTBASE3]], 3
+; CHECK: [[ELT33:%.*]] = extractelement <16 x i8> [[SHUFF3]], i32 [[VECIDX33]]
+; CHECK: [[VEC33:%.*]] = insertelement <4 x i8> [[VEC32]], i8 [[ELT33]], i32 3
+define spir_kernel void @kernel_vec_data(ptr %lhsptr, ptr %rhsptr, ptr %out) {
+  %gid = tail call i64 @__mux_get_global_id(i32 0)
+  %arrayidx.lhs = getelementptr inbounds <4 x i8>, ptr %lhsptr, i64 %gid
+  %lhs = load <4 x i8>, ptr %arrayidx.lhs, align 4
+  %arrayidx.rhs = getelementptr inbounds <4 x i8>, ptr %rhsptr, i64 %gid
+  %rhs = load <4 x i8>, ptr %arrayidx.rhs, align 4
+  %shuffle_up = call <4 x i8> @__mux_sub_group_shuffle_up_v4i8(<4 x i8> %lhs, <4 x i8> %rhs, i32 2)
+  %arrayidx.out = getelementptr inbounds <4 x i8>, ptr %out, i64 %gid
+  store <4 x i8> %shuffle_up, ptr %arrayidx.out, align 4
+  ret void
+}
+
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_varying_delta(ptr %lhsptr, ptr %rhsptr, ptr %deltaptr, ptr %out)
+; CHECK: [[LHS:%.*]] = load <4 x float>, ptr %arrayidx.lhs, align 4
+; CHECK: [[RHS:%.*]] = load <4 x float>, ptr %arrayidx.rhs, align 4
+; CHECK: [[DELTALD:%.*]] = load <4 x i32>, ptr %arrayidx.deltas, align 4
+
+; CHECK: [[DELTAS:%.*]] = sub <4 x i32> {{%.*}}, [[DELTALD]]
+; CHECK: [[QUOTIENT:%.*]] = sdiv <4 x i32> [[DELTAS]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK: [[REMAINDER:%.*]] = srem <4 x i32> [[DELTAS]], <i32 4, i32 4, i32 4, i32 4>
+
+; CHECK: [[ARGXOR:%.*]] = xor <4 x i32> [[DELTAS]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK: [[SIGNDIFF:%.*]] = icmp slt <4 x i32> [[ARGXOR]], zeroinitializer
+; CHECK: [[REMNONZERO:%.*]] = icmp ne <4 x i32> [[REMAINDER]], zeroinitializer
+; CHECK: [[CONDITION:%.*]] = and <4 x i1> [[REMNONZERO]], [[SIGNDIFF]]
+
+; CHECK: [[MIN1:%.*]] = sub <4 x i32> [[QUOTIENT]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK: [[PLUSR:%.*]] = add <4 x i32> [[REMAINDER]], <i32 4, i32 4, i32 4, i32 4> 
+
+; CHECK: [[MUXIDS:%.*]] = select <4 x i1> [[CONDITION]], <4 x i32> [[MIN1]], <4 x i32> [[QUOTIENT]]
+; CHECK: [[VECELTS:%.*]] = select <4 x i1> [[CONDITION]], <4 x i32> [[PLUSR]], <4 x i32> [[REMAINDER]]
+
+; CHECK: [[MUXDELTAS:%.*]] = sub <4 x i32> {{%.*}}, [[MUXIDS]]
+
+; CHECK: [[DELTA0:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 0
+; CHECK: [[SHUFF0:%.*]] = call <4 x float> @__mux_sub_group_shuffle_up_v4f32(
+; CHECK-SAME:                      <4 x float> [[LHS]], <4 x float> [[RHS]], i32 [[DELTA0]])
+; CHECK: [[VECIDX0:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 0
+; CHECK: [[ELT0:%.*]] = extractelement <4 x float> [[SHUFF0]], i32 [[VECIDX0]]
+
+; CHECK: [[DELTA1:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 1
+; CHECK: [[SHUFF1:%.*]] = call <4 x float> @__mux_sub_group_shuffle_up_v4f32(
+; CHECK-SAME:                      <4 x float> [[LHS]], <4 x float> [[RHS]], i32 [[DELTA1]])
+; CHECK: [[VECIDX1:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 1
+; CHECK: [[ELT1:%.*]] = extractelement <4 x float> [[SHUFF1]], i32 [[VECIDX1]]
+
+; CHECK: [[DELTA2:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 2
+; CHECK: [[SHUFF2:%.*]] = call <4 x float> @__mux_sub_group_shuffle_up_v4f32(
+; CHECK-SAME:                      <4 x float> [[LHS]], <4 x float> [[RHS]], i32 [[DELTA2]])
+; CHECK: [[VECIDX2:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 2
+; CHECK: [[ELT2:%.*]] = extractelement <4 x float> [[SHUFF2]], i32 [[VECIDX2]]
+
+; CHECK: [[DELTA3:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 3
+; CHECK: [[SHUFF3:%.*]] = call <4 x float> @__mux_sub_group_shuffle_up_v4f32(
+; CHECK-SAME:                      <4 x float> [[LHS]], <4 x float> [[RHS]], i32 [[DELTA3]])
+; CHECK: [[VECIDX3:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 3
+; CHECK: [[ELT3:%.*]] = extractelement <4 x float> [[SHUFF3]], i32 [[VECIDX3]]
+define spir_kernel void @kernel_varying_delta(ptr %lhsptr, ptr %rhsptr, ptr %deltaptr, ptr %out) {
   %gid = tail call i64 @__mux_get_global_id(i32 0)
-  %arrayidx.in = getelementptr inbounds float, ptr %in, i64 %gid
-  %val = load float, ptr %arrayidx.in, align 8
-  %shuffle_up = call float @__mux_sub_group_shuffle_up_f32(float %val, float %val, i32 1)
+  %arrayidx.lhs = getelementptr inbounds float, ptr %lhsptr, i64 %gid
+  %lhs = load float, ptr %arrayidx.lhs, align 4
+  %arrayidx.rhs = getelementptr inbounds float, ptr %rhsptr, i64 %gid
+  %rhs = load float, ptr %arrayidx.rhs, align 4
+  %arrayidx.deltas = getelementptr inbounds i32, ptr %deltaptr, i64 %gid
+  %delta = load i32, ptr %arrayidx.deltas, align 4
+  %shuffle_up = call float @__mux_sub_group_shuffle_up_f32(float %lhs, float %rhs, i32 %delta)
   %arrayidx.out = getelementptr inbounds float, ptr %out, i64 %gid
   store float %shuffle_up, ptr %arrayidx.out, align 8
   ret void
@@ -34,3 +239,4 @@ define spir_kernel void @kernel(ptr %in, ptr %out) {
 declare i64 @__mux_get_global_id(i32)
 
 declare float @__mux_sub_group_shuffle_up_f32(float %prev, float %curr, i32 %delta)
+declare <4 x i8> @__mux_sub_group_shuffle_up_v4i8(<4 x i8> %prev, <4 x i8> %curr, i32 %delta)

From 643bc62946a88765d5271a6a87ac01ef91a3451c Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Tue, 7 Nov 2023 17:29:23 +0000
Subject: [PATCH 062/182] [vecz] Fix SquashSmallVectorsPass on larger vectors

This commit fixes the optimization when it is faced with
vectors/integers (of the same size) whose bit-width is greater than 64
bits in size, or whose vector elements are at least 32 bits in size.

When the elements were at least 32 bits in size, the logic would exhibit
UB and shift a 32-bit integer (`1`) left by 32. This was leading to
incorrect code generation (the masks would come out as `0` rather than
the intended `0xFFFF...`).

Furthermore, when the vectors are greater than 64 bits in size, the
masking logic would break down as LLVM can only be given constants that
are representable by `uint64_t` and so we wouldn't be able to correctly
represent the mask.

At the same time, the LIT test has been refactored to test only the
pass, as the output of the packetizer is largely uninteresting for this
test case.
---
 .../transform/squash_small_vectors_pass.cpp   |  24 ++--
 .../vecz/test/lit/llvm/squash_extract_zext.ll | 117 +++++++++++++-----
 2 files changed, 101 insertions(+), 40 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/squash_small_vectors_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/squash_small_vectors_pass.cpp
index 7b8d991c681f0..e17d2b592a18f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/squash_small_vectors_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/squash_small_vectors_pass.cpp
@@ -15,6 +15,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 #include <llvm/IR/PassManager.h>
+#include <llvm/Support/MathExtras.h>
 #include <multi_llvm/vector_type_helper.h>
 
 #include "analysis/stride_analysis.h"
@@ -170,14 +171,19 @@ PreservedAnalyses SquashSmallVectorsPass::run(Function &F,
         // vector size is the same as the extended integer size. That is (for
         // little-endian systems):
         //
-        // zext i32(extract <4 x i8> data, i32 3)
+        //   zext i32(extract <4 x i8> data, i32 3)
         //
-        // becomes
+        // becomes:
         //
-        // and(lshr(bitcast i32 data), i32 24), 0xFF)
+        //   and(lshr(bitcast i32 data), i32 24), 0xFF)
         //
-        // this avoids creating shufflevectors during packetization
+        // this avoids creating shufflevectors during packetization.
         //
+        // We limit this optimization to vectors no larger than 64 bits in
+        // size. This is primarily because this optimization focuses on 'small'
+        // vectors but also, because LLVM's constants are limited to 64-bit
+        // integers, the masking logic would need to be done with extra
+        // instructions.
         auto *const srcOp = zext->getOperand(0);
         if (auto *const extract = dyn_cast<ExtractElementInst>(srcOp)) {
           auto *const vector = extract->getVectorOperand();
@@ -186,7 +192,8 @@ PreservedAnalyses SquashSmallVectorsPass::run(Function &F,
           auto *const vecTy = vector->getType();
           if (vecTy->getPrimitiveSizeInBits() ==
                   intTy->getPrimitiveSizeInBits() &&
-              isa<ConstantInt>(indexOp)) {
+              zext->getSrcTy()->getPrimitiveSizeInBits() <= 32 &&
+              intTy->getScalarSizeInBits() <= 64 && isa<ConstantInt>(indexOp)) {
             IRBuilder<> B(zext);
             Value *element = getSquashed(vector, intTy, B);
 
@@ -209,9 +216,10 @@ PreservedAnalyses SquashSmallVectorsPass::run(Function &F,
                   B.CreateLShr(element, ConstantInt::get(intTy, shift),
                                Twine(extract->getName(), ".squashExtract"));
             }
-            element =
-                B.CreateAnd(element, ConstantInt::get(intTy, (1 << bits) - 1),
-                            Twine(zext->getName(), ".squashZExt"));
+            element = B.CreateAnd(
+                element,
+                ConstantInt::get(intTy, maskTrailingOnes<uint64_t>(bits)),
+                Twine(zext->getName(), ".squashZExt"));
 
             zext->replaceAllUsesWith(element);
             toErase.push_back(zext);
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext.ll
index 60aa309f7b0ec..4c4fd2a2b92d5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext.ll
@@ -14,18 +14,33 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: veczc -k squash -vecz-choices=TargetIndependentPacketization -vecz-passes="squash-small-vecs,function(dce),packetizer" -S < %s | FileCheck %s
+; RUN: veczc -vecz-passes=squash-small-vecs -S < %s | FileCheck %s
 
-; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-; Function Attrs: nounwind
-define spir_kernel void @squash(<4 x i8> addrspace(1)* %data, i32 addrspace(1)* %output) #0 {
+; It checks that the <4 x i8> is converted into a i32 and uses shifts and masks
+; to implement the extract elements and zexts.
+; CHECK: void @__vecz_v4_squashv4i8(
+; CHECK:  %[[DATA:.+]] = load <4 x i8>
+; CHECK:  %[[FREEZE:.+]] = freeze <4 x i8> %[[DATA]]
+; CHECK:  %[[SQUASH:.+]] = bitcast <4 x i8> %[[FREEZE]] to i32
+; CHECK:  %[[ZEXT0:.+]] = and i32 %[[SQUASH]], 255
+; CHECK:  %[[EXTR1:.+]] = lshr i32 %[[SQUASH]], 8
+; CHECK:  %[[ZEXT1:.+]] = and i32 %[[EXTR1]], 255
+; CHECK:  %[[EXTR2:.+]] = lshr i32 %[[SQUASH]], 16
+; CHECK:  %[[ZEXT2:.+]] = and i32 %[[EXTR2]], 255
+; CHECK:  %[[EXTR3:.+]] = lshr i32 %[[SQUASH]], 24
+; CHECK:  %[[ZEXT3:.+]] = and i32 %[[EXTR3]], 255
+; CHECK:  %[[SUM1:.+]] = add i32 %[[ZEXT0]], %[[ZEXT1]]
+; CHECK:  %[[SUM2:.+]] = xor i32 %[[SUM1]], %[[ZEXT2]]
+; CHECK:  %[[SUM3:.+]] = and i32 %[[SUM2]], %[[ZEXT3]]
+; CHECK:  ret void
+define spir_kernel void @squashv4i8(ptr addrspace(1) %data, ptr addrspace(1) %output) #0 {
 entry:
-  %gid = call i64 @__mux_get_global_id(i64 0) #2
-  %data.ptr = getelementptr inbounds <4 x i8>, <4 x i8> addrspace(1)* %data, i64 %gid
-  %data.ld = load <4 x i8>, <4 x i8> addrspace(1)* %data.ptr, align 8
+  %gid = call i64 @__mux_get_global_id(i64 0) #1
+  %data.ptr = getelementptr inbounds <4 x i8>, ptr addrspace(1) %data, i64 %gid
+  %data.ld = load <4 x i8>, ptr addrspace(1) %data.ptr, align 4
   %ele0 = extractelement <4 x i8> %data.ld, i32 0
   %ele1 = extractelement <4 x i8> %data.ld, i32 1
   %ele2 = extractelement <4 x i8> %data.ld, i32 2
@@ -37,33 +52,71 @@ entry:
   %sum1 = add i32 %zext0, %zext1
   %sum2 = xor i32 %sum1, %zext2
   %sum3 = and i32 %sum2, %zext3
-  %output.ptr = getelementptr inbounds i32, i32 addrspace(1)* %output, i64 %gid
-  store i32 %sum3, i32 addrspace(1)* %output.ptr, align 8
+  %output.ptr = getelementptr inbounds i32, ptr addrspace(1) %output, i64 %gid
+  store i32 %sum3, ptr addrspace(1) %output.ptr, align 4
+  ret void
+}
+
+; CHECK: void @__vecz_v4_squashv2i32(
+; CHECK:  %[[DATA:.+]] = load <2 x i32>
+; CHECK:  %[[FREEZE:.+]] = freeze <2 x i32> %[[DATA]]
+; CHECK:  %[[SQUASH:.+]] = bitcast <2 x i32> %[[FREEZE]] to i64
+; CHECK:  %[[ZEXT0:.+]] = and i64 %[[SQUASH]], 4294967295
+; CHECK:  %[[EXTR1:.+]] = lshr i64 %[[SQUASH]], 32
+; CHECK:  %[[ZEXT1:.+]] = and i64 %[[EXTR1]], 4294967295
+; CHECK:  %[[SUM1:.+]] = add i64 %[[ZEXT0]], %[[ZEXT1]]
+define spir_kernel void @squashv2i32(ptr addrspace(1) %data, ptr addrspace(1) %output) #0 {
+entry:
+  %gid = call i64 @__mux_get_global_id(i64 0) #1
+  %data.ptr = getelementptr inbounds <2 x i32>, ptr addrspace(1) %data, i64 %gid
+  %data.ld = load <2 x i32>, ptr addrspace(1) %data.ptr, align 4
+  %ele0 = extractelement <2 x i32> %data.ld, i32 0
+  %ele1 = extractelement <2 x i32> %data.ld, i32 1
+  %zext0 = zext i32 %ele0 to i64
+  %zext1 = zext i32 %ele1 to i64
+  %sum = add i64 %zext0, %zext1
+  %output.ptr = getelementptr inbounds i64, ptr addrspace(1) %output, i64 %gid
+  store i64 %sum, ptr addrspace(1) %output.ptr, align 4
+  ret void
+}
+
+; Check we don't squash vectors we consider too large.
+; CHECK: void @__vecz_v4_squashv8i32(
+; CHECK-NOT: bitcast
+define spir_kernel void @squashv8i32(ptr addrspace(1) %data, ptr addrspace(1) %output) #0 {
+entry:
+  %gid = call i64 @__mux_get_global_id(i64 0) #1
+  %data.ptr = getelementptr inbounds <8 x i32>, ptr addrspace(1) %data, i64 %gid
+  %data.ld = load <8 x i32>, ptr addrspace(1) %data.ptr, align 32
+  %ele0 = extractelement <8 x i32> %data.ld, i32 0
+  %ele1 = extractelement <8 x i32> %data.ld, i32 1
+  %zext0 = zext i32 %ele0 to i256
+  %zext1 = zext i32 %ele1 to i256
+  %sum = add i256 %zext0, %zext1
+  %output.ptr = getelementptr inbounds i256, ptr addrspace(1) %output, i64 %gid
+  store i256 %sum, ptr addrspace(1) %output.ptr, align 32
   ret void
 }
 
-declare i64 @__mux_get_global_id(i64) #1
+; Check we don't squash vectors we consider too large.
+; CHECK: void @__vecz_v4_squashv4i64(
+; CHECK-NOT: bitcast
+define spir_kernel void @squashv4i64(ptr addrspace(1) %data, ptr addrspace(1) %output) #0 {
+entry:
+  %gid = call i64 @__mux_get_global_id(i64 0) #1
+  %data.ptr = getelementptr inbounds <4 x i64>, ptr addrspace(1) %data, i64 %gid
+  %data.ld = load <4 x i64>, ptr addrspace(1) %data.ptr, align 32
+  %ele0 = extractelement <4 x i64> %data.ld, i32 0
+  %ele1 = extractelement <4 x i64> %data.ld, i32 1
+  %zext0 = zext i64 %ele0 to i256
+  %zext1 = zext i64 %ele1 to i256
+  %sum = add i256 %zext0, %zext1
+  %output.ptr = getelementptr inbounds i256, ptr addrspace(1) %output, i64 %gid
+  store i256 %sum, ptr addrspace(1) %output.ptr, align 32
+  ret void
+}
 
-attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nobuiltin nounwind }
+declare i64 @__mux_get_global_id(i64)
 
-; It checks that the <4 x i8> is converted into a i32 and uses shifts and masks
-; to implement the extract elements and zexts.
-;
-; CHECK: void @__vecz_v4_squash
-; CHECK:  %[[DATA:.+]] = load <16 x i8>
-; CHECK-NOT: shufflevector
-; CHECK:  %[[FREEZE:.+]] = freeze <16 x i8> %[[DATA]]
-; CHECK:  %[[SQUASH:.+]] = bitcast <16 x i8> %[[FREEZE]] to <4 x i32>
-; CHECK:  %[[ZEXT0:.+]] = and <4 x i32> %[[SQUASH]], <i32 255, i32 255, i32 255, i32 255>
-; CHECK:  %[[EXTR1:.+]] = lshr <4 x i32> %[[SQUASH]], <i32 8, i32 8, i32 8, i32 8>
-; CHECK:  %[[ZEXT1:.+]] = and <4 x i32> %[[EXTR1]], <i32 255, i32 255, i32 255, i32 255>
-; CHECK:  %[[EXTR2:.+]] = lshr <4 x i32> %[[SQUASH]], <i32 16, i32 16, i32 16, i32 16>
-; CHECK:  %[[ZEXT2:.+]] = and <4 x i32> %[[EXTR2]], <i32 255, i32 255, i32 255, i32 255>
-; CHECK:  %[[EXTR3:.+]] = lshr <4 x i32> %[[SQUASH]], <i32 24, i32 24, i32 24, i32 24>
-; CHECK:  %[[ZEXT3:.+]] = and <4 x i32> %[[EXTR3]], <i32 255, i32 255, i32 255, i32 255>
-; CHECK:  %[[SUM1:.+]] = add <4 x i32> %[[ZEXT0]], %[[ZEXT1]]
-; CHECK:  %[[SUM2:.+]] = xor <4 x i32> %[[SUM1]], %[[ZEXT2]]
-; CHECK:  %[[SUM3:.+]] = and <4 x i32> %[[SUM2]], %[[ZEXT3]]
-; CHECK:  ret void
+attributes #0 = { nounwind }
+attributes #1 = { nobuiltin nounwind }

From 45a665b5fa867a2836e952a96655d1c22d309edb Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross@codeplay.com>
Date: Tue, 7 Nov 2023 18:30:38 +0000
Subject: [PATCH 063/182] [LLVM] Multi_llvm support for current tip changes

This patch adds multi_llvm support for some changes that have been
made in upstream LLVM's main branch. This is not yet enough for the build
to succeed against tip, but it gets past some of the errors.

Specifically:
* `CodeGenFileType` and `CodeGenOptLevel` are now enum classes and
  require a struct to "fake" this in LLVM 17 and earlier.
* `createSimpleTargetReduction` no longer accepts the
  `TargetTransformInfo` parameter. A proxy function has been added
  which calls the appropriate version.
* Use `getVirtualFileRef` on the file manager. Seems to work in
  both LLVM 17 and tip.
---
 .../include/multi_llvm/enums.h                | 46 +++++++++++++++++++
 .../include/multi_llvm/loop_utils.h           | 37 +++++++++++++++
 .../include/multi_llvm/multi_llvm.h           |  2 +
 .../transform/packetization_helpers.cpp       |  2 +-
 .../source/transform/pre_linearize_pass.cpp   |  3 +-
 5 files changed, 88 insertions(+), 2 deletions(-)
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/enums.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/loop_utils.h

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/enums.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/enums.h
new file mode 100644
index 0000000000000..e44b7ac6da3a0
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/enums.h
@@ -0,0 +1,46 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#ifndef MULTI_LLVM_ENUMS_H_INCLUDED
+#define MULTI_LLVM_ENUMS_H_INCLUDED
+
+#include <llvm/Support/CodeGen.h>
+#include <multi_llvm/llvm_version.h>
+
+namespace multi_llvm {
+#if LLVM_VERSION_MAJOR >= 18
+
+typedef llvm::CodeGenFileType CodeGenFileType;
+typedef llvm::CodeGenOptLevel CodeGenOptLevel;
+
+#else
+
+struct CodeGenFileType {
+  static constexpr auto AssemblyFile = llvm::CGFT_AssemblyFile;
+  static constexpr auto ObjectFile = llvm::CGFT_ObjectFile;
+  static constexpr auto Null = llvm::CGFT_Null;
+};
+
+struct CodeGenOptLevel {
+  static constexpr auto None = llvm::CodeGenOpt::None;
+  static constexpr auto Less = llvm::CodeGenOpt::Less;
+  static constexpr auto Default = llvm::CodeGenOpt::Default;
+  static constexpr auto Aggressive = llvm::CodeGenOpt::Aggressive;
+};
+
+#endif
+}  // namespace multi_llvm
+
+#endif  // MULTI_LLVM_ENUMS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/loop_utils.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/loop_utils.h
new file mode 100644
index 0000000000000..24ff0aed579c4
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/loop_utils.h
@@ -0,0 +1,37 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#ifndef MULTI_LLVM_LOOP_UTILS_H_INCLUDED
+#define MULTI_LLVM_LOOP_UTILS_H_INCLUDED
+
+#include <llvm/Transforms/Utils/LoopUtils.h>
+#include <multi_llvm/llvm_version.h>
+
+namespace multi_llvm {
+
+inline llvm::Value *createSimpleTargetReduction(
+    llvm::IRBuilderBase &B, const llvm::TargetTransformInfo *TTI,
+    llvm::Value *Src, llvm::RecurKind RdxKind) {
+#if LLVM_VERSION_MAJOR >= 18
+  (void)TTI;
+  return llvm::createSimpleTargetReduction(B, Src, RdxKind);
+#else
+  return llvm::createSimpleTargetReduction(B, TTI, Src, RdxKind);
+#endif
+}
+
+}  // namespace multi_llvm
+
+#endif  // MULTI_LLVM_LOOP_UTILS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
index b20e29b63c90c..85ffd2f72715b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
@@ -17,7 +17,9 @@
 #ifndef MULTI_LLVM_MULTI_LLVM_H_INCLUDED
 #define MULTI_LLVM_MULTI_LLVM_H_INCLUDED
 
+#include <multi_llvm/enums.h>
 #include <multi_llvm/llvm_version.h>
+#include <multi_llvm/loop_utils.h>
 #include <multi_llvm/triple.h>
 
 #endif  // MULTI_LLVM_MULTI_LLVM_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
index 9c5c5c8530792..11e954e73bda2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
@@ -255,7 +255,7 @@ Value *createMaybeVPTargetReduction(IRBuilderBase &B,
   assert(isa<VectorType>(Val->getType()) && "Must be vector type");
   // If VL is null, it's not a vector-predicated reduction.
   if (!VL) {
-    return createSimpleTargetReduction(B, &TTI, Val, Kind);
+    return multi_llvm::createSimpleTargetReduction(B, &TTI, Val, Kind);
   }
   auto IntrinsicOp = Intrinsic::not_intrinsic;
   switch (Kind) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/pre_linearize_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/pre_linearize_pass.cpp
index 28108ced6eff2..54fd680166ce8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/pre_linearize_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/pre_linearize_pass.cpp
@@ -120,7 +120,8 @@ unsigned calculateBoolReductionCost(LLVMContext &context, Module *module,
   auto *F = Function::Create(new_fty, Function::InternalLinkage, "tmp", module);
   auto *BB = BasicBlock::Create(context, "reduce", F);
   IRBuilder<> B(BB);
-  createSimpleTargetReduction(B, &TTI, &*F->arg_begin(), RecurKind::And);
+  multi_llvm::createSimpleTargetReduction(B, &TTI, &*F->arg_begin(),
+                                          RecurKind::And);
   unsigned cost = calculateBlockCost(*BB, TTI);
 
   // We don't really need that function in the module anymore because it's

From 319e7dc517b64b5157d1e4e8c136a4c198b62100 Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross@codeplay.com>
Date: Wed, 22 Nov 2023 16:56:17 +0000
Subject: [PATCH 064/182] [vecz] Update tests for LLVM tip and fix alignment
 optimization

This change fixes a number of tests that were broken in LLVM tip:
* zext instructions now have an optional nneg flag which indicates
  that their operand is non-negative. Tests have been updated to
  optionally read (and ignore) this flag. Since vecz doesn't
  actually set this flag (it's set by InstCombiner) I assume that
  it is being set correctly.
* LLVM added a new InferAlignmentPass which does alignment
  inference that was previously done by InstCombine. This is now
  included in the main vecz pipeline when using LLVM18.
* Due to the above changes in alignment handling, the
  emit_memintrinsics.ll test no longer checks alignment.
* uniform_loop.ll's checks were made more generic. Tip LLVM
  versions are able to pull `%load` out of the loop (which seems
  to be sound).
---
 .../vecz/source/vecz_pass_builder.cpp         |   9 ++
 .../test/lit/llvm/RISCV/broadcast_vector.ll   |   2 +-
 .../llvm/ScalableVectors/broadcast_vector.ll  |   8 +-
 .../ScalableVectors/packetize_mask_varying.ll |   2 +-
 .../ScalableVectors/select_scalar_vector.ll   |   2 +-
 .../test/lit/llvm/ScalableVectors/shuffle.ll  |   2 +-
 .../vecz/test/lit/llvm/emit_memintrinsics.ll  | 119 +++++++++---------
 .../vecz/test/lit/llvm/uniform_loop.ll        |   6 +-
 8 files changed, 82 insertions(+), 68 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vecz_pass_builder.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vecz_pass_builder.cpp
index 39f4910bcb184..822772dbe030b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vecz_pass_builder.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vecz_pass_builder.cpp
@@ -61,6 +61,7 @@
 #include "analysis/vectorizable_function_analysis.h"
 #include "analysis/vectorization_unit_analysis.h"
 #include "debugging.h"
+#include "multi_llvm/llvm_version.h"
 #include "transform/common_gep_elimination_pass.h"
 #include "transform/control_flow_conversion_pass.h"
 #include "transform/inline_post_vectorization_pass.h"
@@ -71,6 +72,10 @@
 #include "transform/scalarization_pass.h"
 #include "transform/ternary_transform_pass.h"
 
+#if LLVM_VERSION_GREATER_EQUAL(18, 0)
+#include <llvm/Transforms/Scalar/InferAlignment.h>
+#endif
+
 #define DEBUG_TYPE "vecz"
 using namespace llvm;
 using namespace vecz;
@@ -252,6 +257,10 @@ bool vecz::buildPassPipeline(ModulePassManager &PM) {
   PM.addPass(createModuleToFunctionPassAdaptor(
       InterleavedGroupCombinePass(eInterleavedLoad)));
   PM.addPass(createModuleToFunctionPassAdaptor(InstCombinePass()));
+#if LLVM_VERSION_GREATER_EQUAL(18, 0)
+  // LLVM 18 split this pass out of InstCombine
+  PM.addPass(createModuleToFunctionPassAdaptor(InferAlignmentPass()));
+#endif
   PM.addPass(createModuleToFunctionPassAdaptor(DCEPass()));
   PM.addPass(createModuleToFunctionPassAdaptor(SimplifyMaskedMemOpsPass()));
   PM.addPass(DefineInternalBuiltinsPass());
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/broadcast_vector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/broadcast_vector.ll
index eb72c2d7a195a..387eee059943f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/broadcast_vector.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/broadcast_vector.ll
@@ -139,7 +139,7 @@ entry:
 ; CHECK-NEXT:    store <32 x float> [[ADDEND:%.*]], ptr [[FIXLEN_ALLOC]], align 128
 ; CHECK-NEXT:    [[IDX0:%.*]] = call <vscale x 128 x i32> @llvm.experimental.stepvector.nxv128i32()
 ; CHECK-NEXT:    [[IDX1:%.*]] = and <vscale x 128 x i32> [[IDX0]], shufflevector (<vscale x 128 x i32> insertelement (<vscale x 128 x i32> {{(undef|poison)}}, i32 31, {{i32|i64}} 0), <vscale x 128 x i32> {{(undef|poison)}}, <vscale x 128 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP0:%.*]] = {{s|z}}ext <vscale x 128 x i32> [[IDX1]] to <vscale x 128 x i64>
+; CHECK-NEXT:    [[TMP0:%.*]] = {{s|z}}ext{{( nneg)?}} <vscale x 128 x i32> [[IDX1]] to <vscale x 128 x i64>
 ; CHECK-NEXT:    [[VEC_ALLOC:%.*]] = getelementptr inbounds float, ptr [[FIXLEN_ALLOC]], <vscale x 128 x i64> [[TMP0]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 128 x float> @llvm.masked.gather.nxv128f32.nxv128p0(<vscale x 128 x ptr> [[VEC_ALLOC]], i32 4, <vscale x 128 x i1> shufflevector (<vscale x 128 x i1> insertelement (<vscale x 128 x i1> poison, i1 true, {{i32|i64}} 0), <vscale x 128 x i1> poison, <vscale x 128 x i32> zeroinitializer), <vscale x 128 x float> undef)
 ; CHECK-NEXT:    [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll
index 9c1eee69775f4..58f417ffbc180 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll
@@ -110,7 +110,7 @@ entry:
 ; CHECK-NEXT:    store <4 x float> [[ADDEND:%.*]], ptr [[FIXLEN_ALLOC]], align 16
 ; CHECK-NEXT:    [[IDX0:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
 ; CHECK-NEXT:    [[IDX1:%.*]] = and <vscale x 16 x i32> [[IDX0]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 3, {{(i32|i64)}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP0:%.*]] = {{s|z}}ext <vscale x 16 x i32> [[IDX1]] to <vscale x 16 x i64>
+; CHECK-NEXT:    [[TMP0:%.*]] = {{s|z}}ext{{( nneg)?}} <vscale x 16 x i32> [[IDX1]] to <vscale x 16 x i64>
 ; CHECK-NEXT:    [[VEC_ALLOC:%.*]] = getelementptr inbounds float, ptr [[FIXLEN_ALLOC]], <vscale x 16 x i64> [[TMP0]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 16 x float> @llvm.masked.gather.nxv16f32.nxv16p0(<vscale x 16 x ptr> [[VEC_ALLOC]], i32 4, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, {{(i32|i64)}} 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x float> undef)
 ; CHECK-NEXT:    [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0)
@@ -146,7 +146,7 @@ entry:
 ; CHECK-NEXT:    store <4 x float> [[ADDEND:%.*]], ptr [[FIXLEN_ALLOC1]], align 16
 ; CHECK-NEXT:    [[IDX03:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
 ; CHECK-NEXT:    [[IDX14:%.*]] = and <vscale x 16 x i32> [[IDX03]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 3, {{i32|i64}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP0:%.*]] = {{s|z}}ext <vscale x 16 x i32> [[IDX14]] to <vscale x 16 x i64>
+; CHECK-NEXT:    [[TMP0:%.*]] = {{s|z}}ext{{( nneg)?}} <vscale x 16 x i32> [[IDX14]] to <vscale x 16 x i64>
 ; CHECK-NEXT:    [[VEC_ALLOC5:%.*]] = getelementptr inbounds float, ptr [[FIXLEN_ALLOC1]], <vscale x 16 x i64> [[TMP0]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 16 x float> @llvm.masked.gather.nxv16f32.nxv16p0(<vscale x 16 x ptr> [[VEC_ALLOC5]], i32 4, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, {{i32|i64}} 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x float> {{(undef|poison)}})
 ; CHECK-NEXT:    [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0)
@@ -154,7 +154,7 @@ entry:
 ; CHECK-NEXT:    store i32 1, ptr [[EXISTING_ALLOC]], align 16
 ; CHECK-NEXT:    [[V:%.*]] = load <4 x i32>, ptr [[EXISTING_ALLOC]], align 16
 ; CHECK-NEXT:    store <4 x i32> [[V]], ptr [[FIXLEN_ALLOC]], align 16
-; CHECK-NEXT:    [[TMP2:%.*]] = {{s|z}}ext <vscale x 16 x i32> [[IDX14]] to <vscale x 16 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = {{s|z}}ext{{( nneg)?}} <vscale x 16 x i32> [[IDX14]] to <vscale x 16 x i64>
 ; CHECK-NEXT:    [[VEC_ALLOC:%.*]] = getelementptr inbounds i32, ptr [[FIXLEN_ALLOC]], <vscale x 16 x i64> [[TMP2]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 16 x i32> @llvm.masked.gather.nxv16i32.nxv16p0(<vscale x 16 x ptr> [[VEC_ALLOC]], i32 4, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, {{i32|i64}} 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x i32> {{(undef|poison)}})
 ; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[OUT2:%.*]], i64 [[CALL]]
@@ -173,7 +173,7 @@ entry:
 ; CHECK:    [[IDX1:%.*]] = and <vscale x 16 x i32> [[IDX0]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 3, {{i32|i64}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
 ; CHECK:    [[SEXT:%.*]] = sext <4 x i1> [[INPUT:%.*]] to <4 x i8>
 ; CHECK:    store <4 x i8> [[SEXT]], ptr [[FIXLEN_MASK_ALLOC]], align 4
-; CHECK:    [[TMP0:%.*]] = {{s|z}}ext <vscale x 16 x i32> [[IDX1]] to <vscale x 16 x i64>
+; CHECK:    [[TMP0:%.*]] = {{s|z}}ext{{( nneg)?}} <vscale x 16 x i32> [[IDX1]] to <vscale x 16 x i64>
 ; CHECK:    [[VEC_ALLOC:%.*]] = getelementptr inbounds i8, ptr [[FIXLEN_MASK_ALLOC]], <vscale x 16 x i64> [[TMP0]]
 ; CHECK:    [[TMP1:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[VEC_ALLOC]], i32 1, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, {{i32|i64}} 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x i8> {{(undef|poison)}})
 ; CHECK:    [[BMASK:%.*]] = trunc <vscale x 16 x i8> [[TMP1]] to <vscale x 16 x i1>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/packetize_mask_varying.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/packetize_mask_varying.ll
index df0d0db831545..b28ea35060d36 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/packetize_mask_varying.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/packetize_mask_varying.ll
@@ -43,7 +43,7 @@ if.end:
 
 ; Note that since we just did a lshr 2 on the input of the extend, it doesn't
 ; make any difference whether it's a zext or sext, but LLVM 16 prefers zext.
-; CHECK: [[idx2:%.*]] = {{s|z}}ext <vscale x 16 x i32> [[idx1]] to <vscale x 16 x i64>
+; CHECK: [[idx2:%.*]] = {{s|z}}ext{{( nneg)?}} <vscale x 16 x i32> [[idx1]] to <vscale x 16 x i64>
 
 ; CHECK: [[t1:%.*]] = getelementptr inbounds i8, ptr {{.*}}, <vscale x 16 x i64> [[idx2]]
 ; CHECK: [[t2:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[t1]],
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select_scalar_vector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select_scalar_vector.ll
index 85dd9e30fe409..d6242cfe70392 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select_scalar_vector.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select_scalar_vector.ll
@@ -48,7 +48,7 @@ entry:
 
 ; Note that since we just did a lshr 1 on the input of the extend, it doesn't
 ; make any difference whether it's a zext or sext, but LLVM 16 prefers zext.
-; CHECK: [[sext2:%.*]] = {{s|z}}ext <vscale x 8 x i32> [[idx1]] to <vscale x 8 x i64>
+; CHECK: [[sext2:%.*]] = {{s|z}}ext{{( nneg)?}} <vscale x 8 x i32> [[idx1]] to <vscale x 8 x i64>
 
 ; CHECK: [[addrs:%.*]] = getelementptr inbounds i8, ptr [[alloc]], <vscale x 8 x i64> [[sext2]]
 ; CHECK: [[gather:%.*]] = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0(<vscale x 8 x ptr> [[addrs]],
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/shuffle.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/shuffle.ll
index 3cdae586545ad..0c4a3b5a5b1d9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/shuffle.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/shuffle.ll
@@ -37,7 +37,7 @@ define spir_kernel void @do_shuffle_splat(i32* %aptr, <4 x i32>* %bptr, <4 x i32
 
 ; Note that since we just did a lshr 2 on the input of the extend, it doesn't
 ; make any difference whether it's a zext or sext, but LLVM 16 prefers zext.
-; CHECK: [[idx2:%.*]] = {{s|z}}ext <vscale x 16 x i32> [[idx1]] to <vscale x 16 x i64>
+; CHECK: [[idx2:%.*]] = {{s|z}}ext{{( nneg)?}} <vscale x 16 x i32> [[idx1]] to <vscale x 16 x i64>
 
 ; CHECK: [[alloc:%.*]] = getelementptr inbounds i32, ptr %{{.*}}, <vscale x 16 x i64> [[idx2]]
 ; CHECK: [[splat:%.*]] = call <vscale x 16 x i32> @llvm.masked.gather.nxv16i32.nxv16p0(<vscale x 16 x ptr> [[alloc]],
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_memintrinsics.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_memintrinsics.ll
index 170c3ae5fd090..1b37af3f30ab0 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_memintrinsics.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_memintrinsics.ll
@@ -65,6 +65,11 @@ declare void @llvm.memcpy.p0i8.p1i8.i64(i8* nocapture, i8 addrspace(1)* nocaptur
 
 declare i64 @__mux_get_local_id(i32)
 
+; Note: Between LLVM 17 and LLVM 18, optimizations to alignments were moved to
+; their own pass. We don't run that pass here, resulting in a difference in
+; alignment values between LLVM versions. Because of that, we don't check
+; alignment of any loads or stores
+
 ; Sanity checks: Make sure the non-vecz entry function is still in place and
 ; contains memset and memcpy. This is done in order to prevent future bafflement
 ; in case some pass optimizes them out.
@@ -92,104 +97,104 @@ declare i64 @__mux_get_local_id(i32)
 
 ; Check if the generated loads and stores are in place
 ; Check the stores for the first memset
-; CHECK: store i64 %ms64val, ptr %sa, align 16
+; CHECK: store i64 %ms64val, ptr %sa
 ; CHECK: %[[V14:[0-9]+]] = getelementptr inbounds i8, ptr %sa, i64 8
-; CHECK: store i64 %ms64val, ptr %[[V14]], align 8
+; CHECK: store i64 %ms64val, ptr %[[V14]]
 ; CHECK: %[[V15:[0-9]+]] = getelementptr inbounds i8, ptr %sa, i64 16
-; CHECK: store i64 %ms64val, ptr %[[V15]], align {{(8|16)}}
+; CHECK: store i64 %ms64val, ptr %[[V15]]
 ; CHECK: %[[V16:[0-9]+]] = getelementptr inbounds i8, ptr %sa, i64 24
-; CHECK: store i64 %ms64val, ptr %[[V16]], align 8
+; CHECK: store i64 %ms64val, ptr %[[V16]]
 ; CHECK: %[[V17:[0-9]+]] = getelementptr inbounds i8, ptr %sa, i64 32
-; CHECK: store i64 %ms64val, ptr %[[V17]], align 16
+; CHECK: store i64 %ms64val, ptr %[[V17]]
 ; CHECK: %[[V18:[0-9]+]] = getelementptr inbounds i8, ptr %sa, i64 40
-; CHECK: store i64 %ms64val, ptr %[[V18]], align 8
+; CHECK: store i64 %ms64val, ptr %[[V18]]
 ; CHECK: %[[V19:[0-9]+]] = getelementptr inbounds i8, ptr %sa, i64 48
-; CHECK: store i64 %ms64val, ptr %[[V19]], align 16
+; CHECK: store i64 %ms64val, ptr %[[V19]]
 ; CHECK: %[[V20:[0-9]+]] = getelementptr inbounds i8, ptr %sa, i64 56
 ; CHECK-EQ14: %[[V20:[0-9]+]] = getelementptr inbounds %struct.S2, %struct.S2* %sa, i64 0, i32 3, i64 8
 ; CHECK: %[[V21:[0-9]+]] = getelementptr inbounds i8, ptr %sa, i64 64
 ; CHECK: %[[V22:[0-9]+]] = getelementptr inbounds i8, ptr %sa, i64 72
 
 ; Check the stores for the second memset
-; CHECK: store i64 0, ptr addrspace(1) %[[SB_I8AS]], align 16
+; CHECK: store i64 0, ptr addrspace(1) %[[SB_I8AS]]
 ; CHECK: %[[V24:[0-9]+]] = getelementptr inbounds i8, ptr addrspace(1) %[[SB_I8AS]], i64 8
-; CHECK: store i64 0, ptr addrspace(1) %[[V24]], align 8
+; CHECK: store i64 0, ptr addrspace(1) %[[V24]]
 ; CHECK: %[[V26:[0-9]+]] = getelementptr inbounds i8, ptr addrspace(1) %[[SB_I8AS]], i64 16
-; CHECK: store i64 0, ptr addrspace(1) %[[V26]], align 8
+; CHECK: store i64 0, ptr addrspace(1) %[[V26]]
 ; CHECK: %[[V28:[0-9]+]] = getelementptr inbounds i8, ptr addrspace(1) %[[SB_I8AS]], i64 24
-; CHECK: store i64 0, ptr addrspace(1) %[[V28]], align 8
+; CHECK: store i64 0, ptr addrspace(1) %[[V28]]
 ; CHECK: %[[V30:[0-9]+]] = getelementptr inbounds i8, ptr addrspace(1) %[[SB_I8AS]], i64 32
-; CHECK: store i64 0, ptr addrspace(1) %[[V30]], align 8
+; CHECK: store i64 0, ptr addrspace(1) %[[V30]]
 ; CHECK: %[[V32:[0-9]+]] = getelementptr inbounds i8, ptr addrspace(1) %[[SB_I8AS]], i64 40
-; CHECK: store i64 0, ptr addrspace(1) %[[V32]], align 8
+; CHECK: store i64 0, ptr addrspace(1) %[[V32]]
 ; CHECK: %[[V33:[0-9]+]] = getelementptr inbounds i8, ptr addrspace(1) %[[SB_I8AS]], i64 48
-; CHECK: store i64 0, ptr addrspace(1) %[[V33]], align 8
+; CHECK: store i64 0, ptr addrspace(1) %[[V33]]
 ; CHECK: %[[V35T:[0-9]+]] = getelementptr inbounds i8, ptr addrspace(1) %[[SB_I8AS]], i64 56
 ; CHECK-EQ14: %[[V35T:[0-9]+]] = getelementptr inbounds %struct.S2, %struct.S2* %sb, i64 0, i32 3, i64 8
 ; CHECK-EQ14: %[[V35:[0-9]+]] = bitcast i8* %[[V35T]] to i64*
 ; CHECK-EQ14: %[[SB_I8AS18:.+]] = addrspacecast i64* %[[V35]] to i64 addrspace(1)*
-; CHECK: store i64 0, ptr addrspace(1) %[[V35T]], align 8
+; CHECK: store i64 0, ptr addrspace(1) %[[V35T]]
 ; CHECK: %[[V36:[0-9]+]] = getelementptr inbounds i8, ptr addrspace(1) %[[SB_I8AS]], i64 64
-; CHECK: store i64 0, ptr addrspace(1) %[[V36]], align 8
+; CHECK: store i64 0, ptr addrspace(1) %[[V36]]
 ; CHECK: %[[V38:[0-9]+]] = getelementptr inbounds i8, ptr addrspace(1) %[[SB_I8AS]], i64 72
-; CHECK: store i64 0, ptr addrspace(1) %[[V38]], align 8
+; CHECK: store i64 0, ptr addrspace(1) %[[V38]]
 
 
 ; Check the loads and stores for the first memcpy
 ; CHECK:middle:                                           ; preds = %entry
-; CHECK: %[[SA_I822:.+]] = load i64, ptr %sa, align 16
-; CHECK: store i64 %[[SA_I822]], ptr addrspace(1) %[[SB_I8AS]], align 16
-; CHECK: %[[SA_I824:.+]] = load i64, ptr %[[V14]], align 8
-; CHECK: store i64 %[[SA_I824]], ptr addrspace(1) %[[V24]], align 8
-; CHECK: %[[SA_I826:.+]] = load i64, ptr %[[V15]], align {{(8|16)}}
-; CHECK: store i64 %[[SA_I826]], ptr addrspace(1) %[[V26]], align 8
-; CHECK: %[[SA_I828:.+]] = load i64, ptr %[[V16]], align 8
-; CHECK: store i64 %[[SA_I828]], ptr addrspace(1) %[[V28]], align 8
-; CHECK: %[[SA_I830:.+]] = load i64, ptr %[[V17]], align 16
-; CHECK: store i64 %[[SA_I830]], ptr addrspace(1) %[[V30]], align 8
-; CHECK: %[[SA_I832:.+]] = load i64, ptr %[[V18]], align 8
-; CHECK: store i64 %[[SA_I832]], ptr addrspace(1) %[[V32]], align 8
-; CHECK: %[[SA_I834:.+]] = load i64, ptr %[[V19]], align 16
-; CHECK: store i64 %[[SA_I834]], ptr addrspace(1) %[[V33]], align 8
-; CHECK: %[[SA_I836:.+]] = load i64, ptr %[[V20]], align 8
-; CHECK: store i64 %[[SA_I836]], ptr addrspace(1) %[[V35T]], align 8
-; CHECK: %[[SA_I838:.+]] = load i64, ptr %[[V21]], align 16
-; CHECK: store i64 %[[SA_I838]], ptr addrspace(1) %[[V36]], align 8
-; CHECK: %[[SA_I840:.+]] = load i64, ptr %[[V22]], align 8
-; CHECK: store i64 %[[SA_I840]], ptr addrspace(1) %[[V38]], align 8
+; CHECK: %[[SA_I822:.+]] = load i64, ptr %sa
+; CHECK: store i64 %[[SA_I822]], ptr addrspace(1) %[[SB_I8AS]]
+; CHECK: %[[SA_I824:.+]] = load i64, ptr %[[V14]]
+; CHECK: store i64 %[[SA_I824]], ptr addrspace(1) %[[V24]]
+; CHECK: %[[SA_I826:.+]] = load i64, ptr %[[V15]]
+; CHECK: store i64 %[[SA_I826]], ptr addrspace(1) %[[V26]]
+; CHECK: %[[SA_I828:.+]] = load i64, ptr %[[V16]]
+; CHECK: store i64 %[[SA_I828]], ptr addrspace(1) %[[V28]]
+; CHECK: %[[SA_I830:.+]] = load i64, ptr %[[V17]]
+; CHECK: store i64 %[[SA_I830]], ptr addrspace(1) %[[V30]]
+; CHECK: %[[SA_I832:.+]] = load i64, ptr %[[V18]]
+; CHECK: store i64 %[[SA_I832]], ptr addrspace(1) %[[V32]]
+; CHECK: %[[SA_I834:.+]] = load i64, ptr %[[V19]]
+; CHECK: store i64 %[[SA_I834]], ptr addrspace(1) %[[V33]]
+; CHECK: %[[SA_I836:.+]] = load i64, ptr %[[V20]]
+; CHECK: store i64 %[[SA_I836]], ptr addrspace(1) %[[V35T]]
+; CHECK: %[[SA_I838:.+]] = load i64, ptr %[[V21]]
+; CHECK: store i64 %[[SA_I838]], ptr addrspace(1) %[[V36]]
+; CHECK: %[[SA_I840:.+]] = load i64, ptr %[[V22]]
+; CHECK: store i64 %[[SA_I840]], ptr addrspace(1) %[[V38]]
 
 ; Check the loads and stores for the second memcpy
 ; CHECK:end:                                              ; preds = %middle, %entry
-; CHECK: %[[SB_I8AS42:.+]] = load i64, ptr addrspace(1) %[[SB_I8AS]], align 16
-; CHECK: store i64 %[[SB_I8AS42]], ptr %result2, align 16
+; CHECK: %[[SB_I8AS42:.+]] = load i64, ptr addrspace(1) %[[SB_I8AS]]
+; CHECK: store i64 %[[SB_I8AS42]], ptr %result2
 ; CHECK: %[[V42:[0-9]+]] = getelementptr inbounds i8, ptr %result2, i64 8
-; CHECK: %[[SB_I8AS44:.+]] = load i64, ptr addrspace(1) %[[V24]], align 8
-; CHECK: store i64 %[[SB_I8AS44]], ptr %[[V42]], align 8
+; CHECK: %[[SB_I8AS44:.+]] = load i64, ptr addrspace(1) %[[V24]]
+; CHECK: store i64 %[[SB_I8AS44]], ptr %[[V42]]
 ; CHECK: %[[V43:[0-9]+]] = getelementptr inbounds i8, ptr %result2, i64 16
-; CHECK: %[[SB_I8AS46:.+]] = load i64, ptr addrspace(1) %[[V26]], align 8
-; CHECK: store i64 %[[SB_I8AS46]], ptr %[[V43]], align 8
+; CHECK: %[[SB_I8AS46:.+]] = load i64, ptr addrspace(1) %[[V26]]
+; CHECK: store i64 %[[SB_I8AS46]], ptr %[[V43]]
 ; CHECK: %[[V44:[0-9]+]] = getelementptr inbounds i8, ptr %result2, i64 24
-; CHECK: %[[SB_I8AS48:.+]] = load i64, ptr addrspace(1) %[[V28]], align 8
-; CHECK: store i64 %[[SB_I8AS48]], ptr %[[V44]], align 8
+; CHECK: %[[SB_I8AS48:.+]] = load i64, ptr addrspace(1) %[[V28]]
+; CHECK: store i64 %[[SB_I8AS48]], ptr %[[V44]]
 ; CHECK: %[[V45:[0-9]+]] = getelementptr inbounds i8, ptr %result2, i64 32
-; CHECK: %[[SB_I8AS50:.+]] = load i64, ptr addrspace(1) %[[V30]], align 8
-; CHECK: store i64 %[[SB_I8AS50]], ptr %[[V45]], align 8
+; CHECK: %[[SB_I8AS50:.+]] = load i64, ptr addrspace(1) %[[V30]]
+; CHECK: store i64 %[[SB_I8AS50]], ptr %[[V45]]
 ; CHECK: %[[V46:[0-9]+]] = getelementptr inbounds i8, ptr %result2, i64 40
-; CHECK: %[[SB_I8AS52:.+]] = load i64, ptr addrspace(1) %[[V32]], align 8
-; CHECK: store i64 %[[SB_I8AS52]], ptr %[[V46]], align 8
+; CHECK: %[[SB_I8AS52:.+]] = load i64, ptr addrspace(1) %[[V32]]
+; CHECK: store i64 %[[SB_I8AS52]], ptr %[[V46]]
 ; CHECK: %[[V47:[0-9]+]] = getelementptr inbounds i8, ptr %result2, i64 48
-; CHECK: %[[SB_I8AS54:.+]] = load i64, ptr addrspace(1) %[[V33]], align 8
-; CHECK: store i64 %[[SB_I8AS54]], ptr %[[V47]], align 8
+; CHECK: %[[SB_I8AS54:.+]] = load i64, ptr addrspace(1) %[[V33]]
+; CHECK: store i64 %[[SB_I8AS54]], ptr %[[V47]]
 ; CHECK: %[[V48:[0-9]+]] = getelementptr inbounds i8, ptr %result2, i64 56
 ; CHECK-EQ14: %[[V48:[0-9]+]] = getelementptr inbounds %struct.S2, %struct.S2* %result2, i64 0, i32 3, i64 8
-; CHECK: %[[SB_I8AS56:.+]] = load i64, ptr addrspace(1) %[[V35T]], align 8
-; CHECK: store i64 %[[SB_I8AS56]], ptr %[[V48]], align 8
+; CHECK: %[[SB_I8AS56:.+]] = load i64, ptr addrspace(1) %[[V35T]]
+; CHECK: store i64 %[[SB_I8AS56]], ptr %[[V48]]
 ; CHECK: %[[V49:[0-9]+]] = getelementptr inbounds i8, ptr %result2, i64 64
-; CHECK: %[[SB_I8AS58:.+]] = load i64, ptr addrspace(1) %[[V36]], align 8
-; CHECK: store i64 %[[SB_I8AS58]], ptr %[[V49]], align 8
+; CHECK: %[[SB_I8AS58:.+]] = load i64, ptr addrspace(1) %[[V36]]
+; CHECK: store i64 %[[SB_I8AS58]], ptr %[[V49]]
 ; CHECK: %[[V50:[0-9]+]] = getelementptr inbounds i8, ptr %result2, i64 72
-; CHECK: %[[SB_I8AS60:.+]] = load i64, ptr addrspace(1) %[[V38]], align 8
-; CHECK: store i64 %[[SB_I8AS60]], ptr %[[V50]], align 8
+; CHECK: %[[SB_I8AS60:.+]] = load i64, ptr addrspace(1) %[[V38]]
+; CHECK: store i64 %[[SB_I8AS60]], ptr %[[V50]]
 
 ; End of function
 ; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop.ll
index 43c60eb882ea7..ca7a8c7a4cbe5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop.ll
@@ -40,6 +40,6 @@ merge:
 }
 
 ; CHECK: define spir_kernel void @__vecz_v4_test
-; CHECK: loop:
-; CHECK: %load = load i32, ptr addrspace(1) %in
-; CHECK: store i32 %load, ptr addrspace(1) %slot
+; CHECK-NOT: define spir_kernel void @test
+; CHECK: %[[LOAD:load.*]] = load i32, ptr addrspace(1) %in
+; CHECK: store i32 %[[LOAD]], ptr addrspace(1) %slot

From b867868859da5b5e6afa4c5b208fbc976dafe0ad Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Thu, 23 Nov 2023 17:20:57 +0000
Subject: [PATCH 065/182] [vecz] Suppress cert-err33-c on error-handling
 fprintf

---
 .../compiler_passes/vecz/tools/source/veczc.cpp        | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp
index 59e17c72c35ce..d60b154774b13 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp
@@ -133,8 +133,8 @@ static llvm::TargetMachine *initLLVMTarget(llvm::StringRef triple_string,
   const llvm::Target *target =
       llvm::TargetRegistry::lookupTarget(triple.getTriple(), e);
   if (!target) {
-    ::fprintf(stderr, "can't get target %s:%s\n", triple.getTriple().c_str(),
-              e.c_str());
+    (void)::fprintf(stderr, "can't get target %s:%s\n",
+                    triple.getTriple().c_str(), e.c_str());
     ::exit(1);
   }
   llvm::PassRegistry &registry = *llvm::PassRegistry::getPassRegistry();
@@ -331,9 +331,9 @@ int main(const int argc, const char *const argv[]) {
       llvm::StringRef name;
       llvm::SmallVector<vecz::VeczPassOptions, 1> opts;
       if (!parsePassOptionsSwitch(S, name, opts)) {
-        fprintf(stderr,
-                "failed to parse kernel vectorization specification%s\n",
-                name.str().c_str());
+        (void)::fprintf(
+            stderr, "failed to parse kernel vectorization specification%s\n",
+            name.str().c_str());
         return 1;
       }
       if (!module->getFunction(name)) {

From f98837395f3ad699089028d448729385ddbe0991 Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Tue, 5 Dec 2023 14:25:40 +0000
Subject: [PATCH 066/182] More LLVM 18 fixups.

* LLVM 18 moves clang::CodeGenOptions::VectorLibrary to
  llvm::driver::VectorLibrary. Use decltype to handle both.
* LLVM 18 drops CallingConv::WebKit_JS.
* LLVM 18 drops <llvm/Transforms/Vectorize.h> which we include
  needlessly.
* LLVM 18 requires us to link in libLLVMFrontendDriver,
  libclangAPINotes, libclangBasic.
* LLVM 18 adds a disjoint flag to or instructions which we need to
  account for in tests.
* LLVM 18 moves <llvm/Support/Host.h> to <llvm/TargetParser/Host.h>.
* LLVM 18 is able to infer that we could potentially end up with a
  subvector size of zero, in which case we would end up with a division
  by zero. A subvector size of zero would be a bug elsewhere in OCK, so
  add an assert that it is not zero.
---
 .../compiler_passes/vecz/source/transform/packetizer.cpp      | 1 +
 .../vecz/test/lit/llvm/ScalableVectors/extract_element.ll     | 4 ++--
 .../vecz/test/lit/llvm/ScalableVectors/insert_element.ll      | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll   | 2 +-
 4 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
index 071d72773f5bc..8615245f82c4c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -1295,6 +1295,7 @@ Value *Packetizer::Impl::packetizeGroupBroadcast(Instruction *I) {
     op.getPacketValues(opPackets);
     auto factor = SimdWidth.divideCoefficientBy(opPackets.size());
     const unsigned subvecSize = factor.getFixedValue();
+    assert(subvecSize > 0 && "Subvector size cannot be zero");
     const unsigned idxVal = cast<ConstantInt>(vecIdx)->getZExtValue();
     // If individual elements are scalar (through instantiation, say) then just
     // use the desired packet directly.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/extract_element.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/extract_element.ll
index d90c4e788f336..27f7b54eb6442 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/extract_element.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/extract_element.ll
@@ -118,7 +118,7 @@ entry:
 ; EE-UNI-VEC: [[T6:%.*]] = shl <vscale x 4 x i64> [[STEP]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> {{(undef|poison)}}, i64 2, {{(i32|i64)}} 0), <vscale x 4 x i64> {{(undef|poison)}}, <vscale x 4 x i32> zeroinitializer)
 
 ; LLVM 16 deduces add/or equivalence and uses `or` instead.
-; EE-UNI-VEC: [[T7:%.*]] = {{add|or}} <vscale x 4 x i64> [[T6]], [[MOD]]
+; EE-UNI-VEC: [[T7:%.*]] = {{add|or}} {{(disjoint )?}}<vscale x 4 x i64> [[T6]], [[MOD]]
 
 ; EE-UNI-VEC: [[T8:%.*]] = getelementptr inbounds float, ptr {{%.*}}, <vscale x 4 x i64> [[T7]]
 ; EE-UNI-VEC: [[T9:%.*]] = call <vscale x 4 x float> @__vecz_b_gather_load4_u5nxv4fu9nxv4u3ptr(<vscale x 4 x ptr> [[T8]])
@@ -132,7 +132,7 @@ entry:
 ; EE-INDICES: store <vscale x 16 x float> {{.*}}, ptr [[ALLOC]], align 64
 ; EE-INDICES: [[STEP:%.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
 ; EE-INDICES: [[T4:%.*]] = shl <vscale x 4 x i32> [[STEP]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> {{(undef|poison)}}, i32 2, {{i32|i64}} 0), <vscale x 4 x i32> {{(undef|poison)}}, <vscale x 4 x i32> zeroinitializer)
-; EE-INDICES: [[T5:%.*]] = {{add|or}} <vscale x 4 x i32> [[T4]], [[T3]]
+; EE-INDICES: [[T5:%.*]] = {{add|or}} {{(disjoint )?}}<vscale x 4 x i32> [[T4]], [[T3]]
 ; EE-INDICES: [[IDX:%.*]] = sext <vscale x 4 x i32> [[T5]] to <vscale x 4 x i64>
 ; EE-INDICES: [[ADDR:%.*]] = getelementptr inbounds float, ptr [[ALLOC]], <vscale x 4 x i64> [[IDX]]
 ; EE-INDICES: [[GATHER:%.*]] = call <vscale x 4 x float> @__vecz_b_gather_load4_u5nxv4fu9nxv4u3ptr(<vscale x 4 x ptr> [[ADDR]])
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/insert_element.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/insert_element.ll
index 8722870800f35..1712e6dd4579a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/insert_element.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/insert_element.ll
@@ -102,7 +102,7 @@ entry:
 ; IE-INDICES: [[T2:%.*]] = shl <vscale x 4 x i32> [[T1]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> {{(undef|poison)}}, i32 2, {{(i32|i64)}} 0), <vscale x 4 x i32> {{(undef|poison)}}, <vscale x 4 x i32> zeroinitializer)
 
 ; LLVM 16 deduces add/or equivalence and uses `or` instead.
-; IE-INDICES: [[T3:%.*]] = {{add|or}} <vscale x 4 x i32> [[T2]], {{%.*}}
+; IE-INDICES: [[T3:%.*]] = {{add|or}} {{(disjoint )?}}<vscale x 4 x i32> [[T2]], {{%.*}}
 
 ; IE-INDICES: [[T4:%.*]] = sext <vscale x 4 x i32> [[T3]] to <vscale x 4 x i64>
 ; IE-INDICES: [[ADDR:%.*]] = getelementptr inbounds float, ptr %0, <vscale x 4 x i64> [[T4]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll
index d118893c3f1d6..a07f2b7dda6d0 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll
@@ -48,7 +48,7 @@ define spir_kernel void @get_sub_group_local_id(i32 addrspace(1)* %in, i32 addrs
 ; CHECK: [[MUL:%.*]] = shl i32 %call, 2
 ; CHECK: [[SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[MUL]], i64 0
 ; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK: [[ID:%.*]] = or <4 x i32> [[SPLAT]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK: [[ID:%.*]] = or {{(disjoint )?}}<4 x i32> [[SPLAT]], <i32 0, i32 1, i32 2, i32 3>
 ; CHECK: [[EXT:%.*]] = sext i32 %call to i64
 ; CHECK: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %out, i64 [[EXT]]
 ; CHECK: store <4 x i32> [[ID]], ptr addrspace(1) %arrayidx

From 6b193d6e23f8440cb6158be25cd150a5f0870aa2 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Wed, 13 Dec 2023 11:18:01 +0000
Subject: [PATCH 067/182] [vecz] Provide more context when CFG conversion fails

This should help when debugging (using `--pass-remarks-missed=vecz`) why
control-flow conversion failed to apply masks to the CFG.  The previous
diagnostics would only print the name of the function that couldn't be
converted, but not any more specific information.

This commit adds an extra level of information via a 'note', which is
optionally printed on the line after the main diagnostic.

This is not intended to be a full solution to better vecz diagnostics,
but a good first step.
---
 .../compiler_passes/vecz/source/debugging.cpp | 24 ++++++----
 .../vecz/source/include/debugging.h           | 14 +++++-
 .../control_flow_conversion_pass.cpp          | 43 ++++++++++++------
 .../vecz/test/lit/llvm/diverging_atomic.ll    | 45 +++++++++++++++++++
 4 files changed, 102 insertions(+), 24 deletions(-)
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_atomic.ll

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/debugging.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/debugging.cpp
index b8fd3aa2b7756..851f47b40883c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/debugging.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/debugging.cpp
@@ -26,31 +26,39 @@ namespace vecz {
 ///
 /// @param[in] V The value (can be `nullptr`) to be included in the remark
 /// @param[in] Msg The main remark message
+/// @param[in] Note An optional additional note to provide more context/info.
 /// @return The remark message as it is to be printed
-static std::string createRemarkMessage(const Value *V, StringRef Msg) {
+static std::string createRemarkMessage(const Value *V, StringRef Msg,
+                                       StringRef Note = "") {
   std::string helper_str("Vecz: ");
   raw_string_ostream helper_stream(helper_str);
   helper_stream << Msg;
   if (V) {
     if (isa<Instruction>(V)) {
       // Instructions are already prefixed by two spaces when printed
-      V->print(helper_stream, true);
+      V->print(helper_stream, /*IsForDebug=*/true);
     } else if (const Function *F = dyn_cast<Function>(V)) {
-      // Printing a functions leads to it's whole body being printed
+      // Printing a functions leads to its whole body being printed
       helper_stream << " function \"" << F->getName() << "\"";
     } else {
       helper_stream << " ";
-      V->print(helper_stream, true);
+      V->print(helper_stream, /*IsForDebug=*/true);
     }
   }
   helper_stream << '\n';
 
+  // Provide extra context, if supplied
+  if (!Note.empty()) {
+    helper_stream << "  note: " << Note << '\n';
+  }
+
   return helper_stream.str();
 }
 
-void emitVeczRemarkMissed(const Function *F, const Value *V, StringRef Msg) {
+void emitVeczRemarkMissed(const Function *F, const Value *V, StringRef Msg,
+                          StringRef Note) {
   const Instruction *I = V ? dyn_cast<Instruction>(V) : nullptr;
-  auto RemarkMsg = createRemarkMessage(V, Msg);
+  auto RemarkMsg = createRemarkMessage(V, Msg, Note);
   OptimizationRemarkEmitter ORE(F);
   if (I) {
     ORE.emit(OptimizationRemarkMissed("vecz", "vecz", I) << RemarkMsg);
@@ -61,8 +69,8 @@ void emitVeczRemarkMissed(const Function *F, const Value *V, StringRef Msg) {
   }
 }
 
-void emitVeczRemarkMissed(const Function *F, StringRef Msg) {
-  emitVeczRemarkMissed(F, nullptr, Msg);
+void emitVeczRemarkMissed(const Function *F, StringRef Msg, StringRef Note) {
+  emitVeczRemarkMissed(F, nullptr, Msg, Note);
 }
 
 void emitVeczRemark(const Function *F, const Value *V, StringRef Msg) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/debugging.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/debugging.h
index 55d063bfabc84..a5ae45cdcd83b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/debugging.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/debugging.h
@@ -28,6 +28,7 @@
 #include <llvm/IR/Module.h>
 #include <llvm/IR/PassManager.h>
 #include <llvm/IR/Value.h>
+#include <llvm/Support/Error.h>
 #include <llvm/Support/raw_ostream.h>
 
 #include <cstdlib>
@@ -64,6 +65,12 @@ struct VeczFailResult {
   operator std::optional<T>() const {
     return std::nullopt;
   }
+
+  /// @brief For functions that return an llvm::Error
+  operator llvm::Error() const {
+    return llvm::make_error<llvm::StringError>("Unknown VeczFailResult",
+                                               llvm::inconvertibleErrorCode());
+  }
 };
 
 struct AnalysisFailResult : public internal::VeczFailResult {
@@ -175,13 +182,16 @@ struct AnalysisFailResult : public internal::VeczFailResult {
 /// @param[in] F The function in which we are currently working
 /// @param[in] V The value (can be `nullptr`) to be included in the message
 /// @param[in] Msg The main remark message text
+/// @param[in] Note An optional additional note to provide more context/info.
 void emitVeczRemarkMissed(const llvm::Function *F, const llvm::Value *V,
-                          llvm::StringRef Msg);
+                          llvm::StringRef Msg, llvm::StringRef Note = "");
 /// @brief Emit a RemarkMissed message
 ///
 /// @param[in] F The function in which we are currently working
 /// @param[in] Msg The main remark message text
-void emitVeczRemarkMissed(const llvm::Function *F, llvm::StringRef Msg);
+/// @param[in] Note An optional additional note to provide more context/info.
+void emitVeczRemarkMissed(const llvm::Function *F, llvm::StringRef Msg,
+                          llvm::StringRef Note = "");
 /// @brief Emit a Remark message
 ///
 /// @param[in] F The function in which we are currently working
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
index 9287e51311f68..874a793d9993d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
@@ -30,6 +30,7 @@
 #include <llvm/IR/Dominators.h>
 #include <llvm/IR/IRBuilder.h>
 #include <llvm/Support/Debug.h>
+#include <llvm/Support/Error.h>
 #include <llvm/Support/raw_ostream.h>
 
 #include <queue>
@@ -145,8 +146,9 @@ class ControlFlowConversionState::Impl : public ControlFlowConversionState {
   /// @brief Apply masks to basic blocks in the function, to prevent
   /// side-effects for inactive instances.
   ///
-  /// @return true if masks were applied successfully, false otherwise.
-  bool applyMasks();
+  /// @return llvm::Error::success if masks were applied successfully, an error
+  /// message explaining the failure otherwise.
+  Error applyMasks();
 
   /// @brief Apply a mask to the given basic block, to prevent side-effects for
   /// inactive instances.
@@ -154,8 +156,9 @@ class ControlFlowConversionState::Impl : public ControlFlowConversionState {
   /// @param[in] BB Basic block to apply masks to.
   /// @param[in] mask Mask to apply.
   ///
-  /// @return true if masks were applied successfully, false otherwise.
-  bool applyMask(BasicBlock &BB, Value *mask);
+  /// @return llvm::Error::success if masks were applied successfully, an error
+  /// message explaining the failure otherwise.
+  Error applyMask(BasicBlock &BB, Value *mask);
 
   /// @brief Emit a call instructions to the masked version of the called
   /// function.
@@ -378,6 +381,14 @@ Instruction *copyExitMask(Value *mask, StringRef base, BasicBlock &BB) {
   VECZ_ERROR_IF(!mask, "Trying to copy exit mask with invalid arguments");
   return copyMask(mask, base + ".exit_mask", BB.getTerminator());
 }
+
+/// Wrap a string into an llvm::StringError, pointing to an instruction.
+static inline Error makeStringError(const Twine &message, Instruction &I) {
+  std::string helper_str = message.str();
+  raw_string_ostream helper_stream(helper_str);
+  helper_stream << " " << I;
+  return make_error<StringError>(helper_stream.str(), inconvertibleErrorCode());
+}
 }  // namespace
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -538,8 +549,9 @@ bool ControlFlowConversionState::Impl::convertToDataFlow() {
                          "Could not generate masks for");
     return false;
   }
-  if (!applyMasks()) {
-    emitVeczRemarkMissed(&F, VU.scalarFunction(), "Could not apply masks for");
+  if (auto err = applyMasks()) {
+    emitVeczRemarkMissed(&F, VU.scalarFunction(), "Could not apply masks for",
+                         llvm::toString(std::move(err)));
     return false;
   }
 
@@ -1075,19 +1087,21 @@ bool ControlFlowConversionState::Impl::createCombinedLoopExitMask(
   return true;
 }
 
-bool ControlFlowConversionState::Impl::applyMasks() {
+Error ControlFlowConversionState::Impl::applyMasks() {
   for (auto &BB : F) {
     // Use masks with instructions that have side-effects.
     if (!DR->isUniform(BB) && !DR->isByAll(BB)) {
       auto *const entryMask = MaskInfos[&BB].entryMask;
       VECZ_ERROR_IF(!entryMask, "BasicBlock should have an entry mask");
-      VECZ_FAIL_IF(!applyMask(BB, entryMask));
+      if (auto err = applyMask(BB, entryMask)) {
+        return err;
+      }
     }
   }
-  return true;
+  return Error::success();
 }
 
-bool ControlFlowConversionState::Impl::applyMask(BasicBlock &BB, Value *mask) {
+Error ControlFlowConversionState::Impl::applyMask(BasicBlock &BB, Value *mask) {
   // Packetization hasn't happened yet so this better be a scalar 1 bit int.
   assert(mask->getType()->isIntegerTy(1) && "CFG mask type should be int1");
   // Map the unmasked instruction with the masked one.
@@ -1102,17 +1116,17 @@ bool ControlFlowConversionState::Impl::applyMask(BasicBlock &BB, Value *mask) {
     // Turn loads and stores into masked loads and stores.
     if (memOp && (memOp->isLoad() || memOp->isStore())) {
       if (!tryApplyMaskToMemOp(*memOp, mask, toDelete)) {
-        return false;
+        return makeStringError("Could not apply mask to MemOp", I);
       }
     } else if (auto *CI = dyn_cast<CallInst>(&I)) {
       // Turn calls into masked calls if possible.
       if (!applyMaskToCall(CI, mask, toDelete)) {
-        return false;
+        return makeStringError("Could not apply mask to call instruction", I);
       }
     } else if (I.isAtomic() && !isa<FenceInst>(&I)) {
       // We need to apply masks to atomic functions, but it is currently not
       // implemented. See CA-3294.
-      return false;
+      return makeStringError("Could not apply mask to atomic instruction", I);
     } else if (auto *branch = dyn_cast<BranchInst>(&I)) {
       // We have to be careful with infinite loops, because if they exist on a
       // divergent code path, they will always be entered and will hang the
@@ -1138,7 +1152,8 @@ bool ControlFlowConversionState::Impl::applyMask(BasicBlock &BB, Value *mask) {
     updateMaps(unmasked, masked);
     IRCleanup::deleteInstructionNow(unmasked);
   }
-  return true;
+
+  return Error::success();
 }
 
 CallInst *ControlFlowConversionState::Impl::emitMaskedVersion(CallInst *CI,
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_atomic.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_atomic.ll
new file mode 100644
index 0000000000000..de4501013763a
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_atomic.ll
@@ -0,0 +1,45 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -w 4 -vecz-passes=cfg-convert,verify -S \
+; RUN:   --pass-remarks-missed=vecz < %s 2>&1 | FileCheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; CHECK: Vecz: Could not apply masks for function "kernel"
+; CHECK-NEXT: note: Could not apply mask to atomic instruction
+; CHECK-SAME:  %atomic = atomicrmw add ptr %arrayidx.in, i32 2 monotonic, align 4
+
+define spir_kernel void @kernel(ptr %in, ptr %out) {
+entry:
+  %gid = tail call i64 @__mux_get_global_id(i32 0)
+  %cmp = icmp eq i64 %gid, 0
+  br i1 %cmp, label %if.then, label %end
+
+if.then:
+  %arrayidx.in = getelementptr inbounds i32, ptr %in, i64 %gid
+  %atomic = atomicrmw add ptr %arrayidx.in, i32 2 monotonic, align 4
+  br label %end
+
+end:
+  %merge = phi i32 [ 0, %entry ], [ %atomic, %if.then ]
+  %arrayidx.out = getelementptr inbounds i32, ptr %out, i64 %gid
+  store i32 %merge, ptr %arrayidx.out, align 4
+  ret void
+}
+
+declare i64 @__mux_get_global_id(i32)

From 74d0cf51223419d9515d31a6bf6cbf34df97ce32 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Thu, 14 Dec 2023 09:51:32 +0000
Subject: [PATCH 068/182] [vecz] Run SROA as part of the vecz pipeline

Some kernels, particularly SYCL kernels, can make heavy use of temporary
allocations which are passed between functions.

In SYCL's case, this is particularly harmful because the state stored
and reloaded via these structs includes the work-item and work-group
builtins (local IDs, global IDs, etc) which means the vectorizer is
unable to analyze the program, leading to incorrect
uniformity/divergence decisions and/or poor code generation.

LLVM's SROA is typically unable to do anything with this pattern before
inlining, as the allocation is escaped by the called function. However,
after vecz inlines these functions, SROA can eliminate much of this and
present the vectorizer with a nicely analyzable program.

Some LIT tests using allocas were updated to use a more specific vecz
pipeline which doesn't run SROA, as the allocas were being removed.
---
 .../vecz/source/vecz_pass_builder.cpp              | 13 ++++++++++++-
 .../lit/llvm/ScalableVectors/broadcast_vector.ll   |  3 +--
 .../vecz/test/lit/llvm/gep_duplication.ll          | 14 ++++++++------
 .../test/lit/llvm/insert_element_debug_info.ll     |  7 +++++--
 4 files changed, 26 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vecz_pass_builder.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vecz_pass_builder.cpp
index 822772dbe030b..9729fed9e8fbe 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vecz_pass_builder.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vecz_pass_builder.cpp
@@ -41,6 +41,7 @@
 #include <llvm/Transforms/Scalar/FlattenCFG.h>
 #include <llvm/Transforms/Scalar/GVN.h>
 #include <llvm/Transforms/Scalar/IndVarSimplify.h>
+#include <llvm/Transforms/Scalar/SROA.h>
 #include <llvm/Transforms/Scalar/SimplifyCFG.h>
 #include <llvm/Transforms/Scalar/Sink.h>
 #include <llvm/Transforms/Utils/BreakCriticalEdges.h>
@@ -181,7 +182,17 @@ bool vecz::buildPassPipeline(ModulePassManager &PM) {
   PM.addPass(createModuleToFunctionPassAdaptor(LowerSwitchPass()));
   PM.addPass(createModuleToFunctionPassAdaptor(FixIrreduciblePass()));
 
-  // We have to run LLVM's Mem2Reg pass in case the front end didn't
+  // It's helpful to run SROA in case it opens up more opportunities to
+  // eliminate aggregates in (particularly SYCL) kernels. This is especially
+  // true after inlining - which we've (usually) just performed in the
+  // BuiltinInliningPass - because otherwise SROA is unable to analyze the
+  // lifetime of allocas due to them being "escaped" by the function callee.
+  PM.addPass(
+      createModuleToFunctionPassAdaptor(SROAPass(SROAOptions::ModifyCFG)));
+  // We have to run LLVM's Mem2Reg pass in case the front end didn't. Note that
+  // SROA usually runs Mem2Reg internally (unless disabled via a command-line
+  // option) though using its own heuristic. We run it unconditionally
+  // regardless, just for good measure.
   PM.addPass(createModuleToFunctionPassAdaptor(PromotePass()));
   // LLVM's own Mem2Reg pass doesn't always get everything
   PM.addPass(createModuleToFunctionPassAdaptor(BasicMem2RegPass()));
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll
index 58f417ffbc180..ea203f8658eea 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll
@@ -14,8 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; NOTE: Assertions have been autogenerated by scripts/testing/update_veczc_checks.py
-; RUN: veczc -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
+; RUN: veczc -vecz-scalable -vecz-simd-width=4 -vecz-passes="function(instcombine),packetizer,gvn,function(instcombine)" -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_duplication.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_duplication.ll
index 6cc4145e497e0..42909fddd9c44 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_duplication.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_duplication.ll
@@ -14,13 +14,20 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: veczc -S < %s | FileCheck %s
+; RUN: veczc -S -vecz-passes="function(mem2reg,instcombine),cfg-convert,gvn,packetizer" < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 %struct.testStruct = type { [2 x i32] }
 
+
+; Check that we de-duplicate the GEPs used across this kernel (using a
+; combination of instcombine and GVN).
+; CHECK: spir_kernel void @__vecz_v{{[0-9]+}}_gep_duplication
+; CHECK: entry:
+; CHECK: getelementptr inbounds [2 x i32], ptr %myStruct, i{{32|64}} 0, i{{32|64}} 1
+; CHECK-NOT: getelementptr {{.*}}%myStruct
 define spir_kernel void @gep_duplication(ptr addrspace(1) align 4 %out) {
 entry:
   %out.addr = alloca ptr addrspace(1), align 8
@@ -68,8 +75,3 @@ if.end:                                           ; preds = %if.else, %if.then
 }
 
 declare i64 @__mux_get_global_id(i32)
-
-; CHECK: spir_kernel void @__vecz_v{{[0-9]+}}_gep_duplication
-; CHECK: entry:
-; CHECK: getelementptr inbounds [2 x i32], ptr %myStruct, i{{32|64}} 0, i{{32|64}} 1
-; CHECK-NOT: getelementptr {{.*}}%myStruct
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insert_element_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insert_element_debug_info.ll
index a23678ae51a4e..727130f6f2e92 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insert_element_debug_info.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insert_element_debug_info.ll
@@ -18,7 +18,7 @@
 ; intrinsics across all lanes even when scalarization masks disable some
 ; of the lanes. This occurs when we scalarize insertelement instructions.
 
-; RUN: veczc -k unaligned_load -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s
+; RUN: veczc -k unaligned_load -vecz-passes="function(instcombine,adce),scalarize,packetizer,instcombine" -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -49,7 +49,7 @@ entry:
 ; FIXME: This llvm.dbg.value marks a 'kill location' and denotes the
 ; termination of the previous value assigned to %tmp - we could probably do
 ; better here by manifesting a vectorized value?
-; CHECK: call void @llvm.dbg.value(metadata i32 {{(poison|undef)}}, metadata !{{[0-9]+}},
+; CHECK: call void @llvm.dbg.value(metadata i32 {{(poison|undef)}}, metadata [[VAR:![0-9]+]],
 ; CHECK-SAME:   metadata !DIExpression({{.*}})), !dbg !{{[0-9]+}}
   %1 = load i32, i32* %tid, align 4, !dbg !32
   %mul = mul nsw i32 3, %1, !dbg !32
@@ -105,6 +105,9 @@ attributes #3 = { nobuiltin }
 !llvm.module.flags = !{!27}
 !llvm.ident = !{!28}
 
+; Now check we're actually looking at the right variable.
+; CHECK: [[VAR]] = !DILocalVariable(name: "tmp",
+
 !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.8.1 ", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2, retainedTypes: !3)
 !1 = !DIFile(filename: "kernel.opencl", directory: "/home/Aorta/vecz_build")
 !2 = !{}

From b6a7155efdd6fcd2f5f38d8733b737f808e933b8 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Thu, 14 Dec 2023 11:01:22 +0000
Subject: [PATCH 069/182] [vecz] Simplify pass construction process

This commit simplifies the vectorizer's pass construction and pass
management process by using shared function pass managers, rather than
creating an adapter per pass.

I was initially considering the removal of all uses of the verifier pass,
but we couldn't come to an agreement about the best way forward:

* Remove them all, relying on correct pass execution and the
  `--verify-each` command-line option.
* Keep them all as they are
* Keep them all, but only in a debug build of OCK

For now, they largely remain as they were. I've removed one mid-pipeline
use of the verifier pass (after the `SimplifyInfiniteLoopPass`), but kept
the others (specifically after the `ControlFlowConversionPass` which has
historically proven problematic). We can perhaps revisit this question later.

Debugging this locally with `--debug-vecz-pipeline` doesn't show any
substantial changes. The order of "skipped" passes has changed, but that
shouldn't matter. The only other change is to do with removing the
single run of the verifier pass.
---
 .../vecz/source/vecz_pass_builder.cpp         | 197 ++++++++++--------
 1 file changed, 105 insertions(+), 92 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vecz_pass_builder.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vecz_pass_builder.cpp
index 9729fed9e8fbe..bcb8f3e1596c6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vecz_pass_builder.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vecz_pass_builder.cpp
@@ -28,6 +28,7 @@
 #include <llvm/Analysis/TargetLibraryInfo.h>
 #include <llvm/Analysis/TargetTransformInfo.h>
 #include <llvm/Config/llvm-config.h>
+#include <llvm/IR/PassManager.h>
 #include <llvm/IR/PassManagerImpl.h>
 #include <llvm/IR/Verifier.h>
 #include <llvm/Passes/PassBuilder.h>
@@ -41,6 +42,7 @@
 #include <llvm/Transforms/Scalar/FlattenCFG.h>
 #include <llvm/Transforms/Scalar/GVN.h>
 #include <llvm/Transforms/Scalar/IndVarSimplify.h>
+#include <llvm/Transforms/Scalar/LoopPassManager.h>
 #include <llvm/Transforms/Scalar/SROA.h>
 #include <llvm/Transforms/Scalar/SimplifyCFG.h>
 #include <llvm/Transforms/Scalar/Sink.h>
@@ -178,104 +180,115 @@ void VeczPassMachinery::registerPassCallbacks() {
 bool vecz::buildPassPipeline(ModulePassManager &PM) {
   // Preparation passes
   PM.addPass(BuiltinInliningPass());
-  // Lower switches after builtin inlining, incase the builtins had switches.
-  PM.addPass(createModuleToFunctionPassAdaptor(LowerSwitchPass()));
-  PM.addPass(createModuleToFunctionPassAdaptor(FixIrreduciblePass()));
-
-  // It's helpful to run SROA in case it opens up more opportunities to
-  // eliminate aggregates in (particularly SYCL) kernels. This is especially
-  // true after inlining - which we've (usually) just performed in the
-  // BuiltinInliningPass - because otherwise SROA is unable to analyze the
-  // lifetime of allocas due to them being "escaped" by the function callee.
-  PM.addPass(
-      createModuleToFunctionPassAdaptor(SROAPass(SROAOptions::ModifyCFG)));
-  // We have to run LLVM's Mem2Reg pass in case the front end didn't. Note that
-  // SROA usually runs Mem2Reg internally (unless disabled via a command-line
-  // option) though using its own heuristic. We run it unconditionally
-  // regardless, just for good measure.
-  PM.addPass(createModuleToFunctionPassAdaptor(PromotePass()));
-  // LLVM's own Mem2Reg pass doesn't always get everything
-  PM.addPass(createModuleToFunctionPassAdaptor(BasicMem2RegPass()));
-
-  PM.addPass(createModuleToFunctionPassAdaptor(InstCombinePass()));
-  PM.addPass(createModuleToFunctionPassAdaptor(AggressiveInstCombinePass()));
-  PM.addPass(createModuleToFunctionPassAdaptor(DCEPass()));
-  PM.addPass(createModuleToFunctionPassAdaptor(PreLinearizePass()));
-  // If pre-linearization created any unnecessary Hoist Guards,
-  // Instruction Combining Pass will handily clean them up.
-  PM.addPass(createModuleToFunctionPassAdaptor(InstCombinePass()));
-  PM.addPass(createModuleToFunctionPassAdaptor(SimplifyCFGPass()));
-  PM.addPass(createModuleToFunctionPassAdaptor(DCEPass()));
-  PM.addPass(createModuleToFunctionPassAdaptor(UnifyFunctionExitNodesPass()));
-  PM.addPass(createModuleToFunctionPassAdaptor(LoopSimplifyPass()));
-  // Lower switches again because CFG simplifcation can create them.
-  PM.addPass(createModuleToFunctionPassAdaptor(LowerSwitchPass()));
-  PM.addPass(createModuleToFunctionPassAdaptor(
-      createFunctionToLoopPassAdaptor(VeczLoopRotatePass())));
-  // IndVarSimplify can create a lot of duplicate instructions when there
-  // are unrolled loops. EarlyCSE is there to clear them up. However,
-  // this can destroy LCSSA form, so we need to restore it.
-  PM.addPass(createModuleToFunctionPassAdaptor(
-      createFunctionToLoopPassAdaptor(IndVarSimplifyPass())));
-
-  PM.addPass(createModuleToFunctionPassAdaptor(EarlyCSEPass()));
-  // We run this last because EarlyCSE can actually create infinite loops
-  // (with a "conditional" branch on true)
-  PM.addPass(createModuleToFunctionPassAdaptor(
-      createFunctionToLoopPassAdaptor(SimplifyInfiniteLoopPass())));
-
-  // Verify that the preparation passes cleaned up after themselves.
-  PM.addPass(VerifierPass());
+  {
+    FunctionPassManager FPM;
+    // Lower switches after builtin inlining, in case the builtins had switches.
+    FPM.addPass(LowerSwitchPass());
+    FPM.addPass(FixIrreduciblePass());
+
+    // It's helpful to run SROA in case it opens up more opportunities to
+    // eliminate aggregates in (particularly SYCL) kernels. This is especially
+    // true after inlining - which we've (usually) just performed in the
+    // BuiltinInliningPass - because otherwise SROA is unable to analyze the
+    // lifetime of allocas due to them being "escaped" by the function callee.
+    FPM.addPass(SROAPass(SROAOptions::ModifyCFG));
+    // We have to run LLVM's Mem2Reg pass in case the front end didn't. Note
+    // that SROA usually runs Mem2Reg internally (unless disabled via a
+    // command-line option) though using its own heuristic. We run it
+    // unconditionally regardless, just for good measure.
+    FPM.addPass(PromotePass());
+    // LLVM's own Mem2Reg pass doesn't always get everything
+    FPM.addPass(BasicMem2RegPass());
+
+    FPM.addPass(InstCombinePass());
+    FPM.addPass(AggressiveInstCombinePass());
+    FPM.addPass(DCEPass());
+    FPM.addPass(PreLinearizePass());
+    // If pre-linearization created any unnecessary Hoist Guards,
+    // Instruction Combining Pass will handily clean them up.
+    FPM.addPass(InstCombinePass());
+    FPM.addPass(SimplifyCFGPass());
+    FPM.addPass(DCEPass());
+    FPM.addPass(UnifyFunctionExitNodesPass());
+    FPM.addPass(LoopSimplifyPass());
+    // Lower switches again because CFG simplifcation can create them.
+    FPM.addPass(LowerSwitchPass());
+    {
+      LoopPassManager LPM;
+      LPM.addPass(VeczLoopRotatePass());
+      // IndVarSimplify can create a lot of duplicate instructions when there
+      // are unrolled loops. EarlyCSE is there to clear them up. However,
+      // this can destroy LCSSA form, so we need to restore it.
+      LPM.addPass(IndVarSimplifyPass());
+      FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM)));
+    }
+
+    FPM.addPass(EarlyCSEPass());
+    // We run this last because EarlyCSE can actually create infinite loops
+    // (with a "conditional" branch on true)
+    FPM.addPass(createFunctionToLoopPassAdaptor(SimplifyInfiniteLoopPass()));
 
-  PM.addPass(createModuleToFunctionPassAdaptor(RemoveIntPtrPass()));
-  PM.addPass(createModuleToFunctionPassAdaptor(SquashSmallVectorsPass()));
-  PM.addPass(createModuleToFunctionPassAdaptor(UniformReassociationPass()));
-  PM.addPass(createModuleToFunctionPassAdaptor(TernaryTransformPass()));
+    FPM.addPass(RemoveIntPtrPass());
+    FPM.addPass(SquashSmallVectorsPass());
+    FPM.addPass(UniformReassociationPass());
+    FPM.addPass(TernaryTransformPass());
 
-  PM.addPass(createModuleToFunctionPassAdaptor(BreakCriticalEdgesPass()));
-  PM.addPass(createModuleToFunctionPassAdaptor(LCSSAPass()));
-  PM.addPass(createModuleToFunctionPassAdaptor(ControlFlowConversionPass()));
+    FPM.addPass(BreakCriticalEdgesPass());
+    FPM.addPass(LCSSAPass());
+    FPM.addPass(ControlFlowConversionPass());
+
+    PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
+  }
+
+  // Verify that the preparation passes (particularly control-flow conversion)
+  // have left the module in a correct state.
   PM.addPass(VerifierPass());
-  PM.addPass(createModuleToFunctionPassAdaptor(DivergenceCleanupPass()));
-
-  PM.addPass(createModuleToFunctionPassAdaptor(CommonGEPEliminationPass()));
-  PM.addPass(createModuleToFunctionPassAdaptor(ScalarizationPass()));
-
-  PM.addPass(createModuleToFunctionPassAdaptor(AggressiveInstCombinePass()));
-  PM.addPass(createModuleToFunctionPassAdaptor(ADCEPass()));
-  PM.addPass(createModuleToFunctionPassAdaptor(SimplifyCFGPass()));
-  PM.addPass(createModuleToFunctionPassAdaptor(SimplifyMaskedMemOpsPass()));
-
-  // Having multiple GEP instructions that perform the same operation
-  // greatly amplifies the code generated by the packetizer as it duplicates
-  // the amount of extractelement instructions, so we want to remove what
-  // is unnecessary.
-  PM.addPass(createModuleToFunctionPassAdaptor(CommonGEPEliminationPass()));
-  PM.addPass(createModuleToFunctionPassAdaptor(PacketizationPass()));
-
-  PM.addPass(createModuleToFunctionPassAdaptor(InlinePostVectorizationPass()));
-  PM.addPass(createModuleToFunctionPassAdaptor(FlattenCFGPass()));
-  PM.addPass(
-      createModuleToFunctionPassAdaptor(GVNPass(GVNOptions().setMemDep(true))));
-  PM.addPass(createModuleToFunctionPassAdaptor(AggressiveInstCombinePass()));
-  PM.addPass(createModuleToFunctionPassAdaptor(ADCEPass()));
-  PM.addPass(createModuleToFunctionPassAdaptor(SinkingPass()));
-  PM.addPass(createModuleToFunctionPassAdaptor(SimplifyCFGPass()));
-  PM.addPass(createModuleToFunctionPassAdaptor(AggressiveInstCombinePass()));
-
-  PM.addPass(createModuleToFunctionPassAdaptor(
-      InterleavedGroupCombinePass(eInterleavedStore)));
-  PM.addPass(createModuleToFunctionPassAdaptor(
-      InterleavedGroupCombinePass(eInterleavedLoad)));
-  PM.addPass(createModuleToFunctionPassAdaptor(InstCombinePass()));
+
+  {
+    FunctionPassManager FPM;
+
+    FPM.addPass(DivergenceCleanupPass());
+
+    FPM.addPass(CommonGEPEliminationPass());
+    FPM.addPass(ScalarizationPass());
+
+    FPM.addPass(AggressiveInstCombinePass());
+    FPM.addPass(ADCEPass());
+    FPM.addPass(SimplifyCFGPass());
+    FPM.addPass(SimplifyMaskedMemOpsPass());
+
+    // Having multiple GEP instructions that perform the same operation
+    // greatly amplifies the code generated by the packetizer as it duplicates
+    // the amount of extractelement instructions, so we want to remove what
+    // is unnecessary.
+    FPM.addPass(CommonGEPEliminationPass());
+
+    // The packetizer - the 'main' bit of the vectorization process.
+    FPM.addPass(PacketizationPass());
+
+    FPM.addPass(InlinePostVectorizationPass());
+    FPM.addPass(FlattenCFGPass());
+    FPM.addPass(GVNPass(GVNOptions().setMemDep(true)));
+    FPM.addPass(AggressiveInstCombinePass());
+    FPM.addPass(ADCEPass());
+    FPM.addPass(SinkingPass());
+    FPM.addPass(SimplifyCFGPass());
+    FPM.addPass(AggressiveInstCombinePass());
+
+    FPM.addPass(InterleavedGroupCombinePass(eInterleavedStore));
+    FPM.addPass(InterleavedGroupCombinePass(eInterleavedLoad));
+    FPM.addPass(InstCombinePass());
 #if LLVM_VERSION_GREATER_EQUAL(18, 0)
-  // LLVM 18 split this pass out of InstCombine
-  PM.addPass(createModuleToFunctionPassAdaptor(InferAlignmentPass()));
+    // LLVM 18 split this pass out of InstCombine
+    FPM.addPass(InferAlignmentPass());
 #endif
-  PM.addPass(createModuleToFunctionPassAdaptor(DCEPass()));
-  PM.addPass(createModuleToFunctionPassAdaptor(SimplifyMaskedMemOpsPass()));
-  PM.addPass(DefineInternalBuiltinsPass());
+    FPM.addPass(DCEPass());
+    FPM.addPass(SimplifyMaskedMemOpsPass());
 
+    PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
+  }
+
+  PM.addPass(DefineInternalBuiltinsPass());
   PM.addPass(VerifierPass());
 
   return true;

From 9e232b93f70ba49eb8f7c6a1743b5c9f84aa8e7e Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Mon, 18 Dec 2023 09:50:53 +0000
Subject: [PATCH 070/182] More LLVM 18 fixups.

* FindDbgDeclareUses got refactored, we have to call findDbgDeclares.
* DIBuilder::createStaticMemberType takes a DWARF tag now. Since we are
  processing a DebugTypeMember, we know it is always a data member and
  can set the tag to DW_TAG_variable.
---
 .../vecz/source/transform/basic_mem2reg_pass.cpp             | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp
index 1f662ffe7862a..b5559151f5200 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp
@@ -183,7 +183,10 @@ bool BasicMem2RegPass::promoteAlloca(AllocaInst *Alloca) const {
       StoredValue = Store->getValueOperand();
       ToDelete.push_back(Store);
       DIBuilder DIB(*Alloca->getModule(), /*AllowUnresolved*/ false);
-#if LLVM_VERSION_GREATER_EQUAL(17, 0)
+#if LLVM_VERSION_GREATER_EQUAL(18, 0)
+      SmallVector<DbgDeclareInst *, 1> DbgIntrinsics;
+      findDbgDeclares(DbgIntrinsics, Alloca);
+#elif LLVM_VERSION_GREATER_EQUAL(17, 0)
       auto DbgIntrinsics = FindDbgDeclareUses(Alloca);
 #else
       auto DbgIntrinsics = FindDbgAddrUses(Alloca);

From 632c85ed1cea51934a6b6e37ff5a09dcbb3421fd Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Thu, 14 Dec 2023 16:27:41 +0000
Subject: [PATCH 071/182] [vecz] Add support for masking atomic RMW
 instructions

This commit allows the vectorizer to vectorize kernels in which there
are atomic RMW instructions that need to be masked for control-flow
purposes: in a divergent if/else or a loop, etc.

It follows a fairly simple paradigm - similar to how we mask loads and
stores - involving:

* Control-flow conversion replacing the atomic with a call to an
  'internal' vecz builtin
* The packetizer widening this builtin, and replacing the call with
  another call (with packetized arguments)
* The post-vectorization `DefineBuiltinsPass` running and providing
  function bodies for these masked atomic builtins

The builtins themselves are simply loops over the entire vectorized
arguments, conditionally doing an atomic operation one by one in
sequence depending on the mask. This should be correct (i.e., not
performing the whole atomic operation at once) since the results are
undefined for how work-items run in parallel and which work-items would
"win" if there was any contention in the atomic memory addresses. Note
also that this is essentially how plain atomics are vectorized: by
scalarizing them.

There isn't yet support for the atomic cmpxchg instructions - those will
be done separately.
---
 .../source/include/vectorization_context.h    |  45 ++-
 .../source/include/vectorization_helpers.h    |  16 +-
 .../control_flow_conversion_pass.cpp          |  61 +++-
 .../vecz/source/transform/packetizer.cpp      |  72 ++++
 .../vecz/source/vectorization_context.cpp     | 331 ++++++++++++++++++
 .../vecz/source/vectorization_helpers.cpp     |  44 ++-
 .../vecz/test/lit/llvm/Boscc/printf.ll        | 125 -------
 .../vecz/test/lit/llvm/diverging_atomic.ll    |   5 +-
 .../vecz/test/lit/llvm/masked_atomics.ll      |  87 +++++
 .../test/lit/llvm/masked_atomics_scalar.ll    |  43 +++
 10 files changed, 694 insertions(+), 135 deletions(-)
 delete mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/printf.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_atomics.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_atomics_scalar.ll

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_context.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_context.h
index 8c119eee3c8da..3d580525c4117 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_context.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_context.h
@@ -23,8 +23,11 @@
 
 #include <llvm/ADT/DenseMap.h>
 #include <llvm/Analysis/IVDescriptors.h>
+#include <llvm/IR/Instructions.h>
+#include <llvm/IR/LLVMContext.h>
 #include <llvm/IR/PassManager.h>
 #include <llvm/IR/ValueHandle.h>
+#include <llvm/Support/AtomicOrdering.h>
 #include <llvm/Support/TypeSize.h>
 #include <llvm/Transforms/Utils/ValueMapper.h>
 #include <multi_llvm/multi_llvm.h>
@@ -150,6 +153,38 @@ class VectorizationContext {
   /// @return The masked version of the function
   llvm::Function *getOrCreateMaskedFunction(llvm::CallInst *CI);
 
+  struct MaskedAtomicRMW {
+    llvm::Type *PointerTy;
+    llvm::Type *ValTy;
+    llvm::AtomicRMWInst::BinOp BinOp;
+    llvm::Align Align;
+    bool IsVolatile = false;
+    llvm::SyncScope::ID SyncScope;
+    llvm::AtomicOrdering Ordering;
+    // Vectorization info
+    llvm::ElementCount VF;
+    bool IsVectorPredicated = false;
+  };
+
+  /// @brief Check if the given function is a masked version of an atomic RMW
+  /// operation.
+  ///
+  /// @param[in] F The function to check
+  /// @return A MaskedAtomicRMW instance detailing the atomic operation if the
+  /// function is a masked atomic RMW, or std::nullopt otherwise
+  std::optional<MaskedAtomicRMW> isMaskedAtomicRMWFunction(
+      const llvm::Function &F) const;
+  /// @brief Get (if it exists already) or create the function representing the
+  /// masked version of an atomic RMW operation.
+  ///
+  /// @param[in] I Atomic to be masked
+  /// @param[in] Choices Choices to mangle into the function name
+  /// @param[in] VF The vectorization factor of the atomic operation
+  /// @return The masked version of the function
+  llvm::Function *getOrCreateMaskedAtomicRMWFunction(
+      MaskedAtomicRMW &I, const VectorizationChoices &Choices,
+      llvm::ElementCount VF);
+
   /// @brief Create a VectorizationUnit to use to vectorize the given scalar
   /// function.
   ///
@@ -157,7 +192,7 @@ class VectorizationContext {
   /// VectorizationContext.
   ///
   /// @param[in] F Function to vectorize.
-  /// @param[in] Width VF vectorization factor to use.
+  /// @param[in] VF vectorization factor to use.
   /// @param[in] Dimension SIMD dimension to use (0 => x, 1 => y, 2 => z).
   /// @param[in] Ch Vectorization Choices for the vectorization.
   VectorizationUnit *createVectorizationUnit(llvm::Function &F,
@@ -258,6 +293,14 @@ class VectorizationContext {
   bool emitSubgroupScanBody(llvm::Function &F, bool IsInclusive,
                             llvm::RecurKind OpKind, bool IsVP) const;
 
+  /// @brief Emit the body for a masked atomic builtin
+  ///
+  /// @param[in] F The empty (declaration only) function to emit the body in
+  /// @param[in] MA The MaskedAtomicRMW information
+  /// @returns true on success, false otherwise
+  bool emitMaskedAtomicRMWBody(llvm::Function &F,
+                               const MaskedAtomicRMW &MA) const;
+
   /// @brief Helper for non-vectorization tasks.
   TargetInfo &VTI;
   /// @brief Module in which the vectorization happens.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_helpers.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_helpers.h
index adba458a067eb..febd373bf5abc 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_helpers.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_helpers.h
@@ -36,11 +36,25 @@ class VectorizationChoices;
 /// @param[in] ScalarName Name of the original function.
 /// @param[in] VF vectorization factor of the vectorized function.
 /// @param[in] Choices choices used for vectorization
+/// @param[in] IsBuiltin True if this is an internal builtin.
 ///
 /// @return Name for the vectorized function.
 std::string getVectorizedFunctionName(llvm::StringRef ScalarName,
                                       llvm::ElementCount VF,
-                                      VectorizationChoices Choices);
+                                      VectorizationChoices Choices,
+                                      bool IsBuiltin = false);
+
+/// @brief Parses a name generated for a vectorized function
+///
+/// @see getVectorizedFunctionName.
+///
+/// @param[in] Name Name of the vectorized function.
+///
+/// @return A tuple containing the original name of the function, and the
+/// element count and choices it was encoded with. Returns std::nullopt on
+/// failure.
+std::optional<std::tuple<std::string, llvm::ElementCount, VectorizationChoices>>
+decodeVectorizedFunctionName(llvm::StringRef Name);
 
 /// @brief Clone the scalar function's body into the function to vectorize,
 /// vectorizing function argument types where required.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
index 874a793d9993d..e16eab41bf069 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
@@ -27,10 +27,13 @@
 #include <llvm/Analysis/ValueTracking.h>
 #include <llvm/IR/BasicBlock.h>
 #include <llvm/IR/CFG.h>
+#include <llvm/IR/DerivedTypes.h>
 #include <llvm/IR/Dominators.h>
 #include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/Instructions.h>
 #include <llvm/Support/Debug.h>
 #include <llvm/Support/Error.h>
+#include <llvm/Support/TypeSize.h>
 #include <llvm/Support/raw_ostream.h>
 
 #include <queue>
@@ -211,6 +214,16 @@ class ControlFlowConversionState::Impl : public ControlFlowConversionState {
   /// @return true if it is valid to mask this call, false otherwise
   bool applyMaskToCall(CallInst *CI, Value *mask, DeletionMap &toDelete);
 
+  /// @brief Attempt to apply a mask to an AtomicRMW instruction via a builtin
+  /// call.
+  ///
+  /// @param[in] atomicI The atomic instruction to apply the mask to
+  /// @param[in] mask The mask to apply to the masked atomic
+  /// @param[out] toDelete mapping of deleted unmasked operations
+  /// @return true if it is valid to mask this atomic, false otherwise
+  bool applyMaskToAtomicRMW(AtomicRMWInst &atomicI, Value *mask,
+                            DeletionMap &toDelete);
+
   /// @brief Linearize a CFG.
   /// @return true if no problem occurred, false otherwise.
   bool partiallyLinearizeCFG();
@@ -1124,9 +1137,12 @@ Error ControlFlowConversionState::Impl::applyMask(BasicBlock &BB, Value *mask) {
         return makeStringError("Could not apply mask to call instruction", I);
       }
     } else if (I.isAtomic() && !isa<FenceInst>(&I)) {
-      // We need to apply masks to atomic functions, but it is currently not
-      // implemented. See CA-3294.
-      return makeStringError("Could not apply mask to atomic instruction", I);
+      // Turn atomics into calls to masked builtins if possible.
+      // FIXME: We don't yet support masked cmpxchg instructions.
+      if (auto *atomicI = dyn_cast<AtomicRMWInst>(&I);
+          !atomicI || !applyMaskToAtomicRMW(*atomicI, mask, toDelete)) {
+        return makeStringError("Could not apply mask to atomic instruction", I);
+      }
     } else if (auto *branch = dyn_cast<BranchInst>(&I)) {
       // We have to be careful with infinite loops, because if they exist on a
       // divergent code path, they will always be entered and will hang the
@@ -1356,6 +1372,45 @@ bool ControlFlowConversionState::Impl::applyMaskToCall(CallInst *CI,
   return true;
 }
 
+bool ControlFlowConversionState::Impl::applyMaskToAtomicRMW(
+    AtomicRMWInst &atomicI, Value *mask, DeletionMap &toDelete) {
+  LLVM_DEBUG(dbgs() << "vecz-cf: Now at AtomicRMWInst " << atomicI << "\n");
+
+  VectorizationContext::MaskedAtomicRMW MA;
+  MA.Align = atomicI.getAlign();
+  MA.BinOp = atomicI.getOperation();
+  MA.IsVectorPredicated = VU.choices().vectorPredication();
+  MA.IsVolatile = atomicI.isVolatile();
+  MA.Ordering = atomicI.getOrdering();
+  MA.SyncScope = atomicI.getSyncScopeID();
+  MA.VF = ElementCount::getFixed(1);
+  MA.ValTy = atomicI.getType();
+  MA.PointerTy = atomicI.getPointerOperand()->getType();
+  // Create the new function and replace the old one with it
+  // Get the masked function
+  Function *newFunction = Ctx.getOrCreateMaskedAtomicRMWFunction(
+      MA, VU.choices(), ElementCount::getFixed(1));
+  VECZ_FAIL_IF(!newFunction);
+  SmallVector<Value *, 8> fnArgs = {atomicI.getPointerOperand(),
+                                    atomicI.getValOperand(), mask};
+  // We don't have a vector length just yet - pass in one as a dummy.
+  if (MA.IsVectorPredicated) {
+    fnArgs.push_back(
+        ConstantInt::get(IntegerType::getInt32Ty(atomicI.getContext()), 1));
+  }
+
+  CallInst *newCI = CallInst::Create(newFunction, fnArgs, "", &atomicI);
+  VECZ_FAIL_IF(!newCI);
+
+  atomicI.replaceAllUsesWith(newCI);
+  toDelete.emplace_back(&atomicI, newCI);
+
+  LLVM_DEBUG(dbgs() << "vecz-cf: Replaced " << atomicI << "\n");
+  LLVM_DEBUG(dbgs() << "          with " << *newCI << "\n");
+
+  return true;
+}
+
 bool ControlFlowConversionState::Impl::partiallyLinearizeCFG() {
   // Two methods are possible to transform the divergent loops into uniform
   // ones:
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
index 8615245f82c4c..9b80bca86e255 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -55,6 +55,7 @@
 #include "memory_operations.h"
 #include "transform/instantiation_pass.h"
 #include "transform/packetization_helpers.h"
+#include "vectorization_context.h"
 #include "vectorization_unit.h"
 #include "vecz/vecz_choices.h"
 #include "vecz/vecz_target_info.h"
@@ -301,6 +302,14 @@ class Packetizer::Impl : public Packetizer {
   ///
   /// @return Packetized instruction.
   ValuePacket packetizeMemOp(MemOp &Op);
+  /// @brief Packetize a masked atomic RMW operation.
+  ///
+  /// @param[in] CI Masked atomic RMW builtin call to packetize.
+  /// @param[in] AtomicInfo Information about the masked atomic RMW.
+  ///
+  /// @return Packetized instruction.
+  ValuePacket packetizeMaskedAtomicRMW(
+      CallInst &CI, VectorizationContext::MaskedAtomicRMW AtomicInfo);
   /// @brief Packetize a GEP instruction.
   ///
   /// @param[in] GEP Instruction to packetize.
@@ -2093,6 +2102,9 @@ ValuePacket Packetizer::Impl::packetizeCall(CallInst *CI) {
         return packetizeMemOp(*MaskedOp);
       }
     }
+    if (auto AtomicInfo = Ctx.isMaskedAtomicRMWFunction(*Callee)) {
+      return packetizeMaskedAtomicRMW(*CI, *AtomicInfo);
+    }
   }
 
   auto const Builtin = Ctx.builtins().analyzeBuiltin(*Callee);
@@ -2766,6 +2778,66 @@ ValuePacket Packetizer::Impl::packetizeMemOp(MemOp &op) {
   return results;
 }
 
+ValuePacket Packetizer::Impl::packetizeMaskedAtomicRMW(
+    CallInst &CI, VectorizationContext::MaskedAtomicRMW AtomicInfo) {
+  ValuePacket results;
+
+  Value *const ptr = CI.getArgOperand(0);
+  Value *const val = CI.getArgOperand(1);
+  Value *const mask = CI.getArgOperand(2);
+
+  assert(AtomicInfo.ValTy == val->getType() && "AtomicInfo mismatch");
+  auto const packetWidth = getPacketWidthForType(val->getType());
+
+  if (VL && packetWidth != 1) {
+    emitVeczRemarkMissed(&F, &CI,
+                         "Can not vector-predicate packets larger than 1");
+    return {};
+  }
+
+  ValuePacket valPacket;
+  Result valResult = packetize(val);
+  PACK_FAIL_IF(!valResult);
+  valResult.getPacketValues(packetWidth, valPacket);
+  PACK_FAIL_IF(valPacket.empty());
+
+  ValuePacket ptrPacket;
+  Result ptrResult = packetize(ptr);
+  PACK_FAIL_IF(!ptrResult);
+  ptrResult.getPacketValues(packetWidth, ptrPacket);
+  PACK_FAIL_IF(ptrPacket.empty());
+
+  ValuePacket maskPacket;
+  Result maskResult = packetize(mask);
+  PACK_FAIL_IF(!maskResult);
+  maskResult.getPacketValues(packetWidth, maskPacket);
+  PACK_FAIL_IF(maskPacket.empty());
+
+  IRBuilder<> B(&CI);
+  IC.deleteInstructionLater(&CI);
+
+  for (unsigned i = 0; i != packetWidth; ++i) {
+    auto *const ptrI = ptrPacket[i];
+    auto *const valI = valPacket[i];
+
+    AtomicInfo.ValTy = valI->getType();
+    AtomicInfo.PointerTy = ptrI->getType();
+    auto *maskedAtomicF =
+        Ctx.getOrCreateMaskedAtomicRMWFunction(AtomicInfo, Choices, SimdWidth);
+    PACK_FAIL_IF(!maskedAtomicF);
+
+    SmallVector<Value *, 4> args = {ptrI, valI, maskPacket[i]};
+    if (AtomicInfo.IsVectorPredicated) {
+      assert(VL && "Missing vector length");
+      args.push_back(VL);
+    }
+
+    results.push_back(B.CreateCall(maskedAtomicF, args));
+  }
+
+  return results;
+}
+
 void Packetizer::Impl::vectorizeDI(Instruction *, Value *) {
   // FIXME: Reinstate support for vectorizing debug info
   return;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
index e4fbbf4d67d4a..821d62936f576 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
@@ -22,11 +22,17 @@
 #include <compiler/utils/pass_functions.h>
 #include <llvm/ADT/Statistic.h>
 #include <llvm/IR/Attributes.h>
+#include <llvm/IR/BasicBlock.h>
+#include <llvm/IR/Constants.h>
+#include <llvm/IR/InstrTypes.h>
+#include <llvm/IR/Instructions.h>
+#include <llvm/Support/AtomicOrdering.h>
 #include <llvm/Target/TargetMachine.h>
 #include <multi_llvm/vector_type_helper.h>
 
 #include <algorithm>
 #include <cassert>
+#include <optional>
 
 #include "analysis/vectorization_unit_analysis.h"
 #include "debugging.h"
@@ -368,6 +374,227 @@ Function *VectorizationContext::getOrCreateMaskedFunction(CallInst *CI) {
   return newFunction;
 }
 
+// Decodes a masked atomicrmw builtin name back into its MaskedAtomicRMW
+// description, or returns std::nullopt if F's name does not match the scheme.
+std::optional<VectorizationContext::MaskedAtomicRMW>
+VectorizationContext::isMaskedAtomicRMWFunction(const Function &F) const {
+  auto VFInfo = decodeVectorizedFunctionName(F.getName());
+  if (!VFInfo) {
+    return std::nullopt;
+  }
+  auto [FnNameStr, VF, Choices] = *VFInfo;
+
+  llvm::StringRef FnName = FnNameStr;
+  if (!FnName.consume_front("masked_atomicrmw_")) {
+    return std::nullopt;
+  }
+  VectorizationContext::MaskedAtomicRMW AtomicInfo;
+  AtomicInfo.VF = VF;
+  AtomicInfo.IsVectorPredicated = Choices.vectorPredication();
+  AtomicInfo.IsVolatile = FnName.consume_front("volatile_");
+
+  if (FnName.consume_front("xchg")) {
+    AtomicInfo.BinOp = AtomicRMWInst::BinOp::Xchg;
+  } else if (FnName.consume_front("add")) {
+    AtomicInfo.BinOp = AtomicRMWInst::BinOp::Add;
+  } else if (FnName.consume_front("sub")) {
+    AtomicInfo.BinOp = AtomicRMWInst::BinOp::Sub;
+  } else if (FnName.consume_front("and")) {
+    AtomicInfo.BinOp = AtomicRMWInst::BinOp::And;
+  } else if (FnName.consume_front("nand")) {
+    AtomicInfo.BinOp = AtomicRMWInst::BinOp::Nand;
+  } else if (FnName.consume_front("or")) {
+    AtomicInfo.BinOp = AtomicRMWInst::BinOp::Or;
+  } else if (FnName.consume_front("xor")) {
+    AtomicInfo.BinOp = AtomicRMWInst::BinOp::Xor;
+  } else if (FnName.consume_front("max")) {
+    AtomicInfo.BinOp = AtomicRMWInst::BinOp::Max;
+  } else if (FnName.consume_front("min")) {
+    AtomicInfo.BinOp = AtomicRMWInst::BinOp::Min;
+  } else if (FnName.consume_front("umax")) {
+    AtomicInfo.BinOp = AtomicRMWInst::BinOp::UMax;
+  } else if (FnName.consume_front("umin")) {
+    AtomicInfo.BinOp = AtomicRMWInst::BinOp::UMin;
+  } else if (FnName.consume_front("fadd")) {
+    AtomicInfo.BinOp = AtomicRMWInst::BinOp::FAdd;
+  } else if (FnName.consume_front("fsub")) {
+    AtomicInfo.BinOp = AtomicRMWInst::BinOp::FSub;
+  } else if (FnName.consume_front("fmax")) {
+    AtomicInfo.BinOp = AtomicRMWInst::BinOp::FMax;
+  } else if (FnName.consume_front("fmin")) {
+    AtomicInfo.BinOp = AtomicRMWInst::BinOp::FMin;
+  } else if (FnName.consume_front("uincwrap")) {
+    AtomicInfo.BinOp = AtomicRMWInst::BinOp::UIncWrap;
+  } else if (FnName.consume_front("udecwrap")) {
+    AtomicInfo.BinOp = AtomicRMWInst::BinOp::UDecWrap;
+  } else {
+    return std::nullopt;
+  }
+
+  if (!FnName.consume_front("_align")) {
+    return std::nullopt;
+  }
+
+  // llvm::Align only accepts a non-zero power of two; reject anything else.
+  uint64_t Alignment = 0;
+  if (FnName.consumeInteger(/*Radix=*/10, Alignment) || Alignment == 0 ||
+      (Alignment & (Alignment - 1)) != 0) {
+    return std::nullopt;
+  }
+  AtomicInfo.Align = Align(Alignment);
+
+  if (!FnName.consume_front("_")) {
+    return std::nullopt;
+  }
+
+  if (FnName.consume_front("acquire")) {
+    AtomicInfo.Ordering = AtomicOrdering::Acquire;
+  } else if (FnName.consume_front("acqrel")) {
+    AtomicInfo.Ordering = AtomicOrdering::AcquireRelease;
+  } else if (FnName.consume_front("monotonic")) {
+    AtomicInfo.Ordering = AtomicOrdering::Monotonic;
+  } else if (FnName.consume_front("notatomic")) {
+    AtomicInfo.Ordering = AtomicOrdering::NotAtomic;
+  } else if (FnName.consume_front("release")) {
+    AtomicInfo.Ordering = AtomicOrdering::Release;
+  } else if (FnName.consume_front("seqcst")) {
+    AtomicInfo.Ordering = AtomicOrdering::SequentiallyConsistent;
+  } else if (FnName.consume_front("unordered")) {
+    AtomicInfo.Ordering = AtomicOrdering::Unordered;
+  } else {
+    return std::nullopt;
+  }
+
+  if (!FnName.consume_front("_")) {
+    return std::nullopt;
+  }
+
+  unsigned SyncScopeID = 0;
+  if (FnName.consumeInteger(/*Radix=*/10, SyncScopeID)) {
+    return std::nullopt;
+  }
+  AtomicInfo.SyncScope = static_cast<SyncScope::ID>(SyncScopeID);
+
+  if (!FnName.consume_front("_")) {
+    return std::nullopt;
+  }
+
+  // Note - we just assume the rest of the builtin name is okay, here. It
+  // should be mangled types, but vecz builtins use a strange mangling system,
+  // purely for uniqueness and not to infer types. Types are always assumed to
+  // be inferrable from the function parameters.
+  AtomicInfo.PointerTy = F.getFunctionType()->getParamType(0);
+  AtomicInfo.ValTy = F.getFunctionType()->getParamType(1);
+
+  return AtomicInfo;
+}
+
+// Returns the internal builtin implementing the masked atomicrmw described by
+// I, Choices and VF, or nullptr for AtomicRMWInst::BAD_BINOP. The builtin's
+// name encodes the operation in the form isMaskedAtomicRMWFunction parses.
+Function *VectorizationContext::getOrCreateMaskedAtomicRMWFunction(
+    MaskedAtomicRMW &I, const VectorizationChoices &Choices, ElementCount VF) {
+  LLVMContext &ctx = I.ValTy->getContext();
+
+  SmallVector<Type *, 8> argTys;
+  argTys.push_back(I.PointerTy);
+  argTys.push_back(I.ValTy);
+  // Add one extra argument for the mask, which is always the same length
+  // (scalar or vector) as the value type.
+  Type *maskTy = Type::getInt1Ty(ctx);
+  if (auto *const valVecTy = dyn_cast<VectorType>(I.ValTy)) {
+    maskTy = VectorType::get(maskTy, valVecTy->getElementCount());
+  }
+  argTys.push_back(maskTy);
+  // Vector-predicated builtins take one more trailing i32 argument.
+  if (Choices.vectorPredication()) {
+    argTys.push_back(Type::getInt32Ty(ctx));
+  }
+
+  std::string maskedFnName;
+  raw_string_ostream O(maskedFnName);
+  O << "masked_atomicrmw_";
+  if (I.IsVolatile) {
+    O << "volatile_";
+  }
+
+#define BINOP_CASE(BINOP, STR) \
+  case AtomicRMWInst::BINOP:   \
+    O << (STR);                \
+    break
+
+  switch (I.BinOp) {
+    BINOP_CASE(Xchg, "xchg");
+    BINOP_CASE(Add, "add");
+    BINOP_CASE(Sub, "sub");
+    BINOP_CASE(And, "and");
+    BINOP_CASE(Nand, "nand");
+    BINOP_CASE(Or, "or");
+    BINOP_CASE(Xor, "xor");
+    BINOP_CASE(Max, "max");
+    BINOP_CASE(Min, "min");
+    BINOP_CASE(UMax, "umax");
+    BINOP_CASE(UMin, "umin");
+    BINOP_CASE(FAdd, "fadd");
+    BINOP_CASE(FSub, "fsub");
+    BINOP_CASE(FMax, "fmax");
+    BINOP_CASE(FMin, "fmin");
+    BINOP_CASE(UIncWrap, "uincwrap");
+    BINOP_CASE(UDecWrap, "udecwrap");
+    case llvm::AtomicRMWInst::BAD_BINOP:
+      return nullptr;
+  }
+
+#undef BINOP_CASE
+
+  O << "_align" << I.Align.value() << "_";
+  // Mangle ordering
+  switch (I.Ordering) {
+    default:
+      O << static_cast<unsigned>(I.Ordering);
+      break;
+    case AtomicOrdering::Acquire:
+      O << "acquire";
+      break;
+    case AtomicOrdering::AcquireRelease:
+      O << "acqrel";
+      break;
+    case AtomicOrdering::Monotonic:
+      O << "monotonic";
+      break;
+    case AtomicOrdering::NotAtomic:
+      O << "notatomic";
+      break;
+    case AtomicOrdering::Release:
+      O << "release";
+      break;
+    case AtomicOrdering::SequentiallyConsistent:
+      O << "seqcst";
+      break;
+    case AtomicOrdering::Unordered:
+      O << "unordered";
+      break;
+  }
+  // Syncscope
+  O << "_" << static_cast<unsigned>(I.SyncScope) << "_";
+
+  // Mangle types
+  compiler::utils::NameMangler mangler(&ctx);
+  for (auto *ty : argTys) {
+    VECZ_FAIL_IF(!mangler.mangleType(
+        O, ty,
+        compiler::utils::TypeQualifiers(compiler::utils::eTypeQualNone)));
+  }
+
+  maskedFnName =
+      getVectorizedFunctionName(maskedFnName, VF, Choices, /*IsBuiltin=*/true);
+  // Create the function type
+  FunctionType *maskedFnTy =
+      FunctionType::get(I.ValTy, argTys, /*isVarArg=*/false);
+
+  return getOrCreateInternalBuiltin(maskedFnName, maskedFnTy);
+}
+
 namespace {
 std::optional<std::tuple<bool, RecurKind, bool>> isSubgroupScan(
     StringRef fnName, Type *const ty) {
@@ -460,6 +687,10 @@ bool VectorizationContext::defineInternalBuiltin(Function *F) {
     return emitSubgroupScanBody(*F, isInclusive, opKind, isVP);
   }
 
+  if (auto AtomicInfo = isMaskedAtomicRMWFunction(*F)) {
+    return emitMaskedAtomicRMWBody(*F, *AtomicInfo);
+  }
+
   return false;
 }
 
@@ -777,6 +1008,106 @@ bool VectorizationContext::emitSubgroupScanBody(Function &F, bool IsInclusive,
   return true;
 }
 
+// Emits the body of the masked atomicrmw builtin F. Through
+// compiler::utils::createLoop, each lane with a set mask bit performs the
+// atomicrmw described by MA; other lanes keep the carried-in (poison) value.
+bool VectorizationContext::emitMaskedAtomicRMWBody(
+    Function &F, const VectorizationContext::MaskedAtomicRMW &MA) const {
+  LLVMContext &Ctx = F.getContext();
+  auto *const EntryBB = BasicBlock::Create(Ctx, "entry", &F);
+  auto *const ExitBB = BasicBlock::Create(Ctx, "exit", &F);
+
+  auto *const PtrArg = F.getArg(0);
+  auto *const ValArg = F.getArg(1);
+  Value *MaskArg = F.getArg(2);
+
+  const bool IsVector = ValArg->getType()->isVectorTy();
+
+  IRBuilder<> B(EntryBB);
+  Value *const IdxStart = B.getInt32(0);
+  ConstantInt *const KnownMin = B.getInt32(MA.VF.getKnownMinValue());
+  Value *IdxEnd = !MA.VF.isScalable() ? KnownMin : B.CreateVScale(KnownMin);
+
+  // For vector-predicated masked atomics, we have to merge the incoming mask
+  // with a mask corresponding to the number of elements left active by the
+  // runtime vector length.
+  if (MA.IsVectorPredicated) {
+    auto *const VL = F.getArg(3);
+    auto *const IndexTy = VectorType::get(VL->getType(), MA.VF);
+    auto *const step = B.CreateStepVector(IndexTy);
+    auto *const VLMask = B.CreateICmpULT(step, B.CreateVectorSplat(MA.VF, VL));
+    MaskArg = B.CreateAnd(MaskArg, VLMask);
+  }
+
+  Value *RetVal = nullptr;
+  // Loop body: for lane Idx, run the atomic only if that lane's mask is set.
+  auto CreateLoopBody = [&MA, &F, &ExitBB, PtrArg, ValArg, MaskArg, &RetVal,
+                         IsVector](
+                            BasicBlock *BB, Value *Idx, ArrayRef<Value *> IVs,
+                            MutableArrayRef<Value *> IVsNext) -> BasicBlock * {
+    IRBuilder<> IRB(BB);
+
+    Value *MaskElt = MaskArg;
+    if (IsVector) {
+      MaskElt = IRB.CreateExtractElement(MaskArg, Idx, "mask");
+    }
+    auto *const MaskCmp =
+        IRB.CreateICmpNE(MaskElt, IRB.getInt1(false), "mask.cmp");
+
+    auto *const IfBB = BasicBlock::Create(F.getContext(), "if.then", &F);
+    auto *const ElseBB = BasicBlock::Create(F.getContext(), "if.else", &F);
+
+    IRB.CreateCondBr(MaskCmp, IfBB, ElseBB);
+    // if.then: perform the atomic for this lane and record its result.
+    {
+      IRB.SetInsertPoint(IfBB);
+      Value *Ptr = PtrArg;
+      Value *Val = ValArg;
+      if (IsVector) {
+        Ptr = IRB.CreateExtractElement(PtrArg, Idx, "ptr");
+        Val = IRB.CreateExtractElement(ValArg, Idx, "val");
+      }
+      auto *const AtomicRMW = IRB.CreateAtomicRMW(MA.BinOp, Ptr, Val, MA.Align,
+                                                  MA.Ordering, MA.SyncScope);
+      AtomicRMW->setVolatile(MA.IsVolatile);
+
+      if (IsVector) {
+        RetVal = IRB.CreateInsertElement(IVs[0], AtomicRMW, Idx, "retvec");
+      } else {
+        RetVal = AtomicRMW;
+      }
+
+      IRB.CreateBr(ElseBB);
+    }
+    // if.else: merge the updated result with the one carried in unchanged.
+    {
+      IRB.SetInsertPoint(ElseBB);
+      auto *MergePhi = IRB.CreatePHI(RetVal->getType(), 2, "merge");
+      MergePhi->addIncoming(IVs[0], BB);
+      MergePhi->addIncoming(RetVal, IfBB);
+      RetVal = MergePhi;
+    }
+    IVsNext[0] = RetVal;
+
+    // Move the exit block right to the end of the function.
+    ExitBB->moveAfter(ElseBB);
+
+    return ElseBB;
+  };
+  // The loop-carried result starts out as poison.
+  compiler::utils::CreateLoopOpts Opts;
+  {
+    Opts.IVs.push_back(PoisonValue::get(MA.ValTy));
+    Opts.loopIVNames.push_back("retvec.prev");
+  }
+  compiler::utils::createLoop(EntryBB, ExitBB, IdxStart, IdxEnd, Opts,
+                              CreateLoopBody);
+  // Return the merged result left in RetVal by the loop body.
+  B.SetInsertPoint(ExitBB);
+  B.CreateRet(RetVal);
+  return true;
+}
+
 Function *VectorizationContext::getInternalVectorEquivalent(
     Function *ScalarFn, unsigned SimdWidth) {
   // Handle masked memory loads and stores.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_helpers.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_helpers.cpp
index 0385dec201531..b65a9c793f704 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_helpers.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_helpers.cpp
@@ -22,8 +22,11 @@
 #include <llvm/IR/DIBuilder.h>
 #include <llvm/IR/IntrinsicInst.h>
 #include <llvm/Support/Debug.h>
+#include <llvm/Support/TypeSize.h>
 #include <llvm/Transforms/Utils/Cloning.h>
 
+#include <optional>
+
 #include "debugging.h"
 #include "vectorization_context.h"
 #include "vectorization_unit.h"
@@ -146,14 +149,49 @@ SmallVector<Instruction *, 2> createArgumentPlaceholders(
 
 namespace vecz {
 std::string getVectorizedFunctionName(StringRef ScalarName, ElementCount VF,
-                                      VectorizationChoices Choices) {
+                                      VectorizationChoices Choices,
+                                      bool IsBuiltin) {
   Twine Prefix = Twine(VF.isScalable() ? "nxv" : "v");
   Twine IsVP = Twine(Choices.vectorPredication() ? "_vp_" : "_");
-  return (Twine("__vecz_") + Prefix + Twine(VF.getKnownMinValue()) + IsVP +
-          ScalarName)
+  return ((IsBuiltin ? VectorizationContext::InternalBuiltinPrefix
+                     : Twine("__vecz_")) +
+          Prefix + Twine(VF.getKnownMinValue()) + IsVP + ScalarName)
       .str();
 }
 
+// Inverse of getVectorizedFunctionName: splits Name into the scalar name,
+// vectorization factor and choices, or returns std::nullopt on a mismatch.
+std::optional<std::tuple<std::string, ElementCount, VectorizationChoices>>
+decodeVectorizedFunctionName(StringRef Name) {
+  const bool HasPrefix =
+      Name.consume_front(VectorizationContext::InternalBuiltinPrefix) ||
+      Name.consume_front("__vecz_");
+  if (!HasPrefix) {
+    return std::nullopt;
+  }
+
+  // "nxv" introduces a scalable factor, "v" a fixed one.
+  const bool IsScalable = Name.consume_front("nxv");
+  if (!IsScalable && !Name.consume_front("v")) {
+    return std::nullopt;
+  }
+
+  unsigned MinLanes = 0;
+  if (Name.consumeInteger(10, MinLanes)) {
+    return std::nullopt;
+  }
+  const ElementCount Factor = ElementCount::get(MinLanes, IsScalable);
+
+  // "_vp_" selects vector predication; otherwise a bare "_" is required.
+  VectorizationChoices Opts;
+  if (Name.consume_front("_vp_")) {
+    Opts.enableVectorPredication();
+  } else if (!Name.consume_front("_")) {
+    return std::nullopt;
+  }
+  return std::make_tuple(Name.str(), Factor, Opts);
+}
+
 Function *cloneFunctionToVector(VectorizationUnit const &VU) {
   auto *const VectorizedFn = declareFunction(VU);
   VECZ_ERROR_IF(!VectorizedFn, "declareFunction failed to initialize");
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/printf.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/printf.ll
deleted file mode 100644
index d3a4c0600cf14..0000000000000
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/printf.ll
+++ /dev/null
@@ -1,125 +0,0 @@
-; Copyright (C) Codeplay Software Limited
-;
-; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
-; Exceptions; you may not use this file except in compliance with the License.
-; You may obtain a copy of the License at
-;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
-;
-; Unless required by applicable law or agreed to in writing, software
-; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-; License for the specific language governing permissions and limitations
-; under the License.
-;
-; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-; TODO(CA-1981): Using `not` in qemu does not work.
-; REQUIRES: native
-; RUN: not veczc -k printf_add -vecz-simd-width=4 -S -vecz-passes=cfg-convert -vecz-choices=LinearizeBOSCC < %s 2>&1 | FileCheck %s
-
-; This test just checks that we don't crash while converting the control flow.
-; LinearizeBOSCC would leave behind an invalid function when control flow fails
-; some time afterwards. This could trigger verification failures or crashes
-; depending on which passes were run later.
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "spir64-unknown-unknown"
-
-define spir_kernel void @printf_add(i32 addrspace(1)* %in1, i32 addrspace(1)* %in2, i32 addrspace(1)* %out, i32 addrspace(1)* %status, i8 addrspace(1)* %x) {
-entry:
-  %in1.addr = alloca i32 addrspace(1)*, align 8
-  %in2.addr = alloca i32 addrspace(1)*, align 8
-  %out.addr = alloca i32 addrspace(1)*, align 8
-  %status.addr = alloca i32 addrspace(1)*, align 8
-  %tid = alloca i64, align 8
-  %sum = alloca i32, align 4
-  store i32 addrspace(1)* %in1, i32 addrspace(1)** %in1.addr, align 8
-  store i32 addrspace(1)* %in2, i32 addrspace(1)** %in2.addr, align 8
-  store i32 addrspace(1)* %out, i32 addrspace(1)** %out.addr, align 8
-  store i32 addrspace(1)* %status, i32 addrspace(1)** %status.addr, align 8
-  %call = call i64 @__mux_get_global_id(i32 0) #4
-  store i64 %call, i64* %tid, align 8
-  %0 = load i32 addrspace(1)*, i32 addrspace(1)** %in1.addr, align 8
-  %1 = load i64, i64* %tid, align 8
-  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 %1
-  %2 = load i32, i32 addrspace(1)* %arrayidx, align 4
-  %3 = load i32 addrspace(1)*, i32 addrspace(1)** %in2.addr, align 8
-  %4 = load i64, i64* %tid, align 8
-  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %3, i64 %4
-  %5 = load i32, i32 addrspace(1)* %arrayidx1, align 4
-  %add = add nsw i32 %2, %5
-  store i32 %add, i32* %sum, align 4
-  %6 = load i32, i32* %sum, align 4
-  %7 = load i32 addrspace(1)*, i32 addrspace(1)** %out.addr, align 8
-  %8 = load i64, i64* %tid, align 8
-  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %7, i64 %8
-  store i32 %6, i32 addrspace(1)* %arrayidx2, align 4
-  %9 = load i64, i64* %tid, align 8
-  %conv = trunc i64 %9 to i32
-  %10 = load i32, i32* %sum, align 4
-  %11 = call i64 @__mux_get_num_groups(i32 0)
-  %12 = trunc i64 %11 to i32
-  %13 = call i64 @__mux_get_num_groups(i32 1)
-  %14 = trunc i64 %13 to i32
-  %15 = call i64 @__mux_get_num_groups(i32 2)
-  %16 = trunc i64 %15 to i32
-  %17 = call i64 @__mux_get_group_id(i32 0)
-  %18 = trunc i64 %17 to i32
-  %19 = call i64 @__mux_get_group_id(i32 1)
-  %20 = trunc i64 %19 to i32
-  %21 = call i64 @__mux_get_group_id(i32 2)
-  %22 = trunc i64 %21 to i32
-  %23 = mul i32 %12, %20
-  %24 = mul i32 %14, %16
-  %25 = mul i32 %22, %24
-  %26 = add i32 %23, %25
-  %27 = add i32 %18, %26
-  %28 = mul i32 %14, %16
-  %29 = mul i32 %12, %28
-  %30 = udiv i32 1048576, %29
-  %31 = and i32 %30, -4
-  %32 = mul i32 %27, %31
-  %33 = getelementptr i8, i8 addrspace(1)* %x, i32 %32
-  %34 = bitcast i8 addrspace(1)* %33 to i32 addrspace(1)*
-  %35 = bitcast i8 addrspace(1)* %33 to i32 addrspace(1)*
-  %36 = atomicrmw add i32 addrspace(1)* %35, i32 12 acq_rel
-  %37 = add i32 %36, 12
-  %38 = icmp ugt i32 %37, %31
-  br i1 %38, label %early_return.i, label %store.i
-
-early_return.i:                                   ; preds = %entry
-  %39 = bitcast i8 addrspace(1)* %33 to i32 addrspace(1)*
-  %40 = getelementptr i32, i32 addrspace(1)* %39, i32 1
-  %41 = atomicrmw add i32 addrspace(1)* %40, i32 12 acq_rel
-  br label %.exit
-
-store.i:                                          ; preds = %entry
-  %42 = getelementptr i8, i8 addrspace(1)* %33, i32 %36
-  %43 = bitcast i8 addrspace(1)* %42 to i32 addrspace(1)*
-  store i32 0, i32 addrspace(1)* %43, align 1
-  %44 = add i32 %36, 4
-  %45 = getelementptr i8, i8 addrspace(1)* %33, i32 %44
-  %46 = bitcast i8 addrspace(1)* %45 to i32 addrspace(1)*
-  store i32 %conv, i32 addrspace(1)* %46, align 1
-  %47 = add i32 %36, 8
-  %48 = getelementptr i8, i8 addrspace(1)* %33, i32 %47
-  %49 = bitcast i8 addrspace(1)* %48 to i32 addrspace(1)*
-  store i32 %10, i32 addrspace(1)* %49, align 1
-  br label %.exit
-
-.exit:                                            ; preds = %store.i, %early_return.i
-  %call31 = phi i32 [ -1, %early_return.i ], [ 0, %store.i ]
-  %50 = load i32 addrspace(1)*, i32 addrspace(1)** %status.addr, align 8
-  %51 = load i64, i64* %tid, align 8
-  %arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %50, i64 %51
-  store i32 %call31, i32 addrspace(1)* %arrayidx4, align 4
-  ret void
-}
-
-declare i64 @__mux_get_global_id(i32)
-declare i64 @__mux_get_group_id(i32)
-declare i64 @__mux_get_num_groups(i32)
-
-; We can't vectorize this control flow
-; CHECK: Error: Failed to vectorize function 'printf_add'
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_atomic.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_atomic.ll
index de4501013763a..b6beaae1e47c2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_atomic.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_atomic.ll
@@ -22,7 +22,7 @@ target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 ; CHECK: Vecz: Could not apply masks for function "kernel"
 ; CHECK-NEXT: note: Could not apply mask to atomic instruction
-; CHECK-SAME:  %atomic = atomicrmw add ptr %arrayidx.in, i32 2 monotonic, align 4
+; CHECK-SAME:  atomic_success = cmpxchg ptr %arrayidx.in, i32 2, i32 4 acq_rel monotonic, align 4
 
 define spir_kernel void @kernel(ptr %in, ptr %out) {
 entry:
@@ -32,7 +32,8 @@ entry:
 
 if.then:
   %arrayidx.in = getelementptr inbounds i32, ptr %in, i64 %gid
-  %atomic = atomicrmw add ptr %arrayidx.in, i32 2 monotonic, align 4
+  %atomic_success = cmpxchg ptr %arrayidx.in, i32 2, i32 4 acq_rel monotonic, align 4
+  %atomic = extractvalue { i32, i1 } %atomic_success, 0
   br label %end
 
 end:
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_atomics.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_atomics.ll
new file mode 100644
index 0000000000000..2f11e37c275c2
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_atomics.ll
@@ -0,0 +1,87 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -w 4 -vecz-passes=cfg-convert,verify,packetizer,define-builtins,verify -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; CHECK: define spir_kernel void @__vecz_v4_test_fn(ptr %p)
+define spir_kernel void @test_fn(ptr %p) {
+entry:
+; CHECK: [[SPLAT_PTR_INS:%.*]] = insertelement <4 x ptr> poison, ptr %p, i64 0
+; CHECK: [[SPLAT_PTR:%.*]] = shufflevector <4 x ptr> [[SPLAT_PTR_INS]], <4 x ptr> poison, <4 x i32> zeroinitializer
+; CHECK: [[CMP:%.*]] = icmp sgt <4 x i64> <i64 3, i64 3, i64 3, i64 3>, 
+  %call = call i64 @__mux_get_global_id(i32 0)
+  %cmp = icmp sgt i64 3, %call
+; CHECK: [[VEC_PTR:%.*]] = getelementptr i32, ptr %p, <4 x i64>
+  %wi_p_i32 = getelementptr i32, ptr %p, i64 %call
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+; CHECK: = call <4 x i32> @__vecz_b_v4_masked_atomicrmw_add_align4_acquire_1_Dv4_u3ptrDv4_jDv4_b(
+; CHECK-SAME: <4 x ptr> [[SPLAT_PTR]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i1> [[CMP]]
+  %old0 = atomicrmw add ptr %p, i32 1 acquire
+; CHECK: = call <4 x i32> @__vecz_b_v4_masked_atomicrmw_add_align4_acquire_1_Dv4_u3ptrDv4_jDv4_b(
+; CHECK-SAME: <4 x ptr> [[VEC_PTR]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i1> [[CMP]]
+  %old1 = atomicrmw add ptr %wi_p_i32, i32 1 acquire
+; CHECK: = call <4 x i32> @__vecz_b_v4_masked_atomicrmw_umin_align2_monotonic_1_Dv4_u3ptrDv4_jDv4_b(
+; CHECK-SAME: <4 x ptr> [[VEC_PTR]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i1> [[CMP]]
+  %old2 = atomicrmw umin ptr %wi_p_i32, i32 1 monotonic, align 2
+; CHECK: = call <4 x float> @__vecz_b_v4_masked_atomicrmw_volatile_fmax_align4_seqcst_0_Dv4_u3ptrDv4_fDv4_b(
+; CHECK-SAME: <4 x ptr> [[VEC_PTR]], <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x i1> [[CMP]]
+  %old3 = atomicrmw volatile fmax ptr %wi_p_i32, float 1.0 syncscope("singlethread") seq_cst
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void
+}
+
+; CHECK: define <4 x i32> @__vecz_b_v4_masked_atomicrmw_add_align4_acquire_1_Dv4_u3ptrDv4_jDv4_b(<4 x ptr> [[PTRS:%0]], <4 x i32> [[VALS:%1]], <4 x i1> [[MASK:%2]]) {
+; CHECK: entry:
+; CHECK: br label %loopIR
+
+; CHECK: loopIR:
+; CHECK: [[IDX:%.*]] = phi i32 [ 0, %entry ], [ [[IDX_NEXT:%.*]], %if.else ]
+; CHECK: [[PREV:%.*]] = phi <4 x i32> [ poison, %entry ], [ [[MERGE:%.*]], %if.else ]
+; CHECK: [[MASKELT:%.*]] = extractelement <4 x i1> [[MASK]], i32 [[IDX]]
+; CHECK: [[MASKCMP:%.*]] = icmp ne i1 [[MASKELT]], false
+; CHECK: br i1 [[MASKCMP]], label %if.then, label %if.else
+
+; CHECK: if.then:
+; CHECK: [[PTR:%.*]] = extractelement <4 x ptr> [[PTRS]], i32 [[IDX]]
+; CHECK: [[VAL:%.*]] = extractelement <4 x i32> [[VALS]], i32 [[IDX]]
+; CHECK: [[ATOM:%.*]] = atomicrmw add ptr [[PTR]], i32 [[VAL]] acquire, align 4
+; CHECK: [[RET:%.*]] = insertelement <4 x i32> [[PREV]], i32 [[ATOM]], i32 [[IDX]]
+; CHECK: br label %if.else
+
+; CHECK: if.else:
+; CHECK: [[MERGE]] = phi <4 x i32> [ [[PREV]], %loopIR ], [ [[RET]], %if.then ]
+; CHECK: [[IDX_NEXT]] = add i32 [[IDX]], 1
+
+; CHECK: exit:
+; CHECK: ret <4 x i32> [[MERGE]]
+
+; Assume that all masked atomicrmw operations follow the logic above. Just
+; check that the right atomicrmw instruction is being generated.
+; CHECK: define <4 x i32> @__vecz_b_v4_masked_atomicrmw_umin_align2_monotonic_1_Dv4_u3ptrDv4_jDv4_b(<4 x ptr> [[PTRS:%0]], <4 x i32> [[VALS:%1]], <4 x i1> [[MASK:%2]]) {
+; CHECK: atomicrmw umin ptr {{%.*}}, i32 {{%.*}} monotonic, align 2
+
+
+; CHECK: define <4 x float> @__vecz_b_v4_masked_atomicrmw_volatile_fmax_align4_seqcst_0_Dv4_u3ptrDv4_fDv4_b(<4 x ptr> [[PTRS:%0]], <4 x float> [[VALS:%1]], <4 x i1> [[MASK:%2]]) {
+; CHECK: atomicrmw volatile fmax ptr {{%.*}}, float {{%.*}} syncscope("singlethread") seq_cst, align 4
+
+declare i64 @__mux_get_global_id(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_atomics_scalar.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_atomics_scalar.ll
new file mode 100644
index 0000000000000..6cab589dd89f8
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_atomics_scalar.ll
@@ -0,0 +1,43 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -vecz-passes=define-builtins,verify -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test_fn(ptr %p) {
+  %ret = call i32 @__vecz_b_v1_masked_atomicrmw_add_align4_acquire_1_u3ptrjb(ptr %p, i32 1, i1 true)
+  ret void
+}
+
+declare i32 @__vecz_b_v1_masked_atomicrmw_add_align4_acquire_1_u3ptrjb(ptr %p, i32 %val, i1 %mask)
+
+; CHECK: define i32 @__vecz_b_v1_masked_atomicrmw_add_align4_acquire_1_u3ptrjb(ptr %p, i32 %val, i1 %mask) {
+; CHECK: entry:
+; CHECK: [[MASKCMP:%.*]] = icmp ne i1 %mask, false
+; CHECK: br i1 [[MASKCMP]], label %if.then, label %if.else
+
+; CHECK: if.then:
+; CHECK: [[ATOM:%.*]] = atomicrmw add ptr %p, i32 %val acquire, align 4
+; CHECK: br label %if.else
+
+; CHECK: if.else:
+; CHECK: [[RET:%.*]] = phi i32 [ poison, %entry ], [ [[ATOM]], %if.then ]
+; CHECK: br label %exit
+
+; CHECK: exit:
+; CHECK: ret i32 [[RET]]

From 27cc2d9d1b0858f9e4a5f8039bfa4a8d8bb09513 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Wed, 20 Dec 2023 08:52:53 +0000
Subject: [PATCH 072/182] [compiler] Handle scalable structs as barrier live
 variables

The work-item loops pass would crash when faced with a live variable
whose type was a struct containing scalable vectors.

We aren't legally allowed to store a struct type containing a mixture of
scalable and fixed types (`{ <vscale x 1 x i8>, i8 }`) so we decompose
such types into their constituent elements and store each individually.
Note that scalable elements are stored in the scalable part of the live
variables struct, and fixed elements are stored in the fixed part; in
that way they are treated as if they were never a struct to begin with.

Note that there may be a future optimization possible here where we
store all-scalable structs "whole", but this isn't currently a priority.

Note that this problem doesn't currently surface in the default pipeline
in the main branch, because we only end up with scalable vectors when we
vectorize as such, and we don't currently scalably vectorize any IR
that's known to contain struct types, at least not in a way that creates
a struct containing scalable vectors; see the new negative scalable vecz
test as an example.

The plan is to start allowing this when we improve the vectorization of
`cmpxchg` instructions. This should also improve the codegen for these
structures; see the new fixed-length vecz test for an example of the
poor codegen currently emitted.
---
 .../test/lit/llvm/ScalableVectors/cmpxchg.ll  |  66 +++++++
 .../ScalableVectors/store_literal_struct.ll   |  38 +++++
 .../vecz/test/lit/llvm/cmpxchg.ll             | 161 ++++++++++++++++++
 3 files changed, 265 insertions(+)
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/cmpxchg.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/store_literal_struct.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/cmpxchg.ll

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/cmpxchg.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/cmpxchg.ll
new file mode 100644
index 0000000000000..7558c290789c4
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/cmpxchg.ll
@@ -0,0 +1,66 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -w 4 -vecz-scalable -vecz-passes=packetizer,verify \
+; RUN:   --pass-remarks-missed=vecz -S < %s 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Note: we can't currently scalably packetize this kernel, due to the struct
+; type.
+; CHECK: Vecz: Could not packetize %old0 = cmpxchg ptr %p, i32 1, i32 2 acquire monotonic, align 4
+define spir_kernel void @test_fn(ptr %p, ptr %q, ptr %r) {
+entry:
+  %call = call i64 @__mux_get_global_id(i32 0)
+
+  %old0 = cmpxchg ptr %p, i32 1, i32 2 acquire monotonic
+  %val0 = extractvalue { i32, i1 } %old0, 0
+  %success0 = extractvalue { i32, i1 } %old0, 1
+
+  %out = getelementptr i32, ptr %q, i64 %call
+  store i32 %val0, ptr %out, align 4
+
+  %outsuccess = getelementptr i8, ptr %r, i64 %call
+  %outbyte = zext i1 %success0 to i8
+  store i8 %outbyte, ptr %outsuccess, align 1
+
+  ; Test a couple of insert/extract patterns
+
+  ; Test inserting a uniform value into a varying literal struct
+  %testinsertconst = insertvalue { i32, i1 } %old0, i1 false, 1
+  %testextract0 = extractvalue { i32, i1 } %testinsertconst, 1
+  %outbyte0 = zext i1 %testextract0 to i8
+  store i8 %outbyte0, ptr %outsuccess, align 1
+
+  ; Test inserting a varying value into a varying literal struct
+  %byte1 = load i8, ptr %outsuccess, align 1
+  %bool1 = trunc i8 %byte1 to i1
+  %testinsertvarying0 = insertvalue { i32, i1 } %old0, i1 %bool1, 1
+  %testextract1 = extractvalue { i32, i1 } %testinsertvarying0, 1
+  %outbyte1 = zext i1 %testextract1 to i8
+  store i8 %outbyte1, ptr %outsuccess, align 1
+
+  ; Test inserting a varying value into a uniform literal struct
+  %testinsertvarying1 = insertvalue { i32, i1 } poison, i1 %bool1, 1
+  %testextract2 = extractvalue { i32, i1 } %testinsertvarying1, 1
+  %outbyte2 = zext i1 %testextract2 to i8
+  store i8 %outbyte2, ptr %outsuccess, align 1
+
+  ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/store_literal_struct.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/store_literal_struct.ll
new file mode 100644
index 0000000000000..ad8599ba51f50
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/store_literal_struct.ll
@@ -0,0 +1,38 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; Check that we do something correct when scalably packetizing struct literals.
+; Right now we fail to packetize, but if we could packetize this we'd have to
+; be careful as storing a struct literal containing scalable vectors is invalid
+; IR.
+; RUN: veczc -w 4 -vecz-scalable -vecz-passes=verify,packetizer,verify \
+; RUN:   --pass-remarks-missed=vecz -S < %s 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; CHECK: Vecz: Could not packetize  %v = load { i32, i32 }, ptr %arrayidx.p, align 4
+define spir_kernel void @test_fn(ptr %p, ptr %q) {
+entry:
+  %idx = call i64 @__mux_get_global_id(i32 0)
+  %arrayidx.p = getelementptr { i32, i32 }, ptr %p, i64 %idx
+  %v = load { i32, i32 }, ptr %arrayidx.p, align 4
+  %arrayidx.q = getelementptr { i32, i32 }, ptr %q, i64 %idx
+  store { i32, i32 } %v, ptr %arrayidx.q, align 4
+  ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/cmpxchg.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/cmpxchg.ll
new file mode 100644
index 0000000000000..d62486409b4e3
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/cmpxchg.ll
@@ -0,0 +1,161 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -w 4 -vecz-passes=packetizer,verify -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; CHECK: define spir_kernel void @__vecz_v4_test_fn(ptr %p, ptr %q, ptr %r)
+define spir_kernel void @test_fn(ptr %p, ptr %q, ptr %r) {
+entry:
+  %call = call i64 @__mux_get_global_id(i32 0)
+
+; Test that this cmpxchg is scalarized. Not ideal, but hey.
+; CHECK: [[A0:%.*]] = cmpxchg ptr %p, i32 1, i32 2 acquire monotonic, align 4
+; CHECK: [[A1:%.*]] = cmpxchg ptr %p, i32 1, i32 2 acquire monotonic, align 4
+; CHECK: [[A2:%.*]] = cmpxchg ptr %p, i32 1, i32 2 acquire monotonic, align 4
+; CHECK: [[A3:%.*]] = cmpxchg ptr %p, i32 1, i32 2 acquire monotonic, align 4
+
+; Then we insert the values into a strange struct
+; CHECK: [[INS0:%.*]] = insertvalue [4 x { i32, i1 }] undef, { i32, i1 } [[A0]], 0
+; CHECK: [[INS1:%.*]] = insertvalue [4 x { i32, i1 }] [[INS0]], { i32, i1 } [[A1]], 1
+; CHECK: [[INS2:%.*]] = insertvalue [4 x { i32, i1 }] [[INS1]], { i32, i1 } [[A2]], 2
+; CHECK: [[INS3:%.*]] = insertvalue [4 x { i32, i1 }] [[INS2]], { i32, i1 } [[A3]], 3
+  %old0 = cmpxchg ptr %p, i32 1, i32 2 acquire monotonic
+
+; To extract from this result, we extract each element individually then insert
+; each into a vector.
+; CHECK: [[ELT0_0_0:%.*]] = extractvalue [4 x { i32, i1 }] [[INS3]], 0, 0
+; CHECK: [[ELT0_0_1:%.*]] = extractvalue [4 x { i32, i1 }] [[INS3]], 1, 0
+; CHECK: [[ELT0_0_2:%.*]] = extractvalue [4 x { i32, i1 }] [[INS3]], 2, 0
+; CHECK: [[ELT0_0_3:%.*]] = extractvalue [4 x { i32, i1 }] [[INS3]], 3, 0
+; CHECK: [[INS0V0_0:%.*]] = insertelement <4 x i32> undef, i32 [[ELT0_0_0]], i32 0
+; CHECK: [[INS0V0_1:%.*]] = insertelement <4 x i32> [[INS0V0_0]], i32 [[ELT0_0_1]], i32 1
+; CHECK: [[INS0V0_2:%.*]] = insertelement <4 x i32> [[INS0V0_1]], i32 [[ELT0_0_2]], i32 2
+; CHECK: [[INS0V0_3:%.*]] = insertelement <4 x i32> [[INS0V0_2]], i32 [[ELT0_0_3]], i32 3
+  %val0 = extractvalue { i32, i1 } %old0, 0
+; Same again here
+; CHECK: [[ELT1_0_0:%.*]] = extractvalue [4 x { i32, i1 }] [[INS3]], 0, 1
+; CHECK: [[ELT1_0_1:%.*]] = extractvalue [4 x { i32, i1 }] [[INS3]], 1, 1
+; CHECK: [[ELT1_0_2:%.*]] = extractvalue [4 x { i32, i1 }] [[INS3]], 2, 1
+; CHECK: [[ELT1_0_3:%.*]] = extractvalue [4 x { i32, i1 }] [[INS3]], 3, 1
+; CHECK: [[INS1V0_0:%.*]] = insertelement <4 x i1> undef, i1 [[ELT1_0_0]], i32 0
+; CHECK: [[INS1V0_1:%.*]] = insertelement <4 x i1> [[INS1V0_0]], i1 [[ELT1_0_1]], i32 1
+; CHECK: [[INS1V0_2:%.*]] = insertelement <4 x i1> [[INS1V0_1]], i1 [[ELT1_0_2]], i32 2
+; CHECK: [[INS1V0_3:%.*]] = insertelement <4 x i1> [[INS1V0_2]], i1 [[ELT1_0_3]], i32 3
+  %success0 = extractvalue { i32, i1 } %old0, 1
+
+  %out = getelementptr i32, ptr %q, i64 %call
+; Stored as a vector
+; CHECK: store <4 x i32> [[INS0V0_3]], ptr
+  store i32 %val0, ptr %out, align 4
+
+; CHECK: [[PTR:%.*]] = getelementptr i8, ptr %r, i64 %call
+  %outsuccess = getelementptr i8, ptr %r, i64 %call
+; CHECK: [[ZEXT0:%.*]] = zext <4 x i1> [[INS1V0_3]] to <4 x i8>
+  %outbyte = zext i1 %success0 to i8
+; Stored as a vector
+; CHECK: store <4 x i8> [[ZEXT0]], ptr [[PTR]], align 1
+  store i8 %outbyte, ptr %outsuccess, align 1
+
+  ; Test a couple of insert/extract patterns
+
+; Test inserting a uniform value into a varying literal struct
+; This is very inefficient
+; CHECK: [[INSS0_0:%.*]] = insertvalue { i32, i1 } [[A0]], i1 false, 1
+; CHECK: [[INSS0_1:%.*]] = insertvalue { i32, i1 } [[A1]], i1 false, 1
+; CHECK: [[INSS0_2:%.*]] = insertvalue { i32, i1 } [[A2]], i1 false, 1
+; CHECK: [[INSS0_3:%.*]] = insertvalue { i32, i1 } [[A3]], i1 false, 1
+; CHECK: [[INSS1_0:%.*]] = insertvalue [4 x { i32, i1 }] undef, { i32, i1 } [[INSS0_0]], 0
+; CHECK: [[INSS1_1:%.*]] = insertvalue [4 x { i32, i1 }] [[INSS1_0]], { i32, i1 } [[INSS0_1]], 1
+; CHECK: [[INSS1_2:%.*]] = insertvalue [4 x { i32, i1 }] [[INSS1_1]], { i32, i1 } [[INSS0_2]], 2
+; CHECK: [[INSS1_3:%.*]] = insertvalue [4 x { i32, i1 }] [[INSS1_2]], { i32, i1 } [[INSS0_3]], 3
+; CHECK: [[EXTS1_0:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS1_3]], 0, 1
+; CHECK: [[EXTS1_1:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS1_3]], 1, 1
+; CHECK: [[EXTS1_2:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS1_3]], 2, 1
+; CHECK: [[EXTS1_3:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS1_3]], 3, 1
+; CHECK: [[INS1V1_0:%.*]] = insertelement <4 x i1> undef, i1 [[EXTS1_0]], i32 0
+; CHECK: [[INS1V1_1:%.*]] = insertelement <4 x i1> [[INS1V1_0]], i1 [[EXTS1_1]], i32 1
+; CHECK: [[INS1V1_2:%.*]] = insertelement <4 x i1> [[INS1V1_1]], i1 [[EXTS1_2]], i32 2
+; CHECK: [[INS1V1_3:%.*]] = insertelement <4 x i1> [[INS1V1_2]], i1 [[EXTS1_3]], i32 3
+; CHECK: [[ZEXT1:%.*]] = zext <4 x i1> [[INS1V1_3]] to <4 x i8>
+; CHECK: store <4 x i8> [[ZEXT1]], ptr [[PTR]], align 1
+  %testinsertconst = insertvalue { i32, i1 } %old0, i1 false, 1
+  %testextract0 = extractvalue { i32, i1 } %testinsertconst, 1
+  %outbyte0 = zext i1 %testextract0 to i8
+  store i8 %outbyte0, ptr %outsuccess, align 1
+
+  ; Test inserting a varying value into a varying literal struct
+; CHECK: [[V4I8_LD:%.*]] = load <4 x i8>, ptr %outsuccess, align 1
+; CHECK: [[TRUNC:%.*]] = trunc <4 x i8> [[V4I8_LD]] to <4 x i1>
+; CHECK: [[EXTV0_0:%.*]] = extractelement <4 x i1> [[TRUNC]], i32 0
+; CHECK: [[EXTV0_1:%.*]] = extractelement <4 x i1> [[TRUNC]], i32 1
+; CHECK: [[EXTV0_2:%.*]] = extractelement <4 x i1> [[TRUNC]], i32 2
+; CHECK: [[EXTV0_3:%.*]] = extractelement <4 x i1> [[TRUNC]], i32 3
+; CHECK: [[INSS2_0:%.*]] = insertvalue { i32, i1 } [[A0]], i1 [[EXTV0_0]], 1
+; CHECK: [[INSS2_1:%.*]] = insertvalue { i32, i1 } [[A1]], i1 [[EXTV0_1]], 1
+; CHECK: [[INSS2_2:%.*]] = insertvalue { i32, i1 } [[A2]], i1 [[EXTV0_2]], 1
+; CHECK: [[INSS2_3:%.*]] = insertvalue { i32, i1 } [[A3]], i1 [[EXTV0_3]], 1
+; CHECK: [[INSS3_0:%.*]] = insertvalue [4 x { i32, i1 }] undef, { i32, i1 } [[INSS2_0]], 0
+; CHECK: [[INSS3_1:%.*]] = insertvalue [4 x { i32, i1 }] [[INSS3_0]], { i32, i1 } [[INSS2_1]], 1
+; CHECK: [[INSS3_2:%.*]] = insertvalue [4 x { i32, i1 }] [[INSS3_1]], { i32, i1 } [[INSS2_2]], 2
+; CHECK: [[INSS3_3:%.*]] = insertvalue [4 x { i32, i1 }] [[INSS3_2]], { i32, i1 } [[INSS2_3]], 3
+; CHECK: [[EXTS3_0:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS3_3]], 0, 1
+; CHECK: [[EXTS3_1:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS3_3]], 1, 1
+; CHECK: [[EXTS3_2:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS3_3]], 2, 1
+; CHECK: [[EXTS3_3:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS3_3]], 3, 1
+; CHECK: [[INS1V2_0:%.*]] = insertelement <4 x i1> undef, i1 [[EXTS3_0]], i32 0
+; CHECK: [[INS1V2_1:%.*]] = insertelement <4 x i1> [[INS1V2_0]], i1 [[EXTS3_1]], i32 1
+; CHECK: [[INS1V2_2:%.*]] = insertelement <4 x i1> [[INS1V2_1]], i1 [[EXTS3_2]], i32 2
+; CHECK: [[INS1V2_3:%.*]] = insertelement <4 x i1> [[INS1V2_2]], i1 [[EXTS3_3]], i32 3
+; CHECK: [[ZEXT2:%.*]] = zext <4 x i1> [[INS1V2_3]] to <4 x i8>
+; CHECK: store <4 x i8> [[ZEXT2]], ptr [[PTR]], align 1
+  %byte1 = load i8, ptr %outsuccess, align 1
+  %bool1 = trunc i8 %byte1 to i1
+  %testinsertvarying0 = insertvalue { i32, i1 } %old0, i1 %bool1, 1
+  %testextract1 = extractvalue { i32, i1 } %testinsertvarying0, 1
+  %outbyte1 = zext i1 %testextract1 to i8
+  store i8 %outbyte1, ptr %outsuccess, align 1
+
+  ; Test inserting a varying value into a uniform literal struct
+; CHECK: [[INSS4_0:%.*]] = insertvalue { i32, i1 } poison, i1 [[EXTV0_0]], 1
+; CHECK: [[INSS4_1:%.*]] = insertvalue { i32, i1 } poison, i1 [[EXTV0_1]], 1
+; CHECK: [[INSS4_2:%.*]] = insertvalue { i32, i1 } poison, i1 [[EXTV0_2]], 1
+; CHECK: [[INSS4_3:%.*]] = insertvalue { i32, i1 } poison, i1 [[EXTV0_3]], 1
+; CHECK: [[INSS5_0:%.*]] = insertvalue [4 x { i32, i1 }] undef, { i32, i1 } [[INSS4_0]], 0
+; CHECK: [[INSS5_1:%.*]] = insertvalue [4 x { i32, i1 }] [[INSS5_0]], { i32, i1 } [[INSS4_1]], 1
+; CHECK: [[INSS5_2:%.*]] = insertvalue [4 x { i32, i1 }] [[INSS5_1]], { i32, i1 } [[INSS4_2]], 2
+; CHECK: [[INSS5_3:%.*]] = insertvalue [4 x { i32, i1 }] [[INSS5_2]], { i32, i1 } [[INSS4_3]], 3
+; CHECK: [[EXTS5_0:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS5_3]], 0, 1
+; CHECK: [[EXTS5_1:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS5_3]], 1, 1
+; CHECK: [[EXTS5_2:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS5_3]], 2, 1
+; CHECK: [[EXTS5_3:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS5_3]], 3, 1
+; CHECK: [[INS2V3_0:%.*]] = insertelement <4 x i1> undef, i1 [[EXTS5_0]], i32 0
+; CHECK: [[INS2V3_1:%.*]] = insertelement <4 x i1> [[INS2V3_0]], i1 [[EXTS5_1]], i32 1
+; CHECK: [[INS2V3_2:%.*]] = insertelement <4 x i1> [[INS2V3_1]], i1 [[EXTS5_2]], i32 2
+; CHECK: [[INS2V3_3:%.*]] = insertelement <4 x i1> [[INS2V3_2]], i1 [[EXTS5_3]], i32 3
+; CHECK: [[ZEXT3:%.*]] = zext <4 x i1> [[INS2V3_3]] to <4 x i8>
+; CHECK: store <4 x i8> [[ZEXT3]], ptr [[PTR]], align 1
+  %testinsertvarying1 = insertvalue { i32, i1 } poison, i1 %bool1, 1
+  %testextract2 = extractvalue { i32, i1 } %testinsertvarying1, 1
+  %outbyte2 = zext i1 %testextract2 to i8
+  store i8 %outbyte2, ptr %outsuccess, align 1
+
+  ret void
+}
+
+declare i64 @__mux_get_global_id(i32)

From 1a9491e8147ee6e06f4bcc19104bf32d91a6147b Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Tue, 19 Dec 2023 11:10:31 +0000
Subject: [PATCH 073/182] [vecz] Add support for masking cmpxchg instructions

This finishes off the support for masked atomic instructions.

The scheme is essentially identical to that of atomic RMW instructions,
except that the instruction returns a literal struct containing the
value and a boolean success value. These must also be packetized for
efficient results.

The packetization of literal struct types - those unnamed structs
returned by cmpxchg - has gone through some refactoring, so now the
vectorized type of `{ i32, i1 }` is (e.g.) `{ <4 x i32>, <4 x i1> }`.
This in practice makes it much more efficient to extract component
vectors from the structs. We can see the effect on codegen in the
associated LIT tests. It also makes it possible to scalably vectorize
cmpxchg instructions.

Note that now even unmasked cmpxchg instructions generate the masked
builtin, albeit with an "all true" mask. This is to help maintain a
uniform internal representation of the packetized literal structure
type.
---
 .../analysis/uniform_value_analysis.cpp       |   2 +
 .../source/include/vectorization_context.h    |  46 ++-
 .../control_flow_conversion_pass.cpp          |  84 ++--
 .../transform/packetization_helpers.cpp       |  17 +-
 .../vecz/source/transform/packetizer.cpp      | 205 ++++++++--
 .../vecz/source/vectorization_context.cpp     | 384 +++++++++++-------
 .../test/lit/llvm/ScalableVectors/cmpxchg.ll  |  37 +-
 .../llvm/VectorPredication/masked_atomics.ll  | 106 +++++
 .../vecz/test/lit/llvm/cmpxchg.ll             | 115 ++----
 .../vecz/test/lit/llvm/diverging_atomic.ll    |  46 ---
 .../vecz/test/lit/llvm/masked_cmpxchg.ll      | 105 +++++
 .../test/lit/llvm/masked_cmpxchg_scalar.ll    |  48 +++
 12 files changed, 845 insertions(+), 350 deletions(-)
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/masked_atomics.ll
 delete mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_atomic.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_cmpxchg.ll
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_cmpxchg_scalar.ll

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp
index 3f4f495c13dde..e2a696aa06b2c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp
@@ -198,6 +198,8 @@ void UniformValueResult::findVectorLeaves(
                Op->isMaskedScatterGatherMemOp())) {
             IsCallLeaf = true;
           }
+        } else if (Ctx.isMaskedAtomicFunction(*CI->getCalledFunction())) {
+          IsCallLeaf = true;
         }
         if (IsCallLeaf) {
           Leaves.push_back(CI);
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_context.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_context.h
index 3d580525c4117..ad140dd609697 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_context.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_context.h
@@ -153,36 +153,59 @@ class VectorizationContext {
   /// @return The masked version of the function
   llvm::Function *getOrCreateMaskedFunction(llvm::CallInst *CI);
 
-  struct MaskedAtomicRMW {
+  /// @brief Represents either an atomicrmw or cmpxchg operation.
+  ///
+  /// Most fields are shared, with the exception of CmpXchgFailureOrdering and
+  /// IsWeak, which are only to be set for cmpxchg, and BinOp, which is only to
+  /// be set to a valid value for atomicrmw.
+  struct MaskedAtomic {
     llvm::Type *PointerTy;
     llvm::Type *ValTy;
+    /// @brief Must be set to BAD_BINOP for cmpxchg instructions
     llvm::AtomicRMWInst::BinOp BinOp;
     llvm::Align Align;
     bool IsVolatile = false;
     llvm::SyncScope::ID SyncScope;
     llvm::AtomicOrdering Ordering;
+    /// @brief Must be set for cmpxchg instructions
+    std::optional<llvm::AtomicOrdering> CmpXchgFailureOrdering = std::nullopt;
+    /// @brief Must only be set for cmpxchg instructions
+    bool IsWeak = false;
     // Vectorization info
     llvm::ElementCount VF;
     bool IsVectorPredicated = false;
+
+    /// @brief Returns true if this MaskedAtomic represents a cmpxchg operation.
+    bool isCmpXchg() const {
+      if (CmpXchgFailureOrdering.has_value()) {
+        // 'binop' only applies to atomicrmw
+        assert(BinOp == llvm::AtomicRMWInst::BAD_BINOP &&
+               "Invalid MaskedAtomic state");
+        return true;
+      }
+      // 'weak' only applies to cmpxchg
+      assert(!IsWeak && "Invalid MaskedAtomic state");
+      return false;
+    }
   };
 
-  /// @brief Check if the given function is a masked version of an atomic RMW
-  /// operation.
+  /// @brief Check if the given function is a masked version of an atomicrmw or
+  /// cmpxchg operation.
   ///
   /// @param[in] F The function to check
-  /// @return A MaskedAtomicRMW instance detailing the atomic operation if the
-  /// function is a masked atomic RMW, or std::nullopt otherwise
-  std::optional<MaskedAtomicRMW> isMaskedAtomicRMWFunction(
+  /// @return A MaskedAtomic instance detailing the atomic operation if the
+  /// function is a masked atomic, or std::nullopt otherwise
+  std::optional<MaskedAtomic> isMaskedAtomicFunction(
       const llvm::Function &F) const;
   /// @brief Get (if it exists already) or create the function representing the
-  /// masked version of an atomic RMW operation.
+  /// masked version of an atomicrmw/cmpxchg operation.
   ///
   /// @param[in] I Atomic to be masked
   /// @param[in] Choices Choices to mangle into the function name
   /// @param[in] VF The vectorization factor of the atomic operation
   /// @return The masked version of the function
-  llvm::Function *getOrCreateMaskedAtomicRMWFunction(
-      MaskedAtomicRMW &I, const VectorizationChoices &Choices,
+  llvm::Function *getOrCreateMaskedAtomicFunction(
+      MaskedAtomic &I, const VectorizationChoices &Choices,
       llvm::ElementCount VF);
 
   /// @brief Create a VectorizationUnit to use to vectorize the given scalar
@@ -296,10 +319,9 @@ class VectorizationContext {
   /// @brief Emit the body for a masked atomic builtin
   ///
   /// @param[in] F The empty (declaration only) function to emit the body in
-  /// @param[in] MA The MaskedAtomicRMW information
+  /// @param[in] MA The MaskedAtomic information
   /// @returns true on success, false otherwise
-  bool emitMaskedAtomicRMWBody(llvm::Function &F,
-                               const MaskedAtomicRMW &MA) const;
+  bool emitMaskedAtomicBody(llvm::Function &F, const MaskedAtomic &MA) const;
 
   /// @brief Helper for non-vectorization tasks.
   TargetInfo &VTI;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
index e16eab41bf069..f7649b5b4e46c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
@@ -214,15 +214,14 @@ class ControlFlowConversionState::Impl : public ControlFlowConversionState {
   /// @return true if it is valid to mask this call, false otherwise
   bool applyMaskToCall(CallInst *CI, Value *mask, DeletionMap &toDelete);
 
-  /// @brief Attempt to apply a mask to an AtomicRMW instruction via a builtin
+  /// @brief Attempt to apply a mask to an atomic instruction via a builtin
   /// call.
   ///
-  /// @param[in] atomicI The atomic instruction to apply the mask to
+  /// @param[in] I The (atomic) instruction to apply the mask to
   /// @param[in] mask The mask to apply to the masked atomic
   /// @param[out] toDelete mapping of deleted unmasked operations
   /// @return true if it is valid to mask this atomic, false otherwise
-  bool applyMaskToAtomicRMW(AtomicRMWInst &atomicI, Value *mask,
-                            DeletionMap &toDelete);
+  bool applyMaskToAtomic(Instruction &I, Value *mask, DeletionMap &toDelete);
 
   /// @brief Linearize a CFG.
   /// @return true if no problem occurred, false otherwise.
@@ -1138,9 +1137,7 @@ Error ControlFlowConversionState::Impl::applyMask(BasicBlock &BB, Value *mask) {
       }
     } else if (I.isAtomic() && !isa<FenceInst>(&I)) {
       // Turn atomics into calls to masked builtins if possible.
-      // FIXME: We don't yet support masked cmpxchg instructions.
-      if (auto *atomicI = dyn_cast<AtomicRMWInst>(&I);
-          !atomicI || !applyMaskToAtomicRMW(*atomicI, mask, toDelete)) {
+      if (!applyMaskToAtomic(I, mask, toDelete)) {
         return makeStringError("Could not apply mask to atomic instruction", I);
       }
     } else if (auto *branch = dyn_cast<BranchInst>(&I)) {
@@ -1372,41 +1369,66 @@ bool ControlFlowConversionState::Impl::applyMaskToCall(CallInst *CI,
   return true;
 }
 
-bool ControlFlowConversionState::Impl::applyMaskToAtomicRMW(
-    AtomicRMWInst &atomicI, Value *mask, DeletionMap &toDelete) {
-  LLVM_DEBUG(dbgs() << "vecz-cf: Now at AtomicRMWInst " << atomicI << "\n");
+bool ControlFlowConversionState::Impl::applyMaskToAtomic(
+    Instruction &I, Value *mask, DeletionMap &toDelete) {
+  LLVM_DEBUG(dbgs() << "vecz-cf: Now at atomic inst " << I << "\n");
 
-  VectorizationContext::MaskedAtomicRMW MA;
-  MA.Align = atomicI.getAlign();
-  MA.BinOp = atomicI.getOperation();
-  MA.IsVectorPredicated = VU.choices().vectorPredication();
-  MA.IsVolatile = atomicI.isVolatile();
-  MA.Ordering = atomicI.getOrdering();
-  MA.SyncScope = atomicI.getSyncScopeID();
+  SmallVector<Value *, 8> maskedFnArgs;
+  VectorizationContext::MaskedAtomic MA;
   MA.VF = ElementCount::getFixed(1);
-  MA.ValTy = atomicI.getType();
-  MA.PointerTy = atomicI.getPointerOperand()->getType();
+  MA.IsVectorPredicated = VU.choices().vectorPredication();
+
+  if (auto *atomicI = dyn_cast<AtomicRMWInst>(&I)) {
+    MA.Align = atomicI->getAlign();
+    MA.BinOp = atomicI->getOperation();
+    MA.IsVolatile = atomicI->isVolatile();
+    MA.Ordering = atomicI->getOrdering();
+    MA.SyncScope = atomicI->getSyncScopeID();
+    MA.ValTy = atomicI->getType();
+    MA.PointerTy = atomicI->getPointerOperand()->getType();
+
+    // Set up the arguments to this function
+    maskedFnArgs = {atomicI->getPointerOperand(), atomicI->getValOperand(),
+                    mask};
+
+  } else if (auto *cmpxchgI = dyn_cast<AtomicCmpXchgInst>(&I)) {
+    MA.Align = cmpxchgI->getAlign();
+    MA.BinOp = AtomicRMWInst::BAD_BINOP;
+    MA.IsWeak = cmpxchgI->isWeak();
+    MA.IsVolatile = cmpxchgI->isVolatile();
+    MA.Ordering = cmpxchgI->getSuccessOrdering();
+    MA.CmpXchgFailureOrdering = cmpxchgI->getFailureOrdering();
+    MA.SyncScope = cmpxchgI->getSyncScopeID();
+    MA.ValTy = cmpxchgI->getCompareOperand()->getType();
+    MA.PointerTy = cmpxchgI->getPointerOperand()->getType();
+
+    // Set up the arguments to this function
+    maskedFnArgs = {cmpxchgI->getPointerOperand(),
+                    cmpxchgI->getCompareOperand(), cmpxchgI->getNewValOperand(),
+                    mask};
+  } else {
+    return false;
+  }
+
   // Create the new function and replace the old one with it
   // Get the masked function
-  Function *newFunction = Ctx.getOrCreateMaskedAtomicRMWFunction(
+  Function *maskedAtomicFn = Ctx.getOrCreateMaskedAtomicFunction(
       MA, VU.choices(), ElementCount::getFixed(1));
-  VECZ_FAIL_IF(!newFunction);
-  SmallVector<Value *, 8> fnArgs = {atomicI.getPointerOperand(),
-                                    atomicI.getValOperand(), mask};
+  VECZ_FAIL_IF(!maskedAtomicFn);
   // We don't have a vector length just yet - pass in one as a dummy.
   if (MA.IsVectorPredicated) {
-    fnArgs.push_back(
-        ConstantInt::get(IntegerType::getInt32Ty(atomicI.getContext()), 1));
+    maskedFnArgs.push_back(
+        ConstantInt::get(IntegerType::getInt32Ty(I.getContext()), 1));
   }
 
-  CallInst *newCI = CallInst::Create(newFunction, fnArgs, "", &atomicI);
-  VECZ_FAIL_IF(!newCI);
+  CallInst *maskedCI = CallInst::Create(maskedAtomicFn, maskedFnArgs, "", &I);
+  VECZ_FAIL_IF(!maskedCI);
 
-  atomicI.replaceAllUsesWith(newCI);
-  toDelete.emplace_back(&atomicI, newCI);
+  I.replaceAllUsesWith(maskedCI);
+  toDelete.emplace_back(&I, maskedCI);
 
-  LLVM_DEBUG(dbgs() << "vecz-cf: Replaced " << atomicI << "\n");
-  LLVM_DEBUG(dbgs() << "          with " << *newCI << "\n");
+  LLVM_DEBUG(dbgs() << "vecz-cf: Replaced " << I << "\n");
+  LLVM_DEBUG(dbgs() << "          with " << *maskedCI << "\n");
 
   return true;
 }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
index 11e954e73bda2..ad08844ad1293 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
@@ -25,6 +25,7 @@
 #include <compiler/utils/group_collective_helpers.h>
 #include <llvm/ADT/Twine.h>
 #include <llvm/Analysis/VectorUtils.h>
+#include <llvm/IR/Constants.h>
 #include <llvm/IR/IRBuilder.h>
 #include <llvm/IR/Instructions.h>
 #include <llvm/IR/Intrinsics.h>
@@ -46,6 +47,18 @@ using namespace vecz;
 namespace {
 inline Type *getWideType(Type *ty, ElementCount factor) {
   if (!ty->isVectorTy()) {
+    // The wide type of a struct literal is the wide type of each of its
+    // elements.
+    if (auto *structTy = dyn_cast<StructType>(ty);
+        structTy && structTy->isLiteral()) {
+      SmallVector<Type *, 4> wideElts(structTy->elements());
+      for (unsigned i = 0, e = wideElts.size(); i != e; i++) {
+        wideElts[i] = getWideType(wideElts[i], factor);
+      }
+      return StructType::get(ty->getContext(), wideElts);
+    } else if (structTy) {
+      VECZ_ERROR("Can't create wide type for structure type");
+    }
     return VectorType::get(ty, factor);
   }
   bool const isScalable = isa<ScalableVectorType>(ty);
@@ -694,7 +707,9 @@ const Packetizer::Result &Packetizer::Result::broadcast(unsigned width) const {
   auto &F = packetizer.F;
   Value *result = nullptr;
   const auto &TI = packetizer.context().targetInfo();
-  if (isa<UndefValue>(scalar)) {
+  if (isa<PoisonValue>(scalar)) {
+    result = PoisonValue::get(getWideType(ty, factor));
+  } else if (isa<UndefValue>(scalar)) {
     result = UndefValue::get(getWideType(ty, factor));
   } else if (ty->isVectorTy() && factor.isScalable()) {
     IRBuilder<> B(buildAfter(scalar, F));
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
index 9b80bca86e255..3191f4d86d36c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -302,14 +302,14 @@ class Packetizer::Impl : public Packetizer {
   ///
   /// @return Packetized instruction.
   ValuePacket packetizeMemOp(MemOp &Op);
-  /// @brief Packetize a masked atomic RMW operation.
+  /// @brief Packetize a masked atomicrmw or cmpxchg operation.
   ///
-  /// @param[in] CI Masked atomic RMW builtin call to packetize.
-  /// @param[in] AtomicInfo Information about the masked atomic RMW.
+  /// @param[in] CI Masked atomic builtin call to packetize.
+  /// @param[in] AtomicInfo Information about the masked atomic.
   ///
   /// @return Packetized instruction.
-  ValuePacket packetizeMaskedAtomicRMW(
-      CallInst &CI, VectorizationContext::MaskedAtomicRMW AtomicInfo);
+  ValuePacket packetizeMaskedAtomic(
+      CallInst &CI, VectorizationContext::MaskedAtomic AtomicInfo);
   /// @brief Packetize a GEP instruction.
   ///
   /// @param[in] GEP Instruction to packetize.
@@ -334,6 +334,12 @@ class Packetizer::Impl : public Packetizer {
   ///
   /// @return Packetized instruction.
   ValuePacket packetizeFreeze(FreezeInst *FreezeI);
+  /// @brief Packetize an atomic cmpxchg instruction.
+  ///
+  /// @param[in] AtomicI Instruction to packetize.
+  ///
+  /// @return Packetized instruction.
+  ValuePacket packetizeAtomicCmpXchg(AtomicCmpXchgInst *AtomicI);
   /// @brief Packetize a unary operator instruction.
   ///
   /// @param[in] UnOp Instruction to packetize.
@@ -402,6 +408,22 @@ class Packetizer::Impl : public Packetizer {
   ///
   /// @return Packetized instruction.
   ValuePacket packetizeExtractElement(ExtractElementInst *ExtractElement);
+  /// @brief Packetize an insert value instruction.
+  ///
+  /// Only packetizes inserts into literal struct types.
+  ///
+  /// @param[in] InsertValue Instruction to packetize.
+  ///
+  /// @return Packetized instruction.
+  ValuePacket packetizeInsertValue(InsertValueInst *InsertValue);
+  /// @brief Packetize an extract value instruction.
+  ///
+  /// Only packetizes extracts from literal struct types.
+  ///
+  /// @param[in] ExtractValue Instruction to packetize.
+  ///
+  /// @return Packetized instruction.
+  ValuePacket packetizeExtractValue(ExtractValueInst *ExtractValue);
   /// @brief Packetize a shuffle vector instruction.
   ///
   /// @param[in] Shuffle Instruction to packetize.
@@ -1157,12 +1179,21 @@ Packetizer::Result Packetizer::Impl::packetizeInstruction(Instruction *Ins) {
     case Instruction::ExtractElement:
       results = packetizeExtractElement(cast<ExtractElementInst>(Ins));
       break;
+    case Instruction::InsertValue:
+      results = packetizeInsertValue(cast<InsertValueInst>(Ins));
+      break;
+    case Instruction::ExtractValue:
+      results = packetizeExtractValue(cast<ExtractValueInst>(Ins));
+      break;
     case Instruction::ShuffleVector:
       results = packetizeShuffleVector(cast<ShuffleVectorInst>(Ins));
       break;
     case Instruction::Freeze:
       results = packetizeFreeze(cast<FreezeInst>(Ins));
       break;
+    case Instruction::AtomicCmpXchg:
+      results = packetizeAtomicCmpXchg(cast<AtomicCmpXchgInst>(Ins));
+      break;
   }
 
   if (auto res = getPacketizationResult(Ins, results, /*update stats*/ true)) {
@@ -2102,8 +2133,8 @@ ValuePacket Packetizer::Impl::packetizeCall(CallInst *CI) {
         return packetizeMemOp(*MaskedOp);
       }
     }
-    if (auto AtomicInfo = Ctx.isMaskedAtomicRMWFunction(*Callee)) {
-      return packetizeMaskedAtomicRMW(*CI, *AtomicInfo);
+    if (auto AtomicInfo = Ctx.isMaskedAtomicFunction(*Callee)) {
+      return packetizeMaskedAtomic(*CI, *AtomicInfo);
     }
   }
 
@@ -2778,16 +2809,18 @@ ValuePacket Packetizer::Impl::packetizeMemOp(MemOp &op) {
   return results;
 }
 
-ValuePacket Packetizer::Impl::packetizeMaskedAtomicRMW(
-    CallInst &CI, VectorizationContext::MaskedAtomicRMW AtomicInfo) {
+ValuePacket Packetizer::Impl::packetizeMaskedAtomic(
+    CallInst &CI, VectorizationContext::MaskedAtomic AtomicInfo) {
   ValuePacket results;
 
-  Value *const ptr = CI.getArgOperand(0);
-  Value *const val = CI.getArgOperand(1);
-  Value *const mask = CI.getArgOperand(2);
+  bool const IsCmpXchg = AtomicInfo.isCmpXchg();
 
-  assert(AtomicInfo.ValTy == val->getType() && "AtomicInfo mismatch");
-  auto const packetWidth = getPacketWidthForType(val->getType());
+  Value *const ptrArg = CI.getArgOperand(0);
+  Value *const valOrCmpArg = CI.getArgOperand(1);
+  Value *const maskArg = CI.getArgOperand(2 + IsCmpXchg);
+
+  assert(AtomicInfo.ValTy == valOrCmpArg->getType() && "AtomicInfo mismatch");
+  auto const packetWidth = getPacketWidthForType(valOrCmpArg->getType());
 
   if (VL && packetWidth != 1) {
     emitVeczRemarkMissed(&F, &CI,
@@ -2795,20 +2828,29 @@ ValuePacket Packetizer::Impl::packetizeMaskedAtomicRMW(
     return {};
   }
 
-  ValuePacket valPacket;
-  Result valResult = packetize(val);
+  ValuePacket valOrCmpPacket;
+  Result valResult = packetize(valOrCmpArg);
   PACK_FAIL_IF(!valResult);
-  valResult.getPacketValues(packetWidth, valPacket);
-  PACK_FAIL_IF(valPacket.empty());
+  valResult.getPacketValues(packetWidth, valOrCmpPacket);
+  PACK_FAIL_IF(valOrCmpPacket.empty());
+
+  ValuePacket newValPacket;
+  if (IsCmpXchg) {
+    Value *const newValArg = CI.getArgOperand(2);
+    Result newValResult = packetize(newValArg);
+    PACK_FAIL_IF(!newValResult);
+    newValResult.getPacketValues(packetWidth, newValPacket);
+    PACK_FAIL_IF(newValPacket.empty());
+  }
 
   ValuePacket ptrPacket;
-  Result ptrResult = packetize(ptr);
+  Result ptrResult = packetize(ptrArg);
   PACK_FAIL_IF(!ptrResult);
   ptrResult.getPacketValues(packetWidth, ptrPacket);
   PACK_FAIL_IF(ptrPacket.empty());
 
   ValuePacket maskPacket;
-  Result maskResult = packetize(mask);
+  Result maskResult = packetize(maskArg);
   PACK_FAIL_IF(!maskResult);
   maskResult.getPacketValues(packetWidth, maskPacket);
   PACK_FAIL_IF(maskPacket.empty());
@@ -2817,16 +2859,20 @@ ValuePacket Packetizer::Impl::packetizeMaskedAtomicRMW(
   IC.deleteInstructionLater(&CI);
 
   for (unsigned i = 0; i != packetWidth; ++i) {
-    auto *const ptrI = ptrPacket[i];
-    auto *const valI = valPacket[i];
+    auto *const ptr = ptrPacket[i];
+    auto *const valOrCmp = valOrCmpPacket[i];
 
-    AtomicInfo.ValTy = valI->getType();
-    AtomicInfo.PointerTy = ptrI->getType();
+    AtomicInfo.ValTy = valOrCmp->getType();
+    AtomicInfo.PointerTy = ptr->getType();
     auto *maskedAtomicF =
-        Ctx.getOrCreateMaskedAtomicRMWFunction(AtomicInfo, Choices, SimdWidth);
+        Ctx.getOrCreateMaskedAtomicFunction(AtomicInfo, Choices, SimdWidth);
     PACK_FAIL_IF(!maskedAtomicF);
 
-    SmallVector<Value *, 4> args = {ptrI, valI, maskPacket[i]};
+    SmallVector<Value *, 4> args = {ptr, valOrCmp};
+    if (IsCmpXchg) {
+      args.push_back(newValPacket[i]);
+    }
+    args.push_back(maskPacket[i]);
     if (AtomicInfo.IsVectorPredicated) {
       assert(VL && "Missing vector length");
       args.push_back(VL);
@@ -2991,6 +3037,49 @@ ValuePacket Packetizer::Impl::packetizeFreeze(FreezeInst *FreezeI) {
   return results;
 }
 
+ValuePacket Packetizer::Impl::packetizeAtomicCmpXchg(
+    AtomicCmpXchgInst *AtomicI) {
+  ValuePacket results;
+
+  // Describe the masked cmpxchg builtin that replaces this instruction.
+  VectorizationContext::MaskedAtomic MA;
+  MA.VF = SimdWidth;
+  MA.IsVectorPredicated = VU.choices().vectorPredication();
+  MA.Align = AtomicI->getAlign();
+  MA.BinOp = AtomicRMWInst::BAD_BINOP;
+  MA.IsWeak = AtomicI->isWeak();
+  MA.IsVolatile = AtomicI->isVolatile();
+  MA.Ordering = AtomicI->getSuccessOrdering();
+  MA.CmpXchgFailureOrdering = AtomicI->getFailureOrdering();
+  MA.SyncScope = AtomicI->getSyncScopeID();
+
+  IRBuilder<> B(AtomicI);
+  // Packetize the operands, bailing out if any of them fails.
+  Value *Ptr = packetize(AtomicI->getPointerOperand()).getAsValue();
+  PACK_FAIL_IF(!Ptr);
+  Value *Cmp = packetize(AtomicI->getCompareOperand()).getAsValue();
+  PACK_FAIL_IF(!Cmp);
+  Value *New = packetize(AtomicI->getNewValOperand()).getAsValue();
+  PACK_FAIL_IF(!New);
+
+  MA.ValTy = Cmp->getType();
+  MA.PointerTy = Ptr->getType();
+
+  // The cmpxchg itself is unmasked, so call the builtin with all lanes on.
+  auto *const TrueMask = createAllTrueMask(B, SimdWidth);
+  SmallVector<Value *, 8> MaskedFnArgs = {Ptr, Cmp, New, TrueMask};
+  // The builtin takes a vector length iff vector predication is enabled.
+  if (MA.IsVectorPredicated) {
+    assert(VL && "Missing vector length");
+    MaskedFnArgs.push_back(VL);
+  }
+  Function *MaskedAtomicFn =
+      Ctx.getOrCreateMaskedAtomicFunction(MA, VU.choices(), SimdWidth);
+  PACK_FAIL_IF(!MaskedAtomicFn);
+  results.push_back(B.CreateCall(MaskedAtomicFn, MaskedFnArgs));
+  return results;
+}
+
 ValuePacket Packetizer::Impl::packetizeUnaryOp(UnaryOperator *UnOp) {
   ValuePacket results;
 
@@ -3716,6 +3805,70 @@ ValuePacket Packetizer::Impl::packetizeExtractElement(
   return results;
 }
 
+ValuePacket Packetizer::Impl::packetizeInsertValue(
+    InsertValueInst *InsertValue) {
+  ValuePacket results;
+
+  Value *const Val = InsertValue->getInsertedValueOperand();
+  Value *const Aggregate = InsertValue->getAggregateOperand();
+
+  // We can only packetize literal struct types
+  if (auto *StructTy = dyn_cast<StructType>(Aggregate->getType());
+      !StructTy || !StructTy->isLiteral()) {
+    return results;
+  }
+
+  Value *PackAggregate = packetizeIfVarying(Aggregate);
+  PACK_FAIL_IF(!PackAggregate);
+  Value *PackVal = packetizeIfVarying(Val);
+  PACK_FAIL_IF(!PackVal);
+
+  bool const IsValVarying = Val != PackVal;
+  bool const IsAggregateVarying = Aggregate != PackAggregate;
+  if (!IsAggregateVarying && !IsValVarying) {
+    // If both were uniform
+    return results;
+  }
+  // If only one operand was varying, packetize the other; this can fail.
+  if (!IsAggregateVarying) {
+    PackAggregate = packetize(Aggregate).getAsValue();
+    PACK_FAIL_IF(!PackAggregate);
+  } else if (!IsValVarying) {
+    PackVal = packetize(Val).getAsValue();
+    PACK_FAIL_IF(!PackVal);
+  }
+
+  IRBuilder<> B(buildAfter(InsertValue, F));
+  results.push_back(
+      B.CreateInsertValue(PackAggregate, PackVal, InsertValue->getIndices()));
+
+  IC.deleteInstructionLater(InsertValue);
+  return results;
+}
+
+ValuePacket Packetizer::Impl::packetizeExtractValue(
+    ExtractValueInst *ExtractValue) {
+  ValuePacket results;
+  Value *const Agg = ExtractValue->getAggregateOperand();
+
+  // Only literal struct aggregates are handled; otherwise return empty.
+  auto *const AggStructTy = dyn_cast<StructType>(Agg->getType());
+  if (!AggStructTy || !AggStructTy->isLiteral()) {
+    return results;
+  }
+
+  Value *const PackedAgg = packetizeIfVarying(Agg);
+  PACK_FAIL_IF(!PackedAgg);
+
+  // Re-emit the extract on the (possibly packetized) aggregate, then queue
+  // the original instruction for deletion.
+  IRBuilder<> Builder(buildAfter(ExtractValue, F));
+  results.push_back(
+      Builder.CreateExtractValue(PackedAgg, ExtractValue->getIndices()));
+  IC.deleteInstructionLater(ExtractValue);
+  return results;
+}
+
 ValuePacket Packetizer::Impl::packetizeShuffleVector(
     ShuffleVectorInst *Shuffle) {
   Value *const srcA = Shuffle->getOperand(0);
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
index 821d62936f576..6880bbfc4de76 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
@@ -374,8 +374,8 @@ Function *VectorizationContext::getOrCreateMaskedFunction(CallInst *CI) {
   return newFunction;
 }
 
-std::optional<VectorizationContext::MaskedAtomicRMW>
-VectorizationContext::isMaskedAtomicRMWFunction(const Function &F) const {
+std::optional<VectorizationContext::MaskedAtomic>
+VectorizationContext::isMaskedAtomicFunction(const Function &F) const {
   auto VFInfo = decodeVectorizedFunctionName(F.getName());
   if (!VFInfo) {
     return std::nullopt;
@@ -383,55 +383,69 @@ VectorizationContext::isMaskedAtomicRMWFunction(const Function &F) const {
   auto [FnNameStr, VF, Choices] = *VFInfo;
 
   llvm::StringRef FnName = FnNameStr;
-  if (!FnName.consume_front("masked_atomicrmw_")) {
+  if (!FnName.consume_front("masked_")) {
     return std::nullopt;
   }
-  VectorizationContext::MaskedAtomicRMW AtomicInfo;
+  bool IsCmpXchg = FnName.consume_front("cmpxchg_");
+  if (!IsCmpXchg && !FnName.consume_front("atomicrmw_")) {
+    return std::nullopt;
+  }
+  VectorizationContext::MaskedAtomic AtomicInfo;
 
   AtomicInfo.VF = VF;
   AtomicInfo.IsVectorPredicated = Choices.vectorPredication();
 
+  if (IsCmpXchg) {
+    AtomicInfo.IsWeak = FnName.consume_front("weak_");
+  }
   AtomicInfo.IsVolatile = FnName.consume_front("volatile_");
 
-  if (FnName.consume_front("xchg")) {
-    AtomicInfo.BinOp = AtomicRMWInst::BinOp::Xchg;
-  } else if (FnName.consume_front("add")) {
-    AtomicInfo.BinOp = AtomicRMWInst::BinOp::Add;
-  } else if (FnName.consume_front("sub")) {
-    AtomicInfo.BinOp = AtomicRMWInst::BinOp::Sub;
-  } else if (FnName.consume_front("and")) {
-    AtomicInfo.BinOp = AtomicRMWInst::BinOp::And;
-  } else if (FnName.consume_front("nand")) {
-    AtomicInfo.BinOp = AtomicRMWInst::BinOp::Nand;
-  } else if (FnName.consume_front("or")) {
-    AtomicInfo.BinOp = AtomicRMWInst::BinOp::Or;
-  } else if (FnName.consume_front("xor")) {
-    AtomicInfo.BinOp = AtomicRMWInst::BinOp::Xor;
-  } else if (FnName.consume_front("max")) {
-    AtomicInfo.BinOp = AtomicRMWInst::BinOp::Max;
-  } else if (FnName.consume_front("min")) {
-    AtomicInfo.BinOp = AtomicRMWInst::BinOp::Min;
-  } else if (FnName.consume_front("umax")) {
-    AtomicInfo.BinOp = AtomicRMWInst::BinOp::UMax;
-  } else if (FnName.consume_front("umin")) {
-    AtomicInfo.BinOp = AtomicRMWInst::BinOp::UMin;
-  } else if (FnName.consume_front("fadd")) {
-    AtomicInfo.BinOp = AtomicRMWInst::BinOp::FAdd;
-  } else if (FnName.consume_front("fsub")) {
-    AtomicInfo.BinOp = AtomicRMWInst::BinOp::FSub;
-  } else if (FnName.consume_front("fmax")) {
-    AtomicInfo.BinOp = AtomicRMWInst::BinOp::FMax;
-  } else if (FnName.consume_front("fmin")) {
-    AtomicInfo.BinOp = AtomicRMWInst::BinOp::FMin;
-  } else if (FnName.consume_front("uincwrap")) {
-    AtomicInfo.BinOp = AtomicRMWInst::BinOp::UIncWrap;
-  } else if (FnName.consume_front("udecwrap")) {
-    AtomicInfo.BinOp = AtomicRMWInst::BinOp::UDecWrap;
+  if (IsCmpXchg) {
+    AtomicInfo.BinOp = AtomicRMWInst::BinOp::BAD_BINOP;
   } else {
-    return std::nullopt;
+    if (FnName.consume_front("xchg")) {
+      AtomicInfo.BinOp = AtomicRMWInst::BinOp::Xchg;
+    } else if (FnName.consume_front("add")) {
+      AtomicInfo.BinOp = AtomicRMWInst::BinOp::Add;
+    } else if (FnName.consume_front("sub")) {
+      AtomicInfo.BinOp = AtomicRMWInst::BinOp::Sub;
+    } else if (FnName.consume_front("and")) {
+      AtomicInfo.BinOp = AtomicRMWInst::BinOp::And;
+    } else if (FnName.consume_front("nand")) {
+      AtomicInfo.BinOp = AtomicRMWInst::BinOp::Nand;
+    } else if (FnName.consume_front("or")) {
+      AtomicInfo.BinOp = AtomicRMWInst::BinOp::Or;
+    } else if (FnName.consume_front("xor")) {
+      AtomicInfo.BinOp = AtomicRMWInst::BinOp::Xor;
+    } else if (FnName.consume_front("max")) {
+      AtomicInfo.BinOp = AtomicRMWInst::BinOp::Max;
+    } else if (FnName.consume_front("min")) {
+      AtomicInfo.BinOp = AtomicRMWInst::BinOp::Min;
+    } else if (FnName.consume_front("umax")) {
+      AtomicInfo.BinOp = AtomicRMWInst::BinOp::UMax;
+    } else if (FnName.consume_front("umin")) {
+      AtomicInfo.BinOp = AtomicRMWInst::BinOp::UMin;
+    } else if (FnName.consume_front("fadd")) {
+      AtomicInfo.BinOp = AtomicRMWInst::BinOp::FAdd;
+    } else if (FnName.consume_front("fsub")) {
+      AtomicInfo.BinOp = AtomicRMWInst::BinOp::FSub;
+    } else if (FnName.consume_front("fmax")) {
+      AtomicInfo.BinOp = AtomicRMWInst::BinOp::FMax;
+    } else if (FnName.consume_front("fmin")) {
+      AtomicInfo.BinOp = AtomicRMWInst::BinOp::FMin;
+    } else if (FnName.consume_front("uincwrap")) {
+      AtomicInfo.BinOp = AtomicRMWInst::BinOp::UIncWrap;
+    } else if (FnName.consume_front("udecwrap")) {
+      AtomicInfo.BinOp = AtomicRMWInst::BinOp::UDecWrap;
+    } else {
+      return std::nullopt;
+    }
+    if (!FnName.consume_front("_")) {
+      return std::nullopt;
+    }
   }
 
-  if (!FnName.consume_front("_align")) {
+  if (!FnName.consume_front("align")) {
     return std::nullopt;
   }
 
@@ -446,26 +460,38 @@ VectorizationContext::isMaskedAtomicRMWFunction(const Function &F) const {
     return std::nullopt;
   }
 
-  if (FnName.consume_front("acquire")) {
-    AtomicInfo.Ordering = AtomicOrdering::Acquire;
-  } else if (FnName.consume_front("acqrel")) {
-    AtomicInfo.Ordering = AtomicOrdering::AcquireRelease;
-  } else if (FnName.consume_front("monotonic")) {
-    AtomicInfo.Ordering = AtomicOrdering::Monotonic;
-  } else if (FnName.consume_front("notatomic")) {
-    AtomicInfo.Ordering = AtomicOrdering::NotAtomic;
-  } else if (FnName.consume_front("release")) {
-    AtomicInfo.Ordering = AtomicOrdering::Release;
-  } else if (FnName.consume_front("seqcst")) {
-    AtomicInfo.Ordering = AtomicOrdering::SequentiallyConsistent;
-  } else if (FnName.consume_front("unordered")) {
-    AtomicInfo.Ordering = AtomicOrdering::Unordered;
+  auto demangleOrdering = [&FnName]() -> std::optional<AtomicOrdering> {
+    if (FnName.consume_front("acquire_")) {
+      return AtomicOrdering::Acquire;
+    } else if (FnName.consume_front("acqrel_")) {
+      return AtomicOrdering::AcquireRelease;
+    } else if (FnName.consume_front("monotonic_")) {
+      return AtomicOrdering::Monotonic;
+    } else if (FnName.consume_front("notatomic_")) {
+      return AtomicOrdering::NotAtomic;
+    } else if (FnName.consume_front("release_")) {
+      return AtomicOrdering::Release;
+    } else if (FnName.consume_front("seqcst_")) {
+      return AtomicOrdering::SequentiallyConsistent;
+    } else if (FnName.consume_front("unordered_")) {
+      return AtomicOrdering::Unordered;
+    } else {
+      return std::nullopt;
+    }
+  };
+
+  if (auto Ordering = demangleOrdering()) {
+    AtomicInfo.Ordering = *Ordering;
   } else {
     return std::nullopt;
   }
 
-  if (!FnName.consume_front("_")) {
-    return std::nullopt;
+  if (IsCmpXchg) {
+    if (auto Ordering = demangleOrdering()) {
+      AtomicInfo.CmpXchgFailureOrdering = *Ordering;
+    } else {
+      return std::nullopt;
+    }
   }
 
   unsigned SyncScopeID = 0;
@@ -489,91 +515,114 @@ VectorizationContext::isMaskedAtomicRMWFunction(const Function &F) const {
   return AtomicInfo;
 }
 
-Function *VectorizationContext::getOrCreateMaskedAtomicRMWFunction(
-    MaskedAtomicRMW &I, const VectorizationChoices &Choices, ElementCount VF) {
+Function *VectorizationContext::getOrCreateMaskedAtomicFunction(
+    MaskedAtomic &I, const VectorizationChoices &Choices, ElementCount VF) {
+  bool const isCmpXchg = I.isCmpXchg();
   LLVMContext &ctx = I.ValTy->getContext();
 
   SmallVector<Type *, 8> argTys;
 
   argTys.push_back(I.PointerTy);
   argTys.push_back(I.ValTy);
+  if (isCmpXchg) {
+    argTys.push_back(I.ValTy);
+  }
   // Add one extra argument for the mask, which is always the same length
   // (scalar or vector) as the value type.
   auto *i1Ty = Type::getInt1Ty(ctx);
-  argTys.push_back(
+  auto *maskTy =
       !I.ValTy->isVectorTy()
           ? dyn_cast<Type>(i1Ty)
-          : VectorType::get(i1Ty,
-                            cast<VectorType>(I.ValTy)->getElementCount()));
+          : VectorType::get(i1Ty, cast<VectorType>(I.ValTy)->getElementCount());
+  argTys.push_back(maskTy);
   if (Choices.vectorPredication()) {
     argTys.push_back(Type::getInt32Ty(ctx));
   }
 
   std::string maskedFnName;
   raw_string_ostream O(maskedFnName);
-  O << "masked_atomicrmw_";
+  O << (isCmpXchg ? "masked_cmpxchg_" : "masked_atomicrmw_");
+
+  if (I.IsWeak) {
+    assert(isCmpXchg && "Bad MaskedAtomic state");
+    O << "weak_";
+  }
 
   if (I.IsVolatile) {
     O << "volatile_";
   }
 
+  if (!isCmpXchg) {
 #define BINOP_CASE(BINOP, STR) \
   case AtomicRMWInst::BINOP:   \
     O << (STR);                \
     break
 
-  switch (I.BinOp) {
-    BINOP_CASE(Xchg, "xchg");
-    BINOP_CASE(Add, "add");
-    BINOP_CASE(Sub, "sub");
-    BINOP_CASE(And, "and");
-    BINOP_CASE(Nand, "nand");
-    BINOP_CASE(Or, "or");
-    BINOP_CASE(Xor, "xor");
-    BINOP_CASE(Max, "max");
-    BINOP_CASE(Min, "min");
-    BINOP_CASE(UMax, "umax");
-    BINOP_CASE(UMin, "umin");
-    BINOP_CASE(FAdd, "fadd");
-    BINOP_CASE(FSub, "fsub");
-    BINOP_CASE(FMax, "fmax");
-    BINOP_CASE(FMin, "fmin");
-    BINOP_CASE(UIncWrap, "uincwrap");
-    BINOP_CASE(UDecWrap, "udecwrap");
-    case llvm::AtomicRMWInst::BAD_BINOP:
-      return nullptr;
-  }
+    switch (I.BinOp) {
+      BINOP_CASE(Xchg, "xchg");
+      BINOP_CASE(Add, "add");
+      BINOP_CASE(Sub, "sub");
+      BINOP_CASE(And, "and");
+      BINOP_CASE(Nand, "nand");
+      BINOP_CASE(Or, "or");
+      BINOP_CASE(Xor, "xor");
+      BINOP_CASE(Max, "max");
+      BINOP_CASE(Min, "min");
+      BINOP_CASE(UMax, "umax");
+      BINOP_CASE(UMin, "umin");
+      BINOP_CASE(FAdd, "fadd");
+      BINOP_CASE(FSub, "fsub");
+      BINOP_CASE(FMax, "fmax");
+      BINOP_CASE(FMin, "fmin");
+      BINOP_CASE(UIncWrap, "uincwrap");
+      BINOP_CASE(UDecWrap, "udecwrap");
+      case llvm::AtomicRMWInst::BAD_BINOP:
+        return nullptr;
+    }
 
 #undef BINOP_CASE
+    O << "_";
+  }
+
+  O << "align" << I.Align.value() << "_";
 
-  O << "_align" << I.Align.value() << "_";
   // Mangle ordering
-  switch (I.Ordering) {
-    default:
-      O << static_cast<unsigned>(I.Ordering);
-      break;
-    case AtomicOrdering::Acquire:
-      O << "acquire";
-      break;
-    case AtomicOrdering::AcquireRelease:
-      O << "acqrel";
-      break;
-    case AtomicOrdering::Monotonic:
-      O << "monotonic";
-      break;
-    case AtomicOrdering::NotAtomic:
-      O << "notatomic";
-      break;
-    case AtomicOrdering::Release:
-      O << "release";
-      break;
-    case AtomicOrdering::SequentiallyConsistent:
-      O << "seqcst";
-      break;
-    case AtomicOrdering::Unordered:
-      O << "unordered";
-      break;
+  auto mangleOrdering = [&O](AtomicOrdering Ordering) {
+    switch (Ordering) {
+      default:
+        O << static_cast<unsigned>(Ordering);
+        break;
+      case AtomicOrdering::Acquire:
+        O << "acquire";
+        break;
+      case AtomicOrdering::AcquireRelease:
+        O << "acqrel";
+        break;
+      case AtomicOrdering::Monotonic:
+        O << "monotonic";
+        break;
+      case AtomicOrdering::NotAtomic:
+        O << "notatomic";
+        break;
+      case AtomicOrdering::Release:
+        O << "release";
+        break;
+      case AtomicOrdering::SequentiallyConsistent:
+        O << "seqcst";
+        break;
+      case AtomicOrdering::Unordered:
+        O << "unordered";
+        break;
+    }
+  };
+
+  mangleOrdering(I.Ordering);
+  // Failure Ordering
+  if (I.CmpXchgFailureOrdering) {
+    O << "_";
+    mangleOrdering(*I.CmpXchgFailureOrdering);
   }
+
   // Syncscope
   O << "_" << static_cast<unsigned>(I.SyncScope) << "_";
 
@@ -588,9 +637,11 @@ Function *VectorizationContext::getOrCreateMaskedAtomicRMWFunction(
   maskedFnName =
       getVectorizedFunctionName(maskedFnName, VF, Choices, /*IsBuiltin=*/true);
 
+  Type *maskedFnRetTy = isCmpXchg ? StructType::get(I.ValTy, maskTy) : I.ValTy;
+
   // Create the function type
   FunctionType *maskedFnTy =
-      FunctionType::get(I.ValTy, argTys, /*isVarArg=*/false);
+      FunctionType::get(maskedFnRetTy, argTys, /*isVarArg=*/false);
 
   return getOrCreateInternalBuiltin(maskedFnName, maskedFnTy);
 }
@@ -687,8 +738,8 @@ bool VectorizationContext::defineInternalBuiltin(Function *F) {
     return emitSubgroupScanBody(*F, isInclusive, opKind, isVP);
   }
 
-  if (auto AtomicInfo = isMaskedAtomicRMWFunction(*F)) {
-    return emitMaskedAtomicRMWBody(*F, *AtomicInfo);
+  if (auto AtomicInfo = isMaskedAtomicFunction(*F)) {
+    return emitMaskedAtomicBody(*F, *AtomicInfo);
   }
 
   return false;
@@ -1008,40 +1059,54 @@ bool VectorizationContext::emitSubgroupScanBody(Function &F, bool IsInclusive,
   return true;
 }
 
-bool VectorizationContext::emitMaskedAtomicRMWBody(
-    Function &F, const VectorizationContext::MaskedAtomicRMW &MA) const {
+bool VectorizationContext::emitMaskedAtomicBody(
+    Function &F, const VectorizationContext::MaskedAtomic &MA) const {
   LLVMContext &Ctx = F.getContext();
+  bool IsCmpXchg = MA.isCmpXchg();
 
   auto *const EntryBB = BasicBlock::Create(Ctx, "entry", &F);
 
+  IRBuilder<> B(EntryBB);
+
+  BasicBlock *LoopEntryBB = EntryBB;
+  if (MA.IsVectorPredicated) {
+    auto *const VL = F.getArg(3 + IsCmpXchg);
+    // Early exit if the vector length is zero. We're going to unconditionally
+    // jump into the loop after this.
+    auto *const EarlyExitBB = BasicBlock::Create(Ctx, "earlyexit", &F);
+    auto *const CmpZero =
+        B.CreateICmpEQ(VL, ConstantInt::get(VL->getType(), 0));
+
+    LoopEntryBB = BasicBlock::Create(Ctx, "loopentry", &F);
+
+    B.CreateCondBr(CmpZero, EarlyExitBB, LoopEntryBB);
+
+    B.SetInsertPoint(EarlyExitBB);
+    B.CreateRet(PoisonValue::get(F.getReturnType()));
+  }
+
+  B.SetInsertPoint(LoopEntryBB);
+
   auto *const ExitBB = BasicBlock::Create(Ctx, "exit", &F);
 
   auto *const PtrArg = F.getArg(0);
   auto *const ValArg = F.getArg(1);
-  Value *MaskArg = F.getArg(2);
+  Value *MaskArg = F.getArg(2 + IsCmpXchg);
 
   const bool IsVector = ValArg->getType()->isVectorTy();
 
-  IRBuilder<> B(EntryBB);
   Value *const IdxStart = B.getInt32(0);
   ConstantInt *const KnownMin = B.getInt32(MA.VF.getKnownMinValue());
-  Value *IdxEnd = !MA.VF.isScalable() ? KnownMin : B.CreateVScale(KnownMin);
-
-  // For vector-predicated masked atomics, we have to merge the incoming mask
-  // with a mask corresponding to the number of elements left active by the
-  // runtime vector length.
-  if (MA.IsVectorPredicated) {
-    auto *const VL = F.getArg(3);
-    auto *const IndexTy = VectorType::get(VL->getType(), MA.VF);
-    auto *const step = B.CreateStepVector(IndexTy);
-    auto *const VLMask = B.CreateICmpULT(step, B.CreateVectorSplat(MA.VF, VL));
-    MaskArg = B.CreateAnd(MaskArg, VLMask);
-  }
+  Value *IdxEnd =
+      MA.IsVectorPredicated
+          ? F.getArg(3 + IsCmpXchg)
+          : (!MA.VF.isScalable() ? KnownMin : B.CreateVScale(KnownMin));
 
   Value *RetVal = nullptr;
+  Value *RetSuccessVal = nullptr;
 
   auto CreateLoopBody = [&MA, &F, &ExitBB, PtrArg, ValArg, MaskArg, &RetVal,
-                         IsVector](
+                         &RetSuccessVal, IsVector, IsCmpXchg](
                             BasicBlock *BB, Value *Idx, ArrayRef<Value *> IVs,
                             MutableArrayRef<Value *> IVsNext) -> BasicBlock * {
     IRBuilder<> IRB(BB);
@@ -1066,14 +1131,39 @@ bool VectorizationContext::emitMaskedAtomicRMWBody(
         Ptr = IRB.CreateExtractElement(PtrArg, Idx, "ptr");
         Val = IRB.CreateExtractElement(ValArg, Idx, "val");
       }
-      auto *const AtomicRMW = IRB.CreateAtomicRMW(MA.BinOp, Ptr, Val, MA.Align,
-                                                  MA.Ordering, MA.SyncScope);
-      AtomicRMW->setVolatile(MA.IsVolatile);
 
-      if (IsVector) {
-        RetVal = IRB.CreateInsertElement(IVs[0], AtomicRMW, Idx, "retvec");
+      if (IsCmpXchg) {
+        Value *NewValArg = F.getArg(2);
+        Value *NewVal = NewValArg;
+        if (IsVector) {
+          NewVal = IRB.CreateExtractElement(NewValArg, Idx, "newval");
+        }
+        auto *const CmpXchg =
+            IRB.CreateAtomicCmpXchg(Ptr, Val, NewVal, MA.Align, MA.Ordering,
+                                    *MA.CmpXchgFailureOrdering, MA.SyncScope);
+        CmpXchg->setWeak(MA.IsWeak);
+        CmpXchg->setVolatile(MA.IsVolatile);
+
+        if (IsVector) {
+          RetVal = IRB.CreateInsertElement(
+              IVs[0], IRB.CreateExtractValue(CmpXchg, 0), Idx, "retvec");
+          RetSuccessVal = IRB.CreateInsertElement(
+              IVs[1], IRB.CreateExtractValue(CmpXchg, 1), Idx, "retsuccess");
+        } else {
+          RetVal = IRB.CreateExtractValue(CmpXchg, 0);
+          RetSuccessVal = IRB.CreateExtractValue(CmpXchg, 1);
+        }
+
       } else {
-        RetVal = AtomicRMW;
+        auto *const AtomicRMW = IRB.CreateAtomicRMW(
+            MA.BinOp, Ptr, Val, MA.Align, MA.Ordering, MA.SyncScope);
+        AtomicRMW->setVolatile(MA.IsVolatile);
+
+        if (IsVector) {
+          RetVal = IRB.CreateInsertElement(IVs[0], AtomicRMW, Idx, "retvec");
+        } else {
+          RetVal = AtomicRMW;
+        }
       }
 
       IRB.CreateBr(ElseBB);
@@ -1089,6 +1179,15 @@ bool VectorizationContext::emitMaskedAtomicRMWBody(
     }
     IVsNext[0] = RetVal;
 
+    if (IsCmpXchg) {
+      auto *MergePhi =
+          IRB.CreatePHI(RetSuccessVal->getType(), 2, "mergesuccess");
+      MergePhi->addIncoming(IVs[1], BB);
+      MergePhi->addIncoming(RetSuccessVal, IfBB);
+      RetSuccessVal = MergePhi;
+      IVsNext[1] = RetSuccessVal;
+    }
+
     // Move the exit block right to the end of the function.
     ExitBB->moveAfter(ElseBB);
 
@@ -1100,11 +1199,22 @@ bool VectorizationContext::emitMaskedAtomicRMWBody(
     Opts.IVs.push_back(PoisonValue::get(MA.ValTy));
     Opts.loopIVNames.push_back("retvec.prev");
   }
-  compiler::utils::createLoop(EntryBB, ExitBB, IdxStart, IdxEnd, Opts,
+  if (IsCmpXchg) {
+    Opts.IVs.push_back(PoisonValue::get(MaskArg->getType()));
+    Opts.loopIVNames.push_back("retsuccess.prev");
+  }
+  compiler::utils::createLoop(LoopEntryBB, ExitBB, IdxStart, IdxEnd, Opts,
                               CreateLoopBody);
 
   B.SetInsertPoint(ExitBB);
-  B.CreateRet(RetVal);
+  if (IsCmpXchg) {
+    Value *RetStruct = PoisonValue::get(F.getReturnType());
+    RetStruct = B.CreateInsertValue(RetStruct, RetVal, 0);
+    RetStruct = B.CreateInsertValue(RetStruct, RetSuccessVal, 1);
+    B.CreateRet(RetStruct);
+  } else {
+    B.CreateRet(RetVal);
+  }
   return true;
 }
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/cmpxchg.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/cmpxchg.ll
index 7558c290789c4..85b4c865d0e07 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/cmpxchg.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/cmpxchg.ll
@@ -14,39 +14,62 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: veczc -w 4 -vecz-scalable -vecz-passes=packetizer,verify \
-; RUN:   --pass-remarks-missed=vecz -S < %s 2>&1 | FileCheck %s
+; RUN: veczc -w 4 -vecz-scalable -vecz-passes=packetizer,verify -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-; Note: we can't currently scalably packetize this kernel, due to the struct
-; type.
-; CHECK: Vecz: Could not packetize %old0 = cmpxchg ptr %p, i32 1, i32 2 acquire monotonic, align 4
+; CHECK: define spir_kernel void @__vecz_nxv4_test_fn(ptr %p, ptr %q, ptr %r)
 define spir_kernel void @test_fn(ptr %p, ptr %q, ptr %r) {
 entry:
+; CHECK: [[SPLAT_PTR_INS:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr %p, i64 0
+; CHECK: [[SPLAT_PTR:%.*]] = shufflevector <vscale x 4 x ptr> [[SPLAT_PTR_INS]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
   %call = call i64 @__mux_get_global_id(i32 0)
 
+; Test that this cmpxchg is packetized by generating a call to an all-true masked version.
+; CHECK: [[A0:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i1> } @__vecz_b_nxv4_masked_cmpxchg_align4_acquire_monotonic_1_u9nxv4u3ptru5nxv4ju5nxv4ju5nxv4b(
+; CHECK-SAME: <vscale x 4 x ptr> [[SPLAT_PTR]],
+; CHECK-SAME: <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-SAME: <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-SAME: <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
   %old0 = cmpxchg ptr %p, i32 1, i32 2 acquire monotonic
+; CHECK: [[EXT0:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i1> } [[A0]], 0
   %val0 = extractvalue { i32, i1 } %old0, 0
+; CHECK: [[EXT1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i1> } [[A0]], 1
   %success0 = extractvalue { i32, i1 } %old0, 1
 
   %out = getelementptr i32, ptr %q, i64 %call
+; Stored as a vector
+; CHECK: store <vscale x 4 x i32> [[EXT0]], ptr
   store i32 %val0, ptr %out, align 4
 
+; CHECK: [[PTR:%.*]] = getelementptr i8, ptr %r, i64 %call
   %outsuccess = getelementptr i8, ptr %r, i64 %call
+; CHECK: [[ZEXT0:%.*]] = zext <vscale x 4 x i1> [[EXT1]] to <vscale x 4 x i8>
   %outbyte = zext i1 %success0 to i8
+; Stored as a vector
+; CHECK: store <vscale x 4 x i8> [[ZEXT0]], ptr [[PTR]], align 1
   store i8 %outbyte, ptr %outsuccess, align 1
 
   ; Test a couple of insert/extract patterns
 
   ; Test inserting a uniform value into a varying literal struct
+; CHECK: [[INS0:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i1> } [[A0]], <vscale x 4 x i1> zeroinitializer, 1
+; CHECK: [[EXT2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i1> } [[INS0]], 1
+; CHECK: [[ZEXT1:%.*]] = zext <vscale x 4 x i1> [[EXT2]] to <vscale x 4 x i8>
+; CHECK: store <vscale x 4 x i8> [[ZEXT1]], ptr [[PTR]], align 1
   %testinsertconst = insertvalue { i32, i1 } %old0, i1 false, 1
   %testextract0 = extractvalue { i32, i1 } %testinsertconst, 1
   %outbyte0 = zext i1 %testextract0 to i8
   store i8 %outbyte0, ptr %outsuccess, align 1
 
   ; Test inserting a varying value into a varying literal struct
+; CHECK: [[LD:%.*]] = load <vscale x 4 x i8>, ptr
+; CHECK: [[VBOOL:%.*]] = trunc <vscale x 4 x i8> [[LD]] to <vscale x 4 x i1>
+; CHECK: [[INS1:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i1> } [[A0]], <vscale x 4 x i1> [[VBOOL]], 1
+; CHECK: [[EXT3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i1> } [[INS1]], 1
+; CHECK: [[ZEXT2:%.*]] = zext <vscale x 4 x i1> [[EXT3]] to <vscale x 4 x i8>
+; CHECK: store <vscale x 4 x i8> [[ZEXT2]], ptr [[PTR]], align 1
   %byte1 = load i8, ptr %outsuccess, align 1
   %bool1 = trunc i8 %byte1 to i1
   %testinsertvarying0 = insertvalue { i32, i1 } %old0, i1 %bool1, 1
@@ -55,6 +78,10 @@ entry:
   store i8 %outbyte1, ptr %outsuccess, align 1
 
   ; Test inserting a varying value into a uniform literal struct
+; CHECK: [[INS2:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i1> } poison, <vscale x 4 x i1> [[VBOOL]], 1
+; CHECK: [[EXT4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i1> } [[INS2]], 1
+; CHECK: [[ZEXT3:%.*]] = zext <vscale x 4 x i1> [[EXT4]] to <vscale x 4 x i8>
+; CHECK: store <vscale x 4 x i8> [[ZEXT3]], ptr [[PTR]], align 1
   %testinsertvarying1 = insertvalue { i32, i1 } poison, i1 %bool1, 1
   %testextract2 = extractvalue { i32, i1 } %testinsertvarying1, 1
   %outbyte2 = zext i1 %testextract2 to i8
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/masked_atomics.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/masked_atomics.ll
new file mode 100644
index 0000000000000..35a478caaaee7
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/masked_atomics.ll
@@ -0,0 +1,106 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -vecz-passes=define-builtins,verify -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test_fn(<vscale x 1 x ptr> %p) {
+  %ret0 = call <vscale x 1 x i32> @__vecz_b_nxv1_vp_masked_atomicrmw_add_align4_acquire_1_u9nxv1u3ptru5nxv1ju5nxv1b(<vscale x 1 x ptr> %p, <vscale x 1 x i32> zeroinitializer, <vscale x 1 x i1> zeroinitializer, i32 4)
+  %ret1 = call { <vscale x 1 x i32>, <vscale x 1 x i1> } @__vecz_b_nxv1_vp_masked_cmpxchg_align4_acquire_acquire_1_u9nxv1u3ptru5nxv1ju5nxv1ju5nxv1b(<vscale x 1 x ptr> %p, <vscale x 1 x i32> zeroinitializer, <vscale x 1 x i32> zeroinitializer, <vscale x 1 x i1> zeroinitializer, i32 4)
+  ret void
+}
+
+declare <vscale x 1 x i32> @__vecz_b_nxv1_vp_masked_atomicrmw_add_align4_acquire_1_u9nxv1u3ptru5nxv1ju5nxv1b(<vscale x 1 x ptr> %p, <vscale x 1 x i32> %val, <vscale x 1 x i1> %mask, i32 %vl)
+
+declare { <vscale x 1 x i32>, <vscale x 1 x i1> } @__vecz_b_nxv1_vp_masked_cmpxchg_align4_acquire_acquire_1_u9nxv1u3ptru5nxv1ju5nxv1ju5nxv1b(<vscale x 1 x ptr> %p, <vscale x 1 x i32> %cmp, <vscale x 1 x i32> %newval, <vscale x 1 x i1> %mask, i32 %vl)
+
+; CHECK: define <vscale x 1 x i32> @__vecz_b_nxv1_vp_masked_atomicrmw_add_align4_acquire_1_u9nxv1u3ptru5nxv1ju5nxv1b(<vscale x 1 x ptr> %p, <vscale x 1 x i32> %val, <vscale x 1 x i1> %mask, i32 %vl) {
+; CHECK: entry:
+; CHECK: [[VLZERO:%.*]] = icmp eq i32 %vl, 0
+; CHECK: br i1 [[VLZERO]], label %earlyexit, label %loopentry
+
+; CHECK: earlyexit:
+; CHECK: ret <vscale x 1 x i32> poison
+
+; CHECK: loopentry:
+; CHECK: br label %loopIR
+
+; CHECK: loopIR:
+; CHECK: [[IDX:%.*]] = phi i32 [ 0, %loopentry ], [ [[INC:%.*]], %if.else ]
+; CHECK: [[RET_PREV:%.*]] = phi <vscale x 1 x i32> [ poison, %loopentry ], [ [[MERGE:%.*]], %if.else ]
+; CHECK: [[MASKELT:%.*]] = extractelement <vscale x 1 x i1> %mask, i32 [[IDX]]
+; CHECK: [[MASKCMP:%.*]] = icmp ne i1 [[MASKELT]], false
+; CHECK: br i1 [[MASKCMP]], label %if.then, label %if.else
+
+; CHECK: if.then:
+; CHECK: [[PTR:%.*]] = extractelement <vscale x 1 x ptr> %p, i32 [[IDX]]
+; CHECK: [[VAL:%.*]] = extractelement <vscale x 1 x i32> %val, i32 [[IDX]]
+; CHECK: [[ATOM:%.*]] = atomicrmw add ptr [[PTR]], i32 [[VAL]] acquire, align 4
+; CHECK: [[RET_NEXT:%.*]] = insertelement <vscale x 1 x i32> [[RET_PREV]], i32 [[ATOM]], i32 [[IDX]]
+; CHECK: br label %if.else
+
+; CHECK: if.else:
+; CHECK: [[MERGE:%.*]] = phi <vscale x 1 x i32> [ [[RET_PREV]], %loopIR ], [ [[RET_NEXT]], %if.then ]
+; CHECK: [[INC]] = add i32 [[IDX]], 1
+; CHECK: [[CMP:%.*]] = icmp ult i32 [[INC]], %vl
+; CHECK: br i1 [[CMP]], label %loopIR, label %exit
+
+; CHECK: exit:
+; CHECK: ret <vscale x 1 x i32> [[MERGE]]
+
+; CHECK: define { <vscale x 1 x i32>, <vscale x 1 x i1> } @__vecz_b_nxv1_vp_masked_cmpxchg_align4_acquire_acquire_1_u9nxv1u3ptru5nxv1ju5nxv1ju5nxv1b(<vscale x 1 x ptr> %p, <vscale x 1 x i32> %cmp, <vscale x 1 x i32> %newval, <vscale x 1 x i1> %mask, i32 %vl) {
+; CHECK: entry:
+; CHECK: [[VLZERO:%.*]] = icmp eq i32 %vl, 0
+; CHECK: br i1 [[VLZERO]], label %earlyexit, label %loopentry
+
+; CHECK: earlyexit:
+; CHECK: ret { <vscale x 1 x i32>, <vscale x 1 x i1> } poison
+
+; CHECK: loopentry:
+; CHECK: br label %loopIR
+
+; CHECK: loopIR:
+; CHECK: [[IDX:%.*]] = phi i32 [ 0, %loopentry ], [ [[INC:%.*]], %if.else ]
+; CHECK: [[RET_PREV:%.*]] = phi <vscale x 1 x i32> [ poison, %loopentry ], [ [[MERGE:%.*]], %if.else ]
+; CHECK: [[SUCCESS_PREV:%.*]] = phi <vscale x 1 x i1> [ poison, %loopentry ], [ [[MERGE_SUCCESS:%.*]], %if.else ]
+; CHECK: [[MASKELT:%.*]] = extractelement <vscale x 1 x i1> %mask, i32 [[IDX]]
+; CHECK: [[MASKCMP:%.*]] = icmp ne i1 [[MASKELT]], false
+; CHECK: br i1 [[MASKCMP]], label %if.then, label %if.else
+
+; CHECK: if.then:
+; CHECK: [[PTR:%.*]] = extractelement <vscale x 1 x ptr> %p, i32 [[IDX]]
+; CHECK: [[CMP:%.*]] = extractelement <vscale x 1 x i32> %cmp, i32 [[IDX]]
+; CHECK: [[NEWVAL:%.*]] = extractelement <vscale x 1 x i32> %newval, i32 [[IDX]]
+; CHECK: [[ATOM:%.*]] = cmpxchg ptr [[PTR]], i32 [[CMP]], i32 [[NEWVAL]] acquire acquire, align 4
+; CHECK: [[EXT0:%.*]] = extractvalue { i32, i1 } [[ATOM]], 0
+; CHECK: [[RET:%.*]] = insertelement <vscale x 1 x i32> [[RET_PREV]], i32 [[EXT0]], i32 [[IDX]]
+; CHECK: [[EXT1:%.*]] = extractvalue { i32, i1 } [[ATOM]], 1
+; CHECK: [[SUCCESS:%.*]] = insertelement <vscale x 1 x i1> [[SUCCESS_PREV]], i1 [[EXT1]], i32 [[IDX]]
+; CHECK: br label %if.else
+
+; CHECK: if.else:
+; CHECK: [[MERGE:%.*]] = phi <vscale x 1 x i32> [ [[RET_PREV]], %loopIR ], [ [[RET]], %if.then ]
+; CHECK: [[MERGE_SUCCESS:%.*]] = phi <vscale x 1 x i1> [ [[SUCCESS_PREV]], %loopIR ], [ [[SUCCESS]], %if.then ]
+; CHECK: [[INC]] = add i32 [[IDX]], 1
+; CHECK: [[CMP:%.*]] = icmp ult i32 [[INC]], %vl
+; CHECK: br i1 [[CMP]], label %loopIR, label %exit
+
+; CHECK: exit:
+; CHECK: [[RETTMP:%.*]] = insertvalue { <vscale x 1 x i32>, <vscale x 1 x i1> } poison, <vscale x 1 x i32> [[MERGE]], 0
+; CHECK: [[RETVAL:%.*]] = insertvalue { <vscale x 1 x i32>, <vscale x 1 x i1> } [[RETTMP]], <vscale x 1 x i1> [[MERGE_SUCCESS]], 1
+; CHECK: ret { <vscale x 1 x i32>, <vscale x 1 x i1> } [[RETVAL]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/cmpxchg.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/cmpxchg.ll
index d62486409b4e3..bf2175364861f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/cmpxchg.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/cmpxchg.ll
@@ -22,51 +22,29 @@ target triple = "spir64-unknown-unknown"
 ; CHECK: define spir_kernel void @__vecz_v4_test_fn(ptr %p, ptr %q, ptr %r)
 define spir_kernel void @test_fn(ptr %p, ptr %q, ptr %r) {
 entry:
+; CHECK: [[SPLAT_PTR_INS:%.*]] = insertelement <4 x ptr> poison, ptr %p, i64 0
+; CHECK: [[SPLAT_PTR:%.*]] = shufflevector <4 x ptr> [[SPLAT_PTR_INS]], <4 x ptr> poison, <4 x i32> zeroinitializer
   %call = call i64 @__mux_get_global_id(i32 0)
 
-; Test that this cmpxchg is scalarized. Not ideal, but hey.
-; CHECK: [[A0:%.*]] = cmpxchg ptr %p, i32 1, i32 2 acquire monotonic, align 4
-; CHECK: [[A1:%.*]] = cmpxchg ptr %p, i32 1, i32 2 acquire monotonic, align 4
-; CHECK: [[A2:%.*]] = cmpxchg ptr %p, i32 1, i32 2 acquire monotonic, align 4
-; CHECK: [[A3:%.*]] = cmpxchg ptr %p, i32 1, i32 2 acquire monotonic, align 4
-
-; Then we insert the values into a strange struct
-; CHECK: [[INS0:%.*]] = insertvalue [4 x { i32, i1 }] undef, { i32, i1 } [[A0]], 0
-; CHECK: [[INS1:%.*]] = insertvalue [4 x { i32, i1 }] [[INS0]], { i32, i1 } [[A1]], 1
-; CHECK: [[INS2:%.*]] = insertvalue [4 x { i32, i1 }] [[INS1]], { i32, i1 } [[A2]], 2
-; CHECK: [[INS3:%.*]] = insertvalue [4 x { i32, i1 }] [[INS2]], { i32, i1 } [[A3]], 3
+; Test that this cmpxchg is packetized by generating a call to an all-true masked version.
+; CHECK: [[A0:%.*]] = call { <4 x i32>, <4 x i1> } @__vecz_b_v4_masked_cmpxchg_align4_acquire_monotonic_1_Dv4_u3ptrDv4_jDv4_jDv4_b(
+; CHECK-SAME: <4 x ptr> [[SPLAT_PTR]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>,
+; CHECK-SAME: <4 x i32> <i32 2, i32 2, i32 2, i32 2>,
+; CHECK-SAME: <4 x i1> <i1 true, i1 true, i1 true, i1 true>
   %old0 = cmpxchg ptr %p, i32 1, i32 2 acquire monotonic
-
-; To extract from this result, we extract each element individually then insert
-; each into a vector.
-; CHECK: [[ELT0_0_0:%.*]] = extractvalue [4 x { i32, i1 }] [[INS3]], 0, 0
-; CHECK: [[ELT0_0_1:%.*]] = extractvalue [4 x { i32, i1 }] [[INS3]], 1, 0
-; CHECK: [[ELT0_0_2:%.*]] = extractvalue [4 x { i32, i1 }] [[INS3]], 2, 0
-; CHECK: [[ELT0_0_3:%.*]] = extractvalue [4 x { i32, i1 }] [[INS3]], 3, 0
-; CHECK: [[INS0V0_0:%.*]] = insertelement <4 x i32> undef, i32 [[ELT0_0_0]], i32 0
-; CHECK: [[INS0V0_1:%.*]] = insertelement <4 x i32> [[INS0V0_0]], i32 [[ELT0_0_1]], i32 1
-; CHECK: [[INS0V0_2:%.*]] = insertelement <4 x i32> [[INS0V0_1]], i32 [[ELT0_0_2]], i32 2
-; CHECK: [[INS0V0_3:%.*]] = insertelement <4 x i32> [[INS0V0_2]], i32 [[ELT0_0_3]], i32 3
+; CHECK: [[EXT0:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[A0]], 0
   %val0 = extractvalue { i32, i1 } %old0, 0
-; Same again here
-; CHECK: [[ELT1_0_0:%.*]] = extractvalue [4 x { i32, i1 }] [[INS3]], 0, 1
-; CHECK: [[ELT1_0_1:%.*]] = extractvalue [4 x { i32, i1 }] [[INS3]], 1, 1
-; CHECK: [[ELT1_0_2:%.*]] = extractvalue [4 x { i32, i1 }] [[INS3]], 2, 1
-; CHECK: [[ELT1_0_3:%.*]] = extractvalue [4 x { i32, i1 }] [[INS3]], 3, 1
-; CHECK: [[INS1V0_0:%.*]] = insertelement <4 x i1> undef, i1 [[ELT1_0_0]], i32 0
-; CHECK: [[INS1V0_1:%.*]] = insertelement <4 x i1> [[INS1V0_0]], i1 [[ELT1_0_1]], i32 1
-; CHECK: [[INS1V0_2:%.*]] = insertelement <4 x i1> [[INS1V0_1]], i1 [[ELT1_0_2]], i32 2
-; CHECK: [[INS1V0_3:%.*]] = insertelement <4 x i1> [[INS1V0_2]], i1 [[ELT1_0_3]], i32 3
+; CHECK: [[EXT1:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[A0]], 1
   %success0 = extractvalue { i32, i1 } %old0, 1
 
   %out = getelementptr i32, ptr %q, i64 %call
 ; Stored as a vector
-; CHECK: store <4 x i32> [[INS0V0_3]], ptr
+; CHECK: store <4 x i32> [[EXT0]], ptr
   store i32 %val0, ptr %out, align 4
 
 ; CHECK: [[PTR:%.*]] = getelementptr i8, ptr %r, i64 %call
   %outsuccess = getelementptr i8, ptr %r, i64 %call
-; CHECK: [[ZEXT0:%.*]] = zext <4 x i1> [[INS1V0_3]] to <4 x i8>
+; CHECK: [[ZEXT0:%.*]] = zext <4 x i1> [[EXT1]] to <4 x i8>
   %outbyte = zext i1 %success0 to i8
 ; Stored as a vector
 ; CHECK: store <4 x i8> [[ZEXT0]], ptr [[PTR]], align 1
@@ -74,25 +52,10 @@ entry:
 
   ; Test a couple of insert/extract patterns
 
-; Test inserting a uniform value into a varying literal struct
-; This is very inefficient
-; CHECK: [[INSS0_0:%.*]] = insertvalue { i32, i1 } [[A0]], i1 false, 1
-; CHECK: [[INSS0_1:%.*]] = insertvalue { i32, i1 } [[A1]], i1 false, 1
-; CHECK: [[INSS0_2:%.*]] = insertvalue { i32, i1 } [[A2]], i1 false, 1
-; CHECK: [[INSS0_3:%.*]] = insertvalue { i32, i1 } [[A3]], i1 false, 1
-; CHECK: [[INSS1_0:%.*]] = insertvalue [4 x { i32, i1 }] undef, { i32, i1 } [[INSS0_0]], 0
-; CHECK: [[INSS1_1:%.*]] = insertvalue [4 x { i32, i1 }] [[INSS1_0]], { i32, i1 } [[INSS0_1]], 1
-; CHECK: [[INSS1_2:%.*]] = insertvalue [4 x { i32, i1 }] [[INSS1_1]], { i32, i1 } [[INSS0_2]], 2
-; CHECK: [[INSS1_3:%.*]] = insertvalue [4 x { i32, i1 }] [[INSS1_2]], { i32, i1 } [[INSS0_3]], 3
-; CHECK: [[EXTS1_0:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS1_3]], 0, 1
-; CHECK: [[EXTS1_1:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS1_3]], 1, 1
-; CHECK: [[EXTS1_2:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS1_3]], 2, 1
-; CHECK: [[EXTS1_3:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS1_3]], 3, 1
-; CHECK: [[INS1V1_0:%.*]] = insertelement <4 x i1> undef, i1 [[EXTS1_0]], i32 0
-; CHECK: [[INS1V1_1:%.*]] = insertelement <4 x i1> [[INS1V1_0]], i1 [[EXTS1_1]], i32 1
-; CHECK: [[INS1V1_2:%.*]] = insertelement <4 x i1> [[INS1V1_1]], i1 [[EXTS1_2]], i32 2
-; CHECK: [[INS1V1_3:%.*]] = insertelement <4 x i1> [[INS1V1_2]], i1 [[EXTS1_3]], i32 3
-; CHECK: [[ZEXT1:%.*]] = zext <4 x i1> [[INS1V1_3]] to <4 x i8>
+  ; Test inserting a uniform value into a varying literal struct
+; CHECK: [[INS0:%.*]] = insertvalue { <4 x i32>, <4 x i1> } [[A0]], <4 x i1> zeroinitializer, 1
+; CHECK: [[EXT2:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[INS0]], 1
+; CHECK: [[ZEXT1:%.*]] = zext <4 x i1> [[EXT2]] to <4 x i8>
 ; CHECK: store <4 x i8> [[ZEXT1]], ptr [[PTR]], align 1
   %testinsertconst = insertvalue { i32, i1 } %old0, i1 false, 1
   %testextract0 = extractvalue { i32, i1 } %testinsertconst, 1
@@ -100,29 +63,11 @@ entry:
   store i8 %outbyte0, ptr %outsuccess, align 1
 
   ; Test inserting a varying value into a varying literal struct
-; CHECK: [[V4I8_LD:%.*]] = load <4 x i8>, ptr %outsuccess, align 1
-; CHECK: [[TRUNC:%.*]] = trunc <4 x i8> [[V4I8_LD]] to <4 x i1>
-; CHECK: [[EXTV0_0:%.*]] = extractelement <4 x i1> [[TRUNC]], i32 0
-; CHECK: [[EXTV0_1:%.*]] = extractelement <4 x i1> [[TRUNC]], i32 1
-; CHECK: [[EXTV0_2:%.*]] = extractelement <4 x i1> [[TRUNC]], i32 2
-; CHECK: [[EXTV0_3:%.*]] = extractelement <4 x i1> [[TRUNC]], i32 3
-; CHECK: [[INSS2_0:%.*]] = insertvalue { i32, i1 } [[A0]], i1 [[EXTV0_0]], 1
-; CHECK: [[INSS2_1:%.*]] = insertvalue { i32, i1 } [[A1]], i1 [[EXTV0_1]], 1
-; CHECK: [[INSS2_2:%.*]] = insertvalue { i32, i1 } [[A2]], i1 [[EXTV0_2]], 1
-; CHECK: [[INSS2_3:%.*]] = insertvalue { i32, i1 } [[A3]], i1 [[EXTV0_3]], 1
-; CHECK: [[INSS3_0:%.*]] = insertvalue [4 x { i32, i1 }] undef, { i32, i1 } [[INSS2_0]], 0
-; CHECK: [[INSS3_1:%.*]] = insertvalue [4 x { i32, i1 }] [[INSS3_0]], { i32, i1 } [[INSS2_1]], 1
-; CHECK: [[INSS3_2:%.*]] = insertvalue [4 x { i32, i1 }] [[INSS3_1]], { i32, i1 } [[INSS2_2]], 2
-; CHECK: [[INSS3_3:%.*]] = insertvalue [4 x { i32, i1 }] [[INSS3_2]], { i32, i1 } [[INSS2_3]], 3
-; CHECK: [[EXTS3_0:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS3_3]], 0, 1
-; CHECK: [[EXTS3_1:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS3_3]], 1, 1
-; CHECK: [[EXTS3_2:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS3_3]], 2, 1
-; CHECK: [[EXTS3_3:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS3_3]], 3, 1
-; CHECK: [[INS1V2_0:%.*]] = insertelement <4 x i1> undef, i1 [[EXTS3_0]], i32 0
-; CHECK: [[INS1V2_1:%.*]] = insertelement <4 x i1> [[INS1V2_0]], i1 [[EXTS3_1]], i32 1
-; CHECK: [[INS1V2_2:%.*]] = insertelement <4 x i1> [[INS1V2_1]], i1 [[EXTS3_2]], i32 2
-; CHECK: [[INS1V2_3:%.*]] = insertelement <4 x i1> [[INS1V2_2]], i1 [[EXTS3_3]], i32 3
-; CHECK: [[ZEXT2:%.*]] = zext <4 x i1> [[INS1V2_3]] to <4 x i8>
+; CHECK: [[LD:%.*]] = load <4 x i8>, ptr
+; CHECK: [[VBOOL:%.*]] = trunc <4 x i8> [[LD]] to <4 x i1>
+; CHECK: [[INS1:%.*]] = insertvalue { <4 x i32>, <4 x i1> } [[A0]], <4 x i1> [[VBOOL]], 1
+; CHECK: [[EXT3:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[INS1]], 1
+; CHECK: [[ZEXT2:%.*]] = zext <4 x i1> [[EXT3]] to <4 x i8>
 ; CHECK: store <4 x i8> [[ZEXT2]], ptr [[PTR]], align 1
   %byte1 = load i8, ptr %outsuccess, align 1
   %bool1 = trunc i8 %byte1 to i1
@@ -132,23 +77,9 @@ entry:
   store i8 %outbyte1, ptr %outsuccess, align 1
 
   ; Test inserting a varying value into a uniform literal struct
-; CHECK: [[INSS4_0:%.*]] = insertvalue { i32, i1 } poison, i1 [[EXTV0_0]], 1
-; CHECK: [[INSS4_1:%.*]] = insertvalue { i32, i1 } poison, i1 [[EXTV0_1]], 1
-; CHECK: [[INSS4_2:%.*]] = insertvalue { i32, i1 } poison, i1 [[EXTV0_2]], 1
-; CHECK: [[INSS4_3:%.*]] = insertvalue { i32, i1 } poison, i1 [[EXTV0_3]], 1
-; CHECK: [[INSS5_0:%.*]] = insertvalue [4 x { i32, i1 }] undef, { i32, i1 } [[INSS4_0]], 0
-; CHECK: [[INSS5_1:%.*]] = insertvalue [4 x { i32, i1 }] [[INSS5_0]], { i32, i1 } [[INSS4_1]], 1
-; CHECK: [[INSS5_2:%.*]] = insertvalue [4 x { i32, i1 }] [[INSS5_1]], { i32, i1 } [[INSS4_2]], 2
-; CHECK: [[INSS5_3:%.*]] = insertvalue [4 x { i32, i1 }] [[INSS5_2]], { i32, i1 } [[INSS4_3]], 3
-; CHECK: [[EXTS5_0:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS5_3]], 0, 1
-; CHECK: [[EXTS5_1:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS5_3]], 1, 1
-; CHECK: [[EXTS5_2:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS5_3]], 2, 1
-; CHECK: [[EXTS5_3:%.*]] = extractvalue [4 x { i32, i1 }] [[INSS5_3]], 3, 1
-; CHECK: [[INS2V3_0:%.*]] = insertelement <4 x i1> undef, i1 [[EXTS5_0]], i32 0
-; CHECK: [[INS2V3_1:%.*]] = insertelement <4 x i1> [[INS2V3_0]], i1 [[EXTS5_1]], i32 1
-; CHECK: [[INS2V3_2:%.*]] = insertelement <4 x i1> [[INS2V3_1]], i1 [[EXTS5_2]], i32 2
-; CHECK: [[INS2V3_3:%.*]] = insertelement <4 x i1> [[INS2V3_2]], i1 [[EXTS5_3]], i32 3
-; CHECK: [[ZEXT3:%.*]] = zext <4 x i1> [[INS2V3_3]] to <4 x i8>
+; CHECK: [[INS2:%.*]] = insertvalue { <4 x i32>, <4 x i1> } poison, <4 x i1> [[VBOOL]], 1
+; CHECK: [[EXT4:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[INS2]], 1
+; CHECK: [[ZEXT3:%.*]] = zext <4 x i1> [[EXT4]] to <4 x i8>
 ; CHECK: store <4 x i8> [[ZEXT3]], ptr [[PTR]], align 1
   %testinsertvarying1 = insertvalue { i32, i1 } poison, i1 %bool1, 1
   %testextract2 = extractvalue { i32, i1 } %testinsertvarying1, 1
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_atomic.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_atomic.ll
deleted file mode 100644
index b6beaae1e47c2..0000000000000
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_atomic.ll
+++ /dev/null
@@ -1,46 +0,0 @@
-; Copyright (C) Codeplay Software Limited
-;
-; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
-; Exceptions; you may not use this file except in compliance with the License.
-; You may obtain a copy of the License at
-;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
-;
-; Unless required by applicable law or agreed to in writing, software
-; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-; License for the specific language governing permissions and limitations
-; under the License.
-;
-; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-; RUN: veczc -w 4 -vecz-passes=cfg-convert,verify -S \
-; RUN:   --pass-remarks-missed=vecz < %s 2>&1 | FileCheck %s
-
-target triple = "spir64-unknown-unknown"
-target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-; CHECK: Vecz: Could not apply masks for function "kernel"
-; CHECK-NEXT: note: Could not apply mask to atomic instruction
-; CHECK-SAME:  atomic_success = cmpxchg ptr %arrayidx.in, i32 2, i32 4 acq_rel monotonic, align 4
-
-define spir_kernel void @kernel(ptr %in, ptr %out) {
-entry:
-  %gid = tail call i64 @__mux_get_global_id(i32 0)
-  %cmp = icmp eq i64 %gid, 0
-  br i1 %cmp, label %if.then, label %end
-
-if.then:
-  %arrayidx.in = getelementptr inbounds i32, ptr %in, i64 %gid
-  %atomic_success = cmpxchg ptr %arrayidx.in, i32 2, i32 4 acq_rel monotonic, align 4
-  %atomic = extractvalue { i32, i1 } %atomic_success, 0
-  br label %end
-
-end:
-  %merge = phi i32 [ 0, %entry ], [ %atomic, %if.then ]
-  %arrayidx.out = getelementptr inbounds i32, ptr %out, i64 %gid
-  store i32 %merge, ptr %arrayidx.out, align 4
-  ret void
-}
-
-declare i64 @__mux_get_global_id(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_cmpxchg.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_cmpxchg.ll
new file mode 100644
index 0000000000000..73aec6dfc4caa
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_cmpxchg.ll
@@ -0,0 +1,105 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -w 4 -vecz-passes=cfg-convert,verify,packetizer,define-builtins,verify -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; CHECK: define spir_kernel void @__vecz_v4_test_fn(ptr %p, ptr %q, ptr %r)
+define spir_kernel void @test_fn(ptr %p, ptr %q, ptr %r) {
+entry:
+; CHECK: [[SPLAT_PTR_INS:%.*]] = insertelement <4 x ptr> poison, ptr %p, i64 0
+; CHECK: [[SPLAT_PTR:%.*]] = shufflevector <4 x ptr> [[SPLAT_PTR_INS]], <4 x ptr> poison, <4 x i32> zeroinitializer
+; CHECK: [[CMP:%.*]] = icmp sgt <4 x i64> <i64 3, i64 3, i64 3, i64 3>, 
+  %call = call i64 @__mux_get_global_id(i32 0)
+  %cmp = icmp sgt i64 3, %call
+; CHECK: [[VEC_PTR:%.*]] = getelementptr i32, ptr %p, <4 x i64>
+  %wi_p_i32 = getelementptr i32, ptr %p, i64 %call
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+; CHECK: [[CALL:%.*]] = call { <4 x i32>, <4 x i1> } @__vecz_b_v4_masked_cmpxchg_align4_acquire_monotonic_1_Dv4_u3ptrDv4_jDv4_jDv4_b(
+; CHECK-SAME: <4 x ptr> [[SPLAT_PTR]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>,
+; CHECK-SAME: <4 x i32> <i32 2, i32 2, i32 2, i32 2>, <4 x i1> [[CMP]]
+  %old0 = cmpxchg ptr %p, i32 1, i32 2 acquire monotonic
+  %val0 = extractvalue { i32, i1 } %old0, 0
+  %success0 = extractvalue { i32, i1 } %old0, 1
+
+  %out = getelementptr i32, ptr %q, i64 %call
+  store i32 %val0, ptr %out, align 4
+
+  %outsuccess = getelementptr i8, ptr %r, i64 %call
+  %outbyte = zext i1 %success0 to i8
+  store i8 %outbyte, ptr %outsuccess, align 1
+
+  ; Test a couple of insert/extract patterns
+; CHECK: [[INS:%.*]] = insertvalue { <4 x i32>, <4 x i1> } [[CALL]], <4 x i1> [[CMP]], 1
+; CHECK: [[EXT:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[INS]], 1
+  %testinsert = insertvalue { i32, i1 } %old0, i1 %cmp, 1
+  %testextract = extractvalue { i32, i1 } %testinsert, 1
+
+  %outbyte0 = zext i1 %testextract to i8
+  store i8 %outbyte0, ptr %outsuccess, align 1
+
+; CHECK: = call { <4 x i32>, <4 x i1> } @__vecz_b_v4_masked_cmpxchg_weak_volatile_align8_monotonic_seqcst_0_Dv4_u3ptrDv4_jDv4_jDv4_b(
+  %old1 = cmpxchg weak volatile ptr %wi_p_i32, i32 1, i32 2 syncscope("singlethread") monotonic seq_cst, align 8
+
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void
+}
+
+; CHECK: define { <4 x i32>, <4 x i1> } @__vecz_b_v4_masked_cmpxchg_align4_acquire_monotonic_1_Dv4_u3ptrDv4_jDv4_jDv4_b(<4 x ptr> [[PTRS:%0]], <4 x i32> [[CMPS:%1]], <4 x i32> [[NEWS:%2]], <4 x i1> [[MASK:%3]]) {
+; CHECK: entry:
+; CHECK: br label %loopIR
+
+; CHECK: loopIR:
+; CHECK: [[IDX:%.*]] = phi i32 [ 0, %entry ], [ [[IDX_NEXT:%.*]], %if.else ]
+; CHECK: [[PREV:%.*]] = phi <4 x i32> [ poison, %entry ], [ [[MERGE:%.*]], %if.else ]
+; CHECK: [[PREVSUCCESS:%.*]] = phi <4 x i1> [ poison, %entry ], [ [[MERGESUCCESS:%.*]], %if.else ]
+; CHECK: [[MASKELT:%.*]] = extractelement <4 x i1> [[MASK]], i32 [[IDX]]
+; CHECK: [[MASKCMP:%.*]] = icmp ne i1 [[MASKELT]], false
+; CHECK: br i1 [[MASKCMP]], label %if.then, label %if.else
+
+; CHECK: if.then:
+; CHECK: [[PTR:%.*]] = extractelement <4 x ptr> [[PTRS]], i32 [[IDX]]
+; CHECK: [[CMP:%.*]] = extractelement <4 x i32> [[CMPS]], i32 [[IDX]]
+; CHECK: [[NEW:%.*]] = extractelement <4 x i32> [[NEWS]], i32 [[IDX]]
+; CHECK: [[ATOM:%.*]] = cmpxchg ptr [[PTR]], i32 [[CMP]], i32 [[NEW]] acquire monotonic, align 4
+; CHECK: [[VAL:%.*]] = extractvalue { i32, i1 } [[ATOM]], 0
+; CHECK: [[RET:%.*]] = insertelement <4 x i32> [[PREV]], i32 [[VAL]], i32 [[IDX]]
+; CHECK: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[ATOM]], 1
+; CHECK: [[RETSUCCESS:%.*]] = insertelement <4 x i1> [[PREVSUCCESS]], i1 [[SUCCESS]], i32 [[IDX]]
+; CHECK: br label %if.else
+
+; CHECK: if.else:
+; CHECK: [[MERGE]] = phi <4 x i32> [ [[PREV]], %loopIR ], [ [[RET]], %if.then ]
+; CHECK: [[MERGESUCCESS]] = phi <4 x i1> [ [[PREVSUCCESS]], %loopIR ], [ [[RETSUCCESS]], %if.then ]
+; CHECK: [[IDX_NEXT]] = add i32 [[IDX]], 1
+
+; CHECK: exit:
+; CHECK: [[INS0:%.*]] = insertvalue { <4 x i32>, <4 x i1> } poison, <4 x i32> [[MERGE]], 0
+; CHECK: [[INS1:%.*]] = insertvalue { <4 x i32>, <4 x i1> } [[INS0]], <4 x i1> [[MERGESUCCESS]], 1
+; CHECK: ret { <4 x i32>, <4 x i1> } [[INS1]]
+
+; Assume that all masked cmpxchg operations follow the logic above. Just
+; check that the right cmpxchg instruction is being generated.
+; CHECK: define { <4 x i32>, <4 x i1> } @__vecz_b_v4_masked_cmpxchg_weak_volatile_align8_monotonic_seqcst_0_Dv4_u3ptrDv4_jDv4_jDv4_b(<4 x ptr> [[PTRS:%0]], <4 x i32> [[CMPS:%1]], <4 x i32> [[NEWS:%2]], <4 x i1> [[MASK:%3]]) {
+; CHECK: cmpxchg weak volatile ptr {{%.*}}, i32 {{%.*}}, i32 {{%.*}} syncscope("singlethread") monotonic seq_cst, align 8
+
+declare i64 @__mux_get_global_id(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_cmpxchg_scalar.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_cmpxchg_scalar.ll
new file mode 100644
index 0000000000000..831b6cca8fae8
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_cmpxchg_scalar.ll
@@ -0,0 +1,48 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -vecz-passes=define-builtins,verify -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test_fn(ptr %p) {
+  %ret = call { i32, i1 } @__vecz_b_v1_masked_cmpxchg_align4_acquire_monotonic_1_u3ptrjjb(ptr %p, i32 1, i32 2, i1 true)
+  ret void
+}
+
+declare { i32, i1 } @__vecz_b_v1_masked_cmpxchg_align4_acquire_monotonic_1_u3ptrjjb(ptr %p, i32 %cmp, i32 %newval, i1 %mask)
+
+; CHECK: define { i32, i1 } @__vecz_b_v1_masked_cmpxchg_align4_acquire_monotonic_1_u3ptrjjb(ptr %p, i32 %cmp, i32 %newval, i1 %mask) {
+; CHECK: entry:
+; CHECK: [[MASKCMP:%.*]] = icmp ne i1 %mask, false
+; CHECK: br i1 [[MASKCMP]], label %if.then, label %if.else
+
+; CHECK: if.then:
+; CHECK: [[ATOM:%.*]] = cmpxchg ptr %p, i32 %cmp, i32 %newval acquire monotonic, align 4
+; CHECK: [[EXT0:%.*]] = extractvalue { i32, i1 } [[ATOM]], 0
+; CHECK: [[EXT1:%.*]] = extractvalue { i32, i1 } [[ATOM]], 1
+; CHECK: br label %if.else
+
+; CHECK: if.else:
+; CHECK: [[RETVAL:%.*]] = phi i32 [ poison, %entry ], [ [[EXT0]], %if.then ]
+; CHECK: [[RETSUCC:%.*]] = phi i1 [ poison, %entry ], [ [[EXT1]], %if.then ]
+; CHECK: br label %exit
+
+; CHECK: exit:
+; CHECK: [[INS0:%.*]] = insertvalue { i32, i1 } poison, i32 [[RETVAL]], 0
+; CHECK: [[INS1:%.*]] = insertvalue { i32, i1 } [[INS0]], i1 [[RETSUCC]], 1
+; CHECK: ret { i32, i1 } [[INS1]]

From 783cd9baa4067687da1a91bf024ff02c2288f484 Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Tue, 2 Jan 2024 12:14:12 +0000
Subject: [PATCH 074/182] [NFC] Change startswith/endswith to
 starts_with/ends_with.

For consistency with C++20, LLVM renamed startswith to starts_with, and
likewise for endswith. startswith and endswith are still available as
wrappers that do nothing but call starts_with and ends_with, but as of
LLVM 18 have been deprecated. As the oldest LLVM we support (LLVM 16)
already had starts_with and ends_with, we can just use them
unconditionally.
---
 .../compiler_passes/vecz/source/vectorization_context.cpp       | 2 +-
 .../compiler_passes/vecz/source/vectorizer.cpp                  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
index 6880bbfc4de76..15b1420e64a18 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
@@ -198,7 +198,7 @@ VectorizationResult VectorizationContext::getVectorizedFunction(
 }
 
 bool VectorizationContext::isInternalBuiltin(const Function *F) {
-  return F->getName().startswith(VectorizationContext::InternalBuiltinPrefix);
+  return F->getName().starts_with(VectorizationContext::InternalBuiltinPrefix);
 }
 
 Function *VectorizationContext::getOrCreateInternalBuiltin(StringRef Name,
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorizer.cpp
index e5d875c644493..f9a2adf8e8ad6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorizer.cpp
@@ -183,7 +183,7 @@ void collectStatistics(VectorizationUnit &VU, Function *Scalar,
       // Detect vector splats
       // Count insert/extractelement instructions
       if (isa<InsertElementInst>(I) || isa<ExtractElementInst>(I)) {
-        if (I.getName().startswith(".splatinsert")) {
+        if (I.getName().starts_with(".splatinsert")) {
           ++VeczSplats;
         }
         ++VeczInsertExtract;

From bcae693b7bdf804e3957c989ba96dd0beddfc0c4 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Tue, 2 Jan 2024 16:58:07 +0000
Subject: [PATCH 075/182] [vecz] Fix missing CHECKs in LIT tests

---
 .../vecz/test/lit/llvm/Boscc/partial_linearization5.ll          | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/masked_interleaved.ll    | 2 +-
 .../vecz/test/lit/llvm/masked_interleaved_group.ll              | 2 +-
 .../vecz/test/lit/llvm/masked_interleaved_group2.ll             | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization5.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization5.ll
index a2ff9ce17de79..cb6acb6c70594 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization5.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization5.ll
@@ -254,7 +254,7 @@ attributes #2 = { nobuiltin nounwind readonly }
 ; CHECK: [[FLOOPEXIT1]]:
 ; CHECK: br label %[[IFTHEN]]
 
-; CHECK; [[F]]:
+; CHECK: [[F]]:
 ; CHECK: br label %[[G]]
 
 ; CHECK: [[G]]:
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved.ll
index 8283037ef3059..4c3d9e82970fe 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved.ll
@@ -71,4 +71,4 @@ attributes #2 = { nobuiltin }
 ; CHECK: %BroadcastAddr.splat = shufflevector <4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <4 x ptr addrspace(1)> {{poison|undef}}, <4 x i32> zeroinitializer
 ; CHECK: %3 = getelementptr i32, <4 x ptr addrspace(1)> %BroadcastAddr.splat, <4 x i64> <i64 0, i64 2, i64 4, i64 6>
 ; CHECK: call void @llvm.masked.scatter.v4i32.v4p1(<4 x i32> %0, <4 x ptr addrspace(1)> %3, i32{{( immarg)?}} 4, <4 x i1> %2) #
-; CHECK ret void
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group.ll
index 6360d5b5313bb..c61afc692155a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group.ll
@@ -96,4 +96,4 @@ attributes #2 = { convergent nobuiltin nounwind readonly }
 
 ; Definitely no unmasked stores:
 ; CHECK-NOT: store <16 x i8>
-; CHECK ret void
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group2.ll
index ce530c9ad73e5..0890b70d7c6b9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group2.ll
@@ -115,4 +115,4 @@ attributes #2 = { convergent nobuiltin nounwind readonly }
 
 ; Definitely no unmasked stores:
 ; CHECK-NOT: store <16 x i8>
-; CHECK ret void
+; CHECK: ret void

From eb3737ff96c68a057117faa18c99777ea6f2ce13 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Tue, 2 Jan 2024 17:22:27 +0000
Subject: [PATCH 076/182] [vecz] Fix missing CHECK in LIT test

This one was more complicated as the CHECKs, when added, weren't passing
because the instructions were in a different order.

While this wasn't very difficult to fix, this change does take the
opportunity to update and simplify the test by removing an unused
function.
---
 .../llvm/interleaved_defuse_instantiated.ll   | 45 ++++++-------------
 1 file changed, 14 insertions(+), 31 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_defuse_instantiated.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_defuse_instantiated.ll
index 42e076af5e4c4..72ca3181302aa 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_defuse_instantiated.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_defuse_instantiated.ll
@@ -14,20 +14,32 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: veczc -k printf_kernel -vecz-simd-width=4 -S < %s | FileCheck %s
+; RUN: veczc -w 4 -vecz-passes=cfg-convert,packetizer -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
+; CHECK: @.str = private unnamed_addr addrspace(2) constant [8 x i8] c"blah %d\00", align 1
 @.str = private unnamed_addr addrspace(2) constant [8 x i8] c"blah %d\00", align 1
 @.strf = private unnamed_addr addrspace(2) constant [7 x i8] c"%#16A\0A\00", align 1
 
 ; Function Attrs: nounwind
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_printf_kernel(
+; CHECK: if.then:
+; CHECK: [[ELT0:%.*]] = extractelement
+; CHECK: [[ELT1:%.*]] = extractelement
+; CHECK: [[ELT2:%.*]] = extractelement
+; CHECK: [[ELT3:%.*]] = extractelement
+; CHECK: = call spir_func i32 @__vecz_b_masked_printf_u3ptrU3AS2jb(ptr addrspace(2) @.str, i32 [[ELT0]]
+; CHECK: = call spir_func i32 @__vecz_b_masked_printf_u3ptrU3AS2jb(ptr addrspace(2) @.str, i32 [[ELT1]]
+; CHECK: = call spir_func i32 @__vecz_b_masked_printf_u3ptrU3AS2jb(ptr addrspace(2) @.str, i32 [[ELT2]]
+; CHECK: = call spir_func i32 @__vecz_b_masked_printf_u3ptrU3AS2jb(ptr addrspace(2) @.str, i32 [[ELT3]]
+; CHECK: ret void
 define spir_kernel void @printf_kernel(i32 addrspace(1)* %in, i32 addrspace(1)* %stridesX, i32 addrspace(1)* %dst, i32 %width, i32 %height) #0 {
 entry:
   %call = call i64 @__mux_get_global_id(i32 0) #3
-  %cmp = icmp eq i32 %width, 13
+  %cmp = icmp eq i64 %call, 13
   br i1 %cmp, label %if.then, label %if.end
 
 if.then:                                          ; preds = %entry
@@ -41,19 +53,6 @@ if.end:                                           ; preds = %if.then, %entry
   ret void
 }
 
-define spir_kernel void @test_float(float* %in) {
-entry:
-  %call = call i64 @__mux_get_global_id(i32 0)
-  %arrayidx = getelementptr inbounds float, float* %in, i64 %call
-  %0 = load float, float* %arrayidx, align 4
-  %mul = fmul float %0, %0
-  %conv = fpext float %mul to double
-  %call8 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([7 x i8], [7 x i8] addrspace(2)* @.strf, i64 0, i64 0), double %conv)
-  ret void
-}
-
-
-
 declare i64 @__mux_get_global_id(i32) #1
 
 declare extern_weak spir_func i32 @printf(i8 addrspace(2)*, ...) #1
@@ -72,19 +71,3 @@ attributes #2 = { nobuiltin nounwind }
 !4 = !{!"kernel_arg_base_type", !"int*", !"int*", !"int*", !"int", !"int"}
 !5 = !{!"kernel_arg_type_qual", !"", !"", !"", !"", !""}
 !6 = !{!"clang version 3.8.0 "}
-
-; CHECK: entry:
-; CHECK: if.then:
-; CHECK  extractelement
-; CHECK-NEXT  extractelement
-; CHECK-NEXT    %4 = call spir_func i32 @__vecz_b_masked_printf_PU3AS2hjb(i8 addrspace(2)* getelementptr inbounds ([8 x i8], [8
-; CHECK  extractelement
-; CHECK-NEXT  extractelement
-; CHECK-NEXT    %4 = call spir_func i32 @__vecz_b_masked_printf_PU3AS2hjb(i8 addrspace(2)* getelementptr inbounds ([8 x i8], [8
-; CHECK  extractelement
-; CHECK-NEXT  extractelement
-; CHECK-NEXT    %4 = call spir_func i32 @__vecz_b_masked_printf_PU3AS2hjb(i8 addrspace(2)* getelementptr inbounds ([8 x i8], [8
-; CHECK  extractelement
-; CHECK-NEXT  extractelement
-; CHECK-NEXT    %4 = call spir_func i32 @__vecz_b_masked_printf_PU3AS2hjb(i8 addrspace(2)* getelementptr inbounds ([8 x i8], [8
-; CHECK: ret void

From 153037f9635ebfaf8f5b63c04176fa5f86ae5356 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Tue, 2 Jan 2024 17:00:14 +0000
Subject: [PATCH 077/182] [vecz] Enable XFAIL tests

These two tests were actually passing in terms of the vectorizer, but
were then using a FileCheck invocation which failed, because there were
no CHECKs.

The two tests were originally testing that we couldn't packetize or
scalarize really wide vectors. However, we can. So this change updates
the tests to test the positive cases instead.
---
 .../lit/llvm/too_large_simdwidth_packetization.ll     | 11 ++++++-----
 .../lit/llvm/too_large_simdwidth_scalarization.ll     |  9 ++++-----
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_packetization.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_packetization.ll
index b2ec9fe8ef2ef..65b3015cf0289 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_packetization.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_packetization.ll
@@ -14,14 +14,18 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; REQUIRES: linux
-; RUN: veczc -k add -vecz-simd-width=128 -S < %s | FileCheck %s
+; RUN: veczc -vecz-simd-width=128 -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
 ; Function Attrs: nounwind
+; CHECK-LABEL: define spir_kernel void @__vecz_v128_add(ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %out) 
+; CHECK: = load <128 x i32>, ptr addrspace(1)
+; CHECK: = load <128 x i32>, ptr addrspace(1)
+; CHECK: = add nsw <128 x i32>
+; CHECK: store <128 x i32>
 define spir_kernel void @add(i32 addrspace(1)* %in1, i32 addrspace(1)* %in2, i32 addrspace(1)* %out) #0 !dbg !4 {
 entry:
   %in1.addr = alloca i32 addrspace(1)*, align 8
@@ -112,6 +116,3 @@ attributes #3 = { nobuiltin }
 !33 = !DILocation(line: 6, scope: !4)
 !34 = !DILocation(line: 7, scope: !4)
 !35 = !DILocation(line: 8, scope: !4)
-
-; We do not expect this test to succeed
-; XFAIL: *
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_scalarization.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_scalarization.ll
index a509bc5563d8b..2b51497d7158c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_scalarization.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_scalarization.ll
@@ -14,14 +14,16 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; REQUIRES: linux
-; RUN: veczc -k add -vecz-simd-width=4 -S < %s | FileCheck %s
+; RUN: veczc -w 4 -vecz-passes=scalarizer -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
 ; Function Attrs: nounwind
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_add(ptr %in1, ptr %in2, ptr %out)
+; CHECK-COUNT-128: = extractelement <128 x i32> %in1v,
+; CHECK-COUNT-128: insertelement <128 x i32>
 define spir_kernel void @add(<128 x i32>* %in1, <128 x i32>* %in2, <128 x i32>* %out) {
 entry:
   %call = call i64 @__mux_get_global_id(i32 0)
@@ -36,6 +38,3 @@ entry:
 }
 
 declare i64 @__mux_get_global_id(i32) #2
-
-; We do not expect this test to succeed
-; XFAIL: *

From f3764ec164aab73c3364ec5cdd68b1f9ea8e928f Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Tue, 2 Jan 2024 17:13:09 +0000
Subject: [PATCH 078/182] [vecz] Merge and fix call instantiation LIT tests

Some of the 'failure' tests were broken for reasons seen in previous
commits: the test was actually passing, but the incorrect use of
FileCheck was satisfying the XFAIL.

These tests all use the same input, so are now merged. Tests for cases
we can packetize check the resulting codegen; tests for cases we can't
packetize check that we don't produce a vectorized function.
---
 .../vecz/test/lit/llvm/call_instantiation.ll  | 160 ++++++++++++++++++
 ...all_instantiation_failure_cantduplicate.ll | 128 --------------
 .../call_instantiation_failure_cantinline.ll  | 128 --------------
 .../call_instantiation_failure_optnone.ll     | 128 --------------
 ...ll_instantiation_failure_user_undefined.ll | 128 --------------
 .../call_instantiation_success_builtin.ll     | 129 --------------
 .../call_instantiation_success_instrinsic.ll  | 129 --------------
 ...call_instantiation_success_user_defined.ll | 129 --------------
 8 files changed, 160 insertions(+), 899 deletions(-)
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation.ll
 delete mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_cantduplicate.ll
 delete mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_cantinline.ll
 delete mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_optnone.ll
 delete mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_user_undefined.ll
 delete mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_success_builtin.ll
 delete mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_success_instrinsic.ll
 delete mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_success_user_defined.ll

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation.ll
new file mode 100644
index 0000000000000..90d3eb156f5c2
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation.ll
@@ -0,0 +1,160 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -vecz-passes=packetizer -vecz-simd-width=4 -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Kernels
+
+; We should be able to handle intrinsics
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_instrinsic(ptr %in1, ptr %in2, ptr %in3, ptr %out)
+; CHECK: call <4 x float> @llvm.fmuladd.v4f32(<4 x float> {{%.*}}, <4 x float> {{%.*}}, <4 x float> {{%.*}})
+define spir_kernel void @instrinsic(ptr %in1, ptr %in2, ptr %in3, ptr %out) {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 0)
+  %arrayidx = getelementptr inbounds float, ptr %in1, i64 %call
+  %0 = load float, ptr %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds float, ptr %in2, i64 %call
+  %1 = load float, ptr %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds float, ptr %in3, i64 %call
+  %2 = load float, ptr %arrayidx2, align 4
+  %3 = tail call float @llvm.fmuladd.f32(float %0, float %1, float %2)
+  %arrayidx3 = getelementptr inbounds float, ptr %out, i64 %call
+  store float %3, ptr %arrayidx3, align 4
+  ret void
+}
+
+; We should be able to handle builtins for which we have a vector declaration
+; in the module.
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_builtin(ptr %in, ptr %out)
+; CHECK: = call spir_func <4 x i32> @_Z3absDv4_i(<4 x i32> {{%.*}})
+define spir_kernel void @builtin(ptr %in, ptr %out) {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 0)
+  %arrayidx = getelementptr inbounds i32, ptr %in, i64 %call
+  %0 = load i32, ptr %arrayidx, align 4
+  %call1 = tail call spir_func i32 @_Z3absi(i32 %0)
+  %arrayidx2 = getelementptr inbounds i32, ptr %out, i64 %call
+  store i32 %call1, ptr %arrayidx2, align 4
+  ret void
+}
+
+; We should be able to handle user functions for which we have a definition
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_user_defined(ptr %in, ptr %out)
+; CHECK: call spir_func void @defined(ptr {{%.*}}, ptr {{%.*}})
+; CHECK: call spir_func void @defined(ptr {{%.*}}, ptr {{%.*}})
+; CHECK: call spir_func void @defined(ptr {{%.*}}, ptr {{%.*}})
+; CHECK: call spir_func void @defined(ptr {{%.*}}, ptr {{%.*}})
+define spir_kernel void @user_defined(ptr %in, ptr %out) {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 0)
+  %add.ptr = getelementptr inbounds i32, ptr %in, i64 %call
+  %add.ptr1 = getelementptr inbounds i32, ptr %out, i64 %call
+  call spir_func void @defined(ptr %add.ptr, ptr %add.ptr1)
+  ret void
+}
+
+; We should be able to handle user functions (or builtins) for which we have no
+; definition
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_user_undefined(ptr %in, ptr %out)
+; CHECK: call spir_func void @undefined(ptr {{%.*}}, ptr {{%.*}})
+; CHECK: call spir_func void @undefined(ptr {{%.*}}, ptr {{%.*}})
+; CHECK: call spir_func void @undefined(ptr {{%.*}}, ptr {{%.*}})
+; CHECK: call spir_func void @undefined(ptr {{%.*}}, ptr {{%.*}})
+define spir_kernel void @user_undefined(ptr %in, ptr %out) {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 0)
+  %add.ptr = getelementptr inbounds i32, ptr %in, i64 %call
+  %add.ptr1 = getelementptr inbounds i32, ptr %out, i64 %call
+  call spir_func void @undefined(ptr %add.ptr, ptr %add.ptr1)
+  ret void
+}
+
+; We should be able to handle user functions (or builtins) which we can't
+; inline
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_cantinline(ptr %in, ptr %out)
+; CHECK: call spir_func void @dontinline(ptr {{%.*}}, ptr {{%.*}})
+; CHECK: call spir_func void @dontinline(ptr {{%.*}}, ptr {{%.*}})
+; CHECK: call spir_func void @dontinline(ptr {{%.*}}, ptr {{%.*}})
+; CHECK: call spir_func void @dontinline(ptr {{%.*}}, ptr {{%.*}})
+define spir_kernel void @cantinline(ptr %in, ptr %out) {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 0)
+  %add.ptr = getelementptr inbounds i32, ptr %in, i64 %call
+  %add.ptr1 = getelementptr inbounds i32, ptr %out, i64 %call
+  call spir_func void @dontinline(ptr %add.ptr, ptr %add.ptr1)
+  ret void
+}
+
+; If we can't duplicate a function, we can't packetize it.
+; CHECK-NOT: @__vecz_v4_cantduplicate
+define spir_kernel void @cantduplicate(ptr %in, ptr %out) {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 0)
+  %arrayidx = getelementptr inbounds i32, ptr %in, i64 %call
+  %0 = load i32, ptr %arrayidx, align 4
+  %call1 = tail call spir_func i32 @_Z3clzi(i32 %0) #1
+  %arrayidx2 = getelementptr inbounds i32, ptr %out, i64 %call
+  store i32 %call1, ptr %arrayidx2, align 4
+  ret void
+}
+
+; The optnone attribute has no impact when directly running the packetizer
+; pass. The higher-level vectorization factor decisions must take this into
+; account instead.
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_optnone(ptr %in, ptr %out)
+define spir_kernel void @optnone(ptr %in, ptr %out) #2 {
+entry:
+  %call = call i64 @__mux_get_global_id(i32 0)
+  %arrayidx = getelementptr inbounds i32, ptr %in, i64 %call
+  %0 = load i32, ptr %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, ptr %out, i64 %call
+  store i32 %0, ptr %arrayidx1, align 4
+  ret void
+}
+
+; Declaration only functions
+
+declare float @llvm.fmuladd.f32(float, float, float)
+declare spir_func i32 @_Z3absi(i32)
+declare spir_func <4 x i32> @_Z3absDv4_i(<4 x i32>)
+declare spir_func i32 @_Z3clzi(i32) #1
+declare i64 @__mux_get_global_id(i32)
+declare spir_func void @undefined(ptr, ptr)
+
+; Functions with definitions
+
+define spir_func void @defined(ptr %in, ptr %out) {
+entry:
+  %0 = load i32, ptr %in, align 4
+  store i32 %0, ptr %out, align 4
+  ret void
+}
+
+define spir_func void @dontinline(ptr %in, ptr %out) #0 {
+entry:
+  %0 = load i32, ptr %in, align 4
+  store i32 %0, ptr %out, align 4
+  ret void
+}
+
+; Attributes
+
+attributes #0 = { noinline }
+attributes #1 = { noduplicate }
+attributes #2 = { optnone noinline }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_cantduplicate.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_cantduplicate.ll
deleted file mode 100644
index 1874c37800a31..0000000000000
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_cantduplicate.ll
+++ /dev/null
@@ -1,128 +0,0 @@
-; Copyright (C) Codeplay Software Limited
-;
-; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
-; Exceptions; you may not use this file except in compliance with the License.
-; You may obtain a copy of the License at
-;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
-;
-; Unless required by applicable law or agreed to in writing, software
-; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-; License for the specific language governing permissions and limitations
-; under the License.
-;
-; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-; RUN: veczc -k cantduplicate -vecz-passes=packetizer -vecz-simd-width=4 -S < %s | FileCheck %s
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "spir64-unknown-unknown"
-
-; Kernels
-
-define spir_kernel void @instrinsic(float* %in1, float* %in2, float* %in3, float* %out) {
-entry:
-  %call = tail call i64 @__mux_get_global_id(i32 0)
-  %arrayidx = getelementptr inbounds float, float* %in1, i64 %call
-  %0 = load float, float* %arrayidx, align 4
-  %arrayidx1 = getelementptr inbounds float, float* %in2, i64 %call
-  %1 = load float, float* %arrayidx1, align 4
-  %arrayidx2 = getelementptr inbounds float, float* %in3, i64 %call
-  %2 = load float, float* %arrayidx2, align 4
-  %3 = tail call float @llvm.fmuladd.f32(float %0, float %1, float %2)
-  %arrayidx3 = getelementptr inbounds float, float* %out, i64 %call
-  store float %3, float* %arrayidx3, align 4
-  ret void
-}
-
-define spir_kernel void @builtin(i32* %in, i32* %out) {
-entry:
-  %call = tail call i64 @__mux_get_global_id(i32 0)
-  %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
-  %0 = load i32, i32* %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z3absi(i32 %0)
-  %arrayidx2 = getelementptr inbounds i32, i32* %out, i64 %call
-  store i32 %call1, i32* %arrayidx2, align 4
-  ret void
-}
-
-define spir_kernel void @user_defined(i32* %in, i32* %out) {
-entry:
-  %call = tail call i64 @__mux_get_global_id(i32 0)
-  %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
-  %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
-  call spir_func void @defined(i32* %add.ptr, i32* %add.ptr1)
-  ret void
-}
-
-define spir_kernel void @user_undefined(i32* %in, i32* %out) {
-entry:
-  %call = tail call i64 @__mux_get_global_id(i32 0)
-  %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
-  %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
-  call spir_func void @undefined(i32* %add.ptr, i32* %add.ptr1)
-  ret void
-}
-
-define spir_kernel void @cantinline(i32* %in, i32* %out) {
-entry:
-  %call = tail call i64 @__mux_get_global_id(i32 0)
-  %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
-  %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
-  call spir_func void @dontinline(i32* %add.ptr, i32* %add.ptr1)
-  ret void
-}
-
-define spir_kernel void @cantduplicate(i32* %in, i32* %out) {
-entry:
-  %call = tail call i64 @__mux_get_global_id(i32 0)
-  %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
-  %0 = load i32, i32* %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z3clzi(i32 %0) #1
-  %arrayidx2 = getelementptr inbounds i32, i32* %out, i64 %call
-  store i32 %call1, i32* %arrayidx2, align 4
-  ret void
-}
-
-define spir_kernel void @optnone(i32* %in, i32* %out) #2 {
-entry:
-  %call = call i64 @__mux_get_global_id(i32 0)
-  %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
-  %0 = load i32, i32* %arrayidx, align 4
-  %arrayidx1 = getelementptr inbounds i32, i32* %out, i64 %call
-  store i32 %0, i32* %arrayidx1, align 4
-  ret void
-}
-
-; Declaration only functions
-
-declare float @llvm.fmuladd.f32(float, float, float)
-declare spir_func i32 @_Z3absi(i32)
-declare spir_func i32 @_Z3clzi(i32) #1
-declare i64 @__mux_get_global_id(i32)
-declare spir_func void @undefined(i32*, i32*)
-
-; Functions with definitions
-
-define spir_func void @defined(i32* %in, i32* %out) {
-entry:
-  %0 = load i32, i32* %in, align 4
-  store i32 %0, i32* %out, align 4
-  ret void
-}
-
-define spir_func void @dontinline(i32* %in, i32* %out) #0 {
-entry:
-  %0 = load i32, i32* %in, align 4
-  store i32 %0, i32* %out, align 4
-  ret void
-}
-
-; Attributes
-
-attributes #0 = { noinline }
-attributes #1 = { noduplicate }
-attributes #2 = { optnone noinline }
-
-; XFAIL: *
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_cantinline.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_cantinline.ll
deleted file mode 100644
index eb9ffc770c4b9..0000000000000
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_cantinline.ll
+++ /dev/null
@@ -1,128 +0,0 @@
-; Copyright (C) Codeplay Software Limited
-;
-; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
-; Exceptions; you may not use this file except in compliance with the License.
-; You may obtain a copy of the License at
-;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
-;
-; Unless required by applicable law or agreed to in writing, software
-; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-; License for the specific language governing permissions and limitations
-; under the License.
-;
-; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-; RUN: veczc -k cantinline -vecz-passes=packetizer -vecz-simd-width=4 -S < %s | FileCheck %s
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "spir64-unknown-unknown"
-
-; Kernels
-
-define spir_kernel void @instrinsic(float* %in1, float* %in2, float* %in3, float* %out) {
-entry:
-  %call = tail call i64 @__mux_get_global_id(i32 0)
-  %arrayidx = getelementptr inbounds float, float* %in1, i64 %call
-  %0 = load float, float* %arrayidx, align 4
-  %arrayidx1 = getelementptr inbounds float, float* %in2, i64 %call
-  %1 = load float, float* %arrayidx1, align 4
-  %arrayidx2 = getelementptr inbounds float, float* %in3, i64 %call
-  %2 = load float, float* %arrayidx2, align 4
-  %3 = tail call float @llvm.fmuladd.f32(float %0, float %1, float %2)
-  %arrayidx3 = getelementptr inbounds float, float* %out, i64 %call
-  store float %3, float* %arrayidx3, align 4
-  ret void
-}
-
-define spir_kernel void @builtin(i32* %in, i32* %out) {
-entry:
-  %call = tail call i64 @__mux_get_global_id(i32 0)
-  %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
-  %0 = load i32, i32* %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z3absi(i32 %0)
-  %arrayidx2 = getelementptr inbounds i32, i32* %out, i64 %call
-  store i32 %call1, i32* %arrayidx2, align 4
-  ret void
-}
-
-define spir_kernel void @user_defined(i32* %in, i32* %out) {
-entry:
-  %call = tail call i64 @__mux_get_global_id(i32 0)
-  %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
-  %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
-  call spir_func void @defined(i32* %add.ptr, i32* %add.ptr1)
-  ret void
-}
-
-define spir_kernel void @user_undefined(i32* %in, i32* %out) {
-entry:
-  %call = tail call i64 @__mux_get_global_id(i32 0)
-  %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
-  %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
-  call spir_func void @undefined(i32* %add.ptr, i32* %add.ptr1)
-  ret void
-}
-
-define spir_kernel void @cantinline(i32* %in, i32* %out) {
-entry:
-  %call = tail call i64 @__mux_get_global_id(i32 0)
-  %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
-  %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
-  call spir_func void @dontinline(i32* %add.ptr, i32* %add.ptr1)
-  ret void
-}
-
-define spir_kernel void @cantduplicate(i32* %in, i32* %out) {
-entry:
-  %call = tail call i64 @__mux_get_global_id(i32 0)
-  %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
-  %0 = load i32, i32* %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z3clzi(i32 %0) #1
-  %arrayidx2 = getelementptr inbounds i32, i32* %out, i64 %call
-  store i32 %call1, i32* %arrayidx2, align 4
-  ret void
-}
-
-define spir_kernel void @optnone(i32* %in, i32* %out) #2 {
-entry:
-  %call = call i64 @__mux_get_global_id(i32 0)
-  %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
-  %0 = load i32, i32* %arrayidx, align 4
-  %arrayidx1 = getelementptr inbounds i32, i32* %out, i64 %call
-  store i32 %0, i32* %arrayidx1, align 4
-  ret void
-}
-
-; Declaration only functions
-
-declare float @llvm.fmuladd.f32(float, float, float)
-declare spir_func i32 @_Z3absi(i32)
-declare spir_func i32 @_Z3clzi(i32) #1
-declare i64 @__mux_get_global_id(i32)
-declare spir_func void @undefined(i32*, i32*)
-
-; Functions with definitions
-
-define spir_func void @defined(i32* %in, i32* %out) {
-entry:
-  %0 = load i32, i32* %in, align 4
-  store i32 %0, i32* %out, align 4
-  ret void
-}
-
-define spir_func void @dontinline(i32* %in, i32* %out) #0 {
-entry:
-  %0 = load i32, i32* %in, align 4
-  store i32 %0, i32* %out, align 4
-  ret void
-}
-
-; Attributes
-
-attributes #0 = { noinline }
-attributes #1 = { noduplicate }
-attributes #2 = { optnone noinline }
-
-; XFAIL: *
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_optnone.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_optnone.ll
deleted file mode 100644
index c287439fe810c..0000000000000
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_optnone.ll
+++ /dev/null
@@ -1,128 +0,0 @@
-; Copyright (C) Codeplay Software Limited
-;
-; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
-; Exceptions; you may not use this file except in compliance with the License.
-; You may obtain a copy of the License at
-;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
-;
-; Unless required by applicable law or agreed to in writing, software
-; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-; License for the specific language governing permissions and limitations
-; under the License.
-;
-; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-; RUN: veczc -k optnone -vecz-passes=packetizer -vecz-simd-width=4 -S < %s | FileCheck %s
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "spir64-unknown-unknown"
-
-; Kernels
-
-define spir_kernel void @instrinsic(float* %in1, float* %in2, float* %in3, float* %out) {
-entry:
-  %call = tail call i64 @__mux_get_global_id(i32 0)
-  %arrayidx = getelementptr inbounds float, float* %in1, i64 %call
-  %0 = load float, float* %arrayidx, align 4
-  %arrayidx1 = getelementptr inbounds float, float* %in2, i64 %call
-  %1 = load float, float* %arrayidx1, align 4
-  %arrayidx2 = getelementptr inbounds float, float* %in3, i64 %call
-  %2 = load float, float* %arrayidx2, align 4
-  %3 = tail call float @llvm.fmuladd.f32(float %0, float %1, float %2)
-  %arrayidx3 = getelementptr inbounds float, float* %out, i64 %call
-  store float %3, float* %arrayidx3, align 4
-  ret void
-}
-
-define spir_kernel void @builtin(i32* %in, i32* %out) {
-entry:
-  %call = tail call i64 @__mux_get_global_id(i32 0)
-  %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
-  %0 = load i32, i32* %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z3absi(i32 %0)
-  %arrayidx2 = getelementptr inbounds i32, i32* %out, i64 %call
-  store i32 %call1, i32* %arrayidx2, align 4
-  ret void
-}
-
-define spir_kernel void @user_defined(i32* %in, i32* %out) {
-entry:
-  %call = tail call i64 @__mux_get_global_id(i32 0)
-  %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
-  %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
-  call spir_func void @defined(i32* %add.ptr, i32* %add.ptr1)
-  ret void
-}
-
-define spir_kernel void @user_undefined(i32* %in, i32* %out) {
-entry:
-  %call = tail call i64 @__mux_get_global_id(i32 0)
-  %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
-  %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
-  call spir_func void @undefined(i32* %add.ptr, i32* %add.ptr1)
-  ret void
-}
-
-define spir_kernel void @cantinline(i32* %in, i32* %out) {
-entry:
-  %call = tail call i64 @__mux_get_global_id(i32 0)
-  %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
-  %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
-  call spir_func void @dontinline(i32* %add.ptr, i32* %add.ptr1)
-  ret void
-}
-
-define spir_kernel void @cantduplicate(i32* %in, i32* %out) {
-entry:
-  %call = tail call i64 @__mux_get_global_id(i32 0)
-  %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
-  %0 = load i32, i32* %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z3clzi(i32 %0) #1
-  %arrayidx2 = getelementptr inbounds i32, i32* %out, i64 %call
-  store i32 %call1, i32* %arrayidx2, align 4
-  ret void
-}
-
-define spir_kernel void @optnone(i32* %in, i32* %out) #2 {
-entry:
-  %call = call i64 @__mux_get_global_id(i32 0)
-  %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
-  %0 = load i32, i32* %arrayidx, align 4
-  %arrayidx1 = getelementptr inbounds i32, i32* %out, i64 %call
-  store i32 %0, i32* %arrayidx1, align 4
-  ret void
-}
-
-; Declaration only functions
-
-declare float @llvm.fmuladd.f32(float, float, float)
-declare spir_func i32 @_Z3absi(i32)
-declare spir_func i32 @_Z3clzi(i32) #1
-declare i64 @__mux_get_global_id(i32)
-declare spir_func void @undefined(i32*, i32*)
-
-; Functions with definitions
-
-define spir_func void @defined(i32* %in, i32* %out) {
-entry:
-  %0 = load i32, i32* %in, align 4
-  store i32 %0, i32* %out, align 4
-  ret void
-}
-
-define spir_func void @dontinline(i32* %in, i32* %out) #0 {
-entry:
-  %0 = load i32, i32* %in, align 4
-  store i32 %0, i32* %out, align 4
-  ret void
-}
-
-; Attributes
-
-attributes #0 = { noinline }
-attributes #1 = { noduplicate }
-attributes #2 = { optnone noinline }
-
-; XFAIL: *
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_user_undefined.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_user_undefined.ll
deleted file mode 100644
index b87aa662bbe9c..0000000000000
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_failure_user_undefined.ll
+++ /dev/null
@@ -1,128 +0,0 @@
-; Copyright (C) Codeplay Software Limited
-;
-; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
-; Exceptions; you may not use this file except in compliance with the License.
-; You may obtain a copy of the License at
-;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
-;
-; Unless required by applicable law or agreed to in writing, software
-; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-; License for the specific language governing permissions and limitations
-; under the License.
-;
-; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-; RUN: veczc -k user_undefined -vecz-passes=packetizer -vecz-simd-width=4 -S < %s | FileCheck %s
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "spir64-unknown-unknown"
-
-; Kernels
-
-define spir_kernel void @instrinsic(float* %in1, float* %in2, float* %in3, float* %out) {
-entry:
-  %call = tail call i64 @__mux_get_global_id(i32 0)
-  %arrayidx = getelementptr inbounds float, float* %in1, i64 %call
-  %0 = load float, float* %arrayidx, align 4
-  %arrayidx1 = getelementptr inbounds float, float* %in2, i64 %call
-  %1 = load float, float* %arrayidx1, align 4
-  %arrayidx2 = getelementptr inbounds float, float* %in3, i64 %call
-  %2 = load float, float* %arrayidx2, align 4
-  %3 = tail call float @llvm.fmuladd.f32(float %0, float %1, float %2)
-  %arrayidx3 = getelementptr inbounds float, float* %out, i64 %call
-  store float %3, float* %arrayidx3, align 4
-  ret void
-}
-
-define spir_kernel void @builtin(i32* %in, i32* %out) {
-entry:
-  %call = tail call i64 @__mux_get_global_id(i32 0)
-  %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
-  %0 = load i32, i32* %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z3absi(i32 %0)
-  %arrayidx2 = getelementptr inbounds i32, i32* %out, i64 %call
-  store i32 %call1, i32* %arrayidx2, align 4
-  ret void
-}
-
-define spir_kernel void @user_defined(i32* %in, i32* %out) {
-entry:
-  %call = tail call i64 @__mux_get_global_id(i32 0)
-  %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
-  %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
-  call spir_func void @defined(i32* %add.ptr, i32* %add.ptr1)
-  ret void
-}
-
-define spir_kernel void @user_undefined(i32* %in, i32* %out) {
-entry:
-  %call = tail call i64 @__mux_get_global_id(i32 0)
-  %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
-  %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
-  call spir_func void @undefined(i32* %add.ptr, i32* %add.ptr1)
-  ret void
-}
-
-define spir_kernel void @cantinline(i32* %in, i32* %out) {
-entry:
-  %call = tail call i64 @__mux_get_global_id(i32 0)
-  %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
-  %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
-  call spir_func void @dontinline(i32* %add.ptr, i32* %add.ptr1)
-  ret void
-}
-
-define spir_kernel void @cantduplicate(i32* %in, i32* %out) {
-entry:
-  %call = tail call i64 @__mux_get_global_id(i32 0)
-  %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
-  %0 = load i32, i32* %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z3clzi(i32 %0) #1
-  %arrayidx2 = getelementptr inbounds i32, i32* %out, i64 %call
-  store i32 %call1, i32* %arrayidx2, align 4
-  ret void
-}
-
-define spir_kernel void @optnone(i32* %in, i32* %out) #2 {
-entry:
-  %call = call i64 @__mux_get_global_id(i32 0)
-  %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
-  %0 = load i32, i32* %arrayidx, align 4
-  %arrayidx1 = getelementptr inbounds i32, i32* %out, i64 %call
-  store i32 %0, i32* %arrayidx1, align 4
-  ret void
-}
-
-; Declaration only functions
-
-declare float @llvm.fmuladd.f32(float, float, float)
-declare spir_func i32 @_Z3absi(i32)
-declare spir_func i32 @_Z3clzi(i32) #1
-declare i64 @__mux_get_global_id(i32)
-declare spir_func void @undefined(i32*, i32*)
-
-; Functions with definitions
-
-define spir_func void @defined(i32* %in, i32* %out) {
-entry:
-  %0 = load i32, i32* %in, align 4
-  store i32 %0, i32* %out, align 4
-  ret void
-}
-
-define spir_func void @dontinline(i32* %in, i32* %out) #0 {
-entry:
-  %0 = load i32, i32* %in, align 4
-  store i32 %0, i32* %out, align 4
-  ret void
-}
-
-; Attributes
-
-attributes #0 = { noinline }
-attributes #1 = { noduplicate }
-attributes #2 = { optnone noinline }
-
-; XFAIL: *
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_success_builtin.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_success_builtin.ll
deleted file mode 100644
index aee863ac6ff7c..0000000000000
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_success_builtin.ll
+++ /dev/null
@@ -1,129 +0,0 @@
-; Copyright (C) Codeplay Software Limited
-;
-; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
-; Exceptions; you may not use this file except in compliance with the License.
-; You may obtain a copy of the License at
-;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
-;
-; Unless required by applicable law or agreed to in writing, software
-; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-; License for the specific language governing permissions and limitations
-; under the License.
-;
-; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-; RUN: veczc -k builtin -vecz-passes=packetizer -vecz-simd-width=4 -S < %s | FileCheck %s
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "spir64-unknown-unknown"
-
-; Kernels
-
-define spir_kernel void @instrinsic(float* %in1, float* %in2, float* %in3, float* %out) {
-entry:
-  %call = tail call i64 @__mux_get_global_id(i32 0)
-  %arrayidx = getelementptr inbounds float, float* %in1, i64 %call
-  %0 = load float, float* %arrayidx, align 4
-  %arrayidx1 = getelementptr inbounds float, float* %in2, i64 %call
-  %1 = load float, float* %arrayidx1, align 4
-  %arrayidx2 = getelementptr inbounds float, float* %in3, i64 %call
-  %2 = load float, float* %arrayidx2, align 4
-  %3 = tail call float @llvm.fmuladd.f32(float %0, float %1, float %2)
-  %arrayidx3 = getelementptr inbounds float, float* %out, i64 %call
-  store float %3, float* %arrayidx3, align 4
-  ret void
-}
-
-define spir_kernel void @builtin(i32* %in, i32* %out) {
-entry:
-  %call = tail call i64 @__mux_get_global_id(i32 0)
-  %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
-  %0 = load i32, i32* %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z3absi(i32 %0)
-  %arrayidx2 = getelementptr inbounds i32, i32* %out, i64 %call
-  store i32 %call1, i32* %arrayidx2, align 4
-  ret void
-}
-
-define spir_kernel void @user_defined(i32* %in, i32* %out) {
-entry:
-  %call = tail call i64 @__mux_get_global_id(i32 0)
-  %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
-  %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
-  call spir_func void @defined(i32* %add.ptr, i32* %add.ptr1)
-  ret void
-}
-
-define spir_kernel void @user_undefined(i32* %in, i32* %out) {
-entry:
-  %call = tail call i64 @__mux_get_global_id(i32 0)
-  %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
-  %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
-  call spir_func void @undefined(i32* %add.ptr, i32* %add.ptr1)
-  ret void
-}
-
-define spir_kernel void @cantinline(i32* %in, i32* %out) {
-entry:
-  %call = tail call i64 @__mux_get_global_id(i32 0)
-  %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
-  %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
-  call spir_func void @dontinline(i32* %add.ptr, i32* %add.ptr1)
-  ret void
-}
-
-define spir_kernel void @cantduplicate(i32* %in, i32* %out) {
-entry:
-  %call = tail call i64 @__mux_get_global_id(i32 0)
-  %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
-  %0 = load i32, i32* %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z3clzi(i32 %0) #1
-  %arrayidx2 = getelementptr inbounds i32, i32* %out, i64 %call
-  store i32 %call1, i32* %arrayidx2, align 4
-  ret void
-}
-
-define spir_kernel void @optnone(i32* %in, i32* %out) #2 {
-entry:
-  %call = call i64 @__mux_get_global_id(i32 0)
-  %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
-  %0 = load i32, i32* %arrayidx, align 4
-  %arrayidx1 = getelementptr inbounds i32, i32* %out, i64 %call
-  store i32 %0, i32* %arrayidx1, align 4
-  ret void
-}
-
-; Declaration only functions
-
-declare float @llvm.fmuladd.f32(float, float, float)
-declare spir_func i32 @_Z3absi(i32)
-declare spir_func i32 @_Z3clzi(i32) #1
-declare i64 @__mux_get_global_id(i32)
-declare spir_func void @undefined(i32*, i32*)
-
-; Functions with definitions
-
-define spir_func void @defined(i32* %in, i32* %out) {
-entry:
-  %0 = load i32, i32* %in, align 4
-  store i32 %0, i32* %out, align 4
-  ret void
-}
-
-define spir_func void @dontinline(i32* %in, i32* %out) #0 {
-entry:
-  %0 = load i32, i32* %in, align 4
-  store i32 %0, i32* %out, align 4
-  ret void
-}
-
-; Attributes
-
-attributes #0 = { noinline }
-attributes #1 = { noduplicate }
-attributes #2 = { optnone noinline }
-
-; We should be able to handle builtins
-; CHECK: define spir_kernel void @__vecz_v4_builtin
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_success_instrinsic.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_success_instrinsic.ll
deleted file mode 100644
index e8ef695bafbea..0000000000000
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_success_instrinsic.ll
+++ /dev/null
@@ -1,129 +0,0 @@
-; Copyright (C) Codeplay Software Limited
-;
-; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
-; Exceptions; you may not use this file except in compliance with the License.
-; You may obtain a copy of the License at
-;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
-;
-; Unless required by applicable law or agreed to in writing, software
-; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-; License for the specific language governing permissions and limitations
-; under the License.
-;
-; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-; RUN: veczc -k instrinsic -vecz-passes=packetizer -vecz-simd-width=4 -S < %s | FileCheck %s
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "spir64-unknown-unknown"
-
-; Kernels
-
-define spir_kernel void @instrinsic(float* %in1, float* %in2, float* %in3, float* %out) {
-entry:
-  %call = tail call i64 @__mux_get_global_id(i32 0)
-  %arrayidx = getelementptr inbounds float, float* %in1, i64 %call
-  %0 = load float, float* %arrayidx, align 4
-  %arrayidx1 = getelementptr inbounds float, float* %in2, i64 %call
-  %1 = load float, float* %arrayidx1, align 4
-  %arrayidx2 = getelementptr inbounds float, float* %in3, i64 %call
-  %2 = load float, float* %arrayidx2, align 4
-  %3 = tail call float @llvm.fmuladd.f32(float %0, float %1, float %2)
-  %arrayidx3 = getelementptr inbounds float, float* %out, i64 %call
-  store float %3, float* %arrayidx3, align 4
-  ret void
-}
-
-define spir_kernel void @builtin(i32* %in, i32* %out) {
-entry:
-  %call = tail call i64 @__mux_get_global_id(i32 0)
-  %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
-  %0 = load i32, i32* %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z3absi(i32 %0)
-  %arrayidx2 = getelementptr inbounds i32, i32* %out, i64 %call
-  store i32 %call1, i32* %arrayidx2, align 4
-  ret void
-}
-
-define spir_kernel void @user_defined(i32* %in, i32* %out) {
-entry:
-  %call = tail call i64 @__mux_get_global_id(i32 0)
-  %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
-  %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
-  call spir_func void @defined(i32* %add.ptr, i32* %add.ptr1)
-  ret void
-}
-
-define spir_kernel void @user_undefined(i32* %in, i32* %out) {
-entry:
-  %call = tail call i64 @__mux_get_global_id(i32 0)
-  %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
-  %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
-  call spir_func void @undefined(i32* %add.ptr, i32* %add.ptr1)
-  ret void
-}
-
-define spir_kernel void @cantinline(i32* %in, i32* %out) {
-entry:
-  %call = tail call i64 @__mux_get_global_id(i32 0)
-  %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
-  %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
-  call spir_func void @dontinline(i32* %add.ptr, i32* %add.ptr1)
-  ret void
-}
-
-define spir_kernel void @cantduplicate(i32* %in, i32* %out) {
-entry:
-  %call = tail call i64 @__mux_get_global_id(i32 0)
-  %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
-  %0 = load i32, i32* %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z3clzi(i32 %0) #1
-  %arrayidx2 = getelementptr inbounds i32, i32* %out, i64 %call
-  store i32 %call1, i32* %arrayidx2, align 4
-  ret void
-}
-
-define spir_kernel void @optnone(i32* %in, i32* %out) #2 {
-entry:
-  %call = call i64 @__mux_get_global_id(i32 0)
-  %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
-  %0 = load i32, i32* %arrayidx, align 4
-  %arrayidx1 = getelementptr inbounds i32, i32* %out, i64 %call
-  store i32 %0, i32* %arrayidx1, align 4
-  ret void
-}
-
-; Declaration only functions
-
-declare float @llvm.fmuladd.f32(float, float, float)
-declare spir_func i32 @_Z3absi(i32)
-declare spir_func i32 @_Z3clzi(i32) #1
-declare i64 @__mux_get_global_id(i32)
-declare spir_func void @undefined(i32*, i32*)
-
-; Functions with definitions
-
-define spir_func void @defined(i32* %in, i32* %out) {
-entry:
-  %0 = load i32, i32* %in, align 4
-  store i32 %0, i32* %out, align 4
-  ret void
-}
-
-define spir_func void @dontinline(i32* %in, i32* %out) #0 {
-entry:
-  %0 = load i32, i32* %in, align 4
-  store i32 %0, i32* %out, align 4
-  ret void
-}
-
-; Attributes
-
-attributes #0 = { noinline }
-attributes #1 = { noduplicate }
-attributes #2 = { optnone noinline }
-
-; We should be able to handle intrinsics
-; CHECK: define spir_kernel void @__vecz_v4_instrinsic
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_success_user_defined.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_success_user_defined.ll
deleted file mode 100644
index f9169c7420165..0000000000000
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation_success_user_defined.ll
+++ /dev/null
@@ -1,129 +0,0 @@
-; Copyright (C) Codeplay Software Limited
-;
-; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
-; Exceptions; you may not use this file except in compliance with the License.
-; You may obtain a copy of the License at
-;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
-;
-; Unless required by applicable law or agreed to in writing, software
-; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-; License for the specific language governing permissions and limitations
-; under the License.
-;
-; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-; RUN: veczc -k user_defined -vecz-passes=packetizer -vecz-simd-width=4 -S < %s | FileCheck %s
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "spir64-unknown-unknown"
-
-; Kernels
-
-define spir_kernel void @instrinsic(float* %in1, float* %in2, float* %in3, float* %out) {
-entry:
-  %call = tail call i64 @__mux_get_global_id(i32 0)
-  %arrayidx = getelementptr inbounds float, float* %in1, i64 %call
-  %0 = load float, float* %arrayidx, align 4
-  %arrayidx1 = getelementptr inbounds float, float* %in2, i64 %call
-  %1 = load float, float* %arrayidx1, align 4
-  %arrayidx2 = getelementptr inbounds float, float* %in3, i64 %call
-  %2 = load float, float* %arrayidx2, align 4
-  %3 = tail call float @llvm.fmuladd.f32(float %0, float %1, float %2)
-  %arrayidx3 = getelementptr inbounds float, float* %out, i64 %call
-  store float %3, float* %arrayidx3, align 4
-  ret void
-}
-
-define spir_kernel void @builtin(i32* %in, i32* %out) {
-entry:
-  %call = tail call i64 @__mux_get_global_id(i32 0)
-  %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
-  %0 = load i32, i32* %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z3absi(i32 %0)
-  %arrayidx2 = getelementptr inbounds i32, i32* %out, i64 %call
-  store i32 %call1, i32* %arrayidx2, align 4
-  ret void
-}
-
-define spir_kernel void @user_defined(i32* %in, i32* %out) {
-entry:
-  %call = tail call i64 @__mux_get_global_id(i32 0)
-  %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
-  %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
-  call spir_func void @defined(i32* %add.ptr, i32* %add.ptr1)
-  ret void
-}
-
-define spir_kernel void @user_undefined(i32* %in, i32* %out) {
-entry:
-  %call = tail call i64 @__mux_get_global_id(i32 0)
-  %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
-  %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
-  call spir_func void @undefined(i32* %add.ptr, i32* %add.ptr1)
-  ret void
-}
-
-define spir_kernel void @cantinline(i32* %in, i32* %out) {
-entry:
-  %call = tail call i64 @__mux_get_global_id(i32 0)
-  %add.ptr = getelementptr inbounds i32, i32* %in, i64 %call
-  %add.ptr1 = getelementptr inbounds i32, i32* %out, i64 %call
-  call spir_func void @dontinline(i32* %add.ptr, i32* %add.ptr1)
-  ret void
-}
-
-define spir_kernel void @cantduplicate(i32* %in, i32* %out) {
-entry:
-  %call = tail call i64 @__mux_get_global_id(i32 0)
-  %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
-  %0 = load i32, i32* %arrayidx, align 4
-  %call1 = tail call spir_func i32 @_Z3clzi(i32 %0) #1
-  %arrayidx2 = getelementptr inbounds i32, i32* %out, i64 %call
-  store i32 %call1, i32* %arrayidx2, align 4
-  ret void
-}
-
-define spir_kernel void @optnone(i32* %in, i32* %out) #2 {
-entry:
-  %call = call i64 @__mux_get_global_id(i32 0)
-  %arrayidx = getelementptr inbounds i32, i32* %in, i64 %call
-  %0 = load i32, i32* %arrayidx, align 4
-  %arrayidx1 = getelementptr inbounds i32, i32* %out, i64 %call
-  store i32 %0, i32* %arrayidx1, align 4
-  ret void
-}
-
-; Declaration only functions
-
-declare float @llvm.fmuladd.f32(float, float, float)
-declare spir_func i32 @_Z3absi(i32)
-declare spir_func i32 @_Z3clzi(i32) #1
-declare i64 @__mux_get_global_id(i32)
-declare spir_func void @undefined(i32*, i32*)
-
-; Functions with definitions
-
-define spir_func void @defined(i32* %in, i32* %out) {
-entry:
-  %0 = load i32, i32* %in, align 4
-  store i32 %0, i32* %out, align 4
-  ret void
-}
-
-define spir_func void @dontinline(i32* %in, i32* %out) #0 {
-entry:
-  %0 = load i32, i32* %in, align 4
-  store i32 %0, i32* %out, align 4
-  ret void
-}
-
-; Attributes
-
-attributes #0 = { noinline }
-attributes #1 = { noduplicate }
-attributes #2 = { optnone noinline }
-
-; We should be able to handle user functions for which we have a definition
-; CHECK: define spir_kernel void @__vecz_v4_user_defined

From c46bae64b0525c8c2bd8d4f9d265c49837b8b22a Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Tue, 2 Jan 2024 18:37:35 +0000
Subject: [PATCH 079/182] [vecz] Add nounwind/norecurse attributes to internal
 vecz builtins

Though in practice the functions that call these builtins almost always
have these attributes set already, this change should help codegen in
the rare case that the builtins aren't inlined.
---
 .../compiler_passes/vecz/source/vectorization_context.cpp | 7 +++++++
 .../llvm/OpaquePointers/control_flow_conversion_ptrs.ll   | 4 +++-
 .../vecz/test/lit/llvm/OpaquePointers/masked_store.ll     | 6 ++++--
 .../ScalableVectors/define_interleaved_store_as_masked.ll | 4 +++-
 .../test/lit/llvm/ScalableVectors/interleaved_load.ll     | 7 ++++---
 .../VectorPredication/define_interleaved_load_store.ll    | 6 ++++--
 .../llvm/VectorPredication/define_masked_load_store.ll    | 6 ++++--
 .../test/lit/llvm/VectorPredication/scatter_gather.ll     | 4 ++--
 .../vecz/test/lit/llvm/control_flow_conversion_ptrs.ll    | 4 +++-
 .../vecz/test/lit/llvm/define_gather_load.ll              | 4 +++-
 .../vecz/test/lit/llvm/define_gather_load_as_masked.ll    | 4 +++-
 .../vecz/test/lit/llvm/define_scatter_store.ll            | 4 +++-
 .../vecz/test/lit/llvm/define_scatter_store_as_masked.ll  | 4 +++-
 .../compiler_passes/vecz/test/lit/llvm/masked_atomics.ll  | 8 +++++---
 .../compiler_passes/vecz/test/lit/llvm/masked_cmpxchg.ll  | 6 ++++--
 .../vecz/test/lit/llvm/masked_interleaved.ll              | 4 +++-
 .../vecz/test/lit/llvm/masked_interleaved_as_scatter.ll   | 4 +++-
 17 files changed, 61 insertions(+), 25 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
index 15b1420e64a18..d31b6cb3aa921 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
@@ -207,6 +207,13 @@ Function *VectorizationContext::getOrCreateInternalBuiltin(StringRef Name,
   if (!F && FT) {
     F = dyn_cast_or_null<Function>(
         Module.getOrInsertFunction(Name, FT).getCallee());
+    if (F) {
+      // Set some default attributes on the function.
+      // We never use exceptions
+      F->addFnAttr(Attribute::NoUnwind);
+      // Recursion is not supported in ComputeMux
+      F->addFnAttr(Attribute::NoRecurse);
+    }
   }
 
   return F;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/control_flow_conversion_ptrs.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/control_flow_conversion_ptrs.ll
index 689cf30575889..853fb9229ce48 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/control_flow_conversion_ptrs.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/control_flow_conversion_ptrs.ll
@@ -43,10 +43,12 @@ if.end:
   ret void
 }
 
-; CHECK:     define void @__vecz_b_masked_store4_u3ptru3ptrb(ptr [[A:%.*]], ptr [[B:%.*]], i1 [[MASK:%.*]]) {
+; CHECK:     define void @__vecz_b_masked_store4_u3ptru3ptrb(ptr [[A:%.*]], ptr [[B:%.*]], i1 [[MASK:%.*]]) [[ATTRS:#[0-9]+]] {
 ; CHECK:       br i1 [[MASK]], label %[[IF:.*]], label %[[EXIT:.*]]
 ; CHECK:     [[IF]]:
 ; CHECK-NEXT:  store ptr [[A]], ptr [[B]], align 4
 ; CHECK-NEXT:  br label %[[EXIT]]
 ; CHECK:     [[EXIT]]:
 ; CHECK-NEXT:  ret void
+
+; CHECK: attributes [[ATTRS]] = { norecurse nounwind }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/masked_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/masked_store.ll
index 43e027f0bf8b8..cd1652f0d9910 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/masked_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/masked_store.ll
@@ -65,7 +65,7 @@ if.end:
   ret void
 }
 
-; CHECK:     define void @__vecz_b_masked_store4_fu3ptrb(float [[A:%.*]], ptr [[B:%.*]], i1 [[MASK:%.*]]) {
+; CHECK:     define void @__vecz_b_masked_store4_fu3ptrb(float [[A:%.*]], ptr [[B:%.*]], i1 [[MASK:%.*]]) [[ATTRS:#[0-9]+]] {
 ; CHECK:       br i1 [[MASK]], label %[[IF:.*]], label %[[EXIT:.*]]
 ; CHECK:     [[IF]]:
 ; CHECK-NEXT:  store float [[A]], ptr [[B]], align 4
@@ -73,10 +73,12 @@ if.end:
 ; CHECK:     [[EXIT]]:
 ; CHECK-NEXT:  ret void
 
-; CHECK:     define void @__vecz_b_masked_store4_fu3ptrU3AS3b(float [[A:%.*]], ptr addrspace(3) [[B:%.*]], i1 [[MASK:%.*]]) {
+; CHECK:     define void @__vecz_b_masked_store4_fu3ptrU3AS3b(float [[A:%.*]], ptr addrspace(3) [[B:%.*]], i1 [[MASK:%.*]]) [[ATTRS]] {
 ; CHECK:       br i1 [[MASK]], label %[[IF:.*]], label %[[EXIT:.*]]
 ; CHECK:     [[IF]]:
 ; CHECK-NEXT:  store float [[A]], ptr addrspace(3) [[B]], align 4
 ; CHECK-NEXT:  br label %[[EXIT]]
 ; CHECK:     [[EXIT]]:
 ; CHECK-NEXT:  ret void
+
+; CHECK: attributes [[ATTRS]] = { norecurse nounwind }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store_as_masked.ll
index 8954814274853..b1199cf8423d0 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store_as_masked.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store_as_masked.ll
@@ -53,7 +53,7 @@ declare void @__mux_work_group_barrier(i32, i32, i32)
 declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>)
 
 ; Test if the interleaved store is defined correctly
-; CHECK: define void @__vecz_b_interleaved_store8_4_u5nxv4du3ptrU3AS1(<vscale x 4 x double> %0, ptr addrspace(1) %1) {
+; CHECK: define void @__vecz_b_interleaved_store8_4_u5nxv4du3ptrU3AS1(<vscale x 4 x double> %0, ptr addrspace(1) %1) [[ATTRS:#[0-9]+]] {
 ; CHECK: entry:
 ; CHECK:   %BroadcastAddr.splatinsert = insertelement <vscale x 4 x ptr addrspace(1)> poison, ptr addrspace(1) %1, {{i32|i64}} 0
 ; CHECK:   %BroadcastAddr.splat = shufflevector <vscale x 4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <vscale x 4 x ptr addrspace(1)> poison, <vscale x 4 x i32> zeroinitializer
@@ -63,3 +63,5 @@ declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double
 ; CHECK:   call void @llvm.masked.scatter.nxv4f64.nxv4p1(<vscale x 4 x double> %0, <vscale x 4 x ptr addrspace(1)> %4, i32 immarg 8, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, {{i32|i64}} 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
 ; CHECK:   ret void
 ; CHECK: }
+
+; CHECK: attributes [[ATTRS]] = { norecurse nounwind }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/interleaved_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/interleaved_load.ll
index 5b71067b8d4a0..1bff13b48c65a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/interleaved_load.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/interleaved_load.ll
@@ -44,7 +44,7 @@ entry:
 
 declare i64 @__mux_get_global_id(i32)
 
-; CHECK: define void @__vecz_b_interleaved_store4_V_u5nxv4ju3ptrU3AS1(<vscale x 4 x i32> [[ARG0:%.*]], ptr addrspace(1) [[ARG1:%.*]], i64 [[ARG2:%.*]]) {
+; CHECK: define void @__vecz_b_interleaved_store4_V_u5nxv4ju3ptrU3AS1(<vscale x 4 x i32> [[ARG0:%.*]], ptr addrspace(1) [[ARG1:%.*]], i64 [[ARG2:%.*]]) [[ATTRS:#[0-9]+]] {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <vscale x 4 x ptr addrspace(1)> poison, ptr addrspace(1) [[ARG1]], {{i32|i64}} 0
 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <vscale x 4 x ptr addrspace(1)> [[TMP0]], <vscale x 4 x ptr addrspace(1)> poison, <vscale x 4 x i32> zeroinitializer
@@ -53,8 +53,9 @@ declare i64 @__mux_get_global_id(i32)
 ; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
 ; CHECK-NEXT: [[TMP5:%.*]] = mul <vscale x 4 x i64> [[TMP3]], [[TMP4]]
 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, <vscale x 4 x ptr addrspace(1)> [[TMP1]], <vscale x 4 x i64> [[TMP5]]
-; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p1(<vscale x 4 x i32> [[ARG0]], <vscale x 4 x ptr addrspace(1)> [[TMP6]], i32 immarg 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, {{i32|i64}} 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)) #[[ATTRS:[0-9]+]]
+; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p1(<vscale x 4 x i32> [[ARG0]], <vscale x 4 x ptr addrspace(1)> [[TMP6]], i32 immarg 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, {{i32|i64}} 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)) [[MASKED_ATTRS:#[0-9]+]]
 ; CHECK-NEXT: ret void
 ; CHECK-NEXT: }
 
-; CHECK: attributes #[[ATTRS]] = {
+; CHECK-DAG: attributes [[ATTRS]] = { norecurse nounwind }
+; CHECK-DAG: attributes [[MASKED_ATTRS]] = {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_interleaved_load_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_interleaved_load_store.ll
index fd80c369026c7..2d01057a6170e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_interleaved_load_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_interleaved_load_store.ll
@@ -54,7 +54,7 @@ declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double
 
 ; Test if the interleaved load is defined correctly
 ; Vector-predicated interleaved loads are always masked
-; CHECK: define <vscale x 4 x double> @__vecz_b_masked_interleaved_load8_vp_4_u5nxv4du3ptrU3AS1u5nxv4bj(ptr addrspace(1){{( %0)?}}, <vscale x 4 x i1>{{( %1)?}}, i32{{( %2)?}}) {
+; CHECK: define <vscale x 4 x double> @__vecz_b_masked_interleaved_load8_vp_4_u5nxv4du3ptrU3AS1u5nxv4bj(ptr addrspace(1){{( %0)?}}, <vscale x 4 x i1>{{( %1)?}}, i32{{( %2)?}}) [[ATTRS:#[0-9]+]] {
 ; CHECK: entry:
 ; CHECK:   %BroadcastAddr.splatinsert = insertelement <vscale x 4 x ptr addrspace(1)> poison, ptr addrspace(1) %0, {{i32|i64}} 0
 ; CHECK:   %BroadcastAddr.splat = shufflevector <vscale x 4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <vscale x 4 x ptr addrspace(1)> poison, <vscale x 4 x i32> zeroinitializer
@@ -68,7 +68,7 @@ declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double
 
 ; Test if the interleaved store is defined correctly
 ; Vector-predicated interleaved stores are always masked
-; CHECK: define void @__vecz_b_masked_interleaved_store8_vp_4_u5nxv4du3ptrU3AS1u5nxv4bj(<vscale x 4 x double>{{( %0)?}}, ptr addrspace(1){{( %1)?}}, <vscale x 4 x i1>{{( %2)?}}, i32{{( %3)?}})
+; CHECK: define void @__vecz_b_masked_interleaved_store8_vp_4_u5nxv4du3ptrU3AS1u5nxv4bj(<vscale x 4 x double>{{( %0)?}}, ptr addrspace(1){{( %1)?}}, <vscale x 4 x i1>{{( %2)?}}, i32{{( %3)?}}) [[ATTRS]]
 ; CHECK: entry:
 ; CHECK:  %BroadcastAddr.splatinsert = insertelement <vscale x 4 x ptr addrspace(1)> poison, ptr addrspace(1) %1, {{i32|i64}} 0
 ; CHECK:  %BroadcastAddr.splat = shufflevector <vscale x 4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <vscale x 4 x ptr addrspace(1)> poison, <vscale x 4 x i32> zeroinitializer
@@ -78,3 +78,5 @@ declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double
 ; CHECK:  call void @llvm.vp.scatter.nxv4f64.nxv4p1(<vscale x 4 x double> %0, <vscale x 4 x ptr addrspace(1)> %6, <vscale x 4 x i1> %2, i32 %3)
 ; CHECK:  ret void
 ; CHECK: }
+
+; CHECK: attributes [[ATTRS]] = { norecurse nounwind }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_load_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_load_store.ll
index 210a95872cdd5..549f8cb8e79f9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_load_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_load_store.ll
@@ -64,13 +64,15 @@ declare i64 @__mux_get_local_size(i32)
 declare i64 @__mux_get_group_id(i32)
 
 ; Test if the masked store is defined correctly
-; CHECK: define void @__vecz_b_masked_store4_vp_Dv4_ju3ptrU3AS1Dv4_bj(<4 x i32>{{( %0)?}}, ptr addrspace(1){{( %1)?}}, <4 x i1>{{( %2)?}}, i32{{( %3)?}}) {
+; CHECK: define void @__vecz_b_masked_store4_vp_Dv4_ju3ptrU3AS1Dv4_bj(<4 x i32>{{( %0)?}}, ptr addrspace(1){{( %1)?}}, <4 x i1>{{( %2)?}}, i32{{( %3)?}}) [[ATTRS:#[0-9]+]] {
 ; CHECK: entry:
 ; CHECK: call void @llvm.vp.store.v4i32.p1(<4 x i32> %0, ptr addrspace(1) %1, <4 x i1> %2, i32 %3)
 ; CHECK: ret void
 
 ; Test if the masked load is defined correctly
-; CHECK: define <4 x i32> @__vecz_b_masked_load4_vp_Dv4_ju3ptrU3AS2Dv4_bj(ptr addrspace(2){{( %0)?}}, <4 x i1>{{( %1)?}}, i32{{( %2)?}})
+; CHECK: define <4 x i32> @__vecz_b_masked_load4_vp_Dv4_ju3ptrU3AS2Dv4_bj(ptr addrspace(2){{( %0)?}}, <4 x i1>{{( %1)?}}, i32{{( %2)?}}) [[ATTRS]] {
 ; CHECK: entry:
 ; CHECK: %3 = call <4 x i32> @llvm.vp.load.v4i32.p2(ptr addrspace(2) %0, <4 x i1> %1, i32 %2)
 ; CHECK: ret <4 x i32> %3
+
+; CHECK: attributes [[ATTRS]] = { norecurse nounwind }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/scatter_gather.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/scatter_gather.ll
index 1c4fccb05352e..15d66ea84ae28 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/scatter_gather.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/scatter_gather.ll
@@ -55,11 +55,11 @@ entry:
 ; CHECK: [[v:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p1(
 ; CHECK: call void @__vecz_b_masked_scatter_store4_vp_u5nxv4ju14nxv4u3ptrU3AS1u5nxv4bj(<vscale x 4 x i32> [[v]],
 
-; CHECK: define <vscale x 4 x i32> @__vecz_b_masked_gather_load4_vp_u5nxv4ju14nxv4u3ptrU3AS1u5nxv4bj(<vscale x 4 x ptr addrspace(1)> %0, <vscale x 4 x i1> %1, i32 %2) {
+; CHECK: define <vscale x 4 x i32> @__vecz_b_masked_gather_load4_vp_u5nxv4ju14nxv4u3ptrU3AS1u5nxv4bj(<vscale x 4 x ptr addrspace(1)> %0, <vscale x 4 x i1> %1, i32 %2) [[ATTRS:#[0-9]+]] {
 ; CHECK:   %3 = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p1(<vscale x 4 x ptr addrspace(1)> %0, <vscale x 4 x i1> %1, i32 %2)
 ; CHECK:   ret <vscale x 4 x i32> %3
 
-; CHECK: define void @__vecz_b_masked_scatter_store4_vp_u5nxv4ju14nxv4u3ptrU3AS1u5nxv4bj(<vscale x 4 x i32> %0, <vscale x 4 x ptr addrspace(1)> %1, <vscale x 4 x i1> %2, i32 %3) {
+; CHECK: define void @__vecz_b_masked_scatter_store4_vp_u5nxv4ju14nxv4u3ptrU3AS1u5nxv4bj(<vscale x 4 x i32> %0, <vscale x 4 x ptr addrspace(1)> %1, <vscale x 4 x i1> %2, i32 %3) [[ATTRS]] {
 ; CHECK: entry:
 ; CHECK:   call void @llvm.vp.scatter.nxv4i32.nxv4p1(<vscale x 4 x i32> %0, <vscale x 4 x ptr addrspace(1)> %1, <vscale x 4 x i1> %2, i32 %3)
 ; CHECK:   ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_ptrs.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_ptrs.ll
index 5232baa40e5eb..457568e631a90 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_ptrs.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_ptrs.ll
@@ -41,7 +41,7 @@ if.else:
 
 if.end:
   ret void
-; CHECK:     define void @__vecz_b_masked_store4_u3ptru3ptrb(ptr [[A:%.*]], ptr [[B:%.*]], i1 [[MASK:%.*]]) {
+; CHECK:     define void @__vecz_b_masked_store4_u3ptru3ptrb(ptr [[A:%.*]], ptr [[B:%.*]], i1 [[MASK:%.*]]) [[ATTRS:#[0-9]+]] {
 ; CHECK:       br i1 [[MASK]], label %[[IF:.*]], label %[[EXIT:.*]]
 ; CHECK:     [[IF]]:
 ; CHECK-NEXT:  store ptr [[A]], ptr [[B]], align 4
@@ -71,3 +71,5 @@ if.else:
 if.end:
   ret void
 }
+
+; CHECK: attributes [[ATTRS]] = { norecurse nounwind }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load.ll
index 45a177ad18b3f..eff4f12e6bfa7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load.ll
@@ -38,6 +38,8 @@ entry:
 declare i64 @__mux_get_global_id(i32)
 
 ; Test if the scatter store is defined correctly
-; CHECK: define <4 x i64> @__vecz_b_gather_load4_Dv4_mDv4_u3ptr(<4 x ptr>{{( %0)?}}) {
+; CHECK: define <4 x i64> @__vecz_b_gather_load4_Dv4_mDv4_u3ptr(<4 x ptr>{{( %0)?}}) [[ATTRS:#[0-9]+]] {
 ; CHECK: %[[V1:[0-9]+]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> %0, i32{{( immarg)?}} 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>,
 ; CHECK: ret <4 x i64> %[[V1]]
+
+; CHECK: attributes [[ATTRS]] = { norecurse nounwind }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load_as_masked.ll
index b287804080553..c25c29af33ede 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load_as_masked.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load_as_masked.ll
@@ -38,6 +38,8 @@ entry:
 declare i64 @__mux_get_global_id(i32)
 
 ; Test if the scatter store is defined correctly
-; CHECK: define <4 x i64> @__vecz_b_gather_load4_Dv4_mDv4_u3ptr(<4 x ptr>{{( %0)?}}) {
+; CHECK: define <4 x i64> @__vecz_b_gather_load4_Dv4_mDv4_u3ptr(<4 x ptr>{{( %0)?}}) [[ATTRS:#[0-9]+]] {
 ; CHECK: call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> %0, i32{{( immarg)?}} 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i64> undef)
 ; CHECK: ret <4 x i64>
+
+; CHECK: attributes [[ATTRS]] = { norecurse nounwind }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store.ll
index e41f41d52715d..a035ca05d3dc8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store.ll
@@ -38,7 +38,9 @@ entry:
 declare i64 @__mux_get_global_id(i32)
 
 ; Test if the scatter store is defined correctly
-; CHECK: define void @__vecz_b_scatter_store4_Dv4_mDv4_u3ptr(<4 x i64>{{( %0)?}}, <4 x ptr>{{( %1)?}}) {
+; CHECK: define void @__vecz_b_scatter_store4_Dv4_mDv4_u3ptr(<4 x i64>{{( %0)?}}, <4 x ptr>{{( %1)?}}) [[ATTRS:#[0-9]+]] {
 ; CHECK: entry
 ; CHECK: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> %0, <4 x ptr> %1, i32{{( immarg)?}} 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
 ; CHECK: ret void
+
+; CHECK: attributes [[ATTRS]] = { norecurse nounwind }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store_as_masked.ll
index 768599fcb72d1..fd7a7570b2527 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store_as_masked.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store_as_masked.ll
@@ -38,7 +38,9 @@ entry:
 declare i64 @__mux_get_global_id(i32)
 
 ; Test if the scatter store is defined correctly
-; CHECK: define void @__vecz_b_scatter_store4_Dv4_mDv4_u3ptr(<4 x i64>{{( %0)?}}, <4 x ptr>{{( %1)?}}) {
+; CHECK: define void @__vecz_b_scatter_store4_Dv4_mDv4_u3ptr(<4 x i64>{{( %0)?}}, <4 x ptr>{{( %1)?}}) [[ATTRS:#[0-9]+]] {
 ; CHECK: entry:
 ; CHECK: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> %0, <4 x ptr> %1, i32{{( immarg)?}} 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
 ; CHECK: ret void
+
+; CHECK: attributes [[ATTRS]] = { norecurse nounwind }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_atomics.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_atomics.ll
index 2f11e37c275c2..7413f6ca6b345 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_atomics.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_atomics.ll
@@ -50,7 +50,7 @@ if.end:                                           ; preds = %if.then, %entry
   ret void
 }
 
-; CHECK: define <4 x i32> @__vecz_b_v4_masked_atomicrmw_add_align4_acquire_1_Dv4_u3ptrDv4_jDv4_b(<4 x ptr> [[PTRS:%0]], <4 x i32> [[VALS:%1]], <4 x i1> [[MASK:%2]]) {
+; CHECK: define <4 x i32> @__vecz_b_v4_masked_atomicrmw_add_align4_acquire_1_Dv4_u3ptrDv4_jDv4_b(<4 x ptr> [[PTRS:%0]], <4 x i32> [[VALS:%1]], <4 x i1> [[MASK:%2]]) [[ATTRS:#[0-9]+]] {
 ; CHECK: entry:
 ; CHECK: br label %loopIR
 
@@ -77,11 +77,13 @@ if.end:                                           ; preds = %if.then, %entry
 
 ; Assume that all masked atomicrmw operations follow the logic above. Just
 ; check that the right atomicrmw instruction is being generated.
-; CHECK: define <4 x i32> @__vecz_b_v4_masked_atomicrmw_umin_align2_monotonic_1_Dv4_u3ptrDv4_jDv4_b(<4 x ptr> [[PTRS:%0]], <4 x i32> [[VALS:%1]], <4 x i1> [[MASK:%2]]) {
+; CHECK: define <4 x i32> @__vecz_b_v4_masked_atomicrmw_umin_align2_monotonic_1_Dv4_u3ptrDv4_jDv4_b(<4 x ptr> [[PTRS:%0]], <4 x i32> [[VALS:%1]], <4 x i1> [[MASK:%2]]) [[ATTRS]] {
 ; CHECK: atomicrmw umin ptr {{%.*}}, i32 {{%.*}} monotonic, align 2
 
 
-; CHECK: define <4 x float> @__vecz_b_v4_masked_atomicrmw_volatile_fmax_align4_seqcst_0_Dv4_u3ptrDv4_fDv4_b(<4 x ptr> [[PTRS:%0]], <4 x float> [[VALS:%1]], <4 x i1> [[MASK:%2]]) {
+; CHECK: define <4 x float> @__vecz_b_v4_masked_atomicrmw_volatile_fmax_align4_seqcst_0_Dv4_u3ptrDv4_fDv4_b(<4 x ptr> [[PTRS:%0]], <4 x float> [[VALS:%1]], <4 x i1> [[MASK:%2]]) [[ATTRS]] {
 ; CHECK: atomicrmw volatile fmax ptr {{%.*}}, float {{%.*}} syncscope("singlethread") seq_cst, align 4
 
+; CHECK: attributes [[ATTRS]] = { norecurse nounwind }
+
 declare i64 @__mux_get_global_id(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_cmpxchg.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_cmpxchg.ll
index 73aec6dfc4caa..80576d6aa3f15 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_cmpxchg.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_cmpxchg.ll
@@ -64,7 +64,7 @@ if.end:                                           ; preds = %if.then, %entry
   ret void
 }
 
-; CHECK: define { <4 x i32>, <4 x i1> } @__vecz_b_v4_masked_cmpxchg_align4_acquire_monotonic_1_Dv4_u3ptrDv4_jDv4_jDv4_b(<4 x ptr> [[PTRS:%0]], <4 x i32> [[CMPS:%1]], <4 x i32> [[NEWS:%2]], <4 x i1> [[MASK:%3]]) {
+; CHECK: define { <4 x i32>, <4 x i1> } @__vecz_b_v4_masked_cmpxchg_align4_acquire_monotonic_1_Dv4_u3ptrDv4_jDv4_jDv4_b(<4 x ptr> [[PTRS:%0]], <4 x i32> [[CMPS:%1]], <4 x i32> [[NEWS:%2]], <4 x i1> [[MASK:%3]]) [[ATTRS:#[0-9]+]] {
 ; CHECK: entry:
 ; CHECK: br label %loopIR
 
@@ -99,7 +99,9 @@ if.end:                                           ; preds = %if.then, %entry
 
 ; Assume that all masked cmpxchg operations follow the logic above. Just
 ; check that the right cmpxchg instruction is being generated.
-; CHECK: define { <4 x i32>, <4 x i1> } @__vecz_b_v4_masked_cmpxchg_weak_volatile_align8_monotonic_seqcst_0_Dv4_u3ptrDv4_jDv4_jDv4_b(<4 x ptr> [[PTRS:%0]], <4 x i32> [[CMPS:%1]], <4 x i32> [[NEWS:%2]], <4 x i1> [[MASK:%3]]) {
+; CHECK: define { <4 x i32>, <4 x i1> } @__vecz_b_v4_masked_cmpxchg_weak_volatile_align8_monotonic_seqcst_0_Dv4_u3ptrDv4_jDv4_jDv4_b(<4 x ptr> [[PTRS:%0]], <4 x i32> [[CMPS:%1]], <4 x i32> [[NEWS:%2]], <4 x i1> [[MASK:%3]]) [[ATTRS]] {
 ; CHECK: cmpxchg weak volatile ptr {{%.*}}, i32 {{%.*}}, i32 {{%.*}} syncscope("singlethread") monotonic seq_cst, align 8
 
+; CHECK: attributes [[ATTRS]] = { norecurse nounwind }
+
 declare i64 @__mux_get_global_id(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved.ll
index 4c3d9e82970fe..43dcc6217ce77 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved.ll
@@ -65,10 +65,12 @@ attributes #2 = { nobuiltin }
 !6 = !{!"clang version 3.8.0 "}
 
 
-; CHECK: define void @__vecz_b_masked_interleaved_store4_2_Dv4_ju3ptrU3AS1Dv4_b(<4 x i32>{{( %0)?}}, ptr addrspace(1){{( %1)?}}, <4 x i1>{{( %2)?}}) {
+; CHECK: define void @__vecz_b_masked_interleaved_store4_2_Dv4_ju3ptrU3AS1Dv4_b(<4 x i32>{{( %0)?}}, ptr addrspace(1){{( %1)?}}, <4 x i1>{{( %2)?}}) [[ATTRS:#[0-9]+]] {
 ; CHECK: entry:
 ; CHECK: %BroadcastAddr.splatinsert = insertelement <4 x ptr addrspace(1)> {{poison|undef}}, ptr addrspace(1) %1, {{i32|i64}} 0
 ; CHECK: %BroadcastAddr.splat = shufflevector <4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <4 x ptr addrspace(1)> {{poison|undef}}, <4 x i32> zeroinitializer
 ; CHECK: %3 = getelementptr i32, <4 x ptr addrspace(1)> %BroadcastAddr.splat, <4 x i64> <i64 0, i64 2, i64 4, i64 6>
 ; CHECK: call void @llvm.masked.scatter.v4i32.v4p1(<4 x i32> %0, <4 x ptr addrspace(1)> %3, i32{{( immarg)?}} 4, <4 x i1> %2) #
 ; CHECK: ret void
+
+; CHECK: attributes [[ATTRS]] = { norecurse nounwind }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_as_scatter.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_as_scatter.ll
index 5166cab218e80..11d14417f9ecf 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_as_scatter.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_as_scatter.ll
@@ -65,7 +65,7 @@ attributes #2 = { nobuiltin }
 !6 = !{!"clang version 3.8.0 "}
 
 
-; CHECK: define void @__vecz_b_masked_interleaved_store4_2_Dv4_ju3ptrU3AS1Dv4_b(<4 x i32>{{( %0)?}}, ptr addrspace(1){{( %1)?}}, <4 x i1>{{( %2)?}}) {
+; CHECK: define void @__vecz_b_masked_interleaved_store4_2_Dv4_ju3ptrU3AS1Dv4_b(<4 x i32>{{( %0)?}}, ptr addrspace(1){{( %1)?}}, <4 x i1>{{( %2)?}}) [[ATTRS:#[0-9]+]] {
 
 ; Check for the address splat
 ; CHECK: %[[BROADCASTADDRSPLATINSERT:.+]] = insertelement <4 x ptr addrspace(1)> {{poison|undef}}, ptr addrspace(1) %{{.+}}, {{i32|i64}} 0
@@ -73,3 +73,5 @@ attributes #2 = { nobuiltin }
 ; CHECK: getelementptr i32, <4 x ptr addrspace(1)> %[[BROADCASTADDRSPLAT]], <4 x i64> <i64 0, i64 2, i64 4, i64 6>
 
 ; CHECK: ret void
+
+; CHECK: attributes [[ATTRS]] = { norecurse nounwind }

From 2f7e00c79ec72da92672eca86a503c24a1ae047b Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Tue, 9 Jan 2024 12:36:40 +0000
Subject: [PATCH 080/182] [vecz] Ensure inactive lanes don't contribute to
 branch conditions

In divergent blocks, we must ensure that masked-out - or inactive -
work-items don't contribute a 'true' value towards the branch condition.
Masked-out values may be poison or undef, such as those coming from a
masked-load operation.

Note that this can happen even with ostensibly 'uniform' values, as our
uniform/divergent analysis isn't capable of discerning an
unconditionally uniform value from one that's uniform but produces
poison/garbage when masked out.

To work around this, we ensure that branch conditions in divergent
blocks are masked with the block's entry mask. This should ensure that
only lanes that are meant to be active at that point contribute their
values.

This is likely not a 100% complete fix, as we don't perform this on
uniform blocks or loops. There are tests with 'uniform' loops which are
unconditionally entered despite no work-items being active. As such, if
we mask the loop exit condition with the entry mask, it will never be
true, and the loop will never exit. This should be good enough to fix some
known regressions - the more correct fix would likely involve a lot more
work.

Note also that it is pessimistic in many of the vecz test changes. Some
(unmasked) uniform conditions don't need this applied, as they truly
always produce the same value even on inactive lanes. We don't have the
tools to distinguish these values from the ones that do need masking.
---
 .../control_flow_conversion_pass.cpp          |  46 ++++-
 .../vecz/test/lit/llvm/Boscc/boscc_merge.ll   |   7 +-
 .../lit/llvm/Boscc/partial_linearization13.ll |   6 +-
 .../lit/llvm/Boscc/partial_linearization5.ll  |   6 +-
 .../lit/llvm/Boscc/partial_linearization6.ll  |   6 +-
 .../vecz/test/lit/llvm/divergent_loop_bug.ll  | 186 ++++++++++++++++++
 .../test/lit/llvm/partial_linearization13.ll  |   6 +-
 .../test/lit/llvm/partial_linearization5.ll   |   6 +-
 8 files changed, 262 insertions(+), 7 deletions(-)
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/divergent_loop_bug.ll

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
index f7649b5b4e46c..2b6ae432554c5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
@@ -25,12 +25,15 @@
 #include <llvm/Analysis/LoopInfo.h>
 #include <llvm/Analysis/PostDominators.h>
 #include <llvm/Analysis/ValueTracking.h>
+#include <llvm/IR/Argument.h>
 #include <llvm/IR/BasicBlock.h>
 #include <llvm/IR/CFG.h>
 #include <llvm/IR/DerivedTypes.h>
 #include <llvm/IR/Dominators.h>
 #include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/InstrTypes.h>
 #include <llvm/IR/Instructions.h>
+#include <llvm/Support/Casting.h>
 #include <llvm/Support/Debug.h>
 #include <llvm/Support/Error.h>
 #include <llvm/Support/TypeSize.h>
@@ -401,6 +404,27 @@ static inline Error makeStringError(const Twine &message, Instruction &I) {
   helper_stream << " " << I;
   return make_error<StringError>(helper_stream.str(), inconvertibleErrorCode());
 }
+
+// A conservative helper method to determine whether a branch condition
+// (expected to be an i1 result of a comparison instruction) is truly uniform.
+// Note that we can't (currently) rely on UniformValueAnalysis for this
+// purpose. We need to be able to discern "truly" uniform values from uniform
+// values which are only uniform on active lanes.
+// FIXME: This is pessimistic. We could expand on this, or enhance the
+// UniformValueAnalysis.
+static bool isBranchCondTrulyUniform(Value *cond) {
+  const auto *cmp = dyn_cast_if_present<CmpInst>(cond);
+  if (!cmp || cmp->getType()->isVectorTy()) {
+    return false;
+  }
+
+  // Pessimistically assume that only arguments and constants are truly
+  // uniform: i.e., they won't give different results on active vs inactive
+  // lanes.
+  return llvm::all_of(cmp->operands(), [](Value *op) {
+    return isa<Argument>(op) || isa<Constant>(op);
+  });
+}
 }  // namespace
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -1503,11 +1527,31 @@ bool ControlFlowConversionState::Impl::createBranchReductions() {
     auto *TI = BB.getTerminator();
     if (BranchInst *Branch = dyn_cast<BranchInst>(TI)) {
       if (Branch->isConditional()) {
-        auto *const cond = Branch->getCondition();
+        auto *cond = Branch->getCondition();
         if (isa<Constant>(cond)) {
           continue;
         }
 
+        // On divergent paths, ensure that only active lanes contribute to a
+        // branch condition; merge the branch condition with the active lane
+        // mask. This ensures that disabled lanes don't spuriously contribute a
+        // 'true' value into the reduced branch condition.
+        // Note that the distinction between 'uniform' and 'divergent' isn't
+        // 100% sufficient for our purposes here, because even uniform values
+        // may read undefined/poison values when masked out.
+        // Don't perform this on uniform loops as those may be unconditionally
+        // entered even when no work-items are active. Masking the loop exit
+        // with the entry mask would mean that the loop never exits.
+        // FIXME: Is this missing incorrect branches in uniform blocks/loops?
+        if (auto *LTag = DR->getTag(&BB).loop;
+            DR->isDivergent(BB) && (!LTag || LTag->isLoopDivergent())) {
+          if (!isBranchCondTrulyUniform(cond)) {
+            cond = BinaryOperator::Create(Instruction::BinaryOps::And, cond,
+                                          MaskInfos[&BB].entryMask,
+                                          cond->getName() + "_active", Branch);
+          }
+        }
+
         const auto &name = needsAllOfMask ? nameAll : nameAny;
         Function *const F = Ctx.getOrCreateInternalBuiltin(
             Twine(baseName).concat(name).str(), FT);
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge.ll
index e8e6062d818f1..d9d0aa467205c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge.ll
@@ -122,6 +122,7 @@ if.else6:                                             ; preds = %if.then6, %if.e
 ; CHECK:  br i1 %[[CMP1]], label %[[IFTHEN:.+]], label %[[IFELSE:.+]]
 
 ; CHECK: [[IFTHEN]]:
+; CHECK: %[[CMP2:.+]] = icmp
 ; CHECK: br i1 %{{.+}}, label %[[IFTHEN2UNIFORM:.+]], label %[[IFTHENBOSCCINDIR:.+]]
 
 ; CHECK: [[IFELSE2PREHEADERUNIFORM:.+]]:
@@ -220,7 +221,11 @@ if.else6:                                             ; preds = %if.then6, %if.e
 
 ; CHECK: [[IFTHEN2:.+]]:
 ; CHECK: %[[CMP3:.+]] = icmp
-; CHECK: br i1 %[[CMP3]], label %[[IFTHEN3PREHEADER:.+]], label %[[IFELSE3PREHEADER:.+]]
+; FIXME: We shouldn't need to mask this comparison, as it's truly uniform even
+; on inactive lanes.
+; CHECK: %[[CMP3_ACTIVE:.+]] = and i1 %[[CMP3]], %[[CMP2]]
+; CHECK: %[[CMP3_ACTIVE_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[CMP3_ACTIVE]])
+; CHECK: br i1 %[[CMP3_ACTIVE_ANY]], label %[[IFTHEN3PREHEADER:.+]], label %[[IFELSE3PREHEADER:.+]]
 
 ; CHECK: [[IFELSE3PREHEADER]]:
 ; CHECK: br label %[[IFELSE3]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization13.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization13.ll
index f7937ad89512f..7212b71ed9e23 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization13.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization13.ll
@@ -228,7 +228,11 @@ attributes #2 = { nobuiltin nounwind readonly }
 
 ; CHECK: [[IFTHEN4]]:
 ; CHECK: %[[TRUNC:.+]] = icmp
-; CHECK: br i1 %[[TRUNC]], label %[[SWBB8:.+]], label %[[SWBB:.+]]
+; FIXME: We shouldn't need to mask this comparison, as it's truly uniform even
+; on inactive lanes.
+; CHECK: %[[TRUNC_ACTIVE:.+]] = and i1 %[[TRUNC]], {{%.*}}
+; CHECK: %[[TRUNC_ACTIVE_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[TRUNC_ACTIVE]])
+; CHECK: br i1 %[[TRUNC_ACTIVE_ANY]], label %[[SWBB8:.+]], label %[[SWBB:.+]]
 
 ; CHECK: [[SWBB]]:
 ; CHECK: br label %[[SWBB8]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization5.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization5.ll
index cb6acb6c70594..fb52cd7854755 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization5.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization5.ll
@@ -227,7 +227,11 @@ attributes #2 = { nobuiltin nounwind readonly }
 
 ; CHECK: [[IFELSE5]]:
 ; CHECK: %[[CMP7:.+]] = icmp
-; CHECK: br i1 %[[CMP7]], label %[[IFTHEN]], label %[[FORCOND14PREHEADER:.+]]
+; FIXME: We shouldn't need to mask this comparison, as it's truly uniform even
+; on inactive lanes.
+; CHECK: %[[CMP7_ACTIVE:.+]] = and i1 %[[CMP7]], {{%.*}}
+; CHECK: %[[CMP7_ACTIVE_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[CMP7_ACTIVE]])
+; CHECK: br i1 %[[CMP7_ACTIVE_ANY]], label %[[IFTHEN]], label %[[FORCOND14PREHEADER:.+]]
 
 ; CHECK: [[FORCOND14PREHEADER]]:
 ; CHECK: br label %[[FORCOND14:.+]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization6.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization6.ll
index c5a8af2b8b89b..b4a295e3f90c6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization6.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization6.ll
@@ -168,7 +168,11 @@ attributes #2 = { nobuiltin nounwind readonly }
 
 ; CHECK: [[WHILEBODY]]:
 ; CHECK: %[[CMP:.+]] = icmp
-; CHECK: br i1 %[[CMP]], label %[[IFTHEN:.+]], label %[[IFELSE:.+]]
+; FIXME: We shouldn't need to mask this comparison, as it's truly uniform even
+; on inactive lanes.
+; CHECK: %[[CMP_ACTIVE:.+]] = and i1 %[[CMP]], {{%.*}}
+; CHECK: %[[CMP_ACTIVE_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[CMP_ACTIVE]])
+; CHECK: br i1 %[[CMP_ACTIVE_ANY]], label %[[IFTHEN:.+]], label %[[IFELSE:.+]]
 
 ; CHECK: [[IFTHEN]]:
 ; CHECK: %[[CMP2:.+]] = icmp
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/divergent_loop_bug.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/divergent_loop_bug.ll
new file mode 100644
index 0000000000000..9eacfe58ca85c
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/divergent_loop_bug.ll
@@ -0,0 +1,186 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -vecz-passes=cfg-convert -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; CHECK: define spir_kernel void @__vecz_v4_uniform_if_then_in_divergent_block(
+; CHECK-SAME:                      ptr addrspace(1) %accum_ptr, i32 %threshold, ptr addrspace(1) %out)
+define spir_kernel void @uniform_if_then_in_divergent_block(ptr addrspace(1) %accum_ptr, i32 %threshold, ptr addrspace(1) %out) #4 !reqd_work_group_size !10 {
+; CHECK: entry:
+; CHECK: [[CMP_NOT:%.*]] = icmp slt i32 %0, %threshold
+; CHECK: %cmp.not.ROSCC = icmp eq i1 [[CMP_NOT]], false
+; CHECK: %cmp.not.ROSCC_any = call i1 @__vecz_b_divergence_any(i1 %cmp.not.ROSCC)
+; CHECK: br i1 %cmp.not.ROSCC_any, label %entry.ROSCC, label %entry.if.end17_crit_edge
+entry:
+  %cosa = alloca float, align 4
+  %call = tail call i64 @__mux_get_global_id(i32 0) #5
+  %sext = mul i64 %call, 51539607552
+  %idx.ext = ashr exact i64 %sext, 32
+  %add.ptr = getelementptr inbounds i32, ptr addrspace(1) %accum_ptr, i64 %idx.ext
+  %0 = load i32, ptr addrspace(1) %add.ptr, align 4
+  %cmp.not = icmp slt i32 %0, %threshold
+  br i1 %cmp.not, label %entry.if.end17_crit_edge, label %if.then
+
+; CHECK: entry.ROSCC:
+; CHECK: [[CMP_NOT_NOT:%.*]] = xor i1 [[CMP_NOT]], true
+; CHECK: br label %if.then
+
+entry.if.end17_crit_edge:                          ; preds = %entry
+  br label %if.end17
+
+; %or.cond is uniform but its block is divergent: ensure that only active lanes
+; (masked by %cmp.not.not) contribute towards the %or.cond branch.
+; CHECK: if.then:
+; CHECK: call void @__vecz_b_masked_store4_fu3ptrb(float 0.000000e+00, ptr %cosa, i1 [[CMP_NOT_NOT]])
+; CHECK: %1 = call spir_func float @__vecz_b_masked__Z6sincosfPf(float 0.000000e+00, ptr nonnull %cosa, i1 [[CMP_NOT_NOT]]) #9
+; CHECK: %2 = call float @__vecz_b_masked_load4_fu3ptrb(ptr %cosa, i1 [[CMP_NOT_NOT]])
+; CHECK: %mul7 = fmul float %2, -2.950000e+01
+; CHECK: %cmp11 = fcmp uge float %mul7, 0.000000e+00
+; CHECK: %cmp14 = fcmp ult float %mul7, 6.400000e+01
+; CHECK: %or.cond = and i1 %cmp11, %cmp14
+; CHECK: %or.cond_active = and i1 %or.cond, [[CMP_NOT_NOT]]
+; CHECK: %or.cond_active_any = call i1 @__vecz_b_divergence_any(i1 %or.cond_active)
+; CHECK: br i1 %or.cond_active_any, label %if.then.if.end_crit_edge, label %if.then16
+if.then:                                           ; preds = %entry
+  call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %cosa) #6
+  store float 0.000000e+00, ptr %cosa, align 4
+  %call4 = call spir_func float @_Z6sincosfPf(float 0.000000e+00, ptr nonnull %cosa) #7
+  %1 = load float, ptr %cosa, align 4
+  %mul7 = fmul float %1, -2.950000e+01
+  %cmp11 = fcmp uge float %mul7, 0.000000e+00
+  %cmp14 = fcmp ult float %mul7, 6.400000e+01
+  %or.cond = and i1 %cmp11, %cmp14
+  br i1 %or.cond, label %if.then.if.end_crit_edge, label %if.then16
+
+if.then.if.end_crit_edge:                          ; preds = %if.then
+  br label %if.end
+
+if.then16:                                         ; preds = %if.then
+  %sext2 = shl i64 %call, 32
+  %idxprom = ashr exact i64 %sext2, 32
+  %arrayidx = getelementptr inbounds float, ptr addrspace(1) %out, i64 %idxprom
+  store float %mul7, ptr addrspace(1) %arrayidx, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then.if.end_crit_edge, %if.then16
+  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %cosa) #6
+  br label %if.end17
+
+if.end17:                                         ; preds = %entry.if.end17_crit_edge, %if.end
+  ret void
+}
+
+define spir_kernel void @uniform_if_else_in_divergent_block(ptr addrspace(1) %accum_ptr, i32 %threshold, ptr addrspace(1) %out) #4 !reqd_work_group_size !10 {
+; CHECK: entry:
+; CHECK: [[CMP_NOT:%.*]] = icmp slt i32 %0, %threshold
+; CHECK: %cmp.not.ROSCC = icmp eq i1 [[CMP_NOT]], false
+; CHECK: %cmp.not.ROSCC_any = call i1 @__vecz_b_divergence_any(i1 %cmp.not.ROSCC)
+; CHECK: br i1 %cmp.not.ROSCC_any, label %entry.ROSCC, label %entry.if.end17_crit_edge
+entry:
+  %cosa = alloca float, align 4
+  %call = tail call i64 @__mux_get_global_id(i32 0) #5
+  %sext = mul i64 %call, 51539607552
+  %idx.ext = ashr exact i64 %sext, 32
+  %add.ptr = getelementptr inbounds i32, ptr addrspace(1) %accum_ptr, i64 %idx.ext
+  %0 = load i32, ptr addrspace(1) %add.ptr, align 4
+  %cmp.not = icmp slt i32 %0, %threshold
+  br i1 %cmp.not, label %entry.if.end17_crit_edge, label %if.then
+
+; CHECK: entry.ROSCC:
+; CHECK: [[CMP_NOT_NOT:%.*]] = xor i1 [[CMP_NOT]], true
+; CHECK: br label %if.then
+
+entry.if.end17_crit_edge:                          ; preds = %entry
+  br label %if.end17
+
+; %or.cond is uniform but its block is divergent: ensure that only active lanes
+; (masked by %cmp.not.not) contribute towards the %or.cond branch.
+; CHECK: if.then:
+; CHECK: call void @__vecz_b_masked_store4_fu3ptrb(float 0.000000e+00, ptr %cosa, i1 [[CMP_NOT_NOT]])
+; CHECK: %1 = call spir_func float @__vecz_b_masked__Z6sincosfPf(float 0.000000e+00, ptr nonnull %cosa, i1 [[CMP_NOT_NOT]]) #9
+; CHECK: %2 = call float @__vecz_b_masked_load4_fu3ptrb(ptr %cosa, i1 [[CMP_NOT_NOT]])
+; CHECK: %mul7 = fmul float %2, -2.950000e+01
+; CHECK: %cmp11 = fcmp uge float %mul7, 0.000000e+00
+; CHECK: %cmp14 = fcmp ult float %mul7, 6.400000e+01
+; CHECK: %or.cond = and i1 %cmp11, %cmp14
+; CHECK: %or.cond_active = and i1 %or.cond, [[CMP_NOT_NOT]]
+; CHECK: %or.cond_active_any = call i1 @__vecz_b_divergence_any(i1 %or.cond_active)
+; CHECK: br i1 %or.cond_active_any, label %if.else.crit_edge, label %if.then16
+if.then:                                           ; preds = %entry
+  call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %cosa) #6
+  store float 0.000000e+00, ptr %cosa, align 4
+  %call4 = call spir_func float @_Z6sincosfPf(float 0.000000e+00, ptr nonnull %cosa) #7
+  %1 = load float, ptr %cosa, align 4
+  %mul7 = fmul float %1, -2.950000e+01
+  %cmp11 = fcmp uge float %mul7, 0.000000e+00
+  %cmp14 = fcmp ult float %mul7, 6.400000e+01
+  %or.cond = and i1 %cmp11, %cmp14
+  br i1 %or.cond, label %if.else.crit_edge, label %if.then16
+
+if.else.crit_edge:                                 ; preds = %if.then
+  br label %if.else
+
+if.then16:                                         ; preds = %if.then
+  %sext2 = shl i64 %call, 32
+  %idxprom = ashr exact i64 %sext2, 32
+  %arrayidx = getelementptr inbounds float, ptr addrspace(1) %out, i64 %idxprom
+  store float %mul7, ptr addrspace(1) %arrayidx, align 4
+  br label %if.end
+
+if.else:                                          ; preds = %if.else.crit_edge
+  %arrayidx2 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %call ; index with %call: if.then16's %idxprom does not dominate if.else
+  store float 1.0, ptr addrspace(1) %arrayidx2, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then16
+  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %cosa) #6
+  br label %if.end17
+
+if.end17:                                         ; preds = %entry.if.end17_crit_edge, %if.end
+  ret void
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+
+; Function Attrs: nounwind
+declare spir_func float @_Z6sincosfPf(float, ptr) #2
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
+
+; Function Attrs: alwaysinline norecurse nounwind memory(read)
+declare i64 @__mux_get_global_id(i32) #3
+
+attributes #0 = { norecurse nounwind "mux-kernel"="entry-point" "mux-local-mem-usage"="0" "mux-no-subgroups" "mux-orig-fn"="get_lines" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "vecz-mode"="auto" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) "vecz-mode"="auto" }
+attributes #2 = { nounwind "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="0" "stackrealign" "vecz-mode"="auto" }
+attributes #3 = { alwaysinline norecurse nounwind memory(read) "vecz-mode"="auto" }
+attributes #4 = { norecurse nounwind "mux-base-fn-name"="get_lines" "mux-kernel"="entry-point" "mux-local-mem-usage"="0" "mux-no-subgroups" "mux-orig-fn"="get_lines" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "vecz-mode"="auto" }
+attributes #5 = { alwaysinline norecurse nounwind memory(read) }
+attributes #6 = { nounwind }
+attributes #7 = { nobuiltin nounwind "no-builtins" }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!2}
+!opencl.spir.version = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!2 = !{i32 1, i32 2}
+!10 = !{i32 2, i32 1, i32 1}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization13.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization13.ll
index 5385a5ab95d69..fbef9b7593967 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization13.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization13.ll
@@ -198,7 +198,11 @@ attributes #2 = { nobuiltin nounwind readonly }
 ; CHECK: [[IFTHEN4]]:
 ; CHECK: %[[TMP:.+]] = and i64 %call1, 1
 ; CHECK: %[[TRUNC:.+]] = icmp eq i64 %[[TMP]], 0
-; CHECK: br i1 %[[TRUNC]], label %[[SWBB8:.+]], label %[[SWBB:.+]]
+; FIXME: We shouldn't need to mask this comparison, as it's truly uniform even
+; on inactive lanes.
+; CHECK: %[[TRUNC_ACTIVE:.+]] = and i1 %[[TRUNC]], {{%.*}}
+; CHECK: %[[TRUNC_ACTIVE_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[TRUNC_ACTIVE]])
+; CHECK: br i1 %[[TRUNC_ACTIVE_ANY]], label %[[SWBB8:.+]], label %[[SWBB:.+]]
 
 ; CHECK: [[SWBB]]:
 ; CHECK: br label %[[SWBB8]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization5.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization5.ll
index 520b069f53c85..e4e6badc21dea 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization5.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization5.ll
@@ -187,7 +187,11 @@ attributes #2 = { nobuiltin nounwind readonly }
 
 ; CHECK: [[IFELSE5]]:
 ; CHECK: %[[CMP7:.+]] = icmp
-; CHECK: br i1 %[[CMP7]], label %[[IFTHEN]], label %[[FORCOND14PREHEADER:.+]]
+; FIXME: We shouldn't need to mask this comparison, as it's truly uniform even
+; on inactive lanes.
+; CHECK: %[[CMP7_ACTIVE:.+]] = and i1 %[[CMP7]], {{%.*}}
+; CHECK: %[[CMP7_ACTIVE_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[CMP7_ACTIVE]])
+; CHECK: br i1 %[[CMP7_ACTIVE_ANY]], label %[[IFTHEN]], label %[[FORCOND14PREHEADER:.+]]
 
 ; CHECK: [[FORCOND14PREHEADER]]:
 ; CHECK: br label %[[FORCOND14:.+]]

From 3bddeaa645a1f6ec30563d013701b6477a7bdbc6 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Wed, 11 Oct 2023 12:58:55 +0100
Subject: [PATCH 081/182] [compiler] Ensure createLoop always creates a loop

This simplifies the `createLoop` API so that users can rely on it having
generated a loop CFG structure. It correspondingly removes the
`allowUnroll` option.

Now, even when the loop bounds are constant and wouldn't generate a loop
(i.e., 0 or 1 iterations), `createLoop` will generate a loop. Users can
rely on standard LLVM passes to eliminate/simplify the loop.
---
 .../vecz/test/lit/llvm/masked_atomics_scalar.ll      |  9 +++++++--
 .../vecz/test/lit/llvm/masked_cmpxchg_scalar.ll      | 12 +++++++++---
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_atomics_scalar.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_atomics_scalar.ll
index 6cab589dd89f8..ffe4fd78b8419 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_atomics_scalar.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_atomics_scalar.ll
@@ -28,6 +28,10 @@ declare i32 @__vecz_b_v1_masked_atomicrmw_add_align4_acquire_1_u3ptrjb(ptr %p, i
 
 ; CHECK: define i32 @__vecz_b_v1_masked_atomicrmw_add_align4_acquire_1_u3ptrjb(ptr %p, i32 %val, i1 %mask) {
 ; CHECK: entry:
+; CHECK: br label %loopIR
+
+; CHECK: loopIR:
+; CHECK: [[RET_PREV:%.*]] = phi i32 [ poison, %entry ], [ [[RET:%.*]], %if.else ]
 ; CHECK: [[MASKCMP:%.*]] = icmp ne i1 %mask, false
 ; CHECK: br i1 [[MASKCMP]], label %if.then, label %if.else
 
@@ -36,8 +40,9 @@ declare i32 @__vecz_b_v1_masked_atomicrmw_add_align4_acquire_1_u3ptrjb(ptr %p, i
 ; CHECK: br label %if.else
 
 ; CHECK: if.else:
-; CHECK: [[RET:%.*]] = phi i32 [ poison, %entry ], [ [[ATOM]], %if.then ]
-; CHECK: br label %exit
+; CHECK: [[RET]] = phi i32 [ [[RET_PREV]], %loopIR ], [ [[ATOM]], %if.then ]
+; CHECK: [[CMP:%.*]] = icmp ult i32 %{{.*}}, 1
+; CHECK: br i1 [[CMP]], label %loopIR, label %exit
 
 ; CHECK: exit:
 ; CHECK: ret i32 [[RET]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_cmpxchg_scalar.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_cmpxchg_scalar.ll
index 831b6cca8fae8..148776ddda6dc 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_cmpxchg_scalar.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_cmpxchg_scalar.ll
@@ -28,6 +28,11 @@ declare { i32, i1 } @__vecz_b_v1_masked_cmpxchg_align4_acquire_monotonic_1_u3ptr
 
 ; CHECK: define { i32, i1 } @__vecz_b_v1_masked_cmpxchg_align4_acquire_monotonic_1_u3ptrjjb(ptr %p, i32 %cmp, i32 %newval, i1 %mask) {
 ; CHECK: entry:
+; CHECK: br label %loopIR
+
+; CHECK: loopIR:
+; CHECK: [[RETVAL_PREV:%.*]] = phi i32 [ poison, %entry ], [ [[RETVAL:%.*]], %if.else ]
+; CHECK: [[RETSUCC_PREV:%.*]] = phi i1 [ poison, %entry ], [ [[RETSUCC:%.*]], %if.else ]
 ; CHECK: [[MASKCMP:%.*]] = icmp ne i1 %mask, false
 ; CHECK: br i1 [[MASKCMP]], label %if.then, label %if.else
 
@@ -38,9 +43,10 @@ declare { i32, i1 } @__vecz_b_v1_masked_cmpxchg_align4_acquire_monotonic_1_u3ptr
 ; CHECK: br label %if.else
 
 ; CHECK: if.else:
-; CHECK: [[RETVAL:%.*]] = phi i32 [ poison, %entry ], [ [[EXT0]], %if.then ]
-; CHECK: [[RETSUCC:%.*]] = phi i1 [ poison, %entry ], [ [[EXT1]], %if.then ]
-; CHECK: br label %exit
+; CHECK: [[RETVAL]] = phi i32 [ [[RETVAL_PREV]], %loopIR ], [ [[EXT0]], %if.then ]
+; CHECK: [[RETSUCC]] = phi i1 [ [[RETSUCC_PREV]], %loopIR ], [ [[EXT1]], %if.then ]
+; CHECK: [[CMP:%.*]] = icmp ult i32 %{{.*}}, 1
+; CHECK: br i1 [[CMP]], label %loopIR, label %exit
 
 ; CHECK: exit:
 ; CHECK: [[INS0:%.*]] = insertvalue { i32, i1 } poison, i32 [[RETVAL]], 0

From 73fd00604d039846844e1561a6a5a4439ea7885e Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Tue, 16 Jan 2024 11:21:07 +0000
Subject: [PATCH 082/182] [NFC] Merge OpaquePointers with main tests.

We created OpaquePointers directories for opaque pointer-specific tests
when our main tests still had to support both typed and opaque pointers.
This is no longer the case: we only support opaque pointers, and the split
has resulted in some duplicated tests that no longer have any benefit to
us.
---
 .../lit/llvm/OpaquePointers/basic_mem2reg.ll  | 63 ------------------
 .../OpaquePointers/builtin_pointer_return.ll  | 66 -------------------
 .../control_flow_conversion_ptrs.ll           | 54 ---------------
 .../OpaquePointers/interleaved_load_ooo.ll    | 57 ----------------
 .../lit/llvm/OpaquePointers/remove_intptr.ll  | 54 ---------------
 .../basic_vecz_mem2reg.ll                     |  0
 .../builtin_inlining_mem.ll                   |  0
 .../test/lit/llvm/builtin_pointer_return.ll   | 16 ++---
 .../lit/llvm/control_flow_conversion_ptrs.ll  | 35 ++--------
 .../{OpaquePointers => }/load_add_store.ll    |  0
 .../llvm/{OpaquePointers => }/masked_store.ll |  0
 .../{OpaquePointers => }/ternary_transform.ll |  0
 12 files changed, 15 insertions(+), 330 deletions(-)
 delete mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/basic_mem2reg.ll
 delete mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/builtin_pointer_return.ll
 delete mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/control_flow_conversion_ptrs.ll
 delete mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/interleaved_load_ooo.ll
 delete mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/remove_intptr.ll
 rename llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/{OpaquePointers => }/basic_vecz_mem2reg.ll (100%)
 rename llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/{OpaquePointers => }/builtin_inlining_mem.ll (100%)
 rename llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/{OpaquePointers => }/load_add_store.ll (100%)
 rename llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/{OpaquePointers => }/masked_store.ll (100%)
 rename llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/{OpaquePointers => }/ternary_transform.ll (100%)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/basic_mem2reg.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/basic_mem2reg.ll
deleted file mode 100644
index 5a23321bf148a..0000000000000
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/basic_mem2reg.ll
+++ /dev/null
@@ -1,63 +0,0 @@
-; Copyright (C) Codeplay Software Limited
-;
-; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
-; Exceptions; you may not use this file except in compliance with the License.
-; You may obtain a copy of the License at
-;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
-;
-; Unless required by applicable law or agreed to in writing, software
-; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-; License for the specific language governing permissions and limitations
-; under the License.
-;
-; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-; RUN: veczc -vecz-passes="function(mem2reg),vecz-mem2reg" -vecz-simd-width=4 -vecz-handle-declaration-only-calls -S < %s | FileCheck %s
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "spir64-unknown-unknown"
-
-define spir_kernel void @test(i32 %a, i32 %b, i32* %c, float %rf) {
-entry:
-  %d = alloca i32
-  %e = alloca i32
-  %f = alloca float
-  %gid = call i64 @__mux_get_global_id(i32 0)
-  %sum = add i32 %a, %b
-  store i32 %sum, i32* %d, align 4
-  store i32 %sum, i32* %e, align 4
-  %call = call spir_func i32 @foo(i32* %e)
-  %d.load = load i32, i32* %d, align 4
-  %e.load = load i32, i32* %e, align 4
-  %c0 = getelementptr i32, i32* %c, i64 %gid
-  store i32 %d.load, i32* %c0, align 4
-  %c1 = getelementptr i32, i32* %c0, i64 1
-  store i32 %e.load, i32* %c1, align 4
-  store float %rf, float* %f
-  %ri = bitcast float* %f to i32*
-  %ri.load = load i32, i32* %ri, align 4
-  %c2 = getelementptr i32, i32* %c1, i64 2
-  store i32 %ri.load, i32* %c2, align 4
-  ret void
-}
-
-declare i64 @__mux_get_global_id(i32)
-declare spir_func i32 @foo(i32*)
-
-; CHECK: define spir_kernel void @__vecz_v4_test(i32 %a, i32 %b, ptr %c, float %rf)
-; CHECK: %e = alloca i32
-; CHECK: %gid = call i64 @__mux_get_global_id(i32 0)
-; CHECK: %sum = add i32 %a, %b
-; CHECK: store i32 %sum, ptr %e
-; CHECK: %call = call spir_func i32 @foo(ptr{{.*}} %e)
-; CHECK: %e.load = load i32, ptr %e
-; CHECK: %c0 = getelementptr i32, ptr %c, i64 %gid
-; CHECK: store i32 %sum, ptr %c0
-; CHECK: %c1 = getelementptr i32, ptr %c0, i64 1
-; CHECK: store i32 %e.load, ptr %c1
-; CHECK: %0 = bitcast float %rf to i32
-; CHECK: %c2 = getelementptr i32, ptr %c1, i64 2
-; CHECK: store i32 %0, ptr %c2, align 4
-; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/builtin_pointer_return.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/builtin_pointer_return.ll
deleted file mode 100644
index 74f64b5b77c12..0000000000000
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/builtin_pointer_return.ll
+++ /dev/null
@@ -1,66 +0,0 @@
-; Copyright (C) Codeplay Software Limited
-;
-; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
-; Exceptions; you may not use this file except in compliance with the License.
-; You may obtain a copy of the License at
-;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
-;
-; Unless required by applicable law or agreed to in writing, software
-; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-; License for the specific language governing permissions and limitations
-; under the License.
-;
-; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-target triple = "spir64-unknown-unknown"
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-; RUN: veczc -vecz-simd-width=4 -S < %s | FileCheck %s
-
-declare i64 @__mux_get_global_id(i32)
-
-declare spir_func float @_Z5fractfPf(float, float*)
-declare spir_func <2 x float> @_Z5fractDv2_fPS_(<2 x float>, <2 x float>*)
-declare spir_func <4 x float> @_Z5fractDv4_fPS_(<4 x float>, <4 x float>*)
-declare spir_func <8 x float> @_Z5fractDv8_fPS_(<8 x float>, <8 x float>*)
-
-; FIXME: Both of these are instantiating when we have vector equivalents: see
-; CA-4046.
-
-define spir_kernel void @fract_v1(float* %xptr, float* %outptr, float* %ioutptr) {
-  %iouta = alloca float
-  %idx = call i64 @__mux_get_global_id(i32 0)
-  %arrayidx.x = getelementptr inbounds float, float* %xptr, i64 %idx
-  %x = load float, float* %arrayidx.x, align 4
-  %out = call spir_func float @_Z5fractfPf(float %x, float* %iouta)
-  %arrayidx.out = getelementptr inbounds float, float* %outptr, i64 %idx
-  %arrayidx.iout = getelementptr inbounds float, float* %ioutptr, i64 %idx
-  store float %out, float* %arrayidx.out, align 4
-  %iout = load float, float* %iouta, align 4
-  store float %iout, float* %arrayidx.iout, align 4
-  ret void
-; CHECK: call spir_func float @_Z5fractfPf(float {{%.*}}, ptr nonnull {{%.*}})
-; CHECK: call spir_func float @_Z5fractfPf(float {{%.*}}, ptr nonnull {{%.*}})
-; CHECK: call spir_func float @_Z5fractfPf(float {{%.*}}, ptr nonnull {{%.*}})
-; CHECK: call spir_func float @_Z5fractfPf(float {{%.*}}, ptr nonnull {{%.*}})
-}
-
-define spir_kernel void @fract_v2(<2 x float>* %xptr, <2 x float>* %outptr, <2 x float>* %ioutptr) {
-  %iouta = alloca <2 x float>
-  %idx = call i64 @__mux_get_global_id(i32 0)
-  %arrayidx.x = getelementptr inbounds <2 x float>, <2 x float>* %xptr, i64 %idx
-  %x = load <2 x float>, <2 x float>* %arrayidx.x, align 8
-  %out = call spir_func <2 x float> @_Z5fractDv2_fPS_(<2 x float> %x, <2 x float>* %iouta)
-  %arrayidx.out = getelementptr inbounds <2 x float>, <2 x float>* %outptr, i64 %idx
-  %arrayidx.iout = getelementptr inbounds <2 x float>, <2 x float>* %ioutptr, i64 %idx
-  store <2 x float> %out, <2 x float>* %arrayidx.out, align 8
-  %iout = load <2 x float>, <2 x float>* %iouta, align 8
-  store <2 x float> %iout, <2 x float>* %arrayidx.iout, align 8
-  ret void
-; CHECK: call spir_func <2 x float> @_Z5fractDv2_fPS_(<2 x float> {{%.*}}, ptr nonnull {{%.*}})
-; CHECK: call spir_func <2 x float> @_Z5fractDv2_fPS_(<2 x float> {{%.*}}, ptr nonnull {{%.*}})
-; CHECK: call spir_func <2 x float> @_Z5fractDv2_fPS_(<2 x float> {{%.*}}, ptr nonnull {{%.*}})
-; CHECK: call spir_func <2 x float> @_Z5fractDv2_fPS_(<2 x float> {{%.*}}, ptr nonnull {{%.*}})
-}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/control_flow_conversion_ptrs.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/control_flow_conversion_ptrs.ll
deleted file mode 100644
index 853fb9229ce48..0000000000000
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/control_flow_conversion_ptrs.ll
+++ /dev/null
@@ -1,54 +0,0 @@
-; Copyright (C) Codeplay Software Limited
-;
-; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
-; Exceptions; you may not use this file except in compliance with the License.
-; You may obtain a copy of the License at
-;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
-;
-; Unless required by applicable law or agreed to in writing, software
-; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-; License for the specific language governing permissions and limitations
-; under the License.
-;
-; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-; RUN: veczc -vecz-passes=cfg-convert,define-builtins -vecz-simd-width=4 -S < %s | FileCheck %s
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "spir64-unknown-unknown"
-
-declare i64 @__mux_get_global_id(i32)
-
-define spir_kernel void @test_varying_if_ptr(i32 %a, ptr %b, ptr %on_true, ptr %on_false) {
-entry:
-  %conv = sext i32 %a to i64
-  %call = call i64 @__mux_get_global_id(i32 0)
-  %cmp = icmp eq i64 %conv, %call
-  br i1 %cmp, label %if.then, label %if.else
-
-if.then:
-  %idxprom = sext i32 %a to i64
-  %arrayidx = getelementptr inbounds ptr, ptr %b, i64 %idxprom
-  store ptr %on_true, ptr %arrayidx, align 4
-  br label %if.end
-
-if.else:
-  %arrayidx2 = getelementptr inbounds ptr, ptr %b, i64 42
-  store ptr %on_false, ptr %arrayidx2, align 4
-  br label %if.end
-
-if.end:
-  ret void
-}
-
-; CHECK:     define void @__vecz_b_masked_store4_u3ptru3ptrb(ptr [[A:%.*]], ptr [[B:%.*]], i1 [[MASK:%.*]]) [[ATTRS:#[0-9]+]] {
-; CHECK:       br i1 [[MASK]], label %[[IF:.*]], label %[[EXIT:.*]]
-; CHECK:     [[IF]]:
-; CHECK-NEXT:  store ptr [[A]], ptr [[B]], align 4
-; CHECK-NEXT:  br label %[[EXIT]]
-; CHECK:     [[EXIT]]:
-; CHECK-NEXT:  ret void
-
-; CHECK: attributes [[ATTRS]] = { norecurse nounwind }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/interleaved_load_ooo.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/interleaved_load_ooo.ll
deleted file mode 100644
index 81a08efe6f618..0000000000000
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/interleaved_load_ooo.ll
+++ /dev/null
@@ -1,57 +0,0 @@
-; Copyright (C) Codeplay Software Limited
-;
-; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
-; Exceptions; you may not use this file except in compliance with the License.
-; You may obtain a copy of the License at
-;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
-;
-; Unless required by applicable law or agreed to in writing, software
-; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-; License for the specific language governing permissions and limitations
-; under the License.
-;
-; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-; RUN: veczc --vecz-passes=interleave-combine-loads -vecz-simd-width=4 -S < %s | FileCheck %s
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "spir64-unknown-unknown"
-
-; This test checks that we can optimize interleaved accesses out of order.
-
-define dso_local spir_kernel void @interleaved_load_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %stride) {
-entry:
-  %call = tail call i64 @__mux_get_global_id(i32 0)
-  %conv = trunc i64 %call to i32
-  %call1 = tail call i64 @__mux_get_global_id(i32 1)
-  %conv2 = trunc i64 %call1 to i32
-  %mul = mul nsw i32 %conv2, %stride
-  %add = add nsw i32 %conv, %mul
-  %mul3 = shl nsw i32 %add, 1
-  %add4 = or i32 %mul3, 1
-  %idxprom = sext i32 %add4 to i64
-  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom
-  %0 = call <4 x i32> @__vecz_b_interleaved_load4_2_Dv4_jPU3AS1j(i32 addrspace(1)* %arrayidx)
-  %idxprom8 = sext i32 %mul3 to i64
-  %arrayidx9 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom8
-  %1 = call <4 x i32> @__vecz_b_interleaved_load4_2_Dv4_jPU3AS1j(i32 addrspace(1)* %arrayidx9)
-  %sub1 = sub nsw <4 x i32> %0, %1
-  %idxprom12 = sext i32 %add to i64
-  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom12
-  %2 = bitcast i32 addrspace(1)* %arrayidx13 to <4 x i32> addrspace(1)*
-  store <4 x i32> %sub1, <4 x i32> addrspace(1)* %2, align 4
-  ret void
-}
-
-; CHECK: __vecz_v4_interleaved_load_4(
-; CHECK:  [[TMP0:%.*]] = load <4 x i32>, ptr addrspace(1) [[PTR:%.*]], align 4
-; CHECK:  [[TMP1:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i32 4
-; CHECK:  [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4
-; CHECK:  %deinterleave = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK:  %deinterleave1 = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP2]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK:  %sub1 = sub nsw <4 x i32> %deinterleave1, %deinterleave
-
-declare i64 @__mux_get_global_id(i32)
-declare <4 x i32> @__vecz_b_interleaved_load4_2_Dv4_jPU3AS1j(i32 addrspace(1)*)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/remove_intptr.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/remove_intptr.ll
deleted file mode 100644
index 6872f1118377f..0000000000000
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/remove_intptr.ll
+++ /dev/null
@@ -1,54 +0,0 @@
-; Copyright (C) Codeplay Software Limited
-;
-; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
-; Exceptions; you may not use this file except in compliance with the License.
-; You may obtain a copy of the License at
-;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
-;
-; Unless required by applicable law or agreed to in writing, software
-; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-; License for the specific language governing permissions and limitations
-; under the License.
-;
-; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-; RUN: veczc -vecz-passes=remove-int-ptr -vecz-simd-width=4 -S < %s | FileCheck %s
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "spir64-unknown-unknown"
-
-; CHECK-LABEL: define spir_kernel void @__vecz_v4_intptr_cast_i8(
-; CHECK: %shl = shl i64 %call, 2
-; CHECK: %remove_intptr = getelementptr i8, ptr addrspace(1) %in, i64 %shl
-; CHECK: %remove_intptr1 = ptrtoint ptr addrspace(1) %remove_intptr to i64
-; CHECK: store i64 %remove_intptr1, ptr addrspace(1) %out, align 8
-define spir_kernel void @intptr_cast_i8(i8 addrspace(1)* %in, i64 addrspace(1)* %out) {
-entry:
-  %call = tail call i64 @__mux_get_global_id(i32 0)
-  %0 = ptrtoint i8 addrspace(1)* %in to i64
-  %shl = shl i64 %call, 2
-  %add = add i64 %shl, %0
-  store i64 %add, i64 addrspace(1)* %out, align 8
-  ret void
-}
-
-; Note that unlike with typed pointers, we don't need a bitcast to i8 here.
-
-; CHECK-LABEL: define spir_kernel void @__vecz_v4_intptr_cast_i16(
-; CHECK: %shl = shl i64 %call, 2
-; CHECK: %remove_intptr = getelementptr i8, ptr addrspace(1) %in, i64 %shl
-; CHECK: %remove_intptr1 = ptrtoint ptr addrspace(1) %remove_intptr to i64
-; CHECK: store i64 %remove_intptr1, ptr addrspace(1) %out, align 8
-define spir_kernel void @intptr_cast_i16(i16 addrspace(1)* %in, i64 addrspace(1)* %out) {
-entry:
-  %call = tail call i64 @__mux_get_global_id(i32 0)
-  %0 = ptrtoint i16 addrspace(1)* %in to i64
-  %shl = shl i64 %call, 2
-  %add = add i64 %shl, %0
-  store i64 %add, i64 addrspace(1)* %out, align 8
-  ret void
-}
-
-declare i64 @__mux_get_global_id(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/basic_vecz_mem2reg.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/basic_vecz_mem2reg.ll
similarity index 100%
rename from llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/basic_vecz_mem2reg.ll
rename to llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/basic_vecz_mem2reg.ll
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/builtin_inlining_mem.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_mem.ll
similarity index 100%
rename from llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/builtin_inlining_mem.ll
rename to llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_mem.ll
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_pointer_return.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_pointer_return.ll
index 18f0d818d694d..74f64b5b77c12 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_pointer_return.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_pointer_return.ll
@@ -41,10 +41,10 @@ define spir_kernel void @fract_v1(float* %xptr, float* %outptr, float* %ioutptr)
   %iout = load float, float* %iouta, align 4
   store float %iout, float* %arrayidx.iout, align 4
   ret void
-; CHECK: call spir_func float @_Z5fractfPf(float {{%.*}}, {{(ptr|float\*)}} nonnull {{%.*}})
-; CHECK: call spir_func float @_Z5fractfPf(float {{%.*}}, {{(ptr|float\*)}} nonnull {{%.*}})
-; CHECK: call spir_func float @_Z5fractfPf(float {{%.*}}, {{(ptr|float\*)}} nonnull {{%.*}})
-; CHECK: call spir_func float @_Z5fractfPf(float {{%.*}}, {{(ptr|float\*)}} nonnull {{%.*}})
+; CHECK: call spir_func float @_Z5fractfPf(float {{%.*}}, ptr nonnull {{%.*}})
+; CHECK: call spir_func float @_Z5fractfPf(float {{%.*}}, ptr nonnull {{%.*}})
+; CHECK: call spir_func float @_Z5fractfPf(float {{%.*}}, ptr nonnull {{%.*}})
+; CHECK: call spir_func float @_Z5fractfPf(float {{%.*}}, ptr nonnull {{%.*}})
 }
 
 define spir_kernel void @fract_v2(<2 x float>* %xptr, <2 x float>* %outptr, <2 x float>* %ioutptr) {
@@ -59,8 +59,8 @@ define spir_kernel void @fract_v2(<2 x float>* %xptr, <2 x float>* %outptr, <2 x
   %iout = load <2 x float>, <2 x float>* %iouta, align 8
   store <2 x float> %iout, <2 x float>* %arrayidx.iout, align 8
   ret void
-; CHECK: call spir_func <2 x float> @_Z5fractDv2_fPS_(<2 x float> {{%.*}}, {{(ptr|<2 x float>\*)}} nonnull {{%.*}})
-; CHECK: call spir_func <2 x float> @_Z5fractDv2_fPS_(<2 x float> {{%.*}}, {{(ptr|<2 x float>\*)}} nonnull {{%.*}})
-; CHECK: call spir_func <2 x float> @_Z5fractDv2_fPS_(<2 x float> {{%.*}}, {{(ptr|<2 x float>\*)}} nonnull {{%.*}})
-; CHECK: call spir_func <2 x float> @_Z5fractDv2_fPS_(<2 x float> {{%.*}}, {{(ptr|<2 x float>\*)}} nonnull {{%.*}})
+; CHECK: call spir_func <2 x float> @_Z5fractDv2_fPS_(<2 x float> {{%.*}}, ptr nonnull {{%.*}})
+; CHECK: call spir_func <2 x float> @_Z5fractDv2_fPS_(<2 x float> {{%.*}}, ptr nonnull {{%.*}})
+; CHECK: call spir_func <2 x float> @_Z5fractDv2_fPS_(<2 x float> {{%.*}}, ptr nonnull {{%.*}})
+; CHECK: call spir_func <2 x float> @_Z5fractDv2_fPS_(<2 x float> {{%.*}}, ptr nonnull {{%.*}})
 }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_ptrs.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_ptrs.ll
index 457568e631a90..853fb9229ce48 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_ptrs.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_ptrs.ll
@@ -21,7 +21,7 @@ target triple = "spir64-unknown-unknown"
 
 declare i64 @__mux_get_global_id(i32)
 
-define spir_kernel void @test_varying_if_ptr(i32 %a, i32** %b, i32* %on_true, i32* %on_false) {
+define spir_kernel void @test_varying_if_ptr(i32 %a, ptr %b, ptr %on_true, ptr %on_false) {
 entry:
   %conv = sext i32 %a to i64
   %call = call i64 @__mux_get_global_id(i32 0)
@@ -30,17 +30,19 @@ entry:
 
 if.then:
   %idxprom = sext i32 %a to i64
-  %arrayidx = getelementptr inbounds i32*, i32** %b, i64 %idxprom
-  store i32* %on_true, i32** %arrayidx, align 4
+  %arrayidx = getelementptr inbounds ptr, ptr %b, i64 %idxprom
+  store ptr %on_true, ptr %arrayidx, align 4
   br label %if.end
 
 if.else:
-  %arrayidx2 = getelementptr inbounds i32*, i32** %b, i64 42
-  store i32* %on_false, i32** %arrayidx2, align 4
+  %arrayidx2 = getelementptr inbounds ptr, ptr %b, i64 42
+  store ptr %on_false, ptr %arrayidx2, align 4
   br label %if.end
 
 if.end:
   ret void
+}
+
 ; CHECK:     define void @__vecz_b_masked_store4_u3ptru3ptrb(ptr [[A:%.*]], ptr [[B:%.*]], i1 [[MASK:%.*]]) [[ATTRS:#[0-9]+]] {
 ; CHECK:       br i1 [[MASK]], label %[[IF:.*]], label %[[EXIT:.*]]
 ; CHECK:     [[IF]]:
@@ -48,28 +50,5 @@ if.end:
 ; CHECK-NEXT:  br label %[[EXIT]]
 ; CHECK:     [[EXIT]]:
 ; CHECK-NEXT:  ret void
-}
-
-define spir_kernel void @test_varying_if_ptrptr(i32 %a, i32*** %b, i32** %on_true, i32** %on_false) {
-entry:
-  %conv = sext i32 %a to i64
-  %call = call i64 @__mux_get_global_id(i32 0)
-  %cmp = icmp eq i64 %conv, %call
-  br i1 %cmp, label %if.then, label %if.else
-
-if.then:
-  %idxprom = sext i32 %a to i64
-  %arrayidx = getelementptr inbounds i32**, i32*** %b, i64 %idxprom
-  store i32** %on_true, i32*** %arrayidx, align 4
-  br label %if.end
-
-if.else:
-  %arrayidx2 = getelementptr inbounds i32**, i32*** %b, i64 42
-  store i32** %on_false, i32*** %arrayidx2, align 4
-  br label %if.end
-
-if.end:
-  ret void
-}
 
 ; CHECK: attributes [[ATTRS]] = { norecurse nounwind }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/load_add_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/load_add_store.ll
similarity index 100%
rename from llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/load_add_store.ll
rename to llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/load_add_store.ll
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/masked_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_store.ll
similarity index 100%
rename from llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/masked_store.ll
rename to llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_store.ll
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/ternary_transform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform.ll
similarity index 100%
rename from llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/OpaquePointers/ternary_transform.ll
rename to llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform.ll

From 39f8f4652b05c364a28c4c732170d942745b2957 Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Tue, 16 Jan 2024 17:47:37 +0000
Subject: [PATCH 083/182] [NFC] Pick a style for const.

In cases where const may appear before or after the type, LLVM style is
to place it before the type, and we have many more instances ourselves
where we place it before the type than we do where we place it after.
This commit sets that style in .clang-format and does a reformat across
the code base.
---
 .../source/analysis/divergence_analysis.cpp   |  58 +++++-----
 .../analysis/instantiation_analysis.cpp       |  12 +--
 .../analysis/packetization_analysis.cpp       |   6 +-
 .../source/analysis/simd_width_analysis.cpp   |   6 +-
 .../vecz/source/analysis/stride_analysis.cpp  |   6 +-
 .../analysis/uniform_value_analysis.cpp       |  14 +--
 .../vectorizable_function_analysis.cpp        |   4 +-
 .../vecz/source/control_flow_boscc.cpp        |  56 +++++-----
 .../include/analysis/divergence_analysis.h    |  22 ++--
 .../include/analysis/instantiation_analysis.h |   2 +-
 .../source/include/analysis/stride_analysis.h |  10 +-
 .../analysis/vectorizable_function_analysis.h |   2 +-
 .../vecz/source/include/control_flow_boscc.h  |   2 +-
 .../vecz/source/include/offset_info.h         |   4 +-
 .../transform/control_flow_conversion_pass.h  |   2 +-
 .../source/include/vectorization_context.h    |  10 +-
 .../source/include/vectorization_helpers.h    |   6 +-
 .../vecz/source/offset_info.cpp               |  60 +++++------
 .../compiler_passes/vecz/source/pass.cpp      |  10 +-
 .../control_flow_conversion_pass.cpp          |  66 ++++++------
 .../inline_post_vectorization_pass.cpp        |   2 +-
 .../source/transform/instantiation_pass.cpp   |   4 +-
 .../transform/packetization_helpers.cpp       |  12 +--
 .../vecz/source/transform/packetizer.cpp      | 100 +++++++++---------
 .../source/transform/scalarization_pass.cpp   |   2 +-
 .../vecz/source/transform/scalarizer.cpp      |  12 +--
 .../transform/squash_small_vectors_pass.cpp   |  32 +++---
 .../transform/ternary_transform_pass.cpp      |  12 +--
 .../transform/uniform_reassociation_pass.cpp  |   4 +-
 .../vecz/source/vector_target_info.cpp        |  44 ++++----
 .../vecz/source/vector_target_info_riscv.cpp  |  40 +++----
 .../vecz/source/vectorization_context.cpp     |  36 +++----
 .../vecz/source/vectorization_helpers.cpp     |  16 +--
 .../vecz/source/vectorization_heuristics.cpp  |   4 +-
 .../vecz/source/vectorizer.cpp                |   6 +-
 .../vecz/tools/source/veczc.cpp               |   2 +-
 36 files changed, 343 insertions(+), 343 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/divergence_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/divergence_analysis.cpp
index 15003bbe08b34..f2f1b9844d804 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/divergence_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/divergence_analysis.cpp
@@ -40,8 +40,8 @@ namespace {
 using RPOT = ReversePostOrderTraversal<Function *>;
 }  // namespace
 
-BlockQueue::BlockQueue(DivergenceResult const &dr,
-                       DenseSet<BasicBlock *> const &blocks)
+BlockQueue::BlockQueue(const DivergenceResult &dr,
+                       const DenseSet<BasicBlock *> &blocks)
     : DR(dr) {
   indices.reserve(blocks.size());
   for (auto *const BB : blocks) {
@@ -56,7 +56,7 @@ BlockQueue::BlockQueue(DivergenceResult const &dr,
 const BasicBlockTag &BlockQueue::pop() {
   assert(!indices.empty() && "Trying to pop from an empty BlockQueue");
   std::pop_heap(indices.begin(), indices.end(), std::greater<index_type>());
-  auto const popped_index = indices.back();
+  const auto popped_index = indices.back();
   indices.pop_back();
 
   return DR.getBlockTag(popped_index);
@@ -67,7 +67,7 @@ void BlockQueue::push(size_t index) {
   std::push_heap(indices.begin(), indices.end(), std::greater<index_type>());
 }
 
-void BlockQueue::push(BasicBlock const *bb) {
+void BlockQueue::push(const BasicBlock *bb) {
   indices.push_back(DR.getTagIndex(bb));
   std::push_heap(indices.begin(), indices.end(), std::greater<index_type>());
 }
@@ -84,7 +84,7 @@ size_t DivergenceResult::getTagIndex(const llvm::BasicBlock *BB) const {
 
 BasicBlockTag &DivergenceResult::getOrCreateTag(BasicBlock *BB) {
   assert(BB && "Trying to get the tag of a null BasicBlock");
-  auto const &result = BBMap.try_emplace(BB, basicBlockTags.size());
+  const auto &result = BBMap.try_emplace(BB, basicBlockTags.size());
   if (result.second) {
     // It's a new map entry, so create the new tag and return it.
     basicBlockTags.emplace_back();
@@ -208,7 +208,7 @@ bool DivergenceResult::computeBlockOrdering(DominatorTree &DT) {
       graph.emplace_back();
       graph.back().BB = BB;
 
-      if (auto const *const LTag = getTag(BB).loop) {
+      if (const auto *const LTag = getTag(BB).loop) {
         graph.back().depth = LTag->loop->getLoopDepth();
       }
     }
@@ -221,8 +221,8 @@ bool DivergenceResult::computeBlockOrdering(DominatorTree &DT) {
   SmallVector<unsigned, 16> children;
   SmallVector<unsigned, 16> loopExits;
   while (!stack.empty()) {
-    auto const u = stack.pop_back_val();
-    auto const &uNode = graph[u];
+    const auto u = stack.pop_back_val();
+    const auto &uNode = graph[u];
 
     getTag(uNode.BB).pos = pos++;
 
@@ -233,7 +233,7 @@ bool DivergenceResult::computeBlockOrdering(DominatorTree &DT) {
     auto *const DTNode = DT.getNode(uNode.BB);
     unsigned stacked = 0;
     for (auto *const childNode : make_range(DTNode->begin(), DTNode->end())) {
-      auto const child = indexMap[childNode->getBlock()];
+      const auto child = indexMap[childNode->getBlock()];
       auto &cNode = graph[child];
       if (cNode.depth >= uNode.depth) {
         stack.push_back(child);
@@ -257,8 +257,8 @@ bool DivergenceResult::computeBlockOrdering(DominatorTree &DT) {
     std::sort(stack.end() - stacked, stack.end(), std::greater<unsigned>());
 
     if (!loopExits.empty()) {
-      unsigned const curDepth = stack.empty() ? 0 : graph[stack.back()].depth;
-      unsigned const depth = std::max(curDepth, graph[loopExits.back()].depth);
+      const unsigned curDepth = stack.empty() ? 0 : graph[stack.back()].depth;
+      const unsigned depth = std::max(curDepth, graph[loopExits.back()].depth);
       unsigned count = 0;
       while (!loopExits.empty() && depth == graph[loopExits.back()].depth) {
         stack.push_back(loopExits.pop_back_val());
@@ -299,7 +299,7 @@ void DivergenceResult::reorderTags(size_t n) {
 
 bool DivergenceResult::computeLoopOrdering() {
   loopOrdering.clear();
-  for (auto const &pair : LMap) {
+  for (const auto &pair : LMap) {
     loopOrdering.push_back(pair.second.get());
   }
 
@@ -327,7 +327,7 @@ void DivergenceResult::markDivCausing(BasicBlock &BB, DivergenceInfo &DI,
 
   // If a block is a joint point (blend) of `BB`, then it is divergent (unless
   // it is the post-dominator of `BB`).
-  auto const &joins = joinPoints(BB);
+  const auto &joins = joinPoints(BB);
   for (BasicBlock *const join : joins) {
     setFlag(*join, BlockDivergenceFlag::eBlockIsBlend);
     LLVM_DEBUG(dbgs() << "\tBlock " << join->getName() << " is blend\n");
@@ -363,7 +363,7 @@ void DivergenceResult::markDivLoopDivBlocks(BasicBlock &BB, Loop &L,
   // divergent).
   SmallVector<BasicBlock *, 1> exits;
   L.getExitBlocks(exits);
-  auto const &divergentExits = escapePoints(BB, L);
+  const auto &divergentExits = escapePoints(BB, L);
   for (BasicBlock *E : exits) {
     if (divergentExits.count(E)) {
       markDivergent(*E);
@@ -422,8 +422,8 @@ void DivergenceResult::markByAll(BasicBlock &src) {
     for (BasicBlock *D : descendants) {
       if (D != BB) {
         if (PDT.dominates(D, BB)) {
-          auto const DIndex = getTagIndex(D);
-          auto const *const DLoopTag = basicBlockTags[DIndex].loop;
+          const auto DIndex = getTagIndex(D);
+          const auto *const DLoopTag = basicBlockTags[DIndex].loop;
           // If we are not in a loop, or the loop we live in does not diverge
           // nor does the one englobing us if it exists, then mark by_all.
           Loop *parentLoop;
@@ -473,7 +473,7 @@ bool DivergenceResult::isReachable(BasicBlock *src, BasicBlock *dst,
       return true;
     }
 
-    auto const &BBTag = getTag(BB);
+    const auto &BBTag = getTag(BB);
     for (BasicBlock *succ : successors(BB)) {
       if (!allowLatch && BBTag.isLoopBackEdge(succ)) {
         continue;
@@ -495,13 +495,13 @@ DenseSet<BasicBlock *> DivergenceResult::joinPoints(BasicBlock &src) const {
   Function &F = *src.getParent();
   PostDominatorTree &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
 
-  DenseMap<BasicBlock const *, BasicBlock const *> defMap;
+  DenseMap<const BasicBlock *, const BasicBlock *> defMap;
   DenseSet<BasicBlock *> joins;
 
   BlockQueue queue(*this);
 
   auto schedule = [&defMap, &joins, &queue](BasicBlock *block,
-                                            BasicBlock const *defBlock) {
+                                            const BasicBlock *defBlock) {
     auto defIt = defMap.find(block);
     // First time we meet this block; not a join (yet).
     if (defIt == defMap.end()) {
@@ -533,9 +533,9 @@ DenseSet<BasicBlock *> DivergenceResult::joinPoints(BasicBlock &src) const {
       continue;
     }
 
-    BasicBlock const *const defBlock = defMap.find(cur)->second;
+    const BasicBlock *const defBlock = defMap.find(cur)->second;
 
-    auto const *const curLTag = curTag.loop;
+    const auto *const curLTag = curTag.loop;
     // If the successor is the header of a nested loop pretend its a single
     // node with the loop's exits as successors.
     if (curLTag && curLTag->header == cur) {
@@ -561,20 +561,20 @@ DenseSet<BasicBlock *> DivergenceResult::joinPoints(BasicBlock &src) const {
   return joins;
 }
 
-DenseSet<BasicBlock *> DivergenceResult::escapePoints(BasicBlock const &src,
-                                                      Loop const &L) const {
-  LoopTag const &LTag = getTag(&L);
+DenseSet<BasicBlock *> DivergenceResult::escapePoints(const BasicBlock &src,
+                                                      const Loop &L) const {
+  const LoopTag &LTag = getTag(&L);
 
   DenseSet<BasicBlock *> divergentExits;
 
-  DenseSet<BasicBlock const *> visited;
+  DenseSet<const BasicBlock *> visited;
   BlockQueue queue(*this);
 
   queue.push(&src);
   visited.insert(&src);
 
   while (!queue.empty()) {
-    auto const &BBTag = queue.pop();
+    const auto &BBTag = queue.pop();
     auto *const BB = BBTag.BB;
 
     // We found a divergent loop exit.
@@ -712,7 +712,7 @@ DivergenceResult DivergenceAnalysis::run(llvm::Function &F,
 
   while (!uniformBranches.empty()) {
     // Partition the list so all the varying branches are grouped at the end.
-    auto const varyingBranches =
+    const auto varyingBranches =
         std::partition(uniformBranches.begin(), uniformBranches.end(),
                        [&UVR](std::pair<BasicBlock *, Value *> &p) -> bool {
                          return !UVR.isVarying(p.second);
@@ -726,7 +726,7 @@ DivergenceResult DivergenceAnalysis::run(llvm::Function &F,
       // Find blocks diverged by varying branch block.
       Res.markDivCausing(*BB, divergenceInfo, PDT);
 
-      if (auto const *const LTag = Res.getTag(BB).loop) {
+      if (const auto *const LTag = Res.getTag(BB).loop) {
         Loop *L = LTag->loop;
         while (L) {
           // If BB is a varying branch, mark the loop as diverging if any two
@@ -757,7 +757,7 @@ DivergenceResult DivergenceAnalysis::run(llvm::Function &F,
     // divergent loops are varying.
     bool updated = false;
     for (BasicBlock *BB : divergenceInfo) {
-      bool const exitedLoop = Res.getTag(BB).outermostExitedLoop;
+      const bool exitedLoop = Res.getTag(BB).outermostExitedLoop;
       for (Instruction &I : *BB) {
         if (PHINode *PHI = dyn_cast<PHINode>(&I)) {
           // Loop exits might have constant phi nodes (lcssa value).
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/instantiation_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/instantiation_analysis.cpp
index 3458dd504f956..0ed8726694daa 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/instantiation_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/instantiation_analysis.cpp
@@ -40,7 +40,7 @@ bool analyzeMemOp(MemOp &Op) {
   return analyzeType(Op.getDataType());
 }
 
-bool analyzeCall(VectorizationContext const &Ctx, CallInst *CI) {
+bool analyzeCall(const VectorizationContext &Ctx, CallInst *CI) {
   Function *Callee = CI->getCalledFunction();
   VECZ_FAIL_IF(!Callee);
 
@@ -63,7 +63,7 @@ bool analyzeCall(VectorizationContext const &Ctx, CallInst *CI) {
     return true;
   }
 
-  auto const Props = Ctx.builtins().analyzeBuiltin(*Callee).properties;
+  const auto Props = Ctx.builtins().analyzeBuiltin(*Callee).properties;
 
   // Intrinsics without side-effects can be safely instantiated.
   if (Callee->isIntrinsic() &&
@@ -88,7 +88,7 @@ bool analyzeCall(VectorizationContext const &Ctx, CallInst *CI) {
   return analyzeType(CI->getType());
 }
 
-bool analyzeAlloca(VectorizationContext const &Ctx, AllocaInst *alloca) {
+bool analyzeAlloca(const VectorizationContext &Ctx, AllocaInst *alloca) {
   // Possibly, we could packetize by creating a wider array, but for now let's
   // just let instantiation deal with it.
   if (alloca->isArrayAllocation()) {
@@ -100,14 +100,14 @@ bool analyzeAlloca(VectorizationContext const &Ctx, AllocaInst *alloca) {
   // have to be sure it divides the type allocation size, otherwise only the
   // first vector element would necessarily be correctly aligned.
   auto *const dataTy = alloca->getAllocatedType();
-  uint64_t const memSize = Ctx.dataLayout()->getTypeAllocSize(dataTy);
-  uint64_t const align = alloca->getAlign().value();
+  const uint64_t memSize = Ctx.dataLayout()->getTypeAllocSize(dataTy);
+  const uint64_t align = alloca->getAlign().value();
   return (align != 0 && (memSize % align) != 0);
 }
 }  // namespace
 
 namespace vecz {
-bool needsInstantiation(VectorizationContext const &Ctx, Instruction &I) {
+bool needsInstantiation(const VectorizationContext &Ctx, Instruction &I) {
   if (CallInst *CI = dyn_cast<CallInst>(&I)) {
     return analyzeCall(Ctx, CI);
   } else if (LoadInst *Load = dyn_cast<LoadInst>(&I)) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/packetization_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/packetization_analysis.cpp
index fc840022d31ba..f6f4d9a920574 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/packetization_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/packetization_analysis.cpp
@@ -107,7 +107,7 @@ void PacketizationAnalysisResult::markForPacketization(Value *V) {
   if (mo) {
     auto *const ptr = mo->getPointerOperand();
     if (ptr && UVR.isVarying(ptr)) {
-      auto const *info = SAR.getInfo(ptr);
+      const auto *info = SAR.getInfo(ptr);
       assert(info && "markForPacketization: Unable to obtain stride info");
 
       bool hasValidStride = info->hasStride();
@@ -123,7 +123,7 @@ void PacketizationAnalysisResult::markForPacketization(Value *V) {
           // No interleaved memops exist for vector element types or pointer
           // types. We can only vectorize pointer loads/stores or widen vector
           // load/stores if they are contiguous.
-          auto const stride = info->getConstantMemoryStride(
+          const auto stride = info->getConstantMemoryStride(
               eltTy, &F.getParent()->getDataLayout());
           if (stride != 1) {
             hasValidStride = false;
@@ -152,7 +152,7 @@ void PacketizationAnalysisResult::markForPacketization(Value *V) {
   }
 
   if (auto *const intrinsic = dyn_cast<llvm::IntrinsicInst>(I)) {
-    auto const intrinsicID = intrinsic->getIntrinsicID();
+    const auto intrinsicID = intrinsic->getIntrinsicID();
     if (intrinsicID == llvm::Intrinsic::lifetime_end ||
         intrinsicID == llvm::Intrinsic::lifetime_start) {
       // We don't trace through lifetime intrinsics.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/simd_width_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/simd_width_analysis.cpp
index fa5c64bb78480..a81ae64dd9cad 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/simd_width_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/simd_width_analysis.cpp
@@ -55,7 +55,7 @@ bool definedOrUsedInLoop(Value *V, Loop *L) {
     return true;
   }
 
-  auto const *const I = dyn_cast<Instruction>(V);
+  const auto *const I = dyn_cast<Instruction>(V);
   if (I && L->contains(I)) {
     // It's defined in the current loop.
     return true;
@@ -65,8 +65,8 @@ bool definedOrUsedInLoop(Value *V, Loop *L) {
   // Values defined outwith the loop, but used only by a PHI node within it must
   // be loop-carried variable initial values. If these are not otherwise used
   // directly within the loop, then they are not really live inside the loop.
-  for (auto const *const U : V->users()) {
-    auto const *const I = dyn_cast<Instruction>(U);
+  for (const auto *const U : V->users()) {
+    const auto *const I = dyn_cast<Instruction>(U);
     if (I && !isa<PHINode>(I) && L->contains(I)) {
       return true;
     }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/stride_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/stride_analysis.cpp
index 126558e518f38..673997d4fae29 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/stride_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/stride_analysis.cpp
@@ -35,7 +35,7 @@ using namespace llvm;
 llvm::AnalysisKey StrideAnalysis::Key;
 
 OffsetInfo &StrideAnalysisResult::analyze(Value *V) {
-  auto const find = analyzed.find(V);
+  const auto find = analyzed.find(V);
   if (find != analyzed.end()) {
     return find->second;
   }
@@ -44,7 +44,7 @@ OffsetInfo &StrideAnalysisResult::analyze(Value *V) {
   // the constructor itself can create more things in the map and constructing
   // it in-place could result in the storage being re-allocated while the
   // constructor is still running.
-  auto const OI = OffsetInfo(*this, V);
+  const auto OI = OffsetInfo(*this, V);
   return analyzed.try_emplace(V, OI).first->second;
 }
 
@@ -67,7 +67,7 @@ StrideAnalysisResult::StrideAnalysisResult(llvm::Function &f,
 }
 
 void StrideAnalysisResult::manifestAll(IRBuilder<> &B) {
-  auto const saved = B.GetInsertPoint();
+  const auto saved = B.GetInsertPoint();
   for (auto &info : analyzed) {
     info.second.manifest(B, *this);
   }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp
index e2a696aa06b2c..4b1f571feb9e7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp
@@ -156,7 +156,7 @@ void UniformValueResult::findVectorLeaves(
           // uniform then add it to the leaves
           if (!Callee->isIntrinsic() && CI->use_empty()) {
             // Try to identify the called function
-            auto const Builtin = BI.analyzeBuiltin(*Callee);
+            const auto Builtin = BI.analyzeBuiltin(*Callee);
             if (!Builtin.isValid()) {
               Leaves.push_back(CI);
             }
@@ -218,8 +218,8 @@ void UniformValueResult::findVectorRoots(std::vector<Value *> &Roots) const {
       if (!CI || !CI->getCalledFunction()) {
         continue;
       }
-      auto const Builtin = BI.analyzeBuiltinCall(*CI, dimension);
-      auto const Uniformity = Builtin.uniformity;
+      const auto Builtin = BI.analyzeBuiltinCall(*CI, dimension);
+      const auto Uniformity = Builtin.uniformity;
       if (Uniformity == compiler::utils::eBuiltinUniformityInstanceID ||
           Uniformity == compiler::utils::eBuiltinUniformityMaybeInstanceID) {
         // Calls to `get_global_id`/`get_local_id` are roots.
@@ -280,8 +280,8 @@ void UniformValueResult::markVaryingValues(Value *V, Value *From) {
     Function *Callee = CI->getCalledFunction();
     if (Callee) {
       compiler::utils::BuiltinInfo &BI = Ctx.builtins();
-      auto const Builtin = BI.analyzeBuiltinCall(*CI, dimension);
-      auto const Uniformity = Builtin.uniformity;
+      const auto Builtin = BI.analyzeBuiltinCall(*CI, dimension);
+      const auto Uniformity = Builtin.uniformity;
       if (Uniformity == compiler::utils::eBuiltinUniformityAlways) {
         return;
       }
@@ -405,7 +405,7 @@ Value *UniformValueResult::extractMemBase(Value *Address) {
       // If it's a simple loop iterator, the base can be analyzed from the
       // initial value.
       if (GEP->getPointerOperand() == Phi) {
-        for (auto const &index : GEP->indices()) {
+        for (const auto &index : GEP->indices()) {
           if (isVarying(index.get())) {
             return nullptr;
           }
@@ -455,7 +455,7 @@ UniformValueResult UniformValueAnalysis::run(
       // The same goes for the atomic builtins as well
       if (CallInst *CI = dyn_cast<CallInst>(&I)) {
         if (Function *Callee = CI->getCalledFunction()) {
-          auto const Builtin = BI.analyzeBuiltin(*Callee);
+          const auto Builtin = BI.analyzeBuiltin(*Callee);
           if (Builtin.properties & compiler::utils::eBuiltinPropertyAtomic) {
             Res.markVaryingValues(&I);
             continue;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorizable_function_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorizable_function_analysis.cpp
index 6043238dd4886..988f3208ac213 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorizable_function_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorizable_function_analysis.cpp
@@ -73,7 +73,7 @@ bool canVectorize(const Instruction &I, const VectorizationContext &Ctx) {
       // All builtins should be vectorizable, in principle. "Invalid builtins"
       // correspond to user functions.
       const compiler::utils::BuiltinInfo &BI = Ctx.builtins();
-      auto const Builtin = BI.analyzeBuiltin(*Callee);
+      const auto Builtin = BI.analyzeBuiltin(*Callee);
       if (!Builtin.isValid()) {
         // If it is a user function missing a definition, we cannot safely
         // instantiate it. For example, what if it contains calls to
@@ -100,7 +100,7 @@ bool canVectorize(const Instruction &I, const VectorizationContext &Ctx) {
 ///
 /// @return the Instruction that prevents the function from vectorizing, or
 /// nullptr if the function can be vectorized.
-Value const *canVectorize(const Function &F, const VectorizationContext &Ctx) {
+const Value *canVectorize(const Function &F, const VectorizationContext &Ctx) {
   // Look for things that are not (yet?) supported.
   for (const BasicBlock &BB : F) {
     for (const Instruction &I : BB) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_boscc.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_boscc.cpp
index 4c24097961066..0620b26722157 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_boscc.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_boscc.cpp
@@ -140,8 +140,8 @@ bool ControlFlowConversionState::BOSCCGadget::duplicateUniformRegions() {
     }
     std::sort(predicatedBlockIndices.begin(), predicatedBlockIndices.end());
 
-    for (auto const index : predicatedBlockIndices) {
-      auto const &BTag = DR->getBlockTag(index);
+    for (const auto index : predicatedBlockIndices) {
+      const auto &BTag = DR->getBlockTag(index);
       auto *const B = BTag.BB;
       auto *const LTag = BTag.loop;
 
@@ -200,7 +200,7 @@ bool ControlFlowConversionState::BOSCCGadget::duplicateUniformRegions() {
 
   // Fix the duplicated instructions arguments.
   for (BasicBlock *B : newBlocks) {
-    bool const notHeader = !DR->getTag(B).isLoopHeader();
+    const bool notHeader = !DR->getTag(B).isLoopHeader();
 
     for (Instruction &I : *B) {
       RemapInstruction(&I, VMap,
@@ -235,7 +235,7 @@ bool ControlFlowConversionState::BOSCCGadget::duplicateUniformRegions() {
 }
 
 bool ControlFlowConversionState::BOSCCGadget::duplicateUniformLoops(Loop *L) {
-  LoopTag const &LTag = DR->getTag(L);
+  const LoopTag &LTag = DR->getTag(L);
   Loop *const uniformL = LI->AllocateLoop();
 
   // Either add 'uniformL' as a child of a loop or as a top level loop.
@@ -286,9 +286,9 @@ bool ControlFlowConversionState::BOSCCGadget::duplicateUniformLoops(Loop *L) {
 }
 
 bool ControlFlowConversionState::BOSCCGadget::createUniformRegions(
-    DenseSet<BasicBlock *> const &noDuplicateBlocks) {
+    const DenseSet<BasicBlock *> &noDuplicateBlocks) {
   auto discardRegion =
-      [&noDuplicateBlocks](UniformRegion const &region) -> bool {
+      [&noDuplicateBlocks](const UniformRegion &region) -> bool {
     // To determine if it is worth it to duplicate the uniform region, we must
     // take several elements into account:
     // - The length of the duplicated code
@@ -412,11 +412,11 @@ bool ControlFlowConversionState::BOSCCGadget::createUniformRegions(
   };
 
   // Collect all the blocks in the worklist
-  auto const &DCBI = DR->getBlockOrdering();
-  size_t const numBlocks = DCBI.size();
+  const auto &DCBI = DR->getBlockOrdering();
+  const size_t numBlocks = DCBI.size();
   SmallVector<SESEInfo, 16> SESE;
   SESE.reserve(numBlocks);
-  for (auto const &BBTag : DCBI) {
+  for (const auto &BBTag : DCBI) {
     SESE.emplace_back();
     SESE.back().BB = BBTag.BB;
   }
@@ -436,7 +436,7 @@ bool ControlFlowConversionState::BOSCCGadget::createUniformRegions(
 
     uniformRegions.emplace_back();
     auto &region = uniformRegions.back();
-    size_t const entryPos = i;
+    const size_t entryPos = i;
     size_t exitPos = 0u;
     size_t firstPredicated = numBlocks;
 
@@ -447,11 +447,11 @@ bool ControlFlowConversionState::BOSCCGadget::createUniformRegions(
 
     // If we are in a divergent loop, then the whole loop needs a uniform
     // version.
-    auto const *const entryLoopTag = DR->getTag(info.BB).loop;
+    const auto *const entryLoopTag = DR->getTag(info.BB).loop;
     if (entryLoopTag && entryLoopTag->isLoopDivergent()) {
       auto *const loop = entryLoopTag->loop;
       for (BasicBlock *loopB : loop->blocks()) {
-        size_t const pos = DR->getTagIndex(loopB);
+        const size_t pos = DR->getTagIndex(loopB);
         firstPredicated = std::min(firstPredicated, pos);
         SESE[pos].predicated = true;
         region.predicatedBlocks.insert(loopB);
@@ -467,7 +467,7 @@ bool ControlFlowConversionState::BOSCCGadget::createUniformRegions(
     while (!stack.empty()) {
       auto *const cur = SESE[stack.pop_back_val()].BB;
       for (BasicBlock *succ : successors(cur)) {
-        size_t const succPos = DR->getTagIndex(succ);
+        const size_t succPos = DR->getTagIndex(succ);
 
         auto *const succLoopTag = DR->getBlockTag(succPos).loop;
         if ((!succLoopTag || !succLoopTag->isLoopDivergent()) &&
@@ -582,7 +582,7 @@ bool ControlFlowConversionState::BOSCCGadget::connectBOSCCRegions() {
       }
     }
 
-    auto const &LTag = DR->getTag(L);
+    const auto &LTag = DR->getTag(L);
     BasicBlock *preheader = LTag.preheader;
     if (!VMap.count(preheader)) {
       auto *T = preheader->getTerminator();
@@ -624,7 +624,7 @@ bool ControlFlowConversionState::BOSCCGadget::connectBOSCCRegions() {
 
   // If a uniform block targets a predicated block, the latter needs its
   // operands that have a uniform and predicated version blended.
-  for (auto const &predicatedBTag : DR->getBlockOrdering()) {
+  for (const auto &predicatedBTag : DR->getBlockOrdering()) {
     if (BasicBlock *uniformB = getBlock(predicatedBTag.BB)) {
       for (BasicBlock *succ : successors(uniformB)) {
         // We've found a uniform block that targets a predicated block prior
@@ -780,7 +780,7 @@ bool ControlFlowConversionState::BOSCCGadget::connectUniformRegion(
 
   BasicBlock *connectionPoint = target;
 
-  auto const *const LTag = DR->getTag(predicatedB).loop;
+  const auto *const LTag = DR->getTag(predicatedB).loop;
   const bool needsStore = LTag && LMap.count(LTag->loop);
   if (needsStore) {
     // 'store' is a block that will contain all the uniform versions of the
@@ -856,7 +856,7 @@ bool ControlFlowConversionState::BOSCCGadget::connectUniformRegion(
 
 bool ControlFlowConversionState::BOSCCGadget::blendConnectionPoint(
     BasicBlock *CP, const std::pair<BasicBlock *, BasicBlock *> &incoming) {
-  auto const *const CPLTag = DR->getTag(CP).loop;
+  const auto *const CPLTag = DR->getTag(CP).loop;
   for (auto &region : uniformRegions) {
     // Create blend instructions at each blend point following 'CP'.
     if (region.contains(CP) || (CP == region.exitBlock) ||
@@ -964,7 +964,7 @@ bool ControlFlowConversionState::BOSCCGadget::blendFinalize() {
     }
   }
 
-  for (auto const &tag : DR->getBlockOrdering()) {
+  for (const auto &tag : DR->getBlockOrdering()) {
     BasicBlock *blendPoint = tag.BB;
     if (blendBlocks.count(blendPoint) == 0) {
       continue;
@@ -1124,7 +1124,7 @@ Loop *ControlFlowConversionState::BOSCCGadget::getLoop(Loop *L) {
 
 void ControlFlowConversionState::BOSCCGadget::getUnduplicatedEntryBlocks(
     SmallVectorImpl<BasicBlock *> &blocks) const {
-  for (auto const &region : uniformRegions) {
+  for (const auto &region : uniformRegions) {
     if (VMap.count(region.entryBlock) == 0) {
       blocks.push_back(region.entryBlock);
     }
@@ -1192,7 +1192,7 @@ void ControlFlowConversionState::BOSCCGadget::updateValue(Value *from,
 }
 
 bool ControlFlowConversionState::BOSCCGadget::linkMasks() {
-  for (auto const &BTag : DR->getBlockOrdering()) {
+  for (const auto &BTag : DR->getBlockOrdering()) {
     auto *const BB = BTag.BB;
     if (auto *const uniformB = getBlock(BB)) {
       // Both sets of masks had better exist by this point.
@@ -1287,7 +1287,7 @@ bool ControlFlowConversionState::BOSCCGadget::updateLoopBlendValues(
 
 bool ControlFlowConversionState::BOSCCGadget::computeBlockOrdering() {
   // Create a map from entry blocks to their uniform regions
-  DenseMap<BasicBlock *, UniformRegion const *> entryMap;
+  DenseMap<BasicBlock *, const UniformRegion *> entryMap;
   unsigned maxUBlocks = 0;
   for (const auto &region : uniformRegions) {
     if (!region.uniformBlocks.empty()) {
@@ -1302,11 +1302,11 @@ bool ControlFlowConversionState::BOSCCGadget::computeBlockOrdering() {
   // Also note that we can't use pointers to BasicBlockTags here since
   // `PassState.computeBlockOrdering()` re-orders the tags vector.
   SmallVector<BasicBlock *, 16> filtered;
-  for (auto const &tag : DR->getBlockOrdering()) {
+  for (const auto &tag : DR->getBlockOrdering()) {
     filtered.push_back(tag.BB);
-    auto const found = entryMap.find(tag.BB);
+    const auto found = entryMap.find(tag.BB);
     if (found != entryMap.end()) {
-      auto const *const region = found->second;
+      const auto *const region = found->second;
       filtered.resize(filtered.size() + region->uniformBlocks.size());
     }
   }
@@ -1320,17 +1320,17 @@ bool ControlFlowConversionState::BOSCCGadget::computeBlockOrdering() {
   for (auto it = filtered.begin(), ie = filtered.end(); it != ie;) {
     auto *const BB = *it;
 
-    auto const found = entryMap.find(BB);
+    const auto found = entryMap.find(BB);
     if (found != entryMap.end()) {
       // If the entry block of the region is NOT duplicated, add the uniform
       // blocks after it.
-      bool const entryDupe = getBlock(BB);
+      const bool entryDupe = getBlock(BB);
       if (!entryDupe) {
         ++it;
       }
 
       // Gather the indices of the uniform blocks and sort them.
-      auto const &region = *found->second;
+      const auto &region = *found->second;
       uniformBlocks.clear();
       for (auto *const uBB : region.uniformBlocks) {
         uniformBlocks.push_back(DR->getTagIndex(uBB));
@@ -1338,7 +1338,7 @@ bool ControlFlowConversionState::BOSCCGadget::computeBlockOrdering() {
       std::sort(uniformBlocks.begin(), uniformBlocks.end());
 
       // Insert the uniform blocks into the gap.
-      for (auto const uBBi : uniformBlocks) {
+      for (const auto uBBi : uniformBlocks) {
         (*it++) = DR->getBlockTag(uBBi).BB;
       }
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/divergence_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/divergence_analysis.h
index f941eda04a3c0..49d2eafa5186c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/divergence_analysis.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/divergence_analysis.h
@@ -95,7 +95,7 @@ struct BlockQueue {
   using index_type = uint32_t;
   using index_list = std::vector<index_type>;
 
-  DivergenceResult const &DR;
+  const DivergenceResult &DR;
 
   /// @brief The DCBI indices of the blocks in the queue, in min-heap order.
   /// Since we can easily retrieve the BasicBlockTag from the DCBI ordered
@@ -105,11 +105,11 @@ struct BlockQueue {
   index_list indices;
 
   /// @brief Constructs an empty BlockQueue
-  BlockQueue(DivergenceResult const &dr) : DR(dr){};
+  BlockQueue(const DivergenceResult &dr) : DR(dr){};
 
   /// @brief Constructs a BlockQueue from a set of blocks.
-  BlockQueue(DivergenceResult const &dr,
-             llvm::DenseSet<llvm::BasicBlock *> const &blocks);
+  BlockQueue(const DivergenceResult &dr,
+             const llvm::DenseSet<llvm::BasicBlock *> &blocks);
 
   /// @brief Returns the number of blocks in the queue.
   size_t size() const { return indices.size(); }
@@ -122,7 +122,7 @@ struct BlockQueue {
 
   /// @brief Pushes a block on the queue by pointer.
   /// Prefer `push(size_t)` if the tag index is available.
-  void push(llvm::BasicBlock const *bb);
+  void push(const llvm::BasicBlock *bb);
 
   /// @brief Pops a block from the queue and returns it.
   const BasicBlockTag &pop();
@@ -178,7 +178,7 @@ struct BasicBlockTag {
   /// @brief Deleted address-of operator
   BasicBlockTag *operator&() = delete;
   /// @brief Deleted const address-of operator
-  BasicBlockTag const *operator&() const = delete;
+  const BasicBlockTag *operator&() const = delete;
 
   BlockDivergenceFlag divergenceFlag = BlockDivergenceFlag::eBlockHasNoFlag;
 
@@ -236,7 +236,7 @@ class DivergenceResult {
   /// @brief Gets a BasicBlockTag by its DCBI index
   /// @param[in] index the DCBI index
   /// @returns reference to the BasicBlockTag
-  BasicBlockTag const &getBlockTag(size_t index) const {
+  const BasicBlockTag &getBlockTag(size_t index) const {
     return basicBlockTags[index];
   }
 
@@ -259,7 +259,7 @@ class DivergenceResult {
     return basicBlockTags[getTagIndex(BB)];
   }
 
-  BasicBlockTag const &getTag(const llvm::BasicBlock *BB) const {
+  const BasicBlockTag &getTag(const llvm::BasicBlock *BB) const {
     return basicBlockTags[getTagIndex(BB)];
   }
 
@@ -375,7 +375,7 @@ class DivergenceResult {
                    bool allowLatch = false) const;
 
   /// @brief List of blocks having a divergent branch.
-  std::vector<llvm::BasicBlock *> const &getDivCausingBlocks() const {
+  const std::vector<llvm::BasicBlock *> &getDivCausingBlocks() const {
     return divCausingBlocks;
   }
 
@@ -417,8 +417,8 @@ class DivergenceResult {
   /// @param[in] src Divergent branch
   /// @param[in] L Divergent loop
   /// @return List of exit blocks some work-item may leave through.
-  llvm::DenseSet<llvm::BasicBlock *> escapePoints(llvm::BasicBlock const &src,
-                                                  llvm::Loop const &L) const;
+  llvm::DenseSet<llvm::BasicBlock *> escapePoints(const llvm::BasicBlock &src,
+                                                  const llvm::Loop &L) const;
 
   /// @brief the Function the analysis was run on
   llvm::Function &F;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/instantiation_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/instantiation_analysis.h
index 1a837347d6d39..cadb756cef6a5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/instantiation_analysis.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/instantiation_analysis.h
@@ -30,7 +30,7 @@ class VectorizationContext;
 /// @param[in] I Instruction to analyze.
 ///
 /// @return true iff the instruction requires instantiation.
-bool needsInstantiation(VectorizationContext const &Ctx, llvm::Instruction &I);
+bool needsInstantiation(const VectorizationContext &Ctx, llvm::Instruction &I);
 };  // namespace vecz
 
 #endif  // VECZ_ANALYSIS_INSTANTIATION_ANALYSIS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/stride_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/stride_analysis.h
index 45561ea9285ec..5dc1c64676d08 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/stride_analysis.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/stride_analysis.h
@@ -57,13 +57,13 @@ class StrideAnalysisResult {
 
   /// @brief gets a pointer to the info struct for this value's analysis.
   OffsetInfo *getInfo(llvm::Value *V) {
-    auto const find = analyzed.find(V);
+    const auto find = analyzed.find(V);
     return (find != analyzed.end()) ? &find->second : nullptr;
   }
 
   /// @brief gets a pointer to the info struct for this value's analysis.
-  OffsetInfo const *getInfo(llvm::Value *V) const {
-    auto const find = analyzed.find(V);
+  const OffsetInfo *getInfo(llvm::Value *V) const {
+    const auto find = analyzed.find(V);
     return (find != analyzed.end()) ? &find->second : nullptr;
   }
 
@@ -75,8 +75,8 @@ class StrideAnalysisResult {
   /// not allowed to construct them during an analysis pass. However, note that
   /// information about manifested stride `Value`s will survive until the
   /// analysis is invalidated.
-  OffsetInfo const &manifest(llvm::IRBuilder<> &B, llvm::Value *V) {
-    auto const find = analyzed.find(V);
+  const OffsetInfo &manifest(llvm::IRBuilder<> &B, llvm::Value *V) {
+    const auto find = analyzed.find(V);
     assert(find != analyzed.end() &&
            "Trying to manifest unanalyzed OffsetInfo");
     return find->second.manifest(B, *this);
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorizable_function_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorizable_function_analysis.h
index bb4f248b121a8..4309e05aa476f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorizable_function_analysis.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorizable_function_analysis.h
@@ -42,7 +42,7 @@ class VectorizableFunctionAnalysis
 
     /// @brief If the function can not be vectorized, the value (if any) that
     /// is the cause of the problem.
-    llvm::Value const *failedAt = nullptr;
+    const llvm::Value *failedAt = nullptr;
 
    public:
     /// @brief Handle invalidation events from the new pass manager.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/control_flow_boscc.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/control_flow_boscc.h
index d24c0ca6b31bb..51bd562be6f00 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/control_flow_boscc.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/control_flow_boscc.h
@@ -221,7 +221,7 @@ class ControlFlowConversionState::BOSCCGadget final {
   /// @brief Create uniform regions
   /// @return true if no problem occurred, false otherwise.
   bool createUniformRegions(
-      llvm::DenseSet<llvm::BasicBlock *> const &noDuplicateBlocks);
+      const llvm::DenseSet<llvm::BasicBlock *> &noDuplicateBlocks);
   /// @brief Duplicate a loop, creating a new looptag and updating all the
   ///        relevant information.
   /// @param[in] L The loop to duplicate
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/offset_info.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/offset_info.h
index 77e2c49da72db..32d48a351988d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/offset_info.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/offset_info.h
@@ -124,7 +124,7 @@ struct OffsetInfo {
   /// @return The memory stride as number of elements.
 
   uint64_t getConstantMemoryStride(llvm::Type *PtrEleTy,
-                                   llvm::DataLayout const *DL) const;
+                                   const llvm::DataLayout *DL) const;
 
   /// @brief Convert the bytewise stride into an element-wise stride based on
   /// the data type and data layout, building instructions where needed. Note
@@ -135,7 +135,7 @@ struct OffsetInfo {
   /// @param[in] DL The Data Layout.
   /// @return The memory stride as number of elements.
   llvm::Value *buildMemoryStride(llvm::IRBuilder<> &B, llvm::Type *PtrEleTy,
-                                 llvm::DataLayout const *DL) const;
+                                 const llvm::DataLayout *DL) const;
 
   /// @brief Create Values that represent or compute strides.
   ///
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/control_flow_conversion_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/control_flow_conversion_pass.h
index 6290119f88df1..417f57a218516 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/control_flow_conversion_pass.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/control_flow_conversion_pass.h
@@ -121,7 +121,7 @@ class ControlFlowConversionState {
   /// @param[in] BB the BasicBlock
   /// @returns a reference to the MaskInfo
   const MaskInfo &getMaskInfo(llvm::BasicBlock *BB) const {
-    auto const found = MaskInfos.find(BB);
+    const auto found = MaskInfos.find(BB);
     assert(found != MaskInfos.end() &&
            "Mask Info not constructed for Basic Block!");
     return found->second;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_context.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_context.h
index ad140dd609697..756e9db6bf3a3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_context.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_context.h
@@ -268,13 +268,13 @@ class VectorizationContext {
   /// @param[in] F The empty (declaration only) function to emit the body in
   /// @param[in] Desc The MemOpDesc for the memory operation
   /// @returns true on success, false otherwise
-  bool emitMaskedMemOpBody(llvm::Function &F, MemOpDesc const &Desc) const;
+  bool emitMaskedMemOpBody(llvm::Function &F, const MemOpDesc &Desc) const;
   /// @brief Emit the body for the interleaved load or store internal builtins
   ///
   /// @param[in] F The empty (declaration only) function to emit the body in
   /// @param[in] Desc The MemOpDesc for the memory operation
   /// @returns true on success, false otherwise
-  bool emitInterleavedMemOpBody(llvm::Function &F, MemOpDesc const &Desc) const;
+  bool emitInterleavedMemOpBody(llvm::Function &F, const MemOpDesc &Desc) const;
   /// @brief Emit the body for the masked interleaved load/store internal
   /// builtins
   ///
@@ -282,21 +282,21 @@ class VectorizationContext {
   /// @param[in] Desc The MemOpDesc for the memory operation
   /// @returns true on success, false otherwise
   bool emitMaskedInterleavedMemOpBody(llvm::Function &F,
-                                      MemOpDesc const &Desc) const;
+                                      const MemOpDesc &Desc) const;
   /// @brief Emit the body for the scatter or gather internal builtins
   ///
   /// @param[in] F The empty (declaration only) function to emit the body in
   /// @param[in] Desc The MemOpDesc for the memory operation
   /// @returns true on success, false otherwise
   bool emitScatterGatherMemOpBody(llvm::Function &F,
-                                  MemOpDesc const &Desc) const;
+                                  const MemOpDesc &Desc) const;
   /// @brief Emit the body for the masked scatter or gather internal builtins
   ///
   /// @param[in] F The empty (declaration only) function to emit the body in
   /// @param[in] Desc The MemOpDesc for the memory operation
   /// @returns true on success, false otherwise
   bool emitMaskedScatterGatherMemOpBody(llvm::Function &F,
-                                        MemOpDesc const &Desc) const;
+                                        const MemOpDesc &Desc) const;
   /// @brief Add the masked function to the tracking set
   ///
   /// @param[in] F The function to add
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_helpers.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_helpers.h
index febd373bf5abc..c4b004892ffd9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_helpers.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_helpers.h
@@ -62,12 +62,12 @@ decodeVectorizedFunctionName(llvm::StringRef Name);
 /// @param[in] VU the Vectorization Unit of the scalar function to clone.
 ///
 /// @return The cloned function.
-llvm::Function *cloneFunctionToVector(VectorizationUnit const &VU);
+llvm::Function *cloneFunctionToVector(const VectorizationUnit &VU);
 
 /// @brief Create a copy of the scalar functions debug info metatadata
 //         nodes and set the scope of the copied DI to the vectorized
 //         function.
-void cloneDebugInfo(VectorizationUnit const &VU);
+void cloneDebugInfo(const VectorizationUnit &VU);
 
 /// @brief Clone OpenCL related metadata from the scalar kernel to the
 /// vectorized one.
@@ -76,7 +76,7 @@ void cloneDebugInfo(VectorizationUnit const &VU);
 /// 'opencl.kernel_wg_size_info' metadata from the scalar kernel to the
 /// vectorized one. Obviously, the kernel itself has to be cloned before
 /// calling this function.
-void cloneOpenCLMetadata(VectorizationUnit const &VU);
+void cloneOpenCLMetadata(const VectorizationUnit &VU);
 }  // namespace vecz
 
 #endif  // VECZ_VECTORIZATION_HELPERS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp
index 41bc88eefe6bd..218f17d2984a2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp
@@ -65,14 +65,14 @@ Value *matchSizeType(IRBuilder<> &B, Value *V, bool sext) {
 }
 
 uint64_t getTypeMask(Type *Ty) {
-  auto const bits = Ty->getIntegerBitWidth();
+  const auto bits = Ty->getIntegerBitWidth();
   return bits < 64 ? ((uint64_t(1) << bits) - 1) : ~uint64_t(0);
 }
 
 // The index size potentially depends on the address space of the pointer,
 // but let's just use the pointer size for now.
-uint64_t getSizeTypeMask(DataLayout const &DL) {
-  auto const bits = DL.getPointerSizeInBits();
+uint64_t getSizeTypeMask(const DataLayout &DL) {
+  const auto bits = DL.getPointerSizeInBits();
   return bits < 64 ? ((uint64_t(1) << bits) - 1) : ~uint64_t(0);
 }
 
@@ -176,9 +176,9 @@ OffsetInfo &OffsetInfo::analyze(Value *Offset, StrideAnalysisResult &SAR) {
 
   // If we have a uniform value here we don't need to analyse any further.
   if (!SAR.UVR.isVarying(Ins)) {
-    auto const &KB =
+    const auto &KB =
         computeKnownBits(Ins, SAR.F.getParent()->getDataLayout(), 0, &SAR.AC);
-    auto const bitWidth = OffsetTy->getIntegerBitWidth();
+    const auto bitWidth = OffsetTy->getIntegerBitWidth();
 
     // We are interested in the bits that are not known to be zero.
     BitMask &= ~KB.Zero.extractBitsAsZExtValue(bitWidth, 0);
@@ -189,8 +189,8 @@ OffsetInfo &OffsetInfo::analyze(Value *Offset, StrideAnalysisResult &SAR) {
   if (BinaryOperator *BOp = dyn_cast<BinaryOperator>(Offset)) {
     // Copy these values into local variables, because `SAR.analyze()` can
     // invalidate any previously obtained references.
-    auto const LHS = SAR.analyze(BOp->getOperand(0));
-    auto const RHS = SAR.analyze(BOp->getOperand(1));
+    const auto LHS = SAR.analyze(BOp->getOperand(0));
+    const auto RHS = SAR.analyze(BOp->getOperand(1));
     if (LHS.mayDiverge() || RHS.mayDiverge()) {
       return setMayDiverge();
     }
@@ -226,7 +226,7 @@ OffsetInfo &OffsetInfo::analyze(Value *Offset, StrideAnalysisResult &SAR) {
 
   // Consider that integer casts cannot scale item IDs.
   if (CastInst *Cast = dyn_cast<CastInst>(Offset)) {
-    auto const &Src = SAR.analyze(Cast->getOperand(0));
+    const auto &Src = SAR.analyze(Cast->getOperand(0));
     if (Src.mayDiverge()) {
       return setMayDiverge();
     }
@@ -237,8 +237,8 @@ OffsetInfo &OffsetInfo::analyze(Value *Offset, StrideAnalysisResult &SAR) {
       // address, rendering the entire strided MemOp invalid, even when masked
       // such that the read from the base address is not meant to execute.
       // Note that we don't care about overflowing the index type.
-      auto const typeMask = getTypeMask(Cast->getSrcTy());
-      auto const bitMaskSized =
+      const auto typeMask = getTypeMask(Cast->getSrcTy());
+      const auto bitMaskSized =
           Src.BitMask & getSizeTypeMask(Cast->getModule()->getDataLayout());
       if ((bitMaskSized & typeMask) != bitMaskSized) {
         return setMayDiverge();
@@ -270,8 +270,8 @@ OffsetInfo &OffsetInfo::analyze(Value *Offset, StrideAnalysisResult &SAR) {
 
     // If the condition isn't varying and both operands have the same
     // constant stride, the result will also have the same constant stride.
-    auto const LHS = SAR.analyze(Select->getOperand(1));
-    auto const RHS = SAR.analyze(Select->getOperand(2));
+    const auto LHS = SAR.analyze(Select->getOperand(1));
+    const auto RHS = SAR.analyze(Select->getOperand(2));
     if (LHS.hasStride() && RHS.hasStride() && LHS.StrideInt == RHS.StrideInt &&
         LHS.isStrideConstantInt()) {
       return copyStrideFrom(LHS);
@@ -315,8 +315,8 @@ OffsetInfo &OffsetInfo::analyze(Value *Offset, StrideAnalysisResult &SAR) {
 
   // Analyse function calls.
   if (CallInst *CI = dyn_cast<CallInst>(Offset)) {
-    auto const &BI = SAR.UVR.Ctx.builtins();
-    auto const Builtin = BI.analyzeBuiltinCall(*CI, SAR.UVR.dimension);
+    const auto &BI = SAR.UVR.Ctx.builtins();
+    const auto Builtin = BI.analyzeBuiltinCall(*CI, SAR.UVR.dimension);
     switch (Builtin.uniformity) {
       default:
       case compiler::utils::eBuiltinUniformityMaybeInstanceID:
@@ -409,7 +409,7 @@ OffsetInfo &OffsetInfo::analyzePtr(Value *Address, StrideAnalysisResult &SAR) {
       // If it's a simple loop iterator, the stride can be analyzed from the
       // initial value.
       if (GEP->getPointerOperand() == Phi) {
-        for (auto const &index : GEP->indices()) {
+        for (const auto &index : GEP->indices()) {
           if (SAR.UVR.isVarying(index.get())) {
             return setMayDiverge();
           }
@@ -421,7 +421,7 @@ OffsetInfo &OffsetInfo::analyzePtr(Value *Address, StrideAnalysisResult &SAR) {
       // If it's a simple loop iterator, the stride can be analyzed from the
       // initial value.
       if (GEP->getPointerOperand() == Phi) {
-        for (auto const &index : GEP->indices()) {
+        for (const auto &index : GEP->indices()) {
           if (SAR.UVR.isVarying(index.get())) {
             return setMayDiverge();
           }
@@ -434,7 +434,7 @@ OffsetInfo &OffsetInfo::analyzePtr(Value *Address, StrideAnalysisResult &SAR) {
   } else if (auto *GEP = dyn_cast<GetElementPtrInst>(Address)) {
     {
       auto *const Ptr = GEP->getPointerOperand();
-      auto const &PtrInfo = SAR.analyze(Ptr);
+      const auto &PtrInfo = SAR.analyze(Ptr);
       if (PtrInfo.mayDiverge()) {
         if (isa<SelectInst>(Ptr)) {
           // For the benefit of the Ternary Transform Pass
@@ -461,7 +461,7 @@ OffsetInfo &OffsetInfo::analyzePtr(Value *Address, StrideAnalysisResult &SAR) {
       Value *GEPIndex = GEP->getOperand(1 + i);
       assert(GEPIndex && "Could not get operand from GEP");
 
-      auto const &idxOffset = SAR.analyze(GEPIndex);
+      const auto &idxOffset = SAR.analyze(GEPIndex);
       if (idxOffset.mayDiverge()) {
         return setMayDiverge();
       }
@@ -498,8 +498,8 @@ OffsetInfo &OffsetInfo::analyzePtr(Value *Address, StrideAnalysisResult &SAR) {
     }
     return *this;
   } else if (auto *Select = dyn_cast<SelectInst>(Address)) {
-    auto const LHS = SAR.analyze(Select->getOperand(1));
-    auto const RHS = SAR.analyze(Select->getOperand(2));
+    const auto LHS = SAR.analyze(Select->getOperand(1));
+    const auto RHS = SAR.analyze(Select->getOperand(2));
     if (SAR.UVR.isVarying(Select->getCondition())) {
       // Note that we analyze the operands before returning here, for the
       // benefit of the Ternary Transform Pass, which does its work ONLY
@@ -536,8 +536,8 @@ OffsetInfo &OffsetInfo::manifest(IRBuilder<> &B, StrideAnalysisResult &SAR) {
   Instruction *Offset = cast<Instruction>(ActualValue);
   // Analyse binary instructions.
   if (BinaryOperator *BOp = dyn_cast<BinaryOperator>(Offset)) {
-    auto const &LHS = SAR.manifest(B, BOp->getOperand(0));
-    auto const &RHS = SAR.manifest(B, BOp->getOperand(1));
+    const auto &LHS = SAR.manifest(B, BOp->getOperand(0));
+    const auto &RHS = SAR.manifest(B, BOp->getOperand(1));
 
     // Build strides immediately before their instructions
     B.SetInsertPoint(BOp);
@@ -599,7 +599,7 @@ OffsetInfo &OffsetInfo::manifest(IRBuilder<> &B, StrideAnalysisResult &SAR) {
   }
 
   if (auto *GEP = dyn_cast<GetElementPtrInst>(Offset)) {
-    auto const &Ptr = SAR.manifest(B, GEP->getPointerOperand());
+    const auto &Ptr = SAR.manifest(B, GEP->getPointerOperand());
     copyStrideFrom(Ptr);
 
     PointerType *GEPPtrTy = dyn_cast<PointerType>(GEP->getPointerOperandType());
@@ -615,7 +615,7 @@ OffsetInfo &OffsetInfo::manifest(IRBuilder<> &B, StrideAnalysisResult &SAR) {
       Value *GEPIndex = GEP->getOperand(1 + i);
       assert(GEPIndex && "Could not get operand from GEP");
 
-      auto const &idxOffset = SAR.manifest(B, GEPIndex);
+      const auto &idxOffset = SAR.manifest(B, GEPIndex);
 
       Indices.push_back(GEPIndex);
       if (!idxOffset.hasStride()) {
@@ -664,7 +664,7 @@ OffsetInfo &OffsetInfo::manifest(IRBuilder<> &B, StrideAnalysisResult &SAR) {
 }
 
 uint64_t OffsetInfo::getConstantMemoryStride(Type *PtrEleTy,
-                                             DataLayout const *DL) const {
+                                             const DataLayout *DL) const {
   uint64_t PtrEleSize = SizeOrZero(DL->getTypeAllocSize(PtrEleTy));
   VECZ_FAIL_IF(!PtrEleSize);
 
@@ -677,7 +677,7 @@ uint64_t OffsetInfo::getConstantMemoryStride(Type *PtrEleTy,
 }
 
 Value *OffsetInfo::buildMemoryStride(IRBuilder<> &B, Type *PtrEleTy,
-                                     DataLayout const *DL) const {
+                                     const DataLayout *DL) const {
   if (!ManifestStride) {
     assert(Kind != eOffsetLinear &&
            "buildMemoryStride: linear stride not manifest");
@@ -916,7 +916,7 @@ OffsetInfo &OffsetInfo::combineShl(const OffsetInfo &LHS,
     }
 
     if (ConstantInt *CShift = dyn_cast<ConstantInt>(Shift)) {
-      auto const CVal = CShift->getZExtValue();
+      const auto CVal = CShift->getZExtValue();
       BitMask = LHS.BitMask << CVal;
       return setStride(LHS.StrideInt << CVal);
     }
@@ -963,7 +963,7 @@ OffsetInfo &OffsetInfo::combineAShr(const OffsetInfo &LHS,
       // Note that we shift the bitmask as a signed value.
       // Note also that the BitMask is been initialized to the width of the
       // integer type.
-      uint64_t const signMask = (BitMask >> 1) + 1;
+      const uint64_t signMask = (BitMask >> 1) + 1;
       if (LHS.BitMask & signMask) {
         // If it's possible for the source value to be negative, all of the
         // bits in the extended value might be set.
@@ -973,7 +973,7 @@ OffsetInfo &OffsetInfo::combineAShr(const OffsetInfo &LHS,
       }
 
       if (LHS.isStrideConstantInt()) {
-        auto const lostBits = ((uint64_t(1) << CShift) - 1);
+        const auto lostBits = ((uint64_t(1) << CShift) - 1);
         if ((LHS.StrideInt & lostBits) == 0 || (LHS.BitMask & lostBits) == 0) {
           return setStride(LHS.StrideInt >> CShift);
         }
@@ -991,7 +991,7 @@ OffsetInfo &OffsetInfo::manifestAShr(IRBuilder<> &B, const OffsetInfo &LHS,
                                      const OffsetInfo &RHS) {
   if (RHS.Kind == eOffsetConstant) {
     auto *const Shift = RHS.getUniformValue();
-    auto const CShift = RHS.getValueAsConstantInt();
+    const auto CShift = RHS.getValueAsConstantInt();
 
     if (!LHS.isStrideConstantInt() &&
         (LHS.BitMask & ((uint64_t(1) << CShift) - 1)) == 0) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp
index b7d316b6d301a..de1097e3bf9ff 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp
@@ -106,7 +106,7 @@ PreservedAnalyses RunVeczPass::run(Module &M, ModuleAnalysisManager &MAM) {
   PM.addPass(
       RequireAnalysisPass<compiler::utils::DeviceInfoAnalysis, Module>());
 
-  bool const Check = VeczPassPipeline.empty();
+  const bool Check = VeczPassPipeline.empty();
   if (Check) {
     if (!buildPassPipeline(PM)) {
       return PreservedAnalyses::all();
@@ -305,7 +305,7 @@ std::optional<VeczPassOptions> getAutoSubgroupSizeOpts(
 
   // Now try and choose the best width.
   std::optional<unsigned> best_width;
-  auto const mux_sub_group_size = compiler::utils::getMuxSubgroupSize(F);
+  const auto mux_sub_group_size = compiler::utils::getMuxSubgroupSize(F);
 
   auto can_produce_legal_width = [&mux_sub_group_size](unsigned size) {
     // We only support vectorization widths where there's a clean multiple, and
@@ -318,7 +318,7 @@ std::optional<VeczPassOptions> getAutoSubgroupSizeOpts(
     if (!can_produce_legal_width(size)) {
       continue;
     }
-    unsigned const candidate_width = size / mux_sub_group_size;
+    const unsigned candidate_width = size / mux_sub_group_size;
     // Try and choose at least one width.
     if (!best_width) {
       best_width = candidate_width;
@@ -335,8 +335,8 @@ std::optional<VeczPassOptions> getAutoSubgroupSizeOpts(
     // with that.
     if (auto wgs = compiler::utils::parseRequiredWGSMetadata(F)) {
       uint64_t local_size_x = wgs.value()[0];
-      bool const best_fits = !(local_size_x % *best_width);
-      bool const cand_fits = !(local_size_x % candidate_width);
+      const bool best_fits = !(local_size_x % *best_width);
+      const bool cand_fits = !(local_size_x % candidate_width);
       if (!best_fits && cand_fits) {
         best_width = candidate_width;
         continue;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
index 2b6ae432554c5..427a7ae71973b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
@@ -97,10 +97,10 @@ class ControlFlowConversionState::Impl : public ControlFlowConversionState {
   };
 
   /// @brief Type that maps exit blocks to exit mask information.
-  using DenseExitPHIMap = SmallDenseMap<BasicBlock const *, PHINode *, 2>;
+  using DenseExitPHIMap = SmallDenseMap<const BasicBlock *, PHINode *, 2>;
   /// @brief Type that maps exiting blocks to update mask information.
   using DenseExitUpdateMap =
-      SmallDenseMap<BasicBlock const *, BinaryOperator *, 2>;
+      SmallDenseMap<const BasicBlock *, BinaryOperator *, 2>;
 
   struct LoopMasksInfo {
     /// @brief Keep track of which instances left the loop through which exit
@@ -446,7 +446,7 @@ ControlFlowConversionState::ControlFlowConversionState(
 
 PreservedAnalyses ControlFlowConversionState::Impl::run(
     Function &F, FunctionAnalysisManager &AM) {
-  auto const &CFGR = AM.getResult<CFGAnalysis>(F);
+  const auto &CFGR = AM.getResult<CFGAnalysis>(F);
   if (CFGR.getFailed()) {
     ++VeczCFGFail;
     return VU.setFailed("Cannot vectorize the CFG for", &F, &F);
@@ -719,7 +719,7 @@ bool ControlFlowConversionState::Impl::createEntryMasks(BasicBlock &BB) {
   //
   // Here we only store the preheader's exit block as we handle the latch
   // in case the loop is divergent in the caller function.
-  auto const *const LTag = DR->getTag(&BB).loop;
+  const auto *const LTag = DR->getTag(&BB).loop;
   if (LTag && LTag->header == &BB) {
     BasicBlock *preheader = LTag->preheader;
     VECZ_ERROR_IF(!preheader, "BasicBlock tag is not defined");
@@ -951,8 +951,8 @@ bool ControlFlowConversionState::Impl::createLoopExitMasks(LoopTag &LTag) {
   SmallVector<Loop::Edge, 1> exitEdges;
   LTag.loop->getExitEdges(exitEdges);
   for (Loop::Edge &EE : exitEdges) {
-    auto const *const exitingBlock = EE.first;
-    auto const *const exitBlock = EE.second;
+    const auto *const exitingBlock = EE.first;
+    const auto *const exitBlock = EE.second;
     // Divergent loop need to keep track of which instance left at which exit.
     if (LTag.isLoopDivergent() && DR->isDivergent(*exitBlock)) {
       // The value of the exit mask of a divergent loop is a phi function
@@ -1004,7 +1004,7 @@ bool ControlFlowConversionState::Impl::createLoopExitMasks(LoopTag &LTag) {
       PHINode *REM = LMask.persistedDivergentExitMasks[exitingBlock];
       REM->addIncoming(getDefaultValue(REM->getType()), LTag.preheader);
 
-      auto const *const exitingLTag = DR->getTag(exitingBlock).loop;
+      const auto *const exitingLTag = DR->getTag(exitingBlock).loop;
       VECZ_ERROR_IF(!exitingLTag, "Loop tag is not defined");
 
       // By default, the second operand of the mask update is the exit
@@ -1344,8 +1344,8 @@ bool ControlFlowConversionState::Impl::applyMaskToCall(CallInst *CI,
   }
 
   // Builtins without side effects do not need to be masked.
-  auto const builtin = Ctx.builtins().analyzeBuiltin(*callee);
-  auto const props = builtin.properties;
+  const auto builtin = Ctx.builtins().analyzeBuiltin(*callee);
+  const auto props = builtin.properties;
   if (props & compiler::utils::eBuiltinPropertyNoSideEffects) {
     LLVM_DEBUG(dbgs() << "vecz-cf: Called function is an pure builtin\n");
     return true;
@@ -1746,7 +1746,7 @@ bool ControlFlowConversionState::Impl::uniformizeDivergentLoops() {
         // Order the loop exit blocks such that:
         // - divergent loop exits come first
         // - smallest DCBI come first
-        auto const middle = std::partition(
+        const auto middle = std::partition(
             exitBlocks.begin(), exitBlocks.end(),
             [this](BasicBlock *BB) { return DR->isDivergent(*BB); });
         std::sort(exitBlocks.begin(), middle,
@@ -1965,7 +1965,7 @@ bool ControlFlowConversionState::Impl::rewireDivergentLoopExitBlocks(
       {
         SmallPtrSet<BasicBlock *, 1> predsToRemove;
         for (BasicBlock *pred : predecessors(EB)) {
-          auto const *const predLTag = DR->getTag(pred).loop;
+          const auto *const predLTag = DR->getTag(pred).loop;
           // All predecessors of the divergent loop exit that belong in a loop
           // contained in the outermost loop left by that exit need their
           // edge removed.
@@ -2310,7 +2310,7 @@ bool ControlFlowConversionState::Impl::replaceUsesOutsideDivergentLoop(
     // If the use is in a pure exit block of a divergent loop, don't replace
     // the use if it comes from an optional exit block of that loop.
     if (PHINode *PHI = dyn_cast<PHINode>(user)) {
-      auto const *const exitedLoop = DR->getTag(blockUse).outermostExitedLoop;
+      const auto *const exitedLoop = DR->getTag(blockUse).outermostExitedLoop;
       if (exitedLoop && exitedLoop->pureExit == blockUse) {
         BasicBlock *incoming = PHI->getIncomingBlock(U);
         if (!exitedLoop->loop->contains(incoming)) {
@@ -2375,10 +2375,10 @@ void removeDeferrals(BasicBlock *src, DenseDeferralMap &deferrals) {
 
 bool ControlFlowConversionState::Impl::computeNewTargets(Linearization &lin) {
   // The entry block cannot be targeted.
-  auto const &DCBI = DR->getBlockOrdering();
-  size_t const numBlocks = DCBI.size();
+  const auto &DCBI = DR->getBlockOrdering();
+  const size_t numBlocks = DCBI.size();
   DenseSet<BasicBlock *> targets(numBlocks - 1);
-  for (auto const &tag : make_range(std::next(DCBI.begin()), DCBI.end())) {
+  for (const auto &tag : make_range(std::next(DCBI.begin()), DCBI.end())) {
     targets.insert(tag.BB);
   }
 
@@ -2399,7 +2399,7 @@ bool ControlFlowConversionState::Impl::computeNewTargets(Linearization &lin) {
   lin.infos.reserve(numBlocks);
   lin.data.reserve(numBlocks);
   for (size_t BBIndex = 0; BBIndex != numBlocks; ++BBIndex) {
-    auto const &BBTag = DR->getBlockTag(BBIndex);
+    const auto &BBTag = DR->getBlockTag(BBIndex);
     BasicBlock *BB = BBTag.BB;
     lin.beginBlock(BB);
 
@@ -2434,13 +2434,13 @@ bool ControlFlowConversionState::Impl::computeNewTargets(Linearization &lin) {
             continue;
           }
 
-          size_t const deferredIndex = DR->getTagIndex(deferred);
+          const size_t deferredIndex = DR->getTagIndex(deferred);
           if (nextIndex == ~size_t(0) || nextIndex > deferredIndex) {
             nextIndex = deferredIndex;
           }
         }
 
-        size_t const succIndex = DR->getTagIndex(succ);
+        const size_t succIndex = DR->getTagIndex(succ);
         if (!targeted.count(succ)) {
           // If we have not found a target or there is a better one.
           if (nextIndex == ~size_t(0) || nextIndex > succIndex) {
@@ -2482,7 +2482,7 @@ bool ControlFlowConversionState::Impl::computeNewTargets(Linearization &lin) {
 
       size_t nextIndex = ~size_t(0);
       for (BasicBlock *deferred : availableTargets) {
-        size_t const deferredIndex = DR->getTagIndex(deferred);
+        const size_t deferredIndex = DR->getTagIndex(deferred);
         if (nextIndex == ~size_t(0) || nextIndex > deferredIndex) {
           LLVM_DEBUG(dbgs()
                      << (nextIndex == ~size_t(0)
@@ -2559,12 +2559,12 @@ bool ControlFlowConversionState::Impl::linearizeCFG() {
   VECZ_FAIL_IF(!computeNewTargets(lin));
 
   auto dataIt = lin.data.begin();
-  for (auto const &newTargetInfo : lin.infos) {
+  for (const auto &newTargetInfo : lin.infos) {
     BasicBlock *BB = newTargetInfo.BB;
 
     // Get the new target info for this block
-    auto const numTargets = newTargetInfo.numTargets;
-    auto const newTargets = dataIt;
+    const auto numTargets = newTargetInfo.numTargets;
+    const auto newTargets = dataIt;
     dataIt += numTargets;
 
     LLVM_DEBUG(dbgs() << BB->getName() << ":\n");
@@ -2669,7 +2669,7 @@ bool ControlFlowConversionState::Impl::generateSelects() {
   // For each basic block that has only one predecessor and phi nodes, we need
   // to either blend those phi nodes into select instructions or try to move
   // the phi nodes up the chain of linearized path.
-  for (auto const &BTag : DR->getBlockOrdering()) {
+  for (const auto &BTag : DR->getBlockOrdering()) {
     BasicBlock *B = BTag.BB;
     if (B->hasNPredecessors(1) || DR->isBlend(*B)) {
       if (PHINode *PHI = dyn_cast<PHINode>(&B->front())) {
@@ -2791,7 +2791,7 @@ bool ControlFlowConversionState::Impl::repairSSA() {
 bool ControlFlowConversionState::Impl::updatePHIsIncomings() {
   // We need to update the incoming blocks of phi nodes whose predecessors may
   // have changed since we have not changed the phi nodes during the rewiring.
-  for (auto const &BBTag : DR->getBlockOrdering()) {
+  for (const auto &BBTag : DR->getBlockOrdering()) {
     BasicBlock *BB = BBTag.BB;
     SmallPtrSet<BasicBlock *, 4> preds(pred_begin(BB), pred_end(BB));
     for (auto it = BB->begin(); it != BB->end();) {
@@ -2888,9 +2888,9 @@ bool ControlFlowConversionState::Impl::updatePHIsIncomings() {
 bool ControlFlowConversionState::Impl::blendInstructions() {
   LLVM_DEBUG(dbgs() << "CFC: BLEND INSTRUCTIONS\n");
 
-  auto addSuccessors = [this](BasicBlockTag const &BTag, BlockQueue &queue,
+  auto addSuccessors = [this](const BasicBlockTag &BTag, BlockQueue &queue,
                               DenseSet<BasicBlock *> &visited,
-                              BasicBlockTag const &dstTag) {
+                              const BasicBlockTag &dstTag) {
     for (BasicBlock *succ : successors(BTag.BB)) {
       // Allow latch if 'succ' belongs in 'dst's loop and 'dst' is the header
       // of that loop.
@@ -3027,7 +3027,7 @@ bool ControlFlowConversionState::Impl::blendInstructions() {
 
   SmallPtrSet<Value *, 16> spareBlends;
 
-  for (auto const &dstTag : DR->getBlockOrdering()) {
+  for (const auto &dstTag : DR->getBlockOrdering()) {
     BasicBlock *dst = dstTag.BB;
     LLVM_DEBUG(dbgs() << "Blending instructions used in " << dst->getName()
                       << ":\n");
@@ -3076,7 +3076,7 @@ bool ControlFlowConversionState::Impl::blendInstructions() {
         DenseSet<BasicBlock *> visited;
         BlockQueue queue(*DR);
 
-        auto const &srcTag = DR->getTag(src);
+        const auto &srcTag = DR->getTag(src);
 
         addSuccessors(srcTag, queue, visited, dstTag);
 
@@ -3084,7 +3084,7 @@ bool ControlFlowConversionState::Impl::blendInstructions() {
         if (srcLoop && srcLoop->isLoopDivergent()) {
           if (dst != srcLoop->header) {
             auto &srcMasks = LoopMasks[srcLoop->loop];
-            auto const &headerTag = DR->getTag(srcLoop->header);
+            const auto &headerTag = DR->getTag(srcLoop->header);
 
             // If 'opDef' is an update loop exit mask, set an entry point in
             // the loop header.
@@ -3111,7 +3111,7 @@ bool ControlFlowConversionState::Impl::blendInstructions() {
         }
 
         while (!queue.empty()) {
-          BasicBlockTag const &curTag = queue.pop();
+          const BasicBlockTag &curTag = queue.pop();
           BasicBlock *const cur = curTag.BB;
 
           LLVM_DEBUG(dbgs() << "\t\t\tPopping " << cur->getName() << "\n");
@@ -3202,7 +3202,7 @@ bool ControlFlowConversionState::Impl::simplifyMasks() {
   // however linearization and/or BOSCC can sometimes delete them from under
   // our nose so it's only safe just to go through all the boolean operations
   // and see if we can simplify any of them.
-  for (auto const &BBTag : DR->getBlockOrdering()) {
+  for (const auto &BBTag : DR->getBlockOrdering()) {
     SmallVector<Instruction *, 16> toDelete;
     for (auto &I : *BBTag.BB) {
       if (isa<SelectInst>(&I) || (I.getType()->getScalarSizeInBits() == 1 &&
@@ -3234,12 +3234,12 @@ bool ControlFlowConversionState::computeBlockOrdering() {
 }
 
 bool ControlFlowConversionState::Impl::checkBlocksOrder() const {
-  auto const &DCBI = DR->getBlockOrdering();
+  const auto &DCBI = DR->getBlockOrdering();
   VECZ_ERROR_IF(F.size() != DCBI.size(),
                 "Worklist does not contain all blocks");
 
   uint32_t next = 0u;
-  for (auto const &BBTag : DCBI) {
+  for (const auto &BBTag : DCBI) {
     VECZ_ERROR_IF(BBTag.pos != next,
                   "BasicBlock indices not in consecutive order");
     ++next;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/inline_post_vectorization_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/inline_post_vectorization_pass.cpp
index 0d93f99c1cf40..9e5a619b6a1e3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/inline_post_vectorization_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/inline_post_vectorization_pass.cpp
@@ -61,7 +61,7 @@ Value *processCallSite(CallInst *CI, bool &NeedLLVMInline,
 
   // Emit builtins inline when they have no vector/scalar equivalent.
   IRBuilder<> B(CI);
-  auto const Builtin = BI.analyzeBuiltin(*Callee);
+  const auto Builtin = BI.analyzeBuiltin(*Callee);
   if (Builtin.properties &
       compiler::utils::eBuiltinPropertyInlinePostVectorization) {
     SmallVector<Value *, 4> Args(CI->args());
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/instantiation_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/instantiation_pass.cpp
index 5c67d36ad9698..4b6adeeb792ba 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/instantiation_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/instantiation_pass.cpp
@@ -139,9 +139,9 @@ PacketRange InstantiationPass::instantiateCall(CallInst *CI) {
   unsigned SimdWidth = packetizer.width().getFixedValue();
   // Handle special call instructions that return a lane ID.
   compiler::utils::BuiltinInfo &BI = Ctx.builtins();
-  auto const Builtin = BI.analyzeBuiltinCall(*CI, packetizer.dimension());
+  const auto Builtin = BI.analyzeBuiltinCall(*CI, packetizer.dimension());
   if (Builtin.properties & compiler::utils::eBuiltinPropertyWorkItem) {
-    auto const Uniformity = Builtin.uniformity;
+    const auto Uniformity = Builtin.uniformity;
     if (Uniformity == compiler::utils::eBuiltinUniformityNever) {
       // can't handle these (global/local linear ID probably)
       VECZ_FAIL();
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
index ad08844ad1293..63838d477b201 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
@@ -61,7 +61,7 @@ inline Type *getWideType(Type *ty, ElementCount factor) {
     }
     return VectorType::get(ty, factor);
   }
-  bool const isScalable = isa<ScalableVectorType>(ty);
+  const bool isScalable = isa<ScalableVectorType>(ty);
   assert((!factor.isScalable() || !isScalable) &&
          "Can't widen a scalable vector by a scalable amount");
   auto *vecTy = cast<llvm::VectorType>(ty);
@@ -179,9 +179,9 @@ Value *createOptimalShuffle(IRBuilder<> &B, Value *srcA, Value *srcB,
     // We can absorb one or two unary shuffles into the new shuffle..
     auto *const shuffleAsrc = shuffleA ? shuffleA->getOperand(0) : srcA;
     auto *const shuffleBsrc = shuffleB ? shuffleB->getOperand(0) : srcB;
-    auto const srcASize =
+    const auto srcASize =
         cast<FixedVectorType>(shuffleAsrc->getType())->getNumElements();
-    auto const srcBSize =
+    const auto srcBSize =
         cast<FixedVectorType>(shuffleBsrc->getType())->getNumElements();
     if (srcASize == srcBSize) {
       Constant *srcMaskA = nullptr;
@@ -332,7 +332,7 @@ Value *getGatherIndicesVector(IRBuilder<> &B, Value *Indices, Type *Ty,
                               unsigned FixedVecElts, const Twine &N) {
   auto *const Steps = B.CreateStepVector(Ty);
 
-  auto const EltCount = multi_llvm::getVectorElementCount(Ty);
+  const auto EltCount = multi_llvm::getVectorElementCount(Ty);
   auto *const ElTy = multi_llvm::getVectorElementType(Ty);
 
   auto *const FixedVecEltsSplat =
@@ -512,7 +512,7 @@ PacketRange Packetizer::Result::getAsPacket(unsigned width) const {
 
 void Packetizer::Result::getPacketValues(SmallVectorImpl<Value *> &vals) const {
   assert(info && "No packet info for this packetization result");
-  auto const width = info->numInstances;
+  const auto width = info->numInstances;
   if (width != 0) {
     return getPacketValues(width, vals);
   }
@@ -647,7 +647,7 @@ Value *scalableBroadcastHelper(Value *subvec, ElementCount factor,
                                const vecz::TargetInfo &TI, IRBuilder<> &B,
                                bool URem) {
   auto *ty = subvec->getType();
-  auto const subVecEltCount = multi_llvm::getVectorElementCount(ty);
+  const auto subVecEltCount = multi_llvm::getVectorElementCount(ty);
   assert(subVecEltCount.isScalable() ^ factor.isScalable() &&
          "Must either broadcast fixed vector by scalable factor or scalable "
          "vector by fixed factor");
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
index 3191f4d86d36c..745515052642e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -383,7 +383,7 @@ class Packetizer::Impl : public Packetizer {
   ///
   /// @return Packetized instruction.
   Value *vectorizeWorkGroupCall(CallInst *CI,
-                                compiler::utils::BuiltinCall const &Builtin);
+                                const compiler::utils::BuiltinCall &Builtin);
   /// @brief Packetize an alloca instruction.
   ///
   /// @param[in] Alloca Instruction to packetize.
@@ -718,7 +718,7 @@ bool Packetizer::Impl::packetize() {
     } else {
       // We couldn't vectorize the type, so create an array instead.
       VECZ_FAIL_IF(SimdWidth.isScalable());
-      unsigned const fixedWidth = SimdWidth.getFixedValue();
+      const unsigned fixedWidth = SimdWidth.getFixedValue();
 
       AllocaInst *const wideAlloca =
           B.CreateAlloca(dataTy, getSizeInt(B, fixedWidth), alloca->getName());
@@ -737,7 +737,7 @@ bool Packetizer::Impl::packetize() {
             users.push_back(cast<Instruction>(U.getUser()));
           }
         } else if (auto memop = MemOp::get(user)) {
-          auto const memAlign = memop->getAlignment();
+          const auto memAlign = memop->getAlignment();
           if (memAlign > align.value()) {
             align = Align(memAlign);
           }
@@ -895,7 +895,7 @@ Value *Packetizer::Impl::reduceBranchCond(Value *cond, Instruction *terminator,
   // only if the original condition is true for any lane (or for all lanes if
   // the condition is used in a BOSCC block indirection.)
   IRBuilder<> B(terminator);
-  auto const name = cond->getName();
+  const auto name = cond->getName();
 
   // Reduce the packet to a single value
   auto w = conds.size();
@@ -1215,8 +1215,8 @@ Value *Packetizer::Impl::packetizeGroupReduction(Instruction *I) {
   compiler::utils::BuiltinInfo &BI = Ctx.builtins();
   Function *callee = CI->getCalledFunction();
 
-  auto const Builtin = BI.analyzeBuiltin(*callee);
-  auto const Info = BI.isMuxGroupCollective(Builtin.ID);
+  const auto Builtin = BI.analyzeBuiltin(*callee);
+  const auto Info = BI.isMuxGroupCollective(Builtin.ID);
 
   if (!Info || (!Info->isSubGroupScope() && !Info->isWorkGroupScope()) ||
       (!Info->isAnyAll() && !Info->isReduction())) {
@@ -1287,7 +1287,7 @@ Value *Packetizer::Impl::packetizeGroupBroadcast(Instruction *I) {
   }
   compiler::utils::BuiltinInfo &BI = Ctx.builtins();
   Function *callee = CI->getCalledFunction();
-  auto const Builtin = BI.analyzeBuiltin(*callee);
+  const auto Builtin = BI.analyzeBuiltin(*callee);
 
   bool isWorkGroup = false;
   if (auto Info = BI.isMuxGroupCollective(Builtin.ID)) {
@@ -1389,8 +1389,8 @@ Packetizer::Impl::isSubgroupShuffleLike(Instruction *I) {
   compiler::utils::BuiltinInfo &BI = Ctx.builtins();
   Function *callee = CI->getCalledFunction();
 
-  auto const Builtin = BI.analyzeBuiltin(*callee);
-  auto const Info = BI.isMuxGroupCollective(Builtin.ID);
+  const auto Builtin = BI.analyzeBuiltin(*callee);
+  const auto Info = BI.isMuxGroupCollective(Builtin.ID);
 
   if (Info && Info->isSubGroupScope() && Info->isShuffleLike()) {
     return Info;
@@ -1432,7 +1432,7 @@ Value *Packetizer::Impl::packetizeSubgroupShuffle(Instruction *I) {
 
   // We need to sanitize the input index so that it stays within the range of
   // one vectorized group.
-  unsigned const VF = SimdWidth.getFixedValue();
+  const unsigned VF = SimdWidth.getFixedValue();
   auto *const VecIdxFactor = ConstantInt::get(Idx->getType(), VF);
   // This index is the element of the vector-group which holds the desired
   // data, per mux sub-group.
@@ -1490,7 +1490,7 @@ Value *Packetizer::Impl::packetizeSubgroupShuffle(Instruction *I) {
     // width and vectorization factor, that going through memory would be
     // faster.
     Value *ExtractedVec = UndefValue::get(DataVecTy);
-    unsigned const DataNumElts = DataVecTy->getElementCount().getFixedValue();
+    const unsigned DataNumElts = DataVecTy->getElementCount().getFixedValue();
     auto *const BaseIdx = B.CreateMul(VecIdx, B.getInt32(DataNumElts));
     for (unsigned i = 0; i < DataNumElts; i++) {
       auto *const SubIdx = B.CreateAdd(BaseIdx, B.getInt32(i));
@@ -1517,7 +1517,7 @@ Packetizer::Result Packetizer::Impl::packetizeSubgroupShuffleXor(
   if (SimdWidth.isScalable()) {
     return Packetizer::Result(*this);
   }
-  unsigned const VF = SimdWidth.getFixedValue();
+  const unsigned VF = SimdWidth.getFixedValue();
 
   auto *const Data = CI->getArgOperand(0);
   auto *const Val = CI->getArgOperand(1);
@@ -1569,7 +1569,7 @@ Packetizer::Result Packetizer::Impl::packetizeSubgroupShuffleXor(
 
   auto *const SubgroupLocalID =
       B.CreateCall(SubgroupLocalIDFn, {}, "sg.local.id");
-  auto const Builtin =
+  const auto Builtin =
       Ctx.builtins().analyzeBuiltinCall(*SubgroupLocalID, Dimension);
 
   // Vectorize the sub-group local ID
@@ -1678,7 +1678,7 @@ Packetizer::Result Packetizer::Impl::packetizeSubgroupShuffleUpDown(
   if (SimdWidth.isScalable()) {
     return Packetizer::Result(*this);
   }
-  unsigned const VF = SimdWidth.getFixedValue();
+  const unsigned VF = SimdWidth.getFixedValue();
 
   // LHS is 'current' for a down-shuffle, and 'previous' for an up-shuffle.
   auto *const LHSOp = CI->getArgOperand(0);
@@ -1749,7 +1749,7 @@ Packetizer::Result Packetizer::Impl::packetizeSubgroupShuffleUpDown(
 
   auto *const SubgroupLocalID =
       B.CreateCall(SubgroupLocalIDFn, {}, "sg.local.id");
-  auto const Builtin =
+  const auto Builtin =
       Ctx.builtins().analyzeBuiltinCall(*SubgroupLocalID, Dimension);
 
   // Vectorize the sub-group local ID
@@ -2044,7 +2044,7 @@ ValuePacket Packetizer::Impl::packetizeCall(CallInst *CI) {
           // If it's an alloca we can widen, we can just change the size
           llvm::TypeSize const allocSize =
               Ctx.dataLayout()->getTypeAllocSize(alloca->getAllocatedType());
-          auto const lifeSize =
+          const auto lifeSize =
               allocSize.isScalable() || SimdWidth.isScalable()
                   ? -1
                   : allocSize.getKnownMinValue() * SimdWidth.getKnownMinValue();
@@ -2056,7 +2056,7 @@ ValuePacket Packetizer::Impl::packetizeCall(CallInst *CI) {
       return results;
     }
 
-    auto const Props = Ctx.builtins().analyzeBuiltin(*Callee).properties;
+    const auto Props = Ctx.builtins().analyzeBuiltin(*Callee).properties;
     if (!(Props & compiler::utils::eBuiltinPropertyVectorEquivalent)) {
       return results;
     }
@@ -2090,7 +2090,7 @@ ValuePacket Packetizer::Impl::packetizeCall(CallInst *CI) {
     auto *const wideTy =
         getWideType(ty, SimdWidth.divideCoefficientBy(packetWidth));
 
-    auto const n = CI->arg_size();
+    const auto n = CI->arg_size();
     assert(n <= maxOperands && "Intrinsic has too many arguments");
 
     SmallVector<Value *, 16> opPackets[maxOperands];
@@ -2111,7 +2111,7 @@ ValuePacket Packetizer::Impl::packetizeCall(CallInst *CI) {
       }
     }
 
-    auto const name = CI->getName();
+    const auto name = CI->getName();
     Type *const types[1] = {wideTy};  // because LLVM 13 is a numpty
     Value *opVals[maxOperands];
     for (unsigned i = 0; i < packetWidth; ++i) {
@@ -2138,7 +2138,7 @@ ValuePacket Packetizer::Impl::packetizeCall(CallInst *CI) {
     }
   }
 
-  auto const Builtin = Ctx.builtins().analyzeBuiltin(*Callee);
+  const auto Builtin = Ctx.builtins().analyzeBuiltin(*Callee);
 
   // Handle scans, which defer to internal builtins.
   if (auto Info = Ctx.builtins().isMuxGroupCollective(Builtin.ID)) {
@@ -2148,7 +2148,7 @@ ValuePacket Packetizer::Impl::packetizeCall(CallInst *CI) {
   }
 
   // Handle external builtins.
-  auto const Props = Builtin.properties;
+  const auto Props = Builtin.properties;
   if (Props & compiler::utils::eBuiltinPropertyExecutionFlow ||
       Props & compiler::utils::eBuiltinPropertyWorkItem) {
     return results;
@@ -2184,7 +2184,7 @@ ValuePacket Packetizer::Impl::packetizeCall(CallInst *CI) {
   }
 
   auto *const vecTy = dyn_cast<FixedVectorType>(ty);
-  unsigned const scalarWidth = vecTy ? vecTy->getNumElements() : 1;
+  const unsigned scalarWidth = vecTy ? vecTy->getNumElements() : 1;
   unsigned i = 0;
   SmallVector<SmallVector<Value *, 16>, 4> opPackets;
   for (const auto &TargetArg : CalleeVec.args) {
@@ -2322,7 +2322,7 @@ ValuePacket Packetizer::Impl::packetizeGroupScan(
 
   // We don't bother with VP for fixed vectors, because it doesn't save us
   // anything.
-  bool const VP = VL && SimdWidth.isScalable();
+  const bool VP = VL && SimdWidth.isScalable();
 
   O << VectorizationContext::InternalBuiltinPrefix << "sub_group_scan_"
     << (isInclusive ? "inclusive" : "exclusive") << "_" << op
@@ -2466,7 +2466,7 @@ ValuePacket Packetizer::Impl::packetizeMemOp(MemOp &op) {
   }
 
   if (auto *const vecTy = dyn_cast<FixedVectorType>(dataTy)) {
-    auto const elts = vecTy->getNumElements();
+    const auto elts = vecTy->getNumElements();
     if (elts & (elts - 1)) {
       // If the data type is a vector with number of elements not a power of 2,
       // it is not safe to widen, because of alignment padding. Reject it and
@@ -2475,7 +2475,7 @@ ValuePacket Packetizer::Impl::packetizeMemOp(MemOp &op) {
     }
   }
 
-  auto const packetWidth = getPacketWidthForType(dataTy);
+  const auto packetWidth = getPacketWidthForType(dataTy);
   // Note: NOT const because LLVM 11 can't multiply a const ElementCount.
   auto factor = SimdWidth.divideCoefficientBy(packetWidth);
 
@@ -2493,7 +2493,7 @@ ValuePacket Packetizer::Impl::packetizeMemOp(MemOp &op) {
   IRBuilder<> B(op.getInstr());
   IC.deleteInstructionLater(op.getInstr());
 
-  auto const name = op.getInstr()->getName();
+  const auto name = op.getInstr()->getName();
   auto *const mask = op.getMaskOperand();
   auto *const data = op.getDataOperand();
   auto *const stride = SAR.buildMemoryStride(B, ptr, dataTy);
@@ -2519,7 +2519,7 @@ ValuePacket Packetizer::Impl::packetizeMemOp(MemOp &op) {
       return results;
     }
 
-    bool const scalable = SimdWidth.isScalable();
+    const bool scalable = SimdWidth.isScalable();
     if (!mask && dataTy->isVectorTy() && !scalable) {
       // unmasked scatter/gathers are better off instantiated..
       return results;
@@ -2550,12 +2550,12 @@ ValuePacket Packetizer::Impl::packetizeMemOp(MemOp &op) {
       // is the same.
       // We only handle the one packet right now.
       PACK_FAIL_IF(ptrPacket.size() != 1);
-      auto const scalarWidth = vecPtrTy->getNumElements();
+      const auto scalarWidth = vecPtrTy->getNumElements();
       Value *&vecPtr = ptrPacket.front();
-      ElementCount const wideEC = factor * scalarWidth;
+      const ElementCount wideEC = factor * scalarWidth;
       // Sub-splat the pointers such that we get, e.g.:
       // <A, B> -> x4 -> <A, A, A, A, B, B, B, B>
-      bool const success =
+      const bool success =
           createSubSplats(Ctx.targetInfo(), B, ptrPacket, scalarWidth);
       PACK_FAIL_IF(!success);
       auto *const newPtrTy = llvm::VectorType::get(
@@ -2576,8 +2576,8 @@ ValuePacket Packetizer::Impl::packetizeMemOp(MemOp &op) {
       // <A, A+1, A+2, A+3, B, B+1, B+2, B+3>
       vecPtr = B.CreateInBoundsGEP(scalarTy, vecPtr, idxVector);
     } else if (vecPtrTy && !scalable) {
-      auto const simdWidth = factor.getFixedValue();
-      auto const scalarWidth = vecPtrTy->getNumElements();
+      const auto simdWidth = factor.getFixedValue();
+      const auto scalarWidth = vecPtrTy->getNumElements();
 
       // Build shuffle mask to widen the pointer
       SmallVector<Constant *, 16> indices;
@@ -2659,7 +2659,7 @@ ValuePacket Packetizer::Impl::packetizeMemOp(MemOp &op) {
     Value *packetStride = nullptr;
     if (packetWidth != 1) {
       // Make sure the stride is at least as wide as a GEP index needs to be
-      unsigned const indexBits = Ctx.dataLayout()->getIndexSizeInBits(
+      const unsigned indexBits = Ctx.dataLayout()->getIndexSizeInBits(
           ptr->getType()->getPointerAddressSpace());
       unsigned strideBits = stride->getType()->getPrimitiveSizeInBits();
       auto *const elementStride =
@@ -2667,7 +2667,7 @@ ValuePacket Packetizer::Impl::packetizeMemOp(MemOp &op) {
               ? B.CreateSExt(stride, B.getIntNTy((strideBits = indexBits)))
               : stride;
 
-      auto const simdWidth = factor.getFixedValue();
+      const auto simdWidth = factor.getFixedValue();
       packetStride =
           B.CreateMul(elementStride, B.getIntN(strideBits, simdWidth),
                       Twine(name, ".packet_stride"));
@@ -2718,7 +2718,7 @@ ValuePacket Packetizer::Impl::packetizeMemOp(MemOp &op) {
 
     Value *packetStride = nullptr;
     if (packetWidth != 1) {
-      auto const simdWidth = factor.getFixedValue();
+      const auto simdWidth = factor.getFixedValue();
       packetStride = B.getInt64(simdWidth);
     }
 
@@ -2813,14 +2813,14 @@ ValuePacket Packetizer::Impl::packetizeMaskedAtomic(
     CallInst &CI, VectorizationContext::MaskedAtomic AtomicInfo) {
   ValuePacket results;
 
-  bool const IsCmpXchg = AtomicInfo.isCmpXchg();
+  const bool IsCmpXchg = AtomicInfo.isCmpXchg();
 
   Value *const ptrArg = CI.getArgOperand(0);
   Value *const valOrCmpArg = CI.getArgOperand(1);
   Value *const maskArg = CI.getArgOperand(2 + IsCmpXchg);
 
   assert(AtomicInfo.ValTy == valOrCmpArg->getType() && "AtomicInfo mismatch");
-  auto const packetWidth = getPacketWidthForType(valOrCmpArg->getType());
+  const auto packetWidth = getPacketWidthForType(valOrCmpArg->getType());
 
   if (VL && packetWidth != 1) {
     emitVeczRemarkMissed(&F, &CI,
@@ -2904,7 +2904,7 @@ ValuePacket Packetizer::Impl::packetizeGEP(GetElementPtrInst *GEP) {
   // Work out the packet width from the pointed to type, rather than the
   // pointer type itself, because this is the width the memops will be using.
   auto *const ty = GEP->getSourceElementType();
-  auto const packetWidth = getPacketWidthForType(ty);
+  const auto packetWidth = getPacketWidthForType(ty);
 
   // It is legal to create a GEP with a mixture of scalar and vector operands.
   // If any operand is a vector, the result will be a vector of pointers.
@@ -2943,9 +2943,9 @@ ValuePacket Packetizer::Impl::packetizeGEP(GetElementPtrInst *GEP) {
   IC.deleteInstructionLater(GEP);
 
   bool inBounds = GEP->isInBounds();
-  auto const name = GEP->getName();
+  const auto name = GEP->getName();
 
-  auto const numIndices = opPackets.size();
+  const auto numIndices = opPackets.size();
   SmallVector<Value *, 4> opVals;
   opVals.resize(numIndices);
   for (unsigned i = 0; i < packetWidth; ++i) {
@@ -3027,8 +3027,8 @@ ValuePacket Packetizer::Impl::packetizeFreeze(FreezeInst *FreezeI) {
   resC.getPacketValues(src);
   PACK_FAIL_IF(src.empty());
 
-  auto const packetWidth = src.size();
-  auto const name = FreezeI->getName();
+  const auto packetWidth = src.size();
+  const auto name = FreezeI->getName();
 
   IRBuilder<> B(FreezeI);
   for (unsigned i = 0; i < packetWidth; ++i) {
@@ -3261,7 +3261,7 @@ Value *Packetizer::Impl::vectorizeCall(CallInst *CI) {
 
   // Handle external builtins.
   compiler::utils::BuiltinInfo &BI = Ctx.builtins();
-  auto const Builtin = BI.analyzeBuiltinCall(*CI, Dimension);
+  const auto Builtin = BI.analyzeBuiltinCall(*CI, Dimension);
 
   if (Builtin.properties & compiler::utils::eBuiltinPropertyExecutionFlow) {
     return nullptr;
@@ -3369,7 +3369,7 @@ Value *Packetizer::Impl::vectorizeCall(CallInst *CI) {
 }
 
 Value *Packetizer::Impl::vectorizeWorkGroupCall(
-    CallInst *CI, compiler::utils::BuiltinCall const &Builtin) {
+    CallInst *CI, const compiler::utils::BuiltinCall &Builtin) {
   // Insert instructions after the call to the builtin, since they reference
   // the result of that call.
   IRBuilder<> B(buildAfter(CI, F));
@@ -3396,7 +3396,7 @@ Value *Packetizer::Impl::vectorizeWorkGroupCall(
   Value *Splat = B.CreateVectorSplat(SimdWidth, IDToSplat);
 
   // Add an index sequence [0, 1, 2, ...] to the value unless uniform.
-  auto const Uniformity = Builtin.uniformity;
+  const auto Uniformity = Builtin.uniformity;
   if (Uniformity == compiler::utils::eBuiltinUniformityInstanceID ||
       Uniformity == compiler::utils::eBuiltinUniformityMaybeInstanceID) {
     Value *StepVector =
@@ -3824,8 +3824,8 @@ ValuePacket Packetizer::Impl::packetizeInsertValue(
   Value *PackVal = packetizeIfVarying(Val);
   PACK_FAIL_IF(!PackVal);
 
-  bool const IsValVarying = Val != PackVal;
-  bool const IsAggregateVarying = Aggregate != PackAggregate;
+  const bool IsValVarying = Val != PackVal;
+  const bool IsAggregateVarying = Aggregate != PackAggregate;
   if (!IsAggregateVarying && IsValVarying) {
     // If the aggregate wasn't varying but the value was
     PackAggregate = packetize(Aggregate).getAsValue();
@@ -3882,7 +3882,7 @@ ValuePacket Packetizer::Impl::packetizeShuffleVector(
 
   ValuePacket results;
   IRBuilder<> B(buildAfter(Shuffle, F));
-  auto const scalarWidth = multi_llvm::getVectorNumElements(tyA);
+  const auto scalarWidth = multi_llvm::getVectorNumElements(tyA);
 
   if (SimdWidth.isScalable()) {
     PACK_FAIL_IF(packetWidth != 1);
@@ -3896,8 +3896,8 @@ ValuePacket Packetizer::Impl::packetizeShuffleVector(
       PACK_FAIL_IF(!isPowerOf2_32(scalarWidth));
       TargetInfo &VTI = Ctx.targetInfo();
 
-      auto const dstScalarWidth = multi_llvm::getVectorNumElements(ty);
-      auto const fullWidth = SimdWidth * dstScalarWidth;
+      const auto dstScalarWidth = multi_llvm::getVectorNumElements(ty);
+      const auto fullWidth = SimdWidth * dstScalarWidth;
 
       // If we're vector-predicating a vector access, scale the vector length
       // up by the original number of vector elements.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarization_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarization_pass.cpp
index c87695a9d29eb..36d93e9f64a6c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarization_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarization_pass.cpp
@@ -228,7 +228,7 @@ PreservedAnalyses ScalarizationPass::run(llvm::Function &F,
     // operands, scalarization can produce dead code, which will get removed
     // by later cleanup optimizations. Reductions are generally much better
     // off scalarized.
-    bool const scalable = VU.width().isScalable();
+    const bool scalable = VU.width().isScalable();
 
     OperandTracer tracer(UVR, scalable);
     for (Instruction *Leaf : Leaves) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
index d20d7794f5b32..dd8aa2e823738 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
@@ -225,7 +225,7 @@ Value *Scalarizer::scalarizeOperands(Instruction *I) {
     if (!Callee->isIntrinsic()) {
       // Check if this is indeed a printf call
       compiler::utils::BuiltinInfo &BI = Ctx.builtins();
-      auto const ID = BI.analyzeBuiltin(*Callee).ID;
+      const auto ID = BI.analyzeBuiltin(*Callee).ID;
       if (ID == BI.getPrintfBuiltin()) {
         return scalarizeOperandsPrintf(CI);
       }
@@ -272,7 +272,7 @@ Value *Scalarizer::scalarizeOperandsPrintf(CallInst *CI) {
   // Gather the operands for the new printf call, taking care to scalarize
   // any vector operands.
   llvm::SmallVector<Value *, 16> NewOps;
-  for (Use const &Op : CI->args()) {
+  for (const Use &Op : CI->args()) {
     // The first operand is the new format string
     if (Op == *CI->arg_begin()) {
       Constant *Zero = B.getInt32(0);
@@ -1304,12 +1304,12 @@ SimdPacket *Scalarizer::scalarizeCall(CallInst *CI, PacketMask PM) {
     Callee = originalFunc;
   }
 
-  auto const Builtin = BI.analyzeBuiltin(*Callee);
+  const auto Builtin = BI.analyzeBuiltin(*Callee);
   Function *ScalarEquiv = BI.getScalarEquivalent(Builtin, F.getParent());
   VECZ_STAT_FAIL_IF(!ScalarEquiv, VeczScalarizeFailBuiltin);
 
   IRBuilder<> B(CI);
-  auto const Props = Builtin.properties;
+  const auto Props = Builtin.properties;
   // Ignore the mask if present
   unsigned NumArgs = VectorCallMask ? CI->arg_size() - 1 : CI->arg_size();
   SmallVector<SimdPacket *, 4> OpPackets(NumArgs);
@@ -1535,8 +1535,8 @@ SimdPacket *Scalarizer::scalarizeGEP(GetElementPtrInst *GEP, PacketMask PM) {
   }
 
   IRBuilder<> B(GEP);
-  bool const inBounds = GEP->isInBounds();
-  auto const name = GEP->getName();
+  const bool inBounds = GEP->isInBounds();
+  const auto name = GEP->getName();
   SimdPacket *const P = getPacket(GEP, simdWidth);
   for (unsigned i = 0; i < simdWidth; i++) {
     if (!PM.isEnabled(i) || P->at(i)) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/squash_small_vectors_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/squash_small_vectors_pass.cpp
index e17d2b592a18f..b7f3f8a013f9c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/squash_small_vectors_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/squash_small_vectors_pass.cpp
@@ -45,8 +45,8 @@ PreservedAnalyses SquashSmallVectorsPass::run(Function &F,
                                               FunctionAnalysisManager &AM) {
   bool changed = false;
 
-  auto const &UVR = AM.getResult<UniformValueAnalysis>(F);
-  auto const &SAR = AM.getResult<StrideAnalysis>(F);
+  const auto &UVR = AM.getResult<UniformValueAnalysis>(F);
+  const auto &SAR = AM.getResult<StrideAnalysis>(F);
   auto &DL = F.getParent()->getDataLayout();
   auto &context = F.getContext();
 
@@ -88,10 +88,10 @@ PreservedAnalyses SquashSmallVectorsPass::run(Function &F,
 
         auto *const ty = load->getType();
         auto *const scalarTy = ty->getScalarType();
-        unsigned const numBits = ty->getPrimitiveSizeInBits();
+        const unsigned numBits = ty->getPrimitiveSizeInBits();
         if ((numBits & (numBits - 1)) == 0 && scalarTy != ty &&
             DL.fitsInLegalInteger(numBits)) {
-          auto const align = load->getAlign();
+          const auto align = load->getAlign();
           auto *const intTy = IntegerType::get(context, numBits);
           if (DL.getABITypeAlign(intTy) > align) {
             // The alignment of this type is too strict to convert
@@ -99,7 +99,7 @@ PreservedAnalyses SquashSmallVectorsPass::run(Function &F,
           }
 
           auto *const ptr = load->getPointerOperand();
-          auto const *const info = SAR.getInfo(ptr);
+          const auto *const info = SAR.getInfo(ptr);
           if (info && info->hasStride() &&
               info->getConstantMemoryStride(ty, &DL) == 1) {
             // No need to perform this transform on contiguous loads
@@ -107,7 +107,7 @@ PreservedAnalyses SquashSmallVectorsPass::run(Function &F,
           }
 
           IRBuilder<> B(load);
-          auto const name = load->getName();
+          const auto name = load->getName();
           auto *const newPtrTy =
               PointerType::get(intTy, ptr->getType()->getPointerAddressSpace());
           auto *const ptrCast = B.CreatePointerCast(
@@ -132,10 +132,10 @@ PreservedAnalyses SquashSmallVectorsPass::run(Function &F,
         auto *const data = store->getValueOperand();
         auto *const ty = data->getType();
         auto *const scalarTy = ty->getScalarType();
-        unsigned const numBits = ty->getPrimitiveSizeInBits();
+        const unsigned numBits = ty->getPrimitiveSizeInBits();
         if ((numBits & (numBits - 1)) == 0 && scalarTy != ty &&
             DL.fitsInLegalInteger(numBits)) {
-          auto const align = store->getAlign();
+          const auto align = store->getAlign();
           auto *const intTy = IntegerType::get(context, numBits);
           if (DL.getABITypeAlign(intTy) > align) {
             // The alignment of this type is too strict to convert
@@ -143,7 +143,7 @@ PreservedAnalyses SquashSmallVectorsPass::run(Function &F,
           }
 
           auto *const ptr = store->getPointerOperand();
-          auto const *const info = SAR.getInfo(ptr);
+          const auto *const info = SAR.getInfo(ptr);
           if (info && info->hasStride() &&
               info->getConstantMemoryStride(ty, &DL) == 1) {
             // No need to perform this transform on contiguous stores
@@ -197,8 +197,8 @@ PreservedAnalyses SquashSmallVectorsPass::run(Function &F,
             IRBuilder<> B(zext);
             Value *element = getSquashed(vector, intTy, B);
 
-            auto const bits = zext->getSrcTy()->getScalarSizeInBits();
-            auto const scaled =
+            const auto bits = zext->getSrcTy()->getScalarSizeInBits();
+            const auto scaled =
                 cast<ConstantInt>(indexOp)->getZExtValue() * bits;
 
             // Note on Little Endian systems, element 0 occupies the least
@@ -206,7 +206,7 @@ PreservedAnalyses SquashSmallVectorsPass::run(Function &F,
             // the most significant bits. Thus, we shift by "maximum element
             // number minus current element number" times by "number of bits
             // per element".
-            auto const shift =
+            const auto shift =
                 DL.isBigEndian()
                     ? intTy->getPrimitiveSizeInBits() - bits - scaled
                     : scaled;
@@ -246,11 +246,11 @@ PreservedAnalyses SquashSmallVectorsPass::run(Function &F,
             IRBuilder<> B(sext);
             Value *element = getSquashed(vector, intTy, B);
 
-            auto const bits = sext->getSrcTy()->getScalarSizeInBits();
-            auto const shiftr = intTy->getPrimitiveSizeInBits() - bits;
-            auto const scaled =
+            const auto bits = sext->getSrcTy()->getScalarSizeInBits();
+            const auto shiftr = intTy->getPrimitiveSizeInBits() - bits;
+            const auto scaled =
                 cast<ConstantInt>(indexOp)->getZExtValue() * bits;
-            auto const shiftl = DL.isBigEndian() ? scaled : shiftr - scaled;
+            const auto shiftl = DL.isBigEndian() ? scaled : shiftr - scaled;
 
             if (shiftl != 0) {
               element =
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/ternary_transform_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/ternary_transform_pass.cpp
index a70163986fcd2..8f5a73abece1d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/ternary_transform_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/ternary_transform_pass.cpp
@@ -36,7 +36,7 @@ namespace {
 /// memory op and there are no other users to GEP.
 /// Additionally, we reject various cases where the tranform would not result
 /// in better code.
-bool shouldTransform(SelectInst *Select, StrideAnalysisResult const &SAR) {
+bool shouldTransform(SelectInst *Select, const StrideAnalysisResult &SAR) {
   // The transform only applies to pointer selects.
   if (!Select->getType()->isPointerTy()) {
     return false;
@@ -50,7 +50,7 @@ bool shouldTransform(SelectInst *Select, StrideAnalysisResult const &SAR) {
   {
     // If the select itself is a strided pointer, we don't gain anything by
     // transforming it into a pair of masked memops.
-    auto const *info = SAR.getInfo(Select);
+    const auto *info = SAR.getInfo(Select);
     if (info && info->hasStride()) {
       return false;
     }
@@ -66,8 +66,8 @@ bool shouldTransform(SelectInst *Select, StrideAnalysisResult const &SAR) {
   // only scalar Mask Varying memops, instead of vector memops.
   if (SAR.UVR.isVarying(VecTrue) || SAR.UVR.isVarying(VecFalse)) {
     // Both pointers must be either strided or uniform (i.e. not divergent).
-    auto const *infoT = SAR.getInfo(VecTrue);
-    auto const *infoF = SAR.getInfo(VecFalse);
+    const auto *infoT = SAR.getInfo(VecTrue);
+    const auto *infoF = SAR.getInfo(VecFalse);
     if (!infoT || !infoF || infoT->mayDiverge() || infoF->mayDiverge()) {
       return false;
     }
@@ -97,7 +97,7 @@ bool shouldTransform(SelectInst *Select, StrideAnalysisResult const &SAR) {
 
     // Validate the GEP indices
     for (Value *idx : GEP->indices()) {
-      auto const *info = SAR.getInfo(idx);
+      const auto *info = SAR.getInfo(idx);
       if (!info || info->mayDiverge()) {
         return false;
       }
@@ -205,7 +205,7 @@ void Transform(SelectInst *Select, VectorizationContext &Ctx) {
 
 PreservedAnalyses TernaryTransformPass::run(llvm::Function &F,
                                             llvm::FunctionAnalysisManager &AM) {
-  auto const &SAR = AM.getResult<StrideAnalysis>(F);
+  const auto &SAR = AM.getResult<StrideAnalysis>(F);
 
   // Find selects that can be transformed
   SmallVector<SelectInst *, 4> Selects;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/uniform_reassociation_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/uniform_reassociation_pass.cpp
index 99cdd0a35a22e..e08c7fc0981f0 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/uniform_reassociation_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/uniform_reassociation_pass.cpp
@@ -155,7 +155,7 @@ bool Reassociator::reassociate(llvm::BinaryOperator &Op) {
     return false;
   }
 
-  auto const Opcode = Op.getOpcode();
+  const auto Opcode = Op.getOpcode();
   auto *const LHS = Op.getOperand(0);
   auto *const RHS = Op.getOperand(1);
 
@@ -232,7 +232,7 @@ bool Reassociator::run(llvm::Function &F, llvm::FunctionAnalysisManager &AM) {
     for (auto Iit = BB->begin(); Iit != BB->end();) {
       auto &I = *(Iit++);
       if (auto *BinOp = dyn_cast<BinaryOperator>(&I)) {
-        auto const form = canonicalizeBinOp(*BinOp);
+        const auto form = canonicalizeBinOp(*BinOp);
         if (form == OpForm::Varying || form == OpForm::Mixed) {
           reassociate(*BinOp);
         }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
index 4069f46e14b1b..49934f83c7dc5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
@@ -92,7 +92,7 @@ Value *TargetInfo::createLoad(IRBuilder<> &B, Type *Ty, Value *Ptr,
   if (CIntStride && CIntStride->getSExtValue() == 1) {
     if (EVL) {
       const Function *F = B.GetInsertBlock()->getParent();
-      auto const Legality = isVPLoadLegal(F, Ty, Alignment);
+      const auto Legality = isVPLoadLegal(F, Ty, Alignment);
       if (!Legality.isVPLegal()) {
         emitVeczRemarkMissed(F,
                              "Could not create a VP load as the target "
@@ -160,7 +160,7 @@ Value *TargetInfo::createStore(IRBuilder<> &B, Value *Data, Value *Ptr,
     Value *VecPtr = B.CreateBitCast(Ptr, VecPtrTy);
     if (EVL) {
       const Function *F = B.GetInsertBlock()->getParent();
-      auto const Legality = isVPStoreLegal(F, VecTy, Alignment);
+      const auto Legality = isVPStoreLegal(F, VecTy, Alignment);
       if (!Legality.isVPLegal()) {
         emitVeczRemarkMissed(F,
                              "Could not create a VP store as the target "
@@ -232,7 +232,7 @@ Value *TargetInfo::createMaskedLoad(IRBuilder<> &B, Type *Ty, Value *Ptr,
     PtrTy = Ty->getPointerTo(PtrTy->getAddressSpace());
     Ptr = B.CreateBitCast(Ptr, PtrTy);
     const Function *F = B.GetInsertBlock()->getParent();
-    auto const Legality = isVPLoadLegal(F, Ty, Alignment);
+    const auto Legality = isVPLoadLegal(F, Ty, Alignment);
     if (EVL && Legality.isVPLegal()) {
       SmallVector<llvm::Value *, 2> Args = {Ptr, Mask, EVL};
       SmallVector<llvm::Type *, 2> Tys = {Ty, PtrTy};
@@ -249,7 +249,7 @@ Value *TargetInfo::createMaskedLoad(IRBuilder<> &B, Type *Ty, Value *Ptr,
     }
   }
 
-  unsigned const Width = 1;
+  const unsigned Width = 1;
 
   LLVMContext &Ctx = B.getContext();
   BasicBlock *Entry = B.GetInsertBlock();
@@ -339,7 +339,7 @@ Value *TargetInfo::createMaskedStore(IRBuilder<> &B, Value *Data, Value *Ptr,
     PtrTy = DataTy->getPointerTo(PtrTy->getAddressSpace());
     Ptr = B.CreateBitCast(Ptr, PtrTy);
     const Function *F = B.GetInsertBlock()->getParent();
-    auto const Legality = isVPStoreLegal(F, DataTy, Alignment);
+    const auto Legality = isVPStoreLegal(F, DataTy, Alignment);
     if (EVL && Legality.isVPLegal()) {
       SmallVector<llvm::Value *, 3> Args = {Data, Ptr, Mask, EVL};
       SmallVector<llvm::Type *, 2> Tys = {Data->getType(), PtrTy};
@@ -356,7 +356,7 @@ Value *TargetInfo::createMaskedStore(IRBuilder<> &B, Value *Data, Value *Ptr,
     }
   }
 
-  unsigned const Width = 1;
+  const unsigned Width = 1;
 
   LLVMContext &Ctx = B.getContext();
   BasicBlock *Entry = B.GetInsertBlock();
@@ -498,7 +498,7 @@ Value *TargetInfo::createMaskedGatherLoad(IRBuilder<> &B, Type *Ty, Value *Ptr,
   Constant *DefaultEleData = UndefValue::get(EleTy);
 
   if (Ty->isVectorTy()) {
-    auto const Legality = isVPGatherLegal(F, Ty, Alignment);
+    const auto Legality = isVPGatherLegal(F, Ty, Alignment);
     if (EVL && Legality.isVPLegal()) {
       SmallVector<llvm::Value *, 2> Args = {Ptr, Mask, EVL};
       SmallVector<llvm::Type *, 2> Tys = {Ty, VecPtrTy};
@@ -596,7 +596,7 @@ Value *TargetInfo::createMaskedScatterStore(IRBuilder<> &B, Value *Data,
   if (DataTy->isVectorTy()) {
     auto *VecPtrTy = dyn_cast<VectorType>(Ptr->getType());
     VECZ_FAIL_IF(!VecPtrTy);
-    auto const Legality = isVPScatterLegal(F, DataTy, Alignment);
+    const auto Legality = isVPScatterLegal(F, DataTy, Alignment);
     if (EVL && Legality.isVPLegal()) {
       SmallVector<llvm::Value *, 3> Args = {Data, Ptr, Mask, EVL};
       SmallVector<llvm::Type *, 2> Tys = {Data->getType(), VecPtrTy};
@@ -671,7 +671,7 @@ Value *TargetInfo::createScalableExtractElement(IRBuilder<> &B,
                                                 Type *narrowTy, Value *src,
                                                 Value *index, Value *VL) const {
   (void)VL;
-  auto const *origSrc = extract->getOperand(0);
+  const auto *origSrc = extract->getOperand(0);
   auto *eltTy = src->getType()->getScalarType();
 
   auto *wideTy = src->getType();
@@ -695,7 +695,7 @@ Value *TargetInfo::createScalableExtractElement(IRBuilder<> &B,
   auto *const bcastalloc =
       B.CreatePointerBitCastOrAddrSpaceCast(alloc, eltptrTy, "bcast.alloc");
 
-  unsigned const fixedVecElts =
+  const unsigned fixedVecElts =
       multi_llvm::getVectorNumElements(origSrc->getType());
 
   Value *load = nullptr;
@@ -796,8 +796,8 @@ Value *TargetInfo::createBroadcastIndexVector(IRBuilder<> &B, Type *ty,
                                               ElementCount factor, bool URem,
                                               const llvm::Twine &N) {
   auto *const steps = B.CreateStepVector(ty, "idx0");
-  auto const tyEC = multi_llvm::getVectorElementCount(ty);
-  unsigned const factorMinVal = factor.getKnownMinValue();
+  const auto tyEC = multi_llvm::getVectorElementCount(ty);
+  const unsigned factorMinVal = factor.getKnownMinValue();
 
   unsigned fixedAmt;
   Instruction::BinaryOps Opc;
@@ -842,7 +842,7 @@ Value *TargetInfo::createScalableInsertElement(IRBuilder<> &B,
   auto *const bcastalloc =
       B.CreatePointerBitCastOrAddrSpaceCast(alloc, eltptrTy, "bcast.alloc");
 
-  unsigned const fixedVecElts =
+  const unsigned fixedVecElts =
       multi_llvm::getVectorNumElements(insert->getOperand(0)->getType());
 
   // Construct the index, either by packetizing if (if varying) or by
@@ -907,7 +907,7 @@ TargetInfo::VPMemOpLegality TargetInfo::checkMemOpLegality(
         Checker,
     Type *Ty, unsigned Alignment) const {
   assert(Ty->isVectorTy() && "Expected a vector type");
-  bool const isMaskLegal =
+  const bool isMaskLegal =
       !(isa<ScalableVectorType>(Ty) && TM_) ||
       Checker(TM_->getTargetTransformInfo(*F), Ty, Alignment);
   // Assuming a pointer bit width of 64
@@ -966,8 +966,8 @@ llvm::Value *TargetInfo::createVectorShuffle(llvm::IRBuilder<> &B,
   // The alloca must be inserted at the beginning of the function.
   auto *const curBlock = B.GetInsertBlock();
   auto &entryBlock = curBlock->getParent()->getEntryBlock();
-  auto const allocaIt = entryBlock.getFirstInsertionPt();
-  auto const it = B.GetInsertPoint();
+  const auto allocaIt = entryBlock.getFirstInsertionPt();
+  const auto it = B.GetInsertPoint();
 
   B.SetInsertPoint(&entryBlock, allocaIt);
   auto *const alloc = B.CreateAlloca(srcTy, nullptr);
@@ -988,14 +988,14 @@ llvm::Value *TargetInfo::createVectorShuffle(llvm::IRBuilder<> &B,
   // Index into the allocation.
   auto *const gep = B.CreateInBoundsGEP(eltTy, bcastalloc, mask, "vec.alloc");
 
-  auto const eltCount = maskTy->getElementCount();
+  const auto eltCount = maskTy->getElementCount();
   auto *const dstTy = VectorType::get(eltTy, eltCount);
-  auto const alignment =
+  const auto alignment =
       MaybeAlign(eltTy->getScalarSizeInBits() / 8).valueOrOne();
 
   Value *gatherMask = nullptr;
   if (evl) {
-    auto const EC = srcTy->getElementCount();
+    const auto EC = srcTy->getElementCount();
     auto *const IndexTy = VectorType::get(evl->getType(), EC);
     auto *const step = B.CreateStepVector(IndexTy);
     gatherMask = B.CreateICmpULT(step, B.CreateVectorSplat(EC, evl));
@@ -1016,10 +1016,10 @@ llvm::Value *TargetInfo::createVectorSlideUp(llvm::IRBuilder<> &B,
          "TargetInfo::createVectorShuffle: source must have vector type");
 
   auto *const undef = UndefValue::get(srcTy);
-  auto const EC = srcTy->getElementCount();
+  const auto EC = srcTy->getElementCount();
   if (!EC.isScalable()) {
     // Special case for fixed-width vectors
-    auto const width = EC.getFixedValue();
+    const auto width = EC.getFixedValue();
     SmallVector<int, 16> mask(width);
     auto it = mask.begin();
     *it++ = 0;
@@ -1291,7 +1291,7 @@ unsigned TargetInfo::estimateSimdWidth(const TargetTransformInfo &TTI,
             ? TM_->getPointerSizeInBits(Ty->getPointerAddressSpace())
             : VI->getType()->getPrimitiveSizeInBits();
   }
-  unsigned const MaxBits = MaxVecRegBitWidth * NumVecRegs;
+  const unsigned MaxBits = MaxVecRegBitWidth * NumVecRegs;
   while (VaryingUsage * width > MaxBits) {
     width >>= 1;
   }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp
index e26cbd895cf4c..79690f5f8ab6e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp
@@ -110,7 +110,7 @@ bool TargetInfoRISCV::isVectorTypeLegal(Type *Ty) const {
   // before we can enable this for Int1Ty as well.
   bool isLegal = isLegalVPElementType(multi_llvm::getVectorElementType(Ty));
   if (isLegal) {
-    uint32_t const MinSize =
+    const uint32_t MinSize =
         multi_llvm::getVectorElementCount(Ty).getKnownMinValue();
     isLegal = isPowerOf2_32(MinSize) &&
               MinSize * Ty->getScalarSizeInBits() <= MaxLegalVectorTypeBits;
@@ -201,7 +201,7 @@ bool TargetInfoRISCV::isOperationLegal(llvm::Intrinsic::ID ID,
 
 namespace {
 static unsigned getRISCVBits(const TargetMachine *TM) {
-  auto const &Triple = TM->getTargetTriple();
+  const auto &Triple = TM->getTargetTriple();
   return Triple.isArch32Bit() ? 32 : 64;
 }
 
@@ -309,8 +309,8 @@ llvm::Value *TargetInfoRISCV::createScalableExtractElement(
   unsigned intrIdxBitWidth;
   std::tie(intrinsicID, intrIdxBitWidth) = getGatherIntrinsic(srcTy);
 
-  auto const srcEC = multi_llvm::getVectorElementCount(srcTy);
-  auto const resEC = multi_llvm::getVectorElementCount(narrowTy);
+  const auto srcEC = multi_llvm::getVectorElementCount(srcTy);
+  const auto resEC = multi_llvm::getVectorElementCount(narrowTy);
 
   auto *const indexEltTy = B.getIntNTy(intrIdxBitWidth);
   Type *const indexVecTy = VectorType::get(indexEltTy, resEC);
@@ -325,8 +325,8 @@ llvm::Value *TargetInfoRISCV::createScalableExtractElement(
   auto *const avl = getIntrinsicVL(B, VL, narrowTy, getTargetMachine());
 
   auto *indexTy = index->getType();
-  bool const isIdxVector = indexTy->isVectorTy();
-  unsigned const idxBitWidth = indexTy->getScalarSizeInBits();
+  const bool isIdxVector = indexTy->isVectorTy();
+  const unsigned idxBitWidth = indexTy->getScalarSizeInBits();
 
   // The intrinsic may demand a larger index type than we currently have;
   // extend up to the right type.
@@ -490,9 +490,9 @@ llvm::Value *TargetInfoRISCV::createScalableInsertElement(
   std::tie(intrinsicID, intrIdxBitWidth) =
       getGatherIntrinsic(intoTy, /*isMasked*/ true);
 
-  auto const eltEC = multi_llvm::getVectorElementCount(eltTy);
-  auto const intoEC = multi_llvm::getVectorElementCount(intoTy);
-  auto const fixedAmt =
+  const auto eltEC = multi_llvm::getVectorElementCount(eltTy);
+  const auto intoEC = multi_llvm::getVectorElementCount(intoTy);
+  const auto fixedAmt =
       multi_llvm::getVectorElementCount(origInsert->getType());
   assert(!fixedAmt.isScalable() && "Scalable pre-packetized value?");
 
@@ -509,8 +509,8 @@ llvm::Value *TargetInfoRISCV::createScalableInsertElement(
   auto *const avl = getIntrinsicVL(B, VL, intoTy, getTargetMachine());
 
   auto *const indexTy = index->getType();
-  unsigned const idxBitWidth = indexTy->getScalarSizeInBits();
-  bool const indexIsVector = indexTy->isVectorTy();
+  const unsigned idxBitWidth = indexTy->getScalarSizeInBits();
+  const bool indexIsVector = indexTy->isVectorTy();
 
   // The intrinsic may demand a larger index type than we currently have;
   // extend up to the right type.
@@ -585,8 +585,8 @@ llvm::Value *TargetInfoRISCV::createVectorShuffle(llvm::IRBuilder<> &B,
   }
 
   auto *const maskTy = cast<VectorType>(mask->getType());
-  auto const srcEC = multi_llvm::getVectorElementCount(srcTy);
-  auto const resEC = multi_llvm::getVectorElementCount(maskTy);
+  const auto srcEC = multi_llvm::getVectorElementCount(srcTy);
+  const auto resEC = multi_llvm::getVectorElementCount(maskTy);
 
   auto *const resTy = VectorType::get(srcTy->getElementType(), resEC);
 
@@ -619,10 +619,10 @@ llvm::Value *TargetInfoRISCV::createVectorShuffle(llvm::IRBuilder<> &B,
 
   auto *const zero = B.getInt64(0);
 
-  bool const same = (resEC == srcEC);
-  bool const narrow = !same && (srcEC.isScalable() || !resEC.isScalable()) &&
+  const bool same = (resEC == srcEC);
+  const bool narrow = !same && (srcEC.isScalable() || !resEC.isScalable()) &&
                       resEC.getKnownMinValue() <= srcEC.getKnownMinValue();
-  bool const widen = !same && (resEC.isScalable() || !srcEC.isScalable()) &&
+  const bool widen = !same && (resEC.isScalable() || !srcEC.isScalable()) &&
                      srcEC.getKnownMinValue() <= resEC.getKnownMinValue();
 
   assert((srcTy == resTy || narrow || widen) &&
@@ -675,7 +675,7 @@ llvm::Value *TargetInfoRISCV::createVectorSlideUp(llvm::IRBuilder<> &B,
     return TargetInfo::createVectorSlideUp(B, src, insert, VL);
   }
 
-  auto const intrinsicID = getSlideUpIntrinsic(srcTy);
+  const auto intrinsicID = getSlideUpIntrinsic(srcTy);
 
   auto *const avl = getIntrinsicVL(B, VL, srcTy, getTargetMachine());
 
@@ -711,7 +711,7 @@ Value *TargetInfoRISCV::createVPKernelWidth(IRBuilder<> &B,
   if (WidestEltTy < 8 || WidestEltTy > 64 || !isPowerOf2_32(WidestEltTy)) {
     return nullptr;
   }
-  auto const KnownMin = VF.getKnownMinValue();
+  const auto KnownMin = VF.getKnownMinValue();
   // The vectorization factor must be scalable and a legal vsetvli amount: no
   // greater than the maximum vector length for each element width:
   // nx64vi8,nx32vi16,nx16vi32,nxv8i64
@@ -721,14 +721,14 @@ Value *TargetInfoRISCV::createVPKernelWidth(IRBuilder<> &B,
   }
 
   unsigned LMUL = 0;
-  unsigned const MaxLegalElementWidth = 64;
+  const unsigned MaxLegalElementWidth = 64;
 
   if ((WidestEltTy * KnownMin) / MaxLegalElementWidth) {
     // Non-fractional LMULs
     LMUL = Log2_64((WidestEltTy * KnownMin) / MaxLegalElementWidth);
   } else {
     // Fractional LMULs
-    auto const Fraction = MaxLegalElementWidth / (WidestEltTy * KnownMin);
+    const auto Fraction = MaxLegalElementWidth / (WidestEltTy * KnownMin);
     if (Fraction == 2) {
       LMUL = LMUL_F2;
     } else if (Fraction == 4) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
index d31b6cb3aa921..9264c18f0cedb 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
@@ -126,15 +126,15 @@ bool VectorizationContext::canExpandBuiltin(const Function *ScalarFn) const {
 VectorizationResult &VectorizationContext::getOrCreateBuiltin(
     llvm::Function &F, unsigned SimdWidth) {
   compiler::utils::BuiltinInfo &BI = builtins();
-  auto const Cached = VectorizedBuiltins.find(&F);
+  const auto Cached = VectorizedBuiltins.find(&F);
   if (Cached != VectorizedBuiltins.end()) {
-    auto const Found = Cached->second.find(SimdWidth);
+    const auto Found = Cached->second.find(SimdWidth);
     if (Found != Cached->second.end()) {
       return Found->second;
     }
   }
 
-  auto const Builtin = BI.analyzeBuiltin(F);
+  const auto Builtin = BI.analyzeBuiltin(F);
 
   // Try to find a vector equivalent for the builtin.
   Function *const VectorCallee =
@@ -151,7 +151,7 @@ VectorizationResult &VectorizationContext::getOrCreateBuiltin(
   result.func = VectorCallee;
 
   // Gather information about the function's arguments.
-  auto const Props = Builtin.properties;
+  const auto Props = Builtin.properties;
   unsigned i = 0;
   for (Argument &Arg : F.args()) {
     Type *pointerRetPointeeTy = nullptr;
@@ -181,7 +181,7 @@ VectorizationResult VectorizationContext::getVectorizedFunction(
 
   auto simdWidth = factor.getFixedValue();
   if (auto *vecTy = dyn_cast<FixedVectorType>(callee.getReturnType())) {
-    auto const Builtin = BI.analyzeBuiltin(callee);
+    const auto Builtin = BI.analyzeBuiltin(callee);
     Function *scalarEquiv = builtins().getScalarEquivalent(Builtin, &Module);
     if (!scalarEquiv) {
       ++VeczContextFailScalarizeCall;
@@ -237,7 +237,7 @@ Function *VectorizationContext::getOrCreateMaskedFunction(CallInst *CI) {
   // called Function because the called Function might be a VarArg function, in
   // which case we need to create the wrapper with the expanded argument list.
   SmallVector<Type *, 8> argTys;
-  for (auto const &U : CI->args()) {
+  for (const auto &U : CI->args()) {
     argTys.push_back(U->getType());
   }
   AttributeList fnAttrs = F->getAttributes();
@@ -524,7 +524,7 @@ VectorizationContext::isMaskedAtomicFunction(const Function &F) const {
 
 Function *VectorizationContext::getOrCreateMaskedAtomicFunction(
     MaskedAtomic &I, const VectorizationChoices &Choices, ElementCount VF) {
-  bool const isCmpXchg = I.isCmpXchg();
+  const bool isCmpXchg = I.isCmpXchg();
   LLVMContext &ctx = I.ValTy->getContext();
 
   SmallVector<Type *, 8> argTys;
@@ -753,7 +753,7 @@ bool VectorizationContext::defineInternalBuiltin(Function *F) {
 }
 
 bool VectorizationContext::emitMaskedMemOpBody(Function &F,
-                                               MemOpDesc const &Desc) const {
+                                               const MemOpDesc &Desc) const {
   Value *Data = Desc.getDataOperand(&F);
   Value *Ptr = Desc.getPointerOperand(&F);
   Value *Mask = Desc.getMaskOperand(&F);
@@ -776,20 +776,20 @@ bool VectorizationContext::emitMaskedMemOpBody(Function &F,
 }
 
 bool VectorizationContext::emitInterleavedMemOpBody(
-    Function &F, MemOpDesc const &Desc) const {
+    Function &F, const MemOpDesc &Desc) const {
   return emitMaskedInterleavedMemOpBody(F, Desc);
 }
 
 bool VectorizationContext::emitMaskedInterleavedMemOpBody(
-    Function &F, MemOpDesc const &Desc) const {
+    Function &F, const MemOpDesc &Desc) const {
   Value *Data = Desc.getDataOperand(&F);
   auto *const Ptr = Desc.getPointerOperand(&F);
   VECZ_FAIL_IF(!isa<VectorType>(Desc.getDataType()) || !Ptr);
 
   auto *const Mask = Desc.getMaskOperand(&F);
   auto *const VL = Desc.isVLOp() ? Desc.getVLOperand(&F) : nullptr;
-  auto const Align = Desc.getAlignment();
-  auto const Stride = Desc.getStride();
+  const auto Align = Desc.getAlignment();
+  const auto Stride = Desc.getStride();
 
   BasicBlock *Entry = BasicBlock::Create(F.getContext(), "entry", &F);
   IRBuilder<> B(Entry);
@@ -816,12 +816,12 @@ bool VectorizationContext::emitMaskedInterleavedMemOpBody(
 }
 
 bool VectorizationContext::emitScatterGatherMemOpBody(
-    Function &F, MemOpDesc const &Desc) const {
+    Function &F, const MemOpDesc &Desc) const {
   return emitMaskedScatterGatherMemOpBody(F, Desc);
 }
 
 bool VectorizationContext::emitMaskedScatterGatherMemOpBody(
-    Function &F, MemOpDesc const &Desc) const {
+    Function &F, const MemOpDesc &Desc) const {
   Value *Data = Desc.getDataOperand(&F);
   auto *const VecDataTy = dyn_cast<VectorType>(Desc.getDataType());
   auto *const Ptr = Desc.getPointerOperand(&F);
@@ -829,7 +829,7 @@ bool VectorizationContext::emitMaskedScatterGatherMemOpBody(
 
   auto *const Mask = Desc.getMaskOperand(&F);
   auto *const VL = Desc.isVLOp() ? Desc.getVLOperand(&F) : nullptr;
-  auto const Align = Desc.getAlignment();
+  const auto Align = Desc.getAlignment();
 
   BasicBlock *Entry = BasicBlock::Create(F.getContext(), "entry", &F);
   IRBuilder<> B(Entry);
@@ -914,7 +914,7 @@ bool VectorizationContext::emitSubgroupScanBody(Function &F, bool IsInclusive,
   // If it's not a scalable vector, we can do it the fast way.
   if (!EC.isScalable() && !IsVP) {
     auto *const NeutralVal = compiler::utils::getNeutralVal(OpKind, EltTy);
-    auto const Width = EC.getFixedValue();
+    const auto Width = EC.getFixedValue();
     auto *const UndefVal = UndefValue::get(VecTy);
 
     // Put the Neutral element in a vector so we can shuffle it in.
@@ -936,14 +936,14 @@ bool VectorizationContext::emitSubgroupScanBody(Function &F, bool IsInclusive,
       // xxxx3333xxxxBBBB
       // xxxxxxxx77777777
       //
-      auto const N2 = N << 1u;
+      const auto N2 = N << 1u;
       auto MaskIt = mask.begin();
       for (size_t i = 0; i < Width; i += N2) {
         for (size_t j = 0; j < N; ++j) {
           *MaskIt++ = Width;
         }
 
-        auto const k = i + N - 1;
+        const auto k = i + N - 1;
         for (size_t j = 0; j < N; ++j) {
           *MaskIt++ = k;
         }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_helpers.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_helpers.cpp
index b65a9c793f704..77c0264ef87ff 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_helpers.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_helpers.cpp
@@ -37,9 +37,9 @@ using namespace vecz;
 
 namespace {
 
-Function *declareFunction(VectorizationUnit const &VU) {
+Function *declareFunction(const VectorizationUnit &VU) {
   Module &Module = VU.context().module();
-  Function const *const ScalarFn = VU.scalarFunction();
+  const Function *const ScalarFn = VU.scalarFunction();
   ElementCount SimdWidth = VU.width();
 
   // For kernels, the vectorized function type is is the same as the original
@@ -66,7 +66,7 @@ Function *declareFunction(VectorizationUnit const &VU) {
 /// searches for the node that contains the scalar kernel, and copies all its
 /// metadata, which the exception of the Function itself, which is replaced by
 /// the vectorized kernel.
-void cloneOpenCLNamedMetadataHelper(VectorizationUnit const &VU,
+void cloneOpenCLNamedMetadataHelper(const VectorizationUnit &VU,
                                     const std::string &NodeName) {
   Module &M = VU.context().module();
 
@@ -121,10 +121,10 @@ void cloneOpenCLNamedMetadataHelper(VectorizationUnit const &VU,
 ///
 /// @param[in,out] ValueMap Map to update with the arguments.
 SmallVector<Instruction *, 2> createArgumentPlaceholders(
-    VectorizationUnit const &VU, Function *VecFunc,
+    const VectorizationUnit &VU, Function *VecFunc,
     ValueToValueMapTy &ValueMap) {
   SmallVector<Instruction *, 2> Placeholders;
-  auto const &Arguments = VU.arguments();
+  const auto &Arguments = VU.arguments();
   unsigned i = 0u;
   for (Argument &DstArg : VecFunc->args()) {
     Argument *SrcArg = Arguments[i++].OldArg;
@@ -192,7 +192,7 @@ decodeVectorizedFunctionName(StringRef Name) {
   return std::make_tuple(Name.str(), VF, Choices);
 }
 
-Function *cloneFunctionToVector(VectorizationUnit const &VU) {
+Function *cloneFunctionToVector(const VectorizationUnit &VU) {
   auto *const VectorizedFn = declareFunction(VU);
   VECZ_ERROR_IF(!VectorizedFn, "declareFunction failed to initialize");
 
@@ -266,7 +266,7 @@ static DILocation *getDILocation(unsigned Line, unsigned Column, MDNode *Scope,
                          /*ImplicitCode*/ false);
 }
 
-void cloneDebugInfo(VectorizationUnit const &VU) {
+void cloneDebugInfo(const VectorizationUnit &VU) {
   DISubprogram *const ScalarDI = VU.scalarFunction()->getSubprogram();
   // We don't have debug info
   if (!ScalarDI) {
@@ -430,7 +430,7 @@ void cloneDebugInfo(VectorizationUnit const &VU) {
   return;
 }
 
-void cloneOpenCLMetadata(VectorizationUnit const &VU) {
+void cloneOpenCLMetadata(const VectorizationUnit &VU) {
   cloneOpenCLNamedMetadataHelper(VU, "opencl.kernels");
   cloneOpenCLNamedMetadataHelper(VU, "opencl.kernel_wg_size_info");
 }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_heuristics.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_heuristics.cpp
index 24132efaabc1e..7de2788767b1b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_heuristics.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_heuristics.cpp
@@ -163,7 +163,7 @@ Heuristics::BrClauseKind Heuristics::shouldVectorizeVisitCmpOperands(
 const Value *Heuristics::shouldVectorizeVisitCmpOperand(
     const Value *Val, const CmpInst *Cmp,
     DenseMap<const Value *, const Value *> &Cache) const {
-  auto const It = Cache.find(Val);
+  const auto It = Cache.find(Val);
   if (It != Cache.end()) {
     return It->second;
   }
@@ -209,7 +209,7 @@ const Value *Heuristics::shouldVectorizeVisitCmpOperand(
   if (const CallInst *CI = dyn_cast<const CallInst>(Val)) {
     // We only care if the CallInst does involve a call to a work-item builtin.
     compiler::utils::BuiltinInfo &BI = Ctx.builtins();
-    auto const Uniformity = BI.analyzeBuiltinCall(*CI, SimdDimIdx).uniformity;
+    const auto Uniformity = BI.analyzeBuiltinCall(*CI, SimdDimIdx).uniformity;
     if (Uniformity == compiler::utils::eBuiltinUniformityInstanceID ||
         Uniformity == compiler::utils::eBuiltinUniformityMaybeInstanceID) {
       return (Cache[Val] = CI);
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorizer.cpp
index f9a2adf8e8ad6..9a19584d1b069 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorizer.cpp
@@ -316,7 +316,7 @@ void vecz::trackVeczSuccessFailure(VectorizationUnit &VU) {
   collectStatistics(VU, Fn, vectorizedFn);
 
   if (VeczDumpReport) {
-    auto const VF = VU.width();
+    const auto VF = VU.width();
     auto FnName = Fn->getName();
     if (vectorizedFn) {
       errs() << "vecz: Vectorization succeeded for kernel '" << FnName
@@ -341,8 +341,8 @@ bool vecz::createVectorizedFunctionMetadata(VectorizationUnit &vu) {
     // kernels.
     cloneOpenCLMetadata(vu);
   }
-  auto const vf = vu.width();
-  auto const dim = vu.dimension();
+  const auto vf = vu.width();
+  const auto dim = vu.dimension();
 
   // emit output metadata based on vectorization result
   auto finalVF = compiler::utils::VectorizationFactor(vf.getKnownMinValue(),
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp
index d60b154774b13..4cce6e70d81d9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp
@@ -170,7 +170,7 @@ static vecz::VeczPassOptions getDefaultPassOptions() {
     llvm::EnableStatistics(true);
   }
 
-  auto const factor = SIMDWidth ? SIMDWidth : 4;
+  const auto factor = SIMDWidth ? SIMDWidth : 4;
   auto VF = compiler::utils::VectorizationFactor::getFixedWidth(factor);
   if (VeczSimdWidth) {
     VF.setKnownMin(VeczSimdWidth);

From f993b3200de5cda16017836bbcdf51c33519cb38 Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Tue, 16 Jan 2024 16:53:26 +0000
Subject: [PATCH 084/182] [tests] Update for LLVM 18.

This updates tests to pass with LLVM 18.

* link-builtins-32.ll, link-builtins-64.ll:
  Allow and ignore extra attributes. As the comment says, we do not care
  what the function looks like, just that it is there.
* broadcast_vector.ll, gep_duplication.ll:
  LLVM 18 intentionally limits how much InstCombine does in order to
  speed up compilation times, after noticing that running multiple
  iterations only rarely produced better results, and in those cases
  where it did produce better results, usually the fact that InstCombine
  is run at multiple points in the pipeline allows a later pass to
  optimize it anyway. In broadcast_vector.ll, LLVM 18 no longer infers
  16-byte alignment for an i32 store, which should not concern us:
  16-byte alignment there will not result in better code. In
  gep_duplication.ll, multiple iterations are needed to optimize the
  check of global ID, so simplify the LLVM IR to something that can be
  handled in a single iteration.
* widen_fmin_vector_scalar.ll, expect_assume.ll,
  instantiate_constants.ll:
  LLVM 18 builds vectors from poison, not from undef. Allow this.
* interleaved_load_ooo.ll:
  LLVM 18 has, and uses, a disjoint flag on `or` instructions when the two
  operands are known not to have bits in common. We cannot use this in
  the test because then the test would break for LLVM 17. Use an add
  instruction instead.
* loop_call_instantiation.ll:
  LLVM 18 infers the nuw and nsw attributes for two add instructions.
  Allow this.
* partial_linearization*.ll:
  LLVM 18 does better value range inference based on ranges used for control
  flow. Tweak the tests so that no such inference can be done.
---
 .../lit/llvm/Boscc/partial_linearization17.ll | 50 ++++++++++---------
 .../lit/llvm/Boscc/partial_linearization2.ll  |  4 +-
 .../lit/llvm/Boscc/partial_linearization3.ll  |  4 +-
 .../llvm/ScalableVectors/broadcast_vector.ll  |  2 +-
 .../widen_fmin_vector_scalar.ll               |  2 +-
 .../vecz/test/lit/llvm/expect_assume.ll       |  2 +-
 .../vecz/test/lit/llvm/gep_duplication.ll     | 11 ++--
 .../test/lit/llvm/instantiate_constants.ll    |  2 +-
 .../test/lit/llvm/interleaved_load_ooo.ll     | 10 +++-
 .../test/lit/llvm/loop_call_instantiation.ll  |  4 +-
 .../test/lit/llvm/partial_linearization17.ll  | 43 ++++++++--------
 .../test/lit/llvm/partial_linearization2.ll   |  4 +-
 .../test/lit/llvm/partial_linearization3.ll   |  4 +-
 13 files changed, 76 insertions(+), 66 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization17.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization17.ll
index 5e7b83b787240..8f9420633ae28 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization17.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization17.ll
@@ -66,7 +66,7 @@
 ;
 ; where '&' represents merge blocks of BOSCC regions.
 ;
-; __kernel void partial_linearization17(__global int *out, int n) {
+; __kernel void partial_linearization17(__global int *out, int n, int x) {
 ;   int id = get_global_id(0);
 ;   int ret = 0;
 ;   int i = 0;
@@ -74,7 +74,7 @@
 ;   while (1) {
 ;     if (n > 10) {
 ;       goto c;
-;     } else if (n > 5) {
+;     } else if (n < 5) {
 ;       goto f;
 ;     }
 ;     if (id + i++ % 2 == 0) {
@@ -87,12 +87,12 @@
 ;   goto m;
 ;
 ; f:
-;   ret += n * 2;
-;   for (int i = 0; i < n * 2; i++) ret += i;
+;   ret += x / 2;
+;   for (int i = 0; i < x / 2; i++) ret += i;
 ;   goto m;
 ;
 ; c:
-;   for (int i = 0; i < n + 5; i++) ret += 2;
+;   for (int i = 0; i < n - 5; i++) ret += 2;
 ;   // e
 ;   if (id % 2 == 0) {
 ;     goto h;
@@ -105,8 +105,8 @@
 ;   goto o;
 ;
 ; h:
-;   for (int i = 0; i < n * 2; i++) {
-;     if (n > 5) {
+;   for (int i = 0; i < x / 2; i++) {
+;     if (x < 5) {
 ;       goto l;
 ;     }
 ;   }
@@ -118,7 +118,7 @@
 ;   ret += id << 3;
 ;
 ; o:
-;   for (int i = 0; i < n * 2; i++) ret += i;
+;   for (int i = 0; i < x / 2; i++) ret += i;
 ;
 ; p:
 ;   out[id] = ret;
@@ -130,7 +130,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
 ; Function Attrs: convergent nounwind
-define spir_kernel void @partial_linearization17(i32 addrspace(1)* %out, i32 %n) #0 {
+define spir_kernel void @partial_linearization17(i32 addrspace(1)* %out, i32 %n, i32 %x) #0 {
 entry:
   %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
@@ -142,7 +142,7 @@ while.body:                                       ; preds = %if.end5, %entry
   br i1 %cmp, label %for.cond28, label %if.else
 
 if.else:                                          ; preds = %while.body
-  %cmp2 = icmp sgt i32 %n, 5
+  %cmp2 = icmp slt i32 %n, 5
   br i1 %cmp2, label %f, label %if.end5
 
 if.end5:                                          ; preds = %if.else
@@ -165,14 +165,14 @@ for.body:                                         ; preds = %for.cond
   br label %for.cond
 
 f:                                                ; preds = %if.else
-  %mul = shl i32 %n, 1
+  %div = sdiv i32 %x, 2
   br label %for.cond18
 
 for.cond18:                                       ; preds = %for.body22, %f
-  %ret.1 = phi i32 [ %mul, %f ], [ %add23, %for.body22 ]
+  %ret.1 = phi i32 [ %div, %f ], [ %add23, %for.body22 ]
   %storemerge3 = phi i32 [ 0, %f ], [ %inc25, %for.body22 ]
-  %mul19 = shl nsw i32 %n, 1
-  %cmp20 = icmp slt i32 %storemerge3, %mul19
+  %div19 = sdiv i32 %x, 2
+  %cmp20 = icmp slt i32 %storemerge3, %div19
   br i1 %cmp20, label %for.body22, label %m
 
 for.body22:                                       ; preds = %for.cond18
@@ -204,12 +204,12 @@ m:                                                ; preds = %for.end36, %for.con
 
 for.cond43:                                       ; preds = %for.inc52, %for.end36
   %storemerge6 = phi i32 [ %inc53, %for.inc52 ], [ 0, %for.end36 ]
-  %mul44 = shl nsw i32 %n, 1
-  %cmp45 = icmp slt i32 %storemerge6, %mul44
+  %div44 = sdiv i32 %x, 2
+  %cmp45 = icmp slt i32 %storemerge6, %div44
   br i1 %cmp45, label %for.body47, label %for.end54
 
 for.body47:                                       ; preds = %for.cond43
-  %cmp48 = icmp sgt i32 %n, 5
+  %cmp48 = icmp slt i32 %x, 5
   br i1 %cmp48, label %l, label %for.inc52
 
 for.inc52:                                        ; preds = %for.body47
@@ -233,8 +233,8 @@ o:                                                ; preds = %l, %m
 for.cond60:                                       ; preds = %for.body64, %o
   %ret.4 = phi i32 [ %storemerge1, %o ], [ %add65, %for.body64 ]
   %storemerge2 = phi i32 [ 0, %o ], [ %inc67, %for.body64 ]
-  %mul61 = shl nsw i32 %n, 1
-  %cmp62 = icmp slt i32 %storemerge2, %mul61
+  %div61 = sdiv i32 %x, 2
+  %cmp62 = icmp slt i32 %storemerge2, %div61
   br i1 %cmp62, label %for.body64, label %p
 
 for.body64:                                       ; preds = %for.cond60
@@ -352,17 +352,18 @@ attributes #2 = { convergent nobuiltin nounwind readonly }
 ; CHECK: br i1 %{{.+}}, label %[[FORCOND43PREHEADERUNIFORM:.+]], label %[[FOREND36UNIFORMBOSCCINDIR:.+]]
 
 ; CHECK: [[FORCOND43PREHEADERUNIFORM]]:
-; CHECK: %[[CMP18UNIFORM:.+]] = icmp
 ; CHECK: br label %[[FORCOND43UNIFORM:.+]]
 
 ; CHECK: [[FOREND36UNIFORMBOSCCINDIR]]:
 ; CHECK: br i1 %{{.+}}, label %[[MUNIFORM]], label %[[FORCOND43PREHEADER:.+]]
 
 ; CHECK: [[FORCOND43UNIFORM]]:
-; CHECK: br i1 %[[CMP18UNIFORM]], label %[[FORBODY47UNIFORM:.+]], label %[[FOREND54UNIFORM:.+]]
+; CHECK: %[[CMP45UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMP45UNIFORM]], label %[[FORBODY47UNIFORM:.+]], label %[[FOREND54UNIFORM:.+]]
 
 ; CHECK: [[FORBODY47UNIFORM]]:
-; CHECK: br i1 true, label %[[LUNIFORM:.+]], label %[[FORINC52UNIFORM:.+]]
+; CHECK: %[[CMP48UNIFORM:.+]] = icmp
+; CHECK: br i1 %[[CMP48UNIFORM]], label %[[LUNIFORM:.+]], label %[[FORINC52UNIFORM:.+]]
 
 ; CHECK: [[FORINC52UNIFORM]]:
 ; CHECK: br label %[[FORCOND43UNIFORM]]
@@ -425,7 +426,6 @@ attributes #2 = { convergent nobuiltin nounwind readonly }
 ; CHECK: br label %[[FORCOND43PREHEADER]]
 
 ; CHECK: [[FORCOND43PREHEADER]]:
-; CHECK: %[[CMP14:.+]] = icmp
 ; CHECK: br label %[[FORCOND43:.+]]
 
 ; CHECK: [[MLOOPEXIT]]:
@@ -438,10 +438,12 @@ attributes #2 = { convergent nobuiltin nounwind readonly }
 ; CHECK: br label %[[O:.+]]
 
 ; CHECK: [[FORCOND43]]:
+; CHECK: %[[CMP14:.+]] = icmp
 ; CHECK: br i1 %[[CMP14]], label %[[FORBODY47:.+]], label %[[FOREND54:.+]]
 
 ; CHECK: [[FORBODY47]]:
-; CHECK: br i1 true, label %[[L:.+]], label %[[FORINC52:.+]]
+; CHECK: %[[CMP48:.+]] = icmp
+; CHECK: br i1 %[[CMP48]], label %[[L:.+]], label %[[FORINC52:.+]]
 
 ; CHECK: [[FORINC52]]:
 ; CHECK: br label %[[FORCOND43]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization2.ll
index 371755af3e382..f0a36128da223 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization2.ll
@@ -59,7 +59,7 @@
 ;   int id = get_global_id(0);
 ;   int ret = 0;
 ;
-;   if (n > 10) { // uniform
+;   if (n < 10) { // uniform
 ;     if (id % 3 == 0) { // varying
 ;       for (int i = 0; i < n - 1; i++) { ret /= 2; } goto h;
 ;     } else { // varying
@@ -95,7 +95,7 @@ define spir_kernel void @partial_linearization2(i32 addrspace(1)* %out, i32 %n)
 entry:
   %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
-  %cmp = icmp sgt i32 %n, 10
+  %cmp = icmp slt i32 %n, 10
   br i1 %cmp, label %if.then, label %if.else17
 
 if.then:                                          ; preds = %entry
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization3.ll
index 56ec40be4a215..289c00aba9f32 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization3.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization3.ll
@@ -59,7 +59,7 @@
 ;   int id = get_global_id(0);
 ;   int ret = 0;
 ;
-;   if (n > 10) { // uniform
+;   if (n < 10) { // uniform
 ;     if (id % 3 == 0) { // varying
 ;       for (int i = 0; i < n - 1; i++) { ret /= 2; } goto end;
 ;     } else { // varying
@@ -93,7 +93,7 @@ define spir_kernel void @partial_linearization3(i32 addrspace(1)* %out, i32 %n)
 entry:
   %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
-  %cmp = icmp sgt i32 %n, 10
+  %cmp = icmp slt i32 %n, 10
   br i1 %cmp, label %if.then, label %if.else17
 
 if.then:                                          ; preds = %entry
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll
index ea203f8658eea..d43996696bcb4 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll
@@ -150,7 +150,7 @@ entry:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 16 x float> @llvm.masked.gather.nxv16f32.nxv16p0(<vscale x 16 x ptr> [[VEC_ALLOC5]], i32 4, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, {{i32|i64}} 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x float> {{(undef|poison)}})
 ; CHECK-NEXT:    [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0)
 ; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr [[EXISTING_ALLOC]], align 16
-; CHECK-NEXT:    store i32 1, ptr [[EXISTING_ALLOC]], align 16
+; CHECK-NEXT:    store i32 1, ptr [[EXISTING_ALLOC]], align
 ; CHECK-NEXT:    [[V:%.*]] = load <4 x i32>, ptr [[EXISTING_ALLOC]], align 16
 ; CHECK-NEXT:    store <4 x i32> [[V]], ptr [[FIXLEN_ALLOC]], align 16
 ; CHECK-NEXT:    [[TMP2:%.*]] = {{s|z}}ext{{( nneg)?}} <vscale x 16 x i32> [[IDX14]] to <vscale x 16 x i64>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmin_vector_scalar.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmin_vector_scalar.ll
index f0895d6165aec..567c014a5e68a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmin_vector_scalar.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmin_vector_scalar.ll
@@ -57,7 +57,7 @@ entry:
 ; scalar operand is sub-splatted to the required <16 x float>.
 ; CHECK: %[[LDA:.+]] = load <16 x float>, ptr %{{.+}}
 ; CHECK: %[[LDB:.+]] = load <4 x float>, ptr %{{.+}}
-; CHECK: %[[SPL:.+]] = shufflevector <4 x float> %[[LDB]], <4 x float> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
+; CHECK: %[[SPL:.+]] = shufflevector <4 x float> %[[LDB]], <4 x float> {{undef|poison}}, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
 ; CHECK: %[[RES:.+]] = call <16 x float> @llvm.minnum.v16f32(<16 x float> %[[LDA]], <16 x float> %[[SPL]])
 ; CHECK: store <16 x float> %[[RES]], ptr %{{.+}}
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/expect_assume.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/expect_assume.ll
index 9969f607c5b7a..f114912fc0bbf 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/expect_assume.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/expect_assume.ll
@@ -65,7 +65,7 @@ entry:
 ; CHECK: [[EX1:%.*]] = call i32 @llvm.expect.i32(i32 [[E1]], i32 42)
 ; CHECK: [[EX2:%.*]] = call i32 @llvm.expect.i32(i32 [[E2]], i32 42)
 ; CHECK: [[EX3:%.*]] = call i32 @llvm.expect.i32(i32 [[E3]], i32 42)
-; CHECK: [[C0:%.*]] = insertelement <4 x i32> undef, i32 [[EX0]], i64 0
+; CHECK: [[C0:%.*]] = insertelement <4 x i32> {{undef|poison}}, i32 [[EX0]], i64 0
 ; CHECK: [[C1:%.*]]  = insertelement <4 x i32> [[C0]], i32 [[EX1]], i64 1
 ; CHECK: [[C2:%.*]]  = insertelement <4 x i32> [[C1]], i32 [[EX2]], i64 2
 ; CHECK: [[C3:%.*]]  = insertelement <4 x i32> [[C2]], i32 [[EX3]], i64 3
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_duplication.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_duplication.ll
index 42909fddd9c44..bf3af3df43f6b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_duplication.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_duplication.ll
@@ -31,21 +31,20 @@ target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 define spir_kernel void @gep_duplication(ptr addrspace(1) align 4 %out) {
 entry:
   %out.addr = alloca ptr addrspace(1), align 8
-  %global_id = alloca i32, align 4
+  %global_id = alloca i64, align 8
   %myStruct = alloca %struct.testStruct, align 4
   store ptr addrspace(1) %out, ptr %out.addr, align 8
   %call = call i64 @__mux_get_global_id(i32 0) #2
-  %conv = trunc i64 %call to i32
-  store i32 %conv, ptr %global_id, align 4
+  store i64 %call, ptr %global_id, align 8
   %x = getelementptr inbounds %struct.testStruct, ptr %myStruct, i32 0, i32 0
   %arrayidx = getelementptr inbounds [2 x i32], ptr %x, i64 0, i64 0
   store i32 0, ptr %arrayidx, align 4
   %x1 = getelementptr inbounds %struct.testStruct, ptr %myStruct, i32 0, i32 0
   %arrayidx2 = getelementptr inbounds [2 x i32], ptr %x1, i64 0, i64 1
   store i32 1, ptr %arrayidx2, align 4
-  %0 = load i32, ptr %global_id, align 4
-  %and = and i32 %0, 1
-  %tobool = icmp ne i32 %and, 0
+  %0 = load i64, ptr %global_id, align 8
+  %and = and i64 %0, 1
+  %tobool = icmp ne i64 %and, 0
   br i1 %tobool, label %if.then, label %if.else
 
 if.then:                                          ; preds = %entry
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/instantiate_constants.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/instantiate_constants.ll
index 06da56e483189..3131dc3b75f60 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/instantiate_constants.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/instantiate_constants.ll
@@ -85,7 +85,7 @@ attributes #6 = { convergent nobuiltin nounwind }
 ; CHECK: %[[C1:.+]] = call spir_func float @_Z11vloada_halfmPKDh(i64 0, ptr nonnull %{{.+}})
 ; CHECK: %[[C2:.+]] = call spir_func float @_Z11vloada_halfmPKDh(i64 0, ptr nonnull %{{.+}})
 ; CHECK: %[[C3:.+]] = call spir_func float @_Z11vloada_halfmPKDh(i64 0, ptr nonnull %{{.+}})
-; CHECK: %[[G0:.+]] = insertelement <4 x float> undef, float %[[C0]], {{(i32|i64)}} 0
+; CHECK: %[[G0:.+]] = insertelement <4 x float> {{undef|poison}}, float %[[C0]], {{(i32|i64)}} 0
 ; CHECK: %[[G1:.+]] = insertelement <4 x float> %[[G0]], float %[[C1]], {{(i32|i64)}} 1
 ; CHECK: %[[G2:.+]] = insertelement <4 x float> %[[G1]], float %[[C2]], {{(i32|i64)}} 2
 ; CHECK: %[[G3:.+]] = insertelement <4 x float> %[[G2]], float %[[C3]], {{(i32|i64)}} 3
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_load_ooo.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_load_ooo.ll
index 6f48a1a77ff74..6cc46cc26b748 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_load_ooo.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_load_ooo.ll
@@ -30,7 +30,15 @@ entry:
   %mul = mul nsw i32 %conv2, %stride
   %add = add nsw i32 %conv, %mul
   %mul3 = shl nsw i32 %add, 1
-  %add4 = or i32 %mul3, 1
+  ; LLVM will not generate an add, but the precise form of the or instruction
+  ; that gets generated depends on the LLVM version.
+  ; LLVM 17-: %add4 = or i32 %mul3, 1
+  ; LLVM 18+: %add4 = or disjoint i32 %mul3, 1
+  ; The LLVM 17 form is not recognized as an add by LLVM 18, and the LLVM 18
+  ; form uses a flag which does not exist in LLVM 17. As this is not the
+  ; purpose of the test, use an add instruction here for now, and revisit this
+  ; once our minimum version of LLVM is LLVM 18.
+  %add4 = add nsw nuw i32 %mul3, 1
   %idxprom = sext i32 %add4 to i64
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom
   %0 = call <4 x i32> @__vecz_b_interleaved_load4_2_Dv4_jPU3AS1j(i32 addrspace(1)* %arrayidx)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/loop_call_instantiation.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/loop_call_instantiation.ll
index d36b9e0c1f350..200647016c69c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/loop_call_instantiation.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/loop_call_instantiation.ll
@@ -46,7 +46,7 @@ declare extern_weak spir_func i32 @printf(i8 addrspace(2)*, ...)
 ; CHECK: %[[V4:[0-9]+]] = extractelement <4 x i64> %0, i32 %[[INSTANCE1]]
 ; CHECK: %[[V5:[0-9]+]] = extractelement <4 x i32> %{{.+}}, i32 %[[INSTANCE1]]
 ; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @{{.+}}, i64 %[[V4]], i32 %[[V5]])
-; CHECK: %[[V7]] = add i32 %[[INSTANCE1]], 1
+; CHECK: %[[V7]] = add {{(nuw |nsw )*}}i32 %[[INSTANCE1]], 1
 ; CHECK: br label %[[LOOPHEADER1]]
 
 ; CHECK: [[LOOPHEADER2:instloop.header.*]]:
@@ -57,7 +57,7 @@ declare extern_weak spir_func i32 @printf(i8 addrspace(2)*, ...)
 ; CHECK: [[LOOPBODY2]]:
 ; CHECK: %[[V9:[0-9]+]] = extractelement <4 x i64> %0, i32 %[[INSTANCE3]]
 ; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @{{.+}}, i64 %[[V9]])
-; CHECK: %[[V11]] = add i32 %[[INSTANCE3]], 1
+; CHECK: %[[V11]] = add {{(nuw |nsw )*}}i32 %[[INSTANCE3]], 1
 ; CHECK: br label %[[LOOPHEADER2]]
 
 ; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization17.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization17.ll
index 4dd7317c2df6e..d1cf75289c5eb 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization17.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization17.ll
@@ -64,7 +64,7 @@
 ;              |
 ;              p
 ;
-; __kernel void partial_linearization17(__global int *out, int n) {
+; __kernel void partial_linearization17(__global int *out, int n, int x) {
 ;   int id = get_global_id(0);
 ;   int ret = 0;
 ;   int i = 0;
@@ -72,7 +72,7 @@
 ;   while (1) {
 ;     if (n > 10) {
 ;       goto c;
-;     } else if (n > 5) {
+;     } else if (n < 5) {
 ;       goto f;
 ;     }
 ;     if (id + i++ % 2 == 0) {
@@ -85,12 +85,12 @@
 ;   goto m;
 ;
 ; f:
-;   ret += n * 2;
-;   for (int i = 0; i < n * 2; i++) ret += i;
+;   ret += x / 2;
+;   for (int i = 0; i < x / 2; i++) ret += i;
 ;   goto m;
 ;
 ; c:
-;   for (int i = 0; i < n + 5; i++) ret += 2;
+;   for (int i = 0; i < n - 5; i++) ret += 2;
 ;   // e
 ;   if (id % 2 == 0) {
 ;     goto h;
@@ -103,8 +103,8 @@
 ;   goto o;
 ;
 ; h:
-;   for (int i = 0; i < n * 2; i++) {
-;     if (n > 5) {
+;   for (int i = 0; i < x / 2; i++) {
+;     if (x < 5) {
 ;       goto l;
 ;     }
 ;   }
@@ -116,7 +116,7 @@
 ;   ret += id << 3;
 ;
 ; o:
-;   for (int i = 0; i < n * 2; i++) ret += i;
+;   for (int i = 0; i < x / 2; i++) ret += i;
 ;
 ; p:
 ;   out[id] = ret;
@@ -128,7 +128,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
 ; Function Attrs: convergent nounwind
-define spir_kernel void @partial_linearization17(i32 addrspace(1)* %out, i32 %n) #0 {
+define spir_kernel void @partial_linearization17(i32 addrspace(1)* %out, i32 %n, i32 %x) #0 {
 entry:
   %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
@@ -140,7 +140,7 @@ while.body:                                       ; preds = %if.end5, %entry
   br i1 %cmp, label %for.cond28, label %if.else
 
 if.else:                                          ; preds = %while.body
-  %cmp2 = icmp sgt i32 %n, 5
+  %cmp2 = icmp slt i32 %n, 5
   br i1 %cmp2, label %f, label %if.end5
 
 if.end5:                                          ; preds = %if.else
@@ -163,14 +163,14 @@ for.body:                                         ; preds = %for.cond
   br label %for.cond
 
 f:                                                ; preds = %if.else
-  %mul = shl i32 %n, 1
+  %div = sdiv i32 %x, 2
   br label %for.cond18
 
 for.cond18:                                       ; preds = %for.body22, %f
-  %ret.1 = phi i32 [ %mul, %f ], [ %add23, %for.body22 ]
+  %ret.1 = phi i32 [ %div, %f ], [ %add23, %for.body22 ]
   %storemerge3 = phi i32 [ 0, %f ], [ %inc25, %for.body22 ]
-  %mul19 = shl nsw i32 %n, 1
-  %cmp20 = icmp slt i32 %storemerge3, %mul19
+  %div19 = sdiv i32 %x, 2
+  %cmp20 = icmp slt i32 %storemerge3, %div19
   br i1 %cmp20, label %for.body22, label %m
 
 for.body22:                                       ; preds = %for.cond18
@@ -202,12 +202,12 @@ m:                                                ; preds = %for.end36, %for.con
 
 for.cond43:                                       ; preds = %for.inc52, %for.end36
   %storemerge6 = phi i32 [ %inc53, %for.inc52 ], [ 0, %for.end36 ]
-  %mul44 = shl nsw i32 %n, 1
-  %cmp45 = icmp slt i32 %storemerge6, %mul44
+  %div44 = sdiv i32 %x, 2
+  %cmp45 = icmp slt i32 %storemerge6, %div44
   br i1 %cmp45, label %for.body47, label %for.end54
 
 for.body47:                                       ; preds = %for.cond43
-  %cmp48 = icmp sgt i32 %n, 5
+  %cmp48 = icmp slt i32 %x, 5
   br i1 %cmp48, label %l, label %for.inc52
 
 for.inc52:                                        ; preds = %for.body47
@@ -231,8 +231,8 @@ o:                                                ; preds = %l, %m
 for.cond60:                                       ; preds = %for.body64, %o
   %ret.4 = phi i32 [ %storemerge1, %o ], [ %add65, %for.body64 ]
   %storemerge2 = phi i32 [ 0, %o ], [ %inc67, %for.body64 ]
-  %mul61 = shl nsw i32 %n, 1
-  %cmp62 = icmp slt i32 %storemerge2, %mul61
+  %div61 = sdiv i32 %x, 2
+  %cmp62 = icmp slt i32 %storemerge2, %div61
   br i1 %cmp62, label %for.body64, label %p
 
 for.body64:                                       ; preds = %for.cond60
@@ -333,7 +333,6 @@ attributes #2 = { convergent nobuiltin nounwind readonly }
 ; CHECK: br label %[[FORCOND43PREHEADER:.+]]
 
 ; CHECK: [[FORCOND43PREHEADER]]:
-; CHECK: %[[CMP14:.+]] = icmp
 ; CHECK: br label %[[FORCOND43:.+]]
 
 ; CHECK: [[MLOOPEXIT]]:
@@ -346,10 +345,12 @@ attributes #2 = { convergent nobuiltin nounwind readonly }
 ; CHECK: br label %[[O:.+]]
 
 ; CHECK: [[FORCOND43]]:
+; CHECK: %[[CMP14:.+]] = icmp
 ; CHECK: br i1 %[[CMP14]], label %[[FORBODY47:.+]], label %[[FOREND54:.+]]
 
 ; CHECK: [[FORBODY47]]:
-; CHECK: br i1 true, label %[[L:.+]], label %[[FORINC52:.+]]
+; CHECK: %[[CMP48:.+]] = icmp
+; CHECK: br i1 %[[CMP48]], label %[[L:.+]], label %[[FORINC52:.+]]
 
 ; CHECK: [[FORINC52]]:
 ; CHECK: br label %[[FORCOND43]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization2.ll
index 93f135b7a073d..edce8b7c2e5a8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization2.ll
@@ -55,7 +55,7 @@
 ;   int id = get_global_id(0);
 ;   int ret = 0;
 ;
-;   if (n > 10) { // uniform
+;   if (n < 10) { // uniform
 ;     if (id % 3 == 0) { // varying
 ;       for (int i = 0; i < n - 1; i++) { ret /= 2; } goto h;
 ;     } else { // varying
@@ -91,7 +91,7 @@ define spir_kernel void @partial_linearization2(i32 addrspace(1)* %out, i32 %n)
 entry:
   %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
-  %cmp = icmp sgt i32 %n, 10
+  %cmp = icmp slt i32 %n, 10
   br i1 %cmp, label %if.then, label %if.else17
 
 if.then:                                          ; preds = %entry
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization3.ll
index aeacf9c8d3a98..143b073882487 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization3.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization3.ll
@@ -55,7 +55,7 @@
 ;   int id = get_global_id(0);
 ;   int ret = 0;
 ;
-;   if (n > 10) { // uniform
+;   if (n < 10) { // uniform
 ;     if (id % 3 == 0) { // varying
 ;       for (int i = 0; i < n - 1; i++) { ret /= 2; } goto end;
 ;     } else { // varying
@@ -89,7 +89,7 @@ define spir_kernel void @partial_linearization3(i32 addrspace(1)* %out, i32 %n)
 entry:
   %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
-  %cmp = icmp sgt i32 %n, 10
+  %cmp = icmp slt i32 %n, 10
   br i1 %cmp, label %if.then, label %if.else17
 
 if.then:                                          ; preds = %entry

From 8fe1f81803417e467920fbe0e61ea4a1a4e432fb Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Thu, 18 Jan 2024 14:32:39 +0000
Subject: [PATCH 085/182] [NFC] Upgrade to clang-format-17.

We can see in the diff that clang-format-17 is better able to figure out
when (A)*b and (A)&b are a cast and when they are a binary operator.
---
 .../compiler_passes/vecz/source/transform/packetizer.cpp        | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
index 745515052642e..007e5857923af 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -2042,7 +2042,7 @@ ValuePacket Packetizer::Impl::packetizeCall(CallInst *CI) {
       if (auto *const alloca = dyn_cast<AllocaInst>(ptr)) {
         if (!needsInstantiation(Ctx, *alloca)) {
           // If it's an alloca we can widen, we can just change the size
-          llvm::TypeSize const allocSize =
+          const llvm::TypeSize allocSize =
               Ctx.dataLayout()->getTypeAllocSize(alloca->getAllocatedType());
           const auto lifeSize =
               allocSize.isScalable() || SimdWidth.isScalable()

From 0f682622b58bffa6b7520e495acc5215315c5343 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Thu, 11 Jan 2024 17:45:58 +0000
Subject: [PATCH 086/182] [compiler] Fix unchecked std::optional accesses

This cleans the project of the `bugprone-unchecked-optional-access`
check which is present in newer versions of clang-tidy.

Some of these *are* checked as far as we're concerned, but clang-tidy
can't know that. Others are genuine oversights.
---
 .../analysis/instantiation_analysis.cpp       | 14 +++++++----
 .../vecz/source/transform/packetizer.cpp      | 12 ++++++----
 .../source/transform/pre_linearize_pass.cpp   | 24 ++++++++++---------
 .../vecz/source/transform/scalarizer.cpp      | 13 ++++++----
 4 files changed, 40 insertions(+), 23 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/instantiation_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/instantiation_analysis.cpp
index 0ed8726694daa..328d0e4978914 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/instantiation_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/instantiation_analysis.cpp
@@ -111,11 +111,17 @@ bool needsInstantiation(const VectorizationContext &Ctx, Instruction &I) {
   if (CallInst *CI = dyn_cast<CallInst>(&I)) {
     return analyzeCall(Ctx, CI);
   } else if (LoadInst *Load = dyn_cast<LoadInst>(&I)) {
-    MemOp Op = *MemOp::get(Load);
-    return analyzeMemOp(Op);
+    if (auto Op = MemOp::get(Load)) {
+      return analyzeMemOp(*Op);
+    }
+    // If it's not a MemOp, assume we don't need to instantiate.
+    return false;
   } else if (StoreInst *Store = dyn_cast<StoreInst>(&I)) {
-    MemOp Op = *MemOp::get(Store);
-    return analyzeMemOp(Op);
+    if (auto Op = MemOp::get(Store)) {
+      return analyzeMemOp(*Op);
+    }
+    // If it's not a MemOp, assume we don't need to instantiate.
+    return false;
   } else if (AllocaInst *Alloca = dyn_cast<AllocaInst>(&I)) {
     return analyzeAlloca(Ctx, Alloca);
   } else if (isa<AtomicRMWInst>(&I) || isa<AtomicCmpXchgInst>(&I)) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
index 007e5857923af..98209bb56107d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -2443,13 +2443,17 @@ Value *Packetizer::Impl::vectorizeInstruction(Instruction *Ins) {
 }
 
 ValuePacket Packetizer::Impl::packetizeLoad(LoadInst *Load) {
-  auto Op = *MemOp::get(Load);
-  return packetizeMemOp(Op);
+  if (auto Op = MemOp::get(Load)) {
+    return packetizeMemOp(*Op);
+  }
+  return ValuePacket{};
 }
 
 ValuePacket Packetizer::Impl::packetizeStore(StoreInst *Store) {
-  auto Op = *MemOp::get(Store);
-  return packetizeMemOp(Op);
+  if (auto Op = MemOp::get(Store)) {
+    return packetizeMemOp(*Op);
+  }
+  return ValuePacket{};
 }
 
 ValuePacket Packetizer::Impl::packetizeMemOp(MemOp &op) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/pre_linearize_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/pre_linearize_pass.cpp
index 54fd680166ce8..e718f75a61b7a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/pre_linearize_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/pre_linearize_pass.cpp
@@ -49,6 +49,7 @@
 #include <llvm/IR/Function.h>
 #include <llvm/IR/Instructions.h>
 #include <llvm/Pass.h>
+#include <llvm/Support/InstructionCost.h>
 #include <llvm/Transforms/Utils/LoopUtils.h>
 #include <multi_llvm/multi_llvm.h>
 #include <multi_llvm/vector_type_helper.h>
@@ -80,9 +81,9 @@ bool isTrivialBlock(const llvm::BasicBlock &BB) {
 // This assumes sequential execution (no Instruction Level Parallelism)
 // and takes no account of Data Hazards &c so is not guaranteed to be
 // entirely accurate.
-unsigned calculateBlockCost(const BasicBlock &BB,
-                            const TargetTransformInfo &TTI) {
-  unsigned cost = 0;
+InstructionCost calculateBlockCost(const BasicBlock &BB,
+                                   const TargetTransformInfo &TTI) {
+  InstructionCost cost;
   for (const auto &I : BB) {
     if (I.isTerminator()) {
       break;
@@ -100,16 +101,16 @@ unsigned calculateBlockCost(const BasicBlock &BB,
       inst_cost *= multi_llvm::getVectorNumElements(I.getType());
     }
 
-    cost += *inst_cost.getValue();
+    cost += inst_cost;
   }
   return cost;
 }
 
 // It creates a temporary function in order to build a target-dependent
 // vector AND reduction inside it, in order to calculate the cost of it.
-unsigned calculateBoolReductionCost(LLVMContext &context, Module *module,
-                                    const TargetTransformInfo &TTI,
-                                    llvm::ElementCount width) {
+InstructionCost calculateBoolReductionCost(LLVMContext &context, Module *module,
+                                           const TargetTransformInfo &TTI,
+                                           llvm::ElementCount width) {
   Type *cond_ty = VectorType::get(Type::getInt1Ty(context), width);
 
   FunctionType *new_fty =
@@ -122,7 +123,7 @@ unsigned calculateBoolReductionCost(LLVMContext &context, Module *module,
   IRBuilder<> B(BB);
   multi_llvm::createSimpleTargetReduction(B, &TTI, &*F->arg_begin(),
                                           RecurKind::And);
-  unsigned cost = calculateBlockCost(*BB, TTI);
+  InstructionCost cost = calculateBlockCost(*BB, TTI);
 
   // We don't really need that function in the module anymore because it's
   // only purpose was to be used for analysis, so we go ahead and remove it.
@@ -227,7 +228,7 @@ PreservedAnalyses PreLinearizePass::run(Function &F,
   bool div_exceptions =
       VU.choices().isEnabled(VectorizationChoices::eDivisionExceptions);
 
-  unsigned boscc_cost = 0;
+  InstructionCost boscc_cost;
   UniformValueResult *UVR = nullptr;
   if (VU.choices().linearizeBOSCC()) {
     boscc_cost = calculateBoolReductionCost(F.getContext(), F.getParent(), TTI,
@@ -279,13 +280,14 @@ PreservedAnalyses PreLinearizePass::run(Function &F,
 
       // The cost of a "bypass" branch is essentially zero. This occurs in a
       // "triangle" type control struct (i.e. if with no else).
-      unsigned min_cost = new_succs.empty() ? ~0 : 0;
+      InstructionCost min_cost = new_succs.empty() ? InstructionCost::getMax()
+                                                   : InstructionCost::getMin();
 
       // The total cost of executing every successor sequentially
       InstructionCost total_cost = 0;
 
       for (auto *succ : hoistable) {
-        unsigned block_cost = calculateBlockCost(*succ, TTI);
+        InstructionCost block_cost = calculateBlockCost(*succ, TTI);
         if (block_cost < min_cost) {
           min_cost = block_cost;
         }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
index dd8aa2e823738..aa899606c437e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
@@ -21,6 +21,7 @@
 #include <llvm/ADT/Statistic.h>
 #include <llvm/Analysis/InstructionSimplify.h>
 #include <llvm/IR/DIBuilder.h>
+#include <llvm/IR/DebugInfoMetadata.h>
 #include <llvm/IR/IRBuilder.h>
 #include <llvm/IR/Instructions.h>
 #include <llvm/IR/IntrinsicInst.h>
@@ -727,10 +728,14 @@ void Scalarizer::scalarizeDI(Instruction *Original, const SimdPacket *Packet,
         // aggregate variable, our vector, which is fragmented across multiple
         // values. First argument takes the offset of the piece, and the second
         // takes the piece size.
-        auto DIExpr = *DIExpression::createFragmentExpression(
-            DIB.createExpression(), lane * bitSize, bitSize);
-        DIB.insertDbgValueIntrinsic(LaneVal, DILocal, DIExpr, DILoc, Original);
-        VectorElements.insert(LaneVal);
+        std::optional<DIExpression *> DIExpr =
+            DIExpression::createFragmentExpression(DIB.createExpression(),
+                                                   lane * bitSize, bitSize);
+        if (DIExpr) {
+          DIB.insertDbgValueIntrinsic(LaneVal, DILocal, *DIExpr, DILoc,
+                                      Original);
+          VectorElements.insert(LaneVal);
+        }
       }
     }
   }

From 345e8b754238a27f48bfca4f943bf4cc4d765cf7 Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Thu, 18 Jan 2024 19:07:27 +0000
Subject: [PATCH 087/182] [NFC] Add const.

Variables that can be defined as const are defined as const. This also
revealed that some loop iteration variables made pointless copies; these
have been changed to use references instead. Some variables also no
longer need to be captured in lambdas as a result of this change, so
they have been removed from the capture lists.
---
 .../source/analysis/control_flow_analysis.cpp |  6 +-
 .../source/analysis/divergence_analysis.cpp   | 10 +--
 .../analysis/instantiation_analysis.cpp       |  5 +-
 .../source/analysis/liveness_analysis.cpp     |  3 +-
 .../source/analysis/simd_width_analysis.cpp   |  7 +-
 .../analysis/uniform_value_analysis.cpp       | 12 +--
 .../vecz/source/control_flow_boscc.cpp        | 19 ++---
 .../vecz/source/control_flow_roscc.cpp        |  2 +-
 .../compiler_passes/vecz/source/debugging.cpp |  4 +-
 .../vecz/source/memory_operations.cpp         | 61 ++++++++-------
 .../vecz/source/offset_info.cpp               | 17 +++--
 .../compiler_passes/vecz/source/pass.cpp      |  2 +-
 .../vecz/source/reachability.cpp              | 24 +++---
 .../transform/builtin_inlining_pass.cpp       | 35 ++++-----
 .../control_flow_conversion_pass.cpp          | 18 ++---
 .../inline_post_vectorization_pass.cpp        |  2 +-
 .../source/transform/instantiation_pass.cpp   | 14 ++--
 .../interleaved_group_combine_pass.cpp        | 10 +--
 .../transform/packetization_helpers.cpp       | 12 +--
 .../source/transform/packetization_pass.cpp   |  4 +-
 .../vecz/source/transform/packetizer.cpp      | 75 ++++++++++---------
 .../vecz/source/transform/passes.cpp          |  2 +-
 .../source/transform/pre_linearize_pass.cpp   | 10 +--
 .../source/transform/printf_scalarizer.cpp    |  4 +-
 .../source/transform/scalarization_pass.cpp   |  4 +-
 .../vecz/source/transform/scalarizer.cpp      | 70 ++++++++---------
 .../transform/ternary_transform_pass.cpp      |  4 +-
 .../transform/uniform_reassociation_pass.cpp  |  2 +-
 .../vecz/source/vector_target_info.cpp        | 49 ++++++------
 .../vecz/source/vector_target_info_arm.cpp    | 18 ++---
 .../vecz/source/vector_target_info_riscv.cpp  |  2 +-
 .../vecz/source/vectorization_choices.cpp     |  6 +-
 .../vecz/source/vectorization_context.cpp     | 30 ++++----
 .../vecz/source/vectorization_helpers.cpp     | 12 +--
 .../vecz/source/vectorization_heuristics.cpp  | 10 +--
 .../vecz/source/vectorization_unit.cpp        |  2 +-
 .../vecz/source/vectorizer.cpp                | 14 ++--
 .../vecz/tools/source/veczc.cpp               |  2 +-
 38 files changed, 300 insertions(+), 283 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/control_flow_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/control_flow_analysis.cpp
index c3c2ab8e4a229..6ccf6d5f2e99b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/control_flow_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/control_flow_analysis.cpp
@@ -41,7 +41,7 @@ CFGResult CFGAnalysis::run(llvm::Function &F,
 
   LLVM_DEBUG(dbgs() << "CONTROL FLOW ANALYSIS\n");
 
-  UniformValueResult &UVR = AM.getResult<UniformValueAnalysis>(F);
+  const UniformValueResult &UVR = AM.getResult<UniformValueAnalysis>(F);
 
   bool mayDiverge = false;
   for (BasicBlock &BB : F) {
@@ -81,9 +81,9 @@ CFGResult CFGAnalysis::run(llvm::Function &F,
     return Res;
   }
 
-  LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
+  const LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
   using RPOTraversal = ReversePostOrderTraversal<const Function *>;
-  RPOTraversal FuncRPOT(&F);
+  const RPOTraversal FuncRPOT(&F);
   if (containsIrreducibleCFG<const BasicBlock *, const RPOTraversal,
                              const LoopInfo>(FuncRPOT, LI)) {
     emitVeczRemarkMissed(&F, &F, "Irreducible loop detected in");
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/divergence_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/divergence_analysis.cpp
index f2f1b9844d804..0d10d73b9a9a3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/divergence_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/divergence_analysis.cpp
@@ -218,7 +218,7 @@ bool DivergenceResult::computeBlockOrdering(DominatorTree &DT) {
   SmallVector<unsigned, 16> stack;
   stack.push_back(0);
   uint32_t pos = 0;
-  SmallVector<unsigned, 16> children;
+  const SmallVector<unsigned, 16> children;
   SmallVector<unsigned, 16> loopExits;
   while (!stack.empty()) {
     const auto u = stack.pop_back_val();
@@ -392,8 +392,8 @@ void DivergenceResult::markDivergent(const Loop &L) {
 
 void DivergenceResult::markByAll(BasicBlock &src) {
   Function &F = *src.getParent();
-  DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
-  PostDominatorTree &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
+  const DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
+  const PostDominatorTree &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
 
   BlockQueue queue(*this);
   queue.push(&src);
@@ -493,7 +493,7 @@ DenseSet<BasicBlock *> DivergenceResult::joinPoints(BasicBlock &src) const {
   }
 
   Function &F = *src.getParent();
-  PostDominatorTree &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
+  const PostDominatorTree &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
 
   DenseMap<const BasicBlock *, const BasicBlock *> defMap;
   DenseSet<BasicBlock *> joins;
@@ -626,7 +626,7 @@ DivergenceResult DivergenceAnalysis::run(llvm::Function &F,
   Res.basicBlockTags.reserve(F.size() * 4);
 
   // Prepare the BasicBlockTags.
-  LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
+  const LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
   for (BasicBlock &BB : F) {
     // Create BB info entries.
     BasicBlockTag &BBTag = Res.getOrCreateTag(&BB);
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/instantiation_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/instantiation_analysis.cpp
index 328d0e4978914..0dcd6bf69f743 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/instantiation_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/instantiation_analysis.cpp
@@ -78,8 +78,9 @@ bool analyzeCall(const VectorizationContext &Ctx, CallInst *CI) {
 
   // Functions returning void must have side-effects.
   // We cannot vectorize them and instead we need to instantiate them.
-  bool HasSideEffects = Callee->getReturnType()->isVoidTy() ||
-                        (Props & compiler::utils::eBuiltinPropertySideEffects);
+  const bool HasSideEffects =
+      Callee->getReturnType()->isVoidTy() ||
+      (Props & compiler::utils::eBuiltinPropertySideEffects);
   if (HasSideEffects &&
       (Props & compiler::utils::eBuiltinPropertySupportsInstantiation)) {
     return true;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/liveness_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/liveness_analysis.cpp
index 1dd7e04dbe0bf..22d1b267f6a31 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/liveness_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/liveness_analysis.cpp
@@ -212,7 +212,8 @@ void LivenessResult::Impl::mark(Value *V, const BasicBlock *parent,
 
 void LivenessResult::Impl::calculateMaxRegistersInBlock(const BasicBlock *BB) {
   auto &BI = LR.BlockInfos[BB];
-  SmallPtrSet<const Value *, 16> liveOut(BI.LiveOut.begin(), BI.LiveOut.end());
+  const SmallPtrSet<const Value *, 16> liveOut(BI.LiveOut.begin(),
+                                               BI.LiveOut.end());
   SmallPtrSet<const Value *, 16> seenButNotInLiveOut;
 
   auto maxRegistersUsed = liveOut.size();
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/simd_width_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/simd_width_analysis.cpp
index a81ae64dd9cad..86627dc4a2ed2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/simd_width_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/simd_width_analysis.cpp
@@ -134,7 +134,7 @@ unsigned SimdWidthAnalysis::avoidSpillImpl(Function &F,
       // the live set at the point before the last (i.e. first) instruction, so
       // we deal with the operands first and then process the live set.
       if (PAR.needsPacketization(&inst)) {
-        bool isGEP = isa<GetElementPtrInst>(&inst);
+        const bool isGEP = isa<GetElementPtrInst>(&inst);
         for (auto operand : inst.operand_values()) {
           if (isa<Instruction>(operand) || isa<Argument>(operand)) {
             if (!isGEP || PAR.needsPacketization(operand)) {
@@ -167,8 +167,9 @@ unsigned SimdWidthAnalysis::avoidSpillImpl(Function &F,
 
 SimdWidthAnalysis::Result SimdWidthAnalysis::run(
     Function &F, llvm::FunctionAnalysisManager &AM) {
-  TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
-  VectorizationUnit &VU = AM.getResult<VectorizationUnitAnalysis>(F).getVU();
+  const TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
+  const VectorizationUnit &VU =
+      AM.getResult<VectorizationUnitAnalysis>(F).getVU();
 
   // If the target does not provide vector registers, return 0.
   MaxVecRegBitWidth =
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp
index 4b1f571feb9e7..346ece7a4543b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp
@@ -40,7 +40,7 @@ namespace {
 // Find leaves by recursing through an instruction's uses
 bool findStrayLeaves(UniformValueResult &UVR, Instruction &I,
                      DenseSet<Instruction *> &Visited) {
-  for (Use &U : I.uses()) {
+  for (const Use &U : I.uses()) {
     auto *User = U.getUser();
     if (isa<StoreInst>(User) || isa<AtomicRMWInst>(User) ||
         isa<AtomicCmpXchgInst>(User)) {
@@ -129,7 +129,7 @@ static bool isGroupBroadcastOrReduction(
 
 void UniformValueResult::findVectorLeaves(
     std::vector<Instruction *> &Leaves) const {
-  compiler::utils::BuiltinInfo &BI = Ctx.builtins();
+  const compiler::utils::BuiltinInfo &BI = Ctx.builtins();
   for (BasicBlock &BB : F) {
     for (Instruction &I : BB) {
       // Reductions and broadcasts are always vector leaves regardless of
@@ -211,7 +211,7 @@ void UniformValueResult::findVectorLeaves(
 }
 
 void UniformValueResult::findVectorRoots(std::vector<Value *> &Roots) const {
-  compiler::utils::BuiltinInfo &BI = Ctx.builtins();
+  const compiler::utils::BuiltinInfo &BI = Ctx.builtins();
   for (BasicBlock &BB : F) {
     for (Instruction &I : BB) {
       CallInst *CI = dyn_cast<CallInst>(&I);
@@ -279,7 +279,7 @@ void UniformValueResult::markVaryingValues(Value *V, Value *From) {
     // Some builtins produce a uniform value regardless of their inputs.
     Function *Callee = CI->getCalledFunction();
     if (Callee) {
-      compiler::utils::BuiltinInfo &BI = Ctx.builtins();
+      const compiler::utils::BuiltinInfo &BI = Ctx.builtins();
       const auto Builtin = BI.analyzeBuiltinCall(*CI, dimension);
       const auto Uniformity = Builtin.uniformity;
       if (Uniformity == compiler::utils::eBuiltinUniformityAlways) {
@@ -310,7 +310,7 @@ void UniformValueResult::markVaryingValues(Value *V, Value *From) {
   LLVM_DEBUG(dbgs() << "vecz: Needs packetization: " << *V << "\n");
 
   // Visit all users of V, they are varying too.
-  for (Use &Use : V->uses()) {
+  for (const Use &Use : V->uses()) {
     User *User = Use.getUser();
     markVaryingValues(User, V);
   }
@@ -443,7 +443,7 @@ UniformValueResult UniformValueAnalysis::run(
     Res.markVaryingValues(Root);
   }
 
-  compiler::utils::BuiltinInfo &BI = Res.Ctx.builtins();
+  const compiler::utils::BuiltinInfo &BI = Res.Ctx.builtins();
   for (BasicBlock &BB : F) {
     for (Instruction &I : BB) {
       // Find atomic instructions, these are always varying
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_boscc.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_boscc.cpp
index 0620b26722157..a02fca4dac328 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_boscc.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_boscc.cpp
@@ -110,10 +110,11 @@ bool ControlFlowConversionState::BOSCCGadget::duplicateUniformRegions() {
   SmallVector<Loop *, 16> duplicatedLoops;
   SmallPtrSet<Loop *, 16> duplicatedLoopSet;
 
-  size_t size = std::accumulate(uniformRegions.begin(), uniformRegions.end(), 0,
-                                [](size_t base, const UniformRegion &region) {
-                                  return base + region.predicatedBlocks.size();
-                                });
+  const size_t size =
+      std::accumulate(uniformRegions.begin(), uniformRegions.end(), 0,
+                      [](size_t base, const UniformRegion &region) {
+                        return base + region.predicatedBlocks.size();
+                      });
   std::vector<BasicBlock *> newBlocks;
   newBlocks.reserve(size);
 
@@ -666,7 +667,7 @@ bool ControlFlowConversionState::BOSCCGadget::connectBOSCCRegions() {
     // connected, we can replace the blended values uses with their new
     // value.
     DenseSet<Instruction *> toDelete;
-    for (URVBlender::value_type &blender : URVB) {
+    for (const URVBlender::value_type &blender : URVB) {
       BasicBlock *block = blender.first;
       Value *from = blender.second.first;
       Instruction *to = blender.second.second;
@@ -694,7 +695,7 @@ bool ControlFlowConversionState::BOSCCGadget::connectUniformRegion(
                                  BasicBlock *to) {
     for (Instruction &I : *B) {
       if (PHINode *PHI = dyn_cast<PHINode>(&I)) {
-        int fromIdx = PHI->getBasicBlockIndex(from);
+        const int fromIdx = PHI->getBasicBlockIndex(from);
         if (fromIdx != -1) {
           PHI->setIncomingBlock(fromIdx, to);
         }
@@ -1081,7 +1082,7 @@ bool ControlFlowConversionState::BOSCCGadget::blendFinalize() {
       // SSA doesn't have to look for the instructions inside the uniform loop.
       for (Instruction &I : *connectionPoint) {
         if (PHINode *PHI = dyn_cast<PHINode>(&I)) {
-          int idx = PHI->getBasicBlockIndex(target);
+          const int idx = PHI->getBasicBlockIndex(target);
           VECZ_ERROR_IF(idx == -1,
                         "Connection point PHIs must have incoming "
                         "block from the target");
@@ -1261,7 +1262,7 @@ bool ControlFlowConversionState::BOSCCGadget::updateLoopBlendValues(
     // latch. Since the CFG is final now, this should cover everything.
     for (Instruction &headerI : *LTag->header) {
       if (PHINode *PHI = dyn_cast<PHINode>(&headerI)) {
-        int latchIdx = PHI->getBasicBlockIndex(LTag->latch);
+        const int latchIdx = PHI->getBasicBlockIndex(LTag->latch);
         VECZ_ERROR_IF(latchIdx == -1,
                       "Header has no incoming value from the latch");
         if ((PHI == to) || (PHI->getIncomingValue(latchIdx) == from)) {
@@ -1368,7 +1369,7 @@ bool ControlFlowConversionState::BOSCCGadget::cleanUp() {
   // blend the same two values together. Also, sometimes values are blended
   // even though they have no further uses and can be removed as dead code.
 
-  RPOT rpot(&F);
+  const RPOT rpot(&F);
   std::vector<PHINode *> blends;
   for (auto *BB : rpot) {
     for (auto I = BB->begin(); I != BB->end();) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_roscc.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_roscc.cpp
index 5302486ba60c6..cd8762dbd7d39 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_roscc.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_roscc.cpp
@@ -124,7 +124,7 @@ bool ControlFlowConversionState::ROSCCGadget::run(Function &F) {
     // last. However, "getSuccessor(n)" also indexes backwards, from the end.
     BasicBlock *SuccT = Branch->getSuccessor(0);
     BasicBlock *SuccF = Branch->getSuccessor(1);
-    bool Which = isReturnBlock(*SuccT);
+    const bool Which = isReturnBlock(*SuccT);
 
     BasicBlock *ReturnBlock = Which ? SuccT : SuccF;
     Value *Cond = Branch->getCondition();
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/debugging.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/debugging.cpp
index 851f47b40883c..0a175f60a6464 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/debugging.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/debugging.cpp
@@ -63,7 +63,7 @@ void emitVeczRemarkMissed(const Function *F, const Value *V, StringRef Msg,
   if (I) {
     ORE.emit(OptimizationRemarkMissed("vecz", "vecz", I) << RemarkMsg);
   } else {
-    DebugLoc D = I ? DebugLoc(I->getDebugLoc()) : DebugLoc();
+    const DebugLoc D = I ? DebugLoc(I->getDebugLoc()) : DebugLoc();
     ORE.emit(OptimizationRemarkMissed("vecz", "vecz", D, &(F->getEntryBlock()))
              << RemarkMsg);
   }
@@ -75,7 +75,7 @@ void emitVeczRemarkMissed(const Function *F, StringRef Msg, StringRef Note) {
 
 void emitVeczRemark(const Function *F, const Value *V, StringRef Msg) {
   const Instruction *I = V ? dyn_cast<Instruction>(V) : nullptr;
-  DebugLoc D = I ? DebugLoc(I->getDebugLoc()) : DebugLoc();
+  const DebugLoc D = I ? DebugLoc(I->getDebugLoc()) : DebugLoc();
 
   auto RemarkMsg = createRemarkMessage(V, Msg);
   OptimizationRemarkEmitter ORE(F);
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/memory_operations.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/memory_operations.cpp
index e99117f87ac17..ac06e3d84dd08 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/memory_operations.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/memory_operations.cpp
@@ -41,10 +41,12 @@ static std::string getMaskedMemOpName(Type *DataTy, PointerType *PtrTy,
   }
   compiler::utils::NameMangler Mangler(&DataTy->getContext());
   const char *BaseName = IsLoad ? "masked_load" : "masked_store";
-  compiler::utils::TypeQualifiers DataQuals(compiler::utils::eTypeQualNone);
-  compiler::utils::TypeQualifiers PtrQuals(compiler::utils::eTypeQualNone,
-                                           compiler::utils::eTypeQualNone);
-  compiler::utils::TypeQualifiers MaskQuals(compiler::utils::eTypeQualNone);
+  const compiler::utils::TypeQualifiers DataQuals(
+      compiler::utils::eTypeQualNone);
+  const compiler::utils::TypeQualifiers PtrQuals(
+      compiler::utils::eTypeQualNone, compiler::utils::eTypeQualNone);
+  const compiler::utils::TypeQualifiers MaskQuals(
+      compiler::utils::eTypeQualNone);
   std::string Name;
   raw_string_ostream O(Name);
   O << VectorizationContext::InternalBuiltinPrefix << BaseName << Alignment
@@ -58,7 +60,8 @@ static std::string getMaskedMemOpName(Type *DataTy, PointerType *PtrTy,
     return std::string();
   }
   if (IsVP) {
-    compiler::utils::TypeQualifiers VLQuals(compiler::utils::eTypeQualNone);
+    const compiler::utils::TypeQualifiers VLQuals(
+        compiler::utils::eTypeQualNone);
     if (!Mangler.mangleType(O, IntegerType::getInt32Ty(DataTy->getContext()),
                             VLQuals)) {
       return std::string();
@@ -72,7 +75,7 @@ Function *vecz::getOrCreateMaskedMemOpFn(VectorizationContext &Ctx,
                                          Type *DataTy, PointerType *PtrTy,
                                          unsigned Alignment, bool IsLoad,
                                          bool IsVP) {
-  Module &M = Ctx.module();
+  const Module &M = Ctx.module();
   LLVMContext &LLVMCtx = M.getContext();
   Type *MaskTy = IntegerType::getInt1Ty(LLVMCtx);
   if (auto *VecTy = dyn_cast<VectorType>(DataTy)) {
@@ -80,7 +83,7 @@ Function *vecz::getOrCreateMaskedMemOpFn(VectorizationContext &Ctx,
   }
 
   // Try to retrieve the builtin if it already exists.
-  std::string Name =
+  const std::string Name =
       getMaskedMemOpName(DataTy, PtrTy, MaskTy, Alignment, IsLoad, IsVP);
   VECZ_FAIL_IF(Name.empty());
   Function *F = Ctx.getOrCreateInternalBuiltin(Name, nullptr);
@@ -158,10 +161,10 @@ static std::string getInterleavedMemOpName(Type *DataTy, PointerType *PtrTy,
   compiler::utils::NameMangler Mangler(&DataTy->getContext());
   const char *BaseName = IsLoad ? "interleaved_load" : "interleaved_store";
   std::string Name;
-  compiler::utils::TypeQualifiers VecQuals(compiler::utils::eTypeQualNone,
-                                           compiler::utils::eTypeQualNone);
-  compiler::utils::TypeQualifiers PtrQuals(compiler::utils::eTypeQualNone,
-                                           compiler::utils::eTypeQualNone);
+  const compiler::utils::TypeQualifiers VecQuals(
+      compiler::utils::eTypeQualNone, compiler::utils::eTypeQualNone);
+  const compiler::utils::TypeQualifiers PtrQuals(
+      compiler::utils::eTypeQualNone, compiler::utils::eTypeQualNone);
   raw_string_ostream O(Name);
   O << VectorizationContext::InternalBuiltinPrefix;
   if (MaskTy) {
@@ -182,13 +185,15 @@ static std::string getInterleavedMemOpName(Type *DataTy, PointerType *PtrTy,
     return std::string();
   }
   if (MaskTy) {
-    compiler::utils::TypeQualifiers MaskQuals(compiler::utils::eTypeQualNone);
+    const compiler::utils::TypeQualifiers MaskQuals(
+        compiler::utils::eTypeQualNone);
     if (!Mangler.mangleType(O, MaskTy, MaskQuals)) {
       return std::string();
     }
   }
   if (IsVP) {
-    compiler::utils::TypeQualifiers VLQuals(compiler::utils::eTypeQualNone);
+    const compiler::utils::TypeQualifiers VLQuals(
+        compiler::utils::eTypeQualNone);
     if (!Mangler.mangleType(O, IntegerType::getInt32Ty(DataTy->getContext()),
                             VLQuals)) {
       return std::string();
@@ -207,8 +212,8 @@ Function *vecz::getOrCreateInterleavedMemOpFn(VectorizationContext &Ctx,
   LLVMContext &LLVMCtx = M.getContext();
 
   // Try to retrieve the builtin if it already exists.
-  std::string Name = getInterleavedMemOpName(DataTy, PtrTy, Stride, MaskTy,
-                                             Alignment, IsLoad, IsVP);
+  const std::string Name = getInterleavedMemOpName(
+      DataTy, PtrTy, Stride, MaskTy, Alignment, IsLoad, IsVP);
   VECZ_FAIL_IF(Name.empty());
   Function *F = Ctx.getOrCreateInternalBuiltin(Name, nullptr);
   if (!F) {
@@ -296,11 +301,12 @@ static std::string getScatterGatherMemOpName(Type *DataTy, VectorType *VecPtrTy,
   compiler::utils::NameMangler Mangler(&DataTy->getContext());
   const char *BaseName = IsGather ? "gather_load" : "scatter_store";
   std::string Name;
-  compiler::utils::TypeQualifiers VecQuals(compiler::utils::eTypeQualNone,
-                                           compiler::utils::eTypeQualNone);
+  const compiler::utils::TypeQualifiers VecQuals(
+      compiler::utils::eTypeQualNone, compiler::utils::eTypeQualNone);
   compiler::utils::TypeQualifiers PtrQuals(compiler::utils::eTypeQualNone,
                                            compiler::utils::eTypeQualNone);
-  compiler::utils::TypeQualifiers MaskQuals(compiler::utils::eTypeQualNone);
+  const compiler::utils::TypeQualifiers MaskQuals(
+      compiler::utils::eTypeQualNone);
   PtrQuals.push_back(compiler::utils::eTypeQualNone);
   raw_string_ostream O(Name);
   O << VectorizationContext::InternalBuiltinPrefix;
@@ -319,7 +325,8 @@ static std::string getScatterGatherMemOpName(Type *DataTy, VectorType *VecPtrTy,
     return std::string();
   }
   if (IsVP) {
-    compiler::utils::TypeQualifiers VLQuals(compiler::utils::eTypeQualNone);
+    const compiler::utils::TypeQualifiers VLQuals(
+        compiler::utils::eTypeQualNone);
     if (!Mangler.mangleType(O, IntegerType::getInt32Ty(DataTy->getContext()),
                             VLQuals)) {
       return std::string();
@@ -335,15 +342,15 @@ Function *vecz::getOrCreateScatterGatherMemOpFn(vecz::VectorizationContext &Ctx,
                                                 llvm::Type *MaskTy,
                                                 unsigned Alignment,
                                                 bool IsGather, bool IsVP) {
-  Module &M = Ctx.module();
+  const Module &M = Ctx.module();
   LLVMContext &LLVMCtx = M.getContext();
   assert(VecPtrTy);
   assert(!MaskTy || multi_llvm::getVectorElementCount(MaskTy) ==
                         multi_llvm::getVectorElementCount(DataTy));
 
   // Try to retrieve the builtin if it already exists.
-  std::string Name = getScatterGatherMemOpName(DataTy, VecPtrTy, MaskTy,
-                                               Alignment, IsGather, IsVP);
+  const std::string Name = getScatterGatherMemOpName(DataTy, VecPtrTy, MaskTy,
+                                                     Alignment, IsGather, IsVP);
   VECZ_FAIL_IF(Name.empty());
   Function *F = Ctx.getOrCreateInternalBuiltin(Name, nullptr);
   if (!F) {
@@ -461,7 +468,7 @@ std::optional<MemOpDesc> MemOpDesc::analyzeMemOpFunction(Function &F) {
 }
 
 std::optional<MemOpDesc> MemOpDesc::analyzeMaskedMemOp(Function &F) {
-  StringRef MangledName = F.getName();
+  const StringRef MangledName = F.getName();
   compiler::utils::Lexer L(MangledName);
   if (!L.Consume(VectorizationContext::InternalBuiltinPrefix)) {
     return std::nullopt;
@@ -522,7 +529,7 @@ std::optional<MemOpDesc> MemOpDesc::analyzeMaskedMemOp(Function &F) {
 }
 
 std::optional<MemOpDesc> MemOpDesc::analyzeInterleavedMemOp(Function &F) {
-  StringRef MangledName = F.getName();
+  const StringRef MangledName = F.getName();
   compiler::utils::Lexer L(MangledName);
   if (!L.Consume(VectorizationContext::InternalBuiltinPrefix)) {
     return std::nullopt;
@@ -602,7 +609,7 @@ std::optional<MemOpDesc> MemOpDesc::analyzeInterleavedMemOp(Function &F) {
 }
 
 std::optional<MemOpDesc> MemOpDesc::analyzeMaskedInterleavedMemOp(Function &F) {
-  StringRef MangledName = F.getName();
+  const StringRef MangledName = F.getName();
   compiler::utils::Lexer L(MangledName);
   if (!L.Consume(VectorizationContext::InternalBuiltinPrefix)) {
     return std::nullopt;
@@ -700,7 +707,7 @@ std::optional<MemOpDesc> MemOpDesc::analyzeMaskedInterleavedMemOp(Function &F) {
 }
 
 std::optional<MemOpDesc> MemOpDesc::analyzeScatterGatherMemOp(Function &F) {
-  StringRef MangledName = F.getName();
+  const StringRef MangledName = F.getName();
   compiler::utils::Lexer L(MangledName);
   if (!L.Consume(VectorizationContext::InternalBuiltinPrefix)) {
     return std::nullopt;
@@ -754,7 +761,7 @@ std::optional<MemOpDesc> MemOpDesc::analyzeScatterGatherMemOp(Function &F) {
 
 std::optional<MemOpDesc> MemOpDesc::analyzeMaskedScatterGatherMemOp(
     Function &F) {
-  StringRef MangledName = F.getName();
+  const StringRef MangledName = F.getName();
   compiler::utils::Lexer L(MangledName);
   if (!L.Consume(VectorizationContext::InternalBuiltinPrefix)) {
     return std::nullopt;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp
index 218f17d2984a2..cc9f700804312 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp
@@ -245,8 +245,8 @@ OffsetInfo &OffsetInfo::analyze(Value *Offset, StrideAnalysisResult &SAR) {
       }
       BitMask = Src.BitMask & typeMask;
     } else if (isa<SExtInst>(Cast)) {
-      uint64_t widthMask = getTypeMask(Cast->getSrcTy());
-      uint64_t signMask = (widthMask >> 1) + 1;
+      const uint64_t widthMask = getTypeMask(Cast->getSrcTy());
+      const uint64_t signMask = (widthMask >> 1) + 1;
       if (Src.BitMask & signMask) {
         // If it's possible for the source value to be negative, all of the
         // bits in the extended value might be set.
@@ -364,7 +364,8 @@ OffsetInfo &OffsetInfo::analyzePtr(Value *Address, StrideAnalysisResult &SAR) {
         if (((VUArg.OldArg == Arg) || (VUArg.NewArg == Arg)) &&
             VUArg.PointerRetPointeeTy) {
           Type *MemTy = VUArg.PointerRetPointeeTy;
-          uint64_t MemSize = SAR.UVR.Ctx.dataLayout()->getTypeAllocSize(MemTy);
+          const uint64_t MemSize =
+              SAR.UVR.Ctx.dataLayout()->getTypeAllocSize(MemTy);
           return setStride(MemSize);
         }
       }
@@ -384,7 +385,7 @@ OffsetInfo &OffsetInfo::analyzePtr(Value *Address, StrideAnalysisResult &SAR) {
     }
 
     Type *MemTy = Alloca->getAllocatedType();
-    uint64_t MemSize = SAR.UVR.Ctx.dataLayout()->getTypeAllocSize(MemTy);
+    const uint64_t MemSize = SAR.UVR.Ctx.dataLayout()->getTypeAllocSize(MemTy);
     return setStride(MemSize);
   } else if (auto *const Phi = dyn_cast<PHINode>(Address)) {
     // If all the incoming values are the same, we can trace through it. In
@@ -481,7 +482,7 @@ OffsetInfo &OffsetInfo::analyzePtr(Value *Address, StrideAnalysisResult &SAR) {
       if (idxOffset.isStrideConstantInt()) {
         // Add all the strides together,
         // since `Base + (A * X) + (B * X) == Base + (A + B) * X`
-        uint64_t MemSize = SizeOrZero(
+        const uint64_t MemSize = SizeOrZero(
             GEP->getModule()->getDataLayout().getTypeAllocSize(MemTy));
         GEPStrideInt += idxOffset.StrideInt * MemSize;
       } else {
@@ -629,7 +630,7 @@ OffsetInfo &OffsetInfo::manifest(IRBuilder<> &B, StrideAnalysisResult &SAR) {
       // process of manifesting the indices can change the insert point.
       B.SetInsertPoint(GEP);
       Value *idxStride = nullptr;
-      uint64_t MemSize =
+      const uint64_t MemSize =
           SizeOrZero(GEP->getModule()->getDataLayout().getTypeAllocSize(MemTy));
       if (MemSize == 1) {
         // Don't need to do anything if the size is 1
@@ -665,7 +666,7 @@ OffsetInfo &OffsetInfo::manifest(IRBuilder<> &B, StrideAnalysisResult &SAR) {
 
 uint64_t OffsetInfo::getConstantMemoryStride(Type *PtrEleTy,
                                              const DataLayout *DL) const {
-  uint64_t PtrEleSize = SizeOrZero(DL->getTypeAllocSize(PtrEleTy));
+  const uint64_t PtrEleSize = SizeOrZero(DL->getTypeAllocSize(PtrEleTy));
   VECZ_FAIL_IF(!PtrEleSize);
 
   // It's not a valid stride if it's not divisible by the element size.
@@ -684,7 +685,7 @@ Value *OffsetInfo::buildMemoryStride(IRBuilder<> &B, Type *PtrEleTy,
     return nullptr;
   }
 
-  uint64_t PtrEleSize = SizeOrZero(DL->getTypeAllocSize(PtrEleTy));
+  const uint64_t PtrEleSize = SizeOrZero(DL->getTypeAllocSize(PtrEleTy));
   VECZ_FAIL_IF(!PtrEleSize);
 
   // It's not a valid stride if it's not divisible by the element size.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp
index de1097e3bf9ff..8f90e78f17f86 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp
@@ -334,7 +334,7 @@ std::optional<VeczPassOptions> getAutoSubgroupSizeOpts(
     // If we have a required work-group size, prefer one that will fit well
     // with that.
     if (auto wgs = compiler::utils::parseRequiredWGSMetadata(F)) {
-      uint64_t local_size_x = wgs.value()[0];
+      const uint64_t local_size_x = wgs.value()[0];
       const bool best_fits = !(local_size_x % *best_width);
       const bool cand_fits = !(local_size_x % candidate_width);
       if (!best_fits && cand_fits) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/reachability.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/reachability.cpp
index 6bc9efac517a8..2fbff40ef2e10 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/reachability.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/reachability.cpp
@@ -85,7 +85,7 @@ void Reachability::recalculate(Function &F) {
         continue;
       }
 
-      size_t succIndex = indexMap[succ];
+      const size_t succIndex = indexMap[succ];
 
       node.successors.push_back(succIndex);
       auto &succNode = graph[succIndex];
@@ -95,13 +95,13 @@ void Reachability::recalculate(Function &F) {
 
     if (auto *DTNode = DT.getNode(&BB)) {
       if (auto *IDom = DTNode->getIDom()) {
-        size_t dom = indexMap[IDom->getBlock()];
+        const size_t dom = indexMap[IDom->getBlock()];
         node.dom = dom;
       }
     }
     if (auto *PDTNode = PDT.getNode(&BB)) {
       if (auto *IPDom = PDTNode->getIDom()) {
-        size_t postDom = indexMap[IPDom->getBlock()];
+        const size_t postDom = indexMap[IPDom->getBlock()];
         node.postDom = postDom;
       }
     }
@@ -133,12 +133,12 @@ void Reachability::recalculate(Function &F) {
   std::vector<size_t> rootsY = roots;
 
   while (!roots.empty()) {
-    size_t u = roots.back();
+    const size_t u = roots.back();
     roots.pop_back();
 
     auto &uNode = graph[u];
     uNode.X = Xindex++;
-    for (size_t v : uNode.successors) {
+    for (const size_t v : uNode.successors) {
       auto &vNode = graph[v];
       if (--vNode.predTmp == 0) {
         roots.push_back(v);
@@ -160,14 +160,14 @@ void Reachability::recalculate(Function &F) {
   // the property of a max heap. No need to make_heap!
   while (!roots.empty()) {
     std::pop_heap(roots.begin(), roots.end(), cmpY);
-    size_t u = roots.back();
+    const size_t u = roots.back();
     roots.pop_back();
 
     auto &uNode = graph[u];
     uNode.Y = Yindex++;
     for (auto vi = uNode.successors.rbegin(), ve = uNode.successors.rend();
          vi != ve; ++vi) {
-      size_t v = *vi;
+      const size_t v = *vi;
       auto &vNode = graph[v];
       if (--vNode.predTmp == 0) {
         roots.push_back(v);
@@ -183,7 +183,7 @@ void Reachability::recalculate(Function &F) {
       dbgs() << BB.getName() << ":\n";
       dbgs() << "[ " << node.X << ", " << node.Y << " ] : ";
       dbgs() << "( " << node.dom << ", " << node.postDom << " ) : ";
-      for (size_t s : node.successors) {
+      for (const size_t s : node.successors) {
         if (graph[s].X <= graph[i].X) {
           dbgs() << "!x!";
         }
@@ -202,7 +202,7 @@ void Reachability::recalculate(Function &F) {
 
 bool Reachability::validate() const {
   for (auto &node : graph) {
-    for (size_t s : node.successors) {
+    for (const size_t s : node.successors) {
       if (graph[s].X <= node.X || graph[s].Y <= node.Y) {
         return false;
       }
@@ -223,8 +223,8 @@ bool Reachability::isReachableImpl(size_t from, size_t to) const {
       return false;
     }
 
-    size_t dom = nodeTo.dom;
-    size_t postDom = nodeFrom.postDom;
+    const size_t dom = nodeTo.dom;
+    const size_t postDom = nodeFrom.postDom;
     if (dom == from || postDom == to) {
       return true;
     }
@@ -244,7 +244,7 @@ bool Reachability::isReachableImpl(size_t from, size_t to) const {
     }
 
     // possible false positive, so check recursively..
-    for (size_t succ : nodeFrom.successors) {
+    for (const size_t succ : nodeFrom.successors) {
       if (succ == to) {
         return true;
       }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/builtin_inlining_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/builtin_inlining_pass.cpp
index 26d243724d842..f8c49fa1fbe7b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/builtin_inlining_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/builtin_inlining_pass.cpp
@@ -90,7 +90,7 @@ static Value *emitBuiltinMemSet(Function *F, IRBuilder<> &B,
                                 ArrayRef<Value *> Args, llvm::CallBase *CB) {
   LLVMContext &Context = F->getContext();
   auto &DL = F->getParent()->getDataLayout();
-  unsigned PtrBits = DL.getPointerSizeInBits();
+  const unsigned PtrBits = DL.getPointerSizeInBits();
 
   // Check the alignment constraints do not exceed the algorithmic requirements
   // of doing 64 bits at time
@@ -99,8 +99,8 @@ static Value *emitBuiltinMemSet(Function *F, IRBuilder<> &B,
   const auto &MSI = cast<MemSetInst>(CB);
 
   // Note that once LLVM 8.0 is deprecated we can use actual alignment classes
-  Align Alignment = MSI->getDestAlign().valueOrOne();
-  Align Int64Alignment = DL.getABITypeAlign(B.getInt64Ty());
+  const Align Alignment = MSI->getDestAlign().valueOrOne();
+  const Align Int64Alignment = DL.getABITypeAlign(B.getInt64Ty());
   if (Alignment < std::max(Int64Alignment, Align(8u))) {
     return nullptr;
   }
@@ -109,7 +109,7 @@ static Value *emitBuiltinMemSet(Function *F, IRBuilder<> &B,
   Type *Int8Ty = B.getInt8Ty();
 
   Value *StoredValue = Args[1];
-  bool IsVolatile = (Args.back() == ConstantInt::getTrue(Context));
+  const bool IsVolatile = (Args.back() == ConstantInt::getTrue(Context));
   llvm::StoreInst *MS = nullptr;
 
   // For nicely named IR instructions
@@ -120,7 +120,7 @@ static Value *emitBuiltinMemSet(Function *F, IRBuilder<> &B,
   if (!CL) {
     return nullptr;
   }
-  int64_t Bytes = CL->getValue().getZExtValue();
+  const int64_t Bytes = CL->getValue().getZExtValue();
 
   // Unlike memcpy, if we want to use 64bit stores in memset we need to
   // construct the 64bit value from a 8bit one.
@@ -130,7 +130,7 @@ static Value *emitBuiltinMemSet(Function *F, IRBuilder<> &B,
   if (ConstantValue) {
     // If we can get the value at compile time, calculate the 64bit value at
     // compile time as well.
-    unsigned IntValue = ConstantValue->getZExtValue();
+    const unsigned IntValue = ConstantValue->getZExtValue();
     APInt APValue(64, IntValue);
     for (int i = 1; IntValue && i < 8; ++i) {
       APValue |= APValue << 8;
@@ -165,7 +165,8 @@ static Value *emitBuiltinMemSet(Function *F, IRBuilder<> &B,
 
     // Set alignments for store to be minimum of that from
     // the instruction and what is required for 8 byte stores
-    Align StoreAlign = byte == 0 ? Alignment : std::min(Align(8u), Alignment);
+    const Align StoreAlign =
+        byte == 0 ? Alignment : std::min(Align(8u), Alignment);
     MS->setAlignment(StoreAlign);
   }
   // ...and then we fill in the remaining with 8bit stores.
@@ -185,9 +186,9 @@ static Value *emitBuiltinMemCpy(Function *F, IRBuilder<> &B,
   auto &DL = F->getParent()->getDataLayout();
 
   const auto &MSI = cast<MemCpyInst>(CB);
-  Align DestAlignment = MSI->getDestAlign().valueOrOne();
-  Align SourceAlignment = MSI->getSourceAlign().valueOrOne();
-  Align Int64Alignment = DL.getABITypeAlign(B.getInt64Ty());
+  const Align DestAlignment = MSI->getDestAlign().valueOrOne();
+  const Align SourceAlignment = MSI->getSourceAlign().valueOrOne();
+  const Align Int64Alignment = DL.getABITypeAlign(B.getInt64Ty());
 
   if (DestAlignment < std::max(Int64Alignment, Align(8u))) {
     return nullptr;
@@ -197,13 +198,13 @@ static Value *emitBuiltinMemCpy(Function *F, IRBuilder<> &B,
     return nullptr;
   }
 
-  unsigned PtrBits = DL.getPointerSizeInBits();
+  const unsigned PtrBits = DL.getPointerSizeInBits();
 
   Value *DstPtr = Args[0];
   Value *SrcPtr = Args[1];
   Type *Int8Ty = B.getInt8Ty();
 
-  bool IsVolatile = (Args.back() == ConstantInt::getTrue(Context));
+  const bool IsVolatile = (Args.back() == ConstantInt::getTrue(Context));
   llvm::StoreInst *MC = nullptr;
 
   // For nicely named IR instructions
@@ -216,7 +217,7 @@ static Value *emitBuiltinMemCpy(Function *F, IRBuilder<> &B,
   if (!CL) {
     return nullptr;
   }
-  int64_t Length = CL->getValue().getSExtValue();
+  const int64_t Length = CL->getValue().getSExtValue();
 
   // Emit enough stores to replicate the behaviour of memcpy.
   int64_t byte = 0;
@@ -241,10 +242,10 @@ static Value *emitBuiltinMemCpy(Function *F, IRBuilder<> &B,
 
     // Set alignments for stores and loads to be minimum of that from
     // the instruction and what is required for 8 byte load/stores
-    Align StoreAlign =
+    const Align StoreAlign =
         byte == 0 ? DestAlignment : std::min(Align(8u), DestAlignment);
     MC->setAlignment(StoreAlign);
-    Align LoadAlign =
+    const Align LoadAlign =
         byte == 0 ? SourceAlignment : std::min(Align(8u), SourceAlignment);
     LoadValue->setAlignment(LoadAlign);
   }
@@ -285,7 +286,7 @@ Value *BuiltinInliningPass::processCallSite(CallInst *CI,
   if (Callee->isIntrinsic()) {
     if (Callee->getIntrinsicID() == Intrinsic::memcpy) {
       IRBuilder<> B(CI);
-      SmallVector<Value *, 4> Args(CI->args());
+      const SmallVector<Value *, 4> Args(CI->args());
       if (Value *Impl = emitBuiltinMemCpy(Callee, B, Args, CI)) {
         return Impl;
       }
@@ -293,7 +294,7 @@ Value *BuiltinInliningPass::processCallSite(CallInst *CI,
 
     if (Callee->getIntrinsicID() == Intrinsic::memset) {
       IRBuilder<> B(CI);
-      SmallVector<Value *, 4> Args(CI->args());
+      const SmallVector<Value *, 4> Args(CI->args());
       if (Value *Impl = emitBuiltinMemSet(Callee, B, Args, CI)) {
         return Impl;
       }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
index 427a7ae71973b..be0d30ed6f9ec 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
@@ -950,7 +950,7 @@ bool ControlFlowConversionState::Impl::createLoopExitMasks(LoopTag &LTag) {
   Type *maskTy = Type::getInt1Ty(F.getContext());
   SmallVector<Loop::Edge, 1> exitEdges;
   LTag.loop->getExitEdges(exitEdges);
-  for (Loop::Edge &EE : exitEdges) {
+  for (const Loop::Edge &EE : exitEdges) {
     const auto *const exitingBlock = EE.first;
     const auto *const exitBlock = EE.second;
     // Divergent loop need to keep track of which instance left at which exit.
@@ -996,7 +996,7 @@ bool ControlFlowConversionState::Impl::createLoopExitMasks(LoopTag &LTag) {
     return innerLoop;
   };
 
-  for (Loop::Edge &EE : exitEdges) {
+  for (const Loop::Edge &EE : exitEdges) {
     BasicBlock *exitingBlock = const_cast<BasicBlock *>(EE.first);
     BasicBlock *exitBlock = const_cast<BasicBlock *>(EE.second);
 
@@ -1077,7 +1077,7 @@ bool ControlFlowConversionState::Impl::createCombinedLoopExitMask(
   auto *const Loop = LTag.loop;
   Loop->getExitEdges(exitEdges);
   auto &LMask = LoopMasks[Loop];
-  for (Loop::Edge &EE : exitEdges) {
+  for (const Loop::Edge &EE : exitEdges) {
     BasicBlock *exitingBlock = const_cast<BasicBlock *>(EE.first);
     BasicBlock *exitBlock = const_cast<BasicBlock *>(EE.second);
     if (DR->isDivergent(*exitBlock)) {
@@ -1863,7 +1863,7 @@ bool ControlFlowConversionState::Impl::rewireDivergentLoopExitBlocks(
         // proven otherwise, should not.
         break;
       case Instruction::Br: {
-        unsigned keepIdx = succIdx == 0 ? 1 : 0;
+        const unsigned keepIdx = succIdx == 0 ? 1 : 0;
         auto *newT = BranchInst::Create(T->getSuccessor(keepIdx), T);
 
         updateMaps(T, newT);
@@ -2674,8 +2674,8 @@ bool ControlFlowConversionState::Impl::generateSelects() {
     if (B->hasNPredecessors(1) || DR->isBlend(*B)) {
       if (PHINode *PHI = dyn_cast<PHINode>(&B->front())) {
         LLVM_DEBUG(dbgs() << B->getName() << ":\n");
-        SmallPtrSet<BasicBlock *, 2> incomings(PHI->block_begin(),
-                                               PHI->block_end());
+        const SmallPtrSet<BasicBlock *, 2> incomings(PHI->block_begin(),
+                                                     PHI->block_end());
         BasicBlock *cur = B;
         while (cur->hasNPredecessors(1) && !incomings.empty()) {
           cur = cur->getSinglePredecessor();
@@ -2793,7 +2793,7 @@ bool ControlFlowConversionState::Impl::updatePHIsIncomings() {
   // have changed since we have not changed the phi nodes during the rewiring.
   for (const auto &BBTag : DR->getBlockOrdering()) {
     BasicBlock *BB = BBTag.BB;
-    SmallPtrSet<BasicBlock *, 4> preds(pred_begin(BB), pred_end(BB));
+    const SmallPtrSet<BasicBlock *, 4> preds(pred_begin(BB), pred_end(BB));
     for (auto it = BB->begin(); it != BB->end();) {
       Instruction &I = *it++;
       PHINode *PHI = dyn_cast<PHINode>(&I);
@@ -2801,8 +2801,8 @@ bool ControlFlowConversionState::Impl::updatePHIsIncomings() {
         break;
       }
 
-      SmallPtrSet<BasicBlock *, 4> incomings(PHI->block_begin(),
-                                             PHI->block_end());
+      const SmallPtrSet<BasicBlock *, 4> incomings(PHI->block_begin(),
+                                                   PHI->block_end());
 
       // If no predecessors of `BB` is an incoming block of its PHI Node, then
       // completely transform the PHI Node into multiple select instructions.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/inline_post_vectorization_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/inline_post_vectorization_pass.cpp
index 9e5a619b6a1e3..f83b16b03f689 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/inline_post_vectorization_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/inline_post_vectorization_pass.cpp
@@ -64,7 +64,7 @@ Value *processCallSite(CallInst *CI, bool &NeedLLVMInline,
   const auto Builtin = BI.analyzeBuiltin(*Callee);
   if (Builtin.properties &
       compiler::utils::eBuiltinPropertyInlinePostVectorization) {
-    SmallVector<Value *, 4> Args(CI->args());
+    const SmallVector<Value *, 4> Args(CI->args());
     if (Value *Impl = BI.emitBuiltinInline(Callee, B, Args)) {
       VECZ_ERROR_IF(
           Impl->getType() != CI->getType(),
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/instantiation_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/instantiation_pass.cpp
index 4b6adeeb792ba..c6f69afa0abf2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/instantiation_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/instantiation_pass.cpp
@@ -58,7 +58,7 @@ InstantiationPass::InstantiationPass(Packetizer &pp)
 PacketRange InstantiationPass::instantiate(Value *V) {
   VECZ_FAIL_IF(packetizer.width().isScalable());
   if (auto info = packetizer.getPacketized(V)) {
-    unsigned SimdWidth = packetizer.width().getFixedValue();
+    const unsigned SimdWidth = packetizer.width().getFixedValue();
     return info.getAsPacket(SimdWidth);
   }
 
@@ -66,7 +66,7 @@ PacketRange InstantiationPass::instantiate(Value *V) {
   // items.
   auto *Ins = dyn_cast<Instruction>(V);
   if (Ins && packetizer.uniform().isMaskVarying(V)) {
-    PacketRange P = simdBroadcast(Ins);
+    const PacketRange P = simdBroadcast(Ins);
     if (!P) {
       emitVeczRemark(&packetizer.function(), V,
                      "Failed to broadcast Mask Varying instruction");
@@ -90,7 +90,7 @@ PacketRange InstantiationPass::instantiateInternal(Value *V) {
   if (packetizer.uniform().isVarying(V)) {
     // The packetizer will call back into the instantiator when it needs to
     VECZ_FAIL_IF(packetizer.width().isScalable());
-    unsigned SimdWidth = packetizer.width().getFixedValue();
+    const unsigned SimdWidth = packetizer.width().getFixedValue();
     return packetizer.packetize(V).getAsPacket(SimdWidth);
   } else {
     return instantiate(V);
@@ -126,7 +126,7 @@ PacketRange InstantiationPass::assignInstance(const PacketRange P, Value *V) {
 
 PacketRange InstantiationPass::broadcast(Value *V) {
   VECZ_FAIL_IF(packetizer.width().isScalable());
-  unsigned SimdWidth = packetizer.width().getFixedValue();
+  const unsigned SimdWidth = packetizer.width().getFixedValue();
   PacketRange P = packetizer.createPacket(V, SimdWidth);
   for (unsigned i = 0; i < SimdWidth; i++) {
     P[i] = V;
@@ -136,9 +136,9 @@ PacketRange InstantiationPass::broadcast(Value *V) {
 
 PacketRange InstantiationPass::instantiateCall(CallInst *CI) {
   VECZ_FAIL_IF(packetizer.width().isScalable());
-  unsigned SimdWidth = packetizer.width().getFixedValue();
+  const unsigned SimdWidth = packetizer.width().getFixedValue();
   // Handle special call instructions that return a lane ID.
-  compiler::utils::BuiltinInfo &BI = Ctx.builtins();
+  const compiler::utils::BuiltinInfo &BI = Ctx.builtins();
   const auto Builtin = BI.analyzeBuiltinCall(*CI, packetizer.dimension());
   if (Builtin.properties & compiler::utils::eBuiltinPropertyWorkItem) {
     const auto Uniformity = Builtin.uniformity;
@@ -249,7 +249,7 @@ PacketRange InstantiationPass::instantiateCall(CallInst *CI) {
 
 PacketRange InstantiationPass::instantiateAlloca(AllocaInst *Alloca) {
   VECZ_FAIL_IF(packetizer.width().isScalable());
-  unsigned SimdWidth = packetizer.width().getFixedValue();
+  const unsigned SimdWidth = packetizer.width().getFixedValue();
   PacketRange P = packetizer.createPacket(Alloca, SimdWidth);
   VECZ_FAIL_IF(!P);
   IRBuilder<> B(Alloca);
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/interleaved_group_combine_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/interleaved_group_combine_pass.cpp
index d1cbc37410c5d..de1b2b5c1240e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/interleaved_group_combine_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/interleaved_group_combine_pass.cpp
@@ -267,7 +267,7 @@ PreservedAnalyses InterleavedGroupCombinePass::run(
       if (!Op || !Op->isStrideConstantInt()) {
         continue;
       }
-      int64_t Stride = Op->getStrideAsConstantInt();
+      const int64_t Stride = Op->getStrideAsConstantInt();
       if ((Stride == 0) || (Stride == 1)) {
         continue;
       }
@@ -383,7 +383,7 @@ bool InterleavedGroupCombinePass::findGroup(
       Ptr0 = Info0.Op->getOperand(0);
     }
 
-    IRBuilder<> B(cast<Instruction>(Info0.Op));
+    const IRBuilder<> B(cast<Instruction>(Info0.Op));
     Value *Base0 = UVR.extractMemBase(Ptr0);
     if (!Base0) {
       continue;
@@ -395,7 +395,7 @@ bool InterleavedGroupCombinePass::findGroup(
     }
 
     Type *EleTy = DataType0->getScalarType();
-    unsigned Align = EleTy->getScalarSizeInBits() / 8;
+    const unsigned Align = EleTy->getScalarSizeInBits() / 8;
     assert(Align != 0 &&
            "interleaved memory operation with zero-sized elements");
 
@@ -429,7 +429,7 @@ bool InterleavedGroupCombinePass::findGroup(
         continue;
       }
 
-      IRBuilder<> B(cast<Instruction>(InfoN.Op));
+      const IRBuilder<> B(cast<Instruction>(InfoN.Op));
       Value *BaseN = UVR.extractMemBase(PtrN);
       if (!BaseN || BaseN != Base0) {
         continue;
@@ -503,7 +503,7 @@ bool InterleavedGroupCombinePass::findGroup(
         continue;
       }
 
-      unsigned Stride = Info0.Stride;
+      const unsigned Stride = Info0.Stride;
       Group.Stride = Stride;
       // If the group is bigger than the stride we can still de-interleave the
       // first "Stride" members
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
index 63838d477b201..6c7411ef6e78e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
@@ -65,7 +65,7 @@ inline Type *getWideType(Type *ty, ElementCount factor) {
   assert((!factor.isScalable() || !isScalable) &&
          "Can't widen a scalable vector by a scalable amount");
   auto *vecTy = cast<llvm::VectorType>(ty);
-  unsigned elts = vecTy->getElementCount().getKnownMinValue();
+  const unsigned elts = vecTy->getElementCount().getKnownMinValue();
   // If we're widening a scalable type then set the fixed factor to scalable
   // here.
   if (isScalable && !factor.isScalable()) {
@@ -245,7 +245,7 @@ bool createSubSplats(const vecz::TargetInfo &TI, IRBuilder<> &B,
     return false;
   }
 
-  unsigned srcWidth = vecTy->getNumElements();
+  const unsigned srcWidth = vecTy->getNumElements();
 
   // Build shuffle mask to widen the vector condition.
   SmallVector<int, 16> mask;
@@ -424,7 +424,7 @@ Value *Packetizer::Result::getAsValue() const {
     // Gathering an instantiated vector by concatenating all the lanes
     auto parts = narrow(2);
     auto *vecTy = cast<FixedVectorType>(parts.front()->getType());
-    unsigned fullWidth = vecTy->getNumElements() * 2;
+    const unsigned fullWidth = vecTy->getNumElements() * 2;
 
     SmallVector<int, 16> mask;
     for (size_t j = 0; j < fullWidth; ++j) {
@@ -480,7 +480,7 @@ PacketRange Packetizer::Result::getAsPacket(unsigned width) const {
   Value *vec = info->vector;
   if (auto *const vecTy = dyn_cast<FixedVectorType>(vec->getType())) {
     assert(isa<FixedVectorType>(vecTy) && "Must be a fixed vector type here!");
-    unsigned scalarWidth = vecTy->getNumElements() / width;
+    const unsigned scalarWidth = vecTy->getNumElements() / width;
     if (scalarWidth > 1) {
       auto *const undef = UndefValue::get(vec->getType());
 
@@ -716,9 +716,9 @@ const Packetizer::Result &Packetizer::Result::broadcast(unsigned width) const {
     result = createScalableBroadcastOfFixedVector(TI, B, scalar, factor);
   } else if (ty->isVectorTy()) {
     auto *const vecTy = cast<FixedVectorType>(ty);
-    unsigned scalarWidth = vecTy->getNumElements();
+    const unsigned scalarWidth = vecTy->getNumElements();
 
-    unsigned simdWidth = factor.getFixedValue();
+    const unsigned simdWidth = factor.getFixedValue();
 
     // Build shuffle mask to perform the splat.
     SmallVector<int, 16> mask;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_pass.cpp
index 1934a81938be7..3462bb05ebb7b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_pass.cpp
@@ -49,10 +49,10 @@ PreservedAnalyses PacketizationPass::run(Function &F,
   VectorizationUnit &VU = AM.getResult<VectorizationUnitAnalysis>(F).getVU();
 
   if (!VU.width().isScalable()) {
-    unsigned SimdWidth = VU.width().getFixedValue();
+    const unsigned SimdWidth = VU.width().getFixedValue();
     if (VU.autoWidth() && VU.context().targetInfo().getTargetMachine()) {
       LLVM_DEBUG(dbgs() << "vecz: Original SIMD width: " << SimdWidth << "\n");
-      unsigned NewSimdWidth = AM.getResult<SimdWidthAnalysis>(F).value;
+      const unsigned NewSimdWidth = AM.getResult<SimdWidthAnalysis>(F).value;
       LLVM_DEBUG(dbgs() << "vecz: Re-determined SIMD width: " << NewSimdWidth
                         << "\n");
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
index 98209bb56107d..524a263521281 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -472,7 +472,7 @@ Packetizer::Impl::~Impl() = default;
 bool Packetizer::packetize(llvm::Function &F, llvm::FunctionAnalysisManager &AM,
                            ElementCount Width, unsigned Dim) {
   Impl impl(F, AM, Width, Dim);
-  bool Res = impl.packetize();
+  const bool Res = impl.packetize();
   if (!Res) {
     impl.onFailure();
   }
@@ -689,7 +689,7 @@ bool Packetizer::Impl::packetize() {
       // back to vector type for contiguous loads/stores)
       bool needCast = false;
       auto *const newTy = newAlloca->getType();
-      for (Use &U : alloca->uses()) {
+      for (const Use &U : alloca->uses()) {
         auto *const user = dyn_cast<BitCastInst>(U.getUser());
         if (!user) {
           needCast = true;
@@ -727,13 +727,13 @@ bool Packetizer::Impl::packetize() {
       // Make sure the alloca has an alignment at least as wide as any of the
       // packetized loads or stores using it.
       SmallVector<Instruction *, 8> users;
-      for (Use &U : alloca->uses()) {
+      for (const Use &U : alloca->uses()) {
         users.push_back(cast<Instruction>(U.getUser()));
       }
       while (!users.empty()) {
         auto *const user = users.pop_back_val();
         if (isa<BitCastInst>(user) || isa<GetElementPtrInst>(user)) {
-          for (Use &U : user->uses()) {
+          for (const Use &U : user->uses()) {
             users.push_back(cast<Instruction>(U.getUser()));
           }
         } else if (auto memop = MemOp::get(user)) {
@@ -758,7 +758,7 @@ bool Packetizer::Impl::packetize() {
     IC.deleteInstructionLater(alloca);
   }
 
-  compiler::utils::NameMangler Mangler(&F.getContext());
+  const compiler::utils::NameMangler Mangler(&F.getContext());
 
   // Handle __mux_get_sub_group_size specially (i.e., not in BuiltinInfo) since
   // inlining it requires extra vectorization context, such as the vectorization
@@ -914,7 +914,7 @@ Value *Packetizer::Impl::reduceBranchCond(Value *cond, Instruction *terminator,
     }
   }
 
-  RecurKind kind = allOf ? RecurKind::And : RecurKind::Or;
+  const RecurKind kind = allOf ? RecurKind::And : RecurKind::Or;
 
   // VP reduction intrinsics didn't make it into LLVM 13 so we have to make do
   // by pre-sanitizing the input such that elements past VL get the identity
@@ -975,7 +975,7 @@ Packetizer::Result Packetizer::Impl::packetize(Value *V) {
       // a divergence reduction so it will need reducing manually here.
       if (newCond->getType()->isVectorTy()) {
         IRBuilder<> B(Branch);
-        RecurKind kind = RecurKind::Or;
+        const RecurKind kind = RecurKind::Or;
         newCond = createMaybeVPTargetReduction(B, TTI, newCond, kind, VL);
       }
 
@@ -1212,7 +1212,7 @@ Value *Packetizer::Impl::packetizeGroupReduction(Instruction *I) {
   if (!CI || !CI->getCalledFunction()) {
     return nullptr;
   }
-  compiler::utils::BuiltinInfo &BI = Ctx.builtins();
+  const compiler::utils::BuiltinInfo &BI = Ctx.builtins();
   Function *callee = CI->getCalledFunction();
 
   const auto Builtin = BI.analyzeBuiltin(*callee);
@@ -1223,8 +1223,8 @@ Value *Packetizer::Impl::packetizeGroupReduction(Instruction *I) {
     return nullptr;
   }
 
-  bool isWorkGroup = Info->isWorkGroupScope();
-  unsigned argIdx = isWorkGroup ? 1 : 0;
+  const bool isWorkGroup = Info->isWorkGroupScope();
+  const unsigned argIdx = isWorkGroup ? 1 : 0;
 
   SmallVector<Value *, 16> opPackets;
   IRBuilder<> B(CI);
@@ -1285,7 +1285,7 @@ Value *Packetizer::Impl::packetizeGroupBroadcast(Instruction *I) {
   if (!CI || !CI->getCalledFunction()) {
     return nullptr;
   }
-  compiler::utils::BuiltinInfo &BI = Ctx.builtins();
+  const compiler::utils::BuiltinInfo &BI = Ctx.builtins();
   Function *callee = CI->getCalledFunction();
   const auto Builtin = BI.analyzeBuiltin(*callee);
 
@@ -1302,7 +1302,7 @@ Value *Packetizer::Impl::packetizeGroupBroadcast(Instruction *I) {
 
   IRBuilder<> B(CI);
 
-  unsigned argIdx = isWorkGroup ? 1 : 0;
+  const unsigned argIdx = isWorkGroup ? 1 : 0;
   auto *const src = CI->getArgOperand(argIdx);
 
   auto op = packetize(src);
@@ -1386,7 +1386,7 @@ Packetizer::Impl::isSubgroupShuffleLike(Instruction *I) {
   if (!CI || !CI->getCalledFunction()) {
     return std::nullopt;
   }
-  compiler::utils::BuiltinInfo &BI = Ctx.builtins();
+  const compiler::utils::BuiltinInfo &BI = Ctx.builtins();
   Function *callee = CI->getCalledFunction();
 
   const auto Builtin = BI.analyzeBuiltin(*callee);
@@ -1666,7 +1666,7 @@ Packetizer::Result Packetizer::Impl::packetizeSubgroupShuffleXor(
 
 Packetizer::Result Packetizer::Impl::packetizeSubgroupShuffleUpDown(
     Instruction *I, compiler::utils::GroupCollective ShuffleUpDown) {
-  bool IsDown =
+  const bool IsDown =
       ShuffleUpDown.Op == compiler::utils::GroupCollective::OpKind::ShuffleDown;
   assert((IsDown || ShuffleUpDown.Op ==
                         compiler::utils::GroupCollective::OpKind::ShuffleUp) &&
@@ -1911,8 +1911,8 @@ Value *Packetizer::Impl::packetizeMaskVarying(Instruction *I) {
 
     Value *vecMask = nullptr;
 
-    MemOpDesc desc = memop->getDesc();
-    bool isVector = desc.getDataType()->isVectorTy();
+    const MemOpDesc desc = memop->getDesc();
+    const bool isVector = desc.getDataType()->isVectorTy();
 
     // If only the mask operand is varying, we do not need to vectorize the
     // MemOp itself, only reduce the mask with an OR.
@@ -2257,14 +2257,14 @@ ValuePacket Packetizer::Impl::packetizeGroupScan(
 
   compiler::utils::NameMangler mangler(&CI->getContext());
 
-  unsigned ArgOffset = Scan.isWorkGroupScope() ? 1 : 0;
+  const unsigned ArgOffset = Scan.isWorkGroupScope() ? 1 : 0;
 
   // The operands and types for the internal builtin
   SmallVector<Value *, 2> Ops = {
       packetize(CI->getArgOperand(ArgOffset)).getAsValue()};
   SmallVector<Type *, 2> Tys = {getWideType(CI->getType(), SimdWidth)};
 
-  bool isInclusive =
+  const bool isInclusive =
       Scan.Op == compiler::utils::GroupCollective::OpKind::ScanInclusive;
   StringRef op = "add";
   // min/max scans are prefixed with s/u if they are signed/unsigned integer
@@ -2328,7 +2328,7 @@ ValuePacket Packetizer::Impl::packetizeGroupScan(
     << (isInclusive ? "inclusive" : "exclusive") << "_" << op
     << (VP ? "_vp" : "") << "_";
 
-  compiler::utils::TypeQualifiers VecQuals(
+  const compiler::utils::TypeQualifiers VecQuals(
       compiler::utils::eTypeQualNone, opIsSignedInt
                                           ? compiler::utils::eTypeQualSignedInt
                                           : compiler::utils::eTypeQualNone);
@@ -2340,7 +2340,8 @@ ValuePacket Packetizer::Impl::packetizeGroupScan(
   if (VP) {
     Ops.push_back(VL);
     Tys.push_back(VL->getType());
-    compiler::utils::TypeQualifiers VLQuals(compiler::utils::eTypeQualNone);
+    const compiler::utils::TypeQualifiers VLQuals(
+        compiler::utils::eTypeQualNone);
     if (!mangler.mangleType(O, Tys[1], VLQuals)) {
       return results;
     }
@@ -2513,9 +2514,10 @@ ValuePacket Packetizer::Impl::packetizeMemOp(MemOp &op) {
   }
 
   auto *const constantStrideVal = dyn_cast_or_null<ConstantInt>(stride);
-  int constantStride =
+  const int constantStride =
       constantStrideVal ? constantStrideVal->getSExtValue() : 0;
-  bool validStride = stride && (!constantStrideVal || constantStride != 0);
+  const bool validStride =
+      stride && (!constantStrideVal || constantStride != 0);
   if (!validStride) {
     if (dataTy->isPointerTy()) {
       // We do not have vector-of-pointers support in Vecz builtins, hence
@@ -2730,12 +2732,13 @@ ValuePacket Packetizer::Impl::packetizeMemOp(MemOp &op) {
     // alignment, but may be overaligned. After vectorization it can't be
     // larger than the pointee element type.
     unsigned alignment = op.getAlignment();
-    unsigned sizeInBits = dataTy->getPrimitiveSizeInBits().getKnownMinValue();
+    const unsigned sizeInBits =
+        dataTy->getPrimitiveSizeInBits().getKnownMinValue();
     alignment = std::min(alignment, std::max(sizeInBits, 8u) / 8u);
 
     // Regular load or store.
     if (mask) {
-      bool isVectorMask = mask->getType()->isVectorTy();
+      const bool isVectorMask = mask->getType()->isVectorTy();
       auto maskPacket = packetizeAndGet(mask, packetWidth);
       PACK_FAIL_IF(maskPacket.empty());
 
@@ -2744,8 +2747,8 @@ ValuePacket Packetizer::Impl::packetizeMemOp(MemOp &op) {
       auto *const vecTy = dyn_cast<FixedVectorType>(dataTy);
       if (vecTy && !isVectorMask) {
         PACK_FAIL_IF(factor.isScalable());
-        unsigned simdWidth = factor.getFixedValue();
-        unsigned scalarWidth = vecTy->getNumElements();
+        const unsigned simdWidth = factor.getFixedValue();
+        const unsigned scalarWidth = vecTy->getNumElements();
 
         // Build shuffle mask to widen the vector condition.
         SmallVector<int, 16> widenMask;
@@ -2777,7 +2780,7 @@ ValuePacket Packetizer::Impl::packetizeMemOp(MemOp &op) {
         }
       }
     } else {
-      TargetInfo &VTI = Ctx.targetInfo();
+      const TargetInfo &VTI = Ctx.targetInfo();
       if (op.isLoad()) {
         auto *const one = B.getInt64(1);
         for (unsigned i = 0; i != packetWidth; ++i) {
@@ -2833,7 +2836,7 @@ ValuePacket Packetizer::Impl::packetizeMaskedAtomic(
   }
 
   ValuePacket valOrCmpPacket;
-  Result valResult = packetize(valOrCmpArg);
+  const Result valResult = packetize(valOrCmpArg);
   PACK_FAIL_IF(!valResult);
   valResult.getPacketValues(packetWidth, valOrCmpPacket);
   PACK_FAIL_IF(valOrCmpPacket.empty());
@@ -2841,20 +2844,20 @@ ValuePacket Packetizer::Impl::packetizeMaskedAtomic(
   ValuePacket newValPacket;
   if (IsCmpXchg) {
     Value *const newValArg = CI.getArgOperand(2);
-    Result newValResult = packetize(newValArg);
+    const Result newValResult = packetize(newValArg);
     PACK_FAIL_IF(!newValResult);
     newValResult.getPacketValues(packetWidth, newValPacket);
     PACK_FAIL_IF(newValPacket.empty());
   }
 
   ValuePacket ptrPacket;
-  Result ptrResult = packetize(ptrArg);
+  const Result ptrResult = packetize(ptrArg);
   PACK_FAIL_IF(!ptrResult);
   ptrResult.getPacketValues(packetWidth, ptrPacket);
   PACK_FAIL_IF(ptrPacket.empty());
 
   ValuePacket maskPacket;
-  Result maskResult = packetize(maskArg);
+  const Result maskResult = packetize(maskArg);
   PACK_FAIL_IF(!maskResult);
   maskResult.getPacketValues(packetWidth, maskPacket);
   PACK_FAIL_IF(maskPacket.empty());
@@ -2946,7 +2949,7 @@ ValuePacket Packetizer::Impl::packetizeGEP(GetElementPtrInst *GEP) {
   IRBuilder<> B(GEP);
   IC.deleteInstructionLater(GEP);
 
-  bool inBounds = GEP->isInBounds();
+  const bool inBounds = GEP->isInBounds();
   const auto name = GEP->getName();
 
   const auto numIndices = opPackets.size();
@@ -3182,7 +3185,7 @@ ValuePacket Packetizer::Impl::packetizeSelect(SelectInst *Select) {
   PACK_FAIL_IF(!resC);
 
   IRBuilder<> B(Select);
-  bool isVectorSelect = cond->getType()->isVectorTy();
+  const bool isVectorSelect = cond->getType()->isVectorTy();
   SmallVector<Value *, 16> vecC;
   if (UVR.isVarying(cond)) {
     resC.getPacketValues(packetWidth, vecC);
@@ -3264,7 +3267,7 @@ Value *Packetizer::Impl::vectorizeCall(CallInst *CI) {
   }
 
   // Handle external builtins.
-  compiler::utils::BuiltinInfo &BI = Ctx.builtins();
+  const compiler::utils::BuiltinInfo &BI = Ctx.builtins();
   const auto Builtin = BI.analyzeBuiltinCall(*CI, Dimension);
 
   if (Builtin.properties & compiler::utils::eBuiltinPropertyExecutionFlow) {
@@ -3449,7 +3452,7 @@ Value *Packetizer::Impl::vectorizeAlloca(AllocaInst *alloca) {
   // case" is actually the most likely.
   //
   VECZ_FAIL_IF(SimdWidth.isScalable());
-  unsigned fixedWidth = SimdWidth.getFixedValue();
+  const unsigned fixedWidth = SimdWidth.getFixedValue();
   IRBuilder<> B(alloca);
   auto *const ty = alloca->getAllocatedType();
   AllocaInst *wideAlloca =
@@ -3898,7 +3901,7 @@ ValuePacket Packetizer::Impl::packetizeShuffleVector(
     } else {
       // It isn't safe to do it if it's not a power of 2.
       PACK_FAIL_IF(!isPowerOf2_32(scalarWidth));
-      TargetInfo &VTI = Ctx.targetInfo();
+      const TargetInfo &VTI = Ctx.targetInfo();
 
       const auto dstScalarWidth = multi_llvm::getVectorNumElements(ty);
       const auto fullWidth = SimdWidth * dstScalarWidth;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/passes.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/passes.cpp
index 2c99f5e6af665..13897a143e591 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/passes.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/passes.cpp
@@ -83,7 +83,7 @@ PreservedAnalyses SimplifyMaskedMemOpsPass::run(Function &F,
                                                 FunctionAnalysisManager &AM) {
   auto &Ctx = AM.getResult<VectorizationContextAnalysis>(F).getContext();
 
-  TargetInfo &VTI = Ctx.targetInfo();
+  const TargetInfo &VTI = Ctx.targetInfo();
   std::vector<Instruction *> ToDelete;
   for (Function &Builtin : F.getParent()->functions()) {
     std::optional<MemOpDesc> BuiltinDesc =
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/pre_linearize_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/pre_linearize_pass.cpp
index e718f75a61b7a..fee752eb9df71 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/pre_linearize_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/pre_linearize_pass.cpp
@@ -123,7 +123,7 @@ InstructionCost calculateBoolReductionCost(LLVMContext &context, Module *module,
   IRBuilder<> B(BB);
   multi_llvm::createSimpleTargetReduction(B, &TTI, &*F->arg_begin(),
                                           RecurKind::And);
-  InstructionCost cost = calculateBlockCost(*BB, TTI);
+  const InstructionCost cost = calculateBlockCost(*BB, TTI);
 
   // We don't really need that function in the module anymore because it's
   // only purpose was to be used for analysis, so we go ahead and remove it.
@@ -220,12 +220,12 @@ PreservedAnalyses PreLinearizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
   VectorizationUnitAnalysis::Result R =
       AM.getResult<VectorizationUnitAnalysis>(F);
-  TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
-  VectorizationUnit &VU = R.getVU();
+  const TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
+  const VectorizationUnit &VU = R.getVU();
 
   bool modified = false;
   auto &LI = AM.getResult<LoopAnalysis>(F);
-  bool div_exceptions =
+  const bool div_exceptions =
       VU.choices().isEnabled(VectorizationChoices::eDivisionExceptions);
 
   InstructionCost boscc_cost;
@@ -287,7 +287,7 @@ PreservedAnalyses PreLinearizePass::run(Function &F,
       InstructionCost total_cost = 0;
 
       for (auto *succ : hoistable) {
-        InstructionCost block_cost = calculateBlockCost(*succ, TTI);
+        const InstructionCost block_cost = calculateBlockCost(*succ, TTI);
         if (block_cost < min_cost) {
           min_cost = block_cost;
         }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/printf_scalarizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/printf_scalarizer.cpp
index 28d649814c4ec..fe18bef86e51a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/printf_scalarizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/printf_scalarizer.cpp
@@ -85,8 +85,8 @@ bool IncrementPtr(const char **fmt) {
 GlobalVariable *GetNewFormatStringAsGlobalVar(
     Module &module, GlobalVariable *const string_value,
     const std::string &new_format_string) {
-  ArrayRef<uint8_t> Elts((uint8_t *)(&new_format_string[0]),
-                         new_format_string.size());
+  const ArrayRef<uint8_t> Elts((uint8_t *)(&new_format_string[0]),
+                               new_format_string.size());
   Constant *new_format_string_const =
       ConstantDataArray::get(module.getContext(), Elts);
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarization_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarization_pass.cpp
index 36d93e9f64a6c..285d4ab334e81 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarization_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarization_pass.cpp
@@ -190,9 +190,9 @@ PreservedAnalyses ScalarizationPass::run(llvm::Function &F,
   const auto *DI =
       MAMProxy.getCachedResult<compiler::utils::DeviceInfoAnalysis>(
           *F.getParent());
-  bool DoubleSupport = DI && DI->double_capabilities != 0;
+  const bool DoubleSupport = DI && DI->double_capabilities != 0;
 
-  bool FullScalarization =
+  const bool FullScalarization =
       VU.choices().isEnabled(VectorizationChoices::eFullScalarization);
   bool NeedsScalarization = false;
   Scalarizer SR(F, Ctx, DoubleSupport);
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
index aa899606c437e..833b49bf09c4a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
@@ -100,7 +100,7 @@ Value *Scalarizer::getGather(Value *V) {
   }
 
   auto *VecTy = cast<FixedVectorType>(V->getType());
-  unsigned SimdWidth = VecTy->getNumElements();
+  const unsigned SimdWidth = VecTy->getNumElements();
 
   SimdPacket *P = getPacket(V, SimdWidth, false);
   assert(P);
@@ -152,7 +152,7 @@ bool Scalarizer::scalarizeAll() {
   for (Value *V : ToScalarize) {
     auto *VecTy = getVectorType(V);
     assert(VecTy && "Trying to scalarize a non-vector");
-    unsigned SimdWidth = VecTy->getNumElements();
+    const unsigned SimdWidth = VecTy->getNumElements();
     // In the SimdPacket we use a mask that is stored as a uint64_t. Due
     // to that, there is a limit on the vector size that Vecz can
     // handle.
@@ -225,7 +225,7 @@ Value *Scalarizer::scalarizeOperands(Instruction *I) {
     // printf calls:
     if (!Callee->isIntrinsic()) {
       // Check if this is indeed a printf call
-      compiler::utils::BuiltinInfo &BI = Ctx.builtins();
+      const compiler::utils::BuiltinInfo &BI = Ctx.builtins();
       const auto ID = BI.analyzeBuiltin(*Callee).ID;
       if (ID == BI.getPrintfBuiltin()) {
         return scalarizeOperandsPrintf(CI);
@@ -257,7 +257,7 @@ Value *Scalarizer::scalarizeOperandsPrintf(CallInst *CI) {
   // Get the format string as a string
   GlobalVariable *FmtStringGV = GetFormatStringAsValue(CI->getArgOperand(0));
   VECZ_STAT_FAIL_IF(!FmtStringGV, VeczScalarizeFailCall);
-  std::string FmtString = GetFormatStringAsString(FmtStringGV);
+  const std::string FmtString = GetFormatStringAsString(FmtStringGV);
   VECZ_STAT_FAIL_IF(FmtString.empty(), VeczScalarizeFailCall);
   std::string NewFmtString;
   const EnumPrintfError err =
@@ -460,7 +460,7 @@ Value *Scalarizer::scalarizeOperandsExtractElement(ExtractElementInst *Extr) {
     ReturnVal = Select;
   } else {
     // Scalarize the original vector, but only for the lane to extract.
-    unsigned Lane = ConstantExtractIndex->getZExtValue();
+    const unsigned Lane = ConstantExtractIndex->getZExtValue();
     PM.enable(Lane);
     OrigVecPacket = scalarize(OrigVec, PM);
     VECZ_FAIL_IF(!OrigVecPacket);
@@ -476,7 +476,7 @@ Value *Scalarizer::scalarizeOperandsExtractElement(ExtractElementInst *Extr) {
 Value *Scalarizer::scalarizeOperandsBitCast(BitCastInst *BC) {
   auto *VecSrcTy = dyn_cast<FixedVectorType>(BC->getSrcTy());
   VECZ_FAIL_IF(!VecSrcTy);
-  unsigned SimdWidth = VecSrcTy->getNumElements();
+  const unsigned SimdWidth = VecSrcTy->getNumElements();
   PacketMask PM;
   PM.enableAll(SimdWidth);
   SimdPacket *SrcPacket = scalarize(BC->getOperand(0), PM);
@@ -486,8 +486,8 @@ Value *Scalarizer::scalarizeOperandsBitCast(BitCastInst *BC) {
   Type *DstAsIntTy = DstTy;
   Type *SrcEleTy = VecSrcTy->getElementType();
   Type *SrcEleAsIntTy = SrcEleTy;
-  unsigned SrcEleBits = SrcEleTy->getScalarSizeInBits();
-  unsigned DstBits = DstTy->getPrimitiveSizeInBits();
+  const unsigned SrcEleBits = SrcEleTy->getScalarSizeInBits();
+  const unsigned DstBits = DstTy->getPrimitiveSizeInBits();
   if (!DstTy->isIntegerTy()) {
     DstAsIntTy = IntegerType::get(BC->getContext(), DstBits);
   }
@@ -519,7 +519,7 @@ SimdPacket *Scalarizer::scalarize(Value *V, PacketMask PM) {
   auto *VecTy = getVectorType(V);
   VECZ_ERROR_IF(!VecTy,
                 "We shouldn't be trying to scalarize a non-vector instruction");
-  unsigned SimdWidth = VecTy->getNumElements();
+  const unsigned SimdWidth = VecTy->getNumElements();
 
   // Re-use cached packets, but make sure it contains all the lanes we want.
   // If we have a cached packet with missing lanes, it will be fetched by
@@ -610,7 +610,7 @@ SimdPacket *Scalarizer::scalarize(Value *V, PacketMask PM) {
 SimdPacket *Scalarizer::extractLanes(llvm::Value *V, PacketMask PM) {
   auto *VecTy = getVectorType(V);
   VECZ_FAIL_IF(!VecTy);
-  unsigned SimdWidth = VecTy->getNumElements();
+  const unsigned SimdWidth = VecTy->getNumElements();
   SimdPacket *P = getPacket(V, SimdWidth);
 
   if (Constant *CVec = dyn_cast<Constant>(V)) {
@@ -758,7 +758,7 @@ SimdPacket *Scalarizer::scalarizeLoad(LoadInst *Load, PacketMask PM) {
   PointerType *VecPtrTy = cast<PointerType>(VecPtr->getType());
   auto *VecDataTy = dyn_cast<FixedVectorType>(Load->getType());
   VECZ_FAIL_IF(!VecDataTy);
-  unsigned SimdWidth = VecDataTy->getNumElements();
+  const unsigned SimdWidth = VecDataTy->getNumElements();
 
   Type *ScalarEleTy = VecDataTy->getElementType();
   PointerType *ScalarPtrTy =
@@ -815,7 +815,7 @@ SimdPacket *Scalarizer::scalarizeLoad(LoadInst *Load, PacketMask PM) {
 
   // The individual elements may need laxer alignment requirements than the
   // whole vector.
-  unsigned Alignment = Load->getAlign().value();
+  const unsigned Alignment = Load->getAlign().value();
   unsigned EleAlign = ScalarEleTy->getPrimitiveSizeInBits() / 8;
   if (Alignment < EleAlign) {
     EleAlign = Alignment;
@@ -844,7 +844,7 @@ SimdPacket *Scalarizer::scalarizeStore(StoreInst *Store, PacketMask PM) {
   auto *VecDataTy =
       dyn_cast<FixedVectorType>(Store->getValueOperand()->getType());
   VECZ_FAIL_IF(!VecDataTy);
-  unsigned SimdWidth = VecDataTy->getNumElements();
+  const unsigned SimdWidth = VecDataTy->getNumElements();
   Type *ScalarEleTy = VecDataTy->getElementType();
   PointerType *ScalarPtrTy =
       PointerType::get(ScalarEleTy, VecPtrTy->getAddressSpace());
@@ -900,7 +900,7 @@ SimdPacket *Scalarizer::scalarizeStore(StoreInst *Store, PacketMask PM) {
   }
 
   // See comment at equivalent part of scalarizeLoad()
-  unsigned Alignment = Store->getAlign().value();
+  const unsigned Alignment = Store->getAlign().value();
   unsigned EleAlign = ScalarEleTy->getPrimitiveSizeInBits() / 8;
   if (Alignment < EleAlign) {
     EleAlign = Alignment;
@@ -933,7 +933,7 @@ SimdPacket *Scalarizer::scalarizeBinaryOp(BinaryOperator *BinOp,
   Value *LHS = BinOp->getOperand(0);
   auto *VecDataTy = dyn_cast<FixedVectorType>(LHS->getType());
   VECZ_FAIL_IF(!VecDataTy);
-  unsigned SimdWidth = VecDataTy->getNumElements();
+  const unsigned SimdWidth = VecDataTy->getNumElements();
   SimdPacket *LHSPacket = scalarize(LHS, PM);
   VECZ_FAIL_IF(!LHSPacket);
   Value *RHS = BinOp->getOperand(1);
@@ -961,7 +961,7 @@ SimdPacket *Scalarizer::scalarizeFreeze(FreezeInst *FreezeI, PacketMask PM) {
   Value *Src = FreezeI->getOperand(0);
   auto *VecDataTy = dyn_cast<FixedVectorType>(Src->getType());
   VECZ_FAIL_IF(!VecDataTy);
-  unsigned SimdWidth = VecDataTy->getNumElements();
+  const unsigned SimdWidth = VecDataTy->getNumElements();
   SimdPacket *SrcPacket = scalarize(Src, PM);
   VECZ_FAIL_IF(!SrcPacket);
 
@@ -982,7 +982,7 @@ SimdPacket *Scalarizer::scalarizeUnaryOp(UnaryOperator *UnOp, PacketMask PM) {
   Value *Src = UnOp->getOperand(0);
   auto *VecDataTy = dyn_cast<FixedVectorType>(Src->getType());
   VECZ_FAIL_IF(!VecDataTy);
-  unsigned SimdWidth = VecDataTy->getNumElements();
+  const unsigned SimdWidth = VecDataTy->getNumElements();
   SimdPacket *SrcPacket = scalarize(Src, PM);
   VECZ_FAIL_IF(!SrcPacket);
   SimdPacket *P = getPacket(UnOp, SimdWidth);
@@ -1002,7 +1002,7 @@ SimdPacket *Scalarizer::scalarizeUnaryOp(UnaryOperator *UnOp, PacketMask PM) {
 
 SimdPacket *Scalarizer::scalarizeCast(CastInst *CastI, PacketMask PM) {
   // Make sure we support the cast operation.
-  CastInst::CastOps Opc = CastI->getOpcode();
+  const CastInst::CastOps Opc = CastI->getOpcode();
   switch (Opc) {
     default:
       return nullptr;
@@ -1026,7 +1026,7 @@ SimdPacket *Scalarizer::scalarizeCast(CastInst *CastI, PacketMask PM) {
   Value *Src = CastI->getOperand(0);
   auto *VecSrcTy = dyn_cast<FixedVectorType>(Src->getType());
   VECZ_FAIL_IF(!VecSrcTy);
-  unsigned SimdWidth = VecSrcTy->getNumElements();
+  const unsigned SimdWidth = VecSrcTy->getNumElements();
   auto *VecDstTy = dyn_cast<FixedVectorType>(CastI->getType());
   VECZ_STAT_FAIL_IF(!VecDstTy || (VecDstTy->getNumElements() != SimdWidth),
                     VeczScalarizeFailCast);
@@ -1055,9 +1055,9 @@ SimdPacket *Scalarizer::scalarizeBitCast(BitCastInst *BC, PacketMask PM) {
   auto *VecSrcTy = dyn_cast<FixedVectorType>(SrcTy);
   auto *VecDstTy = dyn_cast<FixedVectorType>(BC->getDestTy());
   VECZ_FAIL_IF(!VecDstTy);
-  unsigned SimdWidth = VecDstTy->getNumElements();
-  bool Vec3Src = VecSrcTy && (VecSrcTy->getNumElements() == 3);
-  bool Vec3Dst = (SimdWidth == 3);
+  const unsigned SimdWidth = VecDstTy->getNumElements();
+  const bool Vec3Src = VecSrcTy && (VecSrcTy->getNumElements() == 3);
+  const bool Vec3Dst = (SimdWidth == 3);
   VECZ_STAT_FAIL_IF(Vec3Src ^ Vec3Dst, VeczScalarizeFailBitcast);
 
   // Handle non-vector -> vector casts and vector casts with different widths.
@@ -1068,8 +1068,8 @@ SimdPacket *Scalarizer::scalarizeBitCast(BitCastInst *BC, PacketMask PM) {
     Value *SrcAsInt = Src;
     Type *DstEleTy = VecDstTy->getElementType();
     Type *DstEleAsIntTy = DstEleTy;
-    unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();
-    unsigned LaneBits = DstEleTy->getPrimitiveSizeInBits();
+    const unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();
+    const unsigned LaneBits = DstEleTy->getPrimitiveSizeInBits();
     if (!SrcTy->isIntegerTy()) {
       SrcAsIntTy = SrcTy->getIntNTy(BC->getContext(), SrcBits);
       SrcAsInt = B.CreateBitCast(SrcAsInt, SrcAsIntTy);
@@ -1120,7 +1120,7 @@ SimdPacket *Scalarizer::scalarizeICmp(ICmpInst *ICmp, PacketMask PM) {
   Value *LHS = ICmp->getOperand(0);
   auto *VecDataTy = dyn_cast<FixedVectorType>(ICmp->getType());
   VECZ_FAIL_IF(!VecDataTy);
-  unsigned SimdWidth = VecDataTy->getNumElements();
+  const unsigned SimdWidth = VecDataTy->getNumElements();
   SimdPacket *LHSPacket = scalarize(LHS, PM);
   VECZ_FAIL_IF(!LHSPacket);
   Value *RHS = ICmp->getOperand(1);
@@ -1143,7 +1143,7 @@ SimdPacket *Scalarizer::scalarizeFCmp(FCmpInst *FCmp, PacketMask PM) {
   Value *LHS = FCmp->getOperand(0);
   auto *VecDataTy = dyn_cast<FixedVectorType>(FCmp->getType());
   VECZ_FAIL_IF(!VecDataTy);
-  unsigned SimdWidth = VecDataTy->getNumElements();
+  const unsigned SimdWidth = VecDataTy->getNumElements();
   SimdPacket *LHSPacket = scalarize(LHS, PM);
   VECZ_FAIL_IF(!LHSPacket);
   Value *RHS = FCmp->getOperand(1);
@@ -1172,7 +1172,7 @@ SimdPacket *Scalarizer::scalarizeSelect(SelectInst *Select, PacketMask PM) {
   Value *TrueVal = Select->getTrueValue();
   auto *VecDataTy = dyn_cast<FixedVectorType>(Select->getType());
   VECZ_FAIL_IF(!VecDataTy);
-  unsigned SimdWidth = VecDataTy->getNumElements();
+  const unsigned SimdWidth = VecDataTy->getNumElements();
   SimdPacket *TruePacket = scalarize(TrueVal, PM);
   VECZ_FAIL_IF(!TruePacket);
   Value *FalseVal = Select->getFalseValue();
@@ -1197,7 +1197,7 @@ SimdPacket *Scalarizer::scalarizeMaskedMemOp(CallInst *CI, PacketMask PM,
   VECZ_STAT_FAIL_IF(!Callee, VeczScalarizeFailCall);
   auto *VecDataTy = getVectorType(CI);
   VECZ_FAIL_IF(!VecDataTy);
-  unsigned SimdWidth = VecDataTy->getNumElements();
+  const unsigned SimdWidth = VecDataTy->getNumElements();
   assert((MaskedOp.isLoad() || MaskedOp.isStore()) &&
          "Masked op is not a store or load!");
 
@@ -1256,7 +1256,7 @@ SimdPacket *Scalarizer::scalarizeMaskedMemOp(CallInst *CI, PacketMask PM,
     PtrPacket.set(i, ScalarPtr);
   }
 
-  unsigned Alignment = MaskedOp.getAlignment();
+  const unsigned Alignment = MaskedOp.getAlignment();
   unsigned EleAlign = ScalarEleTy->getPrimitiveSizeInBits() / 8;
   if (Alignment < EleAlign) {
     EleAlign = Alignment;
@@ -1290,7 +1290,7 @@ SimdPacket *Scalarizer::scalarizeCall(CallInst *CI, PacketMask PM) {
   VECZ_STAT_FAIL_IF(!Callee, VeczScalarizeFailCall);
   auto *VecDataTy = getVectorType(CI);
   VECZ_FAIL_IF(!VecDataTy);
-  unsigned SimdWidth = VecDataTy->getNumElements();
+  const unsigned SimdWidth = VecDataTy->getNumElements();
 
   if (auto MaskedOp = MemOp::get(CI, MemOpAccessKind::Masked)) {
     if (MaskedOp->isMaskedMemOp()) {
@@ -1316,7 +1316,7 @@ SimdPacket *Scalarizer::scalarizeCall(CallInst *CI, PacketMask PM) {
   IRBuilder<> B(CI);
   const auto Props = Builtin.properties;
   // Ignore the mask if present
-  unsigned NumArgs = VectorCallMask ? CI->arg_size() - 1 : CI->arg_size();
+  const unsigned NumArgs = VectorCallMask ? CI->arg_size() - 1 : CI->arg_size();
   SmallVector<SimdPacket *, 4> OpPackets(NumArgs);
   SmallVector<Value *, 4> OpScalars(NumArgs);
   for (unsigned i = 0; i < NumArgs; i++) {
@@ -1408,8 +1408,8 @@ SimdPacket *Scalarizer::scalarizeShuffleVector(ShuffleVectorInst *Shuffle,
   assert(RHS && "Could not get operand 1");
   auto *LHSVecTy = dyn_cast<FixedVectorType>(LHS->getType());
   VECZ_FAIL_IF(!LHSVecTy);
-  unsigned SrcWidth = LHSVecTy->getNumElements();
-  unsigned DstWidth = VecTy->getNumElements();
+  const unsigned SrcWidth = LHSVecTy->getNumElements();
+  const unsigned DstWidth = VecTy->getNumElements();
 
   // Determine which lanes we need from both vector operands.
   PacketMask LHSMask;
@@ -1517,7 +1517,7 @@ SimdPacket *Scalarizer::scalarizeInsertElement(InsertElementInst *Insert,
 SimdPacket *Scalarizer::scalarizeGEP(GetElementPtrInst *GEP, PacketMask PM) {
   auto *const vecDataTy = dyn_cast<FixedVectorType>(GEP->getType());
   VECZ_FAIL_IF(!vecDataTy);
-  unsigned simdWidth = vecDataTy->getNumElements();
+  const unsigned simdWidth = vecDataTy->getNumElements();
 
   Value *const ptr = GEP->getPointerOperand();
   SimdPacket *ptrPacket = nullptr;
@@ -1604,7 +1604,7 @@ SimdPacket *Scalarizer::scalarizePHI(PHINode *Phi, PacketMask PM) {
   }
 
   // Assign the scalarized incoming values to the scalarized Phi nodes
-  for (unsigned lane : ActiveLanes) {
+  for (const unsigned lane : ActiveLanes) {
     VECZ_ERROR_IF(!PM.isEnabled(lane), "Active lane should be enabled.");
     PHINode *SPhi = cast<PHINode>(P->at(lane));
     for (unsigned i = 0; i < NumIncoming; ++i) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/ternary_transform_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/ternary_transform_pass.cpp
index 8f5a73abece1d..69214a20da3a0 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/ternary_transform_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/ternary_transform_pass.cpp
@@ -145,7 +145,7 @@ void Transform(SelectInst *Select, VectorizationContext &Ctx) {
     Value *GepFalse = B.CreateGEP(GEP->getSourceElementType(), False, Indices);
     auto MaskedOp = MemOp::get(Memop);
     assert(MaskedOp);
-    MemOpDesc Mem = MaskedOp->getDesc();
+    const MemOpDesc Mem = MaskedOp->getDesc();
 
     // We should have filtered out all vector memory operations earlier.
     assert(!Mem.getDataType()->isVectorTy());
@@ -177,7 +177,7 @@ void Transform(SelectInst *Select, VectorizationContext &Ctx) {
     if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(U)) {
       ToDelete.push_back(GEP);
 
-      SmallVector<Value *, 2> Indices(GEP->idx_begin(), GEP->idx_end());
+      const SmallVector<Value *, 2> Indices(GEP->idx_begin(), GEP->idx_end());
 
       for (User *G : GEP->users()) {
         if (LoadInst *Load = dyn_cast<LoadInst>(G)) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/uniform_reassociation_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/uniform_reassociation_pass.cpp
index e08c7fc0981f0..d4fe6be17dc18 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/uniform_reassociation_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/uniform_reassociation_pass.cpp
@@ -342,7 +342,7 @@ bool Reassociator::run(llvm::Function &F, llvm::FunctionAnalysisManager &AM) {
 PreservedAnalyses UniformReassociationPass::run(Function &F,
                                                 FunctionAnalysisManager &AM) {
   Reassociator reassociator;
-  bool changed = reassociator.run(F, AM);
+  const bool changed = reassociator.run(F, AM);
   (void)changed;
 
   PreservedAnalyses PA;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
index 49934f83c7dc5..c6dfc00e904e4 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
@@ -100,8 +100,8 @@ Value *TargetInfo::createLoad(IRBuilder<> &B, Type *Ty, Value *Ptr,
         VECZ_FAIL();
       }
       auto *Mask = createAllTrueMask(B, multi_llvm::getVectorElementCount(Ty));
-      SmallVector<llvm::Value *, 2> Args = {VecPtr, Mask, EVL};
-      SmallVector<llvm::Type *, 2> Tys = {Ty, VecPtr->getType()};
+      const SmallVector<llvm::Value *, 2> Args = {VecPtr, Mask, EVL};
+      const SmallVector<llvm::Type *, 2> Tys = {Ty, VecPtr->getType()};
       return B.CreateIntrinsic(llvm::Intrinsic::vp_load, Tys, Args);
     }
     return B.CreateAlignedLoad(Ty, VecPtr, MaybeAlign(Alignment));
@@ -120,7 +120,7 @@ Value *TargetInfo::createLoad(IRBuilder<> &B, Type *Ty, Value *Ptr,
                          "Could not create a scalable-vector interleaved load");
     VECZ_FAIL();
   }
-  unsigned SimdWidth = Elts.getFixedValue();
+  const unsigned SimdWidth = Elts.getFixedValue();
   // Load individual values.
   SmallVector<Value *, 8> Values;
   Value *Index = B.getInt64(0);
@@ -169,8 +169,9 @@ Value *TargetInfo::createStore(IRBuilder<> &B, Value *Data, Value *Ptr,
       }
       auto *Mask =
           createAllTrueMask(B, multi_llvm::getVectorElementCount(VecTy));
-      SmallVector<llvm::Value *, 3> Args = {Data, VecPtr, Mask, EVL};
-      SmallVector<llvm::Type *, 2> Tys = {Data->getType(), VecPtr->getType()};
+      const SmallVector<llvm::Value *, 3> Args = {Data, VecPtr, Mask, EVL};
+      const SmallVector<llvm::Type *, 2> Tys = {Data->getType(),
+                                                VecPtr->getType()};
       return B.CreateIntrinsic(llvm::Intrinsic::vp_store, Tys, Args);
     }
     return B.CreateAlignedStore(Data, VecPtr, MaybeAlign(Alignment));
@@ -190,7 +191,7 @@ Value *TargetInfo::createStore(IRBuilder<> &B, Value *Data, Value *Ptr,
         "Could not create a scalable-vector interleaved store");
     VECZ_FAIL();
   }
-  unsigned SimdWidth = Elts.getFixedValue();
+  const unsigned SimdWidth = Elts.getFixedValue();
   // Extract values from the vector.
   SmallVector<Value *, 8> Values;
   for (unsigned i = 0; i < SimdWidth; i++) {
@@ -234,8 +235,8 @@ Value *TargetInfo::createMaskedLoad(IRBuilder<> &B, Type *Ty, Value *Ptr,
     const Function *F = B.GetInsertBlock()->getParent();
     const auto Legality = isVPLoadLegal(F, Ty, Alignment);
     if (EVL && Legality.isVPLegal()) {
-      SmallVector<llvm::Value *, 2> Args = {Ptr, Mask, EVL};
-      SmallVector<llvm::Type *, 2> Tys = {Ty, PtrTy};
+      const SmallVector<llvm::Value *, 2> Args = {Ptr, Mask, EVL};
+      const SmallVector<llvm::Type *, 2> Tys = {Ty, PtrTy};
       return B.CreateIntrinsic(llvm::Intrinsic::vp_load, Tys, Args);
     } else if (Legality.isMaskLegal()) {
       Mask = applyEVLToMask(B, EVL, Mask);
@@ -341,8 +342,8 @@ Value *TargetInfo::createMaskedStore(IRBuilder<> &B, Value *Data, Value *Ptr,
     const Function *F = B.GetInsertBlock()->getParent();
     const auto Legality = isVPStoreLegal(F, DataTy, Alignment);
     if (EVL && Legality.isVPLegal()) {
-      SmallVector<llvm::Value *, 3> Args = {Data, Ptr, Mask, EVL};
-      SmallVector<llvm::Type *, 2> Tys = {Data->getType(), PtrTy};
+      const SmallVector<llvm::Value *, 3> Args = {Data, Ptr, Mask, EVL};
+      const SmallVector<llvm::Type *, 2> Tys = {Data->getType(), PtrTy};
       return B.CreateIntrinsic(llvm::Intrinsic::vp_store, Tys, Args);
     } else if (Legality.isMaskLegal()) {
       Mask = applyEVLToMask(B, EVL, Mask);
@@ -500,8 +501,8 @@ Value *TargetInfo::createMaskedGatherLoad(IRBuilder<> &B, Type *Ty, Value *Ptr,
   if (Ty->isVectorTy()) {
     const auto Legality = isVPGatherLegal(F, Ty, Alignment);
     if (EVL && Legality.isVPLegal()) {
-      SmallVector<llvm::Value *, 2> Args = {Ptr, Mask, EVL};
-      SmallVector<llvm::Type *, 2> Tys = {Ty, VecPtrTy};
+      const SmallVector<llvm::Value *, 2> Args = {Ptr, Mask, EVL};
+      const SmallVector<llvm::Type *, 2> Tys = {Ty, VecPtrTy};
       return B.CreateIntrinsic(llvm::Intrinsic::vp_gather, Tys, Args);
     } else if (Legality.isMaskLegal()) {
       Function *MaskedGather = Intrinsic::getDeclaration(
@@ -529,7 +530,7 @@ Value *TargetInfo::createMaskedGatherLoad(IRBuilder<> &B, Type *Ty, Value *Ptr,
 
   VECZ_FAIL_IF(EVL);
   auto VecWidth = multi_llvm::getVectorElementCount(Ty);
-  unsigned Width = VecWidth.getFixedValue();
+  const unsigned Width = VecWidth.getFixedValue();
 
   // Fallback scalar function generator
   // Create all the required blocks.
@@ -598,8 +599,8 @@ Value *TargetInfo::createMaskedScatterStore(IRBuilder<> &B, Value *Data,
     VECZ_FAIL_IF(!VecPtrTy);
     const auto Legality = isVPScatterLegal(F, DataTy, Alignment);
     if (EVL && Legality.isVPLegal()) {
-      SmallVector<llvm::Value *, 3> Args = {Data, Ptr, Mask, EVL};
-      SmallVector<llvm::Type *, 2> Tys = {Data->getType(), VecPtrTy};
+      const SmallVector<llvm::Value *, 3> Args = {Data, Ptr, Mask, EVL};
+      const SmallVector<llvm::Type *, 2> Tys = {Data->getType(), VecPtrTy};
       return B.CreateIntrinsic(llvm::Intrinsic::vp_scatter, Tys, Args);
     } else if (Legality.isMaskLegal()) {
       Function *MaskedScatter = Intrinsic::getDeclaration(
@@ -627,7 +628,7 @@ Value *TargetInfo::createMaskedScatterStore(IRBuilder<> &B, Value *Data,
 
   VECZ_FAIL_IF(EVL);
   auto VecWidth = multi_llvm::getVectorElementCount(DataTy);
-  unsigned Width = VecWidth.getFixedValue();
+  const unsigned Width = VecWidth.getFixedValue();
 
   // Fallback scalar function generator
   // Create all the required blocks.
@@ -1087,12 +1088,12 @@ bool TargetInfo::optimizeInterleavedGroup(IRBuilder<> &B,
   }
 
   auto VecWidth = multi_llvm::getVectorElementCount(VecTy);
-  unsigned SimdWidth = VecWidth.getFixedValue();
+  const unsigned SimdWidth = VecWidth.getFixedValue();
 
   Type *EleTy = VecTy->getElementType();
-  unsigned Align = EleTy->getScalarSizeInBits() / 8;
+  const unsigned Align = EleTy->getScalarSizeInBits() / 8;
 
-  bool HasMask =
+  const bool HasMask =
       (Kind == eMaskedInterleavedLoad) || (Kind == eMaskedInterleavedStore);
   SmallVector<Value *, 4> Vectors;
   SmallVector<Value *, 4> VecMasks(Masks.begin(), Masks.end());
@@ -1105,7 +1106,7 @@ bool TargetInfo::optimizeInterleavedGroup(IRBuilder<> &B,
     for (unsigned i = 0; i < Group.size(); i++) {
       Value *AddressN = Address;
       if (i > 0) {
-        unsigned Offset = i * SimdWidth;
+        const unsigned Offset = i * SimdWidth;
         AddressN = B.CreateGEP(EleTy, Address, B.getInt32(Offset));
       }
       Value *Load = nullptr;
@@ -1141,7 +1142,7 @@ bool TargetInfo::optimizeInterleavedGroup(IRBuilder<> &B,
       Value *Vector = Vectors[i];
       Value *AddressN = Address;
       if (i > 0) {
-        unsigned Offset = i * SimdWidth;
+        const unsigned Offset = i * SimdWidth;
         AddressN = B.CreateGEP(EleTy, Address, B.getInt32(Offset));
       }
       Value *Store = nullptr;
@@ -1276,11 +1277,11 @@ bool TargetInfo::interleaveVectors(IRBuilder<> &B,
 unsigned TargetInfo::estimateSimdWidth(const TargetTransformInfo &TTI,
                                        const ArrayRef<const Value *> vals,
                                        unsigned width) const {
-  unsigned MaxVecRegBitWidth =
+  const unsigned MaxVecRegBitWidth =
       TTI.getRegisterBitWidth(llvm::TargetTransformInfo::RGK_FixedWidthVector)
           .getFixedValue();
 
-  unsigned NumVecRegs =
+  const unsigned NumVecRegs =
       TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true));
 
   unsigned VaryingUsage = 0;
@@ -1301,7 +1302,7 @@ unsigned TargetInfo::estimateSimdWidth(const TargetTransformInfo &TTI,
 
 unsigned TargetInfo::getVectorWidthForType(const llvm::TargetTransformInfo &TTI,
                                            const llvm::Type &Ty) const {
-  unsigned MaxVecRegBitWidth =
+  const unsigned MaxVecRegBitWidth =
       TTI.getRegisterBitWidth(llvm::TargetTransformInfo::RGK_FixedWidthVector)
           .getFixedValue();
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_arm.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_arm.cpp
index 912aabeb060f6..e6fa868d072f4 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_arm.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_arm.cpp
@@ -144,13 +144,13 @@ bool TargetInfoArm::canOptimizeInterleavedGroupImpl(const Instruction &val,
     return false;
   }
 
-  unsigned VecBits = VecTy->getPrimitiveSizeInBits();
+  const unsigned VecBits = VecTy->getPrimitiveSizeInBits();
   if ((VecBits != 128) && (VecBits != 64)) {
     return false;
   }
 
   // NEON interleave instructions only allow 8, 16, and 32 bit elements
-  unsigned ElementSize = VecTy->getScalarSizeInBits();
+  const unsigned ElementSize = VecTy->getScalarSizeInBits();
   if ((ElementSize != 32) && (ElementSize != 16) && (ElementSize != 8)) {
     return false;
   }
@@ -163,7 +163,7 @@ bool TargetInfoArm::optimizeInterleavedGroup(IRBuilder<> &B,
                                              ArrayRef<Value *> group,
                                              ArrayRef<Value *>, Value *address,
                                              int stride) const {
-  bool HasMask =
+  const bool HasMask =
       (kind == eMaskedInterleavedLoad) || (kind == eMaskedInterleavedStore);
   // canOptimizeInterleavedGroup() should have returned false in this case.
   // ARM does not have masked vector load or store instructions.
@@ -213,7 +213,7 @@ bool TargetInfoArm::optimizeInterleavedGroup(IRBuilder<> &B,
   }
 
   Type *EleTy = VecTy->getElementType();
-  unsigned Alignment = (EleTy->getPrimitiveSizeInBits() / 8);
+  const unsigned Alignment = (EleTy->getPrimitiveSizeInBits() / 8);
 
   // Declare the intrinsic if needed.
   SmallVector<Type *, 2> Tys;
@@ -244,7 +244,7 @@ bool TargetInfoArm::optimizeInterleavedGroup(IRBuilder<> &B,
   if (kind == eInterleavedLoad) {
     for (unsigned i = 0; i < Calls.size(); i++) {
       CallInst *Op = Calls[i];
-      ArrayRef<unsigned> Indices(&i, 1);
+      const ArrayRef<unsigned> Indices(&i, 1);
       Value *Extract = B.CreateExtractValue(CI, Indices);
       Op->replaceAllUsesWith(Extract);
     }
@@ -312,13 +312,13 @@ bool TargetInfoAArch64::canOptimizeInterleavedGroupImpl(
     return false;
   }
 
-  unsigned VecBits = VecTy->getPrimitiveSizeInBits();
+  const unsigned VecBits = VecTy->getPrimitiveSizeInBits();
   if ((VecBits != 128) && (VecBits != 64)) {
     return false;
   }
 
   // NEON interleave instructions only allow 8, 16, and 32 bit elements
-  unsigned ElementSize = VecTy->getScalarSizeInBits();
+  const unsigned ElementSize = VecTy->getScalarSizeInBits();
   if ((ElementSize != 32) && (ElementSize != 16) && (ElementSize != 8)) {
     return false;
   }
@@ -329,7 +329,7 @@ bool TargetInfoAArch64::canOptimizeInterleavedGroupImpl(
 bool TargetInfoAArch64::optimizeInterleavedGroup(
     IRBuilder<> &B, InterleavedOperation kind, ArrayRef<Value *> group,
     ArrayRef<Value *>, Value *address, int stride) const {
-  bool HasMask =
+  const bool HasMask =
       (kind == eMaskedInterleavedLoad) || (kind == eMaskedInterleavedStore);
   // canOptimizeInterleavedGroup() should have returned false in this case.
   // AArch64 does not have masked vector load or store instructions.
@@ -398,7 +398,7 @@ bool TargetInfoAArch64::optimizeInterleavedGroup(
   if (kind == eInterleavedLoad) {
     for (unsigned i = 0; i < Calls.size(); i++) {
       CallInst *Op = Calls[i];
-      ArrayRef<unsigned> Indices(&i, 1);
+      const ArrayRef<unsigned> Indices(&i, 1);
       Value *Extract = B.CreateExtractValue(CI, Indices);
       Op->replaceAllUsesWith(Extract);
     }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp
index 79690f5f8ab6e..0ebb2edc816db 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp
@@ -223,7 +223,7 @@ static unsigned getRISCVBits(const TargetMachine *TM) {
 llvm::Value *getIntrinsicVL(llvm::IRBuilderBase &B, llvm::Value *VL,
                             llvm::Type *wideTy, llvm::TargetMachine *TM,
                             const Twine &N = "xlen") {
-  unsigned XLenTyWidth = getRISCVBits(TM);
+  const unsigned XLenTyWidth = getRISCVBits(TM);
   Type *XLen = B.getIntNTy(XLenTyWidth);
 
   if (VL) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_choices.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_choices.cpp
index 1c48d0d0ddfd2..1703ff0f490d9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_choices.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_choices.cpp
@@ -79,7 +79,7 @@ bool VectorizationChoices::parseChoicesString(StringRef Str) {
   // so we will use it here.
   compiler::utils::Lexer L(Str);
   // We support multiple separators in case of platform-dependent issues
-  StringRef Separators = ":;,";
+  const StringRef Separators = ":;,";
   // All the parsed choices will be stored in a set and will only be
   // enabled/disabled after the parsing has been completed successfully.
   SmallVector<ChoiceValuePair, 4> ParsedChoices;
@@ -96,11 +96,11 @@ bool VectorizationChoices::parseChoicesString(StringRef Str) {
       break;
     }
     // Consume the optional "no" prefix, which disables the given prefix
-    bool disable = L.Consume("no");
+    const bool disable = L.Consume("no");
     // Consume the Choice name
     if (L.ConsumeAlphanumeric(ParsedChoice)) {
       // Convert the string to a Choice value
-      Choice C = fromString(ParsedChoice);
+      const Choice C = fromString(ParsedChoice);
       if (C == eInvalid) {
         printChoicesParseError(Str, L.CurrentPos() - ParsedChoice.size(),
                                "Invalid Choice \"" + ParsedChoice + "\"");
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
index 9264c18f0cedb..90034c99cc6f8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
@@ -153,7 +153,7 @@ VectorizationResult &VectorizationContext::getOrCreateBuiltin(
   // Gather information about the function's arguments.
   const auto Props = Builtin.properties;
   unsigned i = 0;
-  for (Argument &Arg : F.args()) {
+  for (const Argument &Arg : F.args()) {
     Type *pointerRetPointeeTy = nullptr;
     VectorizationResult::Arg::Kind kind = VectorizationResult::Arg::SCALAR;
 
@@ -231,7 +231,7 @@ Function *VectorizationContext::getOrCreateMaskedFunction(CallInst *CI) {
   // function can become a bit too complex, among other things because name
   // mangling with arbitrary types can become a bit complex. printf is the only
   // vararg OpenCL builtin, so only user functions are affected by this.
-  bool isVarArg = F->isVarArg();
+  const bool isVarArg = F->isVarArg();
   VECZ_FAIL_IF(isVarArg && F->getName() != "printf");
   // Copy the argument types. This is done from the CallInst instead of the
   // called Function because the called Function might be a VarArg function, in
@@ -266,7 +266,7 @@ Function *VectorizationContext::getOrCreateMaskedFunction(CallInst *CI) {
   argTys.push_back(Type::getInt1Ty(ctx));
   // Generate the function name
   compiler::utils::NameMangler mangler(&ctx);
-  SmallVector<compiler::utils::TypeQualifiers, 8> quals(
+  const SmallVector<compiler::utils::TypeQualifiers, 8> quals(
       argTys.size(), compiler::utils::TypeQualifiers());
   std::string newFName;
   raw_string_ostream O(newFName);
@@ -312,7 +312,7 @@ Function *VectorizationContext::getOrCreateMaskedFunction(CallInst *CI) {
   CIArgs.pop_back();
 
   FunctionType *FTy = CI->getFunctionType();
-  AttributeList callAttrs = CI->getAttributes();
+  const AttributeList callAttrs = CI->getAttributes();
   SmallVector<std::pair<Value *, BasicBlock *>, 4> PhiOperands;
   if (hasImmArg) {
     Value *immArg = newFunction->getArg(firstImmArg);
@@ -393,7 +393,7 @@ VectorizationContext::isMaskedAtomicFunction(const Function &F) const {
   if (!FnName.consume_front("masked_")) {
     return std::nullopt;
   }
-  bool IsCmpXchg = FnName.consume_front("cmpxchg_");
+  const bool IsCmpXchg = FnName.consume_front("cmpxchg_");
   if (!IsCmpXchg && !FnName.consume_front("atomicrmw_")) {
     return std::nullopt;
   }
@@ -663,8 +663,8 @@ std::optional<std::tuple<bool, RecurKind, bool>> isSubgroupScan(
   if (!L.Consume("sub_group_scan_")) {
     return std::nullopt;
   }
-  bool isInt = ty->isIntOrIntVectorTy();
-  bool isInclusive = L.Consume("inclusive_");
+  const bool isInt = ty->isIntOrIntVectorTy();
+  const bool isInclusive = L.Consume("inclusive_");
   if (isInclusive || L.Consume("exclusive_")) {
     StringRef OpKind;
     if (L.ConsumeAlpha(OpKind)) {
@@ -699,7 +699,7 @@ std::optional<std::tuple<bool, RecurKind, bool>> isSubgroupScan(
       } else {
         return std::nullopt;
       }
-      bool isVP = L.Consume("_vp");
+      const bool isVP = L.Consume("_vp");
       return std::make_tuple(isInclusive, opKind, isVP);
     }
   }
@@ -739,9 +739,9 @@ bool VectorizationContext::defineInternalBuiltin(Function *F) {
 
   // Handle subgroup scan operations.
   if (auto scanInfo = isSubgroupScan(F->getName(), F->getReturnType())) {
-    bool isInclusive = std::get<0>(*scanInfo);
-    RecurKind opKind = std::get<1>(*scanInfo);
-    bool isVP = std::get<2>(*scanInfo);
+    const bool isInclusive = std::get<0>(*scanInfo);
+    const RecurKind opKind = std::get<1>(*scanInfo);
+    const bool isVP = std::get<2>(*scanInfo);
     return emitSubgroupScanBody(*F, isInclusive, opKind, isVP);
   }
 
@@ -904,7 +904,7 @@ bool VectorizationContext::emitSubgroupScanBody(Function &F, bool IsInclusive,
 
   Type *const VecTy = F.getReturnType();
   Type *const EltTy = multi_llvm::getVectorElementType(VecTy);
-  ElementCount EC = multi_llvm::getVectorElementCount(VecTy);
+  const ElementCount EC = multi_llvm::getVectorElementCount(VecTy);
 
   Function::arg_iterator Arg = F.arg_begin();
 
@@ -1069,7 +1069,7 @@ bool VectorizationContext::emitSubgroupScanBody(Function &F, bool IsInclusive,
 bool VectorizationContext::emitMaskedAtomicBody(
     Function &F, const VectorizationContext::MaskedAtomic &MA) const {
   LLVMContext &Ctx = F.getContext();
-  bool IsCmpXchg = MA.isCmpXchg();
+  const bool IsCmpXchg = MA.isCmpXchg();
 
   auto *const EntryBB = BasicBlock::Create(Ctx, "entry", &F);
 
@@ -1290,7 +1290,7 @@ PreservedAnalyses DefineInternalBuiltinsPass::run(Module &M,
       continue;
     }
     llvm::SmallPtrSet<VectorizationUnit *, 1> UserVUs;
-    for (Use &U : F.uses()) {
+    for (const Use &U : F.uses()) {
       if (CallInst *CI = dyn_cast<CallInst>(U.getUser())) {
         auto R = FAM.getResult<VectorizationUnitAnalysis>(*CI->getFunction());
         if (R.hasResult()) {
@@ -1308,7 +1308,7 @@ PreservedAnalyses DefineInternalBuiltinsPass::run(Module &M,
     }
 
     VectorizationContext &Ctx = (*UserVUs.begin())->context();
-    bool DefinedBuiltin = Ctx.defineInternalBuiltin(&F);
+    const bool DefinedBuiltin = Ctx.defineInternalBuiltin(&F);
     if (!DefinedBuiltin) {
       // If we've failed to define this builtin, ensure we clean up the
       // half-complete body. We can't simply delete it because it will have
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_helpers.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_helpers.cpp
index 77c0264ef87ff..7308a828fcc29 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_helpers.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_helpers.cpp
@@ -40,14 +40,14 @@ namespace {
 Function *declareFunction(const VectorizationUnit &VU) {
   Module &Module = VU.context().module();
   const Function *const ScalarFn = VU.scalarFunction();
-  ElementCount SimdWidth = VU.width();
+  const ElementCount SimdWidth = VU.width();
 
   // For kernels, the vectorized function type is is the same as the original
   // scalar function type, since function arguments are uniform. We no longer
   // use Vectorization Units for builtins.
   FunctionType *VectorizedFnType = VU.scalarFunction()->getFunctionType();
   VECZ_FAIL_IF(!VectorizedFnType);
-  std::string VectorizedName =
+  const std::string VectorizedName =
       getVectorizedFunctionName(ScalarFn->getName(), SimdWidth, VU.choices());
   Module.getOrInsertFunction(VectorizedName, VectorizedFnType);
   auto *const VectorizedFn = Module.getFunction(VectorizedName);
@@ -68,7 +68,7 @@ Function *declareFunction(const VectorizationUnit &VU) {
 /// the vectorized kernel.
 void cloneOpenCLNamedMetadataHelper(const VectorizationUnit &VU,
                                     const std::string &NodeName) {
-  Module &M = VU.context().module();
+  const Module &M = VU.context().module();
 
   // Try to get the OpenCL metadata
   NamedMDNode *KernelsMD = M.getNamedMetadata(NodeName);
@@ -151,8 +151,8 @@ namespace vecz {
 std::string getVectorizedFunctionName(StringRef ScalarName, ElementCount VF,
                                       VectorizationChoices Choices,
                                       bool IsBuiltin) {
-  Twine Prefix = Twine(VF.isScalable() ? "nxv" : "v");
-  Twine IsVP = Twine(Choices.vectorPredication() ? "_vp_" : "_");
+  const Twine Prefix = Twine(VF.isScalable() ? "nxv" : "v");
+  const Twine IsVP = Twine(Choices.vectorPredication() ? "_vp_" : "_");
   return ((IsBuiltin ? VectorizationContext::InternalBuiltinPrefix
                      : Twine("__vecz_")) +
           Prefix + Twine(VF.getKnownMinValue()) + IsVP + ScalarName)
@@ -222,7 +222,7 @@ Function *cloneFunctionToVector(const VectorizationUnit &VU) {
     LLVMContext &Ctx = VectorizedFn->getContext();
     AttributeList PAL = VectorizedFn->getAttributes();
     bool RemovedAttribute = false;
-    for (Attribute::AttrKind Kind : {Attribute::ZExt, Attribute::SExt}) {
+    for (const Attribute::AttrKind Kind : {Attribute::ZExt, Attribute::SExt}) {
       if (PAL.hasRetAttr(Kind)) {
         PAL = PAL.removeRetAttribute(Ctx, Kind);
         RemovedAttribute = true;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_heuristics.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_heuristics.cpp
index 7de2788767b1b..beed2b6f3f38e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_heuristics.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_heuristics.cpp
@@ -208,7 +208,7 @@ const Value *Heuristics::shouldVectorizeVisitCmpOperand(
 
   if (const CallInst *CI = dyn_cast<const CallInst>(Val)) {
     // We only care if the CallInst does involve a call to a work-item builtin.
-    compiler::utils::BuiltinInfo &BI = Ctx.builtins();
+    const compiler::utils::BuiltinInfo &BI = Ctx.builtins();
     const auto Uniformity = BI.analyzeBuiltinCall(*CI, SimdDimIdx).uniformity;
     if (Uniformity == compiler::utils::eBuiltinUniformityInstanceID ||
         Uniformity == compiler::utils::eBuiltinUniformityMaybeInstanceID) {
@@ -234,7 +234,7 @@ Heuristics::BrClauseKind Heuristics::shouldVectorizeVisitCmp(
   const Value *RHS =
       shouldVectorizeVisitCmpOperand(Cmp->getOperand(1), Cmp, Cache);
 
-  CmpInst::Predicate pred = Cmp->getPredicate();
+  const CmpInst::Predicate pred = Cmp->getPredicate();
 
   BrClauseKind vectorize = BrClauseKind::None;
   // The CmpInst may involve two CallInst, or it may involve only one but
@@ -243,7 +243,7 @@ Heuristics::BrClauseKind Heuristics::shouldVectorizeVisitCmp(
     vectorize = shouldVectorizeVisitCmpOperands(RHS, pred);
   }
   if (llvm::isa_and_nonnull<const CallInst>(RHS)) {
-    BrClauseKind RHSStatus = shouldVectorizeVisitCmpOperands(LHS, pred);
+    const BrClauseKind RHSStatus = shouldVectorizeVisitCmpOperands(LHS, pred);
     // This should never happen but in case it does, we want to "void" the
     // result and vectorize!
     if (vectorize != BrClauseKind::None && vectorize != RHSStatus) {
@@ -293,7 +293,7 @@ bool Heuristics::shouldVectorize() {
       if (isa<StoreInst>(&I) || isa<LoadInst>(&I)) {
         weight++;
       } else if (CallInst *CI = dyn_cast<CallInst>(&I)) {
-        compiler::utils::BuiltinInfo &BI = Ctx.builtins();
+        const compiler::utils::BuiltinInfo &BI = Ctx.builtins();
         if (Function *Callee = CI->getCalledFunction()) {
           auto const builtin = BI.analyzeBuiltin(*Callee);
           if (!(builtin.properties &
@@ -333,7 +333,7 @@ bool Heuristics::shouldVectorize() {
   Instruction *TI = BB.getTerminator();
   if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
     if (BI->isConditional()) {
-      BrClauseKind clause = shouldVectorizeVisitBr(BI->getCondition());
+      const BrClauseKind clause = shouldVectorizeVisitBr(BI->getCondition());
       unsigned succWeight = 0;
       if (clause != BrClauseKind::None) {
         BasicBlock *start = nullptr;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_unit.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_unit.cpp
index 98403fd40d7d8..1a25d7deaf658 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_unit.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_unit.cpp
@@ -93,7 +93,7 @@ void VectorizationUnit::setWidth(ElementCount NewWidth) {
   SimdWidth = NewWidth;
 
   // Determine the vectorized function's name and try to look it up.
-  std::string VectorizedName =
+  const std::string VectorizedName =
       getVectorizedFunctionName(ScalarFn->getName(), SimdWidth, Choices);
   if (VectorizedFn) {
     VectorizedFn->setName(VectorizedName);
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorizer.cpp
index 9a19584d1b069..8c90389ef4be6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorizer.cpp
@@ -233,7 +233,7 @@ void collectStatistics(VectorizationUnit &VU, Function *Scalar,
   // Normalized Scalar Insts = Simd Width * Scalar Insts
   // IK - Input Kernel
   // Scalar Insts = IK's Scalar Insts + IK's Vec Insts * IK's VecWidth
-  unsigned SimdWidth = VU.width().getFixedValue();
+  const unsigned SimdWidth = VU.width().getFixedValue();
   Ratio = (SimdWidth * (ScalarInstructions - ScalarVectorInsts +
                         ScalarVectorInsts * MaxScalarVectorWidth)) /
           VeczInstructions;
@@ -245,9 +245,9 @@ VectorizationUnit *vecz::createVectorizationUnit(VectorizationContext &Ctx,
                                                  const VeczPassOptions &Opts,
                                                  FunctionAnalysisManager &FAM,
                                                  bool Check) {
-  unsigned SimdDimIdx = Opts.vec_dim_idx;
-  unsigned LocalSize = Opts.local_size;
-  bool Auto = Opts.vecz_auto;
+  const unsigned SimdDimIdx = Opts.vec_dim_idx;
+  const unsigned LocalSize = Opts.local_size;
+  const bool Auto = Opts.vecz_auto;
   auto VF =
       ElementCount::get(Opts.factor.getKnownMin(), Opts.factor.isScalable());
 
@@ -310,7 +310,7 @@ VectorizationUnit *vecz::createVectorizationUnit(VectorizationContext &Ctx,
 void vecz::trackVeczSuccessFailure(VectorizationUnit &VU) {
   Function *Fn = VU.scalarFunction();
   Function *vectorizedFn = VU.vectorizedFunction();
-  bool failed = VU.failed();
+  const bool failed = VU.failed();
   VeczFail += failed;
   VeczSuccess += !failed;
   collectStatistics(VU, Fn, vectorizedFn);
@@ -348,8 +348,8 @@ bool vecz::createVectorizedFunctionMetadata(VectorizationUnit &vu) {
   auto finalVF = compiler::utils::VectorizationFactor(vf.getKnownMinValue(),
                                                       vf.isScalable());
 
-  compiler::utils::VectorizationInfo info{finalVF, dim,
-                                          vu.choices().vectorPredication()};
+  const compiler::utils::VectorizationInfo info{
+      finalVF, dim, vu.choices().vectorPredication()};
 
   if (vectorizedFn && vectorizedFn != fn) {  // success
     // Link the original function to the vectorized one.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp
index 4cce6e70d81d9..102f2d1f0ad20 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp
@@ -120,7 +120,7 @@ static llvm::cl::list<unsigned> SGSizes(
 static llvm::TargetMachine *initLLVMTarget(llvm::StringRef triple_string,
                                            llvm::StringRef cpu_model,
                                            llvm::StringRef target_features) {
-  llvm::Triple triple(triple_string);
+  const llvm::Triple triple(triple_string);
   llvm::InitializeAllTargets();
   llvm::InitializeAllTargetMCs();
   llvm::InitializeAllAsmPrinters();

From c39749dcd1bfb0549d49d428d8a4f6718363566a Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Mon, 22 Jan 2024 11:07:57 +0000
Subject: [PATCH 088/182] [NFC] Address clang-tidy-17 readability warnings.

* clang-tidy-17 warns when a variable is passed to a function that has a
  similarly named parameter, but the variable is actually passed as a
  different argument. Rename the variables to avoid the warning.
* clang-tidy-17 wants us to use buffer.data() to access the whole
  buffer, rather than &buffer[0].
* clang-tidy-17 warns us when an access specifier specifies the same
  access we would already get otherwise (whether implicitly or through a
  prior access specifier). Remove these.

Two instances of &buffer[0] are annotated with NOLINT because they are
only used to access a single element, not the whole buffer.
---
 .../analysis/vectorizable_function_analysis.h  |  1 -
 .../source/transform/printf_scalarizer.cpp     |  2 +-
 .../vecz/source/vectorization_context.cpp      | 18 +++++++++---------
 3 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorizable_function_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorizable_function_analysis.h
index 4309e05aa476f..61fcfe4ed66ec 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorizable_function_analysis.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorizable_function_analysis.h
@@ -44,7 +44,6 @@ class VectorizableFunctionAnalysis
     /// is the cause of the problem.
     const llvm::Value *failedAt = nullptr;
 
-   public:
     /// @brief Handle invalidation events from the new pass manager.
     ///
     /// @return false, as this analysis can never be invalidated.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/printf_scalarizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/printf_scalarizer.cpp
index fe18bef86e51a..224cd5c3718cc 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/printf_scalarizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/printf_scalarizer.cpp
@@ -85,7 +85,7 @@ bool IncrementPtr(const char **fmt) {
 GlobalVariable *GetNewFormatStringAsGlobalVar(
     Module &module, GlobalVariable *const string_value,
     const std::string &new_format_string) {
-  const ArrayRef<uint8_t> Elts((uint8_t *)(&new_format_string[0]),
+  const ArrayRef<uint8_t> Elts((uint8_t *)new_format_string.data(),
                                new_format_string.size());
   Constant *new_format_string_const =
       ConstantDataArray::get(module.getContext(), Elts);
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
index 90034c99cc6f8..933e6beb03d7f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
@@ -316,28 +316,28 @@ Function *VectorizationContext::getOrCreateMaskedFunction(CallInst *CI) {
   SmallVector<std::pair<Value *, BasicBlock *>, 4> PhiOperands;
   if (hasImmArg) {
     Value *immArg = newFunction->getArg(firstImmArg);
-    BasicBlock *immTrue =
+    BasicBlock *const immTrueBB =
         BasicBlock::Create(ctx, "active.imm.1", newFunction, mergeBlock);
     CIArgs[firstImmArg] = ConstantInt::getTrue(ctx);
     CallInst *c0 =
-        CallInst::Create(FTy, CI->getCalledOperand(), CIArgs, "", immTrue);
+        CallInst::Create(FTy, CI->getCalledOperand(), CIArgs, "", immTrueBB);
     c0->setCallingConv(cc);
     c0->setAttributes(callAttrs);
-    BranchInst::Create(mergeBlock, immTrue);
+    BranchInst::Create(mergeBlock, immTrueBB);
 
     CIArgs[firstImmArg] = ConstantInt::getFalse(ctx);
     // Now the false half
-    BasicBlock *immFalse =
+    BasicBlock *const immFalseBB =
         BasicBlock::Create(ctx, "active.imm.0", newFunction, mergeBlock);
 
     CallInst *c1 =
-        CallInst::Create(FTy, CI->getCalledOperand(), CIArgs, "", immFalse);
+        CallInst::Create(FTy, CI->getCalledOperand(), CIArgs, "", immFalseBB);
     c1->setCallingConv(cc);
     c1->setAttributes(callAttrs);
-    BranchInst::Create(mergeBlock, immFalse);
-    BranchInst::Create(immTrue, immFalse, immArg, activeBlock);
-    PhiOperands.push_back({c0, immTrue});
-    PhiOperands.push_back({c1, immFalse});
+    BranchInst::Create(mergeBlock, immFalseBB);
+    BranchInst::Create(immTrueBB, immFalseBB, immArg, activeBlock);
+    PhiOperands.push_back({c0, immTrueBB});
+    PhiOperands.push_back({c1, immFalseBB});
 
     // Now fix up the new function's signature. It can't be inheriting illegal
     // attributes; only intrinsics may have the `ImmArg` Attribute. The verifier

From 5b304da09e8c6e4589e52418ad7aa3dea48097c9 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Mon, 22 Jan 2024 12:30:25 +0000
Subject: [PATCH 089/182] [compiler] Improve analysis of 'true uniform' values

This commit extends the Uniform Value Analysis with a fourth kind of
uniformity: "true" uniformity. This represents a value that is uniform
on both active and inactive lanes. The old class of "uniform" has been
renamed to "active" uniformity to clarify this.

The analysis pass has been extended with a method to query whether a
value is truly uniform. It processes this recursively on demand and
caches the result. This is because the initial analysis run works from
varying "roots" and marks all dependent values as varying - uniform
values aren't handled at all. Rather than negatively affecting the
performance of all Uniform Value Analysis runs, this on-demand method
keeps costs the same except for users that need to query uniformity.

The query is still conservative, but less so. This can be seen in the
test changes, where some cases which were previously conservatively
handled as possibly varying/active uniform are now seen as truly
uniform.
---
 .../analysis/uniform_value_analysis.cpp       | 51 ++++++++++++++++++-
 .../include/analysis/uniform_value_analysis.h | 22 ++++++--
 .../control_flow_conversion_pass.cpp          | 18 ++-----
 .../vecz/test/lit/llvm/Boscc/boscc_merge3.ll  |  2 +-
 .../lit/llvm/Boscc/partial_linearization12.ll |  2 +-
 .../lit/llvm/Boscc/partial_linearization17.ll |  2 +-
 .../lit/llvm/Boscc/partial_linearization18.ll |  2 +-
 .../lit/llvm/Boscc/partial_linearization19.ll |  2 +-
 .../lit/llvm/Boscc/partial_linearization5.ll  |  8 +--
 .../lit/llvm/Boscc/partial_linearization6.ll  |  8 +--
 .../lit/llvm/Boscc/partial_linearization7.ll  |  2 +-
 .../test/lit/llvm/partial_linearization12.ll  |  2 +-
 .../test/lit/llvm/partial_linearization17.ll  |  2 +-
 .../test/lit/llvm/partial_linearization18.ll  |  2 +-
 .../test/lit/llvm/partial_linearization19.ll  |  2 +-
 .../test/lit/llvm/partial_linearization5.ll   |  8 +--
 .../test/lit/llvm/partial_linearization7.ll   |  2 +-
 17 files changed, 88 insertions(+), 49 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp
index 346ece7a4543b..ff64eccda0b97 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp
@@ -18,6 +18,9 @@
 
 #include <compiler/utils/builtin_info.h>
 #include <compiler/utils/mangling.h>
+#include <llvm/Analysis/ValueTracking.h>
+#include <llvm/IR/Constants.h>
+#include <llvm/IR/InstrTypes.h>
 #include <llvm/IR/Instructions.h>
 #include <llvm/IR/Module.h>
 #include <llvm/Support/Debug.h>
@@ -76,6 +79,38 @@ bool isDivergenceReduction(const Function &F) {
           L.Consume("divergence_"));
 }
 
+bool isTrueUniformInternal(const Value *V, unsigned Depth) {
+  if (!V) {
+    return false;
+  }
+
+  // Constants and Arguments that can't be undef/poison are truly uniform
+  if (isa<Constant>(V) || isa<Argument>(V)) {
+    return isGuaranteedNotToBePoison(V);
+  }
+
+  constexpr unsigned DepthLimit = 6;
+
+  if (Depth < DepthLimit) {
+    // For a specific subset of instructions, if all operands are truly
+    // uniform, then the instruction is too.
+    // FIXME: This is pessimistic. We could improve this by extending the list
+    // of instructions covered. We could also use flow-sensitive analysis in
+    // isGuaranteedNotToBePoison to enhance its capabilities.
+    if (const auto *I = dyn_cast<Instruction>(V)) {
+      if (isa<UnaryOperator>(I) || isa<BinaryOperator>(I) || isa<CastInst>(I) ||
+          isa<CmpInst>(I) || isa<SelectInst>(I) || isa<PHINode>(I)) {
+        return isGuaranteedNotToBePoison(I) &&
+               llvm::all_of(I->operands(), [Depth](Value *Op) {
+                 return isTrueUniformInternal(Op, Depth + 1);
+               });
+      }
+    }
+  }
+
+  return false;
+}
+
 }  // namespace
 
 UniformValueResult::UniformValueResult(Function &F, VectorizationUnit &vu)
@@ -102,7 +137,21 @@ bool UniformValueResult::isValueOrMaskVarying(const Value *V) const {
   if (found == varying.end()) {
     return false;
   }
-  return found->second != VaryingKind::eValueUniform;
+  return found->second != VaryingKind::eValueTrueUniform &&
+         found->second != VaryingKind::eValueActiveUniform;
+}
+
+bool UniformValueResult::isTrueUniform(const Value *V) {
+  auto found = varying.find(V);
+  if (found != varying.end()) {
+    return found->second == VaryingKind::eValueTrueUniform;
+  }
+  if (!isTrueUniformInternal(V, /*Depth=*/0)) {
+    return false;
+  }
+  // Cache this result to help speed up future queries
+  varying[V] = VaryingKind::eValueTrueUniform;
+  return true;
 }
 
 /// @brief Utility function to check whether an instruction is a call to a
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/uniform_value_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/uniform_value_analysis.h
index 2abd0d396d8d8..1e9071de6b137 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/uniform_value_analysis.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/uniform_value_analysis.h
@@ -41,8 +41,11 @@ class VectorizationUnit;
 /// @brief Holds the result of Uniform Value Analysis for a given function.
 struct UniformValueResult {
   enum class VaryingKind {
-    /// @brief The value is uniform.
-    eValueUniform,
+    /// @brief The value is truly uniform on all active and inactive lanes.
+    eValueTrueUniform,
+    /// @brief The value is uniform on active lanes. May be poison or undefined
+    /// on inactive lanes.
+    eValueActiveUniform,
     /// @brief The value is varying and lanes may see different values.
     eValueVarying,
     /// @brief The value is uniform, but its mask is not.
@@ -71,24 +74,33 @@ struct UniformValueResult {
   ///
   /// @param[in] V Value to analyze.
   ///
-  /// @brief true if the value needs to be packetized, false otherwise.
+  /// @return true if the value needs to be packetized, false otherwise.
   bool isVarying(const llvm::Value *V) const;
 
   /// @brief Determine whether the given value has a varying mask or not.
   ///
   /// @param[in] V Value to analyze.
   ///
-  /// @brief true if the value has a varying mask, false otherwise.
+  /// @return true if the value has a varying mask, false otherwise.
   bool isMaskVarying(const llvm::Value *V) const;
 
   /// @brief Determine whether the given value has a varying mask or not.
   ///
   /// @param[in] V Value to analyze.
   ///
-  /// @brief true if the value is varying or has a varying mask, false
+  /// @return true if the value is varying or has a varying mask, false
   /// otherwise.
   bool isValueOrMaskVarying(const llvm::Value *V) const;
 
+  /// @brief Determine (on demand) whether the given value is a true uniform
+  /// value.
+  ///
+  /// @param[in] V Value to analyze.
+  ///
+  /// @return true if the value is true uniform, false otherwise. Caches the
+  /// result for future queries.
+  bool isTrueUniform(const llvm::Value *V);
+
   /// @brief Remove the value from the analysis.
   ///
   /// @param[in] V Value to remove.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
index be0d30ed6f9ec..b8fe270bb0d0c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
@@ -405,25 +405,15 @@ static inline Error makeStringError(const Twine &message, Instruction &I) {
   return make_error<StringError>(helper_stream.str(), inconvertibleErrorCode());
 }
 
-// A conservative helper method to determine whether a branch condition
+// A helper method to determine whether a branch condition
 // (expected to be an i1 result of a comparison instruction) is truly uniform.
-// Note that we can't (currently) rely on UniformValueAnalysis for this
-// purpose. We need to be able to discern "truly" uniform values from uniform
-// values which are only uniform on active lanes.
-// FIXME: This is pessimistic. We could expand on this, or enhance the
-// UniformValueAnalysis.
-static bool isBranchCondTrulyUniform(Value *cond) {
+static bool isBranchCondTrulyUniform(Value *cond, UniformValueResult &UVR) {
   const auto *cmp = dyn_cast_if_present<CmpInst>(cond);
   if (!cmp || cmp->getType()->isVectorTy()) {
     return false;
   }
 
-  // Pessimistically assume that only arguments and constants are truly
-  // uniform: i.e., they won't given different reuslts on active vs inactive
-  // lanes.
-  return llvm::all_of(cmp->operands(), [](Value *op) {
-    return isa<Argument>(op) || isa<Constant>(op);
-  });
+  return UVR.isTrueUniform(cmp);
 }
 }  // namespace
 
@@ -1545,7 +1535,7 @@ bool ControlFlowConversionState::Impl::createBranchReductions() {
         // FIXME: Is this missing incorrect branches in uniform blocks/loops?
         if (auto *LTag = DR->getTag(&BB).loop;
             DR->isDivergent(BB) && (!LTag || LTag->isLoopDivergent())) {
-          if (!isBranchCondTrulyUniform(cond)) {
+          if (!isBranchCondTrulyUniform(cond, *UVR)) {
             cond = BinaryOperator::Create(Instruction::BinaryOps::And, cond,
                                           MaskInfos[&BB].entryMask,
                                           cond->getName() + "_active", Branch);
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge3.ll
index ccd79ca20e4b6..80aa461758366 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge3.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge3.ll
@@ -27,7 +27,7 @@ declare i64 @__mux_get_global_id(i32) #0
 ; Function Attrs: nounwind readnone
 declare spir_func <4 x float> @_Z6vload4mPU3AS1Kf(i64, float addrspace(1)*)
 
-define spir_kernel void @boscc_merge3(float addrspace(1)* %out, i64 %n, float %m) {
+define spir_kernel void @boscc_merge3(float addrspace(1)* %out, i64 noundef %n, float noundef %m) {
 entry:
   %gid0 = tail call i64 @__mux_get_global_id(i32 0) #0
   %gid1 = tail call i64 @__mux_get_global_id(i32 1) #0
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization12.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization12.ll
index eba24fbbd23c1..914dc5a28a347 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization12.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization12.ll
@@ -213,7 +213,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
 ; Function Attrs: nounwind
-define spir_kernel void @partial_linearization12(i32 addrspace(1)* %out, i32 %n) #0 {
+define spir_kernel void @partial_linearization12(i32 addrspace(1)* %out, i32 noundef %n) #0 {
 entry:
   %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization17.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization17.ll
index 8f9420633ae28..6e4485b743385 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization17.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization17.ll
@@ -130,7 +130,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
 ; Function Attrs: convergent nounwind
-define spir_kernel void @partial_linearization17(i32 addrspace(1)* %out, i32 %n, i32 %x) #0 {
+define spir_kernel void @partial_linearization17(i32 addrspace(1)* %out, i32 noundef %n, i32 noundef %x) #0 {
 entry:
   %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization18.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization18.ll
index 571c0a48fdc06..0c281548e2b54 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization18.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization18.ll
@@ -109,7 +109,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
 ; Function Attrs: convergent nounwind
-define spir_kernel void @partial_linearization18(i32 addrspace(1)* %out, i32 %n) #0 {
+define spir_kernel void @partial_linearization18(i32 addrspace(1)* %out, i32 noundef %n) #0 {
 entry:
   %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization19.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization19.ll
index 7b67cbd488fdd..dc06f7bb8372f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization19.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization19.ll
@@ -118,7 +118,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
 ; Function Attrs: convergent nounwind
-define spir_kernel void @partial_linearization19(i32 addrspace(1)* %out, i32 %n) #0 {
+define spir_kernel void @partial_linearization19(i32 addrspace(1)* %out, i32 noundef %n) #0 {
 entry:
   %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization5.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization5.ll
index fb52cd7854755..f50d14347636e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization5.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization5.ll
@@ -88,7 +88,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
 ; Function Attrs: nounwind
-define spir_kernel void @partial_linearization5(i32 addrspace(1)* %out, i32 %n) #0 {
+define spir_kernel void @partial_linearization5(i32 addrspace(1)* %out, i32 noundef %n) #0 {
 entry:
   %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
@@ -227,11 +227,7 @@ attributes #2 = { nobuiltin nounwind readonly }
 
 ; CHECK: [[IFELSE5]]:
 ; CHECK: %[[CMP7:.+]] = icmp
-; FIXME: We shouldn't need to mask this comparison, as it's truly uniform even
-; on inactive lanes.
-; CHECK: %[[CMP7_ACTIVE:.+]] = and i1 %[[CMP7]], {{%.*}}
-; CHECK: %[[CMP7_ACTIVE_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[CMP7_ACTIVE]])
-; CHECK: br i1 %[[CMP7_ACTIVE_ANY]], label %[[IFTHEN]], label %[[FORCOND14PREHEADER:.+]]
+; CHECK: br i1 %[[CMP7]], label %[[IFTHEN]], label %[[FORCOND14PREHEADER:.+]]
 
 ; CHECK: [[FORCOND14PREHEADER]]:
 ; CHECK: br label %[[FORCOND14:.+]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization6.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization6.ll
index b4a295e3f90c6..bfdbe5321f762 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization6.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization6.ll
@@ -85,7 +85,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
 ; Function Attrs: nounwind
-define spir_kernel void @partial_linearization6(i32 addrspace(1)* %out, i32 %n) #0 {
+define spir_kernel void @partial_linearization6(i32 addrspace(1)* %out, i32 noundef %n) #0 {
 entry:
   %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
@@ -168,11 +168,7 @@ attributes #2 = { nobuiltin nounwind readonly }
 
 ; CHECK: [[WHILEBODY]]:
 ; CHECK: %[[CMP:.+]] = icmp
-; FIXME: We shouldn't need to mask this comparison, as it's truly uniform even
-; on inactive lanes.
-; CHECK: %[[CMP_ACTIVE:.+]] = and i1 %[[CMP]], {{%.*}}
-; CHECK: %[[CMP_ACTIVE_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[CMP_ACTIVE]])
-; CHECK: br i1 %[[CMP_ACTIVE_ANY]], label %[[IFTHEN:.+]], label %[[IFELSE:.+]]
+; CHECK: br i1 %[[CMP]], label %[[IFTHEN:.+]], label %[[IFELSE:.+]]
 
 ; CHECK: [[IFTHEN]]:
 ; CHECK: %[[CMP2:.+]] = icmp
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization7.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization7.ll
index 1bbf53c652c9e..e5326572eb93d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization7.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization7.ll
@@ -102,7 +102,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
 ; Function Attrs: nounwind
-define spir_kernel void @partial_linearization7(i32 addrspace(1)* %out, i32 %n) #0 {
+define spir_kernel void @partial_linearization7(i32 addrspace(1)* %out, i32 noundef %n) #0 {
 entry:
   %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization12.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization12.ll
index f226c3eb5bb8d..30b5c603d288b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization12.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization12.ll
@@ -211,7 +211,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
 ; Function Attrs: nounwind
-define spir_kernel void @partial_linearization12(i32 addrspace(1)* %out, i32 %n) #0 {
+define spir_kernel void @partial_linearization12(i32 addrspace(1)* %out, i32 noundef %n) #0 {
 entry:
   %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization17.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization17.ll
index d1cf75289c5eb..fe9c347315149 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization17.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization17.ll
@@ -128,7 +128,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
 ; Function Attrs: convergent nounwind
-define spir_kernel void @partial_linearization17(i32 addrspace(1)* %out, i32 %n, i32 %x) #0 {
+define spir_kernel void @partial_linearization17(i32 addrspace(1)* %out, i32 noundef %n, i32 noundef %x) #0 {
 entry:
   %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization18.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization18.ll
index 3fc928055b86b..4e7d3dd3f6f64 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization18.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization18.ll
@@ -107,7 +107,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
 ; Function Attrs: convergent nounwind
-define spir_kernel void @partial_linearization18(i32 addrspace(1)* %out, i32 %n) #0 {
+define spir_kernel void @partial_linearization18(i32 addrspace(1)* %out, i32 noundef %n) #0 {
 entry:
   %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization19.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization19.ll
index b5c52ad8c3341..3dd1c4adb4953 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization19.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization19.ll
@@ -116,7 +116,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
 ; Function Attrs: convergent nounwind
-define spir_kernel void @partial_linearization19(i32 addrspace(1)* %out, i32 %n) #0 {
+define spir_kernel void @partial_linearization19(i32 addrspace(1)* %out, i32 noundef %n) #0 {
 entry:
   %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization5.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization5.ll
index e4e6badc21dea..31585396866e9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization5.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization5.ll
@@ -86,7 +86,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
 ; Function Attrs: nounwind
-define spir_kernel void @partial_linearization5(i32 addrspace(1)* %out, i32 %n) #0 {
+define spir_kernel void @partial_linearization5(i32 addrspace(1)* %out, i32 noundef %n) #0 {
 entry:
   %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32
@@ -187,11 +187,7 @@ attributes #2 = { nobuiltin nounwind readonly }
 
 ; CHECK: [[IFELSE5]]:
 ; CHECK: %[[CMP7:.+]] = icmp
-; FIXME: We shouldn't need to mask this comparison, as it's truly uniform even
-; on inactive lanes.
-; CHECK: %[[CMP7_ACTIVE:.+]] = and i1 %[[CMP7]], {{%.*}}
-; CHECK: %[[CMP7_ACTIVE_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[CMP7_ACTIVE]])
-; CHECK: br i1 %[[CMP7_ACTIVE_ANY]], label %[[IFTHEN]], label %[[FORCOND14PREHEADER:.+]]
+; CHECK: br i1 %[[CMP7]], label %[[IFTHEN]], label %[[FORCOND14PREHEADER:.+]]
 
 ; CHECK: [[FORCOND14PREHEADER]]:
 ; CHECK: br label %[[FORCOND14:.+]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization7.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization7.ll
index 13bbb4131361a..79279bc37c768 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization7.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization7.ll
@@ -93,7 +93,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
 ; Function Attrs: nounwind
-define spir_kernel void @partial_linearization7(i32 addrspace(1)* %out, i32 %n) #0 {
+define spir_kernel void @partial_linearization7(i32 addrspace(1)* %out, i32 noundef %n) #0 {
 entry:
   %call = call i64 @__mux_get_global_id(i32 0) #2
   %conv = trunc i64 %call to i32

From 3798c49e80b0c160453c457ba8740a1e862c3a9e Mon Sep 17 00:00:00 2001
From: PietroGhg <pietro.ghiglio@codeplay.com>
Date: Mon, 29 Jan 2024 13:59:41 +0000
Subject: [PATCH 090/182] Update findDbgDeclare for LLVM 18

---
 .../vecz/source/transform/basic_mem2reg_pass.cpp               | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp
index b5559151f5200..cd2bb4f3c0dcf 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp
@@ -184,8 +184,7 @@ bool BasicMem2RegPass::promoteAlloca(AllocaInst *Alloca) const {
       ToDelete.push_back(Store);
       DIBuilder DIB(*Alloca->getModule(), /*AllowUnresolved*/ false);
 #if LLVM_VERSION_GREATER_EQUAL(18, 0)
-      SmallVector<DbgDeclareInst *, 1> DbgIntrinsics;
-      findDbgDeclares(DbgIntrinsics, Alloca);
+      auto DbgIntrinsics = findDbgDeclares(Alloca);
 #elif LLVM_VERSION_GREATER_EQUAL(17, 0)
       auto DbgIntrinsics = FindDbgDeclareUses(Alloca);
 #else

From 0549866b4931b362e2ce84ecdcdb9608ea3bb361 Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Wed, 31 Jan 2024 00:28:44 +0000
Subject: [PATCH 091/182] Enable extra warnings, adjust code.

We would like to be able to build with -pedantic -Wcast-qual, the former
because it flags accidental use of non-standard language constructs, the
latter because it is also enabled by LLVM. These are now enabled, and
code is updated to address the warnings raised.

The C standard is bumped from C99 to C11 to account for the fact that we
were already using C11 language features, which are now flagged by
-pedantic. The move to C11 also enables the
checkDeprecatedOrUnsafeBufferHandling warning, which tells us not to use
scanf and to use scanf_s instead, despite scanf_s being optional and
unavailable on platforms we support, and despite our use of scanf being
safe. Therefore, this warning is disabled.
---
 .../vecz/source/include/analysis/instantiation_analysis.h       | 2 +-
 .../compiler_passes/vecz/source/offset_info.cpp                 | 2 +-
 .../compiler_passes/vecz/source/transform/printf_scalarizer.cpp | 2 +-
 .../compiler_passes/vecz/source/vectorization_context.cpp       | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/instantiation_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/instantiation_analysis.h
index cadb756cef6a5..2a93187cdd979 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/instantiation_analysis.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/instantiation_analysis.h
@@ -31,6 +31,6 @@ class VectorizationContext;
 ///
 /// @return true iff the instruction requires instantiation.
 bool needsInstantiation(const VectorizationContext &Ctx, llvm::Instruction &I);
-};  // namespace vecz
+}  // namespace vecz
 
 #endif  // VECZ_ANALYSIS_INSTANTIATION_ANALYSIS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp
index cc9f700804312..01b453b208da9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp
@@ -50,7 +50,7 @@ uint8_t highbit(const uint32_t x) {
       31, 15, 28, 21, 19, 10, 12, 6,  14, 27, 9,  5,  26, 8, 25, 24,
   };
   return tab[(uint32_t)(x * deBruijn_magic) >> 27];
-};
+}
 
 // Returns a value extended or truncated to match the size type of the target.
 // This will return the original value if it is already the correct size.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/printf_scalarizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/printf_scalarizer.cpp
index 224cd5c3718cc..ff544ff69adaf 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/printf_scalarizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/printf_scalarizer.cpp
@@ -85,7 +85,7 @@ bool IncrementPtr(const char **fmt) {
 GlobalVariable *GetNewFormatStringAsGlobalVar(
     Module &module, GlobalVariable *const string_value,
     const std::string &new_format_string) {
-  const ArrayRef<uint8_t> Elts((uint8_t *)new_format_string.data(),
+  const ArrayRef<uint8_t> Elts((const uint8_t *)new_format_string.data(),
                                new_format_string.size());
   Constant *new_format_string_const =
       ConstantDataArray::get(module.getContext(), Elts);
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
index 933e6beb03d7f..a8529c5a91cb4 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
@@ -705,7 +705,7 @@ std::optional<std::tuple<bool, RecurKind, bool>> isSubgroupScan(
   }
   return std::nullopt;
 }
-};  // namespace
+}  // namespace
 
 bool VectorizationContext::defineInternalBuiltin(Function *F) {
   assert(F->isDeclaration() && "builtin is already defined");

From 0ce030eb0b13dc50f3a7527c2c5a0af0bc33e513 Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Thu, 1 Feb 2024 19:06:41 +0000
Subject: [PATCH 092/182] Upgrade to clang-tidy-17.

* misc-include-cleaner is disabled globally because at the moment, it
  requires annotations in external headers we have no control over.
* misc-use-anonymous-namespace is disabled globally because it goes
  against LLVM style, and the warning is triggered by use of LLVM macros
  that we have no control over.
* modernize-macro-to-enum is disabled globally because it also warns on
  macros that should be usable in preprocessor conditions.

Everything else either has the code adjusted according to the intent of
the warning, or has the warning suppressed as appropriate. Warnings are
suppressed for false positives (intentionally unused function return
values, and two instances of clang-tidy warning about undefined code
that is actually well-defined), and for code that we cannot improve
without a bigger refactor (unused function return values in places where
we have no way to indicate an error).
---
 .../include/multi_llvm/vector_type_helper.h            |  4 ++--
 .../vecz/source/analysis/divergence_analysis.cpp       | 10 +++++-----
 .../vecz/source/transform/packetizer.cpp               |  6 +++---
 .../vecz/source/transform/scalarizer.cpp               |  8 ++++----
 4 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/vector_type_helper.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/vector_type_helper.h
index 3b281a47b94b0..f6fb52dabf054 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/vector_type_helper.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/vector_type_helper.h
@@ -35,14 +35,14 @@ inline llvm::Type *getVectorElementType(const llvm::Type *ty) {
   return llvm::cast<llvm::VectorType>(ty)->getElementType();
 }
 
-inline unsigned getVectorNumElements(llvm::Type *ty) {
+inline uint64_t getVectorNumElements(llvm::Type *ty) {
   assert(ty->getTypeID() == llvm::Type::FixedVectorTyID &&
          "Not a fixed vector type");
   return llvm::cast<llvm::FixedVectorType>(ty)
       ->getElementCount()
       .getFixedValue();
 }
-inline unsigned getVectorNumElements(const llvm::Type *ty) {
+inline uint64_t getVectorNumElements(const llvm::Type *ty) {
   assert(ty->getTypeID() == llvm::Type::FixedVectorTyID &&
          "Not a fixed vector type");
   return llvm::cast<llvm::FixedVectorType>(ty)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/divergence_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/divergence_analysis.cpp
index 0d10d73b9a9a3..8fe467d75e3fd 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/divergence_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/divergence_analysis.cpp
@@ -426,12 +426,12 @@ void DivergenceResult::markByAll(BasicBlock &src) {
           const auto *const DLoopTag = basicBlockTags[DIndex].loop;
           // If we are not in a loop, or the loop we live in does not diverge
           // nor does the one englobing us if it exists, then mark by_all.
-          Loop *parentLoop;
-          if (!DLoopTag || (!DLoopTag->isLoopDivergent() &&
-                            (!(parentLoop = DLoopTag->loop->getParentLoop()) ||
-                             isByAll(*parentLoop->getHeader())))) {
-            queue.push(DIndex);
+          if (DLoopTag) {
+            if (DLoopTag->isLoopDivergent()) continue;
+            Loop *parentLoop = DLoopTag->loop->getParentLoop();
+            if (parentLoop && !isByAll(*parentLoop->getHeader())) continue;
           }
+          queue.push(DIndex);
         }
       }
     }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
index 524a263521281..23e2674676870 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -3658,19 +3658,19 @@ ValuePacket Packetizer::Impl::packetizeInsertElement(
     if (Indices != Index) {
       Type *IdxTy = Index->getType();
       SmallVector<Constant *, 16> Offsets;
-      for (unsigned i = 0; i < Width; ++i) {
+      for (size_t i = 0; i < Width; ++i) {
         Offsets.push_back(ConstantInt::get(IdxTy, i * ScalarWidth));
       }
       Value *Add = B.CreateAdd(Indices, ConstantVector::get(Offsets));
 
-      for (unsigned i = 0; i < Width; ++i) {
+      for (size_t i = 0; i < Width; ++i) {
         Value *ExtractElt =
             (Elts != Elt) ? B.CreateExtractElement(Elts, B.getInt32(i)) : Elt;
         Value *ExtractIdx = B.CreateExtractElement(Add, B.getInt32(i));
         Result = B.CreateInsertElement(Result, ExtractElt, ExtractIdx, Name);
       }
     } else {
-      for (unsigned i = 0; i < Width; ++i) {
+      for (size_t i = 0; i < Width; ++i) {
         Value *ExtractElt =
             (Elts != Elt) ? B.CreateExtractElement(Elts, B.getInt32(i)) : Elt;
         Value *InsertIdx = B.CreateAdd(Index, B.getInt32(i * ScalarWidth));
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
index 833b49bf09c4a..adc2d659c36f7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
@@ -486,8 +486,8 @@ Value *Scalarizer::scalarizeOperandsBitCast(BitCastInst *BC) {
   Type *DstAsIntTy = DstTy;
   Type *SrcEleTy = VecSrcTy->getElementType();
   Type *SrcEleAsIntTy = SrcEleTy;
-  const unsigned SrcEleBits = SrcEleTy->getScalarSizeInBits();
-  const unsigned DstBits = DstTy->getPrimitiveSizeInBits();
+  const uint64_t SrcEleBits = SrcEleTy->getScalarSizeInBits();
+  const uint64_t DstBits = DstTy->getPrimitiveSizeInBits();
   if (!DstTy->isIntegerTy()) {
     DstAsIntTy = IntegerType::get(BC->getContext(), DstBits);
   }
@@ -1068,8 +1068,8 @@ SimdPacket *Scalarizer::scalarizeBitCast(BitCastInst *BC, PacketMask PM) {
     Value *SrcAsInt = Src;
     Type *DstEleTy = VecDstTy->getElementType();
     Type *DstEleAsIntTy = DstEleTy;
-    const unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();
-    const unsigned LaneBits = DstEleTy->getPrimitiveSizeInBits();
+    const uint64_t SrcBits = SrcTy->getPrimitiveSizeInBits();
+    const uint64_t LaneBits = DstEleTy->getPrimitiveSizeInBits();
     if (!SrcTy->isIntegerTy()) {
       SrcAsIntTy = SrcTy->getIntNTy(BC->getContext(), SrcBits);
       SrcAsInt = B.CreateBitCast(SrcAsInt, SrcAsIntTy);

From ebb0a0d2e6cc884c8a1035bf4a825aad91ab6591 Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Thu, 1 Feb 2024 21:05:16 +0000
Subject: [PATCH 093/182] Update tests for LLVM 19.

* LLVM 19 canonicalizes getelementptr instructions, so tests are updated
  to allow either old or new output.
* LLVM 19 renames a pass, which cannot be handled in a single test. The
  test is split in two, one for LLVM 16, 17, 18 which uses the old name,
  one for LLVM 19 which uses the new name.
---
 .../vecz/test/lit/llvm/constant_address.ll    |   2 +-
 .../lit/llvm/constant_address_with_uniform.ll |   2 +-
 .../vecz/test/lit/llvm/gep_duplication.ll     |   2 +-
 .../llvm/partial_linearization22-llvm18.ll    | 264 ++++++++++++++++++
 .../test/lit/llvm/partial_linearization22.ll  |   4 +-
 .../vecz/test/lit/llvm/scalarize_mixed_gep.ll |   2 +-
 6 files changed, 270 insertions(+), 6 deletions(-)
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization22-llvm18.ll

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address.ll
index 3c7935414409a..a191e7314efc8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address.ll
@@ -54,5 +54,5 @@ attributes #1 = { nounwind readnone }
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: %gid = call i64 @__mux_get_global_id(i32 0)
 ; CHECK-NEXT: %conv = trunc i64 %gid to i32
-; CHECK-NEXT: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %out, i64 3
+; CHECK-NEXT: %arrayidx = getelementptr inbounds {{i32|i8}}, ptr addrspace(1) %out, i64 {{3|12}}
 ; CHECK-NEXT: store i32 %conv, ptr addrspace(1) %arrayidx, align 4
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address_with_uniform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address_with_uniform.ll
index d2ff89e2e6aab..c58dfa1e0229d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address_with_uniform.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address_with_uniform.ll
@@ -36,6 +36,6 @@ entry:
 ; CHECK: define spir_kernel void @__vecz_v4_test
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: %gid = call i32 @__mux_get_global_id(i32 0)
-; CHECK-NEXT: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %out, i32 3
+; CHECK-NEXT: %arrayidx = getelementptr inbounds {{i32|i8}}, ptr addrspace(1) %out, i32 {{3|12}}
 ; CHECK: store i32 %gid, ptr addrspace(1) %arrayidx, align 4
 ; CHECK: store <4 x ptr addrspace(1)> %{{.+}}, ptr addrspace(1) %{{.+}}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_duplication.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_duplication.ll
index bf3af3df43f6b..5ac166cedf570 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_duplication.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_duplication.ll
@@ -26,7 +26,7 @@ target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 ; combination of instcombine and GVN).
 ; CHECK: spir_kernel void @__vecz_v{{[0-9]+}}_gep_duplication
 ; CHECK: entry:
-; CHECK: getelementptr inbounds [2 x i32], ptr %myStruct, i{{32|64}} 0, i{{32|64}} 1
+; CHECK: getelementptr inbounds {{\[2 x i32]|i8}}, ptr %myStruct, {{i64 0, i64 1|i64 4}}
 ; CHECK-NOT: getelementptr {{.*}}%myStruct
 define spir_kernel void @gep_duplication(ptr addrspace(1) align 4 %out) {
 entry:
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization22-llvm18.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization22-llvm18.ll
new file mode 100644
index 0000000000000..36cdfa9b7bdb4
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization22-llvm18.ll
@@ -0,0 +1,264 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; REQUIRES: !llvm-19+
+; RUN: veczc -k partial_linearization22 -vecz-passes="function(lowerswitch),vecz-loop-rotate,indvars,cfg-convert" -S < %s | FileCheck %s
+
+; The CFG of the following kernel is:
+;
+;     a
+;     |
+;     b <------.
+;    / \       |
+;   f   c <--. |
+;   |\ / \   | |
+;   | |   d -' |
+;   | |\ / \   |
+;   | | |   e -'
+;   | | |\ /
+;   | | | g
+;   | | |/
+;   | | /
+;    \|/
+;     h
+;
+; * where nodes b, d, and e are uniform branches, and node c is a varying
+;   branch.
+; * where nodes b, d, e and f are divergent.
+;
+; With partial linearization, it will be transformed as follows:
+;
+;     a
+;     |
+;     b <--.
+;    /|    |
+;   f c <. |
+;   | |  | |
+;   | d -' |
+;   | |    |
+;   | e ---'
+;    \|
+;     g
+;     |
+;     h
+;
+; __kernel void partial_linearization22(__global int *out, int n) {
+;   int id = get_global_id(0);
+;   int ret = 0;
+;
+;   while (1) {
+;     if (n > 0 && n < 5) {
+;       goto f;
+;     }
+;     while (1) {
+;       if (n <= 2) {
+;         goto f;
+;       } else {
+;         if (ret + id >= n) {
+;           goto d;
+;         }
+;       }
+;       if (n & 1) {
+;         goto h;
+;       }
+;
+; d:
+;       if (n > 3) {
+;         goto e;
+;       }
+;     }
+;
+; e:
+;     if (n & 1) {
+;       goto g;
+;     }
+;   }
+;
+; f:
+;   if (n == 2) {
+;     goto h;
+;   }
+;
+; g:
+;   for (int i = 0; i < n + 1; i++) ret++;
+;   goto h;
+;
+; h:
+;   out[id] = ret;
+; }
+
+; ModuleID = 'Unknown buffer'
+source_filename = "kernel.opencl"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @partial_linearization22(i32 addrspace(1)* %out, i32 %n) #0 {
+entry:
+  %call = call i64 @__mux_get_global_id(i32 0) #2
+  %conv = trunc i64 %call to i32
+  br label %while.body
+
+while.body:                                       ; preds = %e, %entry
+  %n.off = add i32 %n, -1
+  %0 = icmp ult i32 %n.off, 4
+  %cmp6 = icmp slt i32 %n, 3
+  %or.cond1 = or i1 %cmp6, %0
+  br i1 %or.cond1, label %f, label %if.else
+
+while.body5:                                      ; preds = %d
+  switch i32 %n, label %g [
+    i32 3, label %if.else
+    i32 2, label %h
+  ]
+
+if.else:                                          ; preds = %while.body5, %while.body
+  %cmp9 = icmp sge i32 %conv, %n
+  %and = and i32 %n, 1
+  %tobool = icmp eq i32 %and, 0
+  %or.cond2 = or i1 %tobool, %cmp9
+  br i1 %or.cond2, label %d, label %h
+
+d:                                                ; preds = %if.else
+  %cmp16 = icmp sgt i32 %n, 3
+  br i1 %cmp16, label %e, label %while.body5
+
+e:                                                ; preds = %d
+  %and20 = and i32 %n, 1
+  %tobool21 = icmp eq i32 %and20, 0
+  br i1 %tobool21, label %while.body, label %g
+
+f:                                                ; preds = %while.body
+  %cmp24 = icmp eq i32 %n, 2
+  br i1 %cmp24, label %h, label %g
+
+g:                                                ; preds = %f, %e, %while.body5
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %g
+  %ret.0 = phi i32 [ 0, %g ], [ %inc, %for.body ]
+  %storemerge = phi i32 [ 0, %g ], [ %inc31, %for.body ]
+  %cmp29 = icmp sgt i32 %storemerge, %n
+  br i1 %cmp29, label %h, label %for.body
+
+for.body:                                         ; preds = %for.cond
+  %inc = add nuw nsw i32 %ret.0, 1
+  %inc31 = add nuw nsw i32 %storemerge, 1
+  br label %for.cond
+
+h:                                                ; preds = %for.cond, %f, %if.else, %while.body5
+  %ret.1 = phi i32 [ 0, %f ], [ %ret.0, %for.cond ], [ 0, %if.else ], [ 0, %while.body5 ]
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %ret.1, i32 addrspace(1)* %arrayidx, align 4
+  ret void
+}
+
+; Function Attrs: convergent nounwind readonly
+declare i64 @__mux_get_global_id(i32) #1
+
+attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { convergent nobuiltin nounwind readonly }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.kernels = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization22, !3, !4, !5, !6, !7, !8}
+!3 = !{!"kernel_arg_addr_space", i32 1, i32 0}
+!4 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!5 = !{!"kernel_arg_type", !"int*", !"int"}
+!6 = !{!"kernel_arg_base_type", !"int*", !"int"}
+!7 = !{!"kernel_arg_type_qual", !"", !""}
+!8 = !{!"kernel_arg_name", !"out", !"n"}
+
+; CHECK: spir_kernel void @__vecz_v4_partial_linearization22
+; CHECK: br label %[[WHILEBODY:.+]]
+
+; CHECK: [[WHILEBODY]]:
+; CHECK: %[[CMP6:.+]] = icmp slt
+; CHECK: %[[ORCOND1:.+]] = or i1 %[[CMP6]]
+; CHECK: %[[F_EXIT_MASK:.+]] = select i1
+; CHECK: %[[ORCOND2:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[ORCOND1]])
+; CHECK: br i1 %[[ORCOND2]], label %[[F:.+]], label %[[IFELSEPREHEADER:.+]]
+
+; CHECK: [[IFELSEPREHEADER]]:
+; CHECK: br label %[[IFELSE:.+]]
+
+; CHECK: [[LEAFBLOCK1:.*]]:
+; CHECK: %[[SWITCHLEAF:.+]] = icmp eq i32 %n, 3
+; CHECK: br i1 %{{.+}}, label %[[IFELSE]], label %[[IFELSEPUREEXIT:.+]]
+
+; CHECK: [[IFELSEPUREEXIT]]:
+; CHECK: br label %[[E:.+]]
+
+; CHECK: [[IFELSE]]:
+; CHECK: br label %[[D:.+]]
+
+; CHECK: [[D]]:
+; CHECK: br label %[[LEAFBLOCK1]]
+
+; CHECK: [[E]]:
+; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]]
+
+; CHECK: [[WHILEBODYPUREEXIT]]:
+; CHECK: %[[CMP24MERGE:.+]] = phi i1 [ %[[G_EXIT_MASK:.+]], %[[F]] ], [ false, %[[E]] ]
+; CHECK: br label %[[HLOOPEXIT1:.+]]
+
+; CHECK: [[F]]:
+; CHECK: %[[CMP24:.+]] = icmp eq i32 %n, 2
+; CHECK: %[[G_EXIT_MASK]] = select i1 %[[CMP24]], i1 false, i1 %[[F_EXIT_MASK]]
+; CHECK: br label %[[WHILEBODYPUREEXIT]]
+
+; CHECK: [[FELSE:.+]]:
+; CHECK: br label %[[G:.+]]
+
+; CHECK: [[FSPLIT:.+]]:
+; CHECK: %[[CMP24_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %cmp24.merge)
+; CHECK: br i1 %[[CMP24_ANY]], label %[[H:.+]], label %[[G]]
+
+; CHECK: [[GLOOPEXIT:.+]]:
+; CHECK: br label %[[GLOOPEXITELSE:.+]]
+
+; CHECK: [[GLOOPEXITELSE]]:
+; CHECK: br i1 %{{.+}}, label %[[FELSE]], label %[[FSPLIT]]
+
+; CHECK: [[G]]:
+; CHECK: br label %[[FORCOND:.+]]
+
+; CHECK: [[FORCOND]]:
+; CHECK: br i1 true, label %[[HLOOPEXIT:.+]], label %[[FORBODY:.+]]
+
+; CHECK: [[FORBODY]]:
+; CHECK: br label %[[FORCOND]]
+
+
+
+; CHECK: [[HLOOPEXIT]]:
+; CHECK: br label %[[H:.+]]
+
+; CHECK: [[HLOOPEXIT1]]:
+; CHECK: br label %[[HLOOPEXIT1ELSE:.+]]
+
+; CHECK: [[HLOOPEXIT1ELSE]]:
+; CHECK: br label %[[GLOOPEXIT]]
+
+;; CHECK: [[H]]:
+;; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization22.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization22.ll
index 5a8b3dc38a0c6..291dafd8e1456 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization22.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization22.ll
@@ -14,8 +14,8 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; REQUIRES: llvm-12+
-; RUN: veczc -k partial_linearization22 -vecz-passes="function(lowerswitch),vecz-loop-rotate,indvars,cfg-convert" -S < %s | FileCheck %s
+; REQUIRES: llvm-19+
+; RUN: veczc -k partial_linearization22 -vecz-passes="function(lower-switch),vecz-loop-rotate,indvars,cfg-convert" -S < %s | FileCheck %s
 
 ; The CFG of the following kernel is:
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize_mixed_gep.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize_mixed_gep.ll
index d9bc298514967..8abeed7bcdd12 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize_mixed_gep.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize_mixed_gep.ll
@@ -42,5 +42,5 @@ define void @bar(i64** %ptrptrs, i64 %val) {
 ; gets scalarized/re-packetized correctly
 
 ; CHECK: define void @__vecz_v4_bar
-; CHECK: %[[ADDR:.+]] = getelementptr inbounds i64, <4 x ptr> %{{.+}}, i64 2
+; CHECK: %[[ADDR:.+]] = getelementptr inbounds {{i64|i8}}, <4 x ptr> %{{.+}}, {{i64 2|i64 16}}
 ; CHECK: call void @__vecz_b_scatter_store8_Dv4_mDv4_u3ptr(<4 x i64> %.splat{{.*}}, <4 x ptr> %[[ADDR]])

From fb17e5f735cab116cb70ba3980ef90cb7c7c1c33 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Thu, 15 Feb 2024 14:31:31 +0000
Subject: [PATCH 094/182] [vecz] Use LLVM helper for testing for powers of 2

This should make the code a little more readable at first glance.
---
 .../compiler_passes/vecz/source/offset_info.cpp            | 7 ++++---
 .../vecz/source/transform/squash_small_vectors_pass.cpp    | 4 ++--
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp
index 01b453b208da9..afe3c31d30a9f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp
@@ -22,6 +22,7 @@
 #include <llvm/IR/Instructions.h>
 #include <llvm/IR/Module.h>
 #include <llvm/Support/KnownBits.h>
+#include <llvm/Support/MathExtras.h>
 
 #include "analysis/instantiation_analysis.h"
 #include "analysis/stride_analysis.h"
@@ -40,7 +41,7 @@ inline uint64_t SizeOrZero(TypeSize &&T) {
 }
 
 uint8_t highbit(const uint32_t x) {
-  assert((x & (x - 1)) == 0 && "Value must be a power of two");
+  assert(isPowerOf2_32(x) && "Value must be a power of two");
   // This is a De Bruijn hash table, it returns the index of the highest
   // bit, which works when x is a power of 2. For details, see
   // https://en.wikipedia.org/wiki/De_Bruijn_sequence#Uses
@@ -636,7 +637,7 @@ OffsetInfo &OffsetInfo::manifest(IRBuilder<> &B, StrideAnalysisResult &SAR) {
         // Don't need to do anything if the size is 1
         idxStride = idxOffset.ManifestStride;
       } else {
-        if ((MemSize & (MemSize - 1)) == 0) {
+        if (isPowerOf2_64(MemSize)) {
           // the size is a power of two, so shift to get the offset in bytes
           auto *const SizeVal = getSizeInt(B, highbit(MemSize));
           idxStride = B.CreateShl(idxOffset.ManifestStride, SizeVal);
@@ -694,7 +695,7 @@ Value *OffsetInfo::buildMemoryStride(IRBuilder<> &B, Type *PtrEleTy,
     return nullptr;
   }
 
-  if ((PtrEleSize & (PtrEleSize - 1)) == 0) {
+  if (isPowerOf2_64(PtrEleSize)) {
     auto ShiftVal = highbit(PtrEleSize);
     if (auto *BinOp = dyn_cast<BinaryOperator>(ManifestStride)) {
       if (BinOp->getOpcode() == Instruction::Shl) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/squash_small_vectors_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/squash_small_vectors_pass.cpp
index b7f3f8a013f9c..40067523ecc03 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/squash_small_vectors_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/squash_small_vectors_pass.cpp
@@ -89,7 +89,7 @@ PreservedAnalyses SquashSmallVectorsPass::run(Function &F,
         auto *const ty = load->getType();
         auto *const scalarTy = ty->getScalarType();
         const unsigned numBits = ty->getPrimitiveSizeInBits();
-        if ((numBits & (numBits - 1)) == 0 && scalarTy != ty &&
+        if (isPowerOf2_32(numBits) && scalarTy != ty &&
             DL.fitsInLegalInteger(numBits)) {
           const auto align = load->getAlign();
           auto *const intTy = IntegerType::get(context, numBits);
@@ -133,7 +133,7 @@ PreservedAnalyses SquashSmallVectorsPass::run(Function &F,
         auto *const ty = data->getType();
         auto *const scalarTy = ty->getScalarType();
         const unsigned numBits = ty->getPrimitiveSizeInBits();
-        if ((numBits & (numBits - 1)) == 0 && scalarTy != ty &&
+        if (isPowerOf2_32(numBits) && scalarTy != ty &&
             DL.fitsInLegalInteger(numBits)) {
           const auto align = store->getAlign();
           auto *const intTy = IntegerType::get(context, numBits);

From e0e0fa9484c7a579b47ac9b1d137e3c34f5e07f0 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Thu, 15 Feb 2024 16:27:49 +0000
Subject: [PATCH 095/182] [vecz] Add a pass to print StrideAnalysis results

This should make it easier to "unit test" the raw results of the
StrideAnalysis in isolation; we know the analysis has some bugs.

Regression cases for any bugs fixed in the StrideAnalysis can be added
to this test over time.
---
 .../vecz/source/analysis/stride_analysis.cpp  |  31 +++++
 .../source/include/analysis/stride_analysis.h |  13 +++
 .../compiler_passes/vecz/source/passes.def    |   2 +
 .../vecz/test/lit/llvm/stride_analysis.ll     | 109 ++++++++++++++++++
 4 files changed, 155 insertions(+)
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_analysis.ll

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/stride_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/stride_analysis.cpp
index 673997d4fae29..de1f24c6ae67c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/stride_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/stride_analysis.cpp
@@ -88,3 +88,34 @@ StrideAnalysisResult StrideAnalysis::run(llvm::Function &F,
   auto &UVR = AM.getResult<UniformValueAnalysis>(F);
   return Result(F, UVR, AC);
 }
+
+PreservedAnalyses StrideAnalysisPrinterPass::run(Function &F,
+                                                 FunctionAnalysisManager &AM) {
+  auto &SAR = AM.getResult<StrideAnalysis>(F);
+  OS << "StrideAnalysis for function '" << F.getName() << "':\n";
+
+  for (auto &BB : F) {
+    for (auto &I : BB) {
+      if (auto MO = MemOp::get(&I)) {
+        auto *const Ptr = MO->getPointerOperand();
+        if (!Ptr) {
+          continue;
+        }
+        if (const OffsetInfo *Info = SAR.getInfo(Ptr)) {
+          OS << "* Stride for " << *Ptr << "\n";
+          OS << "  - "
+             << (Info->mayDiverge()
+                     ? "divergent"
+                     : (Info->hasStride()
+                            ? "linear"
+                            : (Info->isUniform() ? "uniform" : "unknown")));
+          if (Info->isStrideConstantInt()) {
+            OS << " stride of " << Info->getStrideAsConstantInt();
+          }
+          OS << "\n";
+        }
+      }
+    }
+  }
+  return PreservedAnalyses::all();
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/stride_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/stride_analysis.h
index 5dc1c64676d08..ec30ae43729b2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/stride_analysis.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/stride_analysis.h
@@ -123,6 +123,19 @@ class StrideAnalysis : public llvm::AnalysisInfoMixin<StrideAnalysis> {
   static llvm::AnalysisKey Key;
 };
 
+/// @brief Helper pass to print out the contents of the StrideAnalysis
+/// analysis.
+class StrideAnalysisPrinterPass
+    : public llvm::PassInfoMixin<StrideAnalysisPrinterPass> {
+  llvm::raw_ostream &OS;
+
+ public:
+  explicit StrideAnalysisPrinterPass(llvm::raw_ostream &OS) : OS(OS) {}
+
+  llvm::PreservedAnalyses run(llvm::Function &F,
+                              llvm::FunctionAnalysisManager &AM);
+};
+
 }  // namespace vecz
 
 #endif  // VECZ_ANALYSIS_STRIDE_ANALYSIS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/passes.def b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/passes.def
index 4afe6cd9993e3..9b418f773d355 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/passes.def
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/passes.def
@@ -41,6 +41,8 @@ FUNCTION_PASS("packetizer", PacketizationPass())
 FUNCTION_PASS("inline-post-vecz", InlinePostVectorizationPass())
 FUNCTION_PASS("interleave-combine-loads", InterleavedGroupCombinePass(eInterleavedLoad))
 FUNCTION_PASS("interleave-combine-stores", InterleavedGroupCombinePass(eInterleavedStore))
+
+FUNCTION_PASS("print<strides>", StrideAnalysisPrinterPass(llvm::dbgs()))
 #undef FUNCTION_PASS
 
 #ifndef LOOP_PASS
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_analysis.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_analysis.ll
new file mode 100644
index 0000000000000..f395e0317de82
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_analysis.ll
@@ -0,0 +1,109 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -w 4 -vecz-passes="print<strides>" -S < %s -o /dev/null 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; CHECK: StrideAnalysis for function '__vecz_v4_foo':
+define spir_kernel void @foo(ptr addrspace(1) align 1 %input) {
+entry:
+  %localid0 = tail call i64 @__mux_get_local_id(i32 0)
+  %localsize0 = tail call i64 @__mux_get_local_size(i32 0)
+  %groupid0 = tail call i64 @__mux_get_group_id(i32 0)
+  %globalid0 = tail call i64 @__mux_get_global_id(i32 0)
+
+; CHECK: Stride for ptr addrspace(1) %input
+; CHECK-NEXT: uniform
+  %lduniform = load i8, ptr addrspace(1) %input, align 1
+
+; CHECK: Stride for %arrayidx0 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %globalid0
+; CHECK-NEXT: linear stride of 1
+  %arrayidx0 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %globalid0
+  %ld0 = load i8, ptr addrspace(1) %arrayidx0, align 1
+
+  %truncglobalid0 = trunc i64 %globalid0 to i32
+
+; CHECK: Stride for %arrayidx1 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %sexttruncglobalid0
+; CHECK-NEXT: linear stride of 1
+  %sexttruncglobalid0 = sext i32 %truncglobalid0 to i64
+  %arrayidx1 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %sexttruncglobalid0
+  %ld1 = load i8, ptr addrspace(1) %arrayidx1, align 1
+
+; CHECK: Stride for %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %zexttruncglobalid0
+; CHECK-NEXT: divergent
+  %zexttruncglobalid0 = zext i32 %truncglobalid0 to i64
+  %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %zexttruncglobalid0
+  %ld2 = load i8, ptr addrspace(1) %arrayidx2, align 1
+
+; CHECK: Stride for %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %input, i64 %globalid0
+; CHECK-NEXT: linear stride of 4
+  %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %input, i64 %globalid0
+  %ld3 = load i8, ptr addrspace(1) %arrayidx3, align 1
+
+; CHECK: Stride for %arrayidx4 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %globalid0mul8
+; CHECK-NEXT: linear stride of 8
+  %globalid0mul8 = mul i64 %globalid0, 8
+  %arrayidx4 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %globalid0mul8
+  %ld4 = load i8, ptr addrspace(1) %arrayidx4, align 1
+
+; CHECK: Stride for %arrayidx5 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %globalid0mul16
+; CHECK-NEXT: linear stride of 16
+  %globalid0mul16 = mul i64 %globalid0mul8, 2
+  %arrayidx5 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %globalid0mul16
+  %ld5 = load i8, ptr addrspace(1) %arrayidx5, align 1
+
+; CHECK: Stride for %arrayidx6 = getelementptr inbounds i32, ptr addrspace(1) %input, i64 %globalid0mul8
+; CHECK-NEXT: linear stride of 32
+  %arrayidx6 = getelementptr inbounds i32, ptr addrspace(1) %input, i64 %globalid0mul8
+  %ld6 = load i32, ptr addrspace(1) %arrayidx6, align 1
+
+; CHECK: Stride for %arrayidx7 = getelementptr inbounds i16, ptr addrspace(1) %input, i64 %idxprom7
+; CHECK-NEXT: linear stride of 2
+  %mul7 = mul i64 %localsize0, %groupid0
+  %add7 = add i64 %mul7, %localid0
+  %trunc7 = trunc i64 %add7 to i32
+  %conv7 = add i32 %trunc7, -1
+  %idxprom7 = sext i32 %conv7 to i64
+  %arrayidx7 = getelementptr inbounds i16, ptr addrspace(1) %input, i64 %idxprom7
+  %ld7 = load i16, ptr addrspace(1) %arrayidx7, align 1
+
+; CHECK: Stride for %arrayidx8 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %idxprom8
+; CHECK-NEXT: divergent
+  %mul8 = mul i64 %localsize0, %groupid0
+  %add8 = add i64 %mul8, %localid0
+  %trunc8 = trunc i64 %add8 to i32
+  %conv8 = add i32 %trunc8, -1
+  %idxprom8 = zext i32 %conv8 to i64
+  %arrayidx8 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %idxprom8
+  %ld8 = load i8, ptr addrspace(1) %arrayidx8, align 1
+
+; CHECK: Stride for %arrayidx9 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %idxprom9
+; CHECK-NEXT: divergent
+  %mul9 = mul i64 %groupid0, %localsize0
+  %add9 = add nuw nsw i64 %localid0, 4294967295
+  %conv9 = add i64 %add9, %mul9
+  %idxprom9 = and i64 %conv9, 4294967295
+  %arrayidx9 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %idxprom9
+  %ld9 = load i8, ptr addrspace(1) %arrayidx9, align 1
+
+  ret void
+}
+
+declare i64 @__mux_get_local_id(i32)
+declare i64 @__mux_get_local_size(i32)
+declare i64 @__mux_get_group_id(i32)
+declare i64 @__mux_get_global_id(i32)

From e2ecdab2e85fbb561488c41e339c0149d96e0ce9 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Mon, 19 Feb 2024 12:14:01 +0000
Subject: [PATCH 096/182] [vecz][NFC] Fix a couple of doxygen issues in
 OffsetInfo

---
 .../compiler_passes/vecz/source/include/offset_info.h     | 7 ++++---
 .../compiler_passes/vecz/source/offset_info.cpp           | 8 ++++----
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/offset_info.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/offset_info.h
index 32d48a351988d..a55230b7b2ec5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/offset_info.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/offset_info.h
@@ -75,7 +75,8 @@ struct OffsetInfo {
   uint64_t BitMask;
 
   /// @brief Construct a new offset information object from a general value
-  /// @param[in] B The StrideAnalysisResult used to retrieve other OffsetInfos.
+  /// @param[in] SAR The StrideAnalysisResult used to retrieve other
+  /// OffsetInfos.
   /// @param[in] V Offset value to analyze.
   OffsetInfo(StrideAnalysisResult &SAR, llvm::Value *V);
 
@@ -119,7 +120,7 @@ struct OffsetInfo {
   /// @brief Convert the bytewise stride into an element-wise stride based on
   /// the data type and data layout, as an integer.
   ///
-  /// @param[in] PtrTy The element data type.
+  /// @param[in] PtrEleTy The element data type.
   /// @param[in] DL The Data Layout.
   /// @return The memory stride as number of elements.
 
@@ -131,7 +132,7 @@ struct OffsetInfo {
   /// that the stride must be manifest first.
   ///
   /// @param[in] B an IRBuilder used for creating constants or instructions.
-  /// @param[in] PtrTy The element data type.
+  /// @param[in] PtrEleTy The element data type.
   /// @param[in] DL The Data Layout.
   /// @return The memory stride as number of elements.
   llvm::Value *buildMemoryStride(llvm::IRBuilder<> &B, llvm::Type *PtrEleTy,
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp
index afe3c31d30a9f..45ee0a3122b4b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp
@@ -329,10 +329,10 @@ OffsetInfo &OffsetInfo::analyze(Value *Offset, StrideAnalysisResult &SAR) {
         return setKind(eOffsetUniformVariable);
       case compiler::utils::eBuiltinUniformityInstanceID:
         if (Builtin.properties & compiler::utils::eBuiltinPropertyLocalID) {
-          // If the local size is unknown (represented by zero), the
-          // resulting mask will be ~0ULL (all ones). Potentially, it is
-          // possible to use the CL_​DEVICE_​MAX_​WORK_​ITEM_​SIZES
-          // property as an upper bound in this case.
+          // If the local size is unknown (represented by zero), the resulting
+          // mask will be ~0ULL (all ones). Potentially, it is possible to use
+          // the CL_DEVICE_MAX_WORK_ITEM_SIZES property as an upper bound in
+          // this case.
           uint64_t LocalBitMask = SAR.UVR.VU.getLocalSize() - 1;
           LocalBitMask |= LocalBitMask >> 32;
           LocalBitMask |= LocalBitMask >> 16;

From dfe08c6547329d16bc1af7f374094032fb0e814c Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Mon, 19 Feb 2024 16:08:51 +0000
Subject: [PATCH 097/182] [vecz] Fix dropped value analysis in OffsetInfo

There were several cases in the OffsetInfo analysis where the OffsetInfo
was told to copy the stride information from another value (typically
one of the instruction's source operands) but the bitmask was not
updated. This meant that the value analysis - which works backwards from
the leaf instruction - would "stop" at that point, and become incorrect
as it would not take into account the range of values that the source
may have.

This was problematic because the analysis would come to the conclusion
that a value has a safe range of values and a valid stride, when in fact
it was truncated from a larger value before being zero-extended, and
thus divergent.

This was found through a segfault/invalid access in one of ArrayFire's
CannyEdgeDetector kernels.

Oftentimes we want to copy the bitmask *and* stride information over, so
a convenience function has been introduced. Other times we need to be
careful to update the bitmask correctly before copying the stride
information.
---
 .../vecz/source/include/offset_info.h         |  6 ++
 .../vecz/source/offset_info.cpp               | 29 +++++---
 .../vecz/test/lit/llvm/stride_analysis.ll     | 71 ++++++++++++++++++-
 3 files changed, 95 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/offset_info.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/offset_info.h
index a55230b7b2ec5..c6d55e351145e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/offset_info.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/offset_info.h
@@ -255,6 +255,12 @@ struct OffsetInfo {
   /// @param[in] Other the other OffsetInfo to copy from
   /// @return Reference to the current object for chaining.
   OffsetInfo &copyStrideFrom(const OffsetInfo &Other);
+
+  /// @brief Copies the stride and bitmask information from another OffsetInfo
+  /// into this one
+  /// @param[in] Other the other OffsetInfo to copy from
+  /// @return Reference to the current object for chaining.
+  OffsetInfo &copyStrideAndBitMaskFrom(const OffsetInfo &Other);
 };
 
 }  // namespace vecz
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp
index 45ee0a3122b4b..245bad2aa03d7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp
@@ -275,6 +275,8 @@ OffsetInfo &OffsetInfo::analyze(Value *Offset, StrideAnalysisResult &SAR) {
     const auto RHS = SAR.analyze(Select->getOperand(2));
     if (LHS.hasStride() && RHS.hasStride() && LHS.StrideInt == RHS.StrideInt &&
         LHS.isStrideConstantInt()) {
+      // Merge the bitmasks from either source - we are selecting one of them.
+      BitMask = LHS.BitMask | RHS.BitMask;
       return copyStrideFrom(LHS);
     }
     return setMayDiverge();
@@ -282,13 +284,13 @@ OffsetInfo &OffsetInfo::analyze(Value *Offset, StrideAnalysisResult &SAR) {
 
   if (auto *Phi = dyn_cast<PHINode>(Offset)) {
     if (auto *const CVal = Phi->hasConstantValue()) {
-      return copyStrideFrom(SAR.analyze(CVal));
+      return copyStrideAndBitMaskFrom(SAR.analyze(CVal));
     }
 
     auto NumIncoming = Phi->getNumIncomingValues();
     if (NumIncoming == 1) {
       // LCSSA Phi, just go right through it..
-      return copyStrideFrom(SAR.analyze(Phi->getIncomingValue(0)));
+      return copyStrideAndBitMaskFrom(SAR.analyze(Phi->getIncomingValue(0)));
     } else if (NumIncoming == 2) {
       auto identifyIncrement = [&](Value *incoming) -> bool {
         if (auto *BOp = dyn_cast<BinaryOperator>(incoming)) {
@@ -306,9 +308,9 @@ OffsetInfo &OffsetInfo::analyze(Value *Offset, StrideAnalysisResult &SAR) {
 
       // Try the PHI node's incoming values both ways round.
       if (identifyIncrement(Phi->getIncomingValue(1))) {
-        return copyStrideFrom(SAR.analyze(Phi->getIncomingValue(0)));
+        return copyStrideAndBitMaskFrom(SAR.analyze(Phi->getIncomingValue(0)));
       } else if (identifyIncrement(Phi->getIncomingValue(0))) {
-        return copyStrideFrom(SAR.analyze(Phi->getIncomingValue(1)));
+        return copyStrideAndBitMaskFrom(SAR.analyze(Phi->getIncomingValue(1)));
       }
     }
     return setMayDiverge();
@@ -351,11 +353,11 @@ OffsetInfo &OffsetInfo::analyze(Value *Offset, StrideAnalysisResult &SAR) {
 
 OffsetInfo &OffsetInfo::analyzePtr(Value *Address, StrideAnalysisResult &SAR) {
   if (BitCastInst *BCast = dyn_cast<BitCastInst>(Address)) {
-    return copyStrideFrom(SAR.analyze(BCast->getOperand(0)));
+    return copyStrideAndBitMaskFrom(SAR.analyze(BCast->getOperand(0)));
   } else if (auto *ASCast = dyn_cast<AddrSpaceCastInst>(Address)) {
-    return copyStrideFrom(SAR.analyze(ASCast->getOperand(0)));
+    return copyStrideAndBitMaskFrom(SAR.analyze(ASCast->getOperand(0)));
   } else if (auto *IntPtr = dyn_cast<IntToPtrInst>(Address)) {
-    return copyStrideFrom(SAR.analyze(IntPtr->getOperand(0)));
+    return copyStrideAndBitMaskFrom(SAR.analyze(IntPtr->getOperand(0)));
   } else if (auto *Arg = dyn_cast<Argument>(Address)) {
     // 'Pointer return' arguments should be treated as having an implicit ItemID
     // offset. This allows memory operations to be packetized instead of
@@ -395,7 +397,7 @@ OffsetInfo &OffsetInfo::analyzePtr(Value *Address, StrideAnalysisResult &SAR) {
     // the IRBuilder insert point, we might not even be able to build the
     // offset expression instructions there.
     if (auto *const CVal = Phi->hasConstantValue()) {
-      return copyStrideFrom(SAR.analyze(CVal));
+      return copyStrideAndBitMaskFrom(SAR.analyze(CVal));
     }
 
     // In the simple case of a loop-incremented pointer using a GEP, we can
@@ -416,7 +418,7 @@ OffsetInfo &OffsetInfo::analyzePtr(Value *Address, StrideAnalysisResult &SAR) {
             return setMayDiverge();
           }
         }
-        return copyStrideFrom(SAR.analyze(Phi->getIncomingValue(0)));
+        return copyStrideAndBitMaskFrom(SAR.analyze(Phi->getIncomingValue(0)));
       }
     } else if (auto *const GEP =
                    dyn_cast<GetElementPtrInst>(Phi->getIncomingValue(0))) {
@@ -428,7 +430,7 @@ OffsetInfo &OffsetInfo::analyzePtr(Value *Address, StrideAnalysisResult &SAR) {
             return setMayDiverge();
           }
         }
-        return copyStrideFrom(SAR.analyze(Phi->getIncomingValue(1)));
+        return copyStrideAndBitMaskFrom(SAR.analyze(Phi->getIncomingValue(1)));
       }
     }
 
@@ -513,6 +515,8 @@ OffsetInfo &OffsetInfo::analyzePtr(Value *Address, StrideAnalysisResult &SAR) {
     // constant stride, the result will also have the same constant stride.
     if (LHS.hasStride() && RHS.hasStride() && LHS.StrideInt == RHS.StrideInt &&
         LHS.isStrideConstantInt()) {
+      // Merge the bitmasks from either source - we are selecting one of them.
+      BitMask = LHS.BitMask | RHS.BitMask;
       return copyStrideFrom(LHS);
     }
     return setMayDiverge();
@@ -1058,3 +1062,8 @@ OffsetInfo &OffsetInfo::copyStrideFrom(const OffsetInfo &Other) {
   ManifestStride = Other.ManifestStride;
   return *this;
 }
+
+OffsetInfo &OffsetInfo::copyStrideAndBitMaskFrom(const OffsetInfo &Other) {
+  BitMask = Other.BitMask;
+  return copyStrideFrom(Other);
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_analysis.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_analysis.ll
index f395e0317de82..5e1cf09efa858 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_analysis.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_analysis.ll
@@ -18,7 +18,7 @@
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-; CHECK: StrideAnalysis for function '__vecz_v4_foo':
+; CHECK-LABEL: StrideAnalysis for function '__vecz_v4_foo':
 define spir_kernel void @foo(ptr addrspace(1) align 1 %input) {
 entry:
   %localid0 = tail call i64 @__mux_get_local_id(i32 0)
@@ -103,6 +103,75 @@ entry:
   ret void
 }
 
+; CHECK-LABEL: StrideAnalysis for function '__vecz_v4_canny_regression':
+define spir_kernel void @canny_regression(ptr addrspace(1) align 1 %input) {
+entry:
+  %groupid0 = tail call i64 @__mux_get_group_id(i32 0)
+  %localid0 = tail call i64 @__mux_get_local_id(i32 0)
+  %localsize0 = tail call i64 @__mux_get_local_size(i32 0)
+  %mul = mul i64 %groupid0, %localsize0
+  %add = add i64 %mul, %localid0
+  %0 = trunc i64 %add to i32
+  %conv = add i32 %0, -1
+  %trunclocalsize0 = trunc i64 %localsize0 to i32
+
+; CHECK: Stride for %arrayidx_pre = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %idxprom_pre
+; CHECK-NEXT: divergent
+  %idxprom_pre = zext i32 %conv to i64
+  %arrayidx_pre = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %idxprom_pre
+  %ld_pre = load i8, ptr addrspace(1) %arrayidx_pre, align 1
+
+  br label %for.body
+
+for.body:
+; The below is fundamentally the same stride calculation as %arrayidx_pre -
+; make sure the loop and the PHI don't throw off the analysis.
+; CHECK: Stride for %arrayidx_loop = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %idxprom_loop
+; CHECK-NEXT: divergent
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %gx2.050.us = phi i32 [ %conv, %entry ], [ %conv26.us, %for.body ]
+  %idxprom_loop = zext i32 %gx2.050.us to i64
+  %arrayidx_loop = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %idxprom_loop
+
+  %ld_loop = load i8, ptr addrspace(1) %arrayidx_loop, align 1
+
+  %conv26.us = add i32 %gx2.050.us, %trunclocalsize0
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exit_cond = icmp ult i64 %iv.next, 2
+  br i1 %exit_cond, label %for.body, label %exit
+
+exit:
+  ret void
+}
+
+; CHECK-LABEL: StrideAnalysis for function '__vecz_v4_select_regression':
+define spir_kernel void @select_regression(ptr addrspace(1) align 1 %input, i1 %cmp) {
+entry:
+  %groupid0 = tail call i64 @__mux_get_group_id(i32 0)
+  %localid0 = tail call i64 @__mux_get_local_id(i32 0)
+  %localsize0 = tail call i64 @__mux_get_local_size(i32 0)
+  %mul = mul i64 %groupid0, %localsize0
+  %add = add i64 %mul, %localid0
+  %addtrunc = trunc i64 %add to i32
+
+; CHECK: Stride for %arrayidx0 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %idxprom0
+; CHECK-NEXT: divergent
+  %idxprom0 = zext i32 %addtrunc to i64
+  %arrayidx0 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %idxprom0
+  %ld0 = load i8, ptr addrspace(1) %arrayidx0, align 1
+
+; The below is fundamentally the same stride calculation as %arrayidx0 - make
+; sure the select doesn't throw off the analysis.
+; CHECK: Stride for %arrayidx1 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %idxprom1
+; CHECK-NEXT: divergent
+  %sel1 = select i1 %cmp, i32 %addtrunc, i32 %addtrunc
+  %idxprom1 = zext i32 %sel1 to i64
+  %arrayidx1 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %idxprom1
+  %ld1 = load i8, ptr addrspace(1) %arrayidx1, align 1
+
+  ret void
+}
+
 declare i64 @__mux_get_local_id(i32)
 declare i64 @__mux_get_local_size(i32)
 declare i64 @__mux_get_group_id(i32)

From c421b7ce5c08d56bbb8bf4b1ebd58e427dfc3b51 Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Thu, 22 Feb 2024 12:32:50 +0000
Subject: [PATCH 098/182] [NFC] buildAfter: return IRBuilder<>.

In LLVM 19, we will no longer be able to use instruction pointers to
keep track of insertion points. This change prepares for that by making
a helper function that is only ever used to construct an IRBuilder<>
return an IRBuilder<> directly.
---
 .../include/transform/packetization_helpers.h    | 16 ++++++++--------
 .../source/transform/packetization_helpers.cpp   |  6 +++---
 .../vecz/source/transform/packetizer.cpp         |  8 +++++++-
 3 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_helpers.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_helpers.h
index c5da96058e219..53141575a4280 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_helpers.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_helpers.h
@@ -41,11 +41,11 @@ namespace vecz {
 class TargetInfo;
 struct SimdPacket;
 
-/// @brief Provides the insertion point after the value V. Intended to be used
-/// in IRBuilder constructor. If V has a position in the function, (e.g., an
-/// Instruction), this method will return the next point after that. If V has
-/// no position (e.g., a Constant or an Argument) then this method will return
-/// a suitable insertion point at the beginning of the function.
+/// @brief Determines the insertion point after the value V. If V has a position
+/// in the function, (e.g., an Instruction), this method will return an
+/// IRBuilder set to the next point after that. If V has no position (e.g., a
+/// Constant or an Argument) then this method will return an IRBuilder set to a
+/// suitable insertion point at the beginning of the function.
 ///
 /// @param[in] V Value to insert instructions after, if an llvm::Instruction.
 /// @param[in] F Function to insert instructions into, if V is not an
@@ -53,9 +53,9 @@ struct SimdPacket;
 /// @param[in] IsPhi true if the instructions to insert are phis, false if the
 /// insertion point should be after all phis in the basic block.
 ///
-/// @return Insertion Point.
-llvm::Instruction *buildAfter(llvm::Value *V, llvm::Function &F,
-                              bool IsPhi = false);
+/// @return IRBuilder set to a suitable insertion point.
+llvm::IRBuilder<> buildAfter(llvm::Value *V, llvm::Function &F,
+                             bool IsPhi = false);
 
 /// @brief Utility function for building a shufflevector instruction, absorbing
 /// its operands where possible.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
index 6c7411ef6e78e..c48132c257184 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
@@ -99,7 +99,7 @@ Value *createFixedBroadcastOfScalableVector(const vecz::TargetInfo &TI,
 }  // namespace
 
 namespace vecz {
-Instruction *buildAfter(Value *V, Function &F, bool IsPhi) {
+IRBuilder<> buildAfter(Value *V, Function &F, bool IsPhi) {
   if (auto *const I = dyn_cast<Instruction>(V)) {
     BasicBlock::iterator Next = I->getIterator();
     const BasicBlock::iterator End = Next->getParent()->end();
@@ -107,14 +107,14 @@ Instruction *buildAfter(Value *V, Function &F, bool IsPhi) {
       ++Next;
     } while (!IsPhi && (Next != End) &&
              (isa<PHINode>(Next) || isa<AllocaInst>(Next)));
-    return &*Next;
+    return {I->getParent(), Next};
   }
   // Else find the first point in the function after any allocas.
   auto it = F.getEntryBlock().begin();
   while (isa<AllocaInst>(*it)) {
     ++it;
   }
-  return &*it;
+  return {&F.getEntryBlock(), it};
 }
 
 Constant *getShuffleMask(ShuffleVectorInst *shuffle) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
index 23e2674676870..287e51508fd2e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -1935,7 +1935,13 @@ Value *Packetizer::Impl::packetizeMaskVarying(Instruction *I) {
     // pressure, and to make it easier for CSE/GVN to combine them if there
     // are multiple uses of the same value (we could cache these?)
     auto *maskInst = dyn_cast<Instruction>(vecMask);
-    IRBuilder<> B(maskInst ? buildAfter(maskInst, F) : I);
+    IRBuilder<> B = [&] {
+      if (maskInst) {
+        return buildAfter(maskInst, F);
+      } else {
+        return IRBuilder<>(I);
+      }
+    }();
 
     Value *anyOfMask =
         createMaybeVPTargetReduction(B, TTI, vecMask, RecurKind::Or, VL);

From 9e84524bd5e52212f3845f3ae63c84783cfb7d41 Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Thu, 22 Feb 2024 15:22:41 +0000
Subject: [PATCH 099/182] Keep debug information.

DeleteDebugInfoInstructions has a comment explaining that it intends to
remove invalid debug info instructions, but the ones that it is removing
seem perfectly valid. Given that Fraser recently removed other code that
generated invalid debug info instructions, hopefully this means we no
longer have invalid debug info that we need to clean up.
---
 .../vecz/source/ir_cleanup.cpp                | 48 -------------------
 .../test/lit/llvm/packetization_debug_info.ll | 10 ++--
 .../vecz/test/lit/llvm/undef_debug_info.ll    |  4 --
 3 files changed, 4 insertions(+), 58 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/ir_cleanup.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/ir_cleanup.cpp
index 3519c5b506897..b8bcf24c46c0a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/ir_cleanup.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/ir_cleanup.cpp
@@ -75,44 +75,6 @@ bool AreUsersDead(Instruction *I,
   return true;
 }
 
-/// @brief Mark any invalid debug intrinsics in the DbgUsers list for
-/// deletion. When an Instruction is deleted, its debug uses change to undef
-/// or an empty MDNode. In this case we add it in the 'to delete' list.
-///
-/// @param[in] DbgUsers Debug Intrinsic Instructions.
-/// @param[in,out] WorkList Newly detected Instructions marked for deletion.
-///
-/// @return void
-void DeleteDebugInfoInstructions(
-    const SmallVectorImpl<DbgVariableIntrinsic *> &DbgUsers,
-    SmallPtrSetImpl<Instruction *> &WorkList) {
-  for (llvm::DbgVariableIntrinsic *DII : DbgUsers) {
-    Value *Op = DII->getOperand(0);
-    // The first operand must be a non-null variable location argument.
-    if (Op) {
-      auto *MD = cast<MetadataAsValue>(Op)->getMetadata();
-
-      // Check the variable location is not an undef.
-      if (auto *V = dyn_cast<ValueAsMetadata>(MD)) {
-        Value *Var = V->getValue();
-        if (Var && !isa<UndefValue>(Var)) {
-          continue;
-        }
-      }
-
-      // Check the variable doesn't point to an empty MDNode.
-      if (auto *mdNode = dyn_cast<MDNode>(MD)) {
-        if (mdNode->getNumOperands() > 0) {
-          continue;
-        }
-      }
-    }
-
-    // Mark the Debug Info Intrinsic for deletion.
-    WorkList.insert(DII);
-  }
-}
-
 }  // namespace
 
 void IRCleanup::deleteInstructionLater(llvm::Instruction *I) {
@@ -124,23 +86,13 @@ void IRCleanup::deleteInstructionLater(llvm::Instruction *I) {
 void IRCleanup::deleteInstructions() {
   SmallPtrSet<Instruction *, 16> WorkList;
   SmallPtrSet<Instruction *, 16> VisitedForCycles;
-  SmallVector<DbgVariableIntrinsic *, 1> DbgUsers;
   bool progress = true;
   while (progress && !InstructionsToDelete.empty()) {
     progress = false;
     for (Instruction *I : InstructionsToDelete) {
       WorkList.erase(I);
       if (I->use_empty()) {
-        // Before we delete the current instruction we save its debug users, to
-        // check for potential loss of debug information after the removal of I.
-        findDbgUsers(DbgUsers, I);
         I->eraseFromParent();
-        // After we delete the instruction, its debug uses (if any) may become
-        // useless as a result of a loss of debug info. where the value of one
-        // or more source variables becomes unavailable, so at this point we
-        // will identify and delete those debug info instructions.
-        DeleteDebugInfoInstructions(DbgUsers, WorkList);
-        DbgUsers.clear();
         progress = true;
       } else if (PHINode *Phi = dyn_cast<PHINode>(I)) {
         if (AreUsersDead(Phi, InstructionsToDelete, WorkList,
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_debug_info.ll
index bcd40ab98077f..0dc600e899bc1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_debug_info.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_debug_info.ll
@@ -51,18 +51,16 @@ entry:
   call void @llvm.dbg.declare(metadata i64* %tid, metadata !14, metadata !29), !dbg !31
   %call = call i64 @__mux_get_global_id(i32 0) #3, !dbg !31
   store i64 %call, i64* %tid, align 8, !dbg !31
-; FIXME: We're dropping the llvm.dbg.declare/llvm.dbg.value for %a here - we
-; could probably preserve it.
-; CHECK-NOT: call void @llvm.dbg.value(
+; CHECK: call void @llvm.dbg.value(metadata i32 undef, metadata [[DI_A:![0-9]+]], metadata !DIExpression())
+; CHECK-SAME: !dbg [[A_LOC:![0-9]+]]
   call void @llvm.dbg.declare(metadata i32* %a, metadata !19, metadata !29), !dbg !32
   %0 = load i64, i64* %tid, align 8, !dbg !32
   %1 = load i32 addrspace(1)*, i32 addrspace(1)** %in1.addr, align 8, !dbg !32
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %1, i64 %0, !dbg !32
   %2 = load i32, i32 addrspace(1)* %arrayidx, align 4, !dbg !32
   store i32 %2, i32* %a, align 4, !dbg !32
-; FIXME: We're dropping the llvm.dbg.declare/llvm.dbg.value for %a here - we
-; could probably preserve it.
-; CHECK-NOT: call void @llvm.dbg.value(
+; CHECK: call void @llvm.dbg.value(metadata i32 undef, metadata [[DI_B:![0-9]+]], metadata !DIExpression())
+; CHECK-SAME: !dbg [[B_LOC:![0-9]+]]
   call void @llvm.dbg.declare(metadata i32* %b, metadata !20, metadata !29), !dbg !33
   %3 = load i64, i64* %tid, align 8, !dbg !33
   %4 = load i32 addrspace(1)*, i32 addrspace(1)** %in2.addr, align 8, !dbg !33
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/undef_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/undef_debug_info.ll
index eae533dab66c1..523a70fde3913 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/undef_debug_info.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/undef_debug_info.ll
@@ -114,7 +114,3 @@ attributes #3 = { nobuiltin }
 
 ; Vectorized kernel function
 ; CHECK: @__vecz_v[[WIDTH:[0-9]+]]_test_fn({{.*}} !dbg {{![0-9]+}}
-
-; Check that there is no intrinsics using undefs
-; CHECK-NOT: call void @llvm.dbg.value(metadata {{.*}} undef
-; CHECK-NOT: call void @llvm.dbg.declare(metadata {{.*}} undef

From 42b425a8f003434aad6162c6ff5236ab06acca44 Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Mon, 26 Feb 2024 14:10:31 +0000
Subject: [PATCH 100/182] Update for LLVM 19.

* When using the new debug info format, make sure to actually use the
  new debug info format by converting the builtins module after loading.
* When checking debug info, make sure to check both the old and the new
  debug info formats.
* When inserting debug info, make sure to insert in the appropriate
  format.
* When inserting PHIs, ensure they are inserted before any debug info.
  We are also inconsistent in whether we try to insert PHIs before or
  after existing PHIs; this commit tries to preserve the existing
  insertion points as much as possible.
* In add.cl, do not check that no vsetvli is generated. Even if we do
  not generate it, LLVM 19 generates it itself. This is fine.
---
 .../include/multi_llvm/basicblock_helper.h    | 41 ++++++++++
 .../vecz/source/control_flow_boscc.cpp        | 14 ++--
 .../control_flow_conversion_pass.cpp          | 33 ++++----
 .../transform/packetization_helpers.cpp       |  6 ++
 .../vecz/source/transform/scalarizer.cpp      | 75 +++++++++++++------
 5 files changed, 125 insertions(+), 44 deletions(-)
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/basicblock_helper.h

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/basicblock_helper.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/basicblock_helper.h
new file mode 100644
index 0000000000000..e4175a27ebb3a
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/basicblock_helper.h
@@ -0,0 +1,41 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#ifndef MULTI_LLVM_BASICBLOCK_HELPER_H_INCLUDED
+#define MULTI_LLVM_BASICBLOCK_HELPER_H_INCLUDED
+
+#include <llvm/IR/BasicBlock.h>
+#include <multi_llvm/llvm_version.h>
+
+namespace multi_llvm {
+inline void insertBefore(llvm::Instruction *const I,
+                         const llvm::BasicBlock::iterator InsertPos) {
+#if LLVM_VERSION_GREATER_EQUAL(18, 0)
+  I->insertBefore(InsertPos);
+#else
+  I->insertBefore(&*InsertPos);
+#endif
+}
+
+inline llvm::BasicBlock::iterator getFirstNonPHIIt(llvm::BasicBlock *const BB) {
+#if LLVM_VERSION_GREATER_EQUAL(18, 0)
+  return BB->getFirstNonPHIIt();
+#else
+  return BB->getFirstNonPHI()->getIterator();
+#endif
+}
+}  // namespace multi_llvm
+
+#endif  // MULTI_LLVM_BASICBLOCK_HELPER_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_boscc.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_boscc.cpp
index a02fca4dac328..f36979e9c59b4 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_boscc.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_boscc.cpp
@@ -23,7 +23,7 @@
 #include <llvm/Support/Debug.h>
 #include <llvm/Support/raw_ostream.h>
 #include <llvm/Transforms/Utils/Cloning.h>
-#include <multi_llvm/multi_llvm.h>
+#include <multi_llvm/basicblock_helper.h>
 
 #include <numeric>
 #include <queue>
@@ -1023,8 +1023,8 @@ bool ControlFlowConversionState::BOSCCGadget::blendFinalize() {
                         << blendPoint->getName() << "\n");
 
       PHINode *blend = PHINode::Create(liveIn->getType(), 2,
-                                       liveIn->getName() + ".boscc_blend",
-                                       &blendPoint->front());
+                                       liveIn->getName() + ".boscc_blend");
+      multi_llvm::insertBefore(blend, blendPoint->begin());
       bool replaceUniform = false;
       bool replacePredicate = false;
       // For each predecessor, if it can reach the instruction, set the
@@ -1093,8 +1093,8 @@ bool ControlFlowConversionState::BOSCCGadget::blendFinalize() {
                        << " in " << target->getName() << "\n");
 
             PHINode *blend = PHINode::Create(
-                incoming->getType(), 1, incoming->getName() + ".boscc_lcssa",
-                &target->front());
+                incoming->getType(), 1, incoming->getName() + ".boscc_lcssa");
+            multi_llvm::insertBefore(blend, target->begin());
             blend->addIncoming(incoming, runtimeCheckerBlock);
             PHI->setIncomingValue(idx, blend);
           }
@@ -1216,8 +1216,8 @@ bool ControlFlowConversionState::BOSCCGadget::updateLoopBlendValues(
     LoopTag *LTag, Instruction *from, Instruction *to) {
   auto createLatchIncoming = [&from, &LTag, this] {
     auto *ret =
-        PHINode::Create(from->getType(), 2, from->getName() + ".boscc_blend",
-                        &LTag->latch->front());
+        PHINode::Create(from->getType(), 2, from->getName() + ".boscc_blend");
+    multi_llvm::insertBefore(ret, LTag->latch->begin());
     Value *uniform = getUniformV(from);
     Value *default_val = getDefaultValue(from->getType());
     for (BasicBlock *pred : predecessors(LTag->latch)) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
index b8fe270bb0d0c..ed1a5f9408d4a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
@@ -38,6 +38,7 @@
 #include <llvm/Support/Error.h>
 #include <llvm/Support/TypeSize.h>
 #include <llvm/Support/raw_ostream.h>
+#include <multi_llvm/basicblock_helper.h>
 
 #include <queue>
 #include <utility>
@@ -715,8 +716,8 @@ bool ControlFlowConversionState::Impl::createEntryMasks(BasicBlock &BB) {
     VECZ_ERROR_IF(!preheader, "BasicBlock tag is not defined");
 
     if (LTag->isLoopDivergent()) {
-      PHINode *PHI =
-          PHINode::Create(maskTy, 2, BB.getName() + ".entry_mask", &BB.front());
+      PHINode *PHI = PHINode::Create(maskTy, 2, BB.getName() + ".entry_mask");
+      multi_llvm::insertBefore(PHI, BB.begin());
       PHI->addIncoming(MaskInfos[preheader].exitMasks[&BB], preheader);
       maskInfo.entryMask = PHI;
       LLVM_DEBUG(dbgs() << "Loop divergent loop header " << BB.getName()
@@ -766,8 +767,9 @@ bool ControlFlowConversionState::Impl::createEntryMasks(BasicBlock &BB) {
     }
   } else {
     // A phi function of the predecessors otherwise.
-    PHINode *PHI = PHINode::Create(maskTy, numPreds,
-                                   BB.getName() + ".entry_mask", &BB.front());
+    PHINode *PHI =
+        PHINode::Create(maskTy, numPreds, BB.getName() + ".entry_mask");
+    multi_llvm::insertBefore(PHI, BB.begin());
     for (auto it = pred_begin(&BB); it != pred_end(&BB); ++it) {
       PHI->addIncoming(MaskInfos[*it].exitMasks[&BB], *it);
     }
@@ -948,8 +950,9 @@ bool ControlFlowConversionState::Impl::createLoopExitMasks(LoopTag &LTag) {
       // The value of the exit mask of a divergent loop is a phi function
       // between the mask update and the loop exit mask phi.
       auto *const exitMask =
-          PHINode::Create(maskTy, 2, exitBlock->getName() + ".loop_exit_mask",
-                          LTag.header->getFirstNonPHI());
+          PHINode::Create(maskTy, 2, exitBlock->getName() + ".loop_exit_mask");
+      multi_llvm::insertBefore(exitMask,
+                               multi_llvm::getFirstNonPHIIt(LTag.header));
       LMask.persistedDivergentExitMasks[exitingBlock] = exitMask;
       if (BOSCC) {
         BOSCC->createReference(exitMask, getDefaultValue(maskTy));
@@ -2094,7 +2097,7 @@ bool ControlFlowConversionState::Impl::generateDivergentLoopResults(
                     << LTag.loop->getName() << "\n");
 
   // First create instructions to save the value of the last iteration ...
-  IRBuilder<> B(getInsertionPt(*LTag.header));
+  IRBuilder<> B(LTag.header, multi_llvm::getFirstNonPHIIt(LTag.header));
   for (Value *LLV : LTag.loopLiveValues) {
     LTag.loopResultPrevs[LLV] =
         B.CreatePHI(LLV->getType(), 2, LLV->getName() + ".prev");
@@ -2120,7 +2123,8 @@ bool ControlFlowConversionState::Impl::generateDivergentLoopResults(
 
         uniformLRP->setIncomingValue(1, LLV);
 
-        uniformLRP->insertBefore(getInsertionPt(*uniformHeader));
+        multi_llvm::insertBefore(uniformLRP,
+                                 multi_llvm::getFirstNonPHIIt(uniformHeader));
         BOSCC->createReference(LRP, uniformLRP, true);
       }
     }
@@ -2193,8 +2197,9 @@ bool ControlFlowConversionState::Impl::blendDivergentLoopLiveValues(
     VECZ_ERROR_IF(
         !prev, "Divergent loop live value does not have a persist instruction");
 
-    PHINode *blend = PHINode::Create(
-        LLV->getType(), 2, LLV->getName() + ".blend", &LTag.pureExit->front());
+    PHINode *blend =
+        PHINode::Create(LLV->getType(), 2, LLV->getName() + ".blend");
+    multi_llvm::insertBefore(blend, LTag.pureExit->begin());
 
     // Replace all uses outside the loop.
     VECZ_FAIL_IF(
@@ -2258,8 +2263,8 @@ bool ControlFlowConversionState::Impl::blendDivergentLoopExitMasks(
           "Divergent loop exit mask does not have a persist instruction");
 
       PHINode *blend =
-          PHINode::Create(prev->getType(), 2, prev->getName() + ".blend",
-                          &LTag.pureExit->front());
+          PHINode::Create(prev->getType(), 2, prev->getName() + ".blend");
+      multi_llvm::insertBefore(blend, LTag.pureExit->begin());
 
       // Replace all uses outside the loop.
       VECZ_FAIL_IF(!replaceUsesOutsideDivergentLoop(LTag, update, blend,
@@ -2944,8 +2949,8 @@ bool ControlFlowConversionState::Impl::blendInstructions() {
     Type *T = opDef->getType();
     const unsigned numPreds = std::distance(pred_begin(B), pred_end(B));
     Value *blend = nullptr;
-    PHINode *PHI =
-        PHINode::Create(T, numPreds, opDef->getName() + ".merge", &B->front());
+    PHINode *PHI = PHINode::Create(T, numPreds, opDef->getName() + ".merge");
+    multi_llvm::insertBefore(PHI, B->begin());
 
     auto const *const LTag = DR->getTag(B).loop;
     bool hasVisitedPred = false;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
index c48132c257184..dd444cbffeee7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
@@ -107,6 +107,12 @@ IRBuilder<> buildAfter(Value *V, Function &F, bool IsPhi) {
       ++Next;
     } while (!IsPhi && (Next != End) &&
              (isa<PHINode>(Next) || isa<AllocaInst>(Next)));
+#if LLVM_VERSION_GREATER_EQUAL(19, 0)
+    // If there is debug info between this instruction and the next, insert
+    // before the debug info. This is required for PHIs and makes sense for
+    // other instructions too.
+    Next.setHeadBit(true);
+#endif
     return {I->getParent(), Next};
   }
   // Else find the first point in the function after any allocas.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
index adc2d659c36f7..a4c8470c5582f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
@@ -688,33 +688,13 @@ void Scalarizer::scalarizeDI(Instruction *Original, const SimdPacket *Packet,
     return;
   }
 
-  auto *const MDV = MetadataAsValue::getIfExists(Original->getContext(), LAM);
-  if (!MDV) {
-    return;
-  }
-
   // Contains processed SIMD values for which we create scalar debug
   // instructions and is used to avoid duplicate LLVM dbg.value's.
   SmallPtrSet<Value *, 4> VectorElements;
 
   DIBuilder DIB(*Original->getModule(), false);
-  for (User *U : MDV->users()) {
-    DILocalVariable *DILocal = nullptr;
-    DebugLoc DILoc;
-
-    // These methods aren't virtual in DbgInfoIntrinsic for some reason
-    // TODO CA-1115 - Support llvm.dbg.addr() intrinsic
-    if (DbgValueInst *const DVI = dyn_cast<DbgValueInst>(U)) {
-      DILocal = DVI->getVariable();
-      DILoc = DVI->getDebugLoc();
-    } else if (DbgDeclareInst *const DDI = dyn_cast<DbgDeclareInst>(U)) {
-      DILocal = DDI->getVariable();
-      DILoc = DDI->getDebugLoc();
-    } else {
-      continue;
-    }
 
-    // Create new llvm.dbg.value() intrinsic across enabled SIMD lanes
+  auto CreateAndInsertDIExpr = [&](auto InsertDIExpr) {
     const auto bitSize = Original->getType()->getScalarSizeInBits();
     for (unsigned lane = 0; lane < Width; ++lane) {
       Value *LaneVal = Packet->at(lane);
@@ -732,12 +712,61 @@ void Scalarizer::scalarizeDI(Instruction *Original, const SimdPacket *Packet,
             DIExpression::createFragmentExpression(DIB.createExpression(),
                                                    lane * bitSize, bitSize);
         if (DIExpr) {
-          DIB.insertDbgValueIntrinsic(LaneVal, DILocal, *DIExpr, DILoc,
-                                      Original);
+          InsertDIExpr(LaneVal, *DIExpr);
           VectorElements.insert(LaneVal);
         }
       }
     }
+  };
+
+#if LLVM_VERSION_GREATER_EQUAL(19, 0)
+  for (DPValue *const DPV : LAM->getAllDPValueUsers()) {
+    DILocalVariable *DILocal = nullptr;
+    DebugLoc DILoc;
+
+    switch (DPV->getType()) {
+      case DPValue::LocationType::Value:
+      case DPValue::LocationType::Declare:
+        DILocal = DPV->getVariable();
+        DILoc = DPV->getDebugLoc();
+        break;
+      default:
+        continue;
+    }
+
+    // Create new DPValue across enabled SIMD lanes
+    CreateAndInsertDIExpr([&](Value *LaneVal, DIExpression *DIExpr) {
+      DIB.insertDbgValueIntrinsic(LaneVal, DILocal, DIExpr, DILoc, Original);
+    });
+  }
+#endif
+
+  auto *const MDV = MetadataAsValue::getIfExists(Original->getContext(), LAM);
+  if (!MDV) {
+    return;
+  }
+
+  for (User *U : MDV->users()) {
+    DILocalVariable *DILocal = nullptr;
+    DebugLoc DILoc;
+
+    // These methods aren't virtual in DbgInfoIntrinsic for some reason
+    // TODO CA-1115 - Support llvm.dbg.addr() intrinsic
+    if (DbgValueInst *const DVI = dyn_cast<DbgValueInst>(U)) {
+      DILocal = DVI->getVariable();
+      DILoc = DVI->getDebugLoc();
+    } else if (DbgDeclareInst *const DDI = dyn_cast<DbgDeclareInst>(U)) {
+      DILocal = DDI->getVariable();
+      DILoc = DDI->getDebugLoc();
+    } else {
+      continue;
+    }
+
+    // Create new llvm.dbg.value() intrinsic across enabled SIMD lanes
+    CreateAndInsertDIExpr([&](Value *const LaneVal,
+                              DIExpression *const DIExpr) {
+      DIB.insertDbgValueIntrinsic(LaneVal, DILocal, DIExpr, DILoc, Original);
+    });
   }
 }
 

From 715f8dc9c502871e5a87a03bdba2be2731030f6d Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Wed, 6 Mar 2024 12:14:41 +0000
Subject: [PATCH 101/182] Update for LLVM 19.

LLVM 19 changes ICmpInst's constructor to take a BasicBlock * rather
than a BasicBlock &. Use CmpInst::Create instead which continues to work
across versions.
---
 .../vecz/source/control_flow_boscc.cpp             | 14 ++++++++------
 .../vecz/source/control_flow_roscc.cpp             |  4 ++--
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_boscc.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_boscc.cpp
index f36979e9c59b4..0563b0d584479 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_boscc.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_boscc.cpp
@@ -760,9 +760,10 @@ bool ControlFlowConversionState::BOSCCGadget::connectUniformRegion(
       BOSCCIndirTag.loop->loop->addBasicBlockToLoop(BOSCCIndir, *LI);
     }
 
-    ICmpInst *cond = new ICmpInst(
-        *runtimeCheckerBlock, CmpInst::ICMP_EQ,
-        PassState.getMaskInfo(uniformB).exitMasks.lookup(succ), trueCI);
+    auto *cond =
+        CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
+                        PassState.getMaskInfo(uniformB).exitMasks.lookup(succ),
+                        trueCI, "", runtimeCheckerBlock);
     BranchInst::Create(succ, BOSCCIndir, cond, runtimeCheckerBlock);
 
     if (i > 0) {
@@ -775,9 +776,10 @@ bool ControlFlowConversionState::BOSCCGadget::connectUniformRegion(
   }
 
   BasicBlock *succ = succs[size - 1];
-  ICmpInst *cond = new ICmpInst(
-      *runtimeCheckerBlock, CmpInst::ICMP_EQ,
-      PassState.getMaskInfo(uniformB).exitMasks.lookup(succ), trueCI);
+  auto *cond =
+      CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
+                      PassState.getMaskInfo(uniformB).exitMasks.lookup(succ),
+                      trueCI, "", runtimeCheckerBlock);
 
   BasicBlock *connectionPoint = target;
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_roscc.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_roscc.cpp
index cd8762dbd7d39..f59cc6209b361 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_roscc.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_roscc.cpp
@@ -128,8 +128,8 @@ bool ControlFlowConversionState::ROSCCGadget::run(Function &F) {
 
     BasicBlock *ReturnBlock = Which ? SuccT : SuccF;
     Value *Cond = Branch->getCondition();
-    ICmpInst *newCond =
-        new ICmpInst(*BB, CmpInst::ICMP_EQ, Cond, Which ? falseCI : trueCI);
+    auto *newCond = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Cond,
+                                    Which ? falseCI : trueCI, "", BB);
     newCond->setName(Twine(Cond->getName(), ".ROSCC"));
     BranchInst::Create(newBB, ReturnBlock, newCond, BB);
 

From 8e00b3f0d7ccec070d2fa49af4ca71df83db884d Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Mon, 25 Mar 2024 01:47:22 +0000
Subject: [PATCH 102/182] Update for LLVM 19.

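LLVM 19 renamed several of the new debug-info record APIs: DPValue is
now DbgVariableRecord, and getAllDPValueUsers() is now
getAllDbgVariableRecordUsers().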
---
 .../vecz/source/transform/scalarizer.cpp           | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
index a4c8470c5582f..6b678b8249d3a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
@@ -720,21 +720,21 @@ void Scalarizer::scalarizeDI(Instruction *Original, const SimdPacket *Packet,
   };
 
 #if LLVM_VERSION_GREATER_EQUAL(19, 0)
-  for (DPValue *const DPV : LAM->getAllDPValueUsers()) {
+  for (DbgVariableRecord *const DVR : LAM->getAllDbgVariableRecordUsers()) {
     DILocalVariable *DILocal = nullptr;
     DebugLoc DILoc;
 
-    switch (DPV->getType()) {
-      case DPValue::LocationType::Value:
-      case DPValue::LocationType::Declare:
-        DILocal = DPV->getVariable();
-        DILoc = DPV->getDebugLoc();
+    switch (DVR->getType()) {
+      case DbgVariableRecord::LocationType::Value:
+      case DbgVariableRecord::LocationType::Declare:
+        DILocal = DVR->getVariable();
+        DILoc = DVR->getDebugLoc();
         break;
       default:
         continue;
     }
 
-    // Create new DPValue across enabled SIMD lanes
+    // Create new DbgVariableRecord across enabled SIMD lanes
     CreateAndInsertDIExpr([&](Value *LaneVal, DIExpression *DIExpr) {
       DIB.insertDbgValueIntrinsic(LaneVal, DILocal, DIExpr, DILoc, Original);
     });

From 61d211314e9b702a1d3a9d8350097ee163055702 Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Sun, 31 Mar 2024 02:49:30 +0100
Subject: [PATCH 103/182] LLVM 19 update: trunc to i1.

LLVM 19 uses trunc x to i1 as the canonical representation, rather than
LLVM 18's icmp ne (and x, 1), 0, for truncations to i1. As the purpose
of these tests is not how that truncation is performed, allow anything.
---
 ...roup_reductions_spv_khr_uniform_group_instructions.ll | 9 +++------
 ...roup_reductions_spv_khr_uniform_group_instructions.ll | 4 +---
 2 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions_spv_khr_uniform_group_instructions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions_spv_khr_uniform_group_instructions.ll
index 5ee579906c23e..054f7d91cafa6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions_spv_khr_uniform_group_instructions.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions_spv_khr_uniform_group_instructions.ll
@@ -143,8 +143,7 @@ entry:
 }
 
 ; CHECK-LABEL: @__vecz_v4_vp_reduce_logical_and(
-; CHECK: [[T:%.*]] = icmp ne <4 x i32> {{%.*}}, zeroinitializer
-; CHECK: [[R:%.*]] = call i1 @llvm.vp.reduce.and.v4i1(i1 true, <4 x i1> [[T]], {{.*}})
+; CHECK: [[R:%.*]] = call i1 @llvm.vp.reduce.and.v4i1(i1 true, <4 x i1> [[T:%.*]], {{.*}})
 ; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_and_i1(i1 [[R]])
 ; CHECK: [[R:%.*]] = zext i1 %call2 to i32
 ; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
@@ -164,8 +163,7 @@ entry:
 }
 
 ; CHECK-LABEL: @__vecz_v4_vp_reduce_logical_or(
-; CHECK: [[T:%.*]] = icmp ne <4 x i32> {{%.*}}, zeroinitializer
-; CHECK: [[R:%.*]] = call i1 @llvm.vp.reduce.or.v4i1(i1 false, <4 x i1> [[T]], {{.*}})
+; CHECK: [[R:%.*]] = call i1 @llvm.vp.reduce.or.v4i1(i1 false, <4 x i1> [[T:%.*]], {{.*}})
 ; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_or_i1(i1 [[R]])
 ; CHECK: [[R:%.*]] = zext i1 %call2 to i32
 ; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
@@ -185,8 +183,7 @@ entry:
 }
 
 ; CHECK-LABEL: @__vecz_v4_vp_reduce_logical_xor(
-; CHECK: [[T:%.*]] = icmp ne <4 x i32> {{%.*}}, zeroinitializer
-; CHECK: [[R:%.*]] = call i1 @llvm.vp.reduce.xor.v4i1(i1 false, <4 x i1> [[T]], {{.*}})
+; CHECK: [[R:%.*]] = call i1 @llvm.vp.reduce.xor.v4i1(i1 false, <4 x i1> [[T:%.*]], {{.*}})
 ; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_xor_i1(i1 [[R]])
 ; CHECK: [[R:%.*]] = zext i1 %call2 to i32
 ; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions_spv_khr_uniform_group_instructions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions_spv_khr_uniform_group_instructions.ll
index 2439e1b8bd854..c455acf490e74 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions_spv_khr_uniform_group_instructions.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions_spv_khr_uniform_group_instructions.ll
@@ -178,9 +178,7 @@ entry:
 
 ; CHECK-LABEL: @__vecz_v4_reduce_logical_xor(
 ; CHECK: [[X:%.*]] = call i4 @llvm.ctpop.i4(i4 {{%.*}})
-; CHECK: [[T:%.*]] = and i4 [[X]], 1
-; CHECK: [[T0:%.*]] = icmp ne i4 [[T]], 0
-; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_xor_i1(i1 [[T0]])
+; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_xor_i1(i1 [[T:%.*]])
 ; CHECK: [[E:%.*]] = zext i1 %call2 to i32
 ; CHECK: store i32 [[E]], ptr addrspace(1) {{%.*}}, align 4
 define spir_kernel void @reduce_logical_xor(ptr addrspace(1) %in, ptr addrspace(1) %out) {

From e278c605b4fad381ab53beeb5d0ce06a38e8e498 Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Tue, 16 Apr 2024 09:45:50 +0100
Subject: [PATCH 104/182] Accept nuw/nsw flags on trunc.

LLVM 19 adds nuw/nsw flags to trunc instructions that can indicate that
the value is unchanged by the operation. Allow this in the tests where
it shows up.
---
 .../lit/llvm/VectorPredication/compute_vector_length.ll   | 4 ++--
 .../test/lit/llvm/VectorPredication/load_add_store.ll     | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/compute_vector_length.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/compute_vector_length.ll
index f0c0335724115..633fac20e4050 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/compute_vector_length.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/compute_vector_length.ll
@@ -40,7 +40,7 @@ define spir_kernel void @get_sub_group_size(i32 addrspace(1)* %in, i32 addrspace
 ; CHECK-F2: [[SZ:%.*]] = call i64 @__mux_get_local_size(i32 0)
 ; CHECK-F2: [[WL:%.*]] = sub {{.*}} i64 [[SZ]], [[ID]]
 ; CHECK-F2: [[VL0:%.*]] = call i64 @llvm.umin.i64(i64 [[WL]], i64 2)
-; CHECK-F2: [[VL1:%.*]] = trunc i64 [[VL0]] to i32
+; CHECK-F2: [[VL1:%.*]] = trunc {{(nuw )?(nsw )?}}i64 [[VL0]] to i32
 ; CHECK-F2: [[RED:%.*]] = call i32 @__mux_sub_group_reduce_add_i32(i32 [[VL1]])
 ; CHECK-F2: store i32 [[RED]], ptr addrspace(1) {{.*}}
 
@@ -51,6 +51,6 @@ define spir_kernel void @get_sub_group_size(i32 addrspace(1)* %in, i32 addrspace
 ; CHECK-S4: [[VF0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-S4: [[VF1:%.*]] = shl i64 [[VF0]], 2
 ; CHECK-S4: [[VL0:%.*]] = call i64 @llvm.umin.i64(i64 [[WL]], i64 [[VF1]])
-; CHECK-S4: [[VL1:%.*]] = trunc i64 [[VL0]] to i32
+; CHECK-S4: [[VL1:%.*]] = trunc {{(nuw )?(nsw )?}}i64 [[VL0]] to i32
 ; CHECK-S4: [[RED:%.*]] = call i32 @__mux_sub_group_reduce_add_i32(i32 [[VL1]])
 ; CHECK-S4: store i32 [[RED]], ptr addrspace(1) {{.*}}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/load_add_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/load_add_store.ll
index f364b93b90bf1..7b274d63d6a6b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/load_add_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/load_add_store.ll
@@ -44,7 +44,7 @@ entry:
 ; CHECK_4F: [[LSIZE:%.*]] = call i64 @__mux_get_local_size(i32 0)
 ; CHECK_4F: [[WREM:%.*]] = sub nuw nsw i64 [[LSIZE]], [[LID]]
 ; CHECK_4F: [[T0:%.*]] = call i64 @llvm.umin.i64(i64 [[WREM]], i64 4)
-; CHECK_4F: [[VL:%.*]] = trunc i64 [[T0]] to i32
+; CHECK_4F: [[VL:%.*]] = trunc {{(nuw )?(nsw )?}}i64 [[T0]] to i32
 ; CHECK_4F: [[LHS:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p0(ptr {{%.*}}, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 [[VL]])
 ; CHECK_4F: [[RHS:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p0(ptr {{%.*}}, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 [[VL]])
 ; CHECK_4F: [[ADD:%.*]] = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> [[LHS]], <4 x i32> [[RHS]], <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 [[VL]])
@@ -57,7 +57,7 @@ entry:
 ; CHECK_1S: [[T0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK_1S: [[T1:%.*]] = shl i64 [[T0]], 2
 ; CHECK_1S: [[T2:%.*]] = call i64 @llvm.umin.i64(i64 [[WREM]], i64 [[T1]])
-; CHECK_1S: [[VL:%.*]] = trunc i64 [[T2]] to i32
+; CHECK_1S: [[VL:%.*]] = trunc {{(nuw )?(nsw )?}}i64 [[T2]] to i32
 ; CHECK_1S: [[LHS:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr {{%.*}}, [[TRUEMASK:<vscale x 4 x i1> shufflevector \(<vscale x 4 x i1> insertelement \(<vscale x 4 x i1> (undef|poison), i1 true, (i32|i64) 0\), <vscale x 4 x i1> (undef|poison), <vscale x 4 x i32> zeroinitializer\)]], i32 [[VL]])
 ; CHECK_1S: [[RHS:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr {{%.*}}, [[TRUEMASK]], i32 [[VL]])
 ; CHECK_1S: [[ADD:%.*]] = call <vscale x 4 x i32> @llvm.vp.add.nxv4i32(<vscale x 4 x i32> [[LHS]], <vscale x 4 x i32> [[RHS]], [[TRUEMASK]], i32 [[VL]])
@@ -81,7 +81,7 @@ entry:
 ; CHECK_V4_2F: [[LSIZE:%.*]] = call i64 @__mux_get_local_size(i32 0)
 ; CHECK_V4_2F: [[WREM:%.*]] = sub nuw nsw i64 [[LSIZE]], [[LID]]
 ; CHECK_V4_2F: [[T0:%.*]] = call i64 @llvm.umin.i64(i64 [[WREM]], i64 2)
-; CHECK_V4_2F: [[VL:%.*]] = trunc i64 [[T0]] to i32
+; CHECK_V4_2F: [[VL:%.*]] = trunc {{(nuw )?(nsw )?}}i64 [[T0]] to i32
 ; Each WI performs 4 elements, so multiply the VL by 4
 ; CHECK_V4_2F: [[SVL:%.*]] = shl nuw nsw i32 [[VL]], 2
 ; CHECK_V4_2F: [[LHS:%.*]] = call <8 x i32> @llvm.vp.load.v8i32.p0(ptr {{%.*}}, <8 x i1> <i1 true, i1 true, i1 true, i1  true, i1 true, i1 true, i1 true, i1 true>, i32 [[SVL]])
@@ -96,7 +96,7 @@ entry:
 ; CHECK_V4_1S: [[T0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK_V4_1S: [[T1:%.*]] = shl i64 [[T0]], 2
 ; CHECK_V4_1S: [[T2:%.*]] = call i64 @llvm.umin.i64(i64 [[WREM]], i64 [[T1]])
-; CHECK_V4_1S: [[VL:%.*]] = trunc i64 [[T2]] to i32
+; CHECK_V4_1S: [[VL:%.*]] = trunc {{(nuw )?(nsw )?}}i64 [[T2]] to i32
 ; Each WI performs 4 elements, so multiply the VL by 4
 ; CHECK_V4_1S: [[SVL:%.*]] = shl i32 [[VL]], 2
 ; CHECK_V4_1S: [[LHS:%.*]] = call <vscale x 16 x i32> @llvm.vp.load.nxv16i32.p0(ptr {{%.*}}, [[TRUEMASK:<vscale x 16 x i1> shufflevector \(<vscale x 16 x i1> insertelement \(<vscale x 16 x i1> (undef|poison), i1 true, (i32|i64) 0\), <vscale x 16 x i1> (undef|poison), <vscale x 16 x i32> zeroinitializer\)]], i32 [[SVL]])

From 7580493fdcb0938c43fef694a87e36f6ee447fe1 Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Fri, 26 Apr 2024 12:23:04 +0100
Subject: [PATCH 105/182] [LLVM 19] Adjust tests for llvm.ct* intrinsics.

LLVM 19 adds the range attribute to the return type of llvm.ct*
intrinsics. No code changes are needed to deal with this, but it appears
in a few tests that did not allow attributes. This commit updates the
tests to allow {{.*}} for attributes of affected functions.
---
 .../lit/llvm/ScalableVectors/intrinsics.ll     | 12 ++++++------
 .../vecz/test/lit/llvm/intrinsics-scalarize.ll | 18 +++++++++---------
 .../vecz/test/lit/llvm/intrinsics.ll           | 12 ++++++------
 ...tions_spv_khr_uniform_group_instructions.ll |  2 +-
 4 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/intrinsics.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/intrinsics.ll
index e9c1dfd32d483..12632b5696d2b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/intrinsics.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/intrinsics.ll
@@ -166,18 +166,18 @@ declare <2 x i8> @llvm.usub.sat.v2i8(<2 x i8>, <2 x i8>)
 declare i64 @__mux_get_global_id(i32)
 
 ; CTPOP: void @__vecz_nxv2_ctpop
-; CTPOP: = call <vscale x 2 x i32> @llvm.ctpop.nxv2i32(<vscale x 2 x i32> %{{.*}})
-; CTPOP: = call <vscale x 4 x i8> @llvm.ctpop.nxv4i8(<vscale x 4 x i8> %{{.*}})
+; CTPOP: = call {{.*}}<vscale x 2 x i32> @llvm.ctpop.nxv2i32(<vscale x 2 x i32> %{{.*}})
+; CTPOP: = call {{.*}}<vscale x 4 x i8> @llvm.ctpop.nxv4i8(<vscale x 4 x i8> %{{.*}})
 
 ; CTLZ: void @__vecz_nxv4_ctlz
 ; ... but it does widen ctlz
-; CTLZ: = call <vscale x 4 x i32> @llvm.ctlz.nxv4i32(<vscale x 4 x i32> %{{.*}}, i1 false)
-; CTLZ: = call <vscale x 8 x i8> @llvm.ctlz.nxv8i8(<vscale x 8 x i8> %{{.*}}, i1 false)
+; CTLZ: = call {{.*}}<vscale x 4 x i32> @llvm.ctlz.nxv4i32(<vscale x 4 x i32> %{{.*}}, i1 false)
+; CTLZ: = call {{.*}}<vscale x 8 x i8> @llvm.ctlz.nxv8i8(<vscale x 8 x i8> %{{.*}}, i1 false)
 
 ; CTTZ: void @__vecz_nxv8_cttz
 ; ... and cttz
-; CTTZ: = call <vscale x 8 x i32> @llvm.cttz.nxv8i32(<vscale x 8 x i32> %{{.*}}, i1 false)
-; CTTZ: = call <vscale x 16 x i8> @llvm.cttz.nxv16i8(<vscale x 16 x i8> %{{.*}}, i1 false)
+; CTTZ: = call {{.*}}<vscale x 8 x i32> @llvm.cttz.nxv8i32(<vscale x 8 x i32> %{{.*}}, i1 false)
+; CTTZ: = call {{.*}}<vscale x 16 x i8> @llvm.cttz.nxv16i8(<vscale x 16 x i8> %{{.*}}, i1 false)
 
 ; SADD_SAT: void @__vecz_nxv2_sadd_sat
 ; SADD_SAT: = call <vscale x 2 x i32> @llvm.sadd.sat.nxv2i32(
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics-scalarize.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics-scalarize.ll
index 12cbf0c96934b..b85ec08e1b9bf 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics-scalarize.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics-scalarize.ll
@@ -172,19 +172,19 @@ declare <2 x i8> @llvm.usub.sat.v2i8(<2 x i8>, <2 x i8>)
 declare i64 @__mux_get_global_id(i32)
 
 ; CTPOP: void @__vecz_v2_ctpop
-; CTPOP: = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %{{.*}})
-; CTPOP: = call <2 x i8> @llvm.ctpop.v2i8(<2 x i8> %{{.*}})
-; CTPOP: = call <2 x i8> @llvm.ctpop.v2i8(<2 x i8> %{{.*}})
+; CTPOP: = call {{.*}}<2 x i32> @llvm.ctpop.v2i32(<2 x i32> %{{.*}})
+; CTPOP: = call {{.*}}<2 x i8> @llvm.ctpop.v2i8(<2 x i8> %{{.*}})
+; CTPOP: = call {{.*}}<2 x i8> @llvm.ctpop.v2i8(<2 x i8> %{{.*}})
 
 ; CTLZ: void @__vecz_v4_ctlz
-; CTLZ: = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %{{.*}}, i1 false)
-; CTLZ: = call <4 x i8> @llvm.ctlz.v4i8(<4 x i8> %{{.*}}, i1 false)
-; CTLZ: = call <4 x i8> @llvm.ctlz.v4i8(<4 x i8> %{{.*}}, i1 false)
+; CTLZ: = call {{.*}}<4 x i32> @llvm.ctlz.v4i32(<4 x i32> %{{.*}}, i1 false)
+; CTLZ: = call {{.*}}<4 x i8> @llvm.ctlz.v4i8(<4 x i8> %{{.*}}, i1 false)
+; CTLZ: = call {{.*}}<4 x i8> @llvm.ctlz.v4i8(<4 x i8> %{{.*}}, i1 false)
 
 ; CTTZ: void @__vecz_v8_cttz
-; CTTZ: = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %{{.*}}, i1 false)
-; CTTZ: = call <8 x i8> @llvm.cttz.v8i8(<8 x i8> %{{.*}}, i1 false)
-; CTTZ: = call <8 x i8> @llvm.cttz.v8i8(<8 x i8> %{{.*}}, i1 false)
+; CTTZ: = call {{.*}}<8 x i32> @llvm.cttz.v8i32(<8 x i32> %{{.*}}, i1 false)
+; CTTZ: = call {{.*}}<8 x i8> @llvm.cttz.v8i8(<8 x i8> %{{.*}}, i1 false)
+; CTTZ: = call {{.*}}<8 x i8> @llvm.cttz.v8i8(<8 x i8> %{{.*}}, i1 false)
 
 ; SADD_SAT: void @__vecz_v2_sadd_sat
 ; SADD_SAT: = call <2 x i32> @llvm.sadd.sat.v2i32(
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics.ll
index ccdccba5a3d6d..ec79e3578faa6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics.ll
@@ -172,16 +172,16 @@ declare <2 x i8> @llvm.usub.sat.v2i8(<2 x i8>, <2 x i8>)
 declare i64 @__mux_get_global_id(i32)
 
 ; CTPOP: void @__vecz_v2_ctpop
-; CTPOP: = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %{{.*}})
-; CTPOP: = call <4 x i8> @llvm.ctpop.v4i8(<4 x i8> %{{.*}})
+; CTPOP: = call {{.*}}<2 x i32> @llvm.ctpop.v2i32(<2 x i32> %{{.*}})
+; CTPOP: = call {{.*}}<4 x i8> @llvm.ctpop.v4i8(<4 x i8> %{{.*}})
 
 ; CTLZ: void @__vecz_v4_ctlz
-; CTLZ: = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %{{.*}}, i1 false)
-; CTLZ: = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %{{.*}}, i1 false)
+; CTLZ: = call {{.*}}<4 x i32> @llvm.ctlz.v4i32(<4 x i32> %{{.*}}, i1 false)
+; CTLZ: = call {{.*}}<8 x i8> @llvm.ctlz.v8i8(<8 x i8> %{{.*}}, i1 false)
 
 ; CTTZ: void @__vecz_v8_cttz
-; CTTZ: = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %{{.*}}, i1 false)
-; CTTZ: = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %{{.*}}, i1 false)
+; CTTZ: = call {{.*}}<8 x i32> @llvm.cttz.v8i32(<8 x i32> %{{.*}}, i1 false)
+; CTTZ: = call {{.*}}<16 x i8> @llvm.cttz.v16i8(<16 x i8> %{{.*}}, i1 false)
 
 ; SADD_SAT: void @__vecz_v2_sadd_sat
 ; SADD_SAT: = call <2 x i32> @llvm.sadd.sat.v2i32(
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions_spv_khr_uniform_group_instructions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions_spv_khr_uniform_group_instructions.ll
index c455acf490e74..8f5e65d11968a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions_spv_khr_uniform_group_instructions.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions_spv_khr_uniform_group_instructions.ll
@@ -177,7 +177,7 @@ entry:
 }
 
 ; CHECK-LABEL: @__vecz_v4_reduce_logical_xor(
-; CHECK: [[X:%.*]] = call i4 @llvm.ctpop.i4(i4 {{%.*}})
+; CHECK: [[X:%.*]] = call {{.*}}i4 @llvm.ctpop.i4(i4 {{%.*}})
 ; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_xor_i1(i1 [[T:%.*]])
 ; CHECK: [[E:%.*]] = zext i1 %call2 to i32
 ; CHECK: store i32 [[E]], ptr addrspace(1) {{%.*}}, align 4

From 17d65f8d9f70fc0107ba39f08d35742eb022b535 Mon Sep 17 00:00:00 2001
From: PietroGhg <pietro.ghiglio@codeplay.com>
Date: Wed, 1 May 2024 13:04:35 +0100
Subject: [PATCH 106/182] Update lit check for vector.splice

---
 .../vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans.ll | 2 +-
 .../test/lit/llvm/ScalableVectors/define_subgroup_scans_vp.ll   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans.ll
index 3cbb091d746e0..830a0ae5d4370 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans.ll
@@ -107,7 +107,7 @@ declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_add_u5nxv4j(<vscal
 ; CHECK:   %[[SCAN:.+]] = phi <vscale x 4 x i32> [ %[[NEWVEC]], %loop ]
 
 ;------- target-dependent slide-up code:
-; CHECK:   %[[SLIDE:.+]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> %[[SCAN]], i32 -1)
+; CHECK:   %[[SLIDE:.+]] = call <vscale x 4 x i32> @llvm{{(\.experimental)?}}.vector.splice.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> %[[SCAN]], i32 -1)
 ; CHECK:   %[[RESULT:.+]] = insertelement <vscale x 4 x i32> %[[SLIDE]], i32 0, {{i32|i64}} 0
 
 ; CHECK:   ret <vscale x 4 x i32> %[[RESULT]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans_vp.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans_vp.ll
index 8c3b185d0c5f1..abe654d8cdb0d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans_vp.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans_vp.ll
@@ -111,7 +111,7 @@ declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_add_vp_u5nxv4jj(<v
 ; CHECK:   %[[SCAN:.+]] = phi <vscale x 4 x i32> [ %[[NEWVEC]], %loop ]
 
 ;------- target-dependent slide-up code:
-; CHECK:   %[[SLIDE:.+]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> %[[SCAN]], i32 -1)
+; CHECK:   %[[SLIDE:.+]] = call <vscale x 4 x i32> @llvm{{(\.experimental)?}}.vector.splice.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> %[[SCAN]], i32 -1)
 ; CHECK:   %[[RESULT:.+]] = insertelement <vscale x 4 x i32> %[[SLIDE]], i32 0, {{i32|i64}} 0
 
 ; CHECK:   ret <vscale x 4 x i32> %[[RESULT]]

From 4eeda203949a8a05d7b6da2080242c81a3ac2ff9 Mon Sep 17 00:00:00 2001
From: Colin Davidson <colin.davidson@codeplay.com>
Date: Thu, 2 May 2024 15:13:31 +0100
Subject: [PATCH 107/182] Moved compiler-pipeline out of utils so it can be
 included directly.

We wish to build compiler-pipeline separately so that SYCL Native CPU
can use just our passes. This separates it from some unrelated
libraries, and is a step towards restructuring the compiler libraries
so that they can be used more independently.
---
 .../include/compiler/utils/address_spaces.h   |   38 +
 .../include/compiler/utils/attributes.h       |  196 +
 .../include/compiler/utils/barrier_regions.h  |  378 ++
 .../include/compiler/utils/builtin_info.h     |  859 ++++
 .../include/compiler/utils/cl_builtin_info.h  |  216 +
 .../compiler/utils/define_mux_builtins_pass.h |   36 +
 .../include/compiler/utils/device_info.h      |  127 +
 .../include/compiler/utils/dma.h              |   91 +
 .../utils/encode_kernel_metadata_pass.h       |   60 +
 .../compiler/utils/group_collective_helpers.h |  112 +
 .../include/compiler/utils/mangling.h         |  408 ++
 .../include/compiler/utils/metadata.h         |  297 ++
 .../utils/optimal_builtin_replacement_pass.h  |  115 +
 .../include/compiler/utils/pass_functions.h   |  339 ++
 .../include/compiler/utils/pass_machinery.h   |  145 +
 .../compiler/utils/prepare_barriers_pass.h    |   45 +
 ...eplace_local_module_scope_variables_pass.h |   44 +
 .../include/compiler/utils/scheduling.h       |  143 +
 .../compiler/utils/sub_group_analysis.h       |  115 +
 .../compiler/utils/target_extension_types.h   |  144 +
 .../utils/unique_opaque_structs_pass.h        |   55 +
 .../compiler/utils/work_item_loops_pass.h     |  117 +
 .../compiler_pipeline/source/attributes.cpp   |  219 +
 .../source/barrier_regions.cpp                | 1497 +++++++
 .../compiler_pipeline/source/builtin_info.cpp | 1255 ++++++
 .../source/cl_builtin_info.cpp                | 3671 +++++++++++++++++
 .../source/define_mux_builtins_pass.cpp       |   62 +
 .../compiler_pipeline/source/dma.cpp          |   74 +
 .../source/encode_kernel_metadata_pass.cpp    |   51 +
 .../source/group_collective_helpers.cpp       |   73 +
 .../compiler_pipeline/source/mangling.cpp     |  912 ++++
 .../compiler_pipeline/source/metadata.cpp     |  394 ++
 .../source/mux_builtin_info.cpp               | 1331 ++++++
 .../optimal_builtin_replacement_pass.cpp      |  312 ++
 .../source/pass_functions.cpp                 |  756 ++++
 .../source/pass_machinery.cpp                 |  136 +
 .../source/prepare_barriers_pass.cpp          |  127 +
 ...lace_local_module_scope_variables_pass.cpp |  641 +++
 .../compiler_pipeline/source/scheduling.cpp   |  154 +
 .../source/sub_group_analysis.cpp             |  168 +
 .../source/target_extension_types.cpp         |  162 +
 .../source/unique_opaque_structs_pass.cpp     |  282 ++
 .../source/work_item_loops_pass.cpp           | 1980 +++++++++
 43 files changed, 18337 insertions(+)
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/address_spaces.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/attributes.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/barrier_regions.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/builtin_info.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/cl_builtin_info.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/define_mux_builtins_pass.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/device_info.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/dma.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/encode_kernel_metadata_pass.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/group_collective_helpers.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/mangling.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/metadata.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/optimal_builtin_replacement_pass.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/pass_functions.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/pass_machinery.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/prepare_barriers_pass.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/replace_local_module_scope_variables_pass.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/scheduling.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/sub_group_analysis.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/target_extension_types.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/unique_opaque_structs_pass.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/work_item_loops_pass.h
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/attributes.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/builtin_info.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/define_mux_builtins_pass.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/dma.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/encode_kernel_metadata_pass.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/group_collective_helpers.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mangling.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/metadata.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mux_builtin_info.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/optimal_builtin_replacement_pass.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_machinery.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/prepare_barriers_pass.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/scheduling.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/sub_group_analysis.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/target_extension_types.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/unique_opaque_structs_pass.cpp
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/address_spaces.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/address_spaces.h
new file mode 100644
index 0000000000000..09216f9c02032
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/address_spaces.h
@@ -0,0 +1,38 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// LLVM address space identifiers.
+
+#ifndef COMPILER_UTILS_ADDRESS_SPACES_H_INCLUDED
+#define COMPILER_UTILS_ADDRESS_SPACES_H_INCLUDED
+
+namespace compiler {
+namespace utils {
+namespace AddressSpace {
+enum {
+  Private = 0,
+  Global = 1,
+  Constant = 2,
+  Local = 3,
+  Generic = 4,
+};
+}
+}  // namespace utils
+}  // namespace compiler
+
+#endif  // COMPILER_UTILS_ADDRESS_SPACES_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/attributes.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/attributes.h
new file mode 100644
index 0000000000000..851847a725d69
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/attributes.h
@@ -0,0 +1,196 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef COMPILER_UTILS_ATTRIBUTES_H_INCLUDED
+#define COMPILER_UTILS_ATTRIBUTES_H_INCLUDED
+
+#include <llvm/ADT/StringRef.h>
+
+#include <optional>
+
+namespace llvm {
+class CallInst;
+class Function;
+}  // namespace llvm
+
+namespace compiler {
+namespace utils {
+
+/// @brief Encodes information that a function is a kernel
+///
+/// @param[in] F Function in which to encode the information.
+void setIsKernel(llvm::Function &F);
+
+/// @brief Encodes information that a function is a kernel entry point
+///
+/// @param[in] F Function in which to encode the information.
+void setIsKernelEntryPt(llvm::Function &F);
+
+/// @brief Returns whether the function is a kernel under compilation.
+///
+/// @param[in] F Function to check.
+bool isKernel(const llvm::Function &F);
+
+/// @brief Returns whether the function is a kernel entry point under
+/// compilation.
+///
+/// @param[in] F Function to check.
+bool isKernelEntryPt(const llvm::Function &F);
+
+/// @brief Drops any information about whether a function is a kernel.
+///
+/// @param[in] F Function to drop information from.
+void dropIsKernel(llvm::Function &F);
+
+/// @brief Takes information about kernels from one function to another.
+///
+/// Removes information from the old function, and overwrites any such
+/// information in the new function.
+///
+/// @param[in] ToF Function to copy to.
+/// @param[in] FromF Function to copy from.
+void takeIsKernel(llvm::Function &ToF, llvm::Function &FromF);
+
+/// @brief Sets the original function name as an attribute.
+void setOrigFnName(llvm::Function &F);
+
+/// @brief Retrieves the original function name from the given Function.
+///
+/// @return The original function name (via function attributes) or an empty
+/// string if none is found.
+llvm::StringRef getOrigFnName(const llvm::Function &F);
+
+/// @brief Retrieves the original function name from the given Function, or the
+/// Function's name.
+///
+/// @return The original function name (via function attributes) or the
+/// function's name if none is found.
+llvm::StringRef getOrigFnNameOrFnName(const llvm::Function &F);
+
+/// @brief Sets the base function name as an attribute.
+void setBaseFnName(llvm::Function &F, llvm::StringRef N);
+
+/// @brief Retrieves the base function name component from the given Function.
+///
+/// @return The base function name (via function attributes) or an empty string
+/// if none is found.
+llvm::StringRef getBaseFnName(const llvm::Function &F);
+
+/// @brief Retrieves the base function name component from the given Function,
+/// or the Function's name.
+///
+/// @return The base function name (via function attributes) or the function's
+/// name if none is found.
+llvm::StringRef getBaseFnNameOrFnName(const llvm::Function &F);
+
+/// @brief Retrieves the base function name from the given Function and
+/// sets it if none is found.
+/// @param F The function to read "base function name" attributes from
+/// @param SetFromF The function whose name is set as F's base function
+/// name if none is found in F.
+llvm::StringRef getOrSetBaseFnName(llvm::Function &F,
+                                   const llvm::Function &SetFromF);
+
+/// @brief Sets the local memory usage estimation for the given function.
+///
+/// @param[in] F the function in which to add the attribute
+/// @param[in] LocalMemUsage the (estimated) local memory usage in bytes
+void setLocalMemoryUsage(llvm::Function &F, uint64_t LocalMemUsage);
+
+/// @brief Gets the local memory usage estimation for the given function.
+///
+/// @param[in] F Function from which to pull the attribute
+/// @return the (estimated) local memory usage in bytes if present,
+/// std::nullopt otherwise.
+std::optional<uint64_t> getLocalMemoryUsage(const llvm::Function &F);
+
+/// @brief Sets information about a function's required DMA size as an
+/// attribute.
+///
+/// @param[in] F Function in which to add the attribute.
+/// @param[in] DMASizeBytes DMA size in bytes.
+void setDMAReqdSizeBytes(llvm::Function &F, uint32_t DMASizeBytes);
+
+/// @brief Retrieves information about a function's required DMA size as an
+/// attribute.
+///
+/// @param[in] F Function from which to pull the attribute
+/// @return The required DMA size in bytes if present, else `std::nullopt`
+std::optional<uint32_t> getDMAReqdSizeBytes(const llvm::Function &F);
+
+/// @brief Determines the ordering of work item execution after a barrier.
+enum class BarrierSchedule {
+  /// @brief The barrier pass is free to schedule work items in any order.
+  Unordered = 0,
+  /// @brief The barrier region is entirely uniform (no dependence on work item
+  /// ID) such that execution of multiple work items is redundant and we are
+  /// free to execute the region for only a single work item. Additionally,
+  /// such a region is not allowed to read from or write to the barrier struct
+  /// (the region cannot use any variables defined outwith it, nor define any
+  /// variables used outwith it). Used by work group collectives to initialize
+  /// their accumulators.
+  Once,
+  /// @brief The barrier region should execute all vectorized work items first,
+  /// followed by the scalar tail.
+  ScalarTail,
+  /// @brief The barrier region must be executed in Local Linear ID order.
+  Linear,
+};
+
+/// @brief Sets the work item execution schedule for the given barrier.
+///
+/// @param[in] CI the barrier call instruction
+/// @param[in] Sched the execution schedule to set
+void setBarrierSchedule(llvm::CallInst &CI, BarrierSchedule Sched);
+
+/// @brief Gets the work item execution schedule for the given barrier.
+///
+/// @param[in] CI the barrier call instruction
+/// @return the execution schedule for this barrier
+BarrierSchedule getBarrierSchedule(const llvm::CallInst &CI);
+
+/// @brief Marks a kernel's subgroups as degenerate
+///
+/// @param[in] F Function in which to encode the information.
+void setHasDegenerateSubgroups(llvm::Function &F);
+
+/// @brief Returns whether the kernel has degenerate subgroups.
+///
+/// @param[in] F Function to check.
+bool hasDegenerateSubgroups(const llvm::Function &F);
+
+/// @brief Marks a function as not explicitly using subgroups
+///
+/// May be set even with unresolved external functions, assuming those don't
+/// explicitly use subgroups.
+///
+/// @param[in] F Function in which to encode the information.
+void setHasNoExplicitSubgroups(llvm::Function &F);
+
+/// @brief Returns whether the kernel does not explicitly use subgroups
+///
+/// @param[in] F Function to check.
+bool hasNoExplicitSubgroups(const llvm::Function &F);
+
+/// @brief Returns the mux subgroup size for the current function.
+///
+/// Currently always returns 1!
+unsigned getMuxSubgroupSize(const llvm::Function &F);
+
+}  // namespace utils
+}  // namespace compiler
+
+#endif  // COMPILER_UTILS_ATTRIBUTES_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/barrier_regions.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/barrier_regions.h
new file mode 100644
index 0000000000000..0553b475e11e3
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/barrier_regions.h
@@ -0,0 +1,378 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// Barrier regions, used by the WorkItemLoopsPass.
+
+#ifndef COMPILER_UTILS_BARRIER_REGIONS_H_INCLUDED
+#define COMPILER_UTILS_BARRIER_REGIONS_H_INCLUDED
+
+#include <compiler/utils/attributes.h>
+#include <llvm/ADT/DenseSet.h>
+#include <llvm/ADT/SetVector.h>
+#include <llvm/ADT/SmallPtrSet.h>
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/PassManager.h>
+#include <llvm/Transforms/Utils/ValueMapper.h>
+#include <multi_llvm/llvm_version.h>
+
+#include "pass_functions.h"
+
+namespace llvm {
+class BasicBlock;
+class CallInst;
+class DbgDeclareInst;
+class FenceInst;
+class Function;
+class Instruction;
+class Module;
+class StructType;
+class Type;
+class Value;
+}  // namespace llvm
+
+namespace compiler {
+namespace utils {
+
+enum { kBarrier_EndID = 0, kBarrier_FirstID, kBarrier_StartNewID };
+
+class Barrier;
+class BuiltinInfo;
+
+template <class T, size_t N>
+using OrderedSet =
+    llvm::SetVector<T, llvm::SmallVector<T, N>, llvm::SmallPtrSet<T, N>>;
+
+/// @brief Struct to store information about an inter-barrier region.
+struct BarrierRegion {
+  /// @brief the barrier id of this region
+  unsigned id = 0;
+  /// @brief the barrier call instruction for this region
+  llvm::Instruction *barrier_inst = nullptr;
+  /// @brief the entry block of this region
+  llvm::BasicBlock *entry = nullptr;
+
+  llvm::DenseSet<llvm::Value *> defs;
+  /// @brief barrier crossing uses that are defined in this region
+  OrderedSet<llvm::Value *, 16> uses_int;
+  /// @brief barrier crossing uses that are defined in another region
+  OrderedSet<llvm::Value *, 16> uses_ext;
+  /// @brief the blocks in this region
+  std::vector<llvm::BasicBlock *> blocks;
+  /// @brief the exit blocks of this region
+  llvm::SmallPtrSet<llvm::BasicBlock *, 4> barrier_blocks;
+  /// @brief the barrier ids of the successor regions
+  llvm::SmallVector<unsigned, 4> successor_ids;
+  /// @brief the work item execution schedule for this region
+  BarrierSchedule schedule = BarrierSchedule::Unordered;
+};
+
+using BarrierGraph = llvm::SmallVector<BarrierRegion, 8>;
+
+class Barrier {
+ public:
+  Barrier(llvm::Module &m, llvm::Function &f, bool IsDebug)
+      : live_var_mem_ty_(nullptr),
+        size_t_bytes(compiler::utils::getSizeTypeBytes(m)),
+        module_(m),
+        func_(f),
+        is_debug_(IsDebug),
+        max_live_var_alignment(0) {}
+
+  /// @brief perform the Barrier Region analysis and kernel splitting
+  void Run(llvm::ModuleAnalysisManager &mam);
+
+  /// @brief return whether the barrier struct needs to contain anything
+  bool hasLiveVars() const { return !whole_live_variables_set_.empty(); }
+
+  /// @brief returns the StructType of the barrier struct
+  llvm::StructType *getLiveVarsType() const { return live_var_mem_ty_; }
+
+  /// @brief returns the maximum alignment of the barrier struct
+  unsigned getLiveVarMaxAlignment() const { return max_live_var_alignment; }
+
+  /// @brief gets the split subkernel for the given barrier id (or nullptr)
+  llvm::Function *getSubkernel(unsigned id) const {
+    return kernel_id_map_.lookup(id);
+  }
+
+  /// @brief gets the number of regions/subkernels
+  size_t getNumSubkernels() const { return kernel_id_map_.size(); }
+
+  llvm::CallInst *getBarrierCall(unsigned id) const {
+    return llvm::dyn_cast_or_null<llvm::CallInst>(
+        barrier_graph[id - kBarrier_FirstID].barrier_inst);
+  }
+
+  /// @brief gets the size of the fixed sized part of the barrier struct
+  size_t getLiveVarMemSizeFixed() const { return live_var_mem_size_fixed; }
+
+  /// @brief gets the minimum size of the scalable part of the barrier struct
+  size_t getLiveVarMemSizeScalable() const {
+    return live_var_mem_size_scalable;
+  }
+
+  /// @brief gets the element index of the first scalable member of the barrier
+  /// struct
+  size_t getLiveVarMemScalablesIndex() const {
+    return live_var_mem_scalables_index;
+  }
+
+  /// @brief gets the barrier IDs of the successors of the given barrier region
+  const llvm::SmallVectorImpl<unsigned> &getSuccessorIds(unsigned id) const {
+    return barrier_graph[id - kBarrier_FirstID].successor_ids;
+  }
+
+  /// @brief gets the work item execution schedule of the given barrier region
+  BarrierSchedule getSchedule(unsigned id) const {
+    return barrier_graph[id - kBarrier_FirstID].schedule;
+  }
+
+  /// @brief replaces a subkernel with a given function
+  void replaceSubkernel(llvm::Function *from, llvm::Function *to);
+
+  /// @brief Type containing list of debug intrinsics and the source variable
+  /// byte offset in the live variables struct.
+  // TODO CA-1115 llvm.dbg.declare is being deprecated
+  using debug_intrinsics_t =
+      llvm::SmallVector<std::pair<llvm::DbgDeclareInst *, unsigned>, 4>;
+  const debug_intrinsics_t &getDebugIntrinsics() const {
+    return debug_intrinsics_;
+  }
+
+#if LLVM_VERSION_GREATER_EQUAL(19, 0)
+  using debug_variable_records_t =
+      llvm::SmallVector<std::pair<llvm::DbgVariableRecord *, unsigned>, 4>;
+  const debug_variable_records_t &getDebugDbgVariableRecords() const {
+    return debug_variable_records_;
+  }
+#endif
+
+  /// @brief gets the original function
+  llvm::Function &getFunc() { return func_; }
+  const llvm::Function &getFunc() const { return func_; }
+
+  /// @brief struct to help retrieval of values from the barrier struct
+  struct LiveValuesHelper {
+    const Barrier &barrier;
+    /// @brief A cache of queried live-values addresses (inside the live
+    /// variables struct), stored by the pair (value, member_idx).
+    llvm::DenseMap<std::pair<const llvm::Value *, unsigned>, llvm::Value *>
+        live_GEPs;
+    llvm::DenseMap<const llvm::Value *, llvm::Value *> reloads;
+    llvm::IRBuilder<> gepBuilder;
+    llvm::Value *barrier_struct = nullptr;
+    llvm::Value *vscale = nullptr;
+
+    LiveValuesHelper(const Barrier &b, llvm::Instruction *i, llvm::Value *s)
+        : barrier(b), gepBuilder(i), barrier_struct(s) {}
+
+    LiveValuesHelper(const Barrier &b, llvm::BasicBlock *bb, llvm::Value *s)
+        : barrier(b), gepBuilder(bb), barrier_struct(s) {}
+
+    /// @brief Return a GEP instruction pointing to the given value/idx pair in
+    /// the barrier struct.
+    ///
+    /// @return The GEP corresponding to the address of the value in the
+    /// struct, or nullptr if the value could not be found in the struct.
+    llvm::Value *getGEP(const llvm::Value *live, unsigned member_idx = 0);
+
+    /// @brief Return a GEP instruction corresponding to the address of
+    /// the given ExtractValueInst in the barriers struct.
+    ///
+    /// @return The GEP corresponding to the address of the value in the
+    /// struct, or nullptr if the value is not an ExtractValueInst.
+    llvm::Value *getExtractValueGEP(const llvm::Value *live);
+
+    /// @brief get a value reloaded from the barrier struct.
+    ///
+    /// @param[in] live the live value to retrieve from the barrier
+    /// @param[in] ir where to insert new instructions
+    /// @param[in] name a postfix to append to new value names
+    /// @param[in] reuse whether to generate the load for a given value only
+    /// once, returning the previously cached value on further requests.
+    llvm::Value *getReload(llvm::Value *live, llvm::IRBuilderBase &ir,
+                           const char *name, bool reuse = false);
+  };
+
+ private:
+  /// @brief The first is set for livein and the second is set for liveout
+  using live_in_out_t =
+      std::pair<llvm::DenseSet<llvm::Value *>, llvm::DenseSet<llvm::Value *>>;
+  /// @brief Type for memory allocation of live variables at all of barriers
+  using live_variable_mem_t = OrderedSet<llvm::Value *, 32>;
+  /// @brief Type for index of live variables on live variable information
+  /// Indexed by the pair (value, member_idx)
+  using live_variable_index_map_t =
+      llvm::DenseMap<std::pair<const llvm::Value *, unsigned>, unsigned>;
+  /// @brief Type for offsets of scalable live variables
+  /// Indexed by the pair (value, member_idx)
+  using live_variable_scalables_map_t = live_variable_index_map_t;
+  /// @brief Type for ids of barriers
+  using barrier_id_map_t = llvm::DenseMap<llvm::BasicBlock *, unsigned>;
+  /// @brief Type for ids of new kernel functions
+  using kernel_id_map_t = llvm::DenseMap<unsigned, llvm::Function *>;
+  /// @brief Type for map from ids to fence instructions
+  using fence_id_map_t = llvm::DenseMap<unsigned, llvm::FenceInst *>;
+  /// @brief Type for map from basic blocks to instructions, for barriers.
+  using barrier_block_inst_map_t =
+      llvm::DenseMap<llvm::BasicBlock *, llvm::Instruction *>;
+  /// @brief Type for set of basic blocks, for barriers.
+  using barrier_block_block_set_t = llvm::DenseSet<llvm::BasicBlock *>;
+  /// @brief Type between barrier id and stub call instructions. First
+  /// component of the pair is invoked before the barrier, the second after.
+  using debug_stub_map_t =
+      llvm::DenseMap<unsigned, std::pair<llvm::CallInst *, llvm::CallInst *>>;
+
+  /// @brief Keep whole live variables at all of barriers.
+  live_variable_mem_t whole_live_variables_set_;
+  /// @brief Keep index of live variables on live variable information.
+  live_variable_index_map_t live_variable_index_map_;
+  /// @brief Keep offsets of scalable live variables.
+  live_variable_scalables_map_t live_variable_scalables_map_;
+  /// @brief Keep ids of barriers.
+  barrier_id_map_t barrier_id_map_;
+  /// @brief Keep the new kernel functions, indexed by barrier id.
+  kernel_id_map_t kernel_id_map_;
+  /// @brief Keep struct types for live variables' memory layout.
+  llvm::StructType *live_var_mem_ty_;
+  /// @brief The total size of the non-scalable barrier struct
+  size_t live_var_mem_size_fixed = 0;
+  /// @brief The total unscaled size of the scalable barrier struct
+  size_t live_var_mem_size_scalable = 0;
+  /// @brief The index of the scalables buffer array in the barrier struct.
+  size_t live_var_mem_scalables_index = 0;
+  /// @brief Keep barriers.
+  llvm::SmallVector<llvm::CallInst *, 8> barriers_;
+  /// @brief Set of basic blocks that have a barrier as their successor
+  barrier_block_block_set_t barrier_successor_set_;
+  /// @brief Map between barrier ids and call instructions invoking stubs
+  debug_stub_map_t barrier_stub_call_map_;
+  /// @brief List of debug intrinsics and byte offsets into live variable struct
+  debug_intrinsics_t debug_intrinsics_;
+#if LLVM_VERSION_GREATER_EQUAL(19, 0)
+  /// @brief List of debug DbgVariableRecords and byte offsets into live
+  /// variable struct
+  debug_variable_records_t debug_variable_records_;
+#endif
+
+  size_t size_t_bytes;
+
+  BarrierGraph barrier_graph;
+
+  llvm::Module &module_;
+  llvm::Function &func_;
+
+  BuiltinInfo *bi_ = nullptr;
+
+  /// @brief Set to true if we want to debug the kernel. This involves adding
+  /// debug stub functions and an extra alloca to aid debugging.
+  const bool is_debug_;
+
+  /// @brief max alignment required for the live variables.
+  unsigned max_live_var_alignment;
+
+  /// @brief Find Barriers.
+  void FindBarriers();
+
+  /// @brief Split block with barrier.
+  void SplitBlockwithBarrier();
+
+  /// @brief Generate an empty kernel that only duplicates the source kernel's
+  /// CFG
+  ///
+  /// This is used to do a "dry run" of kernel splitting in order to obtain the
+  /// dominator tree, which is needed for correct identification of values that
+  /// cross the barrier.
+  ///
+  /// @param[in] region the region to clone into the new kernel.
+  /// @param[out] bbmap a mapping of original blocks onto the empty clones.
+  /// @return the fake kernel
+  llvm::Function *GenerateFakeKernel(
+      BarrierRegion &region,
+      llvm::DenseMap<llvm::BasicBlock *, llvm::BasicBlock *> &bbmap);
+
+  /// @brief Obtain a set of Basic Blocks for an inter-barrier region
+  ///
+  /// It traverses the CFG, following successors, until it hits a barrier,
+  /// building the region's internal data.
+  ///
+  /// @param[out] region the region to process
+  void GatherBarrierRegionBlocks(BarrierRegion &region);
+
+  /// @brief Obtain a set of Values used in a region that cross a barrier
+  ///
+  /// A value use crosses a barrier in the following cases:
+  /// * Its use is not in the same region as the definition
+  /// * Its definition does not dominate the use
+  ///
+  /// @param[in] region The inter-barrier region
+  /// @param[in] ignore set of values to ignore
+  void GatherBarrierRegionUses(BarrierRegion &region,
+                               llvm::DenseSet<llvm::Value *> &ignore);
+
+  /// @brief Find livein and liveout variables per each basic block.
+  void FindLiveVariables();
+
+  /// @brief Remove variables that are better recalculated than stored in the
+  ///        barrier, for instance casts and vector splats.
+  void TidyLiveVariables();
+
+  /// @brief Pad the field types to an alignment by adding an int array if
+  /// needed
+  /// @param field_tys The vector of types representing the final structure
+  /// @param offset The current offset in the structure
+  /// @param alignment The required alignment
+  /// @return The new offset (or original offset if no padding needed)
+  unsigned PadTypeToAlignment(llvm::SmallVectorImpl<llvm::Type *> &field_tys,
+                              unsigned offset, unsigned alignment);
+
+  /// @brief Make type for whole live variables.
+  void MakeLiveVariableMemType();
+
+  /// @brief Generate new kernel from an inter-barrier region such that no call
+  /// to barriers occur within it.
+  ///
+  /// @param[in] region the inter-barrier region to create the kernel from
+  /// @return the new kernel
+  llvm::Function *GenerateNewKernel(BarrierRegion &region);
+
+  /// @brief This function is a copy of llvm::CloneBasicBlock, with some code
+  /// added in order to update live variable information.
+  ///
+  /// @param[in] bb Basic block to copy.
+  /// @param[out] vmap Map for value for cloning.
+  /// @param[in] name_suffix Name for suffix.
+  /// @param[out] live_defs_info Live definitions' info for this basic block.
+  /// @param[in] F Current function.
+  ///
+  /// @return Return cloned basic block.
+  llvm::BasicBlock *CloneBasicBlock(llvm::BasicBlock *bb,
+                                    llvm::ValueToValueMapTy &vmap,
+                                    const llvm::Twine &name_suffix,
+                                    live_variable_mem_t &live_defs_info,
+                                    llvm::Function *F);
+
+  /// @brief Separate kernel function with barrier boundary.
+  void SeperateKernelWithBarrier();
+};
+
+}  // namespace utils
+}  // namespace compiler
+
+#endif  // COMPILER_UTILS_BARRIER_REGIONS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/builtin_info.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/builtin_info.h
new file mode 100644
index 0000000000000..e96b99073463c
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/builtin_info.h
@@ -0,0 +1,859 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Information about compiler builtins.
+
+#ifndef COMPILER_UTILS_BUILTIN_INFO_H_INCLUDED
+#define COMPILER_UTILS_BUILTIN_INFO_H_INCLUDED
+
+#include <compiler/utils/group_collective_helpers.h>
+#include <llvm/ADT/ArrayRef.h>
+#include <llvm/ADT/StringRef.h>
+#include <llvm/IR/ConstantRange.h>
+#include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/PassManager.h>
+#include <multi_llvm/vector_type_helper.h>
+
+#include <optional>
+
+namespace compiler {
+namespace utils {
+/// @addtogroup utils
+/// @{
+
+using BuiltinID = int32_t;
+
+enum BaseBuiltinID {
+  eBuiltinUnknown,
+  eBuiltinInvalid,
+
+  // Mux builtins
+  eMuxBuiltinIsFTZ,
+  eMuxBuiltinUseFast,
+  eMuxBuiltinIsEmbeddedProfile,
+  eMuxBuiltinGetGlobalSize,
+  eMuxBuiltinGetGlobalId,
+  eMuxBuiltinGetGlobalOffset,
+  eMuxBuiltinGetLocalSize,
+  eMuxBuiltinGetLocalId,
+  eMuxBuiltinSetLocalId,
+  eMuxBuiltinGetSubGroupId,
+  eMuxBuiltinSetSubGroupId,
+  eMuxBuiltinGetNumGroups,
+  eMuxBuiltinGetNumSubGroups,
+  eMuxBuiltinSetNumSubGroups,
+  eMuxBuiltinGetMaxSubGroupSize,
+  eMuxBuiltinSetMaxSubGroupSize,
+  eMuxBuiltinGetGroupId,
+  eMuxBuiltinGetWorkDim,
+  eMuxBuiltinDMARead1D,
+  eMuxBuiltinDMARead2D,
+  eMuxBuiltinDMARead3D,
+  eMuxBuiltinDMAWrite1D,
+  eMuxBuiltinDMAWrite2D,
+  eMuxBuiltinDMAWrite3D,
+  eMuxBuiltinDMAWait,
+  eMuxBuiltinGetGlobalLinearId,
+  eMuxBuiltinGetLocalLinearId,
+  eMuxBuiltinGetEnqueuedLocalSize,
+  eMuxBuiltinGetSubGroupSize,
+  eMuxBuiltinGetSubGroupLocalId,
+  // Synchronization builtins
+  eMuxBuiltinMemBarrier,
+  eMuxBuiltinSubGroupBarrier,
+  eMuxBuiltinWorkGroupBarrier,
+#define GROUP_BUILTINS(SCOPE)                                                  \
+  eFirstMux##SCOPE##groupCollectiveBuiltin,                                    \
+      eMuxBuiltin##SCOPE##groupAll = eFirstMux##SCOPE##groupCollectiveBuiltin, \
+      eMuxBuiltin##SCOPE##groupAny, eMuxBuiltin##SCOPE##groupBroadcast,        \
+      eMuxBuiltin##SCOPE##groupReduceAdd, eMuxBuiltin##SCOPE##groupReduceFAdd, \
+      eMuxBuiltin##SCOPE##groupReduceSMin,                                     \
+      eMuxBuiltin##SCOPE##groupReduceUMin,                                     \
+      eMuxBuiltin##SCOPE##groupReduceFMin,                                     \
+      eMuxBuiltin##SCOPE##groupReduceSMax,                                     \
+      eMuxBuiltin##SCOPE##groupReduceUMax,                                     \
+      eMuxBuiltin##SCOPE##groupReduceFMax, eMuxBuiltin##SCOPE##groupReduceMul, \
+      eMuxBuiltin##SCOPE##groupReduceFMul, eMuxBuiltin##SCOPE##groupReduceAnd, \
+      eMuxBuiltin##SCOPE##groupReduceOr, eMuxBuiltin##SCOPE##groupReduceXor,   \
+      eMuxBuiltin##SCOPE##groupReduceLogicalAnd,                               \
+      eMuxBuiltin##SCOPE##groupReduceLogicalOr,                                \
+      eMuxBuiltin##SCOPE##groupReduceLogicalXor,                               \
+      eMuxBuiltin##SCOPE##groupScanAddInclusive,                               \
+      eMuxBuiltin##SCOPE##groupScanFAddInclusive,                              \
+      eMuxBuiltin##SCOPE##groupScanAddExclusive,                               \
+      eMuxBuiltin##SCOPE##groupScanFAddExclusive,                              \
+      eMuxBuiltin##SCOPE##groupScanSMinInclusive,                              \
+      eMuxBuiltin##SCOPE##groupScanUMinInclusive,                              \
+      eMuxBuiltin##SCOPE##groupScanFMinInclusive,                              \
+      eMuxBuiltin##SCOPE##groupScanSMinExclusive,                              \
+      eMuxBuiltin##SCOPE##groupScanUMinExclusive,                              \
+      eMuxBuiltin##SCOPE##groupScanFMinExclusive,                              \
+      eMuxBuiltin##SCOPE##groupScanSMaxInclusive,                              \
+      eMuxBuiltin##SCOPE##groupScanUMaxInclusive,                              \
+      eMuxBuiltin##SCOPE##groupScanFMaxInclusive,                              \
+      eMuxBuiltin##SCOPE##groupScanSMaxExclusive,                              \
+      eMuxBuiltin##SCOPE##groupScanUMaxExclusive,                              \
+      eMuxBuiltin##SCOPE##groupScanFMaxExclusive,                              \
+      eMuxBuiltin##SCOPE##groupScanMulInclusive,                               \
+      eMuxBuiltin##SCOPE##groupScanFMulInclusive,                              \
+      eMuxBuiltin##SCOPE##groupScanMulExclusive,                               \
+      eMuxBuiltin##SCOPE##groupScanFMulExclusive,                              \
+      eMuxBuiltin##SCOPE##groupScanAndInclusive,                               \
+      eMuxBuiltin##SCOPE##groupScanAndExclusive,                               \
+      eMuxBuiltin##SCOPE##groupScanOrInclusive,                                \
+      eMuxBuiltin##SCOPE##groupScanOrExclusive,                                \
+      eMuxBuiltin##SCOPE##groupScanXorInclusive,                               \
+      eMuxBuiltin##SCOPE##groupScanXorExclusive,                               \
+      eMuxBuiltin##SCOPE##groupScanLogicalAndInclusive,                        \
+      eMuxBuiltin##SCOPE##groupScanLogicalAndExclusive,                        \
+      eMuxBuiltin##SCOPE##groupScanLogicalOrInclusive,                         \
+      eMuxBuiltin##SCOPE##groupScanLogicalOrExclusive,                         \
+      eMuxBuiltin##SCOPE##groupScanLogicalXorInclusive,                        \
+      eMuxBuiltin##SCOPE##groupScanLogicalXorExclusive
+  GROUP_BUILTINS(Work),
+  eLastMuxWorkgroupCollectiveBuiltin =
+      eMuxBuiltinWorkgroupScanLogicalXorExclusive,
+  GROUP_BUILTINS(Sub),
+  // Extra subgroup shuffle operations
+  eMuxBuiltinSubgroupShuffle,
+  eMuxBuiltinSubgroupShuffleUp,
+  eMuxBuiltinSubgroupShuffleDown,
+  eMuxBuiltinSubgroupShuffleXor,
+  eLastMuxSubgroupCollectiveBuiltin = eMuxBuiltinSubgroupShuffleXor,
+  GROUP_BUILTINS(Vec),
+  eLastMuxVecgroupCollectiveBuiltin =
+      eMuxBuiltinVecgroupScanLogicalXorExclusive,
+
+  // Marker - target builtins should start from here.
+  eFirstTargetBuiltin,
+};
+
+/// @brief Describes the uniformity of a builtin's return values. A uniform
+/// value is the same for all instances (e.g. SIMD lanes).
+enum BuiltinUniformity : int32_t {
+  /// @brief The uniformity of the builtin's return value cannot be determined.
+  eBuiltinUniformityUnknown,
+  /// @brief The builtin never returns uniform values.
+  eBuiltinUniformityNever,
+  /// @brief The builtin always returns uniform values.
+  eBuiltinUniformityAlways,
+  /// @brief The builtin returns uniform values if its inputs are uniform.
+  eBuiltinUniformityLikeInputs,
+  /// @brief The builtin returns a sequential instance ID value
+  /// (e.g. get_local_id in OpenCL).
+  eBuiltinUniformityInstanceID,
+  /// @brief The builtin might return a sequential instance ID value,
+  /// if its argument can be zero (e.g. get_local_id(x)).
+  eBuiltinUniformityMaybeInstanceID
+};
+
+/// @brief Describes certain properties of builtin functions that the vectorizer
+/// needs to know about.
+enum BuiltinProperties : int32_t {
+  /// @brief The builtin has no special property.
+  eBuiltinPropertyNone = 0,
+  /// @brief The builtin returns a value related to the geometry of the work
+  /// space, such as its dimension or an index into that dimension.
+  eBuiltinPropertyWorkItem = (1 << 0),
+  /// @brief The builtin can affect the execution flow (e.g. barrier).
+  eBuiltinPropertyExecutionFlow = (1 << 1),
+  /// @brief The builtin implements a reduction, that is, it takes vector
+  /// arguments and returns a scalar value.
+  eBuiltinPropertyReduction = (1 << 2),
+  /// @brief The builtin has known side-effects.
+  eBuiltinPropertySideEffects = (1 << 3),
+  /// @brief The builtin is known to have no runtime side-effects. This is
+  /// equivalent to 'readonly' or 'readnone' in IR. The return value depends
+  /// only on the values of the arguments.
+  eBuiltinPropertyNoSideEffects = (1 << 4),
+  /// @brief The builtin can be instantiated, even if it has side-effects.
+  /// Builtins with 'NoSideEffects' should not be instantiated unless they
+  /// also have this flag, because of the 'noduplicate' IR attribute.
+  eBuiltinPropertySupportsInstantiation = (1 << 5),
+  /// @brief The builtin has no vector equivalent. There may be functions that
+  /// have the same signature that a vector equivalent function would have,
+  /// but these functions should not be used for that purpose. This can also
+  /// mean that a vector builtin has no scalar equivalent.
+  eBuiltinPropertyNoVectorEquivalent = (1 << 6),
+  /// @brief The builtin has a vector equivalent. This is used for the LLVM
+  /// intrinsics, since for the OpenCL builtins we can determine that
+  /// programmatically. It can also mean that a builtin has a scalar equivalent.
+  eBuiltinPropertyVectorEquivalent = (1 << 7),
+  /// @brief The builtin can be emitted inline.
+  eBuiltinPropertyCanEmitInline = (1 << 8),
+  /// @brief The builtin returns a value through its pointer argument. The
+  /// returned type is equal to the function return type.
+  eBuiltinPropertyPointerReturnEqualRetTy = (1 << 9),
+  /// @brief The builtin wants to be inlined post vectorization
+  eBuiltinPropertyInlinePostVectorization = (1 << 10),
+  /// @brief The builtin returns a value through its pointer argument. The
+  /// returned value is an i32 scalar or vector, matching the function return
+  /// type: float -> i32, <4 x double> -> <4 x i32>, etc
+  eBuiltinPropertyPointerReturnEqualIntRetTy = (1 << 11),
+  /// @brief The builtin returns local work item ID.
+  eBuiltinPropertyLocalID = (1 << 12),
+  /// @brief The builtin is atomic
+  eBuiltinPropertyAtomic = (1 << 13),
+  /// @brief The builtin is rematerializable on the other side of a barrier
+  ///
+  /// The WorkItemLoopsPass queries this property to prune the number of live
+  /// variables that are stored and passed between barrier regions. Calls to
+  /// rematerializable builtins are removed from the live variable structure,
+  /// and are re-inserted into each barrier region that requires their results.
+  eBuiltinPropertyRematerializable = (1 << 14),
+  /// @brief The builtin should be lowered to a mux builtin.
+  ///
+  /// This mapping takes place in BuiltinInfo::lowerBuiltinToMuxBuiltin.
+  eBuiltinPropertyLowerToMuxBuiltin = (1 << 15),
+  /// @brief The builtin is known not to be convergent, i.e., it does not
+  /// depend on any other work-item in any way.
+  eBuiltinPropertyKnownNonConvergent = (1 << 16),
+};
+
+/// @brief struct to hold information about a builtin function
+struct Builtin {
+  /// @brief the builtin Function
+  const llvm::Function &function;
+  /// @brief ID for internal use
+  const BuiltinID ID;
+  /// @brief the Builtin Properties
+  const BuiltinProperties properties;
+  /// @brief list of types used in overloading this builtin (only relevant for
+  /// overloadable mux builtins)
+  std::vector<llvm::Type *> mux_overload_info = {};
+
+  /// @brief returns whether the builtin is valid
+  bool isValid() const { return ID != eBuiltinInvalid; }
+
+  /// @brief returns whether the builtin is unknown
+  bool isUnknown() const { return ID == eBuiltinUnknown; }
+};
+
+/// @brief struct to hold information about a builtin function call
+struct BuiltinCall : public Builtin {
+  /// @brief the call instruction
+  const llvm::CallInst &call;
+  /// @brief the uniformity of the builtin call
+  const BuiltinUniformity uniformity;
+
+  /// @brief Builds the call info from builtin B, call site CI and uniformity U.
+  BuiltinCall(const Builtin &B, const llvm::CallInst &CI, BuiltinUniformity U)
+      : Builtin(B), call(CI), uniformity(U) {}
+};
+
+namespace MuxBuiltins {
+constexpr const char isftz[] = "__mux_isftz";
+constexpr const char usefast[] = "__mux_usefast";
+constexpr const char isembeddedprofile[] = "__mux_isembeddedprofile";
+constexpr const char get_global_size[] = "__mux_get_global_size";
+constexpr const char get_global_id[] = "__mux_get_global_id";
+constexpr const char get_global_offset[] = "__mux_get_global_offset";
+constexpr const char get_local_size[] = "__mux_get_local_size";
+constexpr const char get_local_id[] = "__mux_get_local_id";
+constexpr const char get_sub_group_id[] = "__mux_get_sub_group_id";
+constexpr const char get_num_groups[] = "__mux_get_num_groups";
+constexpr const char get_num_sub_groups[] = "__mux_get_num_sub_groups";
+constexpr const char get_max_sub_group_size[] = "__mux_get_max_sub_group_size";
+constexpr const char get_group_id[] = "__mux_get_group_id";
+constexpr const char get_work_dim[] = "__mux_get_work_dim";
+constexpr const char dma_read_1d[] = "__mux_dma_read_1D";
+constexpr const char dma_read_2d[] = "__mux_dma_read_2D";
+constexpr const char dma_read_3d[] = "__mux_dma_read_3D";
+constexpr const char dma_write_1d[] = "__mux_dma_write_1D";
+constexpr const char dma_write_2d[] = "__mux_dma_write_2D";
+constexpr const char dma_write_3d[] = "__mux_dma_write_3D";
+constexpr const char dma_wait[] = "__mux_dma_wait";
+constexpr const char get_global_linear_id[] = "__mux_get_global_linear_id";
+constexpr const char get_local_linear_id[] = "__mux_get_local_linear_id";
+constexpr const char get_enqueued_local_size[] =
+    "__mux_get_enqueued_local_size";
+constexpr const char get_sub_group_size[] = "__mux_get_sub_group_size";
+constexpr const char get_sub_group_local_id[] = "__mux_get_sub_group_local_id";
+
+// Barriers
+constexpr const char mem_barrier[] = "__mux_mem_barrier";
+constexpr const char sub_group_barrier[] = "__mux_sub_group_barrier";
+constexpr const char work_group_barrier[] = "__mux_work_group_barrier";
+
+// DMA Event Type
+constexpr const char dma_event_type[] = "__mux_dma_event_t";
+
+// Internal Mux Functions
+constexpr const char set_local_id[] = "__mux_set_local_id";
+constexpr const char set_sub_group_id[] = "__mux_set_sub_group_id";
+constexpr const char set_num_sub_groups[] = "__mux_set_num_sub_groups";
+constexpr const char set_max_sub_group_size[] = "__mux_set_max_sub_group_size";
+}  // namespace MuxBuiltins
+
+static inline llvm::Type *getPointerReturnPointeeTy(const llvm::Function &F,
+                                                    BuiltinProperties Props) {
+  if (Props & eBuiltinPropertyPointerReturnEqualRetTy) {
+    return F.getReturnType();  // pointee type equals the return type
+  }
+  if (Props & eBuiltinPropertyPointerReturnEqualIntRetTy) {
+    llvm::Type *I32Ty = llvm::IntegerType::getInt32Ty(F.getContext());
+    if (auto *VTy = llvm::dyn_cast<llvm::VectorType>(F.getReturnType())) {
+      return llvm::VectorType::get(I32Ty,  // same element count
+                                   multi_llvm::getVectorElementCount(VTy));
+    }
+    return I32Ty;  // non-vector return type -> plain i32
+  }
+  return nullptr;  // neither pointer-return property is set
+}
+
+/// @brief Describes how builtins should be materialized.
+enum BuiltinMatFlags : int32_t {
+  /// @brief Use default materialization options.
+  eBuiltinMatDefault = 0,
+  /// @brief The body of the builtin should be materialized.
+  eBuiltinMatDefinition = (1 << 0)
+};
+
+class BIMuxInfoConcept;
+class BILangInfoConcept;
+
+/// @brief A class that encapsulates information and transformations concerning
+/// compiler builtin functions.
+///
+/// It provides methods for querying data about builtin functions, methods for
+/// emitting bodies of builtins "inline", and methods for materializing
+/// builtins from an external source.
+///
+/// It contains a BIMuxInfoConcept implementation to provide mux builtin
+/// information on a target-by-target basis.
+///
+/// It contains an optional BILangInfoConcept implementation to provide builtin
+/// information on a target-by-target basis.
+class BuiltinInfo {
+ public:
+  // Default-construct a BuiltinInfo without a concrete set of language-level
+  // builtins.
+  BuiltinInfo() : MuxImpl(std::make_unique<BIMuxInfoConcept>()) {}
+
+  BuiltinInfo(std::unique_ptr<BILangInfoConcept> &&LangImpl)
+      : MuxImpl(std::make_unique<BIMuxInfoConcept>()),
+        LangImpl(std::move(LangImpl)) {}
+
+  BuiltinInfo(std::unique_ptr<BIMuxInfoConcept> &&MuxImpl,
+              std::unique_ptr<BILangInfoConcept> &&LangImpl)
+      : MuxImpl(std::move(MuxImpl)), LangImpl(std::move(LangImpl)) {}
+
+  BuiltinInfo(BuiltinInfo &&) = default;
+  BuiltinInfo &operator=(BuiltinInfo &&RHS) = default;
+
+  /// @brief Retrieves the optional module containing builtin definitions.
+  llvm::Module *getBuiltinsModule();
+
+  /// @brief Determine general properties for the given builtin function.
+  /// @param[in] F Function to analyze.
+  /// @return Analyzed properties for the builtin.
+  Builtin analyzeBuiltin(const llvm::Function &F) const;
+
+  /// @brief Determine general properties for the given builtin function.
+  /// @param[in] CI Call instruction to analyze.
+  /// @return Analyzed properties for the builtin call.
+  BuiltinCall analyzeBuiltinCall(const llvm::CallInst &CI,
+                                 unsigned SimdDimIdx) const;
+
+  /// @brief Try to find a builtin function that is a vector equivalent of the
+  /// given function with the given vector width, if it exists.
+  /// @param[in] B Builtin to query for a vector equivalent.
+  /// @param[in] Width Vector width.
+  /// @param[in] M Optional module where the vector equivalent should be
+  /// declared.
+  /// @return Equivalent vector builtin function on success.
+  llvm::Function *getVectorEquivalent(const Builtin &B, unsigned Width,
+                                      llvm::Module *M = nullptr);
+
+  /// @brief Try to find a builtin function that is a scalar equivalent of the
+  /// given function, if it exists.
+  /// @param[in] B Builtin to query for a scalar equivalent.
+  /// @param[in] M Optional module where the scalar equivalent should be
+  /// declared.
+  /// @return Equivalent scalar builtin function on success.
+  llvm::Function *getScalarEquivalent(const Builtin &B, llvm::Module *M);
+
+  /// @brief Emit an inline implementation of the given builtin function.
+  /// @param[in] Builtin Builtin function to emit an implementation for.
+  /// @param[in] B Insertion point for the implementation.
+  /// @param[in] Args Arguments to the builtin function.
+  /// @return A value that implements the builtin function or null.
+  llvm::Value *emitBuiltinInline(llvm::Function *Builtin, llvm::IRBuilder<> &B,
+                                 llvm::ArrayRef<llvm::Value *> Args);
+
+  /// @brief Return a known range of values this call may return.
+  /// @param[in] CI Call instruction to analyze.
+  /// @param[in] MaxLocalSizes The maximum local work-group sizes in each of
+  /// the 3 dimensions that this target supports.
+  /// @param[in] MaxGlobalSizes The maximum global work-group sizes in each of
+  /// the 3 dimensions that this target supports.
+  std::optional<llvm::ConstantRange> getBuiltinRange(
+      llvm::CallInst &CI, std::array<std::optional<uint64_t>, 3> MaxLocalSizes,
+      std::array<std::optional<uint64_t>, 3> MaxGlobalSizes) const;
+
+  /// @brief Lowers a call to a language-level builtin to an instruction
+  /// sequence calling a mux builtin.
+  ///
+  /// For a call to a builtin for which the property
+  /// eBuiltinPropertyLowerToMuxBuiltin is set, the target must then re-express
+  /// the call to a new sequence, usually involving mux builtins.
+  llvm::Instruction *lowerBuiltinToMuxBuiltin(llvm::CallInst &CI);
+
+  /// @brief Get a builtin for printf.
+  /// @return An identifier for the builtin, or the invalid builtin if there
+  /// is none. This builtin should have a signature of `<void type | integer
+  /// type> <builtin name>(<char*>, ...)`.
+  BuiltinID getPrintfBuiltin() const;
+
+  /// @brief Returns true if the given ID is a ComputeMux builtin ID.
+  static bool isMuxBuiltinID(BuiltinID ID) {
+    return ID > eBuiltinInvalid && ID < eFirstTargetBuiltin;
+  }
+
+  /// @brief Returns true if the given ID is an overloadable ComputeMux builtin
+  /// ID.
+  ///
+  /// These builtins *require* extra overloading info when declaring or
+  /// defining.
+  static bool isOverloadableMuxBuiltinID(BuiltinID ID);
+
+  /// @brief Returns true if the given ID is a ComputeMux barrier builtin ID.
+  static bool isMuxControlBarrierID(BuiltinID ID) {
+    return ID == eMuxBuiltinSubGroupBarrier ||
+           ID == eMuxBuiltinWorkGroupBarrier;
+  }
+
+  /// @brief Returns true if the given ID is a ComputeMux DMA builtin ID.
+  static bool isMuxDmaBuiltinID(BuiltinID ID) {
+    return ID == eMuxBuiltinDMAWait || ID == eMuxBuiltinDMARead1D ||
+           ID == eMuxBuiltinDMARead2D || ID == eMuxBuiltinDMARead3D ||
+           ID == eMuxBuiltinDMAWrite1D || ID == eMuxBuiltinDMAWrite2D ||
+           ID == eMuxBuiltinDMAWrite3D;
+  }
+
+  /// @brief Gets information about a mux group operation builtin
+  static std::optional<GroupCollective> isMuxGroupCollective(BuiltinID ID);
+
+  /// @brief Returns the mux builtin ID matching the group collective, or
+  /// eBuiltinInvalid.
+  static BuiltinID getMuxGroupCollective(const GroupCollective &Group);
+
+  /// @brief Returns true if the mux builtin has a barrier ID as its first
+  /// operand.
+  static bool isMuxBuiltinWithBarrierID(BuiltinID ID) {
+    if (isMuxControlBarrierID(ID)) {
+      return true;
+    }
+    auto Info = isMuxGroupCollective(ID);
+    return Info && Info->isWorkGroupScope();
+  }
+
+  /// @brief Returns true if the mux builtin has a barrier ID as its first
+  /// operand, and applies at Work Group scope.
+  static bool isMuxBuiltinWithWGBarrierID(BuiltinID ID) {
+    if (ID == eMuxBuiltinWorkGroupBarrier) {
+      return true;
+    }
+    auto Info = isMuxGroupCollective(ID);
+    return Info && Info->isWorkGroupScope();
+  }
+
+  /// @brief Maps a ComputeMux builtin ID to its function name.
+  ///
+  /// @param OverloadInfo An array of types required to resolve certain
+  /// overloadable builtins, e.g., group builtins.
+  static std::string getMuxBuiltinName(
+      BuiltinID ID, llvm::ArrayRef<llvm::Type *> OverloadInfo = {});
+
+  /// @brief Mangles a type using the LLVM intrinsic scheme
+  ///
+  /// This is an extremely simple mangling scheme matching LLVM's intrinsic
+  /// mangling system. It is only designed to be used with a specific set of
+  /// types and is not a general-purpose mangler.
+  ///
+  /// * iXXX -> iXXX
+  /// * half -> f16
+  /// * float -> f32
+  /// * double -> f64
+  /// * <N x Ty> -> vNTy
+  /// * <vscale x N x Ty> -> nxvNTy
+  static std::string getMangledTypeStr(llvm::Type *Ty);
+
+  /// @brief Demangles a type using the LLVM intrinsic scheme - returns nullptr
+  /// if it was unable to demangle a type.
+  ///
+  /// @see getMangledTypeStr
+  static std::pair<llvm::Type *, llvm::StringRef> getDemangledTypeFromStr(
+      llvm::StringRef TyStr, llvm::LLVMContext &Ctx);
+
+  /// @brief Defines the body of a ComputeMux builtin declaration
+  ///
+  /// If the Module already has a function definition with the corresponding
+  /// function name, it is left alone and returned.
+  ///
+  /// Will declare any builtins it requires as transitive dependencies.
+  ///
+  /// @param OverloadInfo An array of types required to resolve certain
+  /// overloadable builtins, e.g., group builtins.
+  llvm::Function *defineMuxBuiltin(
+      BuiltinID, llvm::Module &M,
+      llvm::ArrayRef<llvm::Type *> OverloadInfo = {});
+
+  /// @brief Gets a ComputeMux builtin from the module, or declares it
+  ///
+  /// @param OverloadInfo An array of types required to resolve certain
+  /// overloadable builtins, e.g., group builtins.
+  llvm::Function *getOrDeclareMuxBuiltin(
+      BuiltinID, llvm::Module &M,
+      llvm::ArrayRef<llvm::Type *> OverloadInfo = {});
+
+  struct SchedParamInfo {
+    /// @brief An identifier providing resolution for targets to identify
+    /// specific scheduling parameters.
+    ///
+    /// By default, will be the index into the list returned by
+    /// getMuxSchedulingParameters.
+    unsigned ID;
+    /// @brief The parameter type
+    llvm::Type *ParamTy;
+    /// @brief A (possibly empty) set of parameter attributes to apply to all
+    /// functions featuring this parameter.
+    llvm::AttributeSet ParamAttrs;
+    /// @brief The name of the parameter, to aid debugging. May be empty.
+    std::string ParamName;
+    /// @brief A human-readable name to be emitted in !mux-scheduling-params
+    std::string ParamDebugName;
+    /// @brief True if the parameter is passed externally by the driver to the
+    /// kernel entry point, else false if this parameter is initialized by the
+    /// kernel at the top level.
+    ///
+    /// This provides an interface to passes such as AddKernelWrapperPass.
+    ///
+    /// If true, the parameter is passed through every layer of kernels. If
+    /// false, the parameter must be initialized by
+    /// initializeSchedulingParamForWrappedKernel.
+    bool PassedExternally;
+    /// @brief An optional type to aid targets in remembering the underlying
+    /// parameter type, if the parameter is a pointer.
+    llvm::Type *ParamPointeeTy = nullptr;
+    /// @brief An optional value specifying the concrete function argument.
+    llvm::Argument *ArgVal = nullptr;
+  };
+
+  /// @brief Returns a target-specific list of scheduling parameters to be
+  /// applied to all builtins for which requiresSchedulingParameters returns
+  /// true.
+  ///
+  /// This list of parameters dictates the order of parameters added to each
+  /// builtin. As such it must be constant and immutable for each Module.
+  ///
+  /// This list is emitted into the module as metadata by the
+  /// AddSchedulingParametersPass for user reference.
+  ///
+  /// This function does not have to fill in SchedParamInfo::ArgVal, as this
+  /// query is not specific to one function.
+  llvm::SmallVector<SchedParamInfo, 4> getMuxSchedulingParameters(
+      llvm::Module &);
+
+  /// @brief Returns target-specific scheduling parameters from a concrete
+  /// function.
+  ///
+  /// Uses metadata returned via
+  /// compiler::utils::getSchedulingParameterFunctionMetadata to determine
+  /// whether the function contains scheduling parameters.
+  ///
+  /// If set, this function should return the same result as
+  /// getMuxSchedulingParameters, but with SchedParamInfo::ArgVal filled in to
+  /// correspond to the actual concrete llvm::Argument values of the given
+  /// function. Note that not all ArgVals are guaranteed to be populated, as a
+  /// function may contain only a subset of the target's list of scheduling
+  /// parameters.
+  ///
+  /// If not set, this function returns an empty list.
+  llvm::SmallVector<SchedParamInfo, 4> getFunctionSchedulingParameters(
+      llvm::Function &);
+
+  /// @brief Responsible for initializing a scheduling parameter for which
+  /// PassedExternally is 'false'.
+  ///
+  /// This is conceptually used to initialize scheduling parameters which are
+  /// used for scheduling "internally" and do not make up the driver-facing
+  /// kernel ABI.
+  ///
+  /// @param Info The SchedParamInfo dictating which kind of scheduling
+  /// parameter to initialize.
+  /// @param B An IRBuilder providing the insertion point at which to insert
+  /// initialization instructions.
+  /// @param IntoF The function into which initialization instructions are to be
+  /// inserted.
+  /// @param CalleeF The function for which the initialization is taking place.
+  /// CalleeF will be called by IntoF.
+  llvm::Value *initializeSchedulingParamForWrappedKernel(
+      const SchedParamInfo &Info, llvm::IRBuilder<> &B, llvm::Function &IntoF,
+      llvm::Function &CalleeF);
+
+  /// @brief Returns true if the builtin ID requires extra scheduling
+  /// parameters to function.
+  ///
+  /// This function only handles mux builtins, and does not defer to any of
+  /// BuiltinInfo's implementation instances.
+  ///
+  /// These parameters will be added to the function (and its callers) by
+  /// the AddSchedulingParametersPass.
+  bool requiresSchedulingParameters(BuiltinID ID);
+
+  /// @brief Returns the remapped type for a target extension type
+  ///
+  /// This method is intended for target implementations to be able to signal to
+  /// the DefineTargetExtTysPass how LLVM's target extension types should be
+  /// remapped across the module. There is a default implementation: see
+  /// BIMuxInfoConcept::getRemappedTargetExtTy
+  ///
+  /// This method is safe to call before LLVM 17 but will do nothing (there are
+  /// no target extension types before LLVM 17). Otherwise this method asserts
+  /// that the type is a target extension type.
+  ///
+  /// @param Ty The target extension type to remap
+  /// @param M The Module in which to replace the type
+  /// @return The remapped type, or nullptr if the type does not require
+  /// remapping
+  llvm::Type *getRemappedTargetExtTy(llvm::Type *Ty, llvm::Module &M);
+
+  /// Handle the invalidation of this information.
+  ///
+  /// When used as a result of BuiltinInfoAnalysis this method will be called
+  /// when the module this was computed for changes. When it returns false,
+  /// the information is preserved across those changes.
+  bool invalidate(llvm::Module &, const llvm::PreservedAnalyses &,
+                  llvm::ModuleAnalysisManager::Invalidator &) {
+    return false;
+  }
+
+ private:
+  /// @brief Try to identify a builtin function.
+  /// @param[in] F The function to identify.
+  /// @return Valid builtin ID if the name was identified, as well as any types
+  /// required to overload the builtin ID.
+  std::pair<BuiltinID, std::vector<llvm::Type *>> identifyMuxBuiltin(
+      const llvm::Function &F) const;
+
+  /// @brief Determine whether the given builtin function returns uniform values
+  /// or not. An optional call instruction can be passed for more accuracy.
+  /// @param[in] B the builtin to analyze uniformity.
+  /// @param[in] CI Optional argument list from a call instruction.
+  /// @param[in] SimdDimIdx Index of current vectorization dimension.
+  /// @return Uniformity value for the builtin.
+  BuiltinUniformity isBuiltinUniform(const Builtin &B, const llvm::CallInst *CI,
+                                     unsigned SimdDimIdx) const;
+
+  std::unique_ptr<BIMuxInfoConcept> MuxImpl;
+  std::unique_ptr<BILangInfoConcept> LangImpl;
+};
+
+/// @brief An interface class that provides mux- and target-specific
+/// information and transformations to an instance of BuiltinInfo. All methods
+/// are to be called through from the equivalent methods in BuiltinInfo.
+class BIMuxInfoConcept {
+ public:
+  virtual ~BIMuxInfoConcept() = default;
+
+  /// @brief See BuiltinInfo::defineMuxBuiltin.
+  virtual llvm::Function *defineMuxBuiltin(
+      BuiltinID, llvm::Module &M,
+      llvm::ArrayRef<llvm::Type *> OverloadInfo = {});
+
+  /// @brief See BuiltinInfo::getOrDeclareMuxBuiltin.
+  virtual llvm::Function *getOrDeclareMuxBuiltin(
+      BuiltinID, llvm::Module &M,
+      llvm::ArrayRef<llvm::Type *> OverloadInfo = {});
+
+  /// @brief See BuiltinInfo::getMuxSchedulingParameters
+  virtual llvm::SmallVector<BuiltinInfo::SchedParamInfo, 4>
+  getMuxSchedulingParameters(llvm::Module &);
+
+  /// @brief See BuiltinInfo::getFunctionSchedulingParameters
+  virtual llvm::SmallVector<BuiltinInfo::SchedParamInfo, 4>
+  getFunctionSchedulingParameters(llvm::Function &);
+
+  /// @brief See BuiltinInfo::initializeSchedulingParamForWrappedKernel
+  virtual llvm::Value *initializeSchedulingParamForWrappedKernel(
+      const BuiltinInfo::SchedParamInfo &Info, llvm::IRBuilder<> &B,
+      llvm::Function &IntoF, llvm::Function &CalleeF);
+
+  /// @brief Sets default builtin attributes on the given function.
+  static void setDefaultBuiltinAttributes(llvm::Function &F,
+                                          bool AlwaysInline = true);
+
+  /// @brief Returns true if the mux builtin requires scheduling parameters to
+  /// function.
+  virtual bool requiresSchedulingParameters(BuiltinID);
+
+  /// @brief See BuiltinInfo::getRemappedTargetExtTy
+  ///
+  /// This method is overridable but the default implementation provides the
+  /// following mappings:
+  ///   * spirv.Event -> i32
+  ///   * spirv.Sampler -> i32
+  ///   * spirv.Image -> MuxImage* (regardless of image parameters)
+  virtual llvm::Type *getRemappedTargetExtTy(llvm::Type *Ty, llvm::Module &M);
+
+  /// @see BuiltinInfo::getBuiltinRange
+  virtual std::optional<llvm::ConstantRange> getBuiltinRange(
+      llvm::CallInst &, BuiltinID ID, std::array<std::optional<uint64_t>, 3>,
+      std::array<std::optional<uint64_t>, 3>) const;
+
+  enum MemScope : uint32_t {
+    MemScopeCrossDevice = 0,
+    MemScopeDevice = 1,
+    MemScopeWorkGroup = 2,
+    MemScopeSubGroup = 3,
+    MemScopeWorkItem = 4,
+  };
+
+  enum MemSemantics : uint32_t {
+    // Only set one of the following bits at a time:
+    MemSemanticsRelaxed = 0x0,
+    MemSemanticsAcquire = 0x2,
+    MemSemanticsRelease = 0x4,
+    MemSemanticsAcquireRelease = 0x8,
+    MemSemanticsSequentiallyConsistent = 0x10,
+    MemSemanticsMask = 0x1F,  // Covers all the ordering bits above.
+    // What kind of memory is controlled by a barrier
+    MemSemanticsSubGroupMemory = 0x80,
+    MemSemanticsWorkGroupMemory = 0x100,
+    MemSemanticsCrossWorkGroupMemory = 0x200,
+  };
+
+ protected:
+  llvm::Function *defineGetGlobalId(llvm::Module &M);
+  llvm::Function *defineGetGlobalSize(llvm::Module &M);
+  llvm::Function *defineGetLocalLinearId(llvm::Module &M);
+  llvm::Function *defineGetGlobalLinearId(llvm::Module &M);
+  llvm::Function *defineGetEnqueuedLocalSize(llvm::Module &M);
+  llvm::Function *defineMemBarrier(llvm::Function &F, unsigned ScopeIdx,
+                                   unsigned SemanticsIdx);
+  llvm::Function *defineGetSubGroupSize(llvm::Function &F);
+  llvm::Function *defineGetSubGroupLocalId(llvm::Function &F);
+  /// @brief Provides a default implementation for `__mux_dma_read_1D` and
+  /// `__mux_dma_write_1D`.
+  ///
+  /// These routines are not intended to be efficient for a
+  /// particular architecture and are really a placeholder for customers until
+  /// they are ready to define these functions with DMA calls. They are
+  /// essentially a memcpy.
+  llvm::Function *defineDMA1D(llvm::Function &F);
+  /// @brief Provides a default implementation for `__mux_dma_read_2D`
+  /// and `__mux_dma_write_2D`.
+  ///
+  /// These routines are not intended to be efficient for a
+  /// particular architecture and are really a placeholder for customers until
+  /// they are ready to define these functions with DMA calls. They are
+  /// essentially a memcpy.
+  llvm::Function *defineDMA2D(llvm::Function &F);
+  /// @brief Provides a default implementation for `__mux_dma_read_3D`
+  /// and `__mux_dma_write_3D`.
+  ///
+  /// These routines are not intended to be efficient for a
+  /// particular architecture and are really a placeholder for customers until
+  /// they are ready to define these functions with DMA calls. They are
+  /// essentially a memcpy.
+  llvm::Function *defineDMA3D(llvm::Function &F);
+  /// @brief Provides a default implementation for `__mux_dma_wait`.
+  ///
+  /// This routine is not intended to be efficient for a
+  /// particular architecture and is really a placeholder for customers until
+  /// they are ready to define this function with DMA calls. This
+  /// implementation does nothing and simply returns.
+  llvm::Function *defineDMAWait(llvm::Function &F);
+};
+
+/// @brief An interface class that provides language-specific information and
+/// transformations to an instance of BuiltinInfo. All methods are to be called
+/// through from the equivalent methods in BuiltinInfo.
+class BILangInfoConcept {
+ public:
+  virtual ~BILangInfoConcept() = default;
+
+  /// @see BuiltinInfo::getBuiltinsModule. Returns nullptr unless overridden.
+  virtual llvm::Module *getBuiltinsModule() { return nullptr; }
+  /// @see BuiltinInfo::analyzeBuiltin
+  virtual Builtin analyzeBuiltin(const llvm::Function &F) const = 0;
+  /// @see BuiltinInfo::isBuiltinUniform
+  virtual BuiltinUniformity isBuiltinUniform(const Builtin &B,
+                                             const llvm::CallInst *,
+                                             unsigned) const = 0;
+  /// @see BuiltinInfo::getVectorEquivalent
+  virtual llvm::Function *getVectorEquivalent(const Builtin &B, unsigned Width,
+                                              llvm::Module *M = nullptr) = 0;
+  /// @see BuiltinInfo::getScalarEquivalent
+  virtual llvm::Function *getScalarEquivalent(const Builtin &B,
+                                              llvm::Module *M) = 0;
+  /// @see BuiltinInfo::emitBuiltinInline
+  virtual llvm::Value *emitBuiltinInline(
+      llvm::Function *Builtin, llvm::IRBuilder<> &B,
+      llvm::ArrayRef<llvm::Value *> Args) = 0;
+  /// @see BuiltinInfo::getBuiltinRange. Returns std::nullopt unless overridden.
+  virtual std::optional<llvm::ConstantRange> getBuiltinRange(
+      llvm::CallInst &, std::array<std::optional<uint64_t>, 3>,
+      std::array<std::optional<uint64_t>, 3>) const {
+    return std::nullopt;
+  }
+
+  /// @see BuiltinInfo::lowerBuiltinToMuxBuiltin. Returns nullptr by default.
+  virtual llvm::Instruction *lowerBuiltinToMuxBuiltin(llvm::CallInst &,
+                                                      BIMuxInfoConcept &) {
+    return nullptr;
+  }
+  /// @see BuiltinInfo::getPrintfBuiltin
+  virtual BuiltinID getPrintfBuiltin() const = 0;
+};
+
+/// @brief Caches and returns the BuiltinInfo for a Module.
+class BuiltinInfoAnalysis
+    : public llvm::AnalysisInfoMixin<BuiltinInfoAnalysis> {
+  friend AnalysisInfoMixin<BuiltinInfoAnalysis>;
+
+ public:
+  using Result = BuiltinInfo;
+  using CallbackFn = std::function<Result(const llvm::Module &)>;
+
+  BuiltinInfoAnalysis();
+
+  BuiltinInfoAnalysis(CallbackFn BICallback) : BICallback(BICallback) {}
+
+  /// @brief Produce the BuiltinInfo for the module by invoking the callback.
+  Result run(llvm::Module &M, llvm::ModuleAnalysisManager &) {
+    return BICallback(M);
+  }
+
+  /// @brief Return the name of the pass.
+  static llvm::StringRef name() { return "BuiltinInfo analysis"; }
+
+ private:
+  /// @brief Unique pass identifier.
+  static llvm::AnalysisKey Key;
+
+  /// @brief Callback function producing a BuiltinInfo on demand.
+  CallbackFn BICallback;
+};
+
+/// @}
+}  // namespace utils
+}  // namespace compiler
+
+#endif  // COMPILER_UTILS_BUILTIN_INFO_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/cl_builtin_info.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/cl_builtin_info.h
new file mode 100644
index 0000000000000..7c80403f7d35c
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/cl_builtin_info.h
@@ -0,0 +1,216 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief OpenCL's BuiltinInfo implementation.
+
+#ifndef COMPILER_UTILS_CL_BUILTIN_INFO_H_INCLUDED
+#define COMPILER_UTILS_CL_BUILTIN_INFO_H_INCLUDED
+
+#include <compiler/utils/builtin_info.h>
+#include <compiler/utils/mangling.h>
+
+namespace compiler {
+namespace utils {
+/// @addtogroup utils
+/// @{
+
+/// @brief Convenience function for constructing a CLBuiltinInfo as a unique_ptr
+/// @param[in] builtins The builtins module.
+/// @return A new CLBuiltinInfo, held through its BILangInfoConcept base class.
+std::unique_ptr<BILangInfoConcept> createCLBuiltinInfo(llvm::Module *builtins);
+
+/// @brief Builtin loader base class.
+class CLBuiltinLoader {
+ protected:
+  CLBuiltinLoader() = default;
+
+ public:
+  virtual ~CLBuiltinLoader() = default;
+
+  /// @brief Load a builtin function.
+  /// @param[in] BuiltinName Name of the builtin function to materialize.
+  /// @param[in] DestM Optional module in which to load the builtin function.
+  /// @param[in] Flags Materialization flags to use.
+  /// @return Pointer to the materialized builtin function on success.
+  /// If a module is passed, the returned builtin function must live in
+  /// that module.
+  virtual llvm::Function *materializeBuiltin(llvm::StringRef BuiltinName,
+                                             llvm::Module *DestM,
+                                             BuiltinMatFlags Flags);
+
+  /// @brief Expose the builtins module, if any (nullptr in this base class).
+  virtual llvm::Module *getBuiltinsModule() { return nullptr; }
+};
+
+/// @brief Simple Builtin loader wrapping a given builtins module.
+class SimpleCLBuiltinLoader final : public CLBuiltinLoader {
+ public:
+  SimpleCLBuiltinLoader(llvm::Module *builtins) : BuiltinModule(builtins) {}
+
+  ~SimpleCLBuiltinLoader() = default;
+
+  /// @brief Expose the builtins module wrapped by this loader.
+  virtual llvm::Module *getBuiltinsModule() override { return BuiltinModule; }
+
+ private:
+  /// @brief Builtins module passed to the constructor; held as a raw pointer.
+  llvm::Module *BuiltinModule;
+};
+
+/// @brief A class that encapsulates information and transformations concerning
+/// compiler OpenCL builtin functions.
+class CLBuiltinInfo : public BILangInfoConcept {
+ public:
+  /// @brief Constructs a CLBuiltinInfo from a given Builtins module
+  CLBuiltinInfo(llvm::Module *Builtins);
+
+  /// @brief Constructs a CLBuiltinInfo with a user-provided loader
+  CLBuiltinInfo(std::unique_ptr<CLBuiltinLoader> L) : Loader(std::move(L)) {}
+
+  ~CLBuiltinInfo();
+
+  llvm::Module *getBuiltinsModule() override;
+
+  /// @see BuiltinInfo::isBuiltinUniform
+  BuiltinUniformity isBuiltinUniform(const Builtin &B, const llvm::CallInst *CI,
+                                     unsigned SimdDimIdx) const override;
+
+  /// @see BuiltinInfo::analyzeBuiltin
+  Builtin analyzeBuiltin(const llvm::Function &F) const override;
+  /// @see BuiltinInfo::getVectorEquivalent
+  llvm::Function *getVectorEquivalent(const Builtin &B, unsigned Width,
+                                      llvm::Module *M = nullptr) override;
+  /// @see BuiltinInfo::getScalarEquivalent
+  llvm::Function *getScalarEquivalent(const Builtin &B,
+                                      llvm::Module *M) override;
+  /// @see BuiltinInfo::emitBuiltinInline
+  llvm::Value *emitBuiltinInline(llvm::Function *Builtin, llvm::IRBuilder<> &B,
+                                 llvm::ArrayRef<llvm::Value *> Args) override;
+
+  /// @see BuiltinInfo::lowerBuiltinToMuxBuiltin
+  llvm::Instruction *lowerBuiltinToMuxBuiltin(llvm::CallInst &,
+                                              BIMuxInfoConcept &) override;
+  /// @see BuiltinInfo::getPrintfBuiltin
+  BuiltinID getPrintfBuiltin() const override;
+
+ private:
+  BuiltinID identifyBuiltin(const llvm::Function &) const;
+
+  llvm::Function *materializeBuiltin(
+      llvm::StringRef BuiltinName, llvm::Module *DestM = nullptr,
+      BuiltinMatFlags Flags = eBuiltinMatDefault);
+
+  llvm::Instruction *lowerGroupBuiltinToMuxBuiltin(llvm::CallInst &CI,
+                                                   BuiltinID ID,
+                                                   BIMuxInfoConcept &BIMuxImpl);
+  llvm::Instruction *lowerAsyncBuiltinToMuxBuiltin(llvm::CallInst &CI,
+                                                   BuiltinID ID,
+                                                   BIMuxInfoConcept &BIMuxImpl);
+
+  llvm::Value *emitBuiltinInline(BuiltinID ID, llvm::IRBuilder<> &B,
+                                 llvm::ArrayRef<llvm::Value *> Args);
+  llvm::Value *emitBuiltinInlineAsLLVMBinaryIntrinsic(llvm::IRBuilder<> &B,
+                                                      llvm::Value *LHS,
+                                                      llvm::Value *RHS,
+                                                      llvm::Intrinsic::ID ID);
+  // 6.2 Conversions & Type Casting
+  llvm::Value *emitBuiltinInlineAs(llvm::Function *F, llvm::IRBuilder<> &B,
+                                   llvm::ArrayRef<llvm::Value *> Args);
+  llvm::Value *emitBuiltinInlineConvert(llvm::Function *F, BuiltinID ID,
+                                        llvm::IRBuilder<> &B,
+                                        llvm::ArrayRef<llvm::Value *> Args);
+
+  // 6.11.5 Geometric Built-in Functions
+  llvm::Value *emitBuiltinInlineGeometrics(BuiltinID builtinID,
+                                           llvm::IRBuilder<> &B,
+                                           llvm::ArrayRef<llvm::Value *> Args);
+  llvm::Value *emitBuiltinInlineDot(llvm::IRBuilder<> &B,
+                                    llvm::ArrayRef<llvm::Value *> Args);
+  llvm::Value *emitBuiltinInlineCross(llvm::IRBuilder<> &B,
+                                      llvm::ArrayRef<llvm::Value *> Args);
+  llvm::Value *emitBuiltinInlineLength(llvm::IRBuilder<> &B,
+                                       llvm::ArrayRef<llvm::Value *> Args);
+  llvm::Value *emitBuiltinInlineNormalize(llvm::IRBuilder<> &B,
+                                          llvm::ArrayRef<llvm::Value *> Args);
+
+  // 6.11.6 Relational Built-in Functions
+  llvm::Value *emitBuiltinInlineRelationalsWithTwoArguments(
+      BuiltinID BuiltinID, llvm::IRBuilder<> &B,
+      llvm::ArrayRef<llvm::Value *> Args);
+  llvm::Value *emitBuiltinInlineRelationalsWithOneArgument(BuiltinID BuiltinID,
+                                                           llvm::IRBuilder<> &B,
+                                                           llvm::Value *Arg);
+  llvm::Value *emitBuiltinInlineAll(llvm::IRBuilder<> &B,
+                                    llvm::ArrayRef<llvm::Value *> Args);
+  llvm::Value *emitBuiltinInlineAny(llvm::IRBuilder<> &B,
+                                    llvm::ArrayRef<llvm::Value *> Args);
+  llvm::Value *emitBuiltinInlineSelect(llvm::Function *F, llvm::IRBuilder<> &B,
+                                       llvm::ArrayRef<llvm::Value *> Args);
+
+  // 6.11.7 Vector Data Load/Store Functions
+  llvm::Value *emitBuiltinInlineVLoad(llvm::Function *F, unsigned Width,
+                                      llvm::IRBuilder<> &B,
+                                      llvm::ArrayRef<llvm::Value *> Args);
+  llvm::Value *emitBuiltinInlineVStore(llvm::Function *F, unsigned Width,
+                                       llvm::IRBuilder<> &B,
+                                       llvm::ArrayRef<llvm::Value *> Args);
+  llvm::Value *emitBuiltinInlineVLoadHalf(llvm::Function *F,
+                                          llvm::IRBuilder<> &B,
+                                          llvm::ArrayRef<llvm::Value *> Args);
+  llvm::Value *emitBuiltinInlineVStoreHalf(llvm::Function *F,
+                                           llvm::StringRef Mode,
+                                           llvm::IRBuilder<> &B,
+                                           llvm::ArrayRef<llvm::Value *> Args);
+
+  // 6.11.12 Miscellaneous Vector Functions
+  llvm::Value *emitBuiltinInlineShuffle(BuiltinID BuiltinID,
+                                        llvm::IRBuilder<> &B,
+                                        llvm::ArrayRef<llvm::Value *> Args);
+
+  llvm::Value *emitBuiltinInlinePrintf(BuiltinID BuiltinID,
+                                       llvm::IRBuilder<> &B,
+                                       llvm::ArrayRef<llvm::Value *> Args);
+
+  /// @brief Return the name of the builtin with the given identifier.
+  /// @param[in] ID Identifier of the builtin to return the name.
+  /// @return Name of the builtin.
+  llvm::StringRef getBuiltinName(BuiltinID ID) const;
+
+  /// @brief Declare the specified OpenCL builtin in the given module.
+  /// @param[in] M Module in which to declare the builtin.
+  /// @param[in] ID Builtin identifier.
+  /// @param[in] RetTy Return type for the builtin.
+  /// @param[in] ArgTys List of argument types.
+  /// @param[in] ArgQuals List of argument qualifiers.
+  /// @param[in] Suffix Optional builtin name suffix.
+  /// @return Builtin function declaration.
+  llvm::Function *declareBuiltin(llvm::Module *M, BuiltinID ID,
+                                 llvm::Type *RetTy,
+                                 llvm::ArrayRef<llvm::Type *> ArgTys,
+                                 llvm::ArrayRef<TypeQualifiers> ArgQuals,
+                                 llvm::Twine Suffix = "");
+
+  /// @brief BuiltinLoader used to load builtins.
+  std::unique_ptr<CLBuiltinLoader> Loader;
+};
+
+/// @}
+}  // namespace utils
+}  // namespace compiler
+
+#endif  // COMPILER_UTILS_CL_BUILTIN_INFO_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/define_mux_builtins_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/define_mux_builtins_pass.h
new file mode 100644
index 0000000000000..e1e74ec666a8a
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/define_mux_builtins_pass.h
@@ -0,0 +1,36 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+
+#ifndef COMPILER_UTILS_DEFINE_MUX_BUILTINS_PASS_H_INCLUDED
+#define COMPILER_UTILS_DEFINE_MUX_BUILTINS_PASS_H_INCLUDED
+
+#include <llvm/IR/PassManager.h>
+
+namespace compiler {
+namespace utils {
+/// @brief Module pass which (going by its name) defines the mux builtins.
+class DefineMuxBuiltinsPass final
+    : public llvm::PassInfoMixin<DefineMuxBuiltinsPass> {
+ public:
+  llvm::PreservedAnalyses run(llvm::Module &, llvm::ModuleAnalysisManager &);
+};
+
+}  // namespace utils
+}  // namespace compiler
+
+#endif  // COMPILER_UTILS_DEFINE_MUX_BUILTINS_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/device_info.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/device_info.h
new file mode 100644
index 0000000000000..177ae0c99b4df
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/device_info.h
@@ -0,0 +1,127 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Device information (capabilities and limits) for compiler passes.
+
+#ifndef COMPILER_UTILS_DEVICE_INFO_H_INCLUDED
+#define COMPILER_UTILS_DEVICE_INFO_H_INCLUDED
+
+#include <llvm/IR/PassManager.h>
+
+#include <optional>
+
+namespace compiler {
+namespace utils {
+
+/// @brief Bitfield of all possible floating-point capabilities.
+///
+/// Each Mux device struct has a member which denotes the floating-point
+/// capabilities of that device, as a bitfield of the following enum.
+///
+/// NOTE: Must be kept in sync with mux_floating_point_capabilities_e in
+/// mux/include/mux/mux.h! This should probably be placed in an intermediary
+/// mux/compiler library and shared as part of CA-4236.
+enum device_floating_point_capabilities_e {
+  /// @brief Denormals supported.
+  device_floating_point_capabilities_denorm = 0x1,
+  /// @brief INF and NaN are supported.
+  device_floating_point_capabilities_inf_nan = 0x2,
+  /// @brief Round to nearest even supported.
+  device_floating_point_capabilities_rte = 0x4,
+  /// @brief Round to zero supported.
+  device_floating_point_capabilities_rtz = 0x8,
+  /// @brief Round to positive infinity supported.
+  device_floating_point_capabilities_rtp = 0x10,
+  /// @brief Round to negative infinity supported.
+  device_floating_point_capabilities_rtn = 0x20,
+  /// @brief Fused multiply add supported.
+  device_floating_point_capabilities_fma = 0x40,
+  /// @brief Floating-point operations are written in software.
+  device_floating_point_capabilities_soft = 0x80,
+  /// @brief Binary format conforms to the IEEE-754 specification.
+  device_floating_point_capabilities_full = 0x100
+};
+
+struct DeviceInfo {
+  DeviceInfo() = default;
+
+  /// @brief Construct a DeviceInfo from individual properties
+  ///
+  /// @param h Enumeration of half-precision floating-point capabilities
+  /// @param f Enumeration of single-precision floating-point capabilities
+  /// @param d Enumeration of double-precision floating-point capabilities
+  /// @param max_work_width  The maximum number of work-items of a work-group
+  /// allowed to execute in one invocation of a kernel.
+  DeviceInfo(uint32_t h, uint32_t f, uint32_t d, uint32_t max_work_width)
+      : half_capabilities(h),
+        float_capabilities(f),
+        double_capabilities(d),
+        max_work_width(max_work_width) {}
+
+  uint32_t half_capabilities = 0;  ///< Half-precision FP capabilities.
+  uint32_t float_capabilities = 0;  ///< Single-precision FP capabilities.
+  uint32_t double_capabilities = 0;  ///< Double-precision FP capabilities.
+  uint32_t max_work_width = 0;  ///< Max work-items per kernel invocation.
+
+  /// @brief List of supported 'required' sub-group sizes reported by this
+  /// device.
+  ///
+  /// These are only the sub-group sizes that can be requested as 'required' for
+  /// a kernel; the compiler may produce a wide range of other sub-group sizes
+  /// on undecorated kernels, assuming sub-groups are supported by the device.
+  std::vector<uint32_t> reqd_sub_group_sizes;
+
+  /// @brief Handle invalidation events from the new pass manager.
+  ///
+  /// @return false, as this analysis can never be invalidated.
+  bool invalidate(llvm::Module &, const llvm::PreservedAnalyses &,
+                  llvm::ModuleAnalysisManager::Invalidator &) {
+    return false;
+  }
+};
+
+/// @brief Caches and returns the device information for a Module.
+class DeviceInfoAnalysis : public llvm::AnalysisInfoMixin<DeviceInfoAnalysis> {
+  friend AnalysisInfoMixin<DeviceInfoAnalysis>;
+
+ public:
+  using Result = DeviceInfo;
+
+  DeviceInfoAnalysis() = default;
+  DeviceInfoAnalysis(Result res) : Info(res) {}
+
+  /// @brief Return the stored DeviceInfo, or a default one if none was set.
+  Result run(llvm::Module &, llvm::ModuleAnalysisManager &) {
+    return Info ? *Info : Result();
+  }
+
+  /// @brief Return the name of the pass.
+  static llvm::StringRef name() { return "Device info analysis"; }
+
+ private:
+  /// @brief Device information supplied at construction, if any.
+  std::optional<Result> Info;
+
+  /// @brief Unique pass identifier.
+  static llvm::AnalysisKey Key;
+};
+
+}  // namespace utils
+}  // namespace compiler
+
+#endif  // COMPILER_UTILS_DEVICE_INFO_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/dma.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/dma.h
new file mode 100644
index 0000000000000..2dfb8121b891a
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/dma.h
@@ -0,0 +1,91 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// LLVM DMA pass utility functions.
+
+#ifndef COMPILER_UTILS_DMA_H_INCLUDED
+#define COMPILER_UTILS_DMA_H_INCLUDED
+
+#include <llvm/ADT/Twine.h>
+#include <llvm/IR/IRBuilder.h>
+
+#include <functional>
+
+namespace llvm {
+class BasicBlock;
+class Module;
+class Value;
+}  // namespace llvm
+
+namespace compiler {
+namespace utils {
+
+class BIMuxInfoConcept;
+
+/// @addtogroup utils
+/// @{
+
+/// @brief Helper function to check the local ID of the current thread.
+///
+/// @param[in] bb Basic block to generate the check in.
+/// @param[in] x The local id in the x dimension to compare against.
+/// @param[in] y The local id in the y dimension to compare against.
+/// @param[in] z The local id in the z dimension to compare against.
+/// @param[in] GetLocalIDFn Function used to get the local work-item ID.
+///
+/// @return A true Value if the local ID equals that passed via the index
+/// arguments, false otherwise.
+llvm::Value *isThreadEQ(llvm::BasicBlock *bb, unsigned x, unsigned y,
+                        unsigned z, llvm::Function &GetLocalIDFn);
+
+/// @brief Helper function to check if the local ID of the current thread is {0,
+/// 0, 0}.
+///
+/// @param[in] bb Basic block to generate the check in.
+/// @param[in] GetLocalIDFn Function used to get the local work-item ID.
+///
+/// @return A true Value if the local ID is {0, 0, 0}, false otherwise.
+llvm::Value *isThreadZero(llvm::BasicBlock *bb, llvm::Function &GetLocalIDFn);
+
+/// @brief Insert 'thread-checking' logic in the entry block, so that control
+/// branches to the 'true' block when the current work-item is the first in the
+/// work-group (i.e. ID zero in all dimensions) or to the 'false' block for
+/// other work-items.
+///
+/// @param[in] entryBlock Block to insert the 'thread-checking' logic
+/// @param[in] trueBlock Block to execute only on the first work-item
+/// @param[in] falseBlock Block to execute on all other work-items
+/// @param[in] GetLocalIDFn Function used to get the local work-item ID
+void buildThreadCheck(llvm::BasicBlock *entryBlock, llvm::BasicBlock *trueBlock,
+                      llvm::BasicBlock *falseBlock,
+                      llvm::Function &GetLocalIDFn);
+
+/// @brief Gets or creates the __mux_dma_event_t type.
+///
+/// This type may be declared by other passes, hence we "get or create" it.
+///
+/// @param[in] m LLVM Module to get or create the type in.
+///
+/// @return The opaque struct declaration of the __mux_dma_event_t type.
+llvm::StructType *getOrCreateMuxDMAEventType(llvm::Module &m);
+
+/// @}
+}  // namespace utils
+}  // namespace compiler
+
+#endif  // COMPILER_UTILS_DMA_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/encode_kernel_metadata_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/encode_kernel_metadata_pass.h
new file mode 100644
index 0000000000000..c01cb00528d85
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/encode_kernel_metadata_pass.h
@@ -0,0 +1,60 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// TransferKernelMetadataPass and EncodeKernelMetadataPass declarations.
+
+#ifndef COMPILER_UTILS_ENCODE_KERNEL_METADATA_PASS_H_INCLUDED
+#define COMPILER_UTILS_ENCODE_KERNEL_METADATA_PASS_H_INCLUDED
+
+#include <llvm/ADT/StringRef.h>
+#include <llvm/IR/PassManager.h>
+
+#include <optional>
+
+namespace compiler {
+namespace utils {
+
+/// @brief Sets up the per-function mux metadata used by later passes.
+/// Transfers per-module !opencl.kernel metadata to mux kernel metadata.
+struct TransferKernelMetadataPass
+    : public llvm::PassInfoMixin<TransferKernelMetadataPass> {
+  explicit TransferKernelMetadataPass() {}
+
+  llvm::PreservedAnalyses run(llvm::Module &M, llvm::ModuleAnalysisManager &AM);
+};
+
+struct EncodeKernelMetadataPassOptions {
+  std::string KernelName;
+  std::optional<std::array<uint64_t, 3>> LocalSizes = std::nullopt;
+};
+
+struct EncodeKernelMetadataPass
+    : public llvm::PassInfoMixin<EncodeKernelMetadataPass> {
+  EncodeKernelMetadataPass(EncodeKernelMetadataPassOptions Options)
+      : KernelName(Options.KernelName), LocalSizes(Options.LocalSizes) {}
+
+  llvm::PreservedAnalyses run(llvm::Module &M, llvm::ModuleAnalysisManager &AM);
+
+ private:
+  std::string KernelName;
+  std::optional<std::array<uint64_t, 3>> LocalSizes;
+};
+}  // namespace utils
+}  // namespace compiler
+
+#endif  // COMPILER_UTILS_ENCODE_KERNEL_METADATA_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/group_collective_helpers.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/group_collective_helpers.h
new file mode 100644
index 0000000000000..2ef9ed9907720
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/group_collective_helpers.h
@@ -0,0 +1,112 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// Helper functions for working with sub_group and work_group functions.
+
+#ifndef COMPILER_UTILS_GROUP_COLLECTIVE_HELPERS_H_INCLUDED
+#define COMPILER_UTILS_GROUP_COLLECTIVE_HELPERS_H_INCLUDED
+
+#include <llvm/Analysis/IVDescriptors.h>
+
+namespace llvm {
+class Constant;
+class Function;
+class Type;
+}  // namespace llvm
+
+namespace compiler {
+namespace utils {
+/// @brief Utility function for retrieving the neutral value of a
+/// reduction/scan operation. A neutral value is one that does not affect the
+/// result of a given operation, e.g., adding 0 or multiplying by 1.
+///
+/// @param[in] Kind The kind of scan/reduction operation
+/// @param[in] Ty The type of the returned neutral value. Must match the type
+/// assumed by @a Kind, e.g., a floating-point type for floating-point
+/// operations.
+///
+/// @return The neutral value, or nullptr if unhandled.
+llvm::Constant *getNeutralVal(llvm::RecurKind Kind, llvm::Type *Ty);
+
+/// @brief Utility function for retrieving the identity value of a
+/// reduction/scan operation. The identity value is one that is expected to be
+/// found in the first element of an exclusive scan. It is equal to the neutral
+/// value (see @ref getNeutralVal) in all cases except in floating-point
+/// min/max, where -INF/+INF is the expected identity and in floating-point
+/// addition, where 0.0 (not -0.0 which is the neutral value) is the expected
+/// identity.
+///
+/// @param[in] Kind The kind of scan/reduction operation
+/// @param[in] Ty The type of the returned identity value. Must match the type
+/// assumed by @a Kind, e.g., a floating-point type for floating-point
+/// operations.
+///
+/// @return The identity value, or nullptr if unhandled.
+llvm::Constant *getIdentityVal(llvm::RecurKind Kind, llvm::Type *Ty);
+
+/// @brief Represents a work-group or sub-group collective operation.
+struct GroupCollective {
+  /// @brief The different operation types a group collective can represent.
+  enum class OpKind {
+    All,
+    Any,
+    Reduction,
+    ScanInclusive,
+    ScanExclusive,
+    Broadcast,
+    Shuffle,
+    ShuffleUp,
+    ShuffleDown,
+    ShuffleXor,
+  };
+
+  /// @brief The possible scopes of a group collective.
+  enum class ScopeKind { WorkGroup, SubGroup, VectorGroup };
+
+  /// @brief The operation type of the group collective.
+  OpKind Op = OpKind::All;
+  /// @brief The scope of the group collective operation.
+  ScopeKind Scope = ScopeKind::WorkGroup;
+  /// @brief The LLVM recurrence kind this maps to; None for broadcasts.
+  llvm::RecurKind Recurrence = llvm::RecurKind::None;
+  /// @brief True if the operation is logical, rather than bitwise.
+  bool IsLogical = false;
+  /// @brief Returns true for Any/All type collective operations.
+  bool isAnyAll() const { return Op == OpKind::Any || Op == OpKind::All; }
+  /// @brief Returns true for inclusive/exclusive scan collective operations.
+  bool isScan() const {
+    return Op == OpKind::ScanExclusive || Op == OpKind::ScanInclusive;
+  }
+  /// @brief Returns true for reduction collective operations.
+  bool isReduction() const { return Op == OpKind::Reduction; }
+  /// @brief Returns true for broadcast collective operations.
+  bool isBroadcast() const { return Op == OpKind::Broadcast; }
+  /// @brief Returns true for any of the shuffle collective operations.
+  bool isShuffleLike() const {
+    return Op == OpKind::Shuffle || Op == OpKind::ShuffleUp ||
+           Op == OpKind::ShuffleDown || Op == OpKind::ShuffleXor;
+  }
+  /// @brief Returns true for sub-group collective operations.
+  bool isSubGroupScope() const { return Scope == ScopeKind::SubGroup; }
+  /// @brief Returns true for work-group collective operations.
+  bool isWorkGroupScope() const { return Scope == ScopeKind::WorkGroup; }
+};
+}  // namespace utils
+}  // namespace compiler
+
+#endif  // COMPILER_UTILS_GROUP_COLLECTIVE_HELPERS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/mangling.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/mangling.h
new file mode 100644
index 0000000000000..cec95aeb0bc55
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/mangling.h
@@ -0,0 +1,408 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Simple function mangling framework.
+
+#ifndef COMPILER_UTILS_MANGLING_H_INCLUDED
+#define COMPILER_UTILS_MANGLING_H_INCLUDED
+
+#include <llvm/ADT/ArrayRef.h>
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/ADT/StringRef.h>
+
+#include <optional>
+
+namespace llvm {
+class LLVMContext;
+class Type;
+class raw_ostream;
+}  // namespace llvm
+
+namespace compiler {
+namespace utils {
+/// @brief Describes type qualifiers, which are aspects that need to be taken
+/// into account when mangling function names. Such aspects are not represented
+/// in the LLVM type. This is why such qualifiers need to be used along types.
+enum TypeQualifier : int32_t {
+  /// @brief The type has no special qualifier.
+  eTypeQualNone = 0,
+  /// @brief The type is a signed integer.
+  eTypeQualSignedInt = 1,
+  /// @brief The type is a constant pointer.
+  eTypeQualPointerConst = 2,
+  /// @brief The type is a volatile pointer.
+  eTypeQualPointerVolatile = 4,
+  /// @brief The type is a restrict pointer.
+  eTypeQualPointerRestrict = 8
+};
+
+/// @brief Contains a small hierarchical list of TypeQualifier.
+///
+/// This hierarchy maps to derived types such as pointers or vectors:
+/// * First qualifier for the pointer type.
+/// * Second qualifier for the pointed-to type.
+class TypeQualifiers final {
+  using StorageT = uint64_t;
+
+ public:
+  /// @brief Create a type qualifier list with no qualifiers.
+  TypeQualifiers();
+  /// @brief Create a type qualifier list with one qualifier.
+  ///
+  /// @param[in] Qual First qualifier.
+  TypeQualifiers(TypeQualifier Qual);
+  /// @brief Create a type qualifier list with two qualifiers.
+  ///
+  /// @param[in] Qual1 First qualifier.
+  /// @param[in] Qual2 Second qualifier.
+  TypeQualifiers(TypeQualifier Qual1, TypeQualifier Qual2);
+
+  /// @brief Create a type qualifier list with one qualifier.
+  /// @note Convenience function that allows bit manipulation of qualifiers.
+  ///
+  /// @param[in] Qual First qualifier.
+  TypeQualifiers(unsigned Qual);
+  /// @brief Create a type qualifier list with two qualifiers.
+  /// @note Convenience function that allows bit manipulation of qualifiers.
+  ///
+  /// @param[in] Qual1 First qualifier.
+  /// @param[in] Qual2 Second qualifier.
+  TypeQualifiers(unsigned Qual1, unsigned Qual2);
+
+  /// @brief Number of type qualifiers contained in the list.
+  StorageT getCount() const;
+
+  /// @brief Top-most qualifier from the list.
+  TypeQualifier front() const;
+
+  /// @brief Remove the top-most qualifier from the list and returns it.
+  TypeQualifier pop_front();
+
+  /// @brief Return the qualifier at the given index.
+  TypeQualifier at(unsigned Idx) const;
+
+  /// @brief Add a qualifier to the list, making it bottom-most.
+  ///
+  /// @param[in] Qual Qualifier to add to the list.
+  ///
+  /// @return true if there was enough space to add the qualifier, or false.
+  bool push_back(TypeQualifier Qual);
+  /// @brief Add a qualifier to the list, making it bottom-most.
+  /// @note Convenience function that allows bit manipulation of qualifiers.
+  ///
+  /// @param[in] Qual Qualifier to add to the list.
+  ///
+  /// @return true if there was enough space to add the qualifier, or false.
+  bool push_back(unsigned Qual);
+  /// @brief Add qualifiers to the end of the list.
+  ///
+  /// @param[in] Quals Qualifiers to add to the list.
+  ///
+  /// @return true if there was enough space to add the qualifiers, or false.
+  bool push_back(TypeQualifiers Quals);
+
+  /// @brief Equality comparison of two qualifier lists (!= is its negation).
+  bool operator==(const TypeQualifiers &other) const {
+    return storage_ == other.storage_;
+  }
+  bool operator!=(const TypeQualifiers &other) const {
+    return !(*this == other);
+  }
+
+ private:
+  /// @brief Set the number of type qualifiers contained in the list.
+  void setCount(StorageT newCount);
+
+  /// @brief Bits that make up the list. Deliberately small to pass by value.
+  StorageT storage_;
+
+  /// @brief Number of bits used to encode the size of the list.
+  const static unsigned NumCountBits = 4;
+
+  /// @brief Number of bits used to encode one qualifier in the list.
+  const static unsigned NumQualBits = 10;
+
+  /// @brief Number of bits that can be used to store the list.
+  const static unsigned NumStorageBits = sizeof(StorageT) * 8;
+
+  /// @brief Maximum size of the list.
+  const static unsigned MaxSize = (NumStorageBits - NumCountBits) / NumQualBits;
+
+  static_assert(MaxSize < (1 << NumCountBits) - 1, "MaxSize cannot be encoded");
+};
+
+/// @brief Helps with light parsing such as demangling function names.
+class Lexer final {
+ public:
+  /// @brief Create a new lexer with the given text.
+  ///
+  /// @param[in] text Text to lex.
+  Lexer(llvm::StringRef text);
+
+  /// @brief Number of characters left to lex.
+  unsigned Left() const;
+  /// @brief Current lexing position in the text.
+  unsigned CurrentPos() const;
+  /// @brief String containing the text remaining to be lexed.
+  llvm::StringRef TextLeft() const;
+  /// @brief Current character.
+  /// @return Character or negative value if no text is left.
+  int Current() const;
+
+  /// @brief Consume one character, advancing to the next character in the
+  /// string.
+  /// @return true if a character was consumed, false if no text left.
+  bool Consume();
+  /// @brief Consume several characters, advancing through the string.
+  ///
+  /// @param[in] Size Number of characters to consume.
+  ///
+  /// @return true if Size characters were consumed, false otherwise.
+  bool Consume(unsigned Size);
+  /// @brief Consume a string, and skip past it.
+  ///
+  /// @param[in] Pattern String to consume.
+  ///
+  /// @return true if Pattern was found and consumed, false otherwise.
+  bool Consume(llvm::StringRef Pattern);
+  /// @brief Consume an unsigned integer, and skip past it.
+  ///
+  /// @param[out] Result Consumed unsigned integer.
+  ///
+  /// @return true if an unsigned integer was consumed, false otherwise.
+  bool ConsumeInteger(unsigned &Result);
+  /// @brief Consume a signed integer, and skip past it.
+  ///
+  /// @param[out] Result Consumed signed integer.
+  ///
+  /// @return true if a signed integer was consumed, false otherwise.
+  bool ConsumeSignedInteger(int &Result);
+  /// @brief Consume consecutive alphabetic characters and skip past them.
+  ///
+  /// @param[out] Result Consumed string.
+  ///
+  /// @return true if an alphabetic string was consumed, false otherwise.
+  bool ConsumeAlpha(llvm::StringRef &Result);
+  /// @brief Consume consecutive alphanumeric characters and skip past them.
+  ///
+  /// @param[out] Result Consumed string.
+  ///
+  /// @return true if an alphanumeric string was consumed, false otherwise.
+  bool ConsumeAlphanumeric(llvm::StringRef &Result);
+  /// @brief Consume all characters until C is found. C is not consumed.
+  ///
+  /// @param[in] C Delimiter character.
+  /// @param[out] Result Consumed string.
+  ///
+  /// @return true if C was found, false otherwise.
+  bool ConsumeUntil(char C, llvm::StringRef &Result);
+  /// @brief Consume all whitespace characters
+  ///
+  /// @return true if any whitespace was consumed or false otherwise
+  bool ConsumeWhitespace();
+
+ private:
+  /// @brief Text to lex.
+  llvm::StringRef Text;
+  /// @brief Current lexing position into the text.
+  unsigned Pos;
+};
+
+/// @brief Converts between mangled and non-mangled function names.
+class NameMangler final {
+ public:
+  /// @brief Create a new name mangler.
+  ///
+  /// @param[in] context LLVM context to use.
+  NameMangler(llvm::LLVMContext *context);
+
+  /// @brief Determine the mangled name of a function.
+  ///
+  /// @param[in] Name Non-mangled name of the function.
+  /// @param[in] Tys List of types, one for each function argument.
+  /// @param[in] Quals Qualifiers, one for each type in Tys.
+  ///
+  /// @return The mangled name of the function.
+  std::string mangleName(llvm::StringRef Name, llvm::ArrayRef<llvm::Type *> Tys,
+                         llvm::ArrayRef<TypeQualifiers> Quals);
+
+  /// @brief Try to mangle the given qualified type.
+  ///
+  /// @param[in] O Output stream to write the mangled name to.
+  /// @param[in] Type Type to mangle.
+  /// @param[in] Quals Type qualifiers.
+  ///
+  /// @return true if the type name could be mangled.
+  bool mangleType(llvm::raw_ostream &O, llvm::Type *Type, TypeQualifiers Quals);
+
+  /// @brief Try to mangle the given qualified type, taking substitutions into
+  /// account.
+  ///
+  /// @param[in] O Output stream to write the mangled name to.
+  /// @param[in] Type Type to mangle.
+  /// @param[in] Quals Type qualifiers.
+  /// @param[in] PrevTys Previously mangled types.
+  /// @param[in] PrevQuals Qualifiers for previously mangled types.
+  ///
+  /// @return true if the type name could be mangled.
+  bool mangleType(llvm::raw_ostream &O, llvm::Type *Type, TypeQualifiers Quals,
+                  llvm::ArrayRef<llvm::Type *> PrevTys,
+                  llvm::ArrayRef<TypeQualifiers> PrevQuals);
+
+  /// @brief Remove the mangling of a function name, retrieving argument types
+  ///        and qualifiers in the process.
+  ///
+  /// @param[in] Name Mangled function name to demangle.
+  /// @param[out] Types Vector that will receive LLVM types for the arguments.
+  /// @param[out] Quals Vector that will receive type qualifiers for the
+  /// arguments.
+  ///
+  /// @return Demangled name or an empty string on failure
+  llvm::StringRef demangleName(llvm::StringRef Name,
+                               llvm::SmallVectorImpl<llvm::Type *> &Types,
+                               llvm::SmallVectorImpl<TypeQualifiers> &Quals);
+
+  /// @brief Remove the mangling of a function name, retrieving argument types
+  ///        and qualifiers in the process.
+  ///
+  /// @param[in] Name Mangled function name to demangle.
+  /// @param[out] Types Vector that will receive LLVM types for the arguments.
+  /// @param[out] PointerElementTypes Vector that will receive LLVM types for
+  /// the *first level* of pointer element types.
+  /// @param[out] Quals Vector that will receive type qualifiers for the
+  /// arguments.
+  ///
+  /// For example:
+  ///   _Z3fooPii
+  ///     Types[0]               = PointerType
+  ///     PointerElementTypes[0] = i32
+  ///     Quals[0] = (PointerQual, SignedIntQual)
+  ///
+  ///     Types[1] = i32
+  ///     PointerElementTypes[1] = nullptr
+  ///     Quals[1] = (SignedIntQual)
+  ///
+  /// @return Demangled name or an empty string on failure
+  llvm::StringRef demangleName(
+      llvm::StringRef Name, llvm::SmallVectorImpl<llvm::Type *> &Types,
+      llvm::SmallVectorImpl<llvm::Type *> &PointerElementTypes,
+      llvm::SmallVectorImpl<TypeQualifiers> &Quals);
+
+  /// @brief Remove the mangling of a function name.
+  ///
+  /// @param[in] Name Mangled function name to demangle.
+  ///
+  /// @return Demangled name or original name if not mangled.
+  llvm::StringRef demangleName(llvm::StringRef Name);
+
+ private:
+  /// @brief Try to mangle the given qualified type. This only works for simple
+  /// types that do not require string manipulation.
+  ///
+  /// @param[in] Ty Type to mangle.
+  /// @param[in] Qual Type qualifier.
+  ///
+  /// @return Mangled name of the type or nullptr.
+  const char *mangleSimpleType(llvm::Type *Ty, TypeQualifier Qual);
+  /// @brief Try to mangle the given builtin type name. This only works for
+  /// 'spirv' target extension types (LLVM 17+).
+  ///
+  /// @param[in] Ty type to mangle.
+  ///
+  /// @return string if builtin type could be mangled otherwise empty string.
+  std::optional<std::string> mangleBuiltinType(llvm::Type *Ty);
+  /// @brief Try to demangle the given type name. This only works for simple
+  /// types that do not require string manipulation.
+  ///
+  /// @param[in,out] L Lexer for the mangled type name.
+  /// @param[out] Ty Demangled type.
+  /// @param[out] Qual Demangled type qualifier.
+  ///
+  /// @return true if the type name could be demangled.
+  bool demangleSimpleType(Lexer &L, llvm::Type *&Ty, TypeQualifier &Qual);
+  /// @brief Try to demangle the given type name. This only works for opencl
+  /// builtin types.
+  ///
+  /// @param[in,out] L Lexer for the mangled type name.
+  /// @param[out] Ty Demangled type.
+  ///
+  /// @return true if the type name could be demangled.
+  bool demangleOpenCLBuiltinType(Lexer &L, llvm::Type *&Ty);
+  /// @brief Try to demangle the given type.
+  ///
+  /// @param[in] L Lexer currently pointing at a type.
+  /// @param[out] Ty Demangled type.
+  /// @param[out] PointerEltTy If null, unchanged. Else, set to the demangled
+  /// pointer element type, if Ty is a non-opaque pointer type. Else set to
+  /// nullptr.
+  /// @param[out] Quals Demangled type qualifiers.
+  /// @param[in] CtxTypes Previously demangled types, used for substitutions.
+  /// @param[in] CtxQuals Previously demangled qualifiers.
+  ///
+  /// @return true if the type could be demangled, false otherwise.
+  bool demangleType(Lexer &L, llvm::Type *&Ty, llvm::Type **PointerEltTy,
+                    TypeQualifiers &Quals,
+                    llvm::SmallVectorImpl<llvm::Type *> &CtxTypes,
+                    llvm::SmallVectorImpl<TypeQualifiers> &CtxQuals);
+
+  /// @brief Demangle a name.
+  ///
+  /// @param[in] L Lexer currently pointing at a mangled name.
+  ///
+  /// @return Demangled name or an empty string.
+  llvm::StringRef demangleName(Lexer &L);
+  /// @brief Determine the type 'index' the substitution refers to.
+  ///
+  /// @param[in] SubID Substitution ID.
+  /// @param[in] Tys List of types.
+  /// @param[in] Quals Qualifiers for the types.
+  ///
+  /// @return Resolved type index or negative value.
+  int resolveSubstitution(unsigned SubID,
+                          llvm::SmallVectorImpl<llvm::Type *> &Tys,
+                          llvm::SmallVectorImpl<TypeQualifiers> &Quals);
+  /// @brief Try to emit a substitution for the given type instead of mangling
+  /// it.
+  ///
+  /// @param[in,out] O Stream to write the substitution to.
+  /// @param[in] Ty Type to mangle
+  /// @param[in] Quals Qualifiers for the type.
+  /// @param[in] PrevTys Types that have previously been mangled.
+  /// @param[in] PrevQuals Qualifiers for the previously mangled types.
+  ///
+  /// @return true if a substitution was emitted, false otherwise.
+  bool emitSubstitution(llvm::raw_ostream &O, llvm::Type *Ty,
+                        TypeQualifiers Quals,
+                        llvm::ArrayRef<llvm::Type *> PrevTys,
+                        llvm::ArrayRef<TypeQualifiers> PrevQuals);
+  /// @brief Determine whether the type is a builtin type or not. Builtin types
+  /// are not considered for substitutions.
+  ///
+  /// @param[in] Ty Type to analyze.
+  /// @param[in] Quals Type qualifiers.
+  ///
+  /// @return true if the type is a builtin type, or false.
+  bool isTypeBuiltin(llvm::Type *Ty, TypeQualifiers &Quals);
+
+  /// @brief LLVM context used to access LLVM types.
+  llvm::LLVMContext *Context;
+};
+}  // namespace utils
+}  // namespace compiler
+
+#endif  // COMPILER_UTILS_MANGLING_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/metadata.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/metadata.h
new file mode 100644
index 0000000000000..493b9df6ee04a
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/metadata.h
@@ -0,0 +1,297 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef COMPILER_UTILS_METADATA_H_INCLUDED
+#define COMPILER_UTILS_METADATA_H_INCLUDED
+
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/IR/Metadata.h>
+
+#include <optional>
+
+#include "vectorization_factor.h"
+
+namespace llvm {
+class Function;
+class Module;
+}  // namespace llvm
+
+namespace compiler {
+namespace utils {
+
+/// @brief OpenCL C standard to target.
+enum OpenCLCVer {
+  /// @brief OpenCL C 1.0
+  OpenCLC10 = (1 * 100 + 0) * 1000,
+  /// @brief OpenCL C 1.1
+  OpenCLC11 = (1 * 100 + 1) * 1000,
+  /// @brief OpenCL C 1.2
+  OpenCLC12 = (1 * 100 + 2) * 1000,
+  /// @brief OpenCL C 2.0
+  OpenCLC20 = (2 * 100 + 0) * 1000,
+  /// @brief OpenCL C 3.0
+  OpenCLC30 = (3 * 100 + 0) * 1000,
+};
+
+/// @brief Returns the OpenCL version, encoded as (Major*100 + Minor)*1000.
+///
+/// If the Module does not contain any information, then OpenCLC12 is returned.
+uint32_t getOpenCLVersion(const llvm::Module &m);
+
+/// @brief Describes the state of vectorization on a function/loop.
+struct VectorizationInfo {
+  /// @brief The VectorizationFactor. A scalar value if unvectorized.
+  VectorizationFactor vf;
+  /// @brief The dimension along which vectorization took place.
+  unsigned simdDimIdx;
+  /// @brief Whether or not the function/loop was vector-predicated.
+  bool IsVectorPredicated;
+};
+
+/// @brief Encodes metadata indicating vectorization failure to a kernel, along
+/// with the vectorization factor and dimension that failed.
+///
+/// @param[in] f Function in which to encode the link.
+/// @param[in] info Vectorization info serving as the key.
+void encodeVectorizationFailedMetadata(llvm::Function &f,
+                                       const VectorizationInfo &info);
+
+/// @brief Encodes the vectorization metadata linking the original kernel to a
+/// vectorized one, using the vectorization factor and dimension as the key.
+///
+/// @param[in] origF Original function in which to encode the link.
+/// @param[in] vectorizedF Vectorized function to link.
+/// @param[in] info Vectorization info serving as the key.
+void linkOrigToVeczFnMetadata(llvm::Function &origF,
+                              llvm::Function &vectorizedF,
+                              const VectorizationInfo &info);
+
+/// @brief Encodes the vectorization metadata linking a vectorized kernel back
+/// to its original one, using the vectorization factor and dimension as the
+/// key.
+///
+/// @param[in] vectorizedF Vectorized function in which to encode the link.
+/// @param[in] origF Original function to link.
+/// @param[in] info Vectorization info serving as the key.
+void linkVeczToOrigFnMetadata(llvm::Function &vectorizedF,
+                              llvm::Function &origF,
+                              const VectorizationInfo &info);
+
+using LinkMetadataResult = std::pair<llvm::Function *, VectorizationInfo>;
+
+/// @brief Decodes the metadata linking a kernel to its vectorized variant.
+///
+/// @param[in] f Function for which to decode the metadata.
+/// @param[out] factors unordered vector of recovered vectorization links.
+///
+/// @return true on success, false if there is no vectorization metadata for the
+/// function.
+bool parseOrigToVeczFnLinkMetadata(
+    llvm::Function &f, llvm::SmallVectorImpl<LinkMetadataResult> &factors);
+
+/// @brief Decodes the metadata linking a vectorized kernel back to its
+/// original one.
+///
+/// @param[in] f Function for which to decode the metadata.
+///
+/// @return On success, a pair containing a pointer to the original kernel
+/// function and the vectorization factor used as the key. The original
+/// function may be null. On decoding failure, std::nullopt.
+std::optional<LinkMetadataResult> parseVeczToOrigFnLinkMetadata(
+    llvm::Function &f);
+
+/// @brief Drops "base" vectorization metadata from a function, if present.
+///
+/// @param[in] f Function to drop metadata from.
+void dropVeczOrigMetadata(llvm::Function &f);
+
+/// @brief Drops "derived" vectorization metadata from a function, if present.
+///
+/// @param[in] f Function to drop metadata from.
+void dropVeczDerivedMetadata(llvm::Function &f);
+
+/// @brief Encodes metadata indicating the various components that constitute a
+/// kernel function wrapped with the WorkItemLoopsPass.
+///
+/// @param[in] f Function in which to encode the metadata.
+/// @param[in] mainInfo VectorizationInfo used on the 'main' work-item
+/// iterations. Its simdDimIdx field records the dimension (0,1,2) along which
+/// vectorization took place.
+/// @param[in] tailInfo VectorizationInfo used on the tail iterations, if
+/// applicable. When absent, an empty !Tail node is encoded (see the layout
+/// below).
+///
+/// Note that a 'tail' is defined as the work done to execute work-items not
+/// covered by the 'main' body. Therefore an unvectorized kernel should expect
+/// a scalar 'main' vectorization factor and no 'tail' (rather than the other
+/// way round).
+///
+/// Some examples of *typical* usage:
+/// 1. An unvectorized kernel will encode a scalar VF for the main iterations
+/// and nothing for the tail ones.
+/// 2. A vectorized kernel will encode the vectorization factor for its main
+/// iterations. If it handles the case in which the vectorization factor does
+/// not evenly divide the local work-group size, it will encode how it manages
+/// the tail iterations. This is *typically* with a series of scalar
+/// iterations, encoded in tailInfo.
+/// 3. Vector-predicated kernels with no tails will encode the *maximum* VF used
+/// for the main loop, with no tail iterations.
+///
+/// This metadata is encoded as:
+/// define void @foo() !codeplay_ca_wrapper !X
+/// !X = { !Main, !Tail }
+/// !Main = { i32 mKnownMin, i32 mIsScalable, i32 simdDimIdx, i32 mIsVP }
+/// if tailInfo is std::nullopt:
+///   !Tail = {}
+/// else
+///   !Tail = { i32 tKnownMin, i32 tIsScalable, i32 simdDimIdx, i32 tIsVP }
+void encodeWrapperFnMetadata(llvm::Function &f,
+                             const VectorizationInfo &mainInfo,
+                             std::optional<VectorizationInfo> tailInfo);
+
+/// @brief Decodes the metadata describing a wrapped kernel's loop structure.
+///
+/// @param[in] f Function for which to decode the metadata.
+///
+/// @return On success, a pair containing the VectorizationInfo for the main
+/// loop(s) and the (optional) VectorizationInfo info for the tail loop(s). On
+/// decoding failure, std::nullopt.
+std::optional<std::pair<VectorizationInfo, std::optional<VectorizationInfo>>>
+parseWrapperFnMetadata(llvm::Function &f);
+
+/// @brief Copies function metadata from one function to another.
+///
+/// @param[in] fromF Function from which to copy the metadata.
+/// @param[in] toF Function onto which to copy the metadata.
+/// @param[in] includeDebug Whether or not to copy debug function metadata.
+void copyFunctionMetadata(llvm::Function &fromF, llvm::Function &toF,
+                          bool includeDebug = false);
+
+/// @brief Encodes information about a function's local work group size as
+/// metadata.
+///
+/// @param[in] f Function in which to encode the metadata.
+/// @param[in] localSizes array of size information to encode.
+void encodeLocalSizeMetadata(llvm::Function &f,
+                             const std::array<uint64_t, 3> &localSizes);
+
+/// @brief Retrieves information about a function's local sizes via metadata.
+///
+/// @param[in] f Function from which to decode the metadata
+/// @returns The local size array if present, else `std::nullopt`
+std::optional<std::array<uint64_t, 3>> getLocalSizeMetadata(
+    const llvm::Function &f);
+
+/// @brief Drops all !mux_scheduled_fn metadata from a function.
+void dropSchedulingParameterMetadata(llvm::Function &f);
+
+/// @brief Retrieves the indices of scheduling parameters from the function.
+llvm::SmallVector<int, 4> getSchedulingParameterFunctionMetadata(
+    const llvm::Function &f);
+
+/// @brief Sets scheduling-parameter metadata on the given function
+void setSchedulingParameterFunctionMetadata(llvm::Function &f,
+                                            llvm::ArrayRef<int> idxs);
+
+/// @brief Sets module-level metadata describing the set of scheduling
+/// parameters.
+void setSchedulingParameterModuleMetadata(llvm::Module &m,
+                                          llvm::ArrayRef<std::string> names);
+
+/// @brief Retrieves module-level metadata describing the set of scheduling
+/// parameters or nullptr.
+llvm::NamedMDNode *getSchedulingParameterModuleMetadata(const llvm::Module &m);
+
+/// @brief If the given function parameter index is considered a scheduling
+/// parameter, it returns the corresponding index into the target's list of
+/// scheduling parameters.
+///
+/// It uses !mux_scheduled_fn metadata for this check.
+std::optional<unsigned> isSchedulingParameter(const llvm::Function &f,
+                                              unsigned idx);
+
+/// @brief Extracts the required work group size from a kernel's function
+/// metadata.
+///
+/// @param[in] f Kernel for extraction.
+///
+/// @return The work group size or std::nullopt if there is no such metadata.
+std::optional<std::array<uint64_t, 3>> parseRequiredWGSMetadata(
+    const llvm::Function &f);
+
+/// @brief Extracts the required work group size from an opencl.kernels subnode,
+/// which is similar to the function metadata, but the size is stored under
+/// different indices than on a function.
+///
+/// @param[in] node Kernel's subnode for extraction.
+///
+/// @return The work group size or std::nullopt if there is no such metadata.
+std::optional<std::array<uint64_t, 3>> parseRequiredWGSMetadata(
+    const llvm::MDNode &node);
+
+/// @brief Extracts the maximum work dimension from a kernel's function
+/// metadata
+///
+/// @param[in] f Kernel for extraction.
+///
+/// @return The maximum work dimension or std::nullopt if there is no such
+/// metadata.
+std::optional<uint32_t> parseMaxWorkDimMetadata(const llvm::Function &f);
+
+/// @brief Describes a kernel: its name and optional required work-group size.
+struct KernelInfo {
+  explicit KernelInfo(llvm::StringRef name) : Name(name) {}
+  /// @brief The function name
+  std::string Name;
+  /// @brief The required work-group size. Optional.
+  std::optional<std::array<uint64_t, 3>> ReqdWGSize;
+};
+
+/// @brief Helper function to populate a list of kernels and associated
+/// information from a module.
+///
+/// @param m Module to retrieve kernels from
+/// @param results List of kernel info parsed from metadata or taken from the
+/// module.
+void populateKernelList(llvm::Module &m,
+                        llvm::SmallVectorImpl<KernelInfo> &results);
+
+/// @brief Replaces instances of kernel fromF with toF in module-level
+/// !opencl.kernels metadata.
+/// @param fromF Function to replace with toF in metadata
+/// @param toF Function with which to replace references to fromF
+/// @param M Module in which to find the metadata
+void replaceKernelInOpenCLKernelsMetadata(llvm::Function &fromF,
+                                          llvm::Function &toF, llvm::Module &M);
+
+/// @brief Encodes information about a function's required sub-group size as
+/// metadata.
+///
+/// @param[in] f Function in which to encode the metadata.
+/// @param[in] size sub-group size information to encode.
+void encodeReqdSubgroupSizeMetadata(llvm::Function &f, uint32_t size);
+
+/// @brief Retrieves information about a function's required sub-group size via
+/// metadata.
+///
+/// @param[in] f Function from which to decode the metadata
+/// @returns The required sub-group size if present, else `std::nullopt`
+std::optional<uint32_t> getReqdSubgroupSize(const llvm::Function &f);
+
+}  // namespace utils
+}  // namespace compiler
+
+#endif  // COMPILER_UTILS_METADATA_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/optimal_builtin_replacement_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/optimal_builtin_replacement_pass.h
new file mode 100644
index 0000000000000..c28241bb71351
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/optimal_builtin_replacement_pass.h
@@ -0,0 +1,115 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// Optimal builtin replacement pass.
+
+#ifndef COMPILER_UTILS_OPTIMAL_BUILTIN_REPLACEMENT_PASS_H_INCLUDED
+#define COMPILER_UTILS_OPTIMAL_BUILTIN_REPLACEMENT_PASS_H_INCLUDED
+
+#include <compiler/utils/mangling.h>
+#include <llvm/Analysis/CGSCCPassManager.h>
+#include <llvm/Analysis/LazyCallGraph.h>
+#include <llvm/IR/PassManager.h>
+
+namespace compiler {
+namespace utils {
+
+/// @brief A Callgraph optimization pass which replaces calls to builtin
+/// functions with more optimal versions, either via inlined code, or calls to
+/// suitable llvm intrinsics which will later be lowered to optimal machine
+/// code. When run with a non-null BuiltinInfo analysis, the builtin info is
+/// queried to determine the properties of each call in the graph.
+///
+/// A set of replacement functions with identical signatures is kept by this
+/// pass. These are invoked in order one after another on each call instruction
+/// in the call graph. If any replacement returns a non-null `Value*` it is
+/// used to replace the call and no further replacements are attempted on that
+/// call. It is assumed that no replacement introduces new calls to the graph.
+/// The set of replacements can be modified by users by setting
+/// `adjustReplacements`.
+///
+/// The default set of replacement functions, in order, is:
+/// * replaceAbacusCLZ
+/// * replaceAbacusMulhi
+/// * replaceAbacusFMinFMax
+/// * Invoking emitBuiltinInline from BuiltinInfo analysis
+class OptimalBuiltinReplacementPass
+    : public llvm::PassInfoMixin<OptimalBuiltinReplacementPass> {
+ public:
+  using ReplacementFnTy = std::function<llvm::Value *(
+      llvm::CallBase &, llvm::StringRef,
+      const llvm::SmallVectorImpl<llvm::Type *> &,
+      const llvm::SmallVectorImpl<compiler::utils::TypeQualifiers> &)>;
+
+  /// @brief Constructor. Sets up default builtin replacements.
+  OptimalBuiltinReplacementPass();
+
+  llvm::PreservedAnalyses run(llvm::LazyCallGraph::SCC &C,
+                              llvm::CGSCCAnalysisManager &AM,
+                              llvm::LazyCallGraph &CG,
+                              llvm::CGSCCUpdateResult &UR);
+
+  /// @brief A callback invoked per-SCC before any replacements are performed,
+  /// allowing customization of the replacements to be performed. The default
+  /// set of replacements are passed in and may be modified in any way.
+  std::function<void(std::vector<ReplacementFnTy> &)> adjustReplacements;
+
+  /// @brief Replaces calls __abacus_clz(ty) with @llvm.ctlz(ty, i1 false)
+  /// indicating that zero does not produce a poison result.
+  /// Note: This replacement is not performed on 64-bit scalar or vectors of
+  /// 64-bit scalar types.
+  static llvm::Value *replaceAbacusCLZ(
+      llvm::CallBase &CB, llvm::StringRef,
+      const llvm::SmallVectorImpl<llvm::Type *> &,
+      const llvm::SmallVectorImpl<compiler::utils::TypeQualifiers> &);
+
+  /// @brief Replaces __abacus_mul_hi(ty lhs, ty rhs) with a sequence:
+  ///   %lhs.ext = ext ty %lhs to x2bw(ty)
+  ///   %rhs.ext = ext ty %rhs to x2bw(ty)
+  ///   %mul.ext = mul x2bw(ty) %lhs.ext, %rhs.ext
+  ///   %lo.part = ashr x2bw(ty) %mul.ext, bw(ty)
+  ///   %res = trunc x2bw(ty) %lo.part to ty
+  /// Where x2bw(ty) returns a type with twice the (element) bit-width, and
+  /// bw(ty) returns the bit-width of a (element) type as an integer.
+  /// This pattern is better matched by LLVM and target backends often produce
+  /// "mul_hi" instructions as a result.
+  static llvm::Value *replaceAbacusMulhi(
+      llvm::CallBase &, llvm::StringRef,
+      const llvm::SmallVectorImpl<llvm::Type *> &,
+      const llvm::SmallVectorImpl<compiler::utils::TypeQualifiers> &);
+
+  /// @brief Replaces __abacus_(fmin|fmax)(ty1 lhs, ty2 rhs) with
+  /// @llvm.(minnum|maxnum)(ty1 lhs, ty1 rhs), where ty2 may be a scalar type
+  /// which is splatted to a vector of ty1, where appropriate.
+  /// Note: This replacement is not performed on ARM or AArch64 targets, due to
+  /// LLVM backend bugs (https://llvm.org/PR27363).
+  static llvm::Value *replaceAbacusFMinFMax(
+      llvm::CallBase &, llvm::StringRef,
+      const llvm::SmallVectorImpl<llvm::Type *> &,
+      const llvm::SmallVectorImpl<compiler::utils::TypeQualifiers> &);
+
+ private:
+  std::vector<ReplacementFnTy> replacements;
+
+  llvm::Value *replaceBuiltinWithInlineIR(llvm::CallBase &CB) const;
+};
+
+}  // namespace utils
+}  // namespace compiler
+
+#endif  // COMPILER_UTILS_OPTIMAL_BUILTIN_REPLACEMENT_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/pass_functions.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/pass_functions.h
new file mode 100644
index 0000000000000..ea7be99445996
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/pass_functions.h
@@ -0,0 +1,339 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// LLVM pass utility functions.
+
+#ifndef COMPILER_UTILS_PASS_FUNCTIONS_H_INCLUDED
+#define COMPILER_UTILS_PASS_FUNCTIONS_H_INCLUDED
+
+#include <llvm/ADT/Twine.h>
+#include <llvm/Analysis/IVDescriptors.h>
+#include <llvm/IR/Constants.h>
+#include <llvm/IR/Function.h>
+#include <llvm/Transforms/Utils/ValueMapper.h>
+
+#include <array>
+#include <functional>
+
+namespace llvm {
+class Argument;
+class BasicBlock;
+class Constant;
+class ConstantExpr;
+class Function;
+class IntegerType;
+class LLVMContext;
+class Module;
+class ModulePass;
+class Type;
+class Value;
+class IRBuilderBase;
+}  // namespace llvm
+
+namespace compiler {
+namespace utils {
+
+/// @addtogroup utils
+/// @{
+
+/// @brief Calculate (approximately) the amount of private memory used by a
+/// kernel.
+///
+/// @param fn The kernel function
+///
+/// @return uint64_t The private memory used by the kernel function in bytes.
+uint64_t computeApproximatePrivateMemoryUsage(const llvm::Function &fn);
+
+/// @brief Forces a constant expression or constant vector back to a normal
+/// instruction
+///
+/// @param[in] constant to be replaced
+void replaceConstantExpressionWithInstruction(llvm::Constant *const constant);
+
+/// @brief remap operands of a constant expression
+///
+/// @note This will create a new constant expression and replace references to
+/// the original constant with the new one
+///
+/// @param[in] expr Constant expression to be remapped
+/// @param[in] from Constant which if found in expression will be
+/// replaced
+/// @param[in] to Constant which will replace any operands which are `from`
+void remapConstantExpr(llvm::ConstantExpr *expr, llvm::Constant *from,
+                       llvm::Constant *to);
+
+/// @brief remap operands of a constant array
+///
+/// @note This will create a new constant array and replace references to
+/// the original constant with the new one
+///
+/// @param[in] arr Constant array to be remapped
+/// @param[in] from Constant which if found in array will be
+/// replaced
+/// @param[in] to Constant which will replace any operands which are `from`
+void remapConstantArray(llvm::ConstantArray *arr, llvm::Constant *from,
+                        llvm::Constant *to);
+
+/// @brief Discover if input function references debug info metadata nodes
+///
+/// @param[in] func Function to check
+/// @param[in,out] vmap Value map updated with identity mappings of any debug
+/// info metadata found
+///
+/// @return bool True if function contains debug info, false otherwise
+bool funcContainsDebugMetadata(const llvm::Function &func,
+                               llvm::ValueToValueMapTy &vmap);
+
+/// @brief Return a copy of a function's function, return, and parameter
+/// attributes.
+///
+/// Only parameter attributes from indices 0 to numParams are copied. If
+/// numParams is negative, all parameter attributes are copied.
+llvm::AttributeList getCopiedFunctionAttrs(const llvm::Function &oldFn,
+                                           int numParams = -1);
+
+/// @brief Copy a function's attributes to a new function.
+///
+/// @param[in] oldFn Function to copy function attributes from.
+/// @param[in] newFn Function to copy function attributes to.
+/// @param[in] numParams number of parameters to copy attributes from, starting
+/// from the first parameter. If set to a negative number, will copy all
+/// parameter attributes.
+void copyFunctionAttrs(const llvm::Function &oldFn, llvm::Function &newFn,
+                       int numParams = -1);
+
+using ParamTypeAttrsPair = std::pair<llvm::Type *, llvm::AttributeSet>;
+
+using UpdateMDCallbackFn =
+    std::function<void(llvm::Function &oldFn, llvm::Function &newFn, unsigned)>;
+
+/// @brief Clone functions in a module and add an argument to them
+///
+/// @param module LLVM module containing the functions
+/// @param paramTypeFunc Additional parameter to be added defined as a function
+/// returning the type and set of attributes.
+/// This function takes a module, primarily to access DataLayout
+/// @param toBeClonedFunc function which dictates whether each function is
+/// cloned
+/// @param updateMetaDataCallback if set, is invoked with the old function, new
+/// function and new argument index.
+///
+/// @return bool if the module has changed (currently always true)
+///
+/// This iterates through all the functions in a module but only clones and adds
+/// the extra param for those that meet the following criteria after setting
+/// `ClonedNoBody` and `ClonedWithBody` from the `toBeClonedFunc` callback:-
+///
+/// 1.  `!function` declaration or `ClonedNoBody` _or_ is a function
+///     declaration and `ClonedWithBody`
+/// 2.  Not already processed
+bool cloneFunctionsAddArg(
+    llvm::Module &module,
+    std::function<ParamTypeAttrsPair(llvm::Module &)> paramTypeFunc,
+    std::function<void(const llvm::Function &, bool &ClonedWithBody,
+                       bool &ClonedNoBody)>
+        toBeClonedFunc,
+    const UpdateMDCallbackFn &updateMetaDataCallback = nullptr);
+
+/// @brief Updates call instructions after a function clone to point to
+/// `newFunc` instead of `oldFunc`, old call instructions are deleted.
+///
+/// @param[in] oldFunc Function which has been cloned
+/// @param[in] newFunc Cloned function to point callsites to
+/// @param[in] extraArg Whether the cloned callee has an extra argument added
+void remapClonedCallsites(llvm::Function &oldFunc, llvm::Function &newFunc,
+                          bool extraArg);
+
+/// @brief Clone all functions in a module, appending an extra parameter to
+/// them.
+///
+/// @param module llvm module containing the functions
+/// @param newParamType Type of the parameter to be added
+/// @param newParamAttrs Parameter attributes of the parameter to be added
+/// @param updateMetaDataCallback if set, is invoked with the old function,
+/// new function and new argument index.
+///
+/// @return bool if the module has changed (currently always true)
+///
+/// This iterates through all the functions in a module and clones all
+/// functions with a body and adds the extra param at the end of their parameter
+/// lists. Simpler version of `cloneFunctionsAddArg()` where the use case is
+/// more limited.
+bool addParamToAllFunctions(
+    llvm::Module &module, llvm::Type *const newParamType,
+    const llvm::AttributeSet &newParamAttrs,
+    const UpdateMDCallbackFn &updateMetaDataCallback = nullptr);
+
+using CreateLoopBodyFn = std::function<llvm::BasicBlock *(
+    llvm::BasicBlock *, llvm::Value *, llvm::ArrayRef<llvm::Value *>,
+    llvm::MutableArrayRef<llvm::Value *>)>;
+
+struct CreateLoopOpts {
+  /// @brief indexInc Value by which to increment the loop counter. If nullptr,
+  /// then it is created as the constant 1, based on type of `indexStart`,
+  /// which is a parameter to compiler::utils::createLoop proper.
+  llvm::Value *indexInc = nullptr;
+  /// @brief disableVectorize Sets loop metadata disabling further
+  /// vectorization.
+  bool disableVectorize = false;
+  /// @brief headerName Optional name for the loop header block. Defaults to:
+  /// "loopIR".
+  llvm::StringRef headerName = "loopIR";
+  /// @brief An optional list of incoming IV values.
+  ///
+  /// Each of these is used as the incoming value to a PHI created by
+  /// createLoop. These PHIs are provided to the 'body' function of createLoop,
+  /// which should in turn set the 'next' version of the IV.
+  std::vector<llvm::Value *> IVs;
+  /// @brief An optional list of IV names, to be set on the PHIs provided by
+  /// 'IVs' field/parameter.
+  ///
+  /// If set, the names are assumed to correlate 1:1 with those IVs. The list
+  /// may be shorter than the list of IVs, in which case the trailing IVs are
+  /// not named.
+  std::vector<std::string> loopIVNames;
+};
+
+/// @brief Create a loop around a body, creating an implicit induction variable
+/// (IV) between specified start and end values, and incremented by a
+/// user-specified amount. The loop thus has a trip count equal to the
+/// following C-style loop: `for (auto i = start; i < end; i += incr)`.
+///
+/// Note that this helper always creates a CFG loop, even if the loop bounds
+/// are known not to produce a loop at compile time. Users can use stock LLVM
+/// optimizations to eliminate/simplify the loop in such a case.
+///
+/// @param entry Loop pre-header block. This block will be rewired to jump into
+/// the new loop.
+/// @param exit Loop exit block. The new loop will jump to this once it exits.
+/// @param indexStart The start index
+/// @param indexEnd The end index (we compare for <)
+/// @param opts Set of options configuring the generation of this loop.
+/// @param body Body of code to insert into loop.
+///
+/// The parameters of this function are as follows: the loop body BasicBlock;
+/// the Value corresponding to the IV beginning at `indexStart` and incremented
+/// each iteration by `indexInc` while less than `indexEnd`; the list of IVs
+/// for this iteration of the loop (may or may not be PHIs, depending on the
+/// loop bounds); the list of IVs for the next iteration of the loop (the
+/// function is required to fill these in). Both these sets of IVs will be
+/// arrays of equal length to the original list of IVs, in the same order. The
+/// function returns the loop latch/exiting block: this block will be given the
+/// branch that decides between continuing the loop and exiting from it.
+///
+/// @return llvm::BasicBlock* The exit block
+llvm::BasicBlock *createLoop(llvm::BasicBlock *entry, llvm::BasicBlock *exit,
+                             llvm::Value *indexStart, llvm::Value *indexEnd,
+                             const CreateLoopOpts &opts, CreateLoopBodyFn body);
+
+/// @brief Get the last argument of a function.
+///
+/// @param f An LLVM function to get an argument from.
+///
+/// @return An LLVM argument.
+llvm::Argument *getLastArgument(llvm::Function *f);
+
+/// @brief get the device-side size of size_t type in bytes.
+unsigned getSizeTypeBytes(const llvm::Module &m);
+
+/// @brief get a size_t type.
+/// @return a LLVM IntegerType representing size_t.
+llvm::IntegerType *getSizeType(const llvm::Module &m);
+
+/// @brief Creates a wrapper function (without body), intended for calling @p F
+/// @param M Containing module
+/// @param F Kernel function which is being replaced
+/// @param ArgTypes List of types to be used for the new function
+/// @param Suffix String to append to the name of the new function
+/// @param OldSuffix String to append to the name of the old function
+/// @note This takes the metadata and debug from the original function.
+///       This is intended to be used for creating a function which replaces
+///       the original function but calls the original.
+///
+/// @note The name of the wrapper function is computed as the original name of
+///       F followed by the Suffix. The original name of F is taken from F's
+///       'mux-base-fn-name' attribute, if set, else it is F's name:
+///
+///         declare void @foo()
+///         ; Function attrs "mux-base-fn-name"="baz"
+///         declare void @bar()
+///
+///       With suffix '.wrapper', this function will produce:
+///
+///         declare void @foo.wrapper()
+///         declare void @baz.wrapper()
+///
+///       With suffix '.new' and old suffix '.old', this function will produce:
+///
+///         declare void @foo.old()
+///         ; Function attrs "mux-base-fn-name"="baz"
+///         declare void @bar.old()
+///
+///         declare void @foo.new()
+///         declare void @baz.new()
+///
+///       It is advised that the suffix begins with a character that may not
+///       occur in the original source language, to avoid clashes with user
+///       functions.
+llvm::Function *createKernelWrapperFunction(
+    llvm::Module &M, llvm::Function &F, llvm::ArrayRef<llvm::Type *> ArgTypes,
+    llvm::StringRef Suffix, llvm::StringRef OldSuffix = "");
+
+/// @brief As above, but creating a wrapper with the exact function signature
+/// of @p F.
+///
+/// Copies over all parameter names and attributes.
+llvm::Function *createKernelWrapperFunction(llvm::Function &F,
+                                            llvm::StringRef Suffix,
+                                            llvm::StringRef OldSuffix = "");
+
+/// @brief Creates a call to a wrapped function
+///
+/// Sets the calling convention and call-site attributes to match the wrapped
+/// function.
+///
+/// @param WrappedF the function to call
+/// @param Args the list of arguments to pass to the call
+/// @param BB the basic block into which to insert the call. May be null, in
+/// which case the call is not inserted anywhere.
+/// @param InsertPt the point in BB at which to insert the call
+/// @param Name the name of the call instruction. May be empty.
+/// @return The call instruction
+llvm::CallInst *createCallToWrappedFunction(
+    llvm::Function &WrappedF, const llvm::SmallVectorImpl<llvm::Value *> &Args,
+    llvm::BasicBlock *BB, llvm::BasicBlock::iterator InsertPt,
+    llvm::StringRef Name = "");
+
+/// @brief Create a binary operation corresponding to the given
+/// `llvm::RecurKind` with the two provided arguments. It may not
+/// necessarily return one of LLVM's in-built `BinaryOperator`s, or even one
+/// operation: integer min/max operations may defer to multiple instructions or
+/// intrinsics depending on the LLVM version.
+///
+/// @param[in] B the IRBuilder to build new instructions
+/// @param[in] LHS the left-hand value for the operation
+/// @param[in] RHS the right-hand value for the operation
+/// @param[in] Kind the kind of operation to create
+/// @return The binary operation.
+llvm::Value *createBinOpForRecurKind(llvm::IRBuilderBase &B, llvm::Value *LHS,
+                                     llvm::Value *RHS, llvm::RecurKind Kind);
+/// @}
+}  // namespace utils
+}  // namespace compiler
+
+#endif  // COMPILER_UTILS_PASS_FUNCTIONS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/pass_machinery.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/pass_machinery.h
new file mode 100644
index 0000000000000..0113dcbebbf5b
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/pass_machinery.h
@@ -0,0 +1,145 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Hold global state and objects used for managing a pass pipeline.
+
+#ifndef COMPILER_UTILS_PASS_MACHINERY_H_INCLUDED
+#define COMPILER_UTILS_PASS_MACHINERY_H_INCLUDED
+
+#include <llvm/ADT/StringMap.h>
+#include <llvm/Analysis/CGSCCPassManager.h>
+#include <llvm/Analysis/LoopAnalysisManager.h>
+#include <llvm/IR/PassManager.h>
+#include <llvm/Passes/PassBuilder.h>
+#include <llvm/Passes/StandardInstrumentations.h>
+
+namespace llvm {
+class TargetMachine;
+}
+
+namespace compiler {
+namespace utils {
+
+/// @brief Mirrors LLVM's DebugLogging options in its `opt` tool. Clang has
+/// a boolean on/off version.
+enum class DebugLogging { None, Normal, Verbose, Quiet };
+
+/// @brief A class that manages the lifetime and initialization of all
+/// components required to set up a new-style LLVM pass manager.
+class PassMachinery {
+ public:
+  PassMachinery(llvm::LLVMContext &Ctx, llvm::TargetMachine *TM,
+                bool VerifyEach = false,
+                DebugLogging debugLogLevel = DebugLogging::None);
+
+  virtual ~PassMachinery();
+
+  /// @brief Initializes the PassBuilder and calls registerPasses.
+  void initializeStart(
+      llvm::PipelineTuningOptions PTO = llvm::PipelineTuningOptions());
+
+  /// @brief Cross-registers analysis managers, adds callbacks and
+  /// instrumentation support. Calls addClassToPassNames and
+  /// registerPassCallbacks.
+  void initializeFinish();
+
+  /// @brief Calls buildDefaultAAPipeline and registerLLVMAnalyses.
+  virtual void registerPasses();
+
+  /// @brief Helper method to register the standard LLVM AA pipeline.
+  ///
+  /// Registers:
+  /// * llvm::PassBuilder::buildDefaultAAPipeline
+  void buildDefaultAAPipeline();
+
+  /// @brief Helper method to register the standard LLVM analyses.
+  ///
+  /// Calls:
+  /// * llvm::PassBuilder::registerModuleAnalyses
+  /// * llvm::PassBuilder::registerCGSCCAnalyses
+  /// * llvm::PassBuilder::registerFunctionAnalyses
+  /// * llvm::PassBuilder::registerLoopAnalyses
+  void registerLLVMAnalyses();
+
+  /// @brief Method to allow customization of class-to-pass-names for
+  /// instrumentation purposes. By default, none are set up by
+  /// PassMachinery::initialize.
+  virtual void addClassToPassNames() {}
+
+  /// @brief Method to allow customization of pass callbacks via
+  /// llvm::PassBuilder. By default, no callbacks are set up by
+  /// PassMachinery::initialize.
+  virtual void registerPassCallbacks() {}
+
+  /// @brief print pass names in style of opt --print-passes
+  /// @note This should print parameters too
+  virtual void printPassNames(llvm::raw_ostream &) {}
+
+  llvm::ModuleAnalysisManager &getMAM() { return MAM; }
+  const llvm::ModuleAnalysisManager &getMAM() const { return MAM; }
+
+  llvm::FunctionAnalysisManager &getFAM() { return FAM; }
+  const llvm::FunctionAnalysisManager &getFAM() const { return FAM; }
+
+  llvm::PassBuilder &getPB() { return PB; }
+  const llvm::PassBuilder &getPB() const { return PB; }
+
+  llvm::TargetMachine *getTM() { return TM; }
+  const llvm::TargetMachine *getTM() const { return TM; }
+
+ protected:
+  /// @brief TargetMachine to be used for passes. May be nullptr.
+  llvm::TargetMachine *TM;
+  // Note: the order here is important! They must be destructed in this order.
+  /// @brief Holds state for Loop analyses.
+  llvm::LoopAnalysisManager LAM;
+  /// @brief Holds state for Function analyses.
+  llvm::FunctionAnalysisManager FAM;
+  /// @brief Holds state for CGSCC analyses.
+  llvm::CGSCCAnalysisManager CGAM;
+  /// @brief Holds state for Module analyses.
+  llvm::ModuleAnalysisManager MAM;
+  /// @brief Manages the state for any instrumentation callbacks.
+  std::unique_ptr<llvm::StandardInstrumentations> SI;
+  /// @brief Provides an interface to register callbacks.
+  llvm::PassInstrumentationCallbacks PIC;
+  /// @brief Helper to build and parse pass pipelines.
+  llvm::PassBuilder PB;
+};
+
+/// Helper functions for pass printing.
+
+/// @brief Helper function for printing a pass name, to be used by
+/// printPassNames.
+/// @param PassName Name of pass from a debug/parsing perspective.
+/// @param OS stream to write to.
+/// @note This is a direct copy from PassBuilder.cpp.
+void printPassName(llvm::StringRef PassName, llvm::raw_ostream &OS);
+
+/// @brief Helper for printing a pass name with parameters, for printPassNames.
+/// @param PassName Name of pass from a debug/parsing perspective.
+/// @param Params Textual representation of the parameters.
+/// @param OS stream to write to.
+/// @note This is a direct copy from PassBuilder.cpp.
+void printPassName(llvm::StringRef PassName, llvm::StringRef Params,
+                   llvm::raw_ostream &OS);
+
+}  // namespace utils
+}  // namespace compiler
+
+#endif  // COMPILER_UTILS_PASS_MACHINERY_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/prepare_barriers_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/prepare_barriers_pass.h
new file mode 100644
index 0000000000000..1df7c2f1c2c25
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/prepare_barriers_pass.h
@@ -0,0 +1,45 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// Prepare barriers pass.
+
+#ifndef COMPILER_UTILS_PREPARE_BARRIERS_PASS_H_INCLUDED
+#define COMPILER_UTILS_PREPARE_BARRIERS_PASS_H_INCLUDED
+
+#include <llvm/IR/PassManager.h>
+
+namespace compiler {
+namespace utils {
+
+/// @brief Pass for ensuring consistent barrier handling.
+///
+/// It inlines functions that contain barriers and gives each barrier call a
+/// unique ID as metadata to ensure consistent handling of barriers in
+/// different versions of the kernel (i.e. Scalar vs Vector). Run before Vecz
+/// for mixed wrapper kernels made up of multiple kernels to work.
+///
+/// Runs over all kernels with "kernel entry point" metadata.
+class PrepareBarriersPass final
+    : public llvm::PassInfoMixin<PrepareBarriersPass> {
+ public:
+  llvm::PreservedAnalyses run(llvm::Module &, llvm::ModuleAnalysisManager &);
+};
+}  // namespace utils
+}  // namespace compiler
+
+#endif  // COMPILER_UTILS_PREPARE_BARRIERS_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/replace_local_module_scope_variables_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/replace_local_module_scope_variables_pass.h
new file mode 100644
index 0000000000000..03808913c4711
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/replace_local_module_scope_variables_pass.h
@@ -0,0 +1,44 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// Replace local module-scope variables pass.
+
+#ifndef COMPILER_UTILS_REPLACE_LOCAL_MODULE_SCOPE_VARIABLES_PASS_H_INCLUDED
+#define COMPILER_UTILS_REPLACE_LOCAL_MODULE_SCOPE_VARIABLES_PASS_H_INCLUDED
+
+#include <llvm/IR/PassManager.h>
+
+namespace compiler {
+namespace utils {
+
+/// @brief __local address space automatic variables are represented in the
+/// LLVM module as global variables with address space 3. This pass identifies
+/// these variables and places them into a struct allocated (via alloca) in a
+/// newly created wrapper function. A pointer to the struct is then passed
+/// via a parameter to the original kernel.
+///
+/// Runs over all kernels with "kernel" metadata.
+class ReplaceLocalModuleScopeVariablesPass final
+    : public llvm::PassInfoMixin<ReplaceLocalModuleScopeVariablesPass> {
+ public:
+  llvm::PreservedAnalyses run(llvm::Module &, llvm::ModuleAnalysisManager &);
+};
+}  // namespace utils
+}  // namespace compiler
+
+#endif  // COMPILER_UTILS_REPLACE_LOCAL_MODULE_SCOPE_VARIABLES_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/scheduling.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/scheduling.h
new file mode 100644
index 0000000000000..ba1fb4f44e1da
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/scheduling.h
@@ -0,0 +1,143 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// Various utilities to help with work-item and work-group scheduling.
+
+#ifndef COMPILER_UTILS_SCHEDULING_H_INCLUDED
+#define COMPILER_UTILS_SCHEDULING_H_INCLUDED
+
+#include <cstddef>
+#include <cstdint>
+
+namespace llvm {
+class Function;
+class Module;
+class StructType;
+class Argument;
+}  // namespace llvm
+
+namespace compiler {
+namespace utils {
+
+namespace WorkItemInfoStructField {
+enum Type : uint32_t {
+  local_id,
+  sub_group_id,
+  num_sub_groups,
+  max_sub_group_size,
+  total
+};
+}
+
+namespace WorkGroupInfoStructField {
+enum Type : uint32_t {
+  group_id = 0,
+  num_groups,
+  global_offset,
+  local_size,
+  work_dim,
+  total
+};
+}
+
+/// @brief Computes the work item info structure type for the given module.
+llvm::StructType *getWorkItemInfoStructTy(llvm::Module &M);
+
+/// @brief Computes the work group info structure type for the given module.
+llvm::StructType *getWorkGroupInfoStructTy(llvm::Module &M);
+
+/// @brief Populates an empty function with code to look up and return a value
+/// from a pointer-to-struct argument.
+///
+/// The function may optionally have a 'rank', in which case the struct field
+/// index is expected to be a 3D array of values. Ranked functions must have an
+/// integer index as their first parameter. Any integer type is supported. The
+/// generated code for ranked functions is given a bounds check to ensure the
+/// index is less than 3. If the index is out of bounds, the default value is
+/// returned.
+///
+/// The pointer-to-struct may be any parameter other than the index, which
+/// comes first.
+///
+/// if !hasRankArg:
+///   ; where structFieldIdx identifies the field.
+///   %struct = type { ..., i64, ... }
+///   declare i64 @foo(ptr %struct-ptr)
+///
+/// if hasRankArg:
+///   ; where structFieldIdx identifies the field and the %idx parameter
+///   ; identifies the sub-field.
+///   %struct = type { ..., [3 x i64], ... }
+///   declare i64 @foo(i32 %idx, ptr %struct-ptr)
+///
+/// @param[in,out] F The function to define
+/// @param[in] structPtrArg The pointer-to-struct argument
+/// @param[in] structTy The underlying type of the pointer-to-struct argument,
+/// used for offset calculations
+/// @param[in] structFieldIdx The struct type's field index to load from
+/// @param[in] hasRankArg True if the struct type's field index is a 3D array,
+/// and thus the function's first parameter is an index parameter.
+/// @param[in] defaultValue The default value returned if the index is out of
+/// bounds. Only valid for ranked functions.
+void populateStructGetterFunction(llvm::Function &F,
+                                  llvm::Argument &structPtrArg,
+                                  llvm::StructType *const structTy,
+                                  uint32_t structFieldIdx, bool hasRankArg,
+                                  size_t defaultValue = 0);
+
+/// @brief Populates an empty function with code to store a value into a
+/// pointer-to-struct argument.
+///
+/// The function may optionally have a 'rank', in which case the struct field
+/// index is expected to be a 3D array of values. Ranked functions must have an
+/// integer index as their first parameter. Any integer type is supported.
+///
+/// The value to store is the next parameter (either first or second) and the
+/// pointer-to-struct may be any other unoccupied parameter.
+///
+/// if !hasRankArg:
+///   ; where structFieldIdx identifies the field.
+///   %struct = type { ..., i64, ... }
+///   declare void @foo(i64 %val, ptr %struct-ptr)
+///
+/// if hasRankArg:
+///   ; where structFieldIdx identifies the field and the %idx parameter
+///   ; identifies the sub-field.
+///   %struct = type { ..., [3 x i64], ... }
+///   declare void @foo(i32 %idx, i64 %val, ptr %struct-ptr)
+///
+/// Note that unlike populateStructGetterFunction, no bounds check is
+/// performed. The setter functions are only available internally to the
+/// compiler, and thus the indices are assumed to be within bounds.
+///
+/// @param[in,out] F The function to define
+/// @param[in] structPtrArg The pointer-to-struct argument
+/// @param[in] structTy The underlying type of the pointer-to-struct argument,
+/// used for offset calculations
+/// @param[in] structFieldIdx The struct type's field index to store to
+/// @param[in] hasRankArg True if the struct type's field index is a 3D array,
+/// and thus the function's first parameter is an index parameter.
+void populateStructSetterFunction(llvm::Function &F,
+                                  llvm::Argument &structPtrArg,
+                                  llvm::StructType *const structTy,
+                                  uint32_t structFieldIdx, bool hasRankArg);
+
+}  // namespace utils
+}  // namespace compiler
+
+#endif  // COMPILER_UTILS_SCHEDULING_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/sub_group_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/sub_group_analysis.h
new file mode 100644
index 0000000000000..fb19fc956027b
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/sub_group_analysis.h
@@ -0,0 +1,118 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef COMPILER_UTILS_SUB_GROUP_ANALYSIS_H_INCLUDED
+#define COMPILER_UTILS_SUB_GROUP_ANALYSIS_H_INCLUDED
+
+#include <compiler/utils/builtin_info.h>
+#include <llvm/ADT/StringRef.h>
+#include <llvm/IR/PassManager.h>
+
+#include <cassert>
+#include <map>
+#include <memory>
+#include <optional>
+#include <set>
+
+namespace compiler {
+namespace utils {
+
+/// @brief Provides module-level information about the sub-group usage of each
+/// function contained within.
+///
+/// The results for each function are cached in a map. Declarations are not
+/// processed. Thus an external function declaration that uses sub-group
+/// builtins will be missed.
+///
+/// Internal mux sub-group 'setter' functions are not counted. This is because
+/// they are only used internally by the oneAPI Construction Kit as scaffolding
+/// for the sub-group support that the user can observe.
+///
+/// Each function contains the set of mux sub-group builtins it (transitively)
+/// calls.
+class GlobalSubgroupInfo {
+  struct SubgroupInfo {
+    std::set<BuiltinID> UsedSubgroupBuiltins;
+  };
+
+  using FunctionMapTy =
+      std::map<const llvm::Function *, std::unique_ptr<SubgroupInfo>>;
+
+  FunctionMapTy FunctionMap;
+
+  compiler::utils::BuiltinInfo &BI;
+
+ public:
+  GlobalSubgroupInfo(llvm::Module &M, BuiltinInfo &);
+
+  compiler::utils::BuiltinInfo &getBuiltinInfo() { return BI; }
+
+  using iterator = FunctionMapTy::iterator;
+  using const_iterator = FunctionMapTy::const_iterator;
+
+  /// @brief Returns the SubgroupInfo for the provided function.
+  ///
+  /// The function must already exist in the map.
+  inline const SubgroupInfo *operator[](const llvm::Function *F) const {
+    const const_iterator I = FunctionMap.find(F);
+    assert(I != FunctionMap.end() && "Function not in sub-group info!");
+    return I->second.get();
+  }
+
+  bool usesSubgroups(const llvm::Function &F) const;
+
+  /// @brief Returns true if the provided function is a mux sub-group
+  /// collective builtin or sub-group barrier.
+  std::optional<compiler::utils::Builtin> isMuxSubgroupBuiltin(
+      const llvm::Function *F) const;
+};
+
+/// @brief Computes and returns the GlobalSubgroupInfo for a Module.
+class SubgroupAnalysis : public llvm::AnalysisInfoMixin<SubgroupAnalysis> {
+  friend AnalysisInfoMixin<SubgroupAnalysis>;
+
+ public:
+  using Result = GlobalSubgroupInfo;
+
+  explicit SubgroupAnalysis() {}
+
+  /// @brief Retrieve the GlobalSubgroupInfo for the module.
+  Result run(llvm::Module &M, llvm::ModuleAnalysisManager &);
+
+  /// @brief Return the name of the pass.
+  static llvm::StringRef name() { return "Sub-group analysis"; }
+
+ private:
+  /// @brief Unique pass identifier.
+  static llvm::AnalysisKey Key;
+};
+
+/// @brief Helper pass to print out the contents of the SubgroupAnalysis
+/// analysis.
+class SubgroupAnalysisPrinterPass
+    : public llvm::PassInfoMixin<SubgroupAnalysisPrinterPass> {
+  llvm::raw_ostream &OS;
+
+ public:
+  explicit SubgroupAnalysisPrinterPass(llvm::raw_ostream &OS) : OS(OS) {}
+
+  llvm::PreservedAnalyses run(llvm::Module &M, llvm::ModuleAnalysisManager &AM);
+};
+
+}  // namespace utils
+}  // namespace compiler
+
+#endif  // COMPILER_UTILS_SUB_GROUP_ANALYSIS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/target_extension_types.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/target_extension_types.h
new file mode 100644
index 0000000000000..c825e6b9cb124
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/target_extension_types.h
@@ -0,0 +1,144 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef COMPILER_UTILS_TARGET_EXTENSION_TYPES_H_INCLUDED
+#define COMPILER_UTILS_TARGET_EXTENSION_TYPES_H_INCLUDED
+
+namespace llvm {
+class Type;
+class LLVMContext;
+}  // namespace llvm
+
+namespace compiler {
+namespace utils {
+namespace tgtext {
+
+/// @brief The indices of the *integer* parameters of a "spirv.Image" type.
+enum ImageTyIntParamIdx {
+  ImageTyDimensionalityIdx = 0,
+  ImageTyDepthIdx,
+  ImageTyArrayedIdx,
+  ImageTyMSIdx,
+  ImageTySampledIdx,
+  ImageTyFormatIdx,
+  ImageTyAccessQualIdx,
+};
+
+/// @brief Values the 'dimensionality' parameter of a "spirv.Image" type may
+/// hold.
+///
+/// Note that not all of these are supported by the compiler.
+enum ImageTyDimensionalityParam {
+  ImageDim1D = 0,
+  ImageDim2D,
+  ImageDim3D,
+  ImageDimCube,
+  ImageDimRect,
+  ImageDimBuffer,
+  ImageDimSubpassData,
+};
+
+/// @brief Values the 'depth' parameter of a "spirv.Image" type may hold.
+enum ImageTyDepthParam {
+  ImageDepthNone = 0,  // Not a depth image
+  ImageDepth,          // A depth image
+  ImageDepthUnknown,   // No indication as to whether this is a depth or
+                       // non-depth image
+};
+
+/// @brief Values the 'arrayed' parameter of a "spirv.Image" type may hold.
+enum ImageTyArrayedParam {
+  ImageNonArrayed = 0,
+  ImageArrayed,
+};
+
+/// @brief Values the 'MS' parameter of a "spirv.Image" type may hold.
+enum ImageTyMSParam {
+  ImageMSSingleSampled = 0,
+  ImageMSMultiSampled,
+};
+
+/// @brief Values the 'Sampled' parameter of a "spirv.Image" type may hold.
+enum ImageTySampledParam {
+  ImageSampledRuntime = 0,      // only known at run time
+  ImageSampledCompat,           // compatible with sampling operations
+  ImageSampledReadWriteCompat,  // compatible with read/write operations (a
+                                // storage or subpass data image)
+};
+
+enum ImageTyAccessQualParam {
+  ImageAccessQualReadOnly = 0,
+  ImageAccessQualWriteOnly,
+  ImageAccessQualReadWrite,
+};
+
+/// @brief Returns the TargetExtType representing an 'event' type.
+///
+/// Note: Only intended for use LLVM 17+ - throws 'unreachable' otherwise.
+llvm::Type *getEventTy(llvm::LLVMContext &Ctx);
+
+/// @brief Returns the TargetExtType representing a 'sampler' type.
+///
+/// Note: Only intended for use LLVM 17+ - throws 'unreachable' otherwise.
+llvm::Type *getSamplerTy(llvm::LLVMContext &Ctx);
+
+/// @brief Returns the TargetExtType representing an 'image1d_t' type.
+///
+/// Note: Only intended for use LLVM 17+ - throws 'unreachable' otherwise.
+llvm::Type *getImage1DTy(
+    llvm::LLVMContext &Ctx,
+    ImageTyAccessQualParam AccessQual = ImageAccessQualReadOnly);
+
+/// @brief Returns the TargetExtType representing an 'image1d_array_t' type.
+///
+/// Note: Only intended for use LLVM 17+ - throws 'unreachable' otherwise.
+llvm::Type *getImage1DArrayTy(
+    llvm::LLVMContext &Ctx,
+    ImageTyAccessQualParam AccessQual = ImageAccessQualReadOnly);
+
+/// @brief Returns the TargetExtType representing an 'image1d_buffer_t' type.
+///
+/// Note: Only intended for use LLVM 17+ - throws 'unreachable' otherwise.
+llvm::Type *getImage1DBufferTy(
+    llvm::LLVMContext &Ctx,
+    ImageTyAccessQualParam AccessQual = ImageAccessQualReadOnly);
+
+/// @brief Returns the TargetExtType representing an 'image2d_t' type.
+///
+/// Note: Only intended for use LLVM 17+ - throws 'unreachable' otherwise.
+llvm::Type *getImage2DTy(
+    llvm::LLVMContext &Ctx, bool Depth = false, bool MS = false,
+    ImageTyAccessQualParam AccessQual = ImageAccessQualReadOnly);
+
+/// @brief Returns the TargetExtType representing an 'image2d_array_t' type.
+///
+/// Note: Only intended for use LLVM 17+ - throws 'unreachable' otherwise.
+llvm::Type *getImage2DArrayTy(
+    llvm::LLVMContext &Ctx, bool Depth = false, bool MS = false,
+    ImageTyAccessQualParam AccessQual = ImageAccessQualReadOnly);
+
+/// @brief Returns the TargetExtType representing an 'image3d_t' type.
+///
+/// Note: Only intended for use LLVM 17+ - throws 'unreachable' otherwise.
+llvm::Type *getImage3DTy(
+    llvm::LLVMContext &Ctx,
+    ImageTyAccessQualParam AccessQual = ImageAccessQualReadOnly);
+
+}  // namespace tgtext
+}  // namespace utils
+}  // namespace compiler
+
+#endif  // COMPILER_UTILS_TARGET_EXTENSION_TYPES_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/unique_opaque_structs_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/unique_opaque_structs_pass.h
new file mode 100644
index 0000000000000..370b58702816b
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/unique_opaque_structs_pass.h
@@ -0,0 +1,55 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// Make opaque structure types unique.
+
+#ifndef COMPILER_UTILS_UNIQUE_OPAQUE_STRUCTS_PASS_H_INCLUDED
+#define COMPILER_UTILS_UNIQUE_OPAQUE_STRUCTS_PASS_H_INCLUDED
+
+#include <llvm/IR/PassManager.h>
+
+namespace compiler {
+namespace utils {
+
+/// @addtogroup utils
+/// @{
+
+/// @brief This pass replaces instances of suffixed opaque structure types
+/// with unsuffixed versions if an unsuffixed version exists in the context.
+///
+/// When linking together two modules that declare the same opaque struct
+/// type, or deserializing a module referencing an opaque struct type in a
+/// context that already contains an opaque type with the same name, LLVM
+/// will attempt to resolve the clash by appending a suffix to the name in
+/// the module. For example, deserializing a module referencing the
+/// opencl.event_t type in a context that already has this type will result
+/// in the references all being renamed to opencl.event_t.0. This is
+/// problematic if passes rely on the name of the struct to identify them.
+/// This pass can be used to resolve this issue by searching for
+/// problematic types and replacing them with their unsuffixed version.
+class UniqueOpaqueStructsPass
+    : public llvm::PassInfoMixin<UniqueOpaqueStructsPass> {
+ public:
+  UniqueOpaqueStructsPass() = default;
+  llvm::PreservedAnalyses run(llvm::Module &, llvm::ModuleAnalysisManager &);
+};
+/// @}
+}  // namespace utils
+}  // namespace compiler
+
+#endif  // COMPILER_UTILS_UNIQUE_OPAQUE_STRUCTS_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/work_item_loops_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/work_item_loops_pass.h
new file mode 100644
index 0000000000000..b990d31ab0d99
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/work_item_loops_pass.h
@@ -0,0 +1,117 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// Work-item loops pass, splitting into "barrier regions"
+
+#ifndef COMPILER_UTILS_WORK_ITEM_LOOPS_PASS_H_INCLUDED
+#define COMPILER_UTILS_WORK_ITEM_LOOPS_PASS_H_INCLUDED
+
+#include <compiler/utils/barrier_regions.h>
+#include <compiler/utils/metadata.h>
+#include <compiler/utils/vectorization_factor.h>
+#include <llvm/ADT/StringRef.h>
+#include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/PassManager.h>
+
+#include <string>
+
+namespace llvm {
+class DominatorTree;
+}
+
+namespace compiler {
+namespace utils {
+
+class BuiltinInfo;
+class BarrierWithLiveVars;
+
+struct WorkItemLoopsPassOptions {
+  /// @brief Set to true if the pass should add extra alloca
+  /// instructions to preserve the values of variables between barriers.
+  bool IsDebug = false;
+  /// @brief Set to true if the pass should forcibly omit scalar
+  /// tail loops from wrapped vector kernels, even if the local work-group size
+  /// is not known to be a multiple of the vectorization factor.
+  bool ForceNoTail = false;
+};
+
+/// @brief The "work-item loops" pass.
+///
+/// This pass adds loops around implicitly SIMT kernels such that the original
+/// kernel is wrapped in a new function that runs over each work-item in the
+/// work-group and calls the original kernel: the scheduling model thus becomes
+/// explicit.
+///
+/// The work-item loops pass assumes that:
+///
+/// * Any functions containing barrier-like functions have already been inlined
+/// into the kernel entry points
+/// * the IDs of pairs of barrier-like functions align between 'main' and 'tail'
+/// kernels.
+///
+/// Both of these can be achieved by first running the PrepareBarriersPass.
+///
+/// The pass will query a kernel function for the `reqd_work_group_size`
+/// metadata and optimize accordingly in the presence of it.
+///
+/// Runs over all kernels with "kernel entry point" metadata. Work-item orders
+/// are sourced from the "work item order" function metadata on each kernel.
+class WorkItemLoopsPass final : public llvm::PassInfoMixin<WorkItemLoopsPass> {
+ public:
+  /// @brief Constructor.
+  WorkItemLoopsPass(const WorkItemLoopsPassOptions &Options)
+      : IsDebug(Options.IsDebug), ForceNoTail(Options.ForceNoTail) {}
+
+  llvm::PreservedAnalyses run(llvm::Module &, llvm::ModuleAnalysisManager &);
+
+ private:
+  /// @brief Make the work-item-loop wrapper function.
+  /// This creates a wrapper function that iterates over a work group, calling
+  /// the kernel for each work item, respecting the semantics of any barriers
+  /// present. The wrapped kernel may be a scalar kernel, a vectorized kernel,
+  /// or both. When the wrapped kernel wraps both a vector and scalar kernel,
+  /// all vectorized work items will be executed first, and the scalar tail
+  /// last.
+  ///
+  /// The wrapper function is created as a new function suffixed by
+  /// ".mux-barrier-wrapper". The original unwrapped kernel(s) will be left in
+  /// the Module, but marked as internal linkage so later passes can remove
+  /// them if uncalled once inlined into the wrapper function.
+  ///
+  /// When wrapping only a scalar kernel, or only a vector kernel, pass the
+  /// same Barrier object as both Barrier input parameters.
+  ///
+  /// @param[in] barrierMain the Barrier object of the main kernel function
+  /// @param[in] barrierTail the Barrier object of the tail kernel function
+  /// (may be nullptr).
+  /// @param[in] baseName the base name to use on the new wrapper function
+  /// @param[in] M the module the kernels live in
+  /// @param[in] BI BuiltinInfo providing builtin information
+  /// @return The new wrapper function
+  llvm::Function *makeWrapperFunction(BarrierWithLiveVars &barrierMain,
+                                      BarrierWithLiveVars *barrierTail,
+                                      llvm::StringRef baseName, llvm::Module &M,
+                                      BuiltinInfo &BI);
+
+  const bool IsDebug;
+  const bool ForceNoTail;
+};
+}  // namespace utils
+}  // namespace compiler
+
+#endif  // COMPILER_UTILS_WORK_ITEM_LOOPS_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/attributes.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/attributes.cpp
new file mode 100644
index 0000000000000..77ecd0513b4b0
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/attributes.cpp
@@ -0,0 +1,219 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <compiler/utils/attributes.h>
+#include <llvm/ADT/StringExtras.h>
+#include <llvm/ADT/StringSwitch.h>
+#include <llvm/IR/Function.h>
+#include <llvm/IR/Instructions.h>
+#include <llvm/IR/Module.h>
+
+#include <optional>
+
+namespace compiler {
+namespace utils {
+using namespace llvm;
+
+static constexpr const char *MuxKernelAttrName = "mux-kernel";
+
+void setIsKernel(Function &F) { F.addFnAttr(MuxKernelAttrName, ""); }
+
+void setIsKernelEntryPt(Function &F) {
+  F.addFnAttr(MuxKernelAttrName, "entry-point");
+}
+
+bool isKernel(const Function &F) {
+  return F.getFnAttribute(MuxKernelAttrName).isValid();
+}
+
+bool isKernelEntryPt(const Function &F) {
+  // F is an entry point only when it carries the "mux-kernel" attribute and
+  // that attribute's string value is exactly "entry-point".
+  const Attribute KernelAttr = F.getFnAttribute(MuxKernelAttrName);
+  const bool HasKernelAttr = KernelAttr.isValid();
+  return HasKernelAttr && KernelAttr.getValueAsString() == "entry-point";
+}
+
+void dropIsKernel(Function &F) { F.removeFnAttr(MuxKernelAttrName); }
+
+void takeIsKernel(Function &ToF, Function &FromF) {
+  if (!isKernel(FromF)) {
+    return;
+  }
+  // Check whether we need to add entry-point data.
+  const bool IsEntryPt = isKernelEntryPt(FromF);
+  // Drop all data for simplicity
+  dropIsKernel(ToF);
+  dropIsKernel(FromF);
+  // Add the new data
+  IsEntryPt ? setIsKernelEntryPt(ToF) : setIsKernel(ToF);
+}
+
+static StringRef getFnNameFromAttr(const Function &F, StringRef AttrName) {
+  // Yields the string value of function attribute AttrName on F, or an empty
+  // string when F does not carry that attribute.
+  const Attribute NameAttr = F.getFnAttribute(AttrName);
+  const bool Present = NameAttr.isValid();
+  return Present ? NameAttr.getValueAsString() : StringRef("");
+}
+
+static constexpr const char *OrigFnNameAttr = "mux-orig-fn";
+
+void setOrigFnName(Function &F) { F.addFnAttr(OrigFnNameAttr, F.getName()); }
+
+StringRef getOrigFnName(const Function &F) {
+  return getFnNameFromAttr(F, OrigFnNameAttr);
+}
+
+StringRef getOrigFnNameOrFnName(const Function &F) {
+  auto N = getFnNameFromAttr(F, OrigFnNameAttr);
+  return N.empty() ? F.getName() : N;
+}
+
+static constexpr const char *BaseFnNameAttr = "mux-base-fn-name";
+
+void setBaseFnName(Function &F, StringRef N) { F.addFnAttr(BaseFnNameAttr, N); }
+
+StringRef getBaseFnName(const Function &F) {
+  return getFnNameFromAttr(F, BaseFnNameAttr);
+}
+
+StringRef getBaseFnNameOrFnName(const Function &F) {
+  auto N = getFnNameFromAttr(F, BaseFnNameAttr);
+  return N.empty() ? F.getName() : N;
+}
+
+StringRef getOrSetBaseFnName(Function &F, const Function &SetFromF) {
+  const Attribute Attr = F.getFnAttribute(BaseFnNameAttr);
+  if (Attr.isValid()) {
+    return Attr.getValueAsString();
+  }
+
+  // Try and peer through the original function's name
+  setBaseFnName(F, getBaseFnNameOrFnName(SetFromF));
+  // Hand back the copy held by F's new attribute rather than a reference
+  // into SetFromF, which the caller may later rename or erase.
+  return getBaseFnName(F);
+}
+
+// Parses Attr's value as a base-10 IntTy, or std::nullopt on any failure.
+template <typename IntTy>
+static std::optional<IntTy> getStringFnAttrAsInt(const Attribute &Attr) {
+  if (Attr.isValid()) {
+    IntTy AttrValue = 0;
+    if (!Attr.getValueAsString().getAsInteger(10, AttrValue)) {
+      return AttrValue;
+    }
+  }
+  return std::nullopt;
+}
+
+static constexpr const char *LocalMemUsageAttrName = "mux-local-mem-usage";
+
+void setLocalMemoryUsage(Function &F, uint64_t LocalMemUsage) {
+  const Attribute Attr = Attribute::get(F.getContext(), LocalMemUsageAttrName,
+                                        utostr(LocalMemUsage));
+  F.addFnAttr(Attr);
+}
+
+std::optional<uint64_t> getLocalMemoryUsage(const Function &F) {
+  const Attribute Attr = F.getFnAttribute(LocalMemUsageAttrName);
+  // Parse as unsigned 64-bit so every value the setter can store round-trips.
+  return getStringFnAttrAsInt<uint64_t>(Attr);
+}
+
+static constexpr const char *DMAReqdSizeBytesAttrName = "mux-dma-reqd-size";
+
+void setDMAReqdSizeBytes(Function &F, uint32_t DMASizeBytes) {
+  const Attribute Attr = Attribute::get(
+      F.getContext(), DMAReqdSizeBytesAttrName, utostr(DMASizeBytes));
+  F.addFnAttr(Attr);
+}
+
+std::optional<uint32_t> getDMAReqdSizeBytes(const Function &F) {
+  const Attribute Attr = F.getFnAttribute(DMAReqdSizeBytesAttrName);
+  // Parse as unsigned 32-bit; negative or too-large values give std::nullopt.
+  return getStringFnAttrAsInt<uint32_t>(Attr);
+}
+
+static constexpr const char *BarrierScheduleAttrName = "mux-barrier-schedule";
+
+void setBarrierSchedule(CallInst &CI, BarrierSchedule Sched) {
+  StringRef Val;
+  switch (Sched) {
+    default:
+    case BarrierSchedule::Unordered:
+      Val = "unordered";
+      break;
+    case BarrierSchedule::Once:
+      Val = "once";
+      break;
+    case BarrierSchedule::ScalarTail:
+      Val = "scalar-tail";
+      break;
+    case BarrierSchedule::Linear:
+      Val = "linear";
+      break;
+  }
+
+  const Attribute Attr =
+      Attribute::get(CI.getContext(), BarrierScheduleAttrName, Val);
+  CI.addFnAttr(Attr);
+}
+
+BarrierSchedule getBarrierSchedule(const CallInst &CI) {
+  const Attribute Attr = CI.getFnAttr(BarrierScheduleAttrName);
+  if (Attr.isValid()) {
+    return StringSwitch<BarrierSchedule>(Attr.getValueAsString())
+        .Case("once", BarrierSchedule::Once)
+        .Case("scalar-tail", BarrierSchedule::ScalarTail)
+        .Case("linear", BarrierSchedule::Linear)
+        .Default(BarrierSchedule::Unordered);
+  }
+  return BarrierSchedule::Unordered;
+}
+
+static constexpr const char *MuxDegenerateSubgroupsAttrName =
+    "mux-degenerate-subgroups";
+
+void setHasDegenerateSubgroups(Function &F) {
+  F.addFnAttr(MuxDegenerateSubgroupsAttrName);
+}
+
+bool hasDegenerateSubgroups(const Function &F) {
+  const Attribute Attr = F.getFnAttribute(MuxDegenerateSubgroupsAttrName);
+  return Attr.isValid();
+}
+
+static constexpr const char *MuxNoSubgroupsAttrName = "mux-no-subgroups";
+
+void setHasNoExplicitSubgroups(Function &F) {
+  F.addFnAttr(MuxNoSubgroupsAttrName);
+}
+
+bool hasNoExplicitSubgroups(const Function &F) {
+  const Attribute Attr = F.getFnAttribute(MuxNoSubgroupsAttrName);
+  return Attr.isValid();
+}
+
+unsigned getMuxSubgroupSize(const llvm::Function &) {
+  // FIXME: The mux sub-group size is currently assumed to be 1 for all
+  // functions, kernels, and targets. This helper function is just to avoid
+  // hard-coding the constant 1 in places that will eventually need updating.
+  return 1;
+}
+}  // namespace utils
+}  // namespace compiler
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
new file mode 100644
index 0000000000000..9d3dd8eb7b2f5
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
@@ -0,0 +1,1497 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <compiler/utils/attributes.h>
+#include <compiler/utils/barrier_regions.h>
+#include <compiler/utils/builtin_info.h>
+#include <compiler/utils/group_collective_helpers.h>
+#include <compiler/utils/metadata.h>
+#include <compiler/utils/pass_functions.h>
+#include <llvm/ADT/SetOperations.h>
+#include <llvm/ADT/SetVector.h>
+#include <llvm/ADT/SmallSet.h>
+#include <llvm/ADT/StringSet.h>
+#include <llvm/ADT/TinyPtrVector.h>
+#include <llvm/IR/Constants.h>
+#include <llvm/IR/DIBuilder.h>
+#include <llvm/IR/DebugInfo.h>
+#include <llvm/IR/DerivedTypes.h>
+#include <llvm/IR/Dominators.h>
+#include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/Instructions.h>
+#include <llvm/IR/IntrinsicInst.h>
+#include <llvm/IR/Module.h>
+#include <llvm/Transforms/Utils/Cloning.h>
+#include <llvm/Transforms/Utils/LCSSA.h>
+#include <llvm/Transforms/Utils/Local.h>
+#include <multi_llvm/multi_llvm.h>
+
+#include <optional>
+
+using namespace llvm;
+
+#define NDEBUG_BARRIER
+#define DEBUG_TYPE "barrier-regions"
+
+namespace {
+using AlignIntTy = uint64_t;
+
+/// @brief Returns the group collective described by a call instruction if it
+/// is a work-group scope mux group collective, and std::nullopt otherwise.
+std::optional<compiler::utils::GroupCollective> getWorkGroupCollectiveCall(
+    Instruction *inst, compiler::utils::BuiltinInfo &bi) {
+  auto *const ci = dyn_cast_or_null<CallInst>(inst);
+  if (!ci) {
+    return std::nullopt;
+  }
+
+  Function *callee = ci->getCalledFunction();
+  assert(callee && "could not get called function");
+  auto info = bi.isMuxGroupCollective(bi.analyzeBuiltin(*callee).ID);
+  if (info && info->isWorkGroupScope()) {
+    return info;
+  }
+  return std::nullopt;
+}
+
+/// @brief Builds a stub function containing only a void return instruction.
+///
+/// @note This is useful for client debuggers that want to break on a
+/// particular barrier and work item. Customer specific passes can fill the
+/// contents since it may involve inline assembly for breakpoint traps. The
+/// stub function takes a single i32 argument which is an id identifying the
+/// barrier which invoked the stub. A client debugger should be able to read
+/// this argument using the arch calling convention even without debug info
+/// since it's always the first argument, although customer passes may
+/// rearrange parameters later.
+///
+/// @param[in] name What to name the stub function.
+/// @param[in] module Current module.
+/// @param[in] cc Calling convention for function.
+///
+/// @return Return function created.
+Function *MakeStubFunction(StringRef name, Module &module, CallingConv::ID cc) {
+  // If we've already created a stub return the existing function
+  if (Function *existing = module.getFunction(name)) {
+    return existing;
+  }
+
+  auto &context = module.getContext();
+  // 32-bit integer parameter
+  IntegerType *int32_type = IntegerType::get(context, 32);
+  // Function returns void
+  FunctionType *func_type =
+      FunctionType::get(Type::getVoidTy(context), {int32_type}, false);
+
+  // Create function in module
+  Function *stub_func =
+      Function::Create(func_type, Function::ExternalLinkage, name, &module);
+
+  // Don't inline the function since we want the debugger to be able to hook it
+  stub_func->addFnAttr(Attribute::NoInline);
+
+  // we don't use exceptions
+  stub_func->addFnAttr(Attribute::NoUnwind);
+  stub_func->setCallingConv(cc);
+
+  // No stub or cloned function should have SPIR_KERNEL calling convention.
+  // Please consider using SPIR_FUNC instead of SPIR_KERNEL. In case the
+  // original code has a different calling convention, we should preserve that
+  // one.
+  assert(cc != CallingConv::SPIR_KERNEL && "calling convention mismatch");
+
+  // Single basic block containing only a return void instruction
+  IRBuilder<> IRBuilder(BasicBlock::Create(context, "entry", stub_func));
+  IRBuilder.CreateRetVoid();
+
+  // Build debug info for function if compiled with -g
+  DIBuilder DIB(module, /*AllowUnresolved*/ false);
+
+  // Create a compile unit to attach the stub's debug info to
+  auto *cu = DIB.createCompileUnit(
+      dwarf::DW_LANG_OpenCL, DIB.createFile("debug", "/"), "", false, "", 0);
+
+  // Create DISubprogram metadata for function
+  auto type_array =
+      DIB.getOrCreateTypeArray({DIB.createUnspecifiedParameter()});
+  auto subprogram_type = DIB.createSubroutineType(type_array);
+  auto DISubprogram = DIB.createFunction(
+      cu->getFile(), name, name, cu->getFile(), 0, subprogram_type, 0,
+      DINode::FlagZero, DISubprogram::SPFlagDefinition);
+
+  // Set function compile unit
+  DISubprogram->replaceUnit(cu);
+
+  // Assigned debug info to function
+  stub_func->setSubprogram(DISubprogram);
+
+  DIB.finalize();
+
+  return stub_func;
+}
+
+/// @brief Check whether this value is valid as def.
+///
+/// @param[in] v Value for checking.
+///
+/// @return True = valid for definition, False = not valid.
+inline bool CheckValidDef(Value *v) {
+  return !isa<BranchInst>(v) && !isa<ReturnInst>(v);
+}
+
+/// @brief Check whether this value is valid as use.
+///
+/// @param[in] v - value for checking.
+///
+/// @return True = valid for use, False = not valid.
+inline bool CheckValidUse(Value *v) {
+  return !isa<Constant>(v) && !isa<BasicBlock>(v) && !isa<MetadataAsValue>(v);
+}
+
+bool IsRematerializableBuiltinCall(Value *v, compiler::utils::BuiltinInfo &bi) {
+  auto *const call = dyn_cast<CallInst>(v);
+  if (!call) {
+    return false;
+  }
+  assert(call->getCalledFunction() && "Could not get called function");
+  const auto builtin = bi.analyzeBuiltin(*call->getCalledFunction());
+  if (!(builtin.properties &
+        compiler::utils::eBuiltinPropertyRematerializable)) {
+    return false;
+  }
+  // A flagged builtin only qualifies when no operand is an Instruction.
+  return llvm::none_of(call->operands(),
+                       [](const Use &op) { return isa<Instruction>(op.get()); });
+}
+
+// It traces through instructions with a single Instruction operand, looking
+// for work item functions or function arguments.
+bool IsTrivialValue(Value *v, unsigned depth,
+                    compiler::utils::BuiltinInfo &bi) {
+  while (depth--) {
+    auto *const I = dyn_cast<Instruction>(v);
+    if (!I || IsRematerializableBuiltinCall(v, bi)) {
+      return true;
+    }
+
+    // Pass through a vector splat to the splatted value
+    if (auto *const shuffle = dyn_cast<ShuffleVectorInst>(I)) {
+      if (shuffle->isZeroEltSplat()) {
+        if (auto *const ins =
+                dyn_cast<InsertElementInst>(shuffle->getOperand(0))) {
+          if (auto *const src = dyn_cast<Instruction>(ins->getOperand(1))) {
+            v = src;
+            continue;
+          } else {
+            // Splat of a non-Instruction (i.e. an Argument)
+            return true;
+          }
+        }
+      }
+      return false;
+    }
+
+    // Consider only certain trivial operations
+    if (!I->isBinaryOp() && !I->isCast() && !I->isUnaryOp()) {
+      return false;
+    }
+
+    Value *chain = nullptr;
+    for (auto *op : I->operand_values()) {
+      if (auto *const opI = dyn_cast<Instruction>(op)) {
+        if (!chain) {
+          chain = opI;
+        } else if (chain != op) {
+          // It's non-trivial if it has more than one Instruction operand.
+          return false;
+        }
+      }
+    }
+
+    // It's trivial if it didn't have any operands that were instructions.
+    if (!chain) {
+      return true;
+    }
+
+    v = chain;
+  }
+  return false;
+}
+
+// GEPs typically have a low cost, allow up to 1 non-trivial operand
+// (including the pointer operand as well as the indices).
+bool IsTrivialGEP(Value *v, SmallVectorImpl<Value *> &operands) {
+  auto *const gep = dyn_cast<GetElementPtrInst>(v);
+  if (!gep) {
+    return false;
+  }
+
+  // Gather locally so `operands` stays untouched for a non-trivial GEP.
+  SmallVector<Value *, 2> inst_operands;
+  for (auto *op : gep->operand_values()) {
+    if (!isa<Instruction>(op)) {
+      continue;
+    }
+    if (!inst_operands.empty()) {
+      return false;
+    }
+    inst_operands.push_back(op);
+  }
+  operands.append(inst_operands.begin(), inst_operands.end());
+  return true;
+}
+
+/// @brief Update all basic block edges for PHINodes, and drop edges from
+/// basic blocks that are not in the new function (which only consists of
+/// the subset of blocks that make up one region).
+///
+/// @param[in] BB Basic block to process.
+/// @param[in] vmap Map for value for cloning.
+void UpdateAndTrimPHINodeEdges(BasicBlock *BB, ValueToValueMapTy &vmap) {
+  for (auto &phi : BB->phis()) {
+    for (unsigned i = 0; i < phi.getNumIncomingValues(); i++) {
+      const BasicBlock *incoming_bb = phi.getIncomingBlock(i);
+
+      // If the incoming basic block was processed during cloning then
+      // update the edge, if it wasn't then it is not in the region so
+      // remove it.
+      if (vmap.count(incoming_bb)) {
+        Value *updated_bb = vmap[incoming_bb];
+        phi.setIncomingBlock(i, cast<BasicBlock>(updated_bb));
+      } else {
+        // Note: Step the index back so that the loop's i++ revisits slot i,
+        // which reflects the updated post-deletion indices.
+        phi.removeIncomingValue(i--);
+      }
+    }
+  }
+}
+
+/// @brief Returns true if the type is a struct type containing any scalable
+/// vectors in its list of elements
+bool isStructWithScalables(Type *ty) {
+  // Anything that is not a struct type yields false.
+  auto *const struct_ty = dyn_cast<StructType>(ty);
+  return struct_ty && any_of(struct_ty->elements(), [](Type *elt_ty) {
+           return isa<ScalableVectorType>(elt_ty);
+         });
+}
+
+}  // namespace
+
+Value *compiler::utils::Barrier::LiveValuesHelper::getExtractValueGEP(
+    const Value *live) {
+  auto *const extract = dyn_cast<ExtractValueInst>(live);
+  // Extracts with multiple indices are not handled, and a value that is not
+  // an extract at all has no GEP to offer here.
+  if (!extract || extract->getIndices().size() != 1) {
+    return nullptr;
+  }
+  return getGEP(extract->getAggregateOperand(), extract->getIndices()[0]);
+}
+
+Value *compiler::utils::Barrier::LiveValuesHelper::getGEP(const Value *live,
+                                                          unsigned member_idx) {
+  auto key = std::make_pair(live, member_idx);
+  if (auto gep_it = live_GEPs.find(key); gep_it != live_GEPs.end()) {
+    return gep_it->second;
+  }
+
+  Value *gep;
+  Type *data_ty = live->getType();
+  if (auto *AI = dyn_cast<AllocaInst>(live)) {
+    data_ty = AI->getAllocatedType();
+  }
+
+  if (auto field_it = barrier.live_variable_index_map_.find(key);
+      field_it != barrier.live_variable_index_map_.end()) {
+    LLVMContext &context = barrier.module_.getContext();
+    const unsigned field_index = field_it->second;
+    Value *live_variable_info_idxs[2] = {
+        ConstantInt::get(Type::getInt32Ty(context), 0),
+        ConstantInt::get(Type::getInt32Ty(context), field_index)};
+
+    gep = gepBuilder.CreateInBoundsGEP(barrier.live_var_mem_ty_, barrier_struct,
+                                       live_variable_info_idxs,
+                                       Twine("live_gep_") + live->getName());
+  } else if (auto field_it = barrier.live_variable_scalables_map_.find(key);
+             field_it != barrier.live_variable_scalables_map_.end()) {
+    const unsigned field_offset = field_it->second;
+    Value *scaled_offset = nullptr;
+
+    LLVMContext &context = barrier.module_.getContext();
+    if (field_offset != 0) {
+      if (!vscale) {
+        Type *size_type = gepBuilder.getIntNTy(barrier.size_t_bytes * 8);
+        vscale = gepBuilder.CreateIntrinsic(Intrinsic::vscale, size_type, {});
+      }
+      scaled_offset = gepBuilder.CreateMul(
+          vscale, gepBuilder.getIntN(barrier.size_t_bytes * 8, field_offset));
+    } else {
+      scaled_offset = ConstantInt::get(Type::getInt32Ty(context), 0);
+    }
+
+    Value *live_variable_info_idxs[3] = {
+        ConstantInt::get(Type::getInt32Ty(context), 0),
+        ConstantInt::get(Type::getInt32Ty(context),
+                         barrier.live_var_mem_scalables_index),
+        scaled_offset,
+    };
+
+    // Gep into the raw byte buffer
+    gep = gepBuilder.CreateInBoundsGEP(
+        barrier.live_var_mem_ty_, barrier_struct, live_variable_info_idxs,
+        Twine("live_gep_scalable_") + live->getName());
+
+    // Cast the pointer to the scalable vector type
+    gep = gepBuilder.CreatePointerCast(
+        gep,
+        PointerType::get(
+            data_ty,
+            cast<PointerType>(barrier_struct->getType())->getAddressSpace()));
+  } else {
+    // Fall back and see if this live variable is actually a decomposed
+    // structure type.
+    return getExtractValueGEP(live);
+  }
+
+  // Cache this GEP for later
+  live_GEPs[key] = gep;
+
+  return gep;
+}
+
+Value *compiler::utils::Barrier::LiveValuesHelper::getReload(Value *live,
+                                                             IRBuilderBase &ir,
+                                                             const char *name,
+                                                             bool reuse) {
+  auto &mapped = reloads[live];
+  if (reuse && mapped) {
+    return mapped;
+  }
+
+  if (Value *v = getGEP(live)) {
+    if (!isa<AllocaInst>(live)) {
+      // If live variable is not allocainst, insert load.
+      if (!isStructWithScalables(live->getType())) {
+        v = ir.CreateLoad(live->getType(), v, Twine(live->getName(), name));
+      } else {
+        auto *const struct_ty = cast<StructType>(live->getType());
+        // Start off with a poison value, and build the struct up member by
+        // member, reloading each member at a time from their respective
+        // offsets.
+        v = PoisonValue::get(struct_ty);
+        for (auto [idx, ty] : enumerate(struct_ty->elements())) {
+          auto *const elt_addr = getGEP(live, idx);
+          assert(elt_addr && "Could not get address of struct element");
+          auto *const reload =
+              ir.CreateLoad(ty, elt_addr, Twine(live->getName(), name));
+          v = ir.CreateInsertValue(v, reload, idx);
+        }
+      }
+    }
+    mapped = v;
+    return v;
+  }
+
+  if (auto *I = dyn_cast<Instruction>(live)) {
+    // Save these
+    auto insPoint = ir.GetInsertPoint();
+    auto *const insBB = ir.GetInsertBlock();
+
+    if (!reuse || !mapped) {
+      auto *clone = I->clone();
+      clone->setName(I->getName());
+      clone->setDebugLoc(DebugLoc());
+      ir.Insert(clone);
+      if (gepBuilder.GetInsertPoint() == ir.GetInsertPoint()) {
+        gepBuilder.SetInsertPoint(clone);
+      }
+      ir.SetInsertPoint(clone);
+      mapped = clone;
+      I = clone;
+    } else {
+      return mapped;
+    }
+
+    for (auto op_it = I->op_begin(); op_it != I->op_end();) {
+      auto &op = *op_it++;
+      if (auto *op_inst = dyn_cast<Instruction>(op.get())) {
+        ir.SetInsertPoint(I);
+        op.set(getReload(op_inst, ir, name, reuse));
+      }
+    }
+
+    // Restore the original insert point
+    ir.SetInsertPoint(insBB, insPoint);
+    return I;
+  }
+
+  return live;
+}
+
+void compiler::utils::Barrier::Run(llvm::ModuleAnalysisManager &mam) {
+  bi_ = &mam.getResult<BuiltinInfoAnalysis>(module_);
+  FindBarriers();
+
+  if (barriers_.empty()) {
+    // If there are no barriers, we can use the original function as the
+    // single barrier region.
+    barrier_graph.emplace_back();
+    auto &node = barrier_graph.back();
+    node.entry = &func_.getEntryBlock();
+    node.id = kBarrier_FirstID;
+    node.successor_ids.push_back(kBarrier_EndID);
+    kernel_id_map_[kBarrier_FirstID] = &func_;
+    return;
+  }
+
+  // If we found some barriers, we need to split up our kernel across them!
+  {
+    ModulePassManager pm;
+    // It's convenient to create LCSSA PHI nodes to stop values defined
+    // within a loop being stored to the barrier unnecessarily on every
+    // iteration (if, for instance, the loop is entirely between two
+    // barriers, but the value is used outside of that barrier region).
+    pm.addPass(llvm::createModuleToFunctionPassAdaptor(LCSSAPass()));
+    pm.run(module_, mam);
+    mam.invalidate(module_, PreservedAnalyses::allInSet<CFGAnalyses>());
+  }
+
+  // Do the splitting first in case a value is used on both sides of a barrier
+  // within the same basic block.
+  SplitBlockwithBarrier();
+  FindLiveVariables();
+
+  // Tidy up the barrier struct, removing values that we can
+  // reload/rematerialize on the other side of the barrier.
+  // NB: We don't do this if any of the barriers is a work-group broadcast. In
+  // the case that a broadcasted value is non-uniform (i.e., it depends on
+  // work-item builtins), we must preserve it in the barrier struct! This is
+  // because we can't rematerialize the local ID and broadcast that; we need
+  // to broadcast the specific local ID for the broadcasted work-item.
+  // This is very crude. We could either:
+  // 1. Trace through all candidate values we want to remove and ensure they're
+  // not being broadcasted.
+  // 2. Add some more advanced rematerialization logic to substitute
+  // rematerializable work-item functions with values specific to a given
+  // work-item. Note that the builtins we rematerialize are ultimately up to
+  // the BuiltinInfo to identify, so we can't assume anything here and would
+  // have to defer back to the BuiltinInfo to do this correctly.
+  if (llvm::none_of(barriers_, [this](llvm::CallInst *const CI) {
+        auto Info = getWorkGroupCollectiveCall(CI, *bi_);
+        return Info && Info->isBroadcast();
+      })) {
+    TidyLiveVariables();
+  }
+
+  MakeLiveVariableMemType();
+  SeperateKernelWithBarrier();
+}
+
+void compiler::utils::Barrier::replaceSubkernel(Function *from, Function *to) {
+  for (auto &k : kernel_id_map_) {
+    if (k.second == from) {
+      k.second = to;
+    }
+  }
+}
+
+/// @brief Find Barriers.
+void compiler::utils::Barrier::FindBarriers() {
+  SmallVector<std::pair<unsigned, CallInst *>, 8> orderedBarriers;
+
+  // Collect direct calls to mux builtins that carry a work-group barrier ID.
+  for (BasicBlock &b : func_) {
+    for (Instruction &bi : b) {
+      auto *const call_inst = dyn_cast<CallInst>(&bi);
+      if (!call_inst || !call_inst->getCalledFunction()) {
+        continue;
+      }
+      const auto B = bi_->analyzeBuiltin(*call_inst->getCalledFunction());
+      if (!BuiltinInfo::isMuxBuiltinWithWGBarrierID(B.ID)) {
+        continue;
+      }
+      unsigned id = ~0u;  // kept when the ID operand is not a constant
+      if (auto *const id_c = dyn_cast<ConstantInt>(call_inst->getOperand(0))) {
+        id = id_c->getZExtValue();
+      }
+      orderedBarriers.emplace_back(id, call_inst);
+    }
+  }
+
+  // Order by ID only, keeping program order between equal IDs: sorting whole
+  // pairs would break ties on pointer values, which vary from run to run.
+  llvm::stable_sort(orderedBarriers, llvm::less_first());
+  for (const auto &barrier : orderedBarriers) {
+    barriers_.push_back(barrier.second);
+  }
+}
+
+/// @brief Split block with barrier.
+void compiler::utils::Barrier::SplitBlockwithBarrier() {
+  // If debugging, create stub functions in the module which will be invoked
+  // before each barrier, and after each barrier, by every work item.
+  Function *entry_stub = nullptr;
+  Function *exit_stub = nullptr;
+  if (is_debug_) {
+    CallingConv::ID stub_cc;
+    if (func_.getCallingConv() == CallingConv::SPIR_KERNEL) {
+      stub_cc = CallingConv::SPIR_FUNC;
+    } else {
+      stub_cc = func_.getCallingConv();
+    }
+    entry_stub = MakeStubFunction("__barrier_entry", module_, stub_cc);
+    exit_stub = MakeStubFunction("__barrier_exit", module_, stub_cc);
+  }
+
+  barrier_graph.emplace_back();
+  auto &node = barrier_graph.back();
+  node.entry = &func_.getEntryBlock();
+  node.id = kBarrier_FirstID;
+
+  unsigned barrier_id = kBarrier_StartNewID;
+  for (CallInst *split_point : barriers_) {
+    if (is_debug_) {
+      assert(entry_stub != nullptr);  // Guaranteed as is_debug_ is const.
+      assert(exit_stub != nullptr);   // Guaranteed as is_debug_ is const.
+
+      // Create call instructions invoking debug stubs for every barrier. We
+      // don't insert these into a basic block yet since we want to insert
+      // them at a point where live variables have already been loaded. This
+      // info won't be available till later.
+
+      // ID identifying which barrier invoked stub used as argument to call.
+      // This number monotonically increases from 0 for each barrier.
+      auto id = ConstantInt::get(Type::getInt32Ty(module_.getContext()),
+                                 barrier_id - kBarrier_StartNewID);
+      // Call invoking entry stub
+      auto entry_caller =
+          CallInst::Create(entry_stub, id, "", (Instruction *)nullptr);
+      entry_caller->setDebugLoc(split_point->getDebugLoc());
+      entry_caller->setCallingConv(entry_stub->getCallingConv());
+
+      // Call invoking exit stub
+      auto exit_caller =
+          CallInst::Create(exit_stub, id, "", (Instruction *)nullptr);
+      exit_caller->setDebugLoc(split_point->getDebugLoc());
+      exit_caller->setCallingConv(exit_stub->getCallingConv());
+
+      // Store call instructions in map for later insertion
+      barrier_stub_call_map_[barrier_id] =
+          std::make_pair(entry_caller, exit_caller);
+    }
+
+    barrier_graph.emplace_back();
+    auto &node = barrier_graph.back();
+    node.barrier_inst = split_point;
+    node.id = barrier_id++;
+    node.schedule = getBarrierSchedule(*split_point);
+
+    // Our scan implementation requires a linear work-item ordering, to loop
+    // over all of the 'main' and 'tail' work-items in order.
+    if (auto collective = getWorkGroupCollectiveCall(split_point, *bi_)) {
+      if (collective->isScan()) {
+        node.schedule = BarrierSchedule::Linear;
+      }
+    }
+
+    split_point->getParent()->splitBasicBlock(split_point, "barrier");
+  }
+
+  // We have to gather the basic block data after splitting, because we
+  // might not be processing barriers in program order, and things can get
+  // awfully confused.
+  for (auto &node : barrier_graph) {
+    if (node.barrier_inst) {
+      auto *const bb = node.barrier_inst->getParent();
+      barrier_id_map_[bb] = node.id;
+      barrier_successor_set_.insert(*predecessors(bb).begin());
+      node.entry = bb;
+    }
+  }
+}
+
+/// @brief Generate an empty kernel that only duplicates the source kernel's
+/// CFG
+///
+/// This is used to do a "dry run" of kernel splitting in order to obtain the
+/// dominator tree, which is needed for correct identification of values that
+/// cross the barrier.
+///
+/// @param[in] region the region to clone into the new kernel.
+/// @param[out] bbmap a mapping of original blocks onto the empty clones.
+/// @return the fake kernel
+Function *compiler::utils::Barrier::GenerateFakeKernel(
+    BarrierRegion &region, DenseMap<BasicBlock *, BasicBlock *> &bbmap) {
+  LLVMContext &context = module_.getContext();
+
+  // Make new kernel function.
+  FunctionType *new_fty = FunctionType::get(Type::getVoidTy(context), false);
+  Function *new_kernel =
+      Function::Create(new_fty, Function::InternalLinkage, "tmp", &module_);
+  ValueToValueMapTy vmap;
+
+  for (auto *bb : region.blocks) {
+    BasicBlock *new_bb = BasicBlock::Create(context, "", new_kernel);
+    if (region.barrier_blocks.count(bb)) {
+      ReturnInst::Create(context, nullptr, new_bb);
+    } else {
+      bb->getTerminator()->clone()->insertInto(new_bb, new_bb->end());
+    }
+    vmap[bb] = new_bb;
+    bbmap[bb] = new_bb;
+  }
+
+  const RemapFlags remapFlags =
+      RF_IgnoreMissingLocals | llvm::RF_ReuseAndMutateDistinctMDs;
+  for (auto &f : *new_kernel) {
+    auto *term = f.getTerminator();
+    RemapInstruction(term, vmap, remapFlags);
+  }
+
+  return new_kernel;
+}
+
+/// @brief Obtain a set of Basic Blocks for an inter-barrier region
+///
+/// It traverses the CFG, following successors, until it hits a barrier,
+/// building the region's internal data.
+///
+/// @param[out] region the region to process
+void compiler::utils::Barrier::GatherBarrierRegionBlocks(
+    BarrierRegion &region) {
+  DenseSet<BasicBlock *> seen;
+  region.blocks.push_back(region.entry);
+  seen.insert(region.entry);
+  // region.blocks doubles as the worklist: it grows while being walked.
+  for (size_t next = 0; next < region.blocks.size(); ++next) {
+    BasicBlock *const block = region.blocks[next];
+    if (barrier_successor_set_.count(block)) {
+      region.barrier_blocks.insert(block);
+      continue;
+    }
+    for (BasicBlock *succ : successors(block)) {
+      if (seen.insert(succ).second) {
+        region.blocks.push_back(succ);
+      }
+    }
+  }
+}
+
+/// @brief Obtain a set of Values used in a region that cross a barrier
+///
+/// A value use crosses a barrier in the following cases:
+/// * Its use is not in the same region as the definition
+/// * Its definition does not dominate the use
+///
+/// @param[in] region The inter-barrier region
+/// @param[in] ignore set of values to ignore
+void compiler::utils::Barrier::GatherBarrierRegionUses(
+    BarrierRegion &region, DenseSet<Value *> &ignore) {
+  DenseMap<BasicBlock *, BasicBlock *> bbmap;
+  Function *fake_func = GenerateFakeKernel(region, bbmap);
+
+  // We should check the dominance relation between definition bb of live
+  // variables and user bb. If def bb does not dominate user bb, the user is
+  // modified by live variable information.
+  DominatorTree DT;
+  DT.recalculate(*fake_func);
+
+  for (auto *BB : region.blocks) {
+    BasicBlock *BBclone = bbmap[BB];
+    for (auto &I : *BB) {
+      if (PHINode *pn = dyn_cast<PHINode>(&I)) {
+        for (unsigned i = 0, e = pn->getNumIncomingValues(); i != e; i++) {
+          Value *val = pn->getIncomingValue(i);
+          if (CheckValidUse(val) && !ignore.count(val)) {
+            if (auto *inst = dyn_cast<Instruction>(val)) {
+              BasicBlock *incoming = pn->getIncomingBlock(i);
+              BasicBlock *parent = inst->getParent();
+              // If the incoming edge comes from outside the region, it is
+              // going to get removed anyway, so disregard it
+              if (bbmap.count(incoming)) {
+                if (!bbmap.count(parent)) {
+                  region.uses_ext.insert(val);
+                } else if (!DT.dominates(bbmap[parent], bbmap[incoming])) {
+                  region.uses_int.insert(val);
+                }
+              }
+            }
+          }
+        }
+      } else {
+        for (Value *val : I.operands()) {
+          if (CheckValidUse(val) && !ignore.count(val)) {
+            if (auto *inst = dyn_cast<Instruction>(val)) {
+              BasicBlock *parent = inst->getParent();
+              if (!bbmap.count(parent)) {
+                region.uses_ext.insert(val);
+              } else if (!DT.dominates(bbmap[parent], BBclone)) {
+                region.uses_int.insert(val);
+              }
+            }
+          }
+        }
+      }
+      if (CheckValidDef(&I) && !I.use_empty()) {
+        region.defs.insert(&I);
+      }
+    }
+  }
+  DT.reset();
+  fake_func->eraseFromParent();
+}
+
+/// @brief Find livein and liveout variables per each basic block.
+void compiler::utils::Barrier::FindLiveVariables() {
+  DenseSet<Value *> kernel_args;
+  for (Argument &param : func_.args()) {
+    kernel_args.insert(&param);
+  }
+
+#ifndef NDEBUG
+  // Allocas are only expected in the entry block; check every other block.
+  for (auto bb_it = func_.begin(); ++bb_it != func_.end();) {
+    for (auto &inst : *bb_it) {
+      assert(!isa<AllocaInst>(inst) && "Alloca found outside entry block!");
+    }
+  }
+#endif  // ndef NDEBUG
+
+  // Every alloca in the entry block goes into the barrier struct, in case it
+  // gets indirectly referenced from the other side of a barrier.
+  for (Instruction &entry_inst : func_.front()) {
+    if (!isa<AllocaInst>(&entry_inst)) {
+      continue;
+    }
+    whole_live_variables_set_.insert(&entry_inst);
+  }
+
+  // Gather each region's blocks and uses, and merge the uses into the set.
+  for (auto &region : barrier_graph) {
+    GatherBarrierRegionBlocks(region);
+    GatherBarrierRegionUses(region, kernel_args);
+    whole_live_variables_set_.set_union(region.uses_int);
+    whole_live_variables_set_.set_union(region.uses_ext);
+  }
+}
+
+/// @brief Remove variables that are better recalculated than stored in the
+///        barrier, for instance casts and vector splats.
+void compiler::utils::Barrier::TidyLiveVariables() {
+  const auto &dl = module_.getDataLayout();
+
+  // Start off by doing a simple sweep of stuff that is better off not in the
+  // barrier: vector splats, no-op/widening casts, and single/zero index GEPs
+  // since we might as well put their source operand in the barrier, instead.
+  SmallVector<Value *, 16> removals;
+  SmallVector<Value *, 16> redirects;
+  for (auto v : whole_live_variables_set_) {
+    if (auto *const shuffle = dyn_cast<ShuffleVectorInst>(v)) {
+      if (shuffle->isZeroEltSplat()) {
+        // if we remove a vector splat, we have to make sure the scalar
+        // source operand is in the barrier instead.
+        Value *const op = shuffle->getOperand(0);
+        if (auto *const ins = dyn_cast<InsertElementInst>(op)) {
+          removals.push_back(v);
+
+          Value *const src = ins->getOperand(1);
+          // Put the source instruction in the barrier instead.
+          // If it's not an instruction, it is probably a function argument.
+          if (isa<Instruction>(src) && !IsTrivialGEP(src, redirects)) {
+            redirects.push_back(src);
+          }
+        }
+      }
+    } else if (auto *const cast = dyn_cast<CastInst>(v)) {
+      if (auto *const src = dyn_cast<Instruction>(cast->getOperand(0))) {
+        if (cast->isNoopCast(dl) ||
+            (cast->getSrcTy()->getScalarSizeInBits() <
+             cast->getDestTy()->getScalarSizeInBits())) {
+          removals.push_back(v);
+
+          // Put the source instruction in the barrier instead.
+          if (isa<Instruction>(src) && !IsTrivialGEP(src, redirects)) {
+            redirects.push_back(src);
+          }
+        }
+      } else {
+        // No casts of non-instructions in the barrier, please..
+        removals.push_back(v);
+      }
+    } else if (IsTrivialGEP(v, redirects)) {
+      removals.push_back(v);
+    }
+  }
+
+  // We put the redirects into the barrier first, so that if they in turn
+  // turn out to be redundant, we can remove them again.
+  whole_live_variables_set_.set_union(redirects);
+
+  // Remove work item calls and casts of arguments or other barrier members.
+  for (auto v : whole_live_variables_set_) {
+    if (IsTrivialValue(v, 4u, *bi_)) {
+      removals.push_back(v);
+    } else if (auto *cast = dyn_cast<CastInst>(v)) {
+      Value *op = cast->getOperand(0);
+      if (whole_live_variables_set_.count(op)) {
+        removals.push_back(v);
+      }
+    }
+  }
+  whole_live_variables_set_.set_subtract(removals);
+}
+
+/// @brief Pad the field types to an alignment by adding an int array if
+/// needed
+/// @param field_tys The vector of types representing the final structure
+/// @param offset The current offset in the structure
+/// @param alignment The required alignment
+/// @return The new offset (or original offset if no padding needed)
+unsigned compiler::utils::Barrier::PadTypeToAlignment(
+    SmallVectorImpl<Type *> &field_tys, unsigned offset, unsigned alignment) {
+  // A zero alignment requests no padding (and must not reach the modulo
+  // below).
+  if (!alignment) {
+    return offset;
+  }
+
+  // Nothing to do when the offset already sits on an alignment boundary.
+  const unsigned misalignment = offset % alignment;
+  if (misalignment == 0) {
+    return offset;
+  }
+
+  // Pad with an [N x i8] array rather than a single iN of arbitrary width,
+  // since the latter may not be supported by the backend.
+  const unsigned pad_bytes = alignment - misalignment;
+  auto *const i8_ty = Type::getInt8Ty(module_.getContext());
+  field_tys.push_back(ArrayType::get(i8_ty, pad_bytes));
+  return offset + pad_bytes;
+}
+
+/// @brief Build the struct type used to store the whole live variable set.
+void compiler::utils::Barrier::MakeLiveVariableMemType() {
+  SmallVector<Type *, 128> field_tys;
+  max_live_var_alignment = 0;
+
+  const auto &dl = module_.getDataLayout();
+
+  struct member_info {
+    /// @brief The root `value` being stored.
+    Value *value;
+    /// @brief The member index of this member inside `value`, if `value` is a
+    /// decomposed structure type. Zero otherwise.
+    unsigned member_idx;
+    /// @brief The type of `value`, or of the specific member of `value`.
+    Type *type;
+    /// @brief The alignment of the value being stored
+    unsigned alignment;
+    /// @brief The size of the value being stored
+    unsigned size;
+  };
+
+  SmallVector<member_info, 8> barrier_members;
+  barrier_members.reserve(whole_live_variables_set_.size());
+  for (Value *live_var : whole_live_variables_set_) {
+    LLVM_DEBUG(dbgs() << "whole live set:" << *live_var << '\n';
+               dbgs() << "type:" << *(live_var->getType()) << '\n';);
+    Type *field_ty = live_var->getType();
+
+    Type *member_ty = nullptr;
+    unsigned alignment = 0;
+    // If the live variable is an alloca, store its allocated type rather than
+    // the pointer to it, and remember the alloca's alignment.
+    if (const auto *AI = dyn_cast<AllocaInst>(live_var)) {
+      member_ty = AI->getAllocatedType();
+      alignment = AI->getAlign().value();
+    } else {
+      member_ty = field_ty;
+    }
+
+    std::vector<Type *> member_tys = {member_ty};
+    // If this is a struct type containing any scalable members, we must
+    // decompose the value into its individual components.
+    if (isStructWithScalables(member_ty)) {
+      member_tys = cast<StructType>(member_ty)->elements().vec();
+    }
+
+    for (auto [idx, ty] : enumerate(member_tys)) {
+      // For a scalable vector, we need the size of the equivalent fixed vector
+      // based on its known minimum size.
+      auto member_ty_fixed = ty;
+      if (isa<ScalableVectorType>(ty)) {
+        auto *const eltTy = multi_llvm::getVectorElementType(ty);
+        auto n = multi_llvm::getVectorElementCount(ty).getKnownMinValue();
+        member_ty_fixed = VectorType::get(eltTy, ElementCount::getFixed(n));
+      }
+
+      // Keep the larger of the alignment seen so far (e.g. from the alloca) and
+      // this type's preferred alignment; padding is added to honour it later.
+      const unsigned size = dl.getTypeAllocSize(member_ty_fixed);
+      alignment = std::max(dl.getPrefTypeAlign(ty).value(),
+                           static_cast<AlignIntTy>(alignment));
+      max_live_var_alignment = std::max(alignment, max_live_var_alignment);
+
+      barrier_members.push_back(
+          {live_var, static_cast<unsigned>(idx), ty, alignment, size});
+    }
+  }
+
+  // Sort the barrier members by decreasing alignment to minimise the amount
+  // of padding required (use a stable sort so it's deterministic).
+  std::stable_sort(barrier_members.begin(), barrier_members.end(),
+                   [](const member_info &lhs, const member_info &rhs) -> bool {
+                     return lhs.alignment > rhs.alignment;
+                   });
+
+  // Deal with non-scalable members first
+  unsigned offset = 0;
+  for (auto &member : barrier_members) {
+    if (isa<ScalableVectorType>(member.type)) {
+      continue;
+    }
+
+    offset = PadTypeToAlignment(field_tys, offset, member.alignment);
+
+    // Check if the value has any debug declarations attached. If so, record
+    // each of them along with the matching byte offset into the struct.
+#if LLVM_VERSION_GREATER_EQUAL(18, 0)
+    auto DbgIntrinsics = findDbgDeclares(member.value);
+#elif LLVM_VERSION_GREATER_EQUAL(17, 0)
+    auto DbgIntrinsics = FindDbgDeclareUses(member.value);
+#else
+    auto DbgIntrinsics = FindDbgAddrUses(member.value);
+#endif
+    for (auto DII : DbgIntrinsics) {
+      if (auto dbgDeclare = dyn_cast<DbgDeclareInst>(DII)) {
+        debug_intrinsics_.push_back(std::make_pair(dbgDeclare, offset));
+      }
+    }
+#if LLVM_VERSION_GREATER_EQUAL(19, 0)
+    const auto DVRDeclares = findDVRDeclares(member.value);
+    for (auto *const DVRDeclare : DVRDeclares) {
+      debug_variable_records_.push_back(std::make_pair(DVRDeclare, offset));
+    }
+#endif
+    offset += member.size;
+    live_variable_index_map_[std::make_pair(member.value, member.member_idx)] =
+        field_tys.size();
+    field_tys.push_back(member.type);
+  }
+  // Pad the end of the struct to the max alignment as we are creating an
+  // array
+  offset = PadTypeToAlignment(field_tys, offset, max_live_var_alignment);
+  live_var_mem_size_fixed = offset;  // No more offsets required.
+
+  // Now deal with any scalable members. We reset the offset to zero because
+  // scalables are indexed bytewise starting from the beginning of the
+  // variable-sized scalables section at the end of the struct.
+  SmallVector<Type *, 128> field_tys_scalable;
+  offset = 0;
+  for (auto &member : barrier_members) {
+    if (!isa<ScalableVectorType>(member.type)) {
+      continue;
+    }
+
+    offset = PadTypeToAlignment(field_tys_scalable, offset, member.alignment);
+
+    live_variable_scalables_map_[std::make_pair(member.value,
+                                                member.member_idx)] = offset;
+    offset += member.size;
+    field_tys_scalable.push_back(member.type);
+  }
+  // Pad the end of the struct to the max alignment as we are creating an
+  // array
+  offset =
+      PadTypeToAlignment(field_tys_scalable, offset, max_live_var_alignment);
+  live_var_mem_size_scalable = offset;  // No more offsets required.
+
+  LLVMContext &context = module_.getContext();
+  // If the barrier contains scalables, add a flexible byte array on the end.
+  if (offset != 0) {
+    live_var_mem_scalables_index = field_tys.size();
+    field_tys.push_back(ArrayType::get(IntegerType::getInt8Ty(context), 0));
+  }
+
+  // Create struct type for live variable memory allocation; we create this
+  // even when the type is empty. The big entry point pass depends on this
+  // to detect that the barrier pass has been executed.
+  SmallString<128> name;
+  live_var_mem_ty_ = StructType::create(
+      context, field_tys,
+      (Twine(func_.getName() + "_live_mem_info")).toStringRef(name), false);
+
+  name.clear();
+
+  LLVM_DEBUG(dbgs() << "Barrier size: " << offset << "\n";
+             dbgs() << "whole live set type:" << *(live_var_mem_ty_) << '\n';);
+}
+
+/// @brief Generate new kernel from an inter-barrier region such that no call
+/// to barriers occurs within it.
+///
+/// @param[in,out] region the inter-barrier region to create the kernel from
+/// @return the new kernel, which itself returns the ID of the next barrier
+Function *compiler::utils::Barrier::GenerateNewKernel(BarrierRegion &region) {
+  BasicBlock *entry_point = region.entry;
+  LLVMContext &context = module_.getContext();
+
+  LLVM_DEBUG(dbgs() << "\n"; unsigned i = 0; for (auto *d
+                                                  : region.blocks) {
+    dbgs() << "entry block: " << entry_point->getName() << "\n";
+    dbgs() << "region visited path [" << i++ << "] = " << d->getName()
+           << "\n\n";
+    dbgs() << *d << "\n\n";
+  });
+
+  SmallVector<Type *, 8> new_func_params;
+  // The new kernel's leading parameters are the original kernel's parameters.
+  for (const auto &arg : func_.args()) {
+    new_func_params.push_back(arg.getType());
+  }
+
+  // If we have a work group collective call, we need to create a new argument
+  // so that the result can be passed in.
+  const bool collective =
+      getWorkGroupCollectiveCall(region.barrier_inst, *bi_).has_value();
+  if (collective) {
+    new_func_params.push_back(region.barrier_inst->getType());
+  }
+
+  // Add a pointer to the live variables struct as the last parameter, if used.
+  const bool hasBarrierStruct = !whole_live_variables_set_.empty() &&
+                                region.schedule != BarrierSchedule::Once;
+  if (hasBarrierStruct) {
+    PointerType *pty = PointerType::get(live_var_mem_ty_, 0);
+    new_func_params.push_back(pty);
+  }
+
+  // Make new kernel function.
+  FunctionType *new_fty = FunctionType::get(Type::getInt32Ty(context),
+                                            new_func_params, func_.isVarArg());
+  Function *new_kernel =
+      Function::Create(new_fty, Function::InternalLinkage,
+                       func_.getName() + ".mux-barrier-region", &module_);
+
+  // Inherit the attributes of the original function.
+  new_kernel->setAttributes(func_.getAttributes());
+
+  // We also want to always inline this function (unless it is noinline).
+  if (!new_kernel->hasFnAttribute(Attribute::NoInline)) {
+    new_kernel->addFnAttr(Attribute::AlwaysInline);
+  }
+
+  // Copy the calling convention from the old function, except for
+  // SPIR_KERNEL. SPIR_KERNELs need to be split into SPIR_FUNC.
+  CallingConv::ID new_kernel_cc;
+  if (func_.getCallingConv() == CallingConv::SPIR_KERNEL) {
+    new_kernel_cc = CallingConv::SPIR_FUNC;
+  } else {
+    new_kernel_cc = func_.getCallingConv();
+  }
+  new_kernel->setCallingConv(new_kernel_cc);
+
+  // Copy the metadata into the new kernel ignoring any debug info.
+  compiler::utils::copyFunctionMetadata(func_, *new_kernel);
+
+  // We're not interested in these sub-kernels being registered as kernels.
+  // While they're technically kernels, they're only ever called from our
+  // actual wrapper entry point.
+  compiler::utils::dropIsKernel(*new_kernel);
+
+  live_variable_mem_t live_vars_defs_in_kernel;
+  ValueToValueMapTy vmap;
+  // Map the original kernel's arguments to the new kernel's leading arguments.
+  Function::arg_iterator new_arg = new_kernel->arg_begin();
+  for (const auto &arg : func_.args()) {
+    vmap[&arg] = &*(new_arg++);
+  }
+
+  // Copy a region to the new kernel function.
+  bool returns_from_kernel = false;
+  for (auto *block : region.blocks) {
+    BasicBlock *cloned_bb =
+        CloneBasicBlock(block, vmap, "", live_vars_defs_in_kernel, new_kernel);
+    vmap[block] = cloned_bb;
+
+    // Remove the terminator from the clone of a block ending at a barrier.
+    if (region.barrier_blocks.count(block)) {
+      cloned_bb->getTerminator()->eraseFromParent();
+
+      // Return the next barrier's id.
+      const unsigned next_barrier_id =
+          barrier_id_map_[block->getSingleSuccessor()];
+      ConstantInt *barrier_id_cst =
+          ConstantInt::get(Type::getInt32Ty(context), next_barrier_id);
+      auto new_ret = ReturnInst::Create(context, barrier_id_cst, cloned_bb);
+
+      // Barrier blocks should be unique.
+      region.successor_ids.push_back(next_barrier_id);
+
+      // Insert call to debug stub before return if debugging, this stub
+      // signifies that we're about to enter the next barrier
+      if (is_debug_) {
+        // Look up entry call instruction in map
+        CallInst *entry_call = barrier_stub_call_map_[next_barrier_id].first;
+
+        // Check for null since if this is the final kernel there won't be
+        // a next barrier to have an entry for.
+        if (!entry_call) {
+          continue;
+        }
+
+        // Check if the entry call already has a parent since there can be
+        // multiple return instructions in a kernel, if it does then clone
+        // the instruction first.
+        if (nullptr == entry_call->getParent()) {
+          entry_call->insertBefore(new_ret);
+        } else {
+          entry_call->clone()->insertBefore(new_ret);
+        }
+      }
+    } else if (ReturnInst *ret =
+                   dyn_cast<ReturnInst>(cloned_bb->getTerminator())) {
+      // Replace the return instruction with one returning the end barrier ID.
+      ConstantInt *cst_zero =
+          ConstantInt::get(Type::getInt32Ty(context), kBarrier_EndID);
+      ReturnInst *new_ret = ReturnInst::Create(context, cst_zero, ret);
+      ret->replaceAllUsesWith(new_ret);
+      ret->eraseFromParent();
+
+      // We can have multiple return points, but should only count it once.
+      returns_from_kernel = true;
+    }
+  }
+  if (returns_from_kernel) {
+    region.successor_ids.push_back(kBarrier_EndID);
+  }
+  // Keep things consistent
+  std::sort(region.successor_ids.begin(), region.successor_ids.end());
+
+  // Update the incoming edges to phi nodes, and drop edges to basic blocks
+  // that are not present in the new function.  Note that this must happen
+  // after all the basic blocks have been cloned, so that we know how to
+  // update the incoming edges to phi nodes that represent back edges.
+  for (auto *block : region.blocks) {
+    UpdateAndTrimPHINodeEdges(cast<BasicBlock>(vmap[block]), vmap);
+  }
+
+  BasicBlock *new_kernel_entry_block = &(new_kernel->getEntryBlock());
+  Instruction *insert_point = new_kernel_entry_block->getFirstNonPHIOrDbg();
+  auto *const cloned_barrier_call =
+      region.barrier_inst ? insert_point : nullptr;
+
+  // If we have a work group collective call, we need to remap its result from
+  // the arguments list.
+  if (collective) {
+    vmap[insert_point] = &*(new_arg++);
+  }
+
+  // The entry kernel might have allocas in it that don't get removed,
+  // so better make sure to insert after them.
+  while (isa<AllocaInst>(insert_point)) {
+    insert_point = insert_point->getNextNonDebugInstruction();
+  }
+
+  // It puts all the GEPs at the start of the kernel, but only once
+  LiveValuesHelper live_values(
+      *this, insert_point,
+      hasBarrierStruct ? compiler::utils::getLastArgument(new_kernel)
+                       : nullptr);
+
+  // Load live variables and map them.
+  // These variables are defined in a different kernel, so we insert the
+  // relevant load instructions in the entry block of the kernel.
+  {
+    // Note that if our barrier is a work group collective, its operand will
+    // probably still get reloaded here, even though it's going to get deleted,
+    // so we hope that it gets optimized away later, in this case.
+    for (const auto cur_live : region.uses_ext) {
+      IRBuilder<> insertIR(insert_point);
+      vmap[cur_live] = live_values.getReload(cur_live, insertIR, "_load", true);
+    }
+  }
+
+  SmallVector<Instruction *, 8> allocas_and_intrinsics_to_remove;
+
+  // Store only live variables that are defined in this kernel.
+  //
+  // We might like to store the variables at the point we hit the barrier.
+  // However, this is not always possible because the value definition might
+  // not dominate any or all of the exit blocks. Furthermore, if this value
+  // is used again in the same kernel after looping around the barrier, we
+  // have to be aware that the usage might be expecting the updated value.
+  // (This can happen in nested loops, where the outer increment becomes a
+  // conditional block.) Therefore, we put the store right after the
+  // definition instead.
+  for (const auto live_var : live_vars_defs_in_kernel) {
+    // If the live variable is an alloca defined in this function, then
+    // change the alloca to a GEP directly into the live variables struct
+    // otherwise we store the value to the struct. This is needed because
+    // it is possible for one live variable to reference another by
+    // pointer. When we then save them to the live variable struct they
+    // will point to the wrong address. By GEPping directly to the final
+    // live struct we resolve this issue as it will always use the final
+    // address.
+    if (auto *alloca_inst = dyn_cast<AllocaInst>(live_var)) {
+      // Check to see if it is still an alloca after vmap. If not we may
+      // have processed it before and no work needs doing as we are using
+      // the live variable struct directly.
+      if (auto *new_alloca_inst = dyn_cast<AllocaInst>(vmap[alloca_inst])) {
+        allocas_and_intrinsics_to_remove.push_back(new_alloca_inst);
+        // Also remove any assume-like intrinsics that are users of this
+        // alloca. These assumptions may not hold. For example, lifetime
+        // intrinsics are definitely dangerous, as by directly replacing their
+        // alloca operands with the address of the live variable struct, we are
+        // telling LLVM that *all* accesses of the live variable struct also
+        // start/end at that point, which is not true.
+        // Similarly, llvm.assume and llvm.experimental.noalias.scope.decl may
+        // hold for the alloca but not the live variables struct.
+        for (auto *const user : alloca_inst->users()) {
+          if (auto *const intrinsic = dyn_cast<IntrinsicInst>(user);
+              intrinsic && intrinsic->isAssumeLikeIntrinsic()) {
+            allocas_and_intrinsics_to_remove.push_back(intrinsic);
+          }
+        }
+        // Change the vmap to point to the GEP instead of the original alloca.
+        vmap[live_var] = live_values.getGEP(live_var);
+      }
+    } else {
+      // Place the new store immediately after the definition, but if it's a
+      // PHI node we have to make sure to put it after any other PHI nodes.
+      Instruction *inst = cast<Instruction>(vmap[live_var]);
+      Instruction *insert_point = inst->getNextNonDebugInstruction();
+      while (isa<PHINode>(insert_point)) {
+        insert_point = insert_point->getNextNonDebugInstruction();
+      }
+      IRBuilder<> B(insert_point);
+      if (!isStructWithScalables(live_var->getType())) {
+        auto *addr = live_values.getGEP(live_var);
+        B.CreateStore(live_var, addr);
+      } else {
+        // Store this struct containing scalable members piece-wise
+        auto member_tys = cast<StructType>(live_var->getType())->elements();
+        for (auto [idx, ty] : enumerate(member_tys)) {
+          auto *extract = B.CreateExtractValue(live_var, idx);
+          auto *extract_addr = live_values.getGEP(extract);
+          assert(extract_addr);
+          B.CreateStore(extract, extract_addr);
+        }
+      }
+    }
+  }
+
+  // Remap the entry block's instructions, from the first non-PHI one onwards.
+  insert_point = new_kernel_entry_block->getFirstNonPHIOrDbg();
+  const RemapFlags remapFlags =
+      RF_IgnoreMissingLocals | llvm::RF_ReuseAndMutateDistinctMDs;
+  BasicBlock::iterator b_iter = insert_point->getIterator();
+  while (b_iter != new_kernel_entry_block->end()) {
+    RemapInstruction(&*b_iter, vmap, remapFlags);
+    b_iter++;
+  }
+
+  // Remove barrier. We do this after creating stores so that if it's a work
+  // group collective, it will have been processed as normal above and written
+  // into the barrier struct where needed.
+  if (cloned_barrier_call) {
+    // When debugging insert a call to the exit debug stub at the insert
+    // point, this location is important since all the live variables will
+    // have been loaded by this point.
+    if (is_debug_) {
+      const unsigned barrier_id = barrier_id_map_[entry_point];
+      // Get call instruction invoking exit stub from map
+      CallInst *exit_caller = barrier_stub_call_map_[barrier_id].second;
+      exit_caller->insertAfter(cloned_barrier_call);
+      // Use updated debug info scope since call_inst will have had
+      // this set by ModifyDebugInfoScopes()
+      exit_caller->setDebugLoc(cloned_barrier_call->getDebugLoc());
+    }
+    if (collective) {
+      cloned_barrier_call->replaceAllUsesWith(vmap[cloned_barrier_call]);
+    }
+    cloned_barrier_call->eraseFromParent();
+  }
+
+  // Remap the remaining blocks; the entry block has already been remapped.
+  Function::iterator cfi = ++(new_kernel->begin());
+  const Function::iterator cfie = new_kernel->end();
+  for (; cfi != cfie; cfi++) {
+    for (Instruction &cbi : *cfi) {
+      RemapInstruction(&cbi, vmap, remapFlags);
+    }
+  }
+
+  // Remove any allocas and their dependent intrinsics that have been replaced
+  // by a GEP instruction
+  for (auto *inst : allocas_and_intrinsics_to_remove) {
+    inst->eraseFromParent();
+  }
+
+  // This needs resetting for the sake of any further new GEPs created
+  live_values.gepBuilder.SetInsertPoint(
+      new_kernel_entry_block->getFirstNonPHIOrDbg());
+
+  // If there are definitions of live variables in this function, process them
+  // here. As mentioned above regarding value stores, the user might want to
+  // load the value after it has been updated. Therefore, we place the new
+  // loads right before their uses.
+  //
+  // Potentially, this is not optimal, since it might create multiple loads.
+  // Ideally we should use some kind of reachability query to determine if
+  // the load can be placed before the store, and if not, PHI nodes could
+  // be inserted instead to get the value directly from the new definition.
+  //
+  // It would be nice not to have to build the Dominator Tree here again,
+  // since we already did it when we gathered the barrier crossing values.
+  // The problem is it's a use/user pair that crosses a barrier, not just the
+  // use itself. Some users may be dominated, and others not.
+  //
+  // NOTE it is impossible for any of these to be an Alloca.
+  DominatorTree DT;
+  DT.recalculate(*new_kernel);
+
+  for (auto OldDef : region.uses_int) {
+    Instruction *NewDef = cast<Instruction>(vmap[OldDef]);
+    BasicBlock *DefBB = NewDef->getParent();
+
+    for (auto use_it = NewDef->use_begin(); use_it != NewDef->use_end();) {
+      auto &U = *use_it++;
+      Instruction *UserInst = cast<Instruction>(U.getUser());
+      BasicBlock *UserBB = UserInst->getParent();
+
+      // Check whether user is in current function.
+      if (UserBB->getParent() == new_kernel) {
+        Instruction *load_insert = nullptr;
+
+        // Check dominance relation between def bb and user bb.
+        if (auto *PHI = dyn_cast<PHINode>(UserInst)) {
+          BasicBlock *incoming = PHI->getIncomingBlock(U);
+          if (!DT.dominates(DefBB, incoming)) {
+            load_insert = incoming->getTerminator();
+          }
+        } else if (!DT.dominates(DefBB, UserBB)) {
+          load_insert = UserInst;
+        }
+
+        if (load_insert) {
+          IRBuilder<> loadIR(load_insert);
+          U.set(live_values.getReload(OldDef, loadIR, "_reload"));
+        }
+      }
+    }
+  }
+
+  // Removing incoming PHI node edges might have created some redundant ones.
+  for (auto *BB : region.blocks) {
+    BasicBlock *cBB = cast<BasicBlock>(vmap[BB]);
+    for (auto I = cBB->begin(); I != cBB->end();) {
+      if (auto *PHI = dyn_cast<PHINode>(&*(I++))) {
+        if (auto *V = PHI->hasConstantValue()) {
+          PHI->replaceAllUsesWith(V);
+          PHI->eraseFromParent();
+        }
+      } else {
+        break;
+      }
+    }
+  }
+
+  // Remap any remaining unmapped instructions coming from DT-based reloads
+  for (auto &BB : *new_kernel) {
+    for (Instruction &I : BB) {
+      RemapInstruction(&I, vmap, remapFlags);
+    }
+  }
+
+  LLVM_DEBUG(dbgs() << "new kernel function: " << new_kernel->getName()
+                    << "\n";);
+  return new_kernel;
+}
+
+/// @brief A variant of llvm::CloneBasicBlock which additionally records which
+/// of the cloned instructions define barrier live variables.
+///
+/// @param[in] bb Basic block to copy.
+/// @param[out] vmap Receives the original-to-clone instruction mapping.
+/// @param[in] name_suffix Suffix appended to the names of cloned values.
+/// @param[out] live_defs_info Receives the live variables defined in `bb`.
+/// @param[in] F Function to create the cloned block in.
+///
+/// @return The cloned basic block.
+BasicBlock *compiler::utils::Barrier::CloneBasicBlock(
+    BasicBlock *bb, ValueToValueMapTy &vmap, const Twine &name_suffix,
+    live_variable_mem_t &live_defs_info, Function *F) {
+  BasicBlock *clone_bb = BasicBlock::Create(bb->getContext(), "", F);
+  if (bb->hasName()) {
+    clone_bb->setName(bb->getName() + name_suffix);
+  }
+
+  for (Instruction &inst : *bb) {
+    // llvm.dbg.declare intrinsics are not cloned; they are created manually
+    // later on.
+    if (isa<DbgDeclareInst>(&inst)) {
+      continue;
+    }
+
+    Instruction *clone = inst.clone();
+    if (inst.hasName()) {
+      clone->setName(inst.getName() + name_suffix);
+    }
+    clone->insertInto(clone_bb, clone_bb->end());
+    vmap[&inst] = clone;
+
+    // Note the original instruction if it defines a barrier live variable.
+    if (whole_live_variables_set_.count(&inst)) live_defs_info.insert(&inst);
+  }
+  return clone_bb;
+}
+
+/// @brief Split the kernel into one function per barrier region, and record
+/// the number of barriers in named module metadata.
+void compiler::utils::Barrier::SeperateKernelWithBarrier() {
+  if (barriers_.empty()) return;
+
+  for (auto &region : barrier_graph) {
+    kernel_id_map_[region.id] = GenerateNewKernel(region);
+  }
+
+  // Attach the barrier count to the "<function name>_barrier" metadata.
+  LLVMContext &ctx = module_.getContext();
+  auto *const count = ConstantInt::get(Type::getInt32Ty(ctx), barriers_.size());
+  ValueAsMetadata *count_md = ValueAsMetadata::get(count);
+  MDNode *count_node = MDNode::get(ctx, ArrayRef<Metadata *>(count_md));
+  SmallString<128> md_name;
+  NamedMDNode *named_md = module_.getOrInsertNamedMetadata(
+      Twine(func_.getName() + "_barrier").toStringRef(md_name));
+  named_md->addOperand(count_node);
+
+  LLVM_DEBUG({
+    for (const auto &entry : kernel_id_map_) {
+      dbgs() << "1. kernel_id[" << entry.first
+             << "] = " << entry.second->getName() << "\n";
+    }
+
+    for (unsigned id = kBarrier_FirstID;
+         id < kernel_id_map_.size() + kBarrier_FirstID; id++) {
+      dbgs() << "2. kernel_id[" << id << "] = " << kernel_id_map_[id]->getName()
+             << "\n";
+    }
+    dbgs() << "\n\n" << module_ << "\n\n";
+  });
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/builtin_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/builtin_info.cpp
new file mode 100644
index 0000000000000..2a772bbd29b45
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/builtin_info.cpp
@@ -0,0 +1,1255 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <compiler/utils/builtin_info.h>
+#include <compiler/utils/cl_builtin_info.h>
+#include <compiler/utils/group_collective_helpers.h>
+#include <compiler/utils/metadata.h>
+#include <compiler/utils/pass_functions.h>
+#include <compiler/utils/scheduling.h>
+#include <llvm/ADT/StringExtras.h>
+#include <llvm/ADT/StringSwitch.h>
+
+using namespace llvm;
+
+namespace compiler {
+namespace utils {
+
+AnalysisKey BuiltinInfoAnalysis::Key;
+
+BuiltinInfoAnalysis::BuiltinInfoAnalysis()
+    : BICallback([](const Module &) -> BuiltinInfo {
+        return BuiltinInfo(std::make_unique<CLBuiltinInfo>(nullptr));
+      }) {}  // Default callback: ignores the module, uses CLBuiltinInfo.
+
+Module *BuiltinInfo::getBuiltinsModule() {
+  // Mux builtins don't need a module, so without a language implementation
+  // there is nothing to hand back.
+  if (!LangImpl) return nullptr;
+
+  return LangImpl->getBuiltinsModule();
+}
+
+std::pair<BuiltinID, std::vector<Type *>> BuiltinInfo::identifyMuxBuiltin(
+    const Function &F) const {
+  StringRef Name = F.getName();
+  auto ID =
+      StringSwitch<BuiltinID>(Name)
+          .Case(MuxBuiltins::isftz, eMuxBuiltinIsFTZ)
+          .Case(MuxBuiltins::usefast, eMuxBuiltinUseFast)
+          .Case(MuxBuiltins::isembeddedprofile, eMuxBuiltinIsEmbeddedProfile)
+          .Case(MuxBuiltins::get_global_size, eMuxBuiltinGetGlobalSize)
+          .Case(MuxBuiltins::get_global_id, eMuxBuiltinGetGlobalId)
+          .Case(MuxBuiltins::get_global_offset, eMuxBuiltinGetGlobalOffset)
+          .Case(MuxBuiltins::get_local_size, eMuxBuiltinGetLocalSize)
+          .Case(MuxBuiltins::get_local_id, eMuxBuiltinGetLocalId)
+          .Case(MuxBuiltins::set_local_id, eMuxBuiltinSetLocalId)
+          .Case(MuxBuiltins::get_sub_group_id, eMuxBuiltinGetSubGroupId)
+          .Case(MuxBuiltins::set_sub_group_id, eMuxBuiltinSetSubGroupId)
+          .Case(MuxBuiltins::get_num_groups, eMuxBuiltinGetNumGroups)
+          .Case(MuxBuiltins::get_num_sub_groups, eMuxBuiltinGetNumSubGroups)
+          .Case(MuxBuiltins::set_num_sub_groups, eMuxBuiltinSetNumSubGroups)
+          .Case(MuxBuiltins::get_max_sub_group_size,
+                eMuxBuiltinGetMaxSubGroupSize)
+          .Case(MuxBuiltins::set_max_sub_group_size,
+                eMuxBuiltinSetMaxSubGroupSize)
+          .Case(MuxBuiltins::get_group_id, eMuxBuiltinGetGroupId)
+          .Case(MuxBuiltins::get_work_dim, eMuxBuiltinGetWorkDim)
+          .Case(MuxBuiltins::dma_read_1d, eMuxBuiltinDMARead1D)
+          .Case(MuxBuiltins::dma_read_2d, eMuxBuiltinDMARead2D)
+          .Case(MuxBuiltins::dma_read_3d, eMuxBuiltinDMARead3D)
+          .Case(MuxBuiltins::dma_write_1d, eMuxBuiltinDMAWrite1D)
+          .Case(MuxBuiltins::dma_write_2d, eMuxBuiltinDMAWrite2D)
+          .Case(MuxBuiltins::dma_write_3d, eMuxBuiltinDMAWrite3D)
+          .Case(MuxBuiltins::dma_wait, eMuxBuiltinDMAWait)
+          .Case(MuxBuiltins::get_global_linear_id, eMuxBuiltinGetGlobalLinearId)
+          .Case(MuxBuiltins::get_local_linear_id, eMuxBuiltinGetLocalLinearId)
+          .Case(MuxBuiltins::get_enqueued_local_size,
+                eMuxBuiltinGetEnqueuedLocalSize)
+          .Case(MuxBuiltins::get_sub_group_size, eMuxBuiltinGetSubGroupSize)
+          .Case(MuxBuiltins::get_sub_group_local_id,
+                eMuxBuiltinGetSubGroupLocalId)
+          .Case(MuxBuiltins::work_group_barrier, eMuxBuiltinWorkGroupBarrier)
+          .Case(MuxBuiltins::sub_group_barrier, eMuxBuiltinSubGroupBarrier)
+          .Case(MuxBuiltins::mem_barrier, eMuxBuiltinMemBarrier)
+          .Default(eBuiltinInvalid);
+  if (ID != eBuiltinInvalid) {
+    switch (ID) {
+      default:
+        return {ID, {}};
+      case eMuxBuiltinDMARead1D:
+      case eMuxBuiltinDMARead2D:
+      case eMuxBuiltinDMARead3D:
+      case eMuxBuiltinDMAWrite1D:
+      case eMuxBuiltinDMAWrite2D:
+      case eMuxBuiltinDMAWrite3D:
+        // Return the event type used by these builtins. The event type is
+        // required to declare/define these builtins, so return it here for
+        // the sake of completeness. The event type doesn't change the
+        // builtins' name (i.e., it's not mangled) as it's required to be
+        // consistent at any single snapshot of the module, though it may
+        // change through time.
+        return {ID, {F.getReturnType()}};
+    }
+  }
+
+  // Now check for group functions, which are a bit more involved as there's
+  // many of them and they're also mangled. We enforce that the mangling makes
+  // sense, otherwise the builtin is declared as invalid.
+  const bool IsSubgroupOp = Name.consume_front("__mux_sub_group_");
+  const bool IsVecgroupOp = Name.consume_front("__mux_vec_group_");
+  if (!IsSubgroupOp && !IsVecgroupOp &&
+      !Name.consume_front("__mux_work_group_")) {
+    return {eBuiltinInvalid, {}};
+  }
+
+#define SCOPED_GROUP_OP(OP)                 \
+  (IsSubgroupOp   ? eMuxBuiltinSubgroup##OP \
+   : IsVecgroupOp ? eMuxBuiltinVecgroup##OP \
+                  : eMuxBuiltinWorkgroup##OP)
+
+  // Most group operations have one argument, except for broadcasts. Despite
+  // that, we don't mangle the indices as they're fixed.
+  const unsigned NumExpectedMangledArgs = 1;
+
+  if (Name.consume_front("any")) {
+    ID = SCOPED_GROUP_OP(Any);
+  } else if (Name.consume_front("all")) {
+    ID = SCOPED_GROUP_OP(All);
+  } else if (Name.consume_front("broadcast")) {
+    ID = SCOPED_GROUP_OP(Broadcast);
+  } else if (Name.consume_front("shuffle_up")) {
+    if (!IsSubgroupOp) {
+      return {eBuiltinInvalid, {}};
+    }
+    ID = eMuxBuiltinSubgroupShuffleUp;
+  } else if (Name.consume_front("shuffle_down")) {
+    if (!IsSubgroupOp) {
+      return {eBuiltinInvalid, {}};
+    }
+    ID = eMuxBuiltinSubgroupShuffleDown;
+  } else if (Name.consume_front("shuffle_xor")) {
+    if (!IsSubgroupOp) {
+      return {eBuiltinInvalid, {}};
+    }
+    ID = eMuxBuiltinSubgroupShuffleXor;
+  } else if (Name.consume_front("shuffle")) {
+    if (!IsSubgroupOp) {
+      return {eBuiltinInvalid, {}};
+    }
+    ID = eMuxBuiltinSubgroupShuffle;
+  } else if (Name.consume_front("reduce_")) {
+    auto NextIdx = Name.find_first_of('_');
+    std::string Group = Name.substr(0, NextIdx).str();
+    Name = Name.drop_front(Group.size());
+
+    if (Group == "logical") {
+      Name.consume_front("_");  // Drop the underscore; Name may be empty here
+      auto NextIdx = Name.find_first_of('_');
+      auto RealGroup = Name.substr(0, NextIdx);
+      Group += "_" + RealGroup.str();
+      Name = Name.drop_front(RealGroup.size());
+    }
+
+    ID = StringSwitch<BuiltinID>(Group)
+             .Case("add", SCOPED_GROUP_OP(ReduceAdd))
+             .Case("fadd", SCOPED_GROUP_OP(ReduceFAdd))
+             .Case("mul", SCOPED_GROUP_OP(ReduceMul))
+             .Case("fmul", SCOPED_GROUP_OP(ReduceFMul))
+             .Case("smin", SCOPED_GROUP_OP(ReduceSMin))
+             .Case("umin", SCOPED_GROUP_OP(ReduceUMin))
+             .Case("fmin", SCOPED_GROUP_OP(ReduceFMin))
+             .Case("smax", SCOPED_GROUP_OP(ReduceSMax))
+             .Case("umax", SCOPED_GROUP_OP(ReduceUMax))
+             .Case("fmax", SCOPED_GROUP_OP(ReduceFMax))
+             .Case("and", SCOPED_GROUP_OP(ReduceAnd))
+             .Case("or", SCOPED_GROUP_OP(ReduceOr))
+             .Case("xor", SCOPED_GROUP_OP(ReduceXor))
+             .Case("logical_and", SCOPED_GROUP_OP(ReduceLogicalAnd))
+             .Case("logical_or", SCOPED_GROUP_OP(ReduceLogicalOr))
+             .Case("logical_xor", SCOPED_GROUP_OP(ReduceLogicalXor))
+             .Default(eBuiltinInvalid);
+  } else if (Name.consume_front("scan_")) {
+    const bool IsInclusive = Name.consume_front("inclusive_");
+    if (!IsInclusive && !Name.consume_front("exclusive_")) {
+      return {eBuiltinInvalid, {}};
+    }
+
+    auto NextIdx = Name.find_first_of('_');
+    std::string Group = Name.substr(0, NextIdx).str();
+    Name = Name.drop_front(Group.size());
+
+    if (Group == "logical") {
+      auto NextIdx = Name.find_first_of('_', /*From*/ 1);
+      auto RealGroup = Name.substr(0, NextIdx);
+      Group += RealGroup.str();
+      Name = Name.drop_front(RealGroup.size());
+    }
+
+    ID = StringSwitch<BuiltinID>(Group)
+             .Case("add", IsInclusive ? SCOPED_GROUP_OP(ScanAddInclusive)
+                                      : SCOPED_GROUP_OP(ScanAddExclusive))
+             .Case("fadd", IsInclusive ? SCOPED_GROUP_OP(ScanFAddInclusive)
+                                       : SCOPED_GROUP_OP(ScanFAddExclusive))
+             .Case("mul", IsInclusive ? SCOPED_GROUP_OP(ScanMulInclusive)
+                                      : SCOPED_GROUP_OP(ScanMulExclusive))
+             .Case("fmul", IsInclusive ? SCOPED_GROUP_OP(ScanFMulInclusive)
+                                       : SCOPED_GROUP_OP(ScanFMulExclusive))
+             .Case("smin", IsInclusive ? SCOPED_GROUP_OP(ScanSMinInclusive)
+                                       : SCOPED_GROUP_OP(ScanSMinExclusive))
+             .Case("umin", IsInclusive ? SCOPED_GROUP_OP(ScanUMinInclusive)
+                                       : SCOPED_GROUP_OP(ScanUMinExclusive))
+             .Case("fmin", IsInclusive ? SCOPED_GROUP_OP(ScanFMinInclusive)
+                                       : SCOPED_GROUP_OP(ScanFMinExclusive))
+             .Case("smax", IsInclusive ? SCOPED_GROUP_OP(ScanSMaxInclusive)
+                                       : SCOPED_GROUP_OP(ScanSMaxExclusive))
+             .Case("umax", IsInclusive ? SCOPED_GROUP_OP(ScanUMaxInclusive)
+                                       : SCOPED_GROUP_OP(ScanUMaxExclusive))
+             .Case("fmax", IsInclusive ? SCOPED_GROUP_OP(ScanFMaxInclusive)
+                                       : SCOPED_GROUP_OP(ScanFMaxExclusive))
+             .Case("and", IsInclusive ? SCOPED_GROUP_OP(ScanAndInclusive)
+                                      : SCOPED_GROUP_OP(ScanAndExclusive))
+             .Case("or", IsInclusive ? SCOPED_GROUP_OP(ScanOrInclusive)
+                                     : SCOPED_GROUP_OP(ScanOrExclusive))
+             .Case("xor", IsInclusive ? SCOPED_GROUP_OP(ScanXorInclusive)
+                                      : SCOPED_GROUP_OP(ScanXorExclusive))
+             .Case("logical_and",
+                   IsInclusive ? SCOPED_GROUP_OP(ScanLogicalAndInclusive)
+                               : SCOPED_GROUP_OP(ScanLogicalAndExclusive))
+             .Case("logical_or", IsInclusive
+                                     ? SCOPED_GROUP_OP(ScanLogicalOrInclusive)
+                                     : SCOPED_GROUP_OP(ScanLogicalOrExclusive))
+             .Case("logical_xor",
+                   IsInclusive ? SCOPED_GROUP_OP(ScanLogicalXorInclusive)
+                               : SCOPED_GROUP_OP(ScanLogicalXorExclusive))
+             .Default(eBuiltinInvalid);
+  }
+
+  std::vector<Type *> OverloadInfo;
+  if (ID != eBuiltinInvalid) {
+    // Consume the rest of this group Op function name. If we can't identify a
+    // series of mangled type names, this builtin is invalid.
+    unsigned NumMangledArgs = 0;
+    // Work-group builtins have an unmangled 'barrier ID' parameter first, which
+    // we want to skip.
+    const unsigned Offset = ID >= eFirstMuxWorkgroupCollectiveBuiltin &&
+                            ID <= eLastMuxWorkgroupCollectiveBuiltin;
+    while (!Name.empty()) {
+      if (!Name.consume_front("_")) {
+        return {eBuiltinInvalid, {}};
+      }
+      auto [Ty, NewName] = getDemangledTypeFromStr(Name, F.getContext());
+      Name = NewName;
+
+      auto ParamIdx = Offset + NumMangledArgs;
+      if (ParamIdx >= F.arg_size() || Ty != F.getArg(ParamIdx)->getType()) {
+        return {eBuiltinInvalid, {}};
+      }
+
+      ++NumMangledArgs;
+      OverloadInfo.push_back(Ty);
+    }
+    if (NumMangledArgs != NumExpectedMangledArgs) {
+      return {eBuiltinInvalid, {}};
+    }
+  }
+
+  return {ID, OverloadInfo};
+#undef SCOPED_GROUP_OP
+}
+
+BuiltinUniformity BuiltinInfo::isBuiltinUniform(const Builtin &B,
+                                                const CallInst *CI,
+                                                unsigned SimdDimIdx) const {
+  switch (B.ID) {
+    default:
+      break;
+    case eMuxBuiltinGetGlobalId:
+    case eMuxBuiltinGetLocalId: {
+      // We need to know the dimension requested from these builtins at compile
+      // time to infer their uniformity.
+      if (!CI || CI->arg_empty()) {
+        return eBuiltinUniformityNever;
+      }
+      auto *Rank = dyn_cast<ConstantInt>(CI->getArgOperand(0));
+      if (!Rank) {
+        // The Rank is some function, which "might" evaluate to zero
+        // sometimes, so we let the packetizer sort it out with some
+        // conditional magic.
+        // TODO Make sure this can never go haywire in weird edge cases.
+        // Where we have one get_global_id() dependent on another, this is
+        // not packetized correctly. Doing so is very hard!  We should
+        // probably just fail to packetize in this case.  We might also be
+        // able to return eBuiltinUniformityNever here, in cases where we can
+        // prove that the value can never be zero.
+        return eBuiltinUniformityMaybeInstanceID;
+      }
+      // Only vectorize on selected dimension. The value of get_global_id with
+      // other ranks is uniform.
+      if (Rank->getZExtValue() == SimdDimIdx) {
+        return eBuiltinUniformityInstanceID;
+      }
+
+      return eBuiltinUniformityAlways;
+    }
+    case eMuxBuiltinGetSubGroupLocalId:
+      return eBuiltinUniformityInstanceID;
+    case eMuxBuiltinGetLocalLinearId:
+    case eMuxBuiltinGetGlobalLinearId:
+      // TODO: This is fine for vectorizing in the x-axis, but currently we do
+      // not support vectorizing along y or z (see CA-2843).
+      return SimdDimIdx ? eBuiltinUniformityNever
+                        : eBuiltinUniformityInstanceID;
+  }
+
+  // Any/all, reductions and broadcasts are always uniform
+  if (auto Info = isMuxGroupCollective(B.ID)) {
+    if (Info->isAnyAll() || Info->isReduction() || Info->isBroadcast()) {
+      return eBuiltinUniformityAlways;
+    }
+  }
+
+  if (LangImpl) {
+    return LangImpl->isBuiltinUniform(B, CI, SimdDimIdx);
+  }
+  return eBuiltinUniformityUnknown;
+}
+
+Builtin BuiltinInfo::analyzeBuiltin(const Function &F) const {
+  // Handle LLVM intrinsics.
+  if (F.isIntrinsic()) {
+    int32_t Properties = eBuiltinPropertyNone;
+
+    const Intrinsic::ID IntrID = (Intrinsic::ID)F.getIntrinsicID();
+    const AttributeList AS = Intrinsic::getAttributes(F.getContext(), IntrID);
+    const bool NoSideEffect = F.onlyReadsMemory();
+    bool SafeIntrinsic = false;
+    switch (IntrID) {
+      default:
+        SafeIntrinsic = false;
+        break;
+      case Intrinsic::smin:
+      case Intrinsic::smax:
+      case Intrinsic::umin:
+      case Intrinsic::umax:
+      case Intrinsic::abs:
+      case Intrinsic::ctlz:
+      case Intrinsic::cttz:
+      case Intrinsic::sqrt:
+      case Intrinsic::sin:
+      case Intrinsic::cos:
+      case Intrinsic::pow:
+      case Intrinsic::exp:
+      case Intrinsic::exp2:
+      case Intrinsic::log:
+      case Intrinsic::log10:
+      case Intrinsic::log2:
+      case Intrinsic::fma:
+      case Intrinsic::fabs:
+      case Intrinsic::minnum:
+      case Intrinsic::maxnum:
+      case Intrinsic::copysign:
+      case Intrinsic::floor:
+      case Intrinsic::ceil:
+      case Intrinsic::trunc:
+      case Intrinsic::rint:
+      case Intrinsic::nearbyint:
+      case Intrinsic::round:
+      case Intrinsic::ctpop:
+      case Intrinsic::fmuladd:
+      case Intrinsic::fshl:
+      case Intrinsic::fshr:
+      case Intrinsic::sadd_sat:
+      case Intrinsic::uadd_sat:
+      case Intrinsic::ssub_sat:
+      case Intrinsic::usub_sat:
+      case Intrinsic::bitreverse:
+        // All these functions are overloadable and have both scalar and vector
+        // versions.
+        Properties |= eBuiltinPropertyVectorEquivalent;
+        SafeIntrinsic = true;
+        break;
+      case Intrinsic::assume:
+      case Intrinsic::dbg_declare:
+      case Intrinsic::dbg_value:
+      case Intrinsic::invariant_start:
+      case Intrinsic::invariant_end:
+      case Intrinsic::lifetime_start:
+      case Intrinsic::lifetime_end:
+      case Intrinsic::objectsize:
+      case Intrinsic::ptr_annotation:
+      case Intrinsic::var_annotation:
+      case Intrinsic::experimental_noalias_scope_decl:
+        SafeIntrinsic = true;
+        break;
+      case Intrinsic::memset:
+      case Intrinsic::memcpy:
+        Properties |= eBuiltinPropertyNoVectorEquivalent;
+        Properties |= eBuiltinPropertySideEffects;
+        break;
+    }
+    if (NoSideEffect || SafeIntrinsic) {
+      Properties |= eBuiltinPropertyNoSideEffects;
+      if (!AS.hasFnAttr(Attribute::NoDuplicate)) {
+        Properties |= eBuiltinPropertySupportsInstantiation;
+      }
+    }
+    return Builtin{F, eBuiltinUnknown, (BuiltinProperties)Properties};
+  }
+
+  auto [ID, OverloadInfo] = identifyMuxBuiltin(F);
+
+  if (ID == eBuiltinInvalid) {
+    // It's not a Mux builtin, so defer to the language implementation
+    if (LangImpl) {
+      return LangImpl->analyzeBuiltin(F);
+    }
+    return Builtin{F, ID, eBuiltinPropertyNone};
+  }
+
+  // Check that all overloadable builtins have returned some overloading
+  // information, for API consistency.
+  assert((!isOverloadableMuxBuiltinID(ID) || !OverloadInfo.empty()) &&
+         "Inconsistency in overloadable builtin APIs");
+
+  bool IsConvergent = false;
+  unsigned Properties = eBuiltinPropertyNone;
+  switch (ID) {
+    default:
+      break;
+    case eMuxBuiltinMemBarrier:
+      Properties = eBuiltinPropertySideEffects;
+      break;
+    case eMuxBuiltinSubGroupBarrier:
+    case eMuxBuiltinWorkGroupBarrier:
+      IsConvergent = true;
+      Properties = eBuiltinPropertyExecutionFlow | eBuiltinPropertySideEffects;
+      break;
+    case eMuxBuiltinDMARead1D:
+    case eMuxBuiltinDMARead2D:
+    case eMuxBuiltinDMARead3D:
+    case eMuxBuiltinDMAWrite1D:
+    case eMuxBuiltinDMAWrite2D:
+    case eMuxBuiltinDMAWrite3D:
+    case eMuxBuiltinDMAWait:
+      // Our DMA builtins, by default, rely on thread checks against specific
+      // work-item IDs, so they must be convergent.
+      IsConvergent = true;
+      Properties = eBuiltinPropertyNoSideEffects;
+      break;
+    case eMuxBuiltinGetWorkDim:
+    case eMuxBuiltinGetGroupId:
+    case eMuxBuiltinGetGlobalSize:
+    case eMuxBuiltinGetGlobalOffset:
+    case eMuxBuiltinGetLocalSize:
+    case eMuxBuiltinGetNumGroups:
+    case eMuxBuiltinGetGlobalLinearId:
+    case eMuxBuiltinGetLocalLinearId:
+    case eMuxBuiltinGetGlobalId:
+    case eMuxBuiltinGetSubGroupLocalId:
+      Properties = eBuiltinPropertyWorkItem | eBuiltinPropertyRematerializable;
+      break;
+    case eMuxBuiltinGetLocalId:
+      Properties = eBuiltinPropertyWorkItem | eBuiltinPropertyLocalID |
+                   eBuiltinPropertyRematerializable;
+      break;
+    case eMuxBuiltinIsFTZ:
+    case eMuxBuiltinIsEmbeddedProfile:
+    case eMuxBuiltinUseFast:
+      Properties = eBuiltinPropertyNoSideEffects;
+      break;
+  }
+
+  // Group functions are convergent.
+  if (isMuxGroupCollective(ID)) {
+    IsConvergent = true;
+  }
+
+  if (!IsConvergent) {
+    Properties |= eBuiltinPropertyKnownNonConvergent;
+  }
+
+  return Builtin{F, ID, (BuiltinProperties)Properties, OverloadInfo};
+}
+
+BuiltinCall BuiltinInfo::analyzeBuiltinCall(const CallInst &CI,
+                                            unsigned SimdDimIdx) const {
+  // Analyze the callee first, then work out how uniform this one call is.
+  auto *const Callee = CI.getCalledFunction();
+  assert(Callee && "Call instruction with no callee");
+  const auto Info = analyzeBuiltin(*Callee);
+  return BuiltinCall{Info, CI, isBuiltinUniform(Info, &CI, SimdDimIdx)};
+}
+
+Function *BuiltinInfo::getVectorEquivalent(const Builtin &B, unsigned Width,
+                                           Module *M) {
+  // We don't handle LLVM intrinsics here, and nothing else can be answered
+  // without a language implementation.
+  if (B.function.isIntrinsic() || !LangImpl) {
+    return nullptr;
+  }
+
+  // Everything else is deferred to the language implementation, which is
+  // handed the requested vector width and module unchanged.
+  return LangImpl->getVectorEquivalent(B, Width, M);
+}
+
+Function *BuiltinInfo::getScalarEquivalent(const Builtin &B, Module *M) {
+  // Non-intrinsic builtins are the language implementation's business, if
+  // there is one.
+  if (!B.function.isIntrinsic()) {
+    if (!LangImpl) {
+      return nullptr;
+    }
+    return LangImpl->getScalarEquivalent(B, M);
+  }
+
+  // From here on we're dealing with an LLVM intrinsic. Those not flagged as
+  // having a vector equivalent have no scalar equivalent either.
+  if (!(B.properties & eBuiltinPropertyVectorEquivalent)) {
+    return nullptr;
+  }
+
+  // The intrinsic must return a fixed-width vector; its element type is what
+  // the scalar intrinsic gets overloaded on.
+  auto *const RetVecTy = dyn_cast<FixedVectorType>(B.function.getReturnType());
+  if (!RetVecTy) {
+    return nullptr;
+  }
+
+  // Currently, we can only correctly handle intrinsics that have a single
+  // overloaded type, used for both the return type and all of the arguments.
+  // TODO: More generic support for intrinsics with vector equivalents.
+  for (Type *ParamTy : B.function.getFunctionType()->params()) {
+    // Non-vector arguments aren't going to get scalarized, so only vector
+    // arguments need to agree with the return type.
+    const bool IsVector = ParamTy->isVectorTy();
+    if (IsVector && ParamTy != RetVecTy) {
+      return nullptr;
+    }
+  }
+
+  // Get the scalar version of the intrinsic.
+  const auto IntrID = B.function.getIntrinsicID();
+  Type *const EltTy = RetVecTy->getElementType();
+  return Intrinsic::getDeclaration(M, IntrID, EltTy);
+}
+
+Value *BuiltinInfo::emitBuiltinInline(Function *Builtin, IRBuilder<> &B,
+                                      ArrayRef<Value *> Args) {
+  // Inline emission is only available through a language implementation;
+  // without one, return null.
+  if (!LangImpl) return nullptr;
+  return LangImpl->emitBuiltinInline(Builtin, B, Args);
+}
+
+std::optional<llvm::ConstantRange> BuiltinInfo::getBuiltinRange(
+    CallInst &CI, std::array<std::optional<uint64_t>, 3> MaxLocalSizes,
+    std::array<std::optional<uint64_t>, 3> MaxGlobalSizes) const {
+  // Ranges only apply to integer types, and we need a named function to
+  // analyze in the first place.
+  auto *const Callee = CI.getCalledFunction();
+  if (!(Callee && Callee->hasName() && CI.getType()->isIntegerTy())) {
+    return std::nullopt;
+  }
+
+  // Mux builtins are checked first and answered by the mux implementation.
+  const BuiltinID ID = identifyMuxBuiltin(*Callee).first;
+  if (isMuxBuiltinID(ID)) {
+    return MuxImpl->getBuiltinRange(CI, ID, MaxLocalSizes, MaxGlobalSizes);
+  }
+
+  // Anything else is a question for the language builtin info, if any.
+  if (!LangImpl) {
+    return std::nullopt;
+  }
+  return LangImpl->getBuiltinRange(CI, MaxLocalSizes, MaxGlobalSizes);
+}
+
+Instruction *BuiltinInfo::lowerBuiltinToMuxBuiltin(CallInst &CI) {
+  // We shouldn't be mapping mux builtins to mux builtins, so with no
+  // language implementation we can stop here.
+  if (!LangImpl) return nullptr;
+
+  return LangImpl->lowerBuiltinToMuxBuiltin(CI, *MuxImpl);
+}
+
+BuiltinID BuiltinInfo::getPrintfBuiltin() const {
+  // The printf builtin can only be named by a language implementation;
+  // report an invalid ID when there isn't one.
+  if (!LangImpl) return eBuiltinInvalid;
+  return LangImpl->getPrintfBuiltin();
+}
+
+bool BuiltinInfo::requiresSchedulingParameters(BuiltinID ID) {
+  // Defer to mux to decide whether builtin ID needs scheduling parameters.
+  return MuxImpl->requiresSchedulingParameters(ID);
+}
+
+Type *BuiltinInfo::getRemappedTargetExtTy(Type *Ty, Module &M) {
+  // Defer to mux for the remapping of target extension types.
+  return MuxImpl->getRemappedTargetExtTy(Ty, M);
+}
+
+SmallVector<BuiltinInfo::SchedParamInfo, 4>
+BuiltinInfo::getMuxSchedulingParameters(Module &M) {
+  // Defer to mux for the scheduling parameters of module M.
+  return MuxImpl->getMuxSchedulingParameters(M);
+}
+
+SmallVector<BuiltinInfo::SchedParamInfo, 4>
+BuiltinInfo::getFunctionSchedulingParameters(Function &F) {
+  // Defer to mux for the scheduling parameters of function F.
+  return MuxImpl->getFunctionSchedulingParameters(F);
+}
+
+Value *BuiltinInfo::initializeSchedulingParamForWrappedKernel(
+    const SchedParamInfo &Info, IRBuilder<> &B, Function &IntoF,
+    Function &CalleeF) {  // Forwarded unchanged to the mux implementation.
+  return MuxImpl->initializeSchedulingParamForWrappedKernel(Info, B, IntoF,
+                                                            CalleeF);
+}
+
+// This provides an extremely simple mangling scheme matching LLVM's intrinsic
+// mangling system. It is only designed to be used with a specific set of types
+// and is not a general-purpose mangler.
+std::string BuiltinInfo::getMangledTypeStr(Type *Ty) {
+  if (auto *VTy = dyn_cast<VectorType>(Ty)) {
+    const ElementCount EC = VTy->getElementCount();
+    // Scalable vectors are distinguished by a leading "nx" (e.g. "nxv4i32"),
+    // which getDemangledTypeFromStr looks for; without it a scalable vector
+    // would mangle identically to the fixed-width vector of the same length.
+    const char *const Prefix = EC.isScalable() ? "nxv" : "v";
+    return Prefix + utostr(EC.getKnownMinValue()) +
+           getMangledTypeStr(VTy->getElementType());
+  }
+
+  if (Ty) {
+    switch (Ty->getTypeID()) {
+      default:
+        break;
+      case Type::HalfTyID:
+        return "f16";
+      case Type::BFloatTyID:
+        return "bf16";
+      case Type::FloatTyID:
+        return "f32";
+      case Type::DoubleTyID:
+        return "f64";
+      case Type::IntegerTyID:
+        return "i" + utostr(cast<IntegerType>(Ty)->getBitWidth());
+    }
+  }
+  llvm_unreachable("Unhandled type");
+}
+
+std::pair<Type *, StringRef> BuiltinInfo::getDemangledTypeFromStr(
+    StringRef TyStr, LLVMContext &Ctx) {
+  // Parses one type mangled by getMangledTypeStr off the front of TyStr,
+  // returning it with the unparsed remainder, or a null type on failure.
+  const bool IsScalable = TyStr.consume_front("nx");
+  if (TyStr.consume_front("v")) {
+    // Reject zero-length vectors and invalid (e.g. nested vector) element
+    // types up front: VectorType::get asserts on both.
+    unsigned EC;
+    if (TyStr.consumeInteger(10, EC) || EC == 0) return {nullptr, TyStr};
+    auto [EltTy, NewTyStr] = getDemangledTypeFromStr(TyStr, Ctx);
+    if (!EltTy || !VectorType::isValidElementType(EltTy)) {
+      return {nullptr, TyStr};
+    }
+    return {VectorType::get(EltTy, EC, IsScalable), NewTyStr};
+  }
+  // "nx" is only meaningful as the prefix of a vector type.
+  if (IsScalable) return {nullptr, TyStr};
+  if (TyStr.consume_front("f16")) return {Type::getHalfTy(Ctx), TyStr};
+  if (TyStr.consume_front("bf16")) return {Type::getBFloatTy(Ctx), TyStr};
+  if (TyStr.consume_front("f32")) return {Type::getFloatTy(Ctx), TyStr};
+  if (TyStr.consume_front("f64")) return {Type::getDoubleTy(Ctx), TyStr};
+  // Only build integer types whose width LLVM can represent; IntegerType::get
+  // asserts on anything outside [MIN_INT_BITS, MAX_INT_BITS].
+  unsigned IntBitWidth;
+  if (TyStr.consume_front("i") && !TyStr.consumeInteger(10, IntBitWidth) &&
+      IntBitWidth >= IntegerType::MIN_INT_BITS &&
+      IntBitWidth <= IntegerType::MAX_INT_BITS) {
+    return {IntegerType::get(Ctx, IntBitWidth), TyStr};
+  }
+  return {nullptr, TyStr};
+}
+
+std::string BuiltinInfo::getMuxBuiltinName(BuiltinID ID,
+                                           ArrayRef<Type *> OverloadInfo) {
+  assert(isMuxBuiltinID(ID));
+  switch (ID) {
+    default:
+      break;
+    case eMuxBuiltinIsFTZ:
+      return MuxBuiltins::isftz;
+    case eMuxBuiltinUseFast:
+      return MuxBuiltins::usefast;
+    case eMuxBuiltinIsEmbeddedProfile:
+      return MuxBuiltins::isembeddedprofile;
+    case eMuxBuiltinGetGlobalSize:
+      return MuxBuiltins::get_global_size;
+    case eMuxBuiltinGetGlobalId:
+      return MuxBuiltins::get_global_id;
+    case eMuxBuiltinGetGlobalOffset:
+      return MuxBuiltins::get_global_offset;
+    case eMuxBuiltinGetLocalSize:
+      return MuxBuiltins::get_local_size;
+    case eMuxBuiltinGetLocalId:
+      return MuxBuiltins::get_local_id;
+    case eMuxBuiltinSetLocalId:
+      return MuxBuiltins::set_local_id;
+    case eMuxBuiltinGetSubGroupId:
+      return MuxBuiltins::get_sub_group_id;
+    case eMuxBuiltinSetSubGroupId:
+      return MuxBuiltins::set_sub_group_id;
+    case eMuxBuiltinGetNumGroups:
+      return MuxBuiltins::get_num_groups;
+    case eMuxBuiltinGetNumSubGroups:
+      return MuxBuiltins::get_num_sub_groups;
+    case eMuxBuiltinSetNumSubGroups:
+      return MuxBuiltins::set_num_sub_groups;
+    case eMuxBuiltinGetMaxSubGroupSize:
+      return MuxBuiltins::get_max_sub_group_size;
+    case eMuxBuiltinSetMaxSubGroupSize:
+      return MuxBuiltins::set_max_sub_group_size;
+    case eMuxBuiltinGetGroupId:
+      return MuxBuiltins::get_group_id;
+    case eMuxBuiltinGetWorkDim:
+      return MuxBuiltins::get_work_dim;
+    case eMuxBuiltinDMARead1D:
+      return MuxBuiltins::dma_read_1d;
+    case eMuxBuiltinDMARead2D:
+      return MuxBuiltins::dma_read_2d;
+    case eMuxBuiltinDMARead3D:
+      return MuxBuiltins::dma_read_3d;
+    case eMuxBuiltinDMAWrite1D:
+      return MuxBuiltins::dma_write_1d;
+    case eMuxBuiltinDMAWrite2D:
+      return MuxBuiltins::dma_write_2d;
+    case eMuxBuiltinDMAWrite3D:
+      return MuxBuiltins::dma_write_3d;
+    case eMuxBuiltinDMAWait:
+      return MuxBuiltins::dma_wait;
+    case eMuxBuiltinGetGlobalLinearId:
+      return MuxBuiltins::get_global_linear_id;
+    case eMuxBuiltinGetLocalLinearId:
+      return MuxBuiltins::get_local_linear_id;
+    case eMuxBuiltinGetEnqueuedLocalSize:
+      return MuxBuiltins::get_enqueued_local_size;
+    case eMuxBuiltinGetSubGroupSize:
+      return MuxBuiltins::get_sub_group_size;
+    case eMuxBuiltinGetSubGroupLocalId:
+      return MuxBuiltins::get_sub_group_local_id;
+    case eMuxBuiltinMemBarrier:
+      return MuxBuiltins::mem_barrier;
+    case eMuxBuiltinWorkGroupBarrier:
+      return MuxBuiltins::work_group_barrier;
+    case eMuxBuiltinSubGroupBarrier:
+      return MuxBuiltins::sub_group_barrier;
+  }
+
+    // A sneaky macro to do case statements on all scopes of a group operation.
+    // Note that it is missing a leading 'case' and a trailing ':' to trick
+    // clang-format into formatting it like a regular case statement.
+#define CASE_GROUP_OP_ALL_SCOPES(OP)                      \
+  eMuxBuiltinVecgroup##OP : case eMuxBuiltinSubgroup##OP: \
+  case eMuxBuiltinWorkgroup##OP
+
+  std::string BaseName = [](BuiltinID ID) {
+    // For simplicity, return all group operations as 'work_group' and replace
+    // the string with 'sub_group' or 'vec_group' post-hoc.
+    switch (ID) {
+      default:
+        return "";
+      case CASE_GROUP_OP_ALL_SCOPES(All):
+        return "__mux_work_group_all";
+      case CASE_GROUP_OP_ALL_SCOPES(Any):
+        return "__mux_work_group_any";
+      case CASE_GROUP_OP_ALL_SCOPES(Broadcast):
+        return "__mux_work_group_broadcast";
+      case CASE_GROUP_OP_ALL_SCOPES(ReduceAdd):
+        return "__mux_work_group_reduce_add";
+      case CASE_GROUP_OP_ALL_SCOPES(ReduceFAdd):
+        return "__mux_work_group_reduce_fadd";
+      case CASE_GROUP_OP_ALL_SCOPES(ReduceSMin):
+        return "__mux_work_group_reduce_smin";
+      case CASE_GROUP_OP_ALL_SCOPES(ReduceUMin):
+        return "__mux_work_group_reduce_umin";
+      case CASE_GROUP_OP_ALL_SCOPES(ReduceFMin):
+        return "__mux_work_group_reduce_fmin";
+      case CASE_GROUP_OP_ALL_SCOPES(ReduceSMax):
+        return "__mux_work_group_reduce_smax";
+      case CASE_GROUP_OP_ALL_SCOPES(ReduceUMax):
+        return "__mux_work_group_reduce_umax";
+      case CASE_GROUP_OP_ALL_SCOPES(ReduceFMax):
+        return "__mux_work_group_reduce_fmax";
+      case CASE_GROUP_OP_ALL_SCOPES(ReduceMul):
+        return "__mux_work_group_reduce_mul";
+      case CASE_GROUP_OP_ALL_SCOPES(ReduceFMul):
+        return "__mux_work_group_reduce_fmul";
+      case CASE_GROUP_OP_ALL_SCOPES(ReduceAnd):
+        return "__mux_work_group_reduce_and";
+      case CASE_GROUP_OP_ALL_SCOPES(ReduceOr):
+        return "__mux_work_group_reduce_or";
+      case CASE_GROUP_OP_ALL_SCOPES(ReduceXor):
+        return "__mux_work_group_reduce_xor";
+      case CASE_GROUP_OP_ALL_SCOPES(ReduceLogicalAnd):
+        return "__mux_work_group_reduce_logical_and";
+      case CASE_GROUP_OP_ALL_SCOPES(ReduceLogicalOr):
+        return "__mux_work_group_reduce_logical_or";
+      case CASE_GROUP_OP_ALL_SCOPES(ReduceLogicalXor):
+        return "__mux_work_group_reduce_logical_xor";
+      case CASE_GROUP_OP_ALL_SCOPES(ScanAddInclusive):
+        return "__mux_work_group_scan_inclusive_add";
+      case CASE_GROUP_OP_ALL_SCOPES(ScanFAddInclusive):
+        return "__mux_work_group_scan_inclusive_fadd";
+      case CASE_GROUP_OP_ALL_SCOPES(ScanAddExclusive):
+        return "__mux_work_group_scan_exclusive_add";
+      case CASE_GROUP_OP_ALL_SCOPES(ScanFAddExclusive):
+        return "__mux_work_group_scan_exclusive_fadd";
+      case CASE_GROUP_OP_ALL_SCOPES(ScanSMinInclusive):
+        return "__mux_work_group_scan_inclusive_smin";
+      case CASE_GROUP_OP_ALL_SCOPES(ScanUMinInclusive):
+        return "__mux_work_group_scan_inclusive_umin";
+      case CASE_GROUP_OP_ALL_SCOPES(ScanFMinInclusive):
+        return "__mux_work_group_scan_inclusive_fmin";
+      case CASE_GROUP_OP_ALL_SCOPES(ScanSMinExclusive):
+        return "__mux_work_group_scan_exclusive_smin";
+      case CASE_GROUP_OP_ALL_SCOPES(ScanUMinExclusive):
+        return "__mux_work_group_scan_exclusive_umin";
+      case CASE_GROUP_OP_ALL_SCOPES(ScanFMinExclusive):
+        return "__mux_work_group_scan_exclusive_fmin";
+      case CASE_GROUP_OP_ALL_SCOPES(ScanSMaxInclusive):
+        return "__mux_work_group_scan_inclusive_smax";
+      case CASE_GROUP_OP_ALL_SCOPES(ScanUMaxInclusive):
+        return "__mux_work_group_scan_inclusive_umax";
+      case CASE_GROUP_OP_ALL_SCOPES(ScanFMaxInclusive):
+        return "__mux_work_group_scan_inclusive_fmax";
+      case CASE_GROUP_OP_ALL_SCOPES(ScanSMaxExclusive):
+        return "__mux_work_group_scan_exclusive_smax";
+      case CASE_GROUP_OP_ALL_SCOPES(ScanUMaxExclusive):
+        return "__mux_work_group_scan_exclusive_umax";
+      case CASE_GROUP_OP_ALL_SCOPES(ScanFMaxExclusive):
+        return "__mux_work_group_scan_exclusive_fmax";
+      case CASE_GROUP_OP_ALL_SCOPES(ScanMulInclusive):
+        return "__mux_work_group_scan_inclusive_mul";
+      case CASE_GROUP_OP_ALL_SCOPES(ScanFMulInclusive):
+        return "__mux_work_group_scan_inclusive_fmul";
+      case CASE_GROUP_OP_ALL_SCOPES(ScanMulExclusive):
+        return "__mux_work_group_scan_exclusive_mul";
+      case CASE_GROUP_OP_ALL_SCOPES(ScanFMulExclusive):
+        return "__mux_work_group_scan_exclusive_fmul";
+      case CASE_GROUP_OP_ALL_SCOPES(ScanAndInclusive):
+        return "__mux_work_group_scan_inclusive_and";
+      case CASE_GROUP_OP_ALL_SCOPES(ScanAndExclusive):
+        return "__mux_work_group_scan_exclusive_and";
+      case CASE_GROUP_OP_ALL_SCOPES(ScanOrInclusive):
+        return "__mux_work_group_scan_inclusive_or";
+      case CASE_GROUP_OP_ALL_SCOPES(ScanOrExclusive):
+        return "__mux_work_group_scan_exclusive_or";
+      case CASE_GROUP_OP_ALL_SCOPES(ScanXorInclusive):
+        return "__mux_work_group_scan_inclusive_xor";
+      case CASE_GROUP_OP_ALL_SCOPES(ScanXorExclusive):
+        return "__mux_work_group_scan_exclusive_xor";
+      case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalAndInclusive):
+        return "__mux_work_group_scan_inclusive_logical_and";
+      case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalAndExclusive):
+        return "__mux_work_group_scan_exclusive_logical_and";
+      case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalOrInclusive):
+        return "__mux_work_group_scan_inclusive_logical_or";
+      case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalOrExclusive):
+        return "__mux_work_group_scan_exclusive_logical_or";
+      case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalXorInclusive):
+        return "__mux_work_group_scan_inclusive_logical_xor";
+      case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalXorExclusive):
+        return "__mux_work_group_scan_exclusive_logical_xor";
+      case eMuxBuiltinSubgroupShuffle:
+        return "__mux_work_group_shuffle";
+      case eMuxBuiltinSubgroupShuffleUp:
+        return "__mux_work_group_shuffle_up";
+      case eMuxBuiltinSubgroupShuffleDown:
+        return "__mux_work_group_shuffle_down";
+      case eMuxBuiltinSubgroupShuffleXor:
+        return "__mux_work_group_shuffle_xor";
+    }
+  }(ID);
+
+  if (!BaseName.empty()) {
+    assert(!OverloadInfo.empty() &&
+           "Must know how to overload group operation");
+    if (ID >= eFirstMuxSubgroupCollectiveBuiltin &&
+        ID <= eLastMuxSubgroupCollectiveBuiltin) {
+      // Replace 'work' with 'sub'
+      BaseName = BaseName.replace(6, 4, "sub");
+    } else if (ID >= eFirstMuxVecgroupCollectiveBuiltin &&
+               ID <= eLastMuxVecgroupCollectiveBuiltin) {
+      // Replace 'work' with 'vec'
+      BaseName = BaseName.replace(6, 4, "vec");
+    }
+    auto *const Ty = OverloadInfo.front();
+    return BaseName + "_" + getMangledTypeStr(Ty);
+  }
+  llvm_unreachable("Unhandled mux builtin");
+#undef CASE_GROUP_OP_ALL_SCOPES
+}
+
+Function *BuiltinInfo::defineMuxBuiltin(BuiltinID ID, Module &M,
+                                        ArrayRef<Type *> OverloadInfo) {
+  assert(isMuxBuiltinID(ID) && "Only handling mux builtins");
+  // API consistency check: an overloadable builtin must always come with the
+  // type information used to select its overload.
+  assert(!(isOverloadableMuxBuiltinID(ID) && OverloadInfo.empty()) &&
+         "Inconsistency in overloadable builtin APIs");
+
+  // FIXME: We'd ideally want to declare it here to reduce pass
+  // inter-dependencies.
+  Function *const Builtin = M.getFunction(getMuxBuiltinName(ID, OverloadInfo));
+  assert(Builtin && "Function should have been pre-declared");
+  // An existing definition wins; otherwise the mux implementation defines it.
+  if (Builtin->isDeclaration()) {
+    return MuxImpl->defineMuxBuiltin(ID, M, OverloadInfo);
+  }
+  return Builtin;
+}
+
+Function *BuiltinInfo::getOrDeclareMuxBuiltin(BuiltinID ID, Module &M,
+                                              ArrayRef<Type *> OverloadInfo) {
+  assert(isMuxBuiltinID(ID) && "Only handling mux builtins");
+  // API consistency check: an overloadable builtin must always come with the
+  // type information used to select its overload.
+  assert(!(isOverloadableMuxBuiltinID(ID) && OverloadInfo.empty()) &&
+         "Inconsistency in overloadable builtin APIs");
+  // Hand over to the mux implementation, which gets or declares the builtin.
+  return MuxImpl->getOrDeclareMuxBuiltin(ID, M, OverloadInfo);
+}
+
+std::optional<GroupCollective> BuiltinInfo::isMuxGroupCollective(BuiltinID ID) {
+  GroupCollective Collective;
+  // First the scope, taken from the builtin ID range that ID falls into.
+  if (ID >= eFirstMuxSubgroupCollectiveBuiltin &&
+      ID <= eLastMuxSubgroupCollectiveBuiltin) {
+    Collective.Scope = GroupCollective::ScopeKind::SubGroup;
+  } else if (ID >= eFirstMuxWorkgroupCollectiveBuiltin &&
+             ID <= eLastMuxWorkgroupCollectiveBuiltin) {
+    Collective.Scope = GroupCollective::ScopeKind::WorkGroup;
+  } else if (ID >= eFirstMuxVecgroupCollectiveBuiltin &&
+             ID <= eLastMuxVecgroupCollectiveBuiltin) {
+    Collective.Scope = GroupCollective::ScopeKind::VectorGroup;
+  } else {
+    return std::nullopt;
+  }
+
+  // A sneaky macro to do case statements on all scopes of a group operation.
+  // Note that it is missing a leading 'case' and a trailing ':' to trick
+  // clang-format into formatting it like a regular case statement.
+#define CASE_GROUP_OP_ALL_SCOPES(OP)                      \
+  eMuxBuiltinVecgroup##OP : case eMuxBuiltinSubgroup##OP: \
+  case eMuxBuiltinWorkgroup##OP
+  // Next the operation kind; the logical variants also set IsLogical.
+  switch (ID) {
+    default:
+      llvm_unreachable("Unhandled mux group builtin");
+    case CASE_GROUP_OP_ALL_SCOPES(All):
+      Collective.Op = GroupCollective::OpKind::All;
+      break;
+    case CASE_GROUP_OP_ALL_SCOPES(Any):
+      Collective.Op = GroupCollective::OpKind::Any;
+      break;
+    case CASE_GROUP_OP_ALL_SCOPES(Broadcast):
+      Collective.Op = GroupCollective::OpKind::Broadcast;
+      break;
+    case CASE_GROUP_OP_ALL_SCOPES(ReduceLogicalAnd):
+    case CASE_GROUP_OP_ALL_SCOPES(ReduceLogicalOr):
+    case CASE_GROUP_OP_ALL_SCOPES(ReduceLogicalXor):
+      Collective.IsLogical = true;
+      [[fallthrough]];
+    case CASE_GROUP_OP_ALL_SCOPES(ReduceAdd):
+    case CASE_GROUP_OP_ALL_SCOPES(ReduceFAdd):
+    case CASE_GROUP_OP_ALL_SCOPES(ReduceMul):
+    case CASE_GROUP_OP_ALL_SCOPES(ReduceFMul):
+    case CASE_GROUP_OP_ALL_SCOPES(ReduceSMin):
+    case CASE_GROUP_OP_ALL_SCOPES(ReduceUMin):
+    case CASE_GROUP_OP_ALL_SCOPES(ReduceFMin):
+    case CASE_GROUP_OP_ALL_SCOPES(ReduceSMax):
+    case CASE_GROUP_OP_ALL_SCOPES(ReduceUMax):
+    case CASE_GROUP_OP_ALL_SCOPES(ReduceFMax):
+    case CASE_GROUP_OP_ALL_SCOPES(ReduceAnd):
+    case CASE_GROUP_OP_ALL_SCOPES(ReduceOr):
+    case CASE_GROUP_OP_ALL_SCOPES(ReduceXor):
+      Collective.Op = GroupCollective::OpKind::Reduction;
+      break;
+    case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalAndInclusive):
+    case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalOrInclusive):
+    case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalXorInclusive):
+      Collective.IsLogical = true;
+      [[fallthrough]];
+    case CASE_GROUP_OP_ALL_SCOPES(ScanAddInclusive):
+    case CASE_GROUP_OP_ALL_SCOPES(ScanFAddInclusive):
+    case CASE_GROUP_OP_ALL_SCOPES(ScanMulInclusive):
+    case CASE_GROUP_OP_ALL_SCOPES(ScanFMulInclusive):
+    case CASE_GROUP_OP_ALL_SCOPES(ScanSMinInclusive):
+    case CASE_GROUP_OP_ALL_SCOPES(ScanUMinInclusive):
+    case CASE_GROUP_OP_ALL_SCOPES(ScanFMinInclusive):
+    case CASE_GROUP_OP_ALL_SCOPES(ScanSMaxInclusive):
+    case CASE_GROUP_OP_ALL_SCOPES(ScanUMaxInclusive):
+    case CASE_GROUP_OP_ALL_SCOPES(ScanFMaxInclusive):
+    case CASE_GROUP_OP_ALL_SCOPES(ScanAndInclusive):
+    case CASE_GROUP_OP_ALL_SCOPES(ScanOrInclusive):
+    case CASE_GROUP_OP_ALL_SCOPES(ScanXorInclusive):
+      Collective.Op = GroupCollective::OpKind::ScanInclusive;
+      break;
+    case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalAndExclusive):
+    case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalOrExclusive):
+    case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalXorExclusive):
+      Collective.IsLogical = true;
+      [[fallthrough]];
+    case CASE_GROUP_OP_ALL_SCOPES(ScanAddExclusive):
+    case CASE_GROUP_OP_ALL_SCOPES(ScanFAddExclusive):
+    case CASE_GROUP_OP_ALL_SCOPES(ScanMulExclusive):
+    case CASE_GROUP_OP_ALL_SCOPES(ScanFMulExclusive):
+    case CASE_GROUP_OP_ALL_SCOPES(ScanSMinExclusive):
+    case CASE_GROUP_OP_ALL_SCOPES(ScanUMinExclusive):
+    case CASE_GROUP_OP_ALL_SCOPES(ScanFMinExclusive):
+    case CASE_GROUP_OP_ALL_SCOPES(ScanSMaxExclusive):
+    case CASE_GROUP_OP_ALL_SCOPES(ScanUMaxExclusive):
+    case CASE_GROUP_OP_ALL_SCOPES(ScanFMaxExclusive):
+    case CASE_GROUP_OP_ALL_SCOPES(ScanAndExclusive):
+    case CASE_GROUP_OP_ALL_SCOPES(ScanOrExclusive):
+    case CASE_GROUP_OP_ALL_SCOPES(ScanXorExclusive):
+      Collective.Op = GroupCollective::OpKind::ScanExclusive;
+      break;
+    case eMuxBuiltinSubgroupShuffle:
+      Collective.Op = GroupCollective::OpKind::Shuffle;
+      break;
+    case eMuxBuiltinSubgroupShuffleUp:
+      Collective.Op = GroupCollective::OpKind::ShuffleUp;
+      break;
+    case eMuxBuiltinSubgroupShuffleDown:
+      Collective.Op = GroupCollective::OpKind::ShuffleDown;
+      break;
+    case eMuxBuiltinSubgroupShuffleXor:
+      Collective.Op = GroupCollective::OpKind::ShuffleXor;
+      break;
+  }
+
+  // Then the recurrence kind; broadcasts and shuffles do not set one.
+  if (Collective.Op == GroupCollective::OpKind::All) {
+    Collective.Recurrence = RecurKind::And;
+  } else if (Collective.Op == GroupCollective::OpKind::Any) {
+    Collective.Recurrence = RecurKind::Or;
+  } else if (Collective.Op == GroupCollective::OpKind::Reduction ||
+             Collective.Op == GroupCollective::OpKind::ScanExclusive ||
+             Collective.Op == GroupCollective::OpKind::ScanInclusive) {
+    switch (ID) {
+      case CASE_GROUP_OP_ALL_SCOPES(ReduceAdd):
+      case CASE_GROUP_OP_ALL_SCOPES(ScanAddInclusive):
+      case CASE_GROUP_OP_ALL_SCOPES(ScanAddExclusive):
+        Collective.Recurrence = RecurKind::Add;
+        break;
+      case CASE_GROUP_OP_ALL_SCOPES(ReduceFAdd):
+      case CASE_GROUP_OP_ALL_SCOPES(ScanFAddInclusive):
+      case CASE_GROUP_OP_ALL_SCOPES(ScanFAddExclusive):
+        Collective.Recurrence = RecurKind::FAdd;
+        break;
+      case CASE_GROUP_OP_ALL_SCOPES(ReduceMul):
+      case CASE_GROUP_OP_ALL_SCOPES(ScanMulInclusive):
+      case CASE_GROUP_OP_ALL_SCOPES(ScanMulExclusive):
+        Collective.Recurrence = RecurKind::Mul;
+        break;
+      case CASE_GROUP_OP_ALL_SCOPES(ReduceFMul):
+      case CASE_GROUP_OP_ALL_SCOPES(ScanFMulInclusive):
+      case CASE_GROUP_OP_ALL_SCOPES(ScanFMulExclusive):
+        Collective.Recurrence = RecurKind::FMul;
+        break;
+      case CASE_GROUP_OP_ALL_SCOPES(ReduceSMin):
+      case CASE_GROUP_OP_ALL_SCOPES(ScanSMinInclusive):
+      case CASE_GROUP_OP_ALL_SCOPES(ScanSMinExclusive):
+        Collective.Recurrence = RecurKind::SMin;
+        break;
+      case CASE_GROUP_OP_ALL_SCOPES(ReduceUMin):
+      case CASE_GROUP_OP_ALL_SCOPES(ScanUMinInclusive):
+      case CASE_GROUP_OP_ALL_SCOPES(ScanUMinExclusive):
+        Collective.Recurrence = RecurKind::UMin;
+        break;
+      case CASE_GROUP_OP_ALL_SCOPES(ReduceFMin):
+      case CASE_GROUP_OP_ALL_SCOPES(ScanFMinInclusive):
+      case CASE_GROUP_OP_ALL_SCOPES(ScanFMinExclusive):
+        Collective.Recurrence = RecurKind::FMin;
+        break;
+      case CASE_GROUP_OP_ALL_SCOPES(ReduceSMax):
+      case CASE_GROUP_OP_ALL_SCOPES(ScanSMaxInclusive):
+      case CASE_GROUP_OP_ALL_SCOPES(ScanSMaxExclusive):
+        Collective.Recurrence = RecurKind::SMax;
+        break;
+      case CASE_GROUP_OP_ALL_SCOPES(ReduceUMax):
+      case CASE_GROUP_OP_ALL_SCOPES(ScanUMaxInclusive):
+      case CASE_GROUP_OP_ALL_SCOPES(ScanUMaxExclusive):
+        Collective.Recurrence = RecurKind::UMax;
+        break;
+      case CASE_GROUP_OP_ALL_SCOPES(ReduceFMax):
+      case CASE_GROUP_OP_ALL_SCOPES(ScanFMaxInclusive):
+      case CASE_GROUP_OP_ALL_SCOPES(ScanFMaxExclusive):
+        Collective.Recurrence = RecurKind::FMax;
+        break;
+      case CASE_GROUP_OP_ALL_SCOPES(ReduceAnd):
+      case CASE_GROUP_OP_ALL_SCOPES(ReduceLogicalAnd):
+      case CASE_GROUP_OP_ALL_SCOPES(ScanAndInclusive):
+      case CASE_GROUP_OP_ALL_SCOPES(ScanAndExclusive):
+      case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalAndInclusive):
+      case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalAndExclusive):
+        Collective.Recurrence = RecurKind::And;
+        break;
+      case CASE_GROUP_OP_ALL_SCOPES(ReduceOr):
+      case CASE_GROUP_OP_ALL_SCOPES(ReduceLogicalOr):
+      case CASE_GROUP_OP_ALL_SCOPES(ScanOrInclusive):
+      case CASE_GROUP_OP_ALL_SCOPES(ScanOrExclusive):
+      case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalOrInclusive):
+      case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalOrExclusive):
+        Collective.Recurrence = RecurKind::Or;
+        break;
+      case CASE_GROUP_OP_ALL_SCOPES(ReduceXor):
+      case CASE_GROUP_OP_ALL_SCOPES(ReduceLogicalXor):
+      case CASE_GROUP_OP_ALL_SCOPES(ScanXorInclusive):
+      case CASE_GROUP_OP_ALL_SCOPES(ScanXorExclusive):
+      case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalXorInclusive):
+      case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalXorExclusive):
+        Collective.Recurrence = RecurKind::Xor;
+        break;
+      default:
+        llvm_unreachable("Unhandled mux group operation");
+    }
+  } else if (!Collective.isBroadcast() && !Collective.isShuffleLike()) {
+    llvm_unreachable("Unhandled mux group operation");
+  }
+
+  return Collective;
+#undef CASE_GROUP_OP_ALL_SCOPES
+}
+
+BuiltinID BuiltinInfo::getMuxGroupCollective(const GroupCollective &Group) {
+  // Returns the sub-group, work-group or vec-group flavour of builtin OP.
+#define SIMPLE_SCOPE_SWITCH(OP)                     \
+  do {                                              \
+    switch (Group.Scope) {                          \
+      default:                                      \
+        llvm_unreachable("Impossible scope kind");  \
+      case GroupCollective::ScopeKind::SubGroup:    \
+        return eMuxBuiltinSubgroup##OP;             \
+      case GroupCollective::ScopeKind::WorkGroup:   \
+        return eMuxBuiltinWorkgroup##OP;            \
+      case GroupCollective::ScopeKind::VectorGroup: \
+        return eMuxBuiltinVecgroup##OP;             \
+    }                                               \
+  } while (0)
+
+  // Picks the OP builtin by recurrence kind, then returns it for the scope.
+#define COMPLEX_SCOPE_SWITCH(OP, SUFFIX)               \
+  do {                                                 \
+    switch (Group.Recurrence) {                        \
+      default:                                         \
+        llvm_unreachable("Unhandled recurrence kind"); \
+      case RecurKind::Add:                             \
+        SIMPLE_SCOPE_SWITCH(OP##Add##SUFFIX);          \
+      case RecurKind::Mul:                             \
+        SIMPLE_SCOPE_SWITCH(OP##Mul##SUFFIX);          \
+      case RecurKind::FAdd:                            \
+        SIMPLE_SCOPE_SWITCH(OP##FAdd##SUFFIX);         \
+      case RecurKind::FMul:                            \
+        SIMPLE_SCOPE_SWITCH(OP##FMul##SUFFIX);         \
+      case RecurKind::SMin:                            \
+        SIMPLE_SCOPE_SWITCH(OP##SMin##SUFFIX);         \
+      case RecurKind::UMin:                            \
+        SIMPLE_SCOPE_SWITCH(OP##UMin##SUFFIX);         \
+      case RecurKind::FMin:                            \
+        SIMPLE_SCOPE_SWITCH(OP##FMin##SUFFIX);         \
+      case RecurKind::SMax:                            \
+        SIMPLE_SCOPE_SWITCH(OP##SMax##SUFFIX);         \
+      case RecurKind::UMax:                            \
+        SIMPLE_SCOPE_SWITCH(OP##UMax##SUFFIX);         \
+      case RecurKind::FMax:                            \
+        SIMPLE_SCOPE_SWITCH(OP##FMax##SUFFIX);         \
+      case RecurKind::And:                             \
+        if (Group.IsLogical) {                         \
+          SIMPLE_SCOPE_SWITCH(OP##LogicalAnd##SUFFIX); \
+        }                                              \
+        SIMPLE_SCOPE_SWITCH(OP##And##SUFFIX);          \
+      case RecurKind::Or:                              \
+        if (Group.IsLogical) {                         \
+          SIMPLE_SCOPE_SWITCH(OP##LogicalOr##SUFFIX);  \
+        }                                              \
+        SIMPLE_SCOPE_SWITCH(OP##Or##SUFFIX);           \
+      case RecurKind::Xor:                             \
+        if (Group.IsLogical) {                         \
+          SIMPLE_SCOPE_SWITCH(OP##LogicalXor##SUFFIX); \
+        }                                              \
+        SIMPLE_SCOPE_SWITCH(OP##Xor##SUFFIX);          \
+    }                                                  \
+  } while (0)
+
+  // Each macro expansion returns or is unreachable, so no case falls through.
+  switch (Group.Op) {
+    case GroupCollective::OpKind::All:
+      SIMPLE_SCOPE_SWITCH(All);
+    case GroupCollective::OpKind::Any:
+      SIMPLE_SCOPE_SWITCH(Any);
+    case GroupCollective::OpKind::Broadcast:
+      SIMPLE_SCOPE_SWITCH(Broadcast);
+    case GroupCollective::OpKind::Reduction:
+      COMPLEX_SCOPE_SWITCH(Reduce, );
+    case GroupCollective::OpKind::ScanExclusive:
+      COMPLEX_SCOPE_SWITCH(Scan, Exclusive);
+    case GroupCollective::OpKind::ScanInclusive:
+      COMPLEX_SCOPE_SWITCH(Scan, Inclusive);
+      break;
+    case GroupCollective::OpKind::Shuffle:
+      return Group.isSubGroupScope() ? eMuxBuiltinSubgroupShuffle
+                                     : eBuiltinInvalid;
+    case GroupCollective::OpKind::ShuffleUp:
+      return Group.isSubGroupScope() ? eMuxBuiltinSubgroupShuffleUp
+                                     : eBuiltinInvalid;
+    case GroupCollective::OpKind::ShuffleDown:
+      return Group.isSubGroupScope() ? eMuxBuiltinSubgroupShuffleDown
+                                     : eBuiltinInvalid;
+    case GroupCollective::OpKind::ShuffleXor:
+      return Group.isSubGroupScope() ? eMuxBuiltinSubgroupShuffleXor
+                                     : eBuiltinInvalid;
+  }
+  return eBuiltinInvalid;
+#undef COMPLEX_SCOPE_SWITCH
+#undef SIMPLE_SCOPE_SWITCH
+}
+
+bool BuiltinInfo::isOverloadableMuxBuiltinID(BuiltinID ID) {
+  if (isMuxBuiltinID(ID)) {
+    switch (ID) {
+      case eMuxBuiltinDMARead1D:
+      case eMuxBuiltinDMAWrite1D:
+      case eMuxBuiltinDMARead2D:
+      case eMuxBuiltinDMAWrite2D:
+      case eMuxBuiltinDMARead3D:
+      case eMuxBuiltinDMAWrite3D:
+        return true;  // The DMA builtins are overloadable.
+      default:  // Otherwise only the group collectives are overloadable.
+        return isMuxGroupCollective(ID).has_value();
+    }
+  }
+  return false;  // Not a mux builtin at all.
+}
+
+}  // namespace utils
+}  // namespace compiler
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp
new file mode 100644
index 0000000000000..92faf3552cbd4
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp
@@ -0,0 +1,3671 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <compiler/utils/address_spaces.h>
+#include <compiler/utils/builtin_info.h>
+#include <compiler/utils/cl_builtin_info.h>
+#include <compiler/utils/mangling.h>
+#include <compiler/utils/metadata.h>
+#include <compiler/utils/pass_functions.h>
+#include <llvm/ADT/StringSwitch.h>
+#include <llvm/IR/Attributes.h>
+#include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/Intrinsics.h>
+#include <llvm/Support/Compiler.h>
+#include <llvm/Support/Debug.h>
+#include <llvm/Support/Error.h>
+#include <llvm/Support/MathExtras.h>
+#include <llvm/Transforms/Utils/Cloning.h>
+#include <llvm/Transforms/Utils/ValueMapper.h>
+#include <multi_llvm/multi_llvm.h>
+#include <multi_llvm/vector_type_helper.h>
+
+#include <cmath>
+#include <set>
+
+// The Android NDK needs the C ilogb function, so pick the right one here.
+namespace stdcompat {
+#ifdef __ANDROID__
+// Note: unlike std::ilogb, this function only accepts a double argument.
+using ::ilogb;
+#else
+using std::ilogb;
+#endif  // __ANDROID__
+}  // namespace stdcompat
+
+namespace {
+/// @brief Identifiers for recognized OpenCL builtins.
+enum CLBuiltinID : compiler::utils::BuiltinID {
+  // Non-standard Builtin Functions
+  /// @brief Internal builtin 'convert_half_to_float'.
+  eCLBuiltinConvertHalfToFloat = compiler::utils::eFirstTargetBuiltin,
+  /// @brief Internal builtin 'convert_float_to_half'.
+  eCLBuiltinConvertFloatToHalf,
+  /// @brief Internal builtin 'convert_float_to_half_rte'
+  eCLBuiltinConvertFloatToHalfRte,
+  /// @brief Internal builtin 'convert_float_to_half_rtz'
+  eCLBuiltinConvertFloatToHalfRtz,
+  /// @brief Internal builtin 'convert_float_to_half_rtp'
+  eCLBuiltinConvertFloatToHalfRtp,
+  /// @brief Internal builtin 'convert_float_to_half_rtn'
+  eCLBuiltinConvertFloatToHalfRtn,
+  /// @brief Internal builtin 'convert_half_to_double'.
+  eCLBuiltinConvertHalfToDouble,
+  /// @brief Internal builtin 'convert_double_to_half'.
+  eCLBuiltinConvertDoubleToHalf,
+  /// @brief Internal builtin 'convert_double_to_half_rte'
+  eCLBuiltinConvertDoubleToHalfRte,
+  /// @brief Internal builtin 'convert_double_to_half_rtz'
+  eCLBuiltinConvertDoubleToHalfRtz,
+  /// @brief Internal builtin 'convert_double_to_half_rtp'
+  eCLBuiltinConvertDoubleToHalfRtp,
+  /// @brief Internal builtin 'convert_double_to_half_rtn'
+  eCLBuiltinConvertDoubleToHalfRtn,
+
+  // 6.2.3 Explicit Conversions
+  /// @brief OpenCL builtin `convert_char`
+  eCLBuiltinConvertChar,
+  /// @brief OpenCL builtin `convert_short`
+  eCLBuiltinConvertShort,
+  /// @brief OpenCL builtin `convert_int`
+  eCLBuiltinConvertInt,
+  /// @brief OpenCL builtin `convert_long`
+  eCLBuiltinConvertLong,
+  /// @brief OpenCL builtin `convert_uchar`
+  eCLBuiltinConvertUChar,
+  /// @brief OpenCL builtin `convert_ushort`
+  eCLBuiltinConvertUShort,
+  /// @brief OpenCL builtin `convert_uint`
+  eCLBuiltinConvertUInt,
+  /// @brief OpenCL builtin `convert_ulong`
+  eCLBuiltinConvertULong,
+
+  // 6.12.1 Work-Item Functions
+  /// @brief OpenCL builtin 'get_work_dim'.
+  eCLBuiltinGetWorkDim,
+  /// @brief OpenCL builtin 'get_group_id'.
+  eCLBuiltinGetGroupId,
+  /// @brief OpenCL builtin 'get_global_size'.
+  eCLBuiltinGetGlobalSize,
+  /// @brief OpenCL builtin 'get_global_offset'.
+  eCLBuiltinGetGlobalOffset,
+  /// @brief OpenCL builtin 'get_local_id'.
+  eCLBuiltinGetLocalId,
+  /// @brief OpenCL builtin 'get_local_size'.
+  eCLBuiltinGetLocalSize,
+  /// @brief OpenCL builtin 'get_enqueued_local_size'.
+  eCLBuiltinGetEnqueuedLocalSize,
+  /// @brief OpenCL builtin 'get_num_groups'.
+  eCLBuiltinGetNumGroups,
+  /// @brief OpenCL builtin 'get_global_id'.
+  eCLBuiltinGetGlobalId,
+  /// @brief OpenCL builtin 'get_local_linear_id' (OpenCL >= 2.0).
+  eCLBuiltinGetLocalLinearId,
+  /// @brief OpenCL builtin 'get_global_linear_id' (OpenCL >= 2.0).
+  eCLBuiltinGetGlobalLinearId,
+  /// @brief OpenCL builtin 'get_sub_group_local_id' (OpenCL >= 3.0).
+  eCLBuiltinGetSubgroupLocalId,
+  /// @brief OpenCL builtin 'get_sub_group_size' (OpenCL >= 3.0).
+  eCLBuiltinGetSubgroupSize,
+  /// @brief OpenCL builtin 'get_max_sub_group_size' (OpenCL >= 3.0).
+  eCLBuiltinGetMaxSubgroupSize,
+  /// @brief OpenCL builtin 'get_num_sub_groups' (OpenCL >= 3.0).
+  eCLBuiltinGetNumSubgroups,
+  /// @brief OpenCL builtin 'get_enqueued_num_sub_groups' (OpenCL >= 3.0).
+  eCLBuiltinGetEnqueuedNumSubgroups,
+  /// @brief OpenCL builtin 'get_sub_group_id' (OpenCL >= 3.0).
+  eCLBuiltinGetSubgroupId,
+
+  // 6.12.2 Math Functions
+  /// @brief OpenCL builtin 'fmax'.
+  eCLBuiltinFMax,
+  /// @brief OpenCL builtin 'fmin'.
+  eCLBuiltinFMin,
+  /// @brief OpenCL builtin 'fract'.
+  eCLBuiltinFract,
+  /// @brief OpenCL builtin 'frexp'.
+  eCLBuiltinFrexp,
+  /// @brief OpenCL builtin 'lgamma_r'.
+  eCLBuiltinLGammaR,
+  /// @brief OpenCL builtin 'modf'.
+  eCLBuiltinModF,
+  /// @brief OpenCL builtin 'sincos'.
+  eCLBuiltinSinCos,
+  /// @brief OpenCL builtin 'remquo'.
+  eCLBuiltinRemquo,
+
+  // 6.12.3 Integer Functions
+  /// @brief OpenCL builtin 'add_sat'.
+  eCLBuiltinAddSat,
+  /// @brief OpenCL builtin 'sub_sat'.
+  eCLBuiltinSubSat,
+
+  // 6.12.5 Geometric Builtin-in Functions
+  /// @brief OpenCL builtin 'dot'.
+  eCLBuiltinDot,
+  /// @brief OpenCL builtin 'cross'.
+  eCLBuiltinCross,
+  /// @brief OpenCL builtin 'length'.
+  eCLBuiltinLength,
+  /// @brief OpenCL builtin 'distance'.
+  eCLBuiltinDistance,
+  /// @brief OpenCL builtin 'normalize'.
+  eCLBuiltinNormalize,
+  /// @brief OpenCL builtin 'fast_length'.
+  eCLBuiltinFastLength,
+  /// @brief OpenCL builtin 'fast_distance'.
+  eCLBuiltinFastDistance,
+  /// @brief OpenCL builtin 'fast_normalize'.
+  eCLBuiltinFastNormalize,
+
+  // 6.12.6 Relational Functions
+  /// @brief OpenCL builtin 'all'.
+  eCLBuiltinAll,
+  /// @brief OpenCL builtin 'any'.
+  eCLBuiltinAny,
+  /// @brief OpenCL builtin 'isequal'.
+  eCLBuiltinIsEqual,
+  /// @brief OpenCL builtin 'isnotequal'.
+  eCLBuiltinIsNotEqual,
+  /// @brief OpenCL builtin 'isgreater'.
+  eCLBuiltinIsGreater,
+  /// @brief OpenCL builtin 'isgreaterequal'.
+  eCLBuiltinIsGreaterEqual,
+  /// @brief OpenCL builtin 'isless'.
+  eCLBuiltinIsLess,
+  /// @brief OpenCL builtin 'islessequal'.
+  eCLBuiltinIsLessEqual,
+  /// @brief OpenCL builtin 'islessgreater'.
+  eCLBuiltinIsLessGreater,
+  /// @brief OpenCL builtin 'isordered'.
+  eCLBuiltinIsOrdered,
+  /// @brief OpenCL builtin 'isunordered'.
+  eCLBuiltinIsUnordered,
+  /// @brief OpenCL builtin 'isfinite'.
+  eCLBuiltinIsFinite,
+  /// @brief OpenCL builtin 'isinf'.
+  eCLBuiltinIsInf,
+  /// @brief OpenCL builtin 'isnan'.
+  eCLBuiltinIsNan,
+  /// @brief OpenCL builtin 'isnormal'.
+  eCLBuiltinIsNormal,
+  /// @brief OpenCL builtin 'signbit'.
+  eCLBuiltinSignBit,
+  /// @brief OpenCL builtin `select`.
+  eCLBuiltinSelect,
+
+  // 6.12.8 Synchronization Functions
+  /// @brief OpenCL builtin 'barrier'.
+  eCLBuiltinBarrier,
+  /// @brief OpenCL builtin 'mem_fence'.
+  eCLBuiltinMemFence,
+  /// @brief OpenCL builtin 'read_mem_fence'.
+  eCLBuiltinReadMemFence,
+  /// @brief OpenCL builtin 'write_mem_fence'.
+  eCLBuiltinWriteMemFence,
+  /// @brief OpenCL builtin 'atomic_work_item_fence'.
+  eCLBuiltinAtomicWorkItemFence,
+  /// @brief OpenCL builtin 'sub_group_barrier'.
+  eCLBuiltinSubGroupBarrier,
+  /// @brief OpenCL builtin 'work_group_barrier'.
+  eCLBuiltinWorkGroupBarrier,
+
+  // 6.12.10 Async Copies and Prefetch Functions
+  /// @brief OpenCL builtin 'async_work_group_copy'.
+  eCLBuiltinAsyncWorkGroupCopy,
+  /// @brief OpenCL builtin 'async_work_group_strided_copy'.
+  eCLBuiltinAsyncWorkGroupStridedCopy,
+  /// @brief OpenCL builtin 'wait_group_events'.
+  eCLBuiltinWaitGroupEvents,
+  /// @brief OpenCL builtin 'async_work_group_copy_2D2D'.
+  eCLBuiltinAsyncWorkGroupCopy2D2D,
+  /// @brief OpenCL builtin 'async_work_group_copy_3D3D'.
+  eCLBuiltinAsyncWorkGroupCopy3D3D,
+
+  // 6.12.11 Atomic Functions
+  /// @brief OpenCL builtins 'atomic_add', 'atom_add'.
+  eCLBuiltinAtomicAdd,
+  /// @brief OpenCL builtins 'atomic_sub', 'atom_sub'.
+  eCLBuiltinAtomicSub,
+  /// @brief OpenCL builtins 'atomic_xchg', 'atom_xchg'.
+  eCLBuiltinAtomicXchg,
+  /// @brief OpenCL builtins 'atomic_inc', 'atom_inc'.
+  eCLBuiltinAtomicInc,
+  /// @brief OpenCL builtins 'atomic_dec', 'atom_dec'.
+  eCLBuiltinAtomicDec,
+  /// @brief OpenCL builtins 'atomic_cmpxchg', 'atom_cmpxchg'.
+  eCLBuiltinAtomicCmpxchg,
+  /// @brief OpenCL builtins 'atomic_min', 'atom_min'.
+  eCLBuiltinAtomicMin,
+  /// @brief OpenCL builtins 'atomic_max', 'atom_max'.
+  eCLBuiltinAtomicMax,
+  /// @brief OpenCL builtins 'atomic_and', 'atom_and'.
+  eCLBuiltinAtomicAnd,
+  /// @brief OpenCL builtins 'atomic_or', 'atom_or'.
+  eCLBuiltinAtomicOr,
+  /// @brief OpenCL builtins 'atomic_xor', 'atom_xor'.
+  eCLBuiltinAtomicXor,
+
+  // 6.12.12 Miscellaneous Vector Functions
+  eCLBuiltinShuffle,
+  eCLBuiltinShuffle2,
+
+  // 6.12.13 printf
+  /// @brief OpenCL builtin 'printf'.
+  eCLBuiltinPrintf,
+
+  // 6.15.16 Work-group Collective Functions
+  /// @brief OpenCL builtin 'work_group_all'.
+  eCLBuiltinWorkgroupAll,
+  /// @brief OpenCL builtin 'work_group_any'.
+  eCLBuiltinWorkgroupAny,
+  /// @brief OpenCL builtin 'work_group_broadcast'.
+  eCLBuiltinWorkgroupBroadcast,
+  /// @brief OpenCL builtin 'work_group_reduce_add'.
+  eCLBuiltinWorkgroupReduceAdd,
+  /// @brief OpenCL builtin 'work_group_reduce_min'.
+  eCLBuiltinWorkgroupReduceMin,
+  /// @brief OpenCL builtin 'work_group_reduce_max'.
+  eCLBuiltinWorkgroupReduceMax,
+  /// @brief OpenCL builtin 'work_group_scan_inclusive_add'.
+  eCLBuiltinWorkgroupScanAddInclusive,
+  /// @brief OpenCL builtin 'work_group_scan_exclusive_add'.
+  eCLBuiltinWorkgroupScanAddExclusive,
+  /// @brief OpenCL builtin 'work_group_scan_inclusive_min'.
+  eCLBuiltinWorkgroupScanMinInclusive,
+  /// @brief OpenCL builtin 'work_group_scan_exclusive_min'.
+  eCLBuiltinWorkgroupScanMinExclusive,
+  /// @brief OpenCL builtin 'work_group_scan_inclusive_max'.
+  eCLBuiltinWorkgroupScanMaxInclusive,
+  /// @brief OpenCL builtin 'work_group_scan_exclusive_max'.
+  eCLBuiltinWorkgroupScanMaxExclusive,
+
+  /// @brief OpenCL builtin 'work_group_reduce_mul'.
+  eCLBuiltinWorkgroupReduceMul,
+  /// @brief OpenCL builtin 'work_group_reduce_and'.
+  eCLBuiltinWorkgroupReduceAnd,
+  /// @brief OpenCL builtin 'work_group_reduce_or'.
+  eCLBuiltinWorkgroupReduceOr,
+  /// @brief OpenCL builtin 'work_group_reduce_xor'.
+  eCLBuiltinWorkgroupReduceXor,
+  /// @brief OpenCL builtin 'work_group_reduce_logical_and'.
+  eCLBuiltinWorkgroupReduceLogicalAnd,
+  /// @brief OpenCL builtin 'work_group_reduce_logical_or'.
+  eCLBuiltinWorkgroupReduceLogicalOr,
+  /// @brief OpenCL builtin 'work_group_reduce_logical_xor'.
+  eCLBuiltinWorkgroupReduceLogicalXor,
+  /// @brief OpenCL builtin 'work_group_scan_inclusive_mul'.
+  eCLBuiltinWorkgroupScanMulInclusive,
+  /// @brief OpenCL builtin 'work_group_scan_exclusive_mul'.
+  eCLBuiltinWorkgroupScanMulExclusive,
+  /// @brief OpenCL builtin 'work_group_scan_inclusive_and'.
+  eCLBuiltinWorkgroupScanAndInclusive,
+  /// @brief OpenCL builtin 'work_group_scan_exclusive_and'.
+  eCLBuiltinWorkgroupScanAndExclusive,
+  /// @brief OpenCL builtin 'work_group_scan_inclusive_or'.
+  eCLBuiltinWorkgroupScanOrInclusive,
+  /// @brief OpenCL builtin 'work_group_scan_exclusive_or'.
+  eCLBuiltinWorkgroupScanOrExclusive,
+  /// @brief OpenCL builtin 'work_group_scan_inclusive_xor'.
+  eCLBuiltinWorkgroupScanXorInclusive,
+  /// @brief OpenCL builtin 'work_group_scan_exclusive_xor'.
+  eCLBuiltinWorkgroupScanXorExclusive,
+  /// @brief OpenCL builtin 'work_group_scan_inclusive_logical_and'.
+  eCLBuiltinWorkgroupScanLogicalAndInclusive,
+  /// @brief OpenCL builtin 'work_group_scan_exclusive_logical_and'.
+  eCLBuiltinWorkgroupScanLogicalAndExclusive,
+  /// @brief OpenCL builtin 'work_group_scan_inclusive_logical_or'.
+  eCLBuiltinWorkgroupScanLogicalOrInclusive,
+  /// @brief OpenCL builtin 'work_group_scan_exclusive_logical_or'.
+  eCLBuiltinWorkgroupScanLogicalOrExclusive,
+  /// @brief OpenCL builtin 'work_group_scan_inclusive_logical_xor'.
+  eCLBuiltinWorkgroupScanLogicalXorInclusive,
+  /// @brief OpenCL builtin 'work_group_scan_exclusive_logical_xor'.
+  eCLBuiltinWorkgroupScanLogicalXorExclusive,
+
+  // 6.15.19 Subgroup Collective Functions
+  /// @brief OpenCL builtin 'sub_group_all'.
+  eCLBuiltinSubgroupAll,
+  /// @brief OpenCL builtin 'sub_group_any'.
+  eCLBuiltinSubgroupAny,
+  /// @brief OpenCL builtin 'sub_group_broadcast'.
+  eCLBuiltinSubgroupBroadcast,
+  /// @brief OpenCL builtin 'sub_group_reduce_add'.
+  eCLBuiltinSubgroupReduceAdd,
+  /// @brief OpenCL builtin 'sub_group_reduce_min'.
+  eCLBuiltinSubgroupReduceMin,
+  /// @brief OpenCL builtin 'sub_group_reduce_max'.
+  eCLBuiltinSubgroupReduceMax,
+  /// @brief OpenCL builtin 'sub_group_scan_inclusive_add'.
+  eCLBuiltinSubgroupScanAddInclusive,
+  /// @brief OpenCL builtin 'sub_group_scan_exclusive_add'.
+  eCLBuiltinSubgroupScanAddExclusive,
+  /// @brief OpenCL builtin 'sub_group_scan_inclusive_min'.
+  eCLBuiltinSubgroupScanMinInclusive,
+  /// @brief OpenCL builtin 'sub_group_scan_exclusive_min'.
+  eCLBuiltinSubgroupScanMinExclusive,
+  /// @brief OpenCL builtin 'sub_group_scan_inclusive_max'.
+  eCLBuiltinSubgroupScanMaxInclusive,
+  /// @brief OpenCL builtin 'sub_group_scan_exclusive_max'.
+  eCLBuiltinSubgroupScanMaxExclusive,
+
+  /// @brief OpenCL builtin 'sub_group_reduce_mul'.
+  eCLBuiltinSubgroupReduceMul,
+  /// @brief OpenCL builtin 'sub_group_reduce_and'.
+  eCLBuiltinSubgroupReduceAnd,
+  /// @brief OpenCL builtin 'sub_group_reduce_or'.
+  eCLBuiltinSubgroupReduceOr,
+  /// @brief OpenCL builtin 'sub_group_reduce_xor'.
+  eCLBuiltinSubgroupReduceXor,
+  /// @brief OpenCL builtin 'sub_group_reduce_logical_and'.
+  eCLBuiltinSubgroupReduceLogicalAnd,
+  /// @brief OpenCL builtin 'sub_group_reduce_logical_or'.
+  eCLBuiltinSubgroupReduceLogicalOr,
+  /// @brief OpenCL builtin 'sub_group_reduce_logical_xor'.
+  eCLBuiltinSubgroupReduceLogicalXor,
+  /// @brief OpenCL builtin 'sub_group_scan_inclusive_mul'.
+  eCLBuiltinSubgroupScanMulInclusive,
+  /// @brief OpenCL builtin 'sub_group_scan_exclusive_mul'.
+  eCLBuiltinSubgroupScanMulExclusive,
+  /// @brief OpenCL builtin 'sub_group_scan_inclusive_and'.
+  eCLBuiltinSubgroupScanAndInclusive,
+  /// @brief OpenCL builtin 'sub_group_scan_exclusive_and'.
+  eCLBuiltinSubgroupScanAndExclusive,
+  /// @brief OpenCL builtin 'sub_group_scan_inclusive_or'.
+  eCLBuiltinSubgroupScanOrInclusive,
+  /// @brief OpenCL builtin 'sub_group_scan_exclusive_or'.
+  eCLBuiltinSubgroupScanOrExclusive,
+  /// @brief OpenCL builtin 'sub_group_scan_inclusive_xor'.
+  eCLBuiltinSubgroupScanXorInclusive,
+  /// @brief OpenCL builtin 'sub_group_scan_exclusive_xor'.
+  eCLBuiltinSubgroupScanXorExclusive,
+  /// @brief OpenCL builtin 'sub_group_scan_inclusive_logical_and'.
+  eCLBuiltinSubgroupScanLogicalAndInclusive,
+  /// @brief OpenCL builtin 'sub_group_scan_exclusive_logical_and'.
+  eCLBuiltinSubgroupScanLogicalAndExclusive,
+  /// @brief OpenCL builtin 'sub_group_scan_inclusive_logical_or'.
+  eCLBuiltinSubgroupScanLogicalOrInclusive,
+  /// @brief OpenCL builtin 'sub_group_scan_exclusive_logical_or'.
+  eCLBuiltinSubgroupScanLogicalOrExclusive,
+  /// @brief OpenCL builtin 'sub_group_scan_inclusive_logical_xor'.
+  eCLBuiltinSubgroupScanLogicalXorInclusive,
+  /// @brief OpenCL builtin 'sub_group_scan_exclusive_logical_xor'.
+  eCLBuiltinSubgroupScanLogicalXorExclusive,
+
+  // GLSL builtin functions
+  eCLBuiltinCodeplayFindLSB,
+  eCLBuiltinCodeplayFindMSB,
+  eCLBuiltinCodeplayBitReverse,
+  eCLBuiltinCodeplayFaceForward,
+  eCLBuiltinCodeplayReflect,
+  eCLBuiltinCodeplayRefract,
+  eCLBuiltinCodeplayPackNormalizeChar4,
+  eCLBuiltinCodeplayPackNormalizeUchar4,
+  eCLBuiltinCodeplayPackNormalizeShort2,
+  eCLBuiltinCodeplayPackNormalizeUshort2,
+  eCLBuiltinCodeplayPackHalf2,
+  eCLBuiltinCodeplayUnpackNormalize,
+  eCLBuiltinCodeplayUnpackHalf2,
+
+  // 6.12.7 Vector Data Load and Store Functions
+  eCLBuiltinVLoad,
+  eCLBuiltinVLoadHalf,
+  eCLBuiltinVStore,
+  eCLBuiltinVStoreHalf,
+
+  // 6.3 Conversions & Type Casting Examples
+  eCLBuiltinAs,
+};
+}  // namespace
+
+namespace {
+using namespace llvm;
+using namespace compiler::utils;
+
+/// @brief Tells whether @p w is a legal OpenCL vector width.
+///
+/// The accepted widths are exactly 2, 3, 4, 8 and 16.
+///
+/// @param[in] w Candidate number of vector elements.
+/// @return true if @p w is one of the widths listed above, false otherwise.
+bool isValidVecWidth(unsigned w) {
+  switch (w) {
+    case 2:
+    case 3:
+    case 4:
+    case 8:
+    case 16:
+      return true;
+    default:
+      return false;
+  }
+}
+
+/// @brief Value materializer that mirrors global variables into a destination
+/// module the first time they are requested.
+///
+/// Only the declaration is created here (no initializer is attached). The
+/// source variables that triggered a new declaration are remembered and can be
+/// retrieved through variables().
+class GlobalValueMaterializer final : public llvm::ValueMaterializer {
+ public:
+  /// @brief Build a materializer targeting the given module.
+  /// @param[in] M Module in which missing global variables are declared.
+  GlobalValueMaterializer(Module &M) : DestM(M) {}
+
+  /// @brief Source global variables for which materialize() had to create a
+  /// new variable in the destination module.
+  const std::vector<GlobalVariable *> &variables() const { return Variables; }
+
+  /// @brief Map a value onto its counterpart in the destination module.
+  ///
+  /// @param[in] V Value to materialize.
+  ///
+  /// @return The destination module's global variable with the same name as
+  /// @p V (created without an initializer if it did not exist yet), or nullptr
+  /// when @p V is not a global variable.
+  Value *materialize(Value *V) override final {
+    auto *const SrcGV = dyn_cast<GlobalVariable>(V);
+    if (!SrcGV) {
+      // Anything other than a global variable is not handled here.
+      return nullptr;
+    }
+
+    // Reuse a variable of the same name if the destination already has one.
+    if (GlobalVariable *Existing = DestM.getGlobalVariable(SrcGV->getName())) {
+      return Existing;
+    }
+
+    // Otherwise declare a matching variable; no initializer is set here.
+    auto *const Clone = new GlobalVariable(
+        DestM, SrcGV->getValueType(), SrcGV->isConstant(), SrcGV->getLinkage(),
+        static_cast<Constant *>(nullptr), SrcGV->getName(),
+        static_cast<GlobalVariable *>(nullptr), SrcGV->getThreadLocalMode(),
+        SrcGV->getType()->getAddressSpace());
+    Clone->copyAttributesFrom(SrcGV);
+    // Note that it is the *source* variable that is recorded, not the clone.
+    Variables.push_back(SrcGV);
+    return Clone;
+  }
+
+ private:
+  /// @brief Module that receives the materialized variables.
+  Module &DestM;
+  /// @brief Source variables that required a new destination declaration.
+  std::vector<GlobalVariable *> Variables;
+};
+}  // namespace
+
+namespace compiler {
+namespace utils {
+using namespace llvm;
+
+/// @brief Factory for the OpenCL builtin-info implementation.
+/// @param[in] Builtins Builtins module forwarded to the CLBuiltinInfo
+/// constructor.
+/// @return A newly allocated CLBuiltinInfo, owned by the caller.
+std::unique_ptr<BILangInfoConcept> createCLBuiltinInfo(Module *Builtins) {
+  std::unique_ptr<BILangInfoConcept> Info =
+      std::make_unique<CLBuiltinInfo>(Builtins);
+  return Info;
+}
+
+/// @brief Construct builtin info backed by a SimpleCLBuiltinLoader.
+/// @param[in] builtins Builtins module handed to the loader; it is passed
+/// through unchecked.
+CLBuiltinInfo::CLBuiltinInfo(Module *builtins)
+    : Loader(std::make_unique<SimpleCLBuiltinLoader>(builtins)) {}
+
+// Defaulted out of line, here in the implementation file.
+CLBuiltinInfo::~CLBuiltinInfo() = default;
+
+/// @brief Emit a call to a builtin, giving the call site the builtin's own
+/// calling convention.
+///
+/// Every call to a builtin has to carry a calling convention that matches its
+/// callee. Doing that by hand at each call site is repetitive, so this helper
+/// creates the call and copies the convention from @p Builtin in one step.
+///
+/// @param[in] B The IRBuilder used to create the CallInst
+/// @param[in] Builtin The Function to call
+/// @param[in] Args The call arguments
+/// @param[in] NameStr The name for the new CallInst
+/// @return The newly emitted CallInst
+static CallInst *CreateBuiltinCall(IRBuilder<> &B, Function *Builtin,
+                                   ArrayRef<Value *> Args,
+                                   const Twine &NameStr = "") {
+  FunctionType *const CalleeTy = Builtin->getFunctionType();
+  CallInst *const Call = B.CreateCall(CalleeTy, Builtin, Args, NameStr);
+  // Keep the call site's convention in sync with the callee's.
+  Call->setCallingConv(Builtin->getCallingConv());
+  return Call;
+}
+
+/// @brief One row of the table of known OpenCL builtins.
+///
+/// Rows are brace-initialized in field order; the version may be omitted, in
+/// which case it defaults to OpenCLC10.
+struct CLBuiltinEntry {
+  /// @brief Identifier for the builtin function.
+  BuiltinID ID;
+  /// @brief OpenCL name of the builtin function, in unmangled form. Null for
+  /// the sentinel rows that terminate the table.
+  const char *OpenCLFnName;
+  /// @brief Minimum OpenCL version that supports this builtin. A row is only
+  /// matched by name when the module's OpenCL version is at least this value.
+  uint32_t MinVer = OpenCLC10;
+};
+
+/// @brief Information about known OpenCL builtins.
+///
+/// Each row maps a builtin ID to its unmangled OpenCL function name and,
+/// optionally, to the minimum OpenCL C version that provides it (OpenCLC10
+/// when omitted).
+///
+/// identifyBuiltin() and getBuiltinName() both scan this table linearly from
+/// the top and return the first matching row, so row order matters whenever an
+/// ID appears more than once. Both scans stop at the first eBuiltinInvalid
+/// row, which is why the table must end with that sentinel.
+static constexpr CLBuiltinEntry Builtins[] = {
+    // Non-standard Builtin Functions
+    {eCLBuiltinConvertHalfToFloat, "convert_half_to_float"},
+    {eCLBuiltinConvertFloatToHalf, "convert_float_to_half"},
+    {eCLBuiltinConvertFloatToHalfRte, "convert_float_to_half_rte"},
+    {eCLBuiltinConvertFloatToHalfRtz, "convert_float_to_half_rtz"},
+    {eCLBuiltinConvertFloatToHalfRtp, "convert_float_to_half_rtp"},
+    {eCLBuiltinConvertFloatToHalfRtn, "convert_float_to_half_rtn"},
+    {eCLBuiltinConvertHalfToDouble, "convert_half_to_double"},
+    {eCLBuiltinConvertDoubleToHalf, "convert_double_to_half"},
+    {eCLBuiltinConvertDoubleToHalfRte, "convert_double_to_half_rte"},
+    {eCLBuiltinConvertDoubleToHalfRtz, "convert_double_to_half_rtz"},
+    {eCLBuiltinConvertDoubleToHalfRtp, "convert_double_to_half_rtp"},
+    {eCLBuiltinConvertDoubleToHalfRtn, "convert_double_to_half_rtn"},
+
+    // 6.2.3 Explicit Conversions
+    {eCLBuiltinConvertChar, "convert_char"},
+    {eCLBuiltinConvertShort, "convert_short"},
+    {eCLBuiltinConvertInt, "convert_int"},
+    {eCLBuiltinConvertLong, "convert_long"},
+    {eCLBuiltinConvertUChar, "convert_uchar"},
+    {eCLBuiltinConvertUShort, "convert_ushort"},
+    {eCLBuiltinConvertUInt, "convert_uint"},
+    {eCLBuiltinConvertULong, "convert_ulong"},
+
+    // 6.12.1 Work-Item Functions
+    {eCLBuiltinGetWorkDim, "get_work_dim"},
+    {eCLBuiltinGetGroupId, "get_group_id"},
+    {eCLBuiltinGetGlobalSize, "get_global_size"},
+    {eCLBuiltinGetGlobalOffset, "get_global_offset"},
+    {eCLBuiltinGetLocalId, "get_local_id"},
+    {eCLBuiltinGetLocalSize, "get_local_size"},
+    {eCLBuiltinGetEnqueuedLocalSize, "get_enqueued_local_size"},
+    {eCLBuiltinGetNumGroups, "get_num_groups"},
+    {eCLBuiltinGetGlobalId, "get_global_id"},
+    {eCLBuiltinGetLocalLinearId, "get_local_linear_id", OpenCLC20},
+    {eCLBuiltinGetGlobalLinearId, "get_global_linear_id", OpenCLC20},
+    {eCLBuiltinGetSubgroupLocalId, "get_sub_group_local_id", OpenCLC30},
+    {eCLBuiltinGetSubgroupSize, "get_sub_group_size", OpenCLC30},
+    {eCLBuiltinGetMaxSubgroupSize, "get_max_sub_group_size", OpenCLC30},
+    {eCLBuiltinGetNumSubgroups, "get_num_sub_groups", OpenCLC30},
+    {eCLBuiltinGetEnqueuedNumSubgroups, "get_enqueued_num_sub_groups",
+     OpenCLC30},
+    {eCLBuiltinGetSubgroupId, "get_sub_group_id", OpenCLC30},
+
+    // 6.12.2 Math Functions
+    {eCLBuiltinFMax, "fmax"},
+    {eCLBuiltinFMin, "fmin"},
+    {eCLBuiltinFract, "fract"},
+    {eCLBuiltinFrexp, "frexp"},
+    {eCLBuiltinLGammaR, "lgamma_r"},
+    {eCLBuiltinModF, "modf"},
+    {eCLBuiltinSinCos, "sincos"},
+    {eCLBuiltinRemquo, "remquo"},
+
+    // 6.12.3 Integer Functions
+    {eCLBuiltinAddSat, "add_sat"},
+    {eCLBuiltinSubSat, "sub_sat"},
+
+    // 6.12.5 Geometric Functions
+    {eCLBuiltinDot, "dot"},
+    {eCLBuiltinCross, "cross"},
+    {eCLBuiltinLength, "length"},
+    {eCLBuiltinDistance, "distance"},
+    {eCLBuiltinNormalize, "normalize"},
+    {eCLBuiltinFastLength, "fast_length"},
+    {eCLBuiltinFastDistance, "fast_distance"},
+    {eCLBuiltinFastNormalize, "fast_normalize"},
+
+    // 6.12.6 Relational Functions
+    {eCLBuiltinAll, "all"},
+    {eCLBuiltinAny, "any"},
+    {eCLBuiltinIsEqual, "isequal"},
+    {eCLBuiltinIsNotEqual, "isnotequal"},
+    {eCLBuiltinIsGreater, "isgreater"},
+    {eCLBuiltinIsGreaterEqual, "isgreaterequal"},
+    {eCLBuiltinIsLess, "isless"},
+    {eCLBuiltinIsLessEqual, "islessequal"},
+    {eCLBuiltinIsLessGreater, "islessgreater"},
+    {eCLBuiltinIsOrdered, "isordered"},
+    {eCLBuiltinIsUnordered, "isunordered"},
+    {eCLBuiltinIsFinite, "isfinite"},
+    {eCLBuiltinIsInf, "isinf"},
+    {eCLBuiltinIsNan, "isnan"},
+    {eCLBuiltinIsNormal, "isnormal"},
+    {eCLBuiltinSignBit, "signbit"},
+    {eCLBuiltinSelect, "select"},
+
+    // 6.12.8 Synchronization Functions
+    {eCLBuiltinBarrier, "barrier"},
+    {eCLBuiltinMemFence, "mem_fence"},
+    {eCLBuiltinReadMemFence, "read_mem_fence"},
+    {eCLBuiltinWriteMemFence, "write_mem_fence"},
+    {eCLBuiltinAtomicWorkItemFence, "atomic_work_item_fence", OpenCLC20},
+    {eCLBuiltinSubGroupBarrier, "sub_group_barrier", OpenCLC30},
+    {eCLBuiltinWorkGroupBarrier, "work_group_barrier", OpenCLC20},
+
+    // 6.12.10 Async Copies and Prefetch Functions
+    {eCLBuiltinAsyncWorkGroupCopy, "async_work_group_copy"},
+    {eCLBuiltinAsyncWorkGroupStridedCopy, "async_work_group_strided_copy"},
+    {eCLBuiltinWaitGroupEvents, "wait_group_events"},
+    {eCLBuiltinAsyncWorkGroupCopy2D2D, "async_work_group_copy_2D2D"},
+    {eCLBuiltinAsyncWorkGroupCopy3D3D, "async_work_group_copy_3D3D"},
+
+    // 6.12.11 Atomic Functions
+    // Every atomic ID is listed under both its 'atom_' and its 'atomic_'
+    // spelling. The 'atom_' rows come first, so an ID-to-name lookup
+    // (getBuiltinName) yields the 'atom_' spelling.
+    {eCLBuiltinAtomicAdd, "atom_add"},
+    {eCLBuiltinAtomicSub, "atom_sub"},
+    {eCLBuiltinAtomicXchg, "atom_xchg"},
+    {eCLBuiltinAtomicInc, "atom_inc"},
+    {eCLBuiltinAtomicDec, "atom_dec"},
+    {eCLBuiltinAtomicCmpxchg, "atom_cmpxchg"},
+    {eCLBuiltinAtomicMin, "atom_min"},
+    {eCLBuiltinAtomicMax, "atom_max"},
+    {eCLBuiltinAtomicAnd, "atom_and"},
+    {eCLBuiltinAtomicOr, "atom_or"},
+    {eCLBuiltinAtomicXor, "atom_xor"},
+    {eCLBuiltinAtomicAdd, "atomic_add"},
+    {eCLBuiltinAtomicSub, "atomic_sub"},
+    {eCLBuiltinAtomicXchg, "atomic_xchg"},
+    {eCLBuiltinAtomicInc, "atomic_inc"},
+    {eCLBuiltinAtomicDec, "atomic_dec"},
+    {eCLBuiltinAtomicCmpxchg, "atomic_cmpxchg"},
+    {eCLBuiltinAtomicMin, "atomic_min"},
+    {eCLBuiltinAtomicMax, "atomic_max"},
+    {eCLBuiltinAtomicAnd, "atomic_and"},
+    {eCLBuiltinAtomicOr, "atomic_or"},
+    {eCLBuiltinAtomicXor, "atomic_xor"},
+
+    // 6.12.12 Miscellaneous Vector Functions
+    {eCLBuiltinShuffle, "shuffle"},
+    {eCLBuiltinShuffle2, "shuffle2"},
+
+    // 6.12.13 printf
+    {eCLBuiltinPrintf, "printf"},
+
+    // 6.15.16 Work-group Collective Functions
+    {eCLBuiltinWorkgroupAll, "work_group_all", OpenCLC20},
+    {eCLBuiltinWorkgroupAny, "work_group_any", OpenCLC20},
+    {eCLBuiltinWorkgroupBroadcast, "work_group_broadcast", OpenCLC20},
+    {eCLBuiltinWorkgroupReduceAdd, "work_group_reduce_add", OpenCLC20},
+    {eCLBuiltinWorkgroupReduceMin, "work_group_reduce_min", OpenCLC20},
+    {eCLBuiltinWorkgroupReduceMax, "work_group_reduce_max", OpenCLC20},
+    {eCLBuiltinWorkgroupScanAddInclusive, "work_group_scan_inclusive_add",
+     OpenCLC20},
+    {eCLBuiltinWorkgroupScanAddExclusive, "work_group_scan_exclusive_add",
+     OpenCLC20},
+    {eCLBuiltinWorkgroupScanMinInclusive, "work_group_scan_inclusive_min",
+     OpenCLC20},
+    {eCLBuiltinWorkgroupScanMinExclusive, "work_group_scan_exclusive_min",
+     OpenCLC20},
+    {eCLBuiltinWorkgroupScanMaxInclusive, "work_group_scan_inclusive_max",
+     OpenCLC20},
+    {eCLBuiltinWorkgroupScanMaxExclusive, "work_group_scan_exclusive_max",
+     OpenCLC20},
+
+    /// Provided by SPV_KHR_uniform_group_instructions.
+    {eCLBuiltinWorkgroupReduceMul, "work_group_reduce_mul", OpenCLC20},
+    {eCLBuiltinWorkgroupReduceAnd, "work_group_reduce_and", OpenCLC20},
+    {eCLBuiltinWorkgroupReduceOr, "work_group_reduce_or", OpenCLC20},
+    {eCLBuiltinWorkgroupReduceXor, "work_group_reduce_xor", OpenCLC20},
+    {eCLBuiltinWorkgroupReduceLogicalAnd, "work_group_reduce_logical_and",
+     OpenCLC20},
+    {eCLBuiltinWorkgroupReduceLogicalOr, "work_group_reduce_logical_or",
+     OpenCLC20},
+    {eCLBuiltinWorkgroupReduceLogicalXor, "work_group_reduce_logical_xor",
+     OpenCLC20},
+    {eCLBuiltinWorkgroupScanMulInclusive, "work_group_scan_inclusive_mul",
+     OpenCLC20},
+    {eCLBuiltinWorkgroupScanMulExclusive, "work_group_scan_exclusive_mul",
+     OpenCLC20},
+    {eCLBuiltinWorkgroupScanAndInclusive, "work_group_scan_inclusive_and",
+     OpenCLC20},
+    {eCLBuiltinWorkgroupScanAndExclusive, "work_group_scan_exclusive_and",
+     OpenCLC20},
+    {eCLBuiltinWorkgroupScanOrInclusive, "work_group_scan_inclusive_or",
+     OpenCLC20},
+    {eCLBuiltinWorkgroupScanOrExclusive, "work_group_scan_exclusive_or",
+     OpenCLC20},
+    {eCLBuiltinWorkgroupScanXorInclusive, "work_group_scan_inclusive_xor",
+     OpenCLC20},
+    {eCLBuiltinWorkgroupScanXorExclusive, "work_group_scan_exclusive_xor",
+     OpenCLC20},
+    {eCLBuiltinWorkgroupScanLogicalAndInclusive,
+     "work_group_scan_inclusive_logical_and", OpenCLC20},
+    {eCLBuiltinWorkgroupScanLogicalAndExclusive,
+     "work_group_scan_exclusive_logical_and", OpenCLC20},
+    {eCLBuiltinWorkgroupScanLogicalOrInclusive,
+     "work_group_scan_inclusive_logical_or", OpenCLC20},
+    {eCLBuiltinWorkgroupScanLogicalOrExclusive,
+     "work_group_scan_exclusive_logical_or", OpenCLC20},
+    {eCLBuiltinWorkgroupScanLogicalXorInclusive,
+     "work_group_scan_inclusive_logical_xor", OpenCLC20},
+    {eCLBuiltinWorkgroupScanLogicalXorExclusive,
+     "work_group_scan_exclusive_logical_xor", OpenCLC20},
+
+    // 6.15.19 Subgroup Collective Functions
+    {eCLBuiltinSubgroupAll, "sub_group_all", OpenCLC30},
+    {eCLBuiltinSubgroupAny, "sub_group_any", OpenCLC30},
+    {eCLBuiltinSubgroupBroadcast, "sub_group_broadcast", OpenCLC30},
+    {eCLBuiltinSubgroupReduceAdd, "sub_group_reduce_add", OpenCLC30},
+    {eCLBuiltinSubgroupReduceMin, "sub_group_reduce_min", OpenCLC30},
+    {eCLBuiltinSubgroupReduceMax, "sub_group_reduce_max", OpenCLC30},
+    {eCLBuiltinSubgroupScanAddInclusive, "sub_group_scan_inclusive_add",
+     OpenCLC30},
+    {eCLBuiltinSubgroupScanAddExclusive, "sub_group_scan_exclusive_add",
+     OpenCLC30},
+    {eCLBuiltinSubgroupScanMinInclusive, "sub_group_scan_inclusive_min",
+     OpenCLC30},
+    {eCLBuiltinSubgroupScanMinExclusive, "sub_group_scan_exclusive_min",
+     OpenCLC30},
+    {eCLBuiltinSubgroupScanMaxInclusive, "sub_group_scan_inclusive_max",
+     OpenCLC30},
+    {eCLBuiltinSubgroupScanMaxExclusive, "sub_group_scan_exclusive_max",
+     OpenCLC30},
+    /// Provided by SPV_KHR_uniform_group_instructions.
+    {eCLBuiltinSubgroupReduceMul, "sub_group_reduce_mul", OpenCLC30},
+    {eCLBuiltinSubgroupReduceAnd, "sub_group_reduce_and", OpenCLC30},
+    {eCLBuiltinSubgroupReduceOr, "sub_group_reduce_or", OpenCLC30},
+    {eCLBuiltinSubgroupReduceXor, "sub_group_reduce_xor", OpenCLC30},
+    {eCLBuiltinSubgroupReduceLogicalAnd, "sub_group_reduce_logical_and",
+     OpenCLC30},
+    {eCLBuiltinSubgroupReduceLogicalOr, "sub_group_reduce_logical_or",
+     OpenCLC30},
+    {eCLBuiltinSubgroupReduceLogicalXor, "sub_group_reduce_logical_xor",
+     OpenCLC30},
+    {eCLBuiltinSubgroupScanMulInclusive, "sub_group_scan_inclusive_mul",
+     OpenCLC30},
+    {eCLBuiltinSubgroupScanMulExclusive, "sub_group_scan_exclusive_mul",
+     OpenCLC30},
+    {eCLBuiltinSubgroupScanAndInclusive, "sub_group_scan_inclusive_and",
+     OpenCLC30},
+    {eCLBuiltinSubgroupScanAndExclusive, "sub_group_scan_exclusive_and",
+     OpenCLC30},
+    {eCLBuiltinSubgroupScanOrInclusive, "sub_group_scan_inclusive_or",
+     OpenCLC30},
+    {eCLBuiltinSubgroupScanOrExclusive, "sub_group_scan_exclusive_or",
+     OpenCLC30},
+    {eCLBuiltinSubgroupScanXorInclusive, "sub_group_scan_inclusive_xor",
+     OpenCLC30},
+    {eCLBuiltinSubgroupScanXorExclusive, "sub_group_scan_exclusive_xor",
+     OpenCLC30},
+    {eCLBuiltinSubgroupScanLogicalAndInclusive,
+     "sub_group_scan_inclusive_logical_and", OpenCLC30},
+    {eCLBuiltinSubgroupScanLogicalAndExclusive,
+     "sub_group_scan_exclusive_logical_and", OpenCLC30},
+    {eCLBuiltinSubgroupScanLogicalOrInclusive,
+     "sub_group_scan_inclusive_logical_or", OpenCLC30},
+    {eCLBuiltinSubgroupScanLogicalOrExclusive,
+     "sub_group_scan_exclusive_logical_or", OpenCLC30},
+    {eCLBuiltinSubgroupScanLogicalXorInclusive,
+     "sub_group_scan_inclusive_logical_xor", OpenCLC30},
+    {eCLBuiltinSubgroupScanLogicalXorExclusive,
+     "sub_group_scan_exclusive_logical_xor", OpenCLC30},
+
+    // GLSL builtin functions
+    {eCLBuiltinCodeplayFaceForward, "codeplay_face_forward"},
+    {eCLBuiltinCodeplayReflect, "codeplay_reflect"},
+    {eCLBuiltinCodeplayRefract, "codeplay_refract"},
+    {eCLBuiltinCodeplayFindLSB, "codeplay_pack_find_lsb"},
+    {eCLBuiltinCodeplayFindMSB, "codeplay_pack_find_msb"},
+    {eCLBuiltinCodeplayBitReverse, "codeplay_pack_bit_reverse"},
+    {eCLBuiltinCodeplayPackNormalizeChar4, "codeplay_pack_normalize_char4"},
+    {eCLBuiltinCodeplayPackNormalizeUchar4, "codeplay_pack_normalize_uchar4"},
+    {eCLBuiltinCodeplayPackNormalizeShort2, "codeplay_pack_normalize_short2"},
+    {eCLBuiltinCodeplayPackNormalizeUshort2, "codeplay_pack_normalize_ushort2"},
+    {eCLBuiltinCodeplayPackHalf2, "codeplay_pack_half2"},
+    {eCLBuiltinCodeplayUnpackNormalize, "codeplay_unpack_normalize"},
+    {eCLBuiltinCodeplayUnpackHalf2, "codeplay_unpack_half2"},
+
+    // Sentinel rows: table scans terminate at the eBuiltinInvalid row.
+    {eBuiltinInvalid, nullptr},
+    {eBuiltinUnknown, nullptr}};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// @brief Look up, or declare if missing, a mangled OpenCL builtin in a module.
+///
+/// The builtin's OpenCL name is taken from getBuiltinName(), extended with
+/// @p Suffix, and mangled against the argument types and qualifiers. An
+/// existing function with that mangled name is returned unchanged; otherwise a
+/// new non-variadic declaration is inserted and given the SPIR_FUNC calling
+/// convention.
+///
+/// @param[in] M Module to search and declare in.
+/// @param[in] ID Builtin to declare.
+/// @param[in] RetTy Return type used when a new declaration is created.
+/// @param[in] ArgTys Argument types, used for mangling and for the new
+/// declaration's function type.
+/// @param[in] ArgQuals Argument type qualifiers, used for mangling.
+/// @param[in] Suffix Optional text appended to the builtin name before
+/// mangling.
+/// @return The builtin function, or nullptr if @p M is null, if @p ID has no
+/// known name, or if the mangled name is already taken by a global that is not
+/// a function.
+Function *CLBuiltinInfo::declareBuiltin(Module *M, BuiltinID ID, Type *RetTy,
+                                        ArrayRef<Type *> ArgTys,
+                                        ArrayRef<TypeQualifiers> ArgQuals,
+                                        Twine Suffix) {
+  if (!M) {
+    return nullptr;
+  }
+
+  // Determine the builtin function name, including the optional suffix.
+  std::string BuiltinName = getBuiltinName(ID).str();
+  if (BuiltinName.empty()) {
+    return nullptr;
+  }
+  BuiltinName += Suffix.str();
+
+  // Mangle the function name and look it up in the module.
+  NameMangler Mangler(&M->getContext());
+  const std::string MangledName =
+      Mangler.mangleName(BuiltinName, ArgTys, ArgQuals);
+  if (Function *Existing = M->getFunction(MangledName)) {
+    return Existing;
+  }
+
+  // Declare the builtin. Use the callee handed back by getOrInsertFunction
+  // rather than a second name lookup: if a non-function global already owns
+  // this name the callee is not a Function, and we report failure instead of
+  // dereferencing a null pointer.
+  FunctionType *FT = FunctionType::get(RetTy, ArgTys, false);
+  auto *Builtin =
+      dyn_cast<Function>(M->getOrInsertFunction(MangledName, FT).getCallee());
+  if (!Builtin) {
+    return nullptr;
+  }
+  Builtin->setCallingConv(CallingConv::SPIR_FUNC);
+  return Builtin;
+}
+
+/// @brief Reports which builtin ID stands for OpenCL's 'printf'.
+/// @return Always eCLBuiltinPrintf.
+BuiltinID CLBuiltinInfo::getPrintfBuiltin() const {
+  return eCLBuiltinPrintf;
+}
+
+/// @brief Access the builtins module held by the loader.
+/// @return Whatever the loader reports as its builtins module, or nullptr when
+/// there is no loader.
+Module *CLBuiltinInfo::getBuiltinsModule() {
+  return Loader ? Loader->getBuiltinsModule() : nullptr;
+}
+
+/// @brief Obtain a builtin by name, preferring what the destination module
+/// already contains.
+///
+/// @param[in] BuiltinName Name of the builtin function to find.
+/// @param[in] DestM Module searched first; may be null, in which case only the
+/// loader is consulted.
+/// @param[in] Flags Materialization flags. When eBuiltinMatDefinition is set,
+/// a bare declaration found in @p DestM is not accepted.
+/// @return The function found in @p DestM, otherwise the loader's result, or
+/// nullptr when there is no loader.
+Function *CLBuiltinInfo::materializeBuiltin(StringRef BuiltinName,
+                                            Module *DestM,
+                                            BuiltinMatFlags Flags) {
+  // The destination module gets the first chance to satisfy the request.
+  Function *const Existing = DestM ? DestM->getFunction(BuiltinName) : nullptr;
+  if (Existing) {
+    // What was found may be only a declaration; that is good enough unless the
+    // caller explicitly asked for a definition.
+    const bool DeclarationSuffices = !(Flags & eBuiltinMatDefinition);
+    if (DeclarationSuffices || !Existing->isDeclaration()) {
+      return Existing;
+    }
+  }
+
+  // Fall back to the builtins module, via the loader.
+  return Loader ? Loader->materializeBuiltin(BuiltinName, DestM, Flags)
+                : nullptr;
+}
+
+/// @brief Work out which OpenCL builtin, if any, a function represents.
+///
+/// Identification proceeds in three steps:
+///  1. The demangled name is compared against the fixed-name rows of the
+///     builtin table. A row only matches if the OpenCL version of the
+///     function's module is at least the row's minimum version.
+///  2. If nothing matched and demangling left the name unchanged, the function
+///     is treated as not being an OpenCL builtin.
+///  3. Otherwise the demangled name is parsed as one of the name families that
+///     encode a type or width: 'vloadN', 'vload_half[N]', 'vstoreN',
+///     'vstore_half[N][_rte|_rtz|_rtp|_rtn]' and 'as_<type>[N]'.
+///
+/// @param[in] F Function to identify. It must belong to a module, since the
+/// module is queried for its OpenCL version.
+/// @return The matching builtin ID; eBuiltinInvalid for an unmangled name that
+/// is not in the table; eBuiltinUnknown for a mangled name that is not
+/// recognized.
+BuiltinID CLBuiltinInfo::identifyBuiltin(const Function &F) const {
+  NameMangler Mangler(nullptr);
+  const StringRef Name = F.getName();
+  const auto Version = getOpenCLVersion(*F.getParent());
+  // Demangle once; the result feeds both the table lookup and the lexer below.
+  const StringRef DemangledName = Mangler.demangleName(Name);
+
+  // Step 1: fixed-name builtins, gated on the module's OpenCL version.
+  for (const CLBuiltinEntry *Entry = Builtins; Entry->ID != eBuiltinInvalid;
+       ++Entry) {
+    if (Version >= Entry->MinVer && DemangledName.equals(Entry->OpenCLFnName)) {
+      return Entry->ID;
+    }
+  }
+
+  if (DemangledName == Name) {
+    // The function name is not mangled and so it can not be an OpenCL builtin.
+    return eBuiltinInvalid;
+  }
+
+  // Step 3: builtin families whose names carry a vector width and/or a type.
+  Lexer L(DemangledName);
+  if (L.Consume("vload")) {
+    unsigned Width = 0;
+    if (L.Consume("_half")) {
+      // We have both `vload_half` and `vload_halfN` variants.
+      if (!L.ConsumeInteger(Width) || isValidVecWidth(Width)) {
+        // If there's nothing left to parse we're good to go.
+        if (!L.Left()) {
+          return eCLBuiltinVLoadHalf;
+        }
+      }
+    } else if (L.ConsumeInteger(Width) && !L.Left() && isValidVecWidth(Width)) {
+      // There are no scalar variants of this builtin.
+      return eCLBuiltinVLoad;
+    }
+  } else if (L.Consume("vstore")) {
+    unsigned Width = 0;
+    if (L.Consume("_half")) {
+      // We have both `vstore_half` and `vstore_halfN` variants.
+      if (!L.ConsumeInteger(Width) || isValidVecWidth(Width)) {
+        // Rounding modes are optional, so whether one was consumed is
+        // deliberately ignored.
+        (void)(L.Consume("_rte") || L.Consume("_rtz") || L.Consume("_rtp") ||
+               L.Consume("_rtn"));
+
+        // If there's nothing left to parse we're good to go.
+        if (!L.Left()) {
+          return eCLBuiltinVStoreHalf;
+        }
+      }
+    } else if (L.ConsumeInteger(Width) && !L.Left() && isValidVecWidth(Width)) {
+      // There are no scalar variants of this builtin.
+      return eCLBuiltinVStore;
+    }
+  } else if (L.Consume("as_")) {
+    if (L.Consume("char") || L.Consume("uchar") || L.Consume("short") ||
+        L.Consume("ushort") || L.Consume("int") || L.Consume("uint") ||
+        L.Consume("long") || L.Consume("ulong") || L.Consume("float") ||
+        L.Consume("double") || L.Consume("half")) {
+      // The element type may be followed by a vector width.
+      unsigned Width = 0;
+      if (!L.ConsumeInteger(Width) || isValidVecWidth(Width)) {
+        if (!L.Left()) {
+          return eCLBuiltinAs;
+        }
+      }
+    }
+  }
+
+  return eBuiltinUnknown;
+}
+
+/// @brief Translate a builtin ID into its OpenCL function name.
+///
+/// The builtin table is scanned from the top and the first row carrying the
+/// requested ID wins.
+///
+/// @param[in] ID Builtin to look up.
+/// @return The unmangled name from the table, or an empty string if no row
+/// before the terminating sentinel has this ID.
+llvm::StringRef CLBuiltinInfo::getBuiltinName(BuiltinID ID) const {
+  for (const CLBuiltinEntry *Row = Builtins; Row->ID != eBuiltinInvalid;
+       ++Row) {
+    if (Row->ID == ID) {
+      return Row->OpenCLFnName;
+    }
+  }
+  // Not a table-driven builtin.
+  return llvm::StringRef();
+}
+
+/// @brief Classify the uniformity of a call to a builtin.
+///
+/// Only the call instruction is inspected; the other two parameters are
+/// unused.
+///
+/// @param[in] CI Call instruction to classify.
+/// @return eBuiltinUniformityNever when the called function is known and
+/// analyzeBuiltin() reports side effects for it; eBuiltinUniformityLikeInputs
+/// in every other case, including when the callee cannot be determined.
+BuiltinUniformity CLBuiltinInfo::isBuiltinUniform(const Builtin &,
+                                                  const CallInst *CI,
+                                                  unsigned) const {
+  const Function *const Callee = CI->getCalledFunction();
+  if (!Callee) {
+    // Without a known callee there is nothing to analyze.
+    return eBuiltinUniformityLikeInputs;
+  }
+
+  // Builtins with side effects are assumed to be varying.
+  if (analyzeBuiltin(*Callee).properties & eBuiltinPropertySideEffects) {
+    return eBuiltinUniformityNever;
+  }
+  return eBuiltinUniformityLikeInputs;
+}
+
+Builtin CLBuiltinInfo::analyzeBuiltin(const Function &Callee) const {
+  const BuiltinID ID = identifyBuiltin(Callee);
+
+  bool IsConvergent = false;
+  unsigned Properties = eBuiltinPropertyNone;
+  switch (ID) {
+    default:
+      // Assume convergence on unknown builtins.
+      IsConvergent = true;
+      break;
+    case eBuiltinUnknown: {
+      // Assume convergence on unknown builtins.
+      IsConvergent = true;
+      // If we know that this is an OpenCL builtin, but we don't have any
+      // special information about it, we can determine if it has side effects
+      // or not by its return type and its parameters. This depends on being
+      // able to identify all the "special" builtins, such as barriers and
+      // fences.
+      bool HasSideEffects = false;
+
+      // Void functions have side effects
+      if (Callee.getReturnType() == Type::getVoidTy(Callee.getContext())) {
+        HasSideEffects = true;
+      }
+      // Functions that take pointers probably have side effects
+      for (const auto &arg : Callee.args()) {
+        if (arg.getType()->isPointerTy()) {
+          HasSideEffects = true;
+        }
+      }
+      Properties |= HasSideEffects ? eBuiltinPropertySideEffects
+                                   : eBuiltinPropertyNoSideEffects;
+    } break;
+    case eCLBuiltinBarrier:
+      IsConvergent = true;
+      Properties |= eBuiltinPropertyExecutionFlow;
+      Properties |= eBuiltinPropertySideEffects;
+      Properties |= eBuiltinPropertyLowerToMuxBuiltin;
+      break;
+    case eCLBuiltinMemFence:
+    case eCLBuiltinReadMemFence:
+    case eCLBuiltinWriteMemFence:
+      Properties |= eBuiltinPropertySupportsInstantiation;
+      Properties |= eBuiltinPropertyLowerToMuxBuiltin;
+      break;
+    case eCLBuiltinPrintf:
+      Properties |= eBuiltinPropertySideEffects;
+      Properties |= eBuiltinPropertySupportsInstantiation;
+      break;
+    case eCLBuiltinAsyncWorkGroupCopy:
+    case eCLBuiltinAsyncWorkGroupStridedCopy:
+    case eCLBuiltinWaitGroupEvents:
+    case eCLBuiltinAsyncWorkGroupCopy2D2D:
+    case eCLBuiltinAsyncWorkGroupCopy3D3D:
+      // Our implementation of these builtins uses thread checks against
+      // specific work-item IDs, so they are convergent.
+      IsConvergent = true;
+      Properties |= eBuiltinPropertyNoSideEffects;
+      Properties |= eBuiltinPropertyLowerToMuxBuiltin;
+      break;
+    case eCLBuiltinAtomicAdd:
+    case eCLBuiltinAtomicSub:
+    case eCLBuiltinAtomicXchg:
+    case eCLBuiltinAtomicInc:
+    case eCLBuiltinAtomicDec:
+    case eCLBuiltinAtomicCmpxchg:
+    case eCLBuiltinAtomicMin:
+    case eCLBuiltinAtomicMax:
+    case eCLBuiltinAtomicAnd:
+    case eCLBuiltinAtomicOr:
+    case eCLBuiltinAtomicXor:
+      Properties |= eBuiltinPropertySideEffects;
+      Properties |= eBuiltinPropertySupportsInstantiation;
+      Properties |= eBuiltinPropertyAtomic;
+      break;
+    case eCLBuiltinGetWorkDim:
+    case eCLBuiltinGetGroupId:
+    case eCLBuiltinGetGlobalSize:
+    case eCLBuiltinGetGlobalOffset:
+    case eCLBuiltinGetNumGroups:
+    case eCLBuiltinGetGlobalId:
+    case eCLBuiltinGetLocalSize:
+    case eCLBuiltinGetEnqueuedLocalSize:
+    case eCLBuiltinGetLocalLinearId:
+    case eCLBuiltinGetGlobalLinearId:
+    case eCLBuiltinGetSubgroupLocalId:
+      Properties |= eBuiltinPropertyWorkItem;
+      Properties |= eBuiltinPropertyRematerializable;
+      Properties |= eBuiltinPropertyLowerToMuxBuiltin;
+      break;
+    case eCLBuiltinGetLocalId:
+      Properties |= eBuiltinPropertyWorkItem;
+      Properties |= eBuiltinPropertyLocalID;
+      Properties |= eBuiltinPropertyRematerializable;
+      Properties |= eBuiltinPropertyLowerToMuxBuiltin;
+      break;
+    case eCLBuiltinDot:
+    case eCLBuiltinCross:
+    case eCLBuiltinFastDistance:
+    case eCLBuiltinFastLength:
+    case eCLBuiltinFastNormalize:
+      Properties |= eBuiltinPropertyReduction;
+      Properties |= eBuiltinPropertyNoVectorEquivalent;
+      Properties |= eBuiltinPropertyCanEmitInline;
+      break;
+    case eCLBuiltinDistance:
+    case eCLBuiltinLength:
+    case eCLBuiltinNormalize:
+      Properties |= eBuiltinPropertyReduction;
+      Properties |= eBuiltinPropertyNoVectorEquivalent;
+      // XXX The inline implementation seems to have precision issues. The dot
+      // product can overflow to +inf which results in the wrong result.
+      // See redmine #6427 and #9115
+      // Properties |= eBuiltinPropertyCanEmitInline;
+      break;
+    case eCLBuiltinIsEqual:
+    case eCLBuiltinIsNotEqual:
+    case eCLBuiltinIsGreater:
+    case eCLBuiltinIsGreaterEqual:
+    case eCLBuiltinIsLess:
+    case eCLBuiltinIsLessEqual:
+    case eCLBuiltinIsLessGreater:
+    case eCLBuiltinIsOrdered:
+    case eCLBuiltinIsUnordered:
+    case eCLBuiltinIsFinite:
+    case eCLBuiltinIsInf:
+    case eCLBuiltinIsNan:
+    case eCLBuiltinIsNormal:
+    case eCLBuiltinSignBit:
+      // Scalar variants return '0' or '1', vector variants '0' or '111...1'.
+      Properties |= eBuiltinPropertyNoVectorEquivalent;
+      Properties |= eBuiltinPropertyCanEmitInline;
+      Properties |= eBuiltinPropertySupportsInstantiation;
+      break;
+    case eCLBuiltinAny:
+    case eCLBuiltinAll:
+      Properties |= eBuiltinPropertyNoVectorEquivalent;
+      Properties |= eBuiltinPropertyCanEmitInline;
+      break;
+    case eCLBuiltinFract:
+    case eCLBuiltinModF:
+    case eCLBuiltinSinCos:
+      Properties |= eBuiltinPropertyPointerReturnEqualRetTy;
+      break;
+    case eCLBuiltinFrexp:
+    case eCLBuiltinLGammaR:
+    case eCLBuiltinRemquo:
+      Properties |= eBuiltinPropertyPointerReturnEqualIntRetTy;
+      break;
+    case eCLBuiltinShuffle:
+    case eCLBuiltinShuffle2:
+      // While there are vector equivalents for these builtins, they require a
+      // modified mask, so we cannot use them by simply packetizing their
+      // arguments.
+      Properties |= eBuiltinPropertyNoVectorEquivalent;
+      Properties |= eBuiltinPropertyCanEmitInline;
+      break;
+    case eCLBuiltinFMax:
+    case eCLBuiltinFMin:
+    case eCLBuiltinAddSat:
+    case eCLBuiltinSubSat:
+      Properties |= eBuiltinPropertyCanEmitInline;
+      break;
+    case eCLBuiltinCodeplayFaceForward:
+    case eCLBuiltinCodeplayReflect:
+    case eCLBuiltinCodeplayRefract:
+      Properties |= eBuiltinPropertyReduction;
+      Properties |= eBuiltinPropertyNoVectorEquivalent;
+      break;
+    case eCLBuiltinConvertChar:
+    case eCLBuiltinConvertShort:
+    case eCLBuiltinConvertInt:
+    case eCLBuiltinConvertLong:
+    case eCLBuiltinConvertUChar:
+    case eCLBuiltinConvertUShort:
+    case eCLBuiltinConvertUInt:
+    case eCLBuiltinConvertULong:
+      Properties |= eBuiltinPropertyCanEmitInline;
+      break;
+    case eCLBuiltinVLoad:
+    case eCLBuiltinVLoadHalf:
+      Properties |= eBuiltinPropertyNoSideEffects;
+      Properties |= eBuiltinPropertyNoVectorEquivalent;
+      Properties |= eBuiltinPropertyCanEmitInline;
+      break;
+    case eCLBuiltinVStore:
+    case eCLBuiltinVStoreHalf:
+      Properties |= eBuiltinPropertySideEffects;
+      Properties |= eBuiltinPropertyNoVectorEquivalent;
+      Properties |= eBuiltinPropertyCanEmitInline;
+      break;
+    case eCLBuiltinSelect:
+    case eCLBuiltinAs:
+      // Some of these builtins do have vector equivalents, but since we can
+      // emit all variants inline, we mark them as having none for simplicity.
+      Properties |= eBuiltinPropertyNoVectorEquivalent;
+      Properties |= eBuiltinPropertyCanEmitInline;
+      break;
+    case eCLBuiltinWorkGroupBarrier:
+    case eCLBuiltinSubGroupBarrier:
+      IsConvergent = true;
+      LLVM_FALLTHROUGH;
+    case eCLBuiltinAtomicWorkItemFence:
+      Properties |= eBuiltinPropertyLowerToMuxBuiltin;
+      break;
+    case eCLBuiltinGetSubgroupSize:
+    case eCLBuiltinGetMaxSubgroupSize:
+    case eCLBuiltinGetNumSubgroups:
+    case eCLBuiltinGetEnqueuedNumSubgroups:
+    case eCLBuiltinGetSubgroupId:
+      Properties |= eBuiltinPropertyLowerToMuxBuiltin;
+      break;
+      // Subgroup collectives
+    case eCLBuiltinSubgroupAll:
+    case eCLBuiltinSubgroupAny:
+    case eCLBuiltinSubgroupBroadcast:
+    case eCLBuiltinSubgroupReduceAdd:
+    case eCLBuiltinSubgroupReduceMin:
+    case eCLBuiltinSubgroupReduceMax:
+    case eCLBuiltinSubgroupScanAddInclusive:
+    case eCLBuiltinSubgroupScanAddExclusive:
+    case eCLBuiltinSubgroupScanMinInclusive:
+    case eCLBuiltinSubgroupScanMinExclusive:
+    case eCLBuiltinSubgroupScanMaxInclusive:
+    case eCLBuiltinSubgroupScanMaxExclusive:
+    case eCLBuiltinSubgroupReduceMul:
+    case eCLBuiltinSubgroupReduceAnd:
+    case eCLBuiltinSubgroupReduceOr:
+    case eCLBuiltinSubgroupReduceXor:
+    case eCLBuiltinSubgroupReduceLogicalAnd:
+    case eCLBuiltinSubgroupReduceLogicalOr:
+    case eCLBuiltinSubgroupReduceLogicalXor:
+    case eCLBuiltinSubgroupScanMulInclusive:
+    case eCLBuiltinSubgroupScanMulExclusive:
+    case eCLBuiltinSubgroupScanAndInclusive:
+    case eCLBuiltinSubgroupScanAndExclusive:
+    case eCLBuiltinSubgroupScanOrInclusive:
+    case eCLBuiltinSubgroupScanOrExclusive:
+    case eCLBuiltinSubgroupScanXorInclusive:
+    case eCLBuiltinSubgroupScanXorExclusive:
+    case eCLBuiltinSubgroupScanLogicalAndInclusive:
+    case eCLBuiltinSubgroupScanLogicalAndExclusive:
+    case eCLBuiltinSubgroupScanLogicalOrInclusive:
+    case eCLBuiltinSubgroupScanLogicalOrExclusive:
+    case eCLBuiltinSubgroupScanLogicalXorInclusive:
+    case eCLBuiltinSubgroupScanLogicalXorExclusive:
+      // Work-group collectives
+    case eCLBuiltinWorkgroupAll:
+    case eCLBuiltinWorkgroupAny:
+    case eCLBuiltinWorkgroupBroadcast:
+    case eCLBuiltinWorkgroupReduceAdd:
+    case eCLBuiltinWorkgroupReduceMin:
+    case eCLBuiltinWorkgroupReduceMax:
+    case eCLBuiltinWorkgroupScanAddInclusive:
+    case eCLBuiltinWorkgroupScanAddExclusive:
+    case eCLBuiltinWorkgroupScanMinInclusive:
+    case eCLBuiltinWorkgroupScanMinExclusive:
+    case eCLBuiltinWorkgroupScanMaxInclusive:
+    case eCLBuiltinWorkgroupScanMaxExclusive:
+    case eCLBuiltinWorkgroupReduceMul:
+    case eCLBuiltinWorkgroupReduceAnd:
+    case eCLBuiltinWorkgroupReduceOr:
+    case eCLBuiltinWorkgroupReduceXor:
+    case eCLBuiltinWorkgroupReduceLogicalAnd:
+    case eCLBuiltinWorkgroupReduceLogicalOr:
+    case eCLBuiltinWorkgroupReduceLogicalXor:
+    case eCLBuiltinWorkgroupScanMulInclusive:
+    case eCLBuiltinWorkgroupScanMulExclusive:
+    case eCLBuiltinWorkgroupScanAndInclusive:
+    case eCLBuiltinWorkgroupScanAndExclusive:
+    case eCLBuiltinWorkgroupScanOrInclusive:
+    case eCLBuiltinWorkgroupScanOrExclusive:
+    case eCLBuiltinWorkgroupScanXorInclusive:
+    case eCLBuiltinWorkgroupScanXorExclusive:
+    case eCLBuiltinWorkgroupScanLogicalAndInclusive:
+    case eCLBuiltinWorkgroupScanLogicalAndExclusive:
+    case eCLBuiltinWorkgroupScanLogicalOrInclusive:
+    case eCLBuiltinWorkgroupScanLogicalOrExclusive:
+    case eCLBuiltinWorkgroupScanLogicalXorInclusive:
+    case eCLBuiltinWorkgroupScanLogicalXorExclusive:
+      IsConvergent = true;
+      Properties |= eBuiltinPropertyLowerToMuxBuiltin;
+      break;
+  }
+
+  if (!IsConvergent) {
+    Properties |= eBuiltinPropertyKnownNonConvergent;
+  }
+
+  return Builtin{Callee, ID, (BuiltinProperties)Properties};
+}
+
+/// @brief Finds the vector builtin corresponding to a scalar builtin.
+///
+/// The builtin's name is demangled, every parameter type is widened to a
+/// `Width`-lane fixed vector and the name is re-mangled; the result is looked
+/// up through materializeBuiltin(). When getPointerReturnPointeeTy() reports a
+/// pointee type for the builtin, pointer parameters are rewritten to point at
+/// a vector of that pointee type instead. `as_*` and `convert_*` names get
+/// the width inserted after the alphabetic chunk that follows the prefix.
+///
+/// @param[in] B Builtin to find a vector equivalent for.
+/// @param[in] Width Number of lanes of the vector equivalent.
+/// @param[in] M Module forwarded to materializeBuiltin().
+///
+/// @return The vector builtin, or null if there is none: the builtin is
+/// flagged eBuiltinPropertyNoVectorEquivalent, its name does not demangle, a
+/// parameter type is null, already a vector or not a valid vector element,
+/// or the function found does not return a `Width`-lane vector of the scalar
+/// builtin's return type.
+Function *CLBuiltinInfo::getVectorEquivalent(const Builtin &B, unsigned Width,
+                                             Module *M) {
+  // Analyze the builtin. Some functions have no vector equivalent.
+  const auto Props = B.properties;
+  if (Props & eBuiltinPropertyNoVectorEquivalent) {
+    return nullptr;
+  }
+
+  // Builtin functions have mangled names. If it's not mangled, there will be
+  // no vector equivalent.
+  NameMangler Mangler(&B.function.getContext());
+  SmallVector<Type *, 4> BuiltinArgTypes, BuiltinPointeeTypes;
+  SmallVector<TypeQualifiers, 4> BuiltinArgQuals;
+  const StringRef BuiltinName =
+      Mangler.demangleName(B.function.getName(), BuiltinArgTypes,
+                           BuiltinPointeeTypes, BuiltinArgQuals);
+  if (BuiltinName.empty()) {
+    return nullptr;
+  }
+
+  // Determine the mangled name of the vector equivalent.
+  // This means creating a list of qualified types for the arguments.
+  SmallVector<Type *, 4> VectorTypes;
+  SmallVector<TypeQualifiers, 4> VectorQuals;
+  for (unsigned i = 0; i < BuiltinArgTypes.size(); i++) {
+    Type *OldTy = BuiltinArgTypes[i];
+    const TypeQualifiers OldQuals = BuiltinArgQuals[i];
+    // isa<>/dyn_cast<> must not be given a null pointer, so reject a null
+    // parameter type up front (getScalarEquivalent rejects them as well).
+    if (!OldTy) {
+      return nullptr;
+    }
+    // A parameter that is already a vector cannot be widened any further.
+    if (isa<FixedVectorType>(OldTy)) {
+      return nullptr;
+    }
+    PointerType *OldPtrTy = dyn_cast<PointerType>(OldTy);
+    if (OldPtrTy) {
+      if (auto *const PtrRetPointeeTy =
+              getPointerReturnPointeeTy(B.function, Props)) {
+        [[maybe_unused]] auto *OldPointeeTy = BuiltinPointeeTypes[i];
+        assert(OldPointeeTy && OldPointeeTy == PtrRetPointeeTy &&
+               "Demangling inconsistency");
+        if (!FixedVectorType::isValidElementType(PtrRetPointeeTy)) {
+          return nullptr;
+        }
+        // The pointer stays a pointer, but now addresses a vector of the
+        // original pointee type.
+        Type *NewEleTy = FixedVectorType::get(PtrRetPointeeTy, Width);
+        Type *NewType = PointerType::get(NewEleTy, OldPtrTy->getAddressSpace());
+        TypeQualifiers NewQuals;
+        TypeQualifiers EleQuals = OldQuals;
+        NewQuals.push_back(EleQuals.pop_front());  // Pointer qualifier
+        NewQuals.push_back(eTypeQualNone);         // Vector qualifier
+        NewQuals.push_back(EleQuals);
+
+        VectorTypes.push_back(NewType);
+        VectorQuals.push_back(NewQuals);
+
+        continue;
+      }
+    }
+
+    if (!FixedVectorType::isValidElementType(OldTy)) {
+      return nullptr;
+    }
+    TypeQualifiers NewQuals;
+    Type *NewType = FixedVectorType::get(OldTy, Width);
+    NewQuals.push_back(eTypeQualNone);  // Vector qualifier
+    NewQuals.push_back(OldQuals);       // Element qualifier
+
+    VectorTypes.push_back(NewType);
+    VectorQuals.push_back(NewQuals);
+  }
+
+  // Handle special builtin naming equivalents.
+  std::string EquivNameBase = BuiltinName.str();
+  StringRef FirstChunk;
+  Lexer L(BuiltinName);
+  if (L.ConsumeUntil('_', FirstChunk)) {
+    const bool AsBuiltin = FirstChunk.equals("as");
+    const bool ConvertBuiltin = FirstChunk.equals("convert");
+    if (!L.Consume("_")) {
+      return nullptr;
+    }
+    StringRef SecondChunkNoWidth;
+    if (!L.ConsumeAlpha(SecondChunkNoWidth)) {
+      return nullptr;
+    }
+    if (AsBuiltin || ConvertBuiltin) {
+      // as_* and convert_* builtins have vector equivalents, with a vector
+      // width suffix. Add the width suffix to the scalar builtin name.
+      if (AsBuiltin && L.Left()) {
+        return nullptr;
+      }
+      // Format the width directly rather than keeping a Twine in a local:
+      // Twines are only meant to live as temporaries.
+      EquivNameBase.insert(L.CurrentPos(), std::to_string(Width));
+    }
+  }
+
+  const std::string EquivName =
+      Mangler.mangleName(EquivNameBase, VectorTypes, VectorQuals);
+
+  // Lookup the vector equivalent and make sure the return type agrees.
+  Function *VectorBuiltin = materializeBuiltin(EquivName, M);
+  if (VectorBuiltin) {
+    Type *RetTy = B.function.getReturnType();
+    auto *VecRetTy = dyn_cast<FixedVectorType>(VectorBuiltin->getReturnType());
+    if (!VecRetTy || (VecRetTy->getElementType() != RetTy) ||
+        (VecRetTy->getNumElements() != Width)) {
+      VectorBuiltin = nullptr;
+    }
+  }
+  return VectorBuiltin;
+}
+
+/// @brief Finds the scalar builtin corresponding to a vector builtin.
+///
+/// The builtin's name is demangled, vector parameters are narrowed to their
+/// element type and the name is re-mangled; the result is looked up through
+/// materializeBuiltin(). When getPointerReturnPointeeTy() reports a vector
+/// pointee type for the builtin, pointer parameters are rewritten to point
+/// at the element type. `as_*` and `convert_*` names lose their numeric
+/// width suffix.
+///
+/// @param[in] B Builtin to find a scalar equivalent for.
+/// @param[in] M Module forwarded to materializeBuiltin().
+///
+/// @return The scalar builtin, or null if there is none: the builtin is
+/// flagged eBuiltinPropertyNoVectorEquivalent, does not return a fixed
+/// vector, its name does not demangle, a parameter type is null or a vector
+/// of a different width than the return type, or the function found does
+/// not return the element type of the vector builtin's return type.
+Function *CLBuiltinInfo::getScalarEquivalent(const Builtin &B, Module *M) {
+  // Analyze the builtin. Some functions have no scalar equivalent.
+  const auto Props = B.properties;
+  if (Props & eBuiltinPropertyNoVectorEquivalent) {
+    return nullptr;
+  }
+
+  // Check the return type.
+  auto *VecRetTy = dyn_cast<FixedVectorType>(B.function.getReturnType());
+  if (!VecRetTy) {
+    return nullptr;
+  }
+
+  // Builtin functions have mangled names. If it's not mangled, there will be
+  // no scalar equivalent.
+  NameMangler Mangler(&B.function.getContext());
+  SmallVector<Type *, 4> BuiltinArgTypes, BuiltinPointeeTypes;
+  SmallVector<TypeQualifiers, 4> BuiltinArgQuals;
+  const StringRef BuiltinName =
+      Mangler.demangleName(B.function.getName(), BuiltinArgTypes,
+                           BuiltinPointeeTypes, BuiltinArgQuals);
+  if (BuiltinName.empty()) {
+    return nullptr;
+  }
+
+  // Determine the mangled name of the scalar equivalent.
+  // This means creating a list of qualified types for the arguments.
+  const unsigned Width = VecRetTy->getNumElements();
+  SmallVector<Type *, 4> ScalarTypes;
+  SmallVector<TypeQualifiers, 4> ScalarQuals;
+  for (unsigned i = 0; i < BuiltinArgTypes.size(); i++) {
+    Type *OldTy = BuiltinArgTypes[i];
+    const TypeQualifiers OldQuals = BuiltinArgQuals[i];
+    // Reject null parameter types before any dyn_cast<>, which must not be
+    // given a null pointer.
+    if (!OldTy) {
+      return nullptr;
+    }
+    if (auto *OldVecTy = dyn_cast<FixedVectorType>(OldTy)) {
+      // Vector parameters must be as wide as the return type; they are
+      // replaced by their element type.
+      if (OldVecTy->getNumElements() != Width) {
+        return nullptr;
+      }
+      Type *NewTy = OldVecTy->getElementType();
+      TypeQualifiers NewQuals = OldQuals;
+      NewQuals.pop_front();  // Drop the vector qualifier.
+
+      ScalarTypes.push_back(NewTy);
+      ScalarQuals.push_back(NewQuals);
+    } else if (PointerType *OldPtrTy = dyn_cast<PointerType>(OldTy)) {
+      Type *const PtrRetPointeeTy =
+          getPointerReturnPointeeTy(B.function, Props);
+      if (PtrRetPointeeTy && PtrRetPointeeTy->isVectorTy()) {
+        [[maybe_unused]] auto *OldPointeeTy = BuiltinPointeeTypes[i];
+        assert(OldPointeeTy && OldPointeeTy == PtrRetPointeeTy &&
+               "Demangling inconsistency");
+        auto *OldVecTy = cast<FixedVectorType>(PtrRetPointeeTy);
+        Type *NewTy = PointerType::get(OldVecTy->getElementType(),
+                                       OldPtrTy->getAddressSpace());
+        // Keep the pointer and element qualifiers; the vector qualifier
+        // between them is dropped.
+        TypeQualifiers NewQuals = OldQuals;
+        const TypeQualifier PtrQual = NewQuals.pop_front();
+        const TypeQualifier VecQual = NewQuals.pop_front();
+        (void)VecQual;
+        const TypeQualifier EleQual = NewQuals.pop_front();
+        NewQuals.push_back(PtrQual);
+        NewQuals.push_back(EleQual);
+        ScalarTypes.push_back(NewTy);
+        ScalarQuals.push_back(NewQuals);
+      } else {
+        ScalarTypes.push_back(OldTy);
+        ScalarQuals.push_back(OldQuals);
+      }
+    } else {
+      // Any other parameter is passed through unchanged.
+      ScalarTypes.push_back(OldTy);
+      ScalarQuals.push_back(OldQuals);
+    }
+  }
+
+  // Handle special builtin naming equivalents.
+  std::string EquivNameBase = BuiltinName.str();
+  StringRef FirstChunk;
+  Lexer L(BuiltinName);
+  if (L.ConsumeUntil('_', FirstChunk)) {
+    const bool AsBuiltin = FirstChunk.equals("as");
+    const bool ConvertBuiltin = FirstChunk.equals("convert");
+    if (!L.Consume("_")) {
+      return nullptr;
+    }
+    StringRef SecondChunkNoWidth;
+    if (!L.ConsumeAlpha(SecondChunkNoWidth)) {
+      return nullptr;
+    }
+    if (AsBuiltin || ConvertBuiltin) {
+      // as_* and convert_* builtins have scalar equivalents, with no width
+      // suffix. Remove the width suffix from the vector builtin name.
+      const unsigned WidthStart = L.CurrentPos();
+      // Only the extent of the suffix matters here; the parsed value is not
+      // used. Named distinctly so it does not shadow the return-type Width.
+      unsigned SuffixWidth = 0;
+      if (!L.ConsumeInteger(SuffixWidth)) {
+        return nullptr;
+      }
+      const unsigned WidthEnd = L.CurrentPos();
+      EquivNameBase.erase(WidthStart, WidthEnd - WidthStart);
+    }
+  }
+
+  const std::string EquivName =
+      Mangler.mangleName(EquivNameBase, ScalarTypes, ScalarQuals);
+
+  // Lookup the scalar equivalent and make sure the return type agrees.
+  Function *ScalarBuiltin = materializeBuiltin(EquivName, M);
+  if (!ScalarBuiltin) {
+    return nullptr;
+  }
+  Type *RetTy = ScalarBuiltin->getReturnType();
+  if (VecRetTy->getElementType() != RetTy) {
+    return nullptr;
+  }
+  return ScalarBuiltin;
+}
+
+/// @brief Queries whether one parameter of a (presumed builtin) function
+/// carries a given type qualifier, judging by the function's mangled name.
+/// @param[in] F Function whose name is demangled.
+/// @param[in] ParamIdx Index of the parameter to inspect.
+/// @param[in] Q Qualifier to search for.
+/// @return true if the parameter has the qualifier, false if it does not,
+/// and std::nullopt if the name cannot be demangled or ParamIdx is out of
+/// range.
+static std::optional<bool> paramHasTypeQual(const Function &F,
+                                            unsigned ParamIdx,
+                                            TypeQualifier Q) {
+  // The qualifiers are only recoverable from the mangled name.
+  SmallVector<Type *, 2> ParamTys;
+  SmallVector<TypeQualifiers, 2> ParamQuals;
+  NameMangler Mangler(&F.getContext());
+  const StringRef Demangled =
+      Mangler.demangleName(F.getName(), ParamTys, ParamQuals);
+  if (Demangled.empty() || ParamIdx >= ParamQuals.size()) {
+    return std::nullopt;
+  }
+
+  // Walk the parameter's qualifier list front to back until a match is found
+  // or the list is exhausted.
+  for (TypeQualifiers Remaining = ParamQuals[ParamIdx];
+       Remaining.getCount();) {
+    if (Remaining.pop_front() == Q) {
+      return true;
+    }
+  }
+  return false;
+}
+
+/// @brief Emits the body of a builtin inline, given the builtin's function.
+///
+/// Builtins whose lowering depends on the function itself (its mangled name
+/// or signature) are handled here; every other identified builtin is
+/// forwarded to the BuiltinID-based overload.
+///
+/// @param[in] F Builtin function to emit inline. May be null.
+/// @param[in] B Builder used to emit instructions.
+/// @param[in] Args Arguments the builtin is called with.
+///
+/// @return Value computed by the inlined builtin, or null if F is null, is
+/// not an identified builtin, or cannot be emitted inline.
+Value *CLBuiltinInfo::emitBuiltinInline(Function *F, IRBuilder<> &B,
+                                        ArrayRef<Value *> Args) {
+  if (!F) {
+    return nullptr;
+  }
+
+  // Handle 'common' builtins.
+  const BuiltinID ID = identifyBuiltin(*F);
+  if (ID == eBuiltinInvalid || ID == eBuiltinUnknown) {
+    return nullptr;
+  }
+
+  // Note we have to handle these specially since we need to deduce whether
+  // the source operand is signed or not. It is not possible to do this based
+  // solely on the BuiltinID.
+  switch (ID) {
+      // 6.2 Explicit Conversions
+    case eCLBuiltinConvertChar:
+    case eCLBuiltinConvertShort:
+    case eCLBuiltinConvertInt:
+    case eCLBuiltinConvertLong:
+    case eCLBuiltinConvertUChar:
+    case eCLBuiltinConvertUShort:
+    case eCLBuiltinConvertUInt:
+    case eCLBuiltinConvertULong:
+      return emitBuiltinInlineConvert(F, ID, B, Args);
+      // 6.12.3 Integer Functions
+    case eCLBuiltinAddSat:
+    case eCLBuiltinSubSat: {
+      // Both operands are indexed below; bail out rather than read past the
+      // end of Args.
+      if (Args.size() != 2) {
+        return nullptr;
+      }
+      // The signedness of the first parameter selects between the signed
+      // and unsigned saturating intrinsics.
+      std::optional<bool> IsParamSignedOrNone =
+          paramHasTypeQual(*F, 0, eTypeQualSignedInt);
+      if (!IsParamSignedOrNone.has_value()) {
+        return nullptr;
+      }
+      const bool IsSigned = *IsParamSignedOrNone;
+      const Intrinsic::ID IntrinsicOpc =
+          ID == eCLBuiltinSubSat
+              ? (IsSigned ? Intrinsic::ssub_sat : Intrinsic::usub_sat)
+              : (IsSigned ? Intrinsic::sadd_sat : Intrinsic::uadd_sat);
+      return emitBuiltinInlineAsLLVMBinaryIntrinsic(B, Args[0], Args[1],
+                                                    IntrinsicOpc);
+    }
+    case eCLBuiltinVLoad: {
+      // The vector width is the integer following "vload" in the name.
+      NameMangler Mangler(&F->getContext());
+      Lexer L(Mangler.demangleName(F->getName()));
+      if (L.Consume("vload")) {
+        unsigned Width = 0;
+        if (L.ConsumeInteger(Width)) {
+          return emitBuiltinInlineVLoad(F, Width, B, Args);
+        }
+      }
+    } break;
+    case eCLBuiltinVLoadHalf: {
+      NameMangler Mangler(&F->getContext());
+      const auto name = Mangler.demangleName(F->getName());
+      if (name == "vload_half") {
+        // TODO CA-4691 handle "vload_halfn"
+        return emitBuiltinInlineVLoadHalf(F, B, Args);
+      }
+    } break;
+    case eCLBuiltinVStore: {
+      // The vector width is the integer following "vstore" in the name.
+      NameMangler Mangler(&F->getContext());
+      Lexer L(Mangler.demangleName(F->getName()));
+      if (L.Consume("vstore")) {
+        unsigned Width = 0;
+        if (L.ConsumeInteger(Width)) {
+          return emitBuiltinInlineVStore(F, Width, B, Args);
+        }
+      }
+    } break;
+    case eCLBuiltinVStoreHalf: {
+      NameMangler Mangler(&F->getContext());
+      Lexer L(Mangler.demangleName(F->getName()));
+      if (L.Consume("vstore_half")) {
+        // TODO CA-4691 handle "vstore_halfn"
+        // Whatever follows "vstore_half" in the name is passed along.
+        return emitBuiltinInlineVStoreHalf(F, L.TextLeft(), B, Args);
+      }
+    } break;
+    case eCLBuiltinSelect:
+      return emitBuiltinInlineSelect(F, B, Args);
+    case eCLBuiltinAs:
+      return emitBuiltinInlineAs(F, B, Args);
+    default:
+      break;
+  }
+
+  // Everything else can be emitted from the builtin ID alone.
+  return emitBuiltinInline(ID, B, Args);
+}
+
+/// @brief Emits the body of a builtin inline, given only its builtin ID.
+///
+/// @param[in] BuiltinID Identifier of the builtin to emit.
+/// @param[in] B Builder used to emit instructions.
+/// @param[in] Args Arguments the builtin is called with.
+///
+/// @return Value computed by the inlined builtin, or null if the builtin is
+/// not handled here or is given the wrong number of arguments.
+Value *CLBuiltinInfo::emitBuiltinInline(BuiltinID BuiltinID, IRBuilder<> &B,
+                                        ArrayRef<Value *> Args) {
+  switch (BuiltinID) {
+    default:
+      return nullptr;
+
+    case eCLBuiltinDot:
+    case eCLBuiltinCross:
+    case eCLBuiltinLength:
+    case eCLBuiltinDistance:
+    case eCLBuiltinNormalize:
+    case eCLBuiltinFastLength:
+    case eCLBuiltinFastDistance:
+    case eCLBuiltinFastNormalize:
+      return emitBuiltinInlineGeometrics(BuiltinID, B, Args);
+    // 6.12.2 Math Functions
+    case eCLBuiltinFMax:
+      // Both operands are indexed below; do not read past the end of Args.
+      if (Args.size() != 2) {
+        return nullptr;
+      }
+      return emitBuiltinInlineAsLLVMBinaryIntrinsic(B, Args[0], Args[1],
+                                                    llvm::Intrinsic::maxnum);
+    case eCLBuiltinFMin:
+      if (Args.size() != 2) {
+        return nullptr;
+      }
+      return emitBuiltinInlineAsLLVMBinaryIntrinsic(B, Args[0], Args[1],
+                                                    llvm::Intrinsic::minnum);
+    // 6.12.6 Relational Functions
+    case eCLBuiltinAll:
+      return emitBuiltinInlineAll(B, Args);
+    case eCLBuiltinAny:
+      return emitBuiltinInlineAny(B, Args);
+    case eCLBuiltinIsEqual:
+    case eCLBuiltinIsNotEqual:
+    case eCLBuiltinIsGreater:
+    case eCLBuiltinIsGreaterEqual:
+    case eCLBuiltinIsLess:
+    case eCLBuiltinIsLessEqual:
+    case eCLBuiltinIsLessGreater:
+    case eCLBuiltinIsOrdered:
+    case eCLBuiltinIsUnordered:
+      return emitBuiltinInlineRelationalsWithTwoArguments(BuiltinID, B, Args);
+    case eCLBuiltinIsFinite:
+    case eCLBuiltinIsInf:
+    case eCLBuiltinIsNan:
+    case eCLBuiltinIsNormal:
+    case eCLBuiltinSignBit:
+      assert(Args.size() == 1 && "Invalid number of arguments");
+      // The assert vanishes in release builds; fail gracefully there.
+      if (Args.size() != 1) {
+        return nullptr;
+      }
+      return emitBuiltinInlineRelationalsWithOneArgument(BuiltinID, B, Args[0]);
+    // 6.12.12 Miscellaneous Vector Functions
+    case eCLBuiltinShuffle:
+    case eCLBuiltinShuffle2:
+      return emitBuiltinInlineShuffle(BuiltinID, B, Args);
+
+    case eCLBuiltinPrintf:
+      return emitBuiltinInlinePrintf(BuiltinID, B, Args);
+  }
+}
+
+/// @brief Dispatches a geometric builtin to its dedicated inline emitter.
+///
+/// @param[in] BuiltinID Identifier of the geometric builtin.
+/// @param[in] B Builder used to emit instructions.
+/// @param[in] Args Arguments the builtin is called with.
+///
+/// @return Value computed by the inlined builtin, or null if the ID is not a
+/// geometric builtin handled here or the emitter failed.
+Value *CLBuiltinInfo::emitBuiltinInlineGeometrics(BuiltinID BuiltinID,
+                                                  IRBuilder<> &B,
+                                                  ArrayRef<Value *> Args) {
+  switch (BuiltinID) {
+    case eCLBuiltinDot:
+      return emitBuiltinInlineDot(B, Args);
+    case eCLBuiltinCross:
+      return emitBuiltinInlineCross(B, Args);
+    case eCLBuiltinLength:
+    case eCLBuiltinFastLength:
+      return emitBuiltinInlineLength(B, Args);
+    case eCLBuiltinNormalize:
+    case eCLBuiltinFastNormalize:
+      return emitBuiltinInlineNormalize(B, Args);
+    case eCLBuiltinDistance:
+    case eCLBuiltinFastDistance: {
+      if (Args.size() != 2) {
+        return nullptr;
+      }
+      // The distance between two operands is the length of their
+      // difference.
+      Value *Difference = B.CreateFSub(Args[0], Args[1], "distance");
+      return emitBuiltinInlineLength(B, ArrayRef<Value *>(&Difference, 1));
+    }
+    default:
+      return nullptr;
+  }
+}
+
+/// @brief Emits 'dot' inline as a chain of scalar multiplies and adds.
+///
+/// @param[in] B Builder used to emit instructions.
+/// @param[in] Args The two operands of the dot product.
+///
+/// @return The dot product, or null unless exactly two arguments are given.
+Value *CLBuiltinInfo::emitBuiltinInlineDot(IRBuilder<> &B,
+                                           ArrayRef<Value *> Args) {
+  if (Args.size() != 2) {
+    return nullptr;
+  }
+  Value *const LHSVec = Args[0];
+  Value *const RHSVec = Args[1];
+
+  auto *const VecTy = dyn_cast<FixedVectorType>(LHSVec->getType());
+  if (!VecTy) {
+    // For scalar operands the dot product is a plain multiplication.
+    return B.CreateFMul(LHSVec, RHSVec, "dot");
+  }
+
+  // Multiply lane by lane, accumulating the products from lane 0 upwards.
+  Value *Accum = nullptr;
+  for (unsigned Lane = 0, E = VecTy->getNumElements(); Lane != E; ++Lane) {
+    Value *LHSLane = B.CreateExtractElement(LHSVec, B.getInt32(Lane), "lhs");
+    Value *RHSLane = B.CreateExtractElement(RHSVec, B.getInt32(Lane), "rhs");
+    Value *Product = B.CreateFMul(LHSLane, RHSLane, "dot");
+    Accum = Accum ? B.CreateFAdd(Accum, Product, "dot") : Product;
+  }
+  return Accum;
+}
+
+/// @brief Emits 'cross' inline from scalar multiplies and subtractions.
+///
+/// Only the first three lanes take part in the product. For four-lane
+/// operands the fourth result lane is set to zero.
+///
+/// @param[in] B Builder used to emit instructions.
+/// @param[in] Args The two vector operands of the cross product.
+///
+/// @return The cross product, or null unless exactly two arguments are
+/// given and the first is a fixed vector with at least three lanes.
+Value *CLBuiltinInfo::emitBuiltinInlineCross(IRBuilder<> &B,
+                                             ArrayRef<Value *> Args) {
+  if (Args.size() != 2) {
+    return nullptr;
+  }
+  Value *Src0 = Args[0];
+  Value *Src1 = Args[1];
+  auto *RetTy = dyn_cast<FixedVectorType>(Src0->getType());
+  if (!RetTy) {
+    return nullptr;
+  }
+  // Lanes 0..2 are extracted unconditionally below, so narrower vectors
+  // cannot be handled.
+  if (RetTy->getNumElements() < 3) {
+    return nullptr;
+  }
+
+  SmallVector<Value *, 4> Src0Lanes;
+  SmallVector<Value *, 4> Src1Lanes;
+  for (unsigned i = 0; i < 3; i++) {
+    Src0Lanes.push_back(B.CreateExtractElement(Src0, B.getInt32(i)));
+    Src1Lanes.push_back(B.CreateExtractElement(Src1, B.getInt32(i)));
+  }
+
+  // Pairs of source lane indices, one pair per result lane:
+  //   result[i] = a[Idx0] * b[Idx1] - a[Idx1] * b[Idx0]
+  const int SrcIndices[] = {1, 2, 2, 0, 0, 1};
+  Value *Result = UndefValue::get(RetTy);
+  for (unsigned i = 0; i < 3; i++) {
+    const int Idx0 = SrcIndices[(i * 2) + 0];
+    const int Idx1 = SrcIndices[(i * 2) + 1];
+    Value *TempA = B.CreateFMul(Src0Lanes[Idx0], Src1Lanes[Idx1]);
+    Value *TempB = B.CreateFMul(Src0Lanes[Idx1], Src1Lanes[Idx0]);
+    Value *Lane = B.CreateFSub(TempA, TempB);
+    Result = B.CreateInsertElement(Result, Lane, B.getInt32(i));
+  }
+  if (RetTy->getNumElements() == 4) {
+    // Fill the fourth lane with zero instead of leaving it undefined.
+    Type *EleTy = RetTy->getElementType();
+    Result = B.CreateInsertElement(Result, Constant::getNullValue(EleTy),
+                                   B.getInt32(3));
+  }
+  return Result;
+}
+
+/// @brief Emits 'length' inline.
+///
+/// A scalar operand is lowered to a call to 'fabs'. A vector operand is
+/// lowered to 'sqrt' of the operand's dot product with itself; if 'isinf'
+/// reports that result as infinite, the largest absolute lane value is
+/// selected instead.
+///
+/// @param[in] B Builder used to emit instructions. It must have an insertion
+/// block that belongs to a function inside a module.
+/// @param[in] Args The single operand.
+///
+/// @return The length, or null if the argument count is not one, the builder
+/// has no enclosing module, or a required builtin cannot be materialized.
+Value *CLBuiltinInfo::emitBuiltinInlineLength(IRBuilder<> &B,
+                                              ArrayRef<Value *> Args) {
+  if (Args.size() != 1) {
+    return nullptr;
+  }
+  Value *Src0 = Args[0];
+
+  NameMangler Mangler(&B.getContext());
+  Type *SrcType = Src0->getType();
+  auto *SrcVecType = dyn_cast<FixedVectorType>(SrcType);
+  if (SrcVecType) {
+    SrcType = SrcVecType->getElementType();
+  }
+
+  // All helper builtins are looked up in their scalar form: one unqualified
+  // parameter of the element type.
+  TypeQualifiers SrcQuals;
+  SmallVector<Type *, 4> Tys;
+  SmallVector<TypeQualifiers, 4> Quals;
+  SrcQuals.push_back(eTypeQualNone);
+  Tys.push_back(SrcType);
+  Quals.push_back(SrcQuals);
+
+  // The helpers are materialized into the module being built.
+  BasicBlock *BB = B.GetInsertBlock();
+  if (!BB) {
+    return nullptr;
+  }
+  Function *F = BB->getParent();
+  if (!F) {
+    return nullptr;
+  }
+  Module *M = F->getParent();
+  if (!M) {
+    return nullptr;
+  }
+
+  // Materialize 'fabs'.
+  const std::string FabsName = Mangler.mangleName("fabs", Tys, Quals);
+  Function *Fabs = materializeBuiltin(FabsName, M);
+  if (!Fabs) {
+    return nullptr;
+  }
+  if (!SrcVecType) {
+    // The "length" of a scalar is just the absolute value.
+    return CreateBuiltinCall(B, Fabs, Src0, "scalar_length");
+  }
+
+  // From here on the operand is known to be a vector.
+  // Materialize 'sqrt' and 'isinf'.
+  const std::string SqrtName = Mangler.mangleName("sqrt", Tys, Quals);
+  Function *Sqrt = materializeBuiltin(SqrtName, M);
+  if (!Sqrt) {
+    return nullptr;
+  }
+
+  const std::string IsInfName = Mangler.mangleName("isinf", Tys, Quals);
+  Function *IsInf = materializeBuiltin(IsInfName, M);
+  if (!IsInf) {
+    return nullptr;
+  }
+  Tys.clear();
+  Quals.clear();
+
+  // Materialize 'fmax', which takes two parameters of the element type.
+  Tys.push_back(SrcType);
+  Quals.push_back(SrcQuals);
+  Tys.push_back(SrcType);
+  Quals.push_back(SrcQuals);
+  const std::string FmaxName = Mangler.mangleName("fmax", Tys, Quals);
+  Function *Fmax = materializeBuiltin(FmaxName, M);
+  if (!Fmax) {
+    return nullptr;
+  }
+
+  // Emit sqrt(dot(src, src)).
+  SmallVector<Value *, 4> Ops;
+  Ops.push_back(Src0);
+  Ops.push_back(Src0);
+  Value *Result = emitBuiltinInline(eCLBuiltinDot, B, Ops);
+  Result = CreateBuiltinCall(B, Sqrt, Result, "result");
+
+  // Handle the case where the result is infinite: fall back to the largest
+  // absolute lane value, folded with 'fmax' starting from zero.
+  Value *AltResult = ConstantFP::get(SrcType, 0.0);
+  for (unsigned i = 0; i < SrcVecType->getNumElements(); i++) {
+    Value *SrcLane = B.CreateExtractElement(Src0, B.getInt32(i), "src_lane");
+    SrcLane = CreateBuiltinCall(B, Fabs, SrcLane, "src_lane");
+    AltResult = CreateBuiltinCall(B, Fmax, {SrcLane, AltResult}, "alt_result");
+  }
+  // Keep the computed result when 'isinf' returns zero for it.
+  Value *Cond = CreateBuiltinCall(B, IsInf, Result, "cond");
+  Cond = B.CreateICmpEQ(Cond, B.getInt32(0), "cmp");
+  Result = B.CreateSelect(Cond, Result, AltResult, "final_result");
+  return Result;
+}
+
+/// @brief Emits 'normalize' inline.
+///
+/// A scalar operand is lowered to a call to 'sign'. A vector operand is
+/// multiplied by 'rsqrt' of its dot product with itself, splatted across all
+/// lanes.
+///
+/// @param[in] B Builder used to emit instructions. It must have an insertion
+/// block that belongs to a function inside a module.
+/// @param[in] Args The single operand.
+///
+/// @return The normalized operand, or null if the argument count is not one,
+/// the builder has no enclosing module, or a required builtin cannot be
+/// materialized.
+Value *CLBuiltinInfo::emitBuiltinInlineNormalize(IRBuilder<> &B,
+                                                 ArrayRef<Value *> Args) {
+  if (Args.size() != 1) {
+    return nullptr;
+  }
+
+  Value *Src0 = Args[0];
+
+  NameMangler Mangler(&B.getContext());
+  Type *SrcType = Src0->getType();
+  auto *SrcVecType = dyn_cast<FixedVectorType>(SrcType);
+  if (SrcVecType) {
+    SrcType = SrcVecType->getElementType();
+  }
+
+  // The helper builtins are looked up in their scalar form: one unqualified
+  // parameter of the element type.
+  TypeQualifiers SrcQuals;
+  SmallVector<Type *, 4> Tys;
+  SmallVector<TypeQualifiers, 4> Quals;
+  SrcQuals.push_back(eTypeQualNone);
+  Tys.push_back(SrcType);
+  Quals.push_back(SrcQuals);
+
+  // The helpers are materialized into the module being built.
+  BasicBlock *BB = B.GetInsertBlock();
+  if (!BB) {
+    return nullptr;
+  }
+  Function *F = BB->getParent();
+  if (!F) {
+    return nullptr;
+  }
+  Module *M = F->getParent();
+  if (!M) {
+    return nullptr;
+  }
+
+  if (!SrcVecType) {
+    // A normalized scalar is either 1.0 or -1.0, unless the input was NaN, or
+    // in other words, just the sign.
+    const std::string SignName = Mangler.mangleName("sign", Tys, Quals);
+    Function *Sign = materializeBuiltin(SignName, M);
+    if (!Sign) {
+      return nullptr;
+    }
+    return CreateBuiltinCall(B, Sign, Src0, "scalar_normalize");
+  }
+
+  // From here on the operand is known to be a vector.
+  // Materialize 'rsqrt'.
+  const std::string RSqrtName = Mangler.mangleName("rsqrt", Tys, Quals);
+  Function *RSqrt = materializeBuiltin(RSqrtName, M);
+  if (!RSqrt) {
+    return nullptr;
+  }
+
+  // Call 'dot' on the input.
+  SmallVector<Value *, 4> DotArgs;
+  DotArgs.push_back(Src0);
+  DotArgs.push_back(Src0);
+  Value *Result = emitBuiltinInlineDot(B, DotArgs);
+  Result = CreateBuiltinCall(B, RSqrt, Result, "normalize");
+  // Broadcast the scalar factor so it can scale every lane of the input.
+  Result = B.CreateVectorSplat(SrcVecType->getNumElements(), Result);
+  Result = B.CreateFMul(Result, Src0, "normalized");
+  return Result;
+}
+
+/// @brief Shared lowering of the 'all' and 'any' builtins.
+///
+/// The lanes of the operand are combined with ReduceOp, then the most
+/// significant bit of the combined value is shifted down to bit zero and the
+/// result is converted to a 32-bit integer.
+///
+/// @param[in] B Builder used to emit instructions.
+/// @param[in] Args The single integer (scalar or vector) operand.
+/// @param[in] ReduceOp Binary operation used to combine the lanes.
+///
+/// @return The 32-bit reduction result, or null unless exactly one argument
+/// with an integer scalar type is given.
+static Value *emitAllAnyReduction(IRBuilder<> &B, ArrayRef<Value *> Args,
+                                  Instruction::BinaryOps ReduceOp) {
+  if (Args.size() != 1) {
+    return nullptr;
+  }
+  Value *const Input = Args[0];
+  Type *const InputTy = Input->getType();
+  auto *const LaneTy = dyn_cast<IntegerType>(InputTy->getScalarType());
+  if (!LaneTy) {
+    return nullptr;
+  }
+
+  // Fold all vector lanes into one value; a non-vector operand is used as
+  // is. Only the MSB of the folded value matters afterwards.
+  Value *Folded = Input;
+  if (auto *const VecTy = dyn_cast<FixedVectorType>(InputTy)) {
+    Folded = B.CreateExtractElement(Input, B.getInt32(0));
+    for (unsigned Lane = 1, E = VecTy->getNumElements(); Lane != E; ++Lane) {
+      Value *Next = B.CreateExtractElement(Input, B.getInt32(Lane));
+      Folded = B.CreateBinOp(ReduceOp, Folded, Next);
+    }
+  }
+
+  // Move the MSB down to bit zero so that the result is either 0 or 1.
+  const unsigned MSBIndex = LaneTy->getPrimitiveSizeInBits() - 1;
+  Value *const MSB = B.CreateLShr(Folded, ConstantInt::get(LaneTy, MSBIndex));
+  return B.CreateZExtOrTrunc(MSB, B.getInt32Ty());
+}
+
+/// @brief Emits 'all' inline by AND-reducing the lanes of the operand.
+///
+/// @param[in] B Builder used to emit instructions.
+/// @param[in] Args The single operand.
+///
+/// @return The reduction result, or null if the reduction cannot be emitted.
+Value *CLBuiltinInfo::emitBuiltinInlineAll(IRBuilder<> &B,
+                                           ArrayRef<Value *> Args) {
+  // The reduced MSB is only set if it is set in every lane.
+  const Instruction::BinaryOps Combine = Instruction::And;
+  return emitAllAnyReduction(B, Args, Combine);
+}
+
+/// @brief Emits 'any' inline by OR-reducing the lanes of the operand.
+///
+/// @param[in] B Builder used to emit instructions.
+/// @param[in] Args The single operand.
+///
+/// @return The reduction result, or null if the reduction cannot be emitted.
+Value *CLBuiltinInfo::emitBuiltinInlineAny(IRBuilder<> &B,
+                                           ArrayRef<Value *> Args) {
+  // The reduced MSB is set as soon as it is set in at least one lane.
+  const Instruction::BinaryOps Combine = Instruction::Or;
+  return emitAllAnyReduction(B, Args, Combine);
+}
+
+/// @brief Emits 'select' inline.
+///
+/// When the builtin returns a fixed vector, each lane is chosen by the most
+/// significant bit of the matching condition lane, using bitwise
+/// operations. Otherwise the second operand is chosen when the condition is
+/// non-zero.
+///
+/// @param[in] F The 'select' builtin being inlined; must not be null.
+/// @param[in] B Builder used to emit instructions.
+/// @param[in] Args Operands in the order: false value, true value, condition.
+///
+/// @return The selected value, or null unless both the builtin and the call
+/// have exactly three operands.
+Value *CLBuiltinInfo::emitBuiltinInlineSelect(Function *F, IRBuilder<> &B,
+                                              ArrayRef<Value *> Args) {
+  // Args is indexed below, so the supplied operands have to be validated as
+  // well as the builtin's signature.
+  if (F->arg_size() != 3 || Args.size() != 3) {
+    return nullptr;
+  }
+  Value *FalseVal = Args[0];
+  Value *TrueVal = Args[1];
+  Value *Cond = Args[2];
+  Type *RetTy = F->getReturnType();
+  auto *VecRetTy = dyn_cast<FixedVectorType>(RetTy);
+  Type *CondEleTy = Cond->getType()->getScalarType();
+  const unsigned CondEleBits = CondEleTy->getPrimitiveSizeInBits();
+  if (VecRetTy) {
+    // Smear each condition lane's MSB across the whole lane with an
+    // arithmetic shift, giving a mask of all ones or all zeros.
+    const unsigned SimdWidth = VecRetTy->getNumElements();
+    Constant *ShiftAmount = ConstantInt::get(CondEleTy, CondEleBits - 1);
+    Constant *VecShiftAmount = ConstantVector::getSplat(
+        ElementCount::getFixed(SimdWidth), ShiftAmount);
+    Value *Mask = B.CreateAShr(Cond, VecShiftAmount);
+    // The bitwise blend needs integer operands; reinterpret floating-point
+    // values as integers of the same width.
+    Value *TrueValRaw = TrueVal;
+    Value *FalseValRaw = FalseVal;
+    if (VecRetTy->getElementType()->isFloatingPointTy()) {
+      auto *RawType = FixedVectorType::getInteger(VecRetTy);
+      TrueValRaw = B.CreateBitCast(TrueVal, RawType);
+      FalseValRaw = B.CreateBitCast(FalseVal, RawType);
+    }
+    // ((T ^ F) & Mask) ^ F yields T where the mask is set and F elsewhere.
+    Value *Result = B.CreateXor(TrueValRaw, FalseValRaw);
+    Result = B.CreateAnd(Result, Mask);
+    Result = B.CreateXor(Result, FalseValRaw);
+    if (Result->getType() != VecRetTy) {
+      Result = B.CreateBitCast(Result, VecRetTy);
+    }
+    return Result;
+  } else {
+    // Any non-zero condition selects the true value.
+    Value *Cmp = B.CreateICmpNE(Cond, Constant::getNullValue(CondEleTy));
+    return B.CreateSelect(Cmp, TrueVal, FalseVal);
+  }
+}
+
+/// @brief Emit the body of a builtin function as a call to a binary LLVM
+/// intrinsic. When exactly one operand is a vector, the scalar operand is
+/// splatted to the vector's element count. Never inlines on arm/aarch64.
+///
+/// @param[in] B Builder used to emit instructions.
+/// @param[in] LHS First operand passed to the intrinsic.
+/// @param[in] RHS Second operand passed to the intrinsic.
+/// @param[in] ID The LLVM intrinsic ID.
+///
+/// @return Value produced by the intrinsic, or null when targeting arm/aarch64.
+Value *CLBuiltinInfo::emitBuiltinInlineAsLLVMBinaryIntrinsic(
+    IRBuilder<> &B, Value *LHS, Value *RHS, llvm::Intrinsic::ID ID) {
+  const Triple TheTriple(B.GetInsertBlock()->getModule()->getTargetTriple());
+  const auto Arch = TheTriple.getArch();
+  if (Arch == Triple::arm || Arch == Triple::aarch64) {
+    // fmin and fmax fail CTS on arm targets.
+    // This is a HACK and should be removed when CA-3595 is resolved.
+    return nullptr;
+  }
+
+  const auto *const LTy = LHS->getType();
+  const auto *const RTy = RHS->getType();
+  const bool LHSIsVec = LTy->isVectorTy();
+  if (LHSIsVec != RTy->isVectorTy()) {
+    // Mixed scalar/vector operands: widen whichever side is the scalar.
+    if (LHSIsVec) {
+      RHS = B.CreateVectorSplat(multi_llvm::getVectorElementCount(LTy), RHS);
+    } else {
+      LHS = B.CreateVectorSplat(multi_llvm::getVectorElementCount(RTy), LHS);
+    }
+  }
+  return B.CreateBinaryIntrinsic(ID, LHS, RHS);
+}
+
+/// @brief Emit the body of the 'as_*' builtin function as a bitcast, first
+/// padding or truncating with a shuffle for vec3 <-> vec4 reinterpretations.
+/// @param[in] F Function to emit the body inline.
+/// @param[in] B Builder used to emit instructions.
+/// @param[in] Args Arguments passed to the function; exactly one is expected.
+///
+/// @return Value returned by the builtin implementation or null on failure.
+Value *CLBuiltinInfo::emitBuiltinInlineAs(Function *F, llvm::IRBuilder<> &B,
+                                          llvm::ArrayRef<Value *> Args) {
+  if (Args.size() != 1) {
+    return nullptr;
+  }
+  Value *Src = Args[0];
+  Type *SrcTy = Src->getType();
+  Type *DstTy = F->getReturnType();
+  auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcTy);
+  auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy);
+  Type *SrcEleTy = SrcVecTy ? SrcVecTy->getElementType() : nullptr;
+  Type *DstEleTy = DstVecTy ? DstVecTy->getElementType() : nullptr;
+  const unsigned SrcEleBits = SrcEleTy ? SrcEleTy->getPrimitiveSizeInBits() : 0;
+  const unsigned DstEleBits = DstEleTy ? DstEleTy->getPrimitiveSizeInBits() : 0;
+  const bool SrcDstHaveSameWidth =
+      SrcEleTy && DstEleTy && (SrcEleBits == DstEleBits);
+  const bool SrcVec3 = SrcVecTy && (SrcVecTy->getNumElements() == 3);
+  const bool SrcVec4 = SrcVecTy && (SrcVecTy->getNumElements() == 4);
+  const bool DstVec3 = DstVecTy && (DstVecTy->getNumElements() == 3);
+  const bool DstVec4 = DstVecTy && (DstVecTy->getNumElements() == 4);
+  bool LowerAsShuffle = false;
+  if (SrcVec3 && !DstVec3) {
+    if (!DstVec4 || !SrcDstHaveSameWidth) {
+      return nullptr;
+    }
+    LowerAsShuffle = true;
+  } else if (DstVec3 && !SrcVec3) {
+    if (!SrcVec4 || !SrcDstHaveSameWidth) {
+      return nullptr;
+    }
+    LowerAsShuffle = true;
+  }
+  // A vec3 on only one side is handled just for a vec4 partner with the same
+  // element width: copy the common lanes; a fourth output lane is undef.
+  if (LowerAsShuffle) {
+    SmallVector<Constant *, 4> Indices;
+    for (unsigned i = 0; i < DstVecTy->getNumElements(); i++) {
+      if (i < SrcVecTy->getNumElements()) {
+        Indices.push_back(B.getInt32(i));
+      } else {
+        Indices.push_back(UndefValue::get(B.getInt32Ty()));
+      }
+    }
+    Value *Mask = ConstantVector::get(Indices);
+    Src = B.CreateShuffleVector(Src, UndefValue::get(SrcVecTy), Mask);
+  }
+
+  // Finally (and in the common case) as_* is a simple bitcast.
+  return B.CreateBitCast(Src, DstTy, "as");
+}
+
+/// @brief Emit the body of the 'convert_*' builtin functions as a single cast.
+/// Only the char/short/int/long destinations (signed or unsigned) are handled.
+/// @param[in] F the function to emit inline.
+/// @param[in] builtinID Builtin ID of the function.
+/// @param[in] B Builder used to emit instructions.
+/// @param[in] Args Arguments passed to the function.
+///
+/// @return Value returned by the builtin implementation or null on failure.
+Value *CLBuiltinInfo::emitBuiltinInlineConvert(Function *F, BuiltinID builtinID,
+                                               IRBuilder<> &B,
+                                               ArrayRef<Value *> Args) {
+  if (Args.size() != 1) {
+    return nullptr;
+  }
+  Type *DstTy = nullptr;
+  bool DstIsSigned = false;
+  auto &Ctx = B.getContext();
+  switch (builtinID) {
+    case eCLBuiltinConvertChar:
+      DstIsSigned = true;
+      LLVM_FALLTHROUGH;
+    case eCLBuiltinConvertUChar:
+      DstTy = IntegerType::getInt8Ty(Ctx);
+      break;
+    case eCLBuiltinConvertShort:
+      DstIsSigned = true;
+      LLVM_FALLTHROUGH;
+    case eCLBuiltinConvertUShort:
+      DstTy = IntegerType::getInt16Ty(Ctx);
+      break;
+    case eCLBuiltinConvertInt:
+      DstIsSigned = true;
+      LLVM_FALLTHROUGH;
+    case eCLBuiltinConvertUInt:
+      DstTy = IntegerType::getInt32Ty(Ctx);
+      break;
+    case eCLBuiltinConvertLong:
+      DstIsSigned = true;
+      LLVM_FALLTHROUGH;
+    case eCLBuiltinConvertULong:
+      DstTy = IntegerType::getInt64Ty(Ctx);
+      break;
+
+    default:
+      return nullptr;
+  }
+  if (!DstTy) {
+    return nullptr;
+  }
+  // Work out whether the source operand is signed.
+  Value *Src = Args[0];
+  bool SrcIsSigned;
+  if (Src->getType()->isFloatingPointTy()) {
+    // All floating point types are signed
+    SrcIsSigned = true;
+  } else {
+    auto IsParamSignedOrNone = paramHasTypeQual(*F, 0, eTypeQualSignedInt);
+    if (!IsParamSignedOrNone) {
+      return nullptr;
+    }
+    SrcIsSigned = *IsParamSignedOrNone;
+  }
+  // Let LLVM pick the cast opcode from the two types and their signedness.
+  auto Opcode = CastInst::getCastOpcode(Src, SrcIsSigned, DstTy, DstIsSigned);
+  return B.CreateCast(Opcode, Src, DstTy, "inline_convert");
+}
+
+/// @brief Emit the body of the 'vloadN' builtin function.
+///
+/// @param[in] F Function to emit the body inline; supplies the result type.
+/// @param[in] Width Number of elements to load; must be at least 2.
+/// @param[in] B Builder used to emit instructions.
+/// @param[in] Args Arguments passed to the function: offset, then pointer.
+///
+/// @return The loaded vector, or null if the builtin cannot be inlined.
+Value *CLBuiltinInfo::emitBuiltinInlineVLoad(Function *F, unsigned Width,
+                                             IRBuilder<> &B,
+                                             ArrayRef<Value *> Args) {
+  if (Width < 2 || Args.size() < 2) {
+    return nullptr;
+  }
+
+  // F's return type fixes the element type of the load.
+  Type *RetTy = F->getReturnType();
+  assert(isa<FixedVectorType>(RetTy) && "vloadN must return a vector type");
+  Type *EltTy = RetTy->getScalarType();
+
+  Value *Ptr = Args[1];
+  PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
+  if (!PtrTy) {
+    return nullptr;
+  }
+  auto *DataTy = FixedVectorType::get(EltTy, Width);
+  Value *Data = UndefValue::get(DataTy);
+
+  // Emit the base pointer: Offset counts whole vectors, so scale by Width.
+  Value *Offset = Args[0];
+  IntegerType *OffsetTy = dyn_cast<IntegerType>(Offset->getType());
+  if (!OffsetTy) {
+    return nullptr;
+  }
+  Value *Stride = ConstantInt::get(OffsetTy, Width);
+  Offset = B.CreateMul(Offset, Stride);
+  Value *GEPBase = B.CreateGEP(EltTy, Ptr, Offset, "vload_base");
+  // vload3 is emitted lane by lane; other widths use one vector load.
+  if (Width == 3) {
+    for (unsigned i = 0; i < Width; i++) {
+      Value *Index = B.getInt32(i);
+      Value *GEP = B.CreateGEP(EltTy, GEPBase, Index);
+      Value *Lane = B.CreateLoad(EltTy, GEP, false, "vload");
+      Data = B.CreateInsertElement(Data, Lane, Index, "vload_insert");
+    }
+  } else {
+    PointerType *VecPtrTy = DataTy->getPointerTo(PtrTy->getAddressSpace());
+    Value *VecBase = B.CreateBitCast(GEPBase, VecPtrTy, "vload_ptr");
+    auto *Load = B.CreateLoad(DataTy, VecBase, false, "vload");
+    // Only element alignment is assumed for the vector load.
+    const unsigned Align = DataTy->getScalarSizeInBits() / 8;
+    Load->setAlignment(MaybeAlign(Align).valueOrOne());
+    Data = Load;
+  }
+
+  return Data;
+}
+
+/// @brief Emit the body of the 'vstoreN' builtin function.
+///
+/// @param[in] F Function to emit the body inline (unused).
+/// @param[in] Width Number of elements to store; must be at least 2.
+/// @param[in] B Builder used to emit instructions.
+/// @param[in] Args Arguments passed to the function: data, offset, pointer.
+///
+/// @return The last store emitted, or null if the builtin cannot be inlined.
+Value *CLBuiltinInfo::emitBuiltinInlineVStore(Function *F, unsigned Width,
+                                              IRBuilder<> &B,
+                                              ArrayRef<Value *> Args) {
+  if (Width < 2 || Args.size() < 3) {
+    return nullptr;
+  }
+  (void)F;
+
+  Value *Data = Args[0];
+  auto *VecDataTy = dyn_cast<FixedVectorType>(Data->getType());
+  if (!VecDataTy || (VecDataTy->getNumElements() != Width)) {
+    return nullptr;
+  }
+
+  Value *Ptr = Args[2];
+  PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
+  if (!PtrTy) {
+    return nullptr;
+  }
+
+  // Emit the base pointer: Offset counts whole vectors, so scale by Width.
+  Value *Offset = Args[1];
+  IntegerType *OffsetTy = dyn_cast<IntegerType>(Offset->getType());
+  if (!OffsetTy) {
+    return nullptr;
+  }
+  Value *Stride = ConstantInt::get(OffsetTy, Width);
+  Offset = B.CreateMul(Offset, Stride);
+  Value *GEPBase =
+      B.CreateGEP(VecDataTy->getElementType(), Ptr, Offset, "vstore_base");
+
+  // Emit store(s): vstore3 lane by lane, other widths as one vector store.
+  StoreInst *Store = nullptr;
+  if (Width == 3) {
+    for (unsigned i = 0; i < Width; i++) {
+      Value *Index = B.getInt32(i);
+      Value *Lane = B.CreateExtractElement(Data, Index, "vstore_extract");
+      Value *GEP = B.CreateGEP(VecDataTy->getElementType(), GEPBase, Index);
+      Store = B.CreateStore(Lane, GEP, false);
+    }
+  } else {
+    PointerType *VecPtrTy = VecDataTy->getPointerTo(PtrTy->getAddressSpace());
+    Value *VecBase = B.CreateBitCast(GEPBase, VecPtrTy, "vstore_ptr");
+    Store = B.CreateStore(Data, VecBase, false);
+    // Only element alignment is assumed for the vector store.
+    const unsigned Align = VecDataTy->getScalarSizeInBits() / 8;
+    Store->setAlignment(MaybeAlign(Align).valueOrOne());
+  }
+  return Store;
+}
+
+/// @brief Emit the body of the scalar 'vload_half' builtin function: load a
+/// 16-bit value and convert it to float with the half-to-float builtin.
+/// @param[in] F Function to emit the body inline.
+/// @param[in] B Builder used to emit instructions.
+/// @param[in] Args Arguments passed to the function: offset, then pointer.
+///
+/// @return The converted float, or null if the builtin cannot be inlined.
+Value *CLBuiltinInfo::emitBuiltinInlineVLoadHalf(Function *F, IRBuilder<> &B,
+                                                 ArrayRef<Value *> Args) {
+  // Only the scalar form is inlined; vectors are detected on the return type.
+  if (Args.size() < 2 || F->getReturnType()->isVectorTy()) {
+    return nullptr;
+  }
+  Value *Ptr = Args[1];
+  PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
+  if (!PtrTy) {
+    return nullptr;
+  }
+
+  // Declare the conversion builtin first so a failure leaves no stray load.
+  Module *M = F->getParent();
+  Function *HalfToFloatFn =
+      declareBuiltin(M, eCLBuiltinConvertHalfToFloat, B.getFloatTy(),
+                     {B.getInt16Ty()}, {eTypeQualNone});
+  if (!HalfToFloatFn) {
+    return nullptr;
+  }
+
+  // Cast the pointer to ushort*.
+  Type *U16Ty = B.getInt16Ty();
+  Type *U16PtrTy = PointerType::get(U16Ty, PtrTy->getAddressSpace());
+  Value *DataPtr = B.CreateBitCast(Ptr, U16PtrTy);
+
+  // Emit the base pointer.
+  Value *Offset = Args[0];
+  DataPtr = B.CreateGEP(U16Ty, DataPtr, Offset, "vload_base");
+
+  // Load a ushort.
+  Value *Data = B.CreateLoad(B.getInt16Ty(), DataPtr, "vload_half");
+
+  // Convert it to float.
+  CallInst *CI = CreateBuiltinCall(B, HalfToFloatFn, {Data});
+  CI->setCallingConv(F->getCallingConv());
+  return CI;
+}
+
+/// @brief Emit the body of the scalar 'vstore_half' builtin function.
+///
+/// @param[in] F Function to emit the body inline.
+/// @param[in] Mode Rounding mode to use, e.g. '_rte'.
+/// @param[in] B Builder used to emit instructions.
+/// @param[in] Args Arguments passed to the function: data, offset, pointer.
+///
+/// @return The emitted store, or null if the builtin cannot be inlined.
+Value *CLBuiltinInfo::emitBuiltinInlineVStoreHalf(Function *F, StringRef Mode,
+                                                  IRBuilder<> &B,
+                                                  ArrayRef<Value *> Args) {
+  Value *Data = Args.size() < 3 ? nullptr : Args[0];
+  if (!Data || Data->getType()->isVectorTy()) {
+    return nullptr;
+  }
+
+  // Validate the destination first so that no IR is emitted on failure.
+  Value *Ptr = Args[2];
+  PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
+  if (!PtrTy) {
+    return nullptr;
+  }
+
+  // Select the conversion builtin for the data type and rounding mode.
+  BuiltinID ConvID;
+  if (Data->getType() == B.getFloatTy()) {
+    ConvID = StringSwitch<BuiltinID>(Mode)
+                 .Case("", eCLBuiltinConvertFloatToHalf)
+                 .Case("_rte", eCLBuiltinConvertFloatToHalfRte)
+                 .Case("_rtz", eCLBuiltinConvertFloatToHalfRtz)
+                 .Case("_rtp", eCLBuiltinConvertFloatToHalfRtp)
+                 .Case("_rtn", eCLBuiltinConvertFloatToHalfRtn)
+                 .Default(eBuiltinInvalid);
+  } else {
+    ConvID = StringSwitch<BuiltinID>(Mode)
+                 .Case("", eCLBuiltinConvertDoubleToHalf)
+                 .Case("_rte", eCLBuiltinConvertDoubleToHalfRte)
+                 .Case("_rtz", eCLBuiltinConvertDoubleToHalfRtz)
+                 .Case("_rtp", eCLBuiltinConvertDoubleToHalfRtp)
+                 .Case("_rtn", eCLBuiltinConvertDoubleToHalfRtn)
+                 .Default(eBuiltinInvalid);
+  }
+  if (ConvID == eBuiltinInvalid) {
+    return nullptr;
+  }
+  Module *M = F->getParent();
+
+  // vstore_half normally takes a float, but with the double extension enabled
+  // it may take a double instead, so the conversion is declared for whichever
+  // of the two types Data actually has.
+  Function *FloatToHalfFn = declareBuiltin(M, ConvID, B.getInt16Ty(),
+                                           {Data->getType()}, {eTypeQualNone});
+  if (!FloatToHalfFn) {
+    return nullptr;
+  }
+
+  // Convert the data from float/double to half.
+  CallInst *CI = CreateBuiltinCall(B, FloatToHalfFn, {Data});
+  CI->setCallingConv(F->getCallingConv());
+  Data = CI;
+
+  // Cast the pointer to ushort*.
+  auto U16Ty = B.getInt16Ty();
+  Type *U16PtrTy = PointerType::get(U16Ty, PtrTy->getAddressSpace());
+  Value *DataPtr = B.CreateBitCast(Ptr, U16PtrTy);
+
+  // Emit the base pointer.
+  Value *Offset = Args[1];
+  DataPtr = B.CreateGEP(U16Ty, DataPtr, Offset, "vstore_base");
+
+  // Store the ushort. Note CreateStore takes 'isVolatile', not a name.
+  return B.CreateStore(Data, DataPtr);
+}
+
+/// @brief Emit the body of a relational builtin function.
+///
+/// This function handles relational builtins that accept two arguments, such as
+/// the comparison builtins. Scalar forms yield 0 or 1; vector forms yield 0 or
+/// -1 (all bits set) in each lane.
+/// @param[in] BuiltinID Identifier of the builtin to emit the body inline.
+/// @param[in] B Builder used to emit instructions.
+/// @param[in] Args Arguments passed to the function.
+///
+/// @return Value returned by the builtin implementation or null on failure.
+Value *CLBuiltinInfo::emitBuiltinInlineRelationalsWithTwoArguments(
+    BuiltinID BuiltinID, IRBuilder<> &B, ArrayRef<Value *> Args) {
+  CmpInst::Predicate Pred = CmpInst::FCMP_FALSE;
+  CmpInst::Predicate Pred2 = CmpInst::FCMP_FALSE;
+  switch (BuiltinID) {
+    default:
+      return nullptr;
+    case eCLBuiltinIsEqual:
+      Pred = CmpInst::FCMP_OEQ;
+      break;
+    case eCLBuiltinIsNotEqual:
+      Pred = CmpInst::FCMP_UNE;
+      break;
+    case eCLBuiltinIsGreater:
+      Pred = CmpInst::FCMP_OGT;
+      break;
+    case eCLBuiltinIsGreaterEqual:
+      Pred = CmpInst::FCMP_OGE;
+      break;
+    case eCLBuiltinIsLess:
+      Pred = CmpInst::FCMP_OLT;
+      break;
+    case eCLBuiltinIsLessEqual:
+      Pred = CmpInst::FCMP_OLE;
+      break;
+    case eCLBuiltinIsLessGreater:
+      Pred = CmpInst::FCMP_OLT;
+      Pred2 = CmpInst::FCMP_OGT;
+      break;
+    case eCLBuiltinIsOrdered:
+      Pred = CmpInst::FCMP_ORD;
+      break;
+    case eCLBuiltinIsUnordered:
+      Pred = CmpInst::FCMP_UNO;
+      break;
+  }
+
+  if (Args.size() != 2) {
+    return nullptr;
+  }
+  Value *Src0 = Args[0], *Src1 = Args[1];
+  Value *Cmp = B.CreateFCmp(Pred, Src0, Src1, "relational");
+  // Choose the integer element type of the result.
+  Type *ResultEleTy = nullptr;
+  Type *Src0Ty = Src0->getType();
+  if (Src0->getType() == B.getDoubleTy()) {
+    // relational(double, double) returns int. NOTE(review): Src0Ty is the
+    // scalar double type here, so the vector branch below looks unreachable.
+    if (Src0Ty->isVectorTy()) {
+      ResultEleTy = B.getInt64Ty();
+    } else {
+      ResultEleTy = B.getInt32Ty();
+    }
+  } else if (Src0->getType() == B.getHalfTy()) {
+    // relational(half, half) returns int. NOTE(review): Src0Ty is the scalar
+    // half type here, so the vector branch below looks unreachable.
+    if (Src0Ty->isVectorTy()) {
+      ResultEleTy = B.getInt16Ty();
+    } else {
+      ResultEleTy = B.getInt32Ty();
+    }
+  } else {
+    // Everything else, vectors included: an integer as wide as one element.
+    ResultEleTy = B.getIntNTy(Src0->getType()->getScalarSizeInBits());
+  }
+  Value *Result = nullptr;
+  auto *SrcVecTy = dyn_cast<FixedVectorType>(Src0->getType());
+  if (SrcVecTy) {
+    auto *ResultVecTy =
+        FixedVectorType::get(ResultEleTy, SrcVecTy->getNumElements());
+    Result = B.CreateSExt(Cmp, ResultVecTy, "relational");
+  } else {
+    Result = B.CreateZExt(Cmp, ResultEleTy, "relational");
+  }
+  // islessgreater: OR the second comparison (OGT) into the first (OLT).
+  if (Pred2 != CmpInst::FCMP_FALSE) {
+    Value *Cmp2 = B.CreateFCmp(Pred2, Src0, Src1, "relational");
+    Value *True = SrcVecTy ? Constant::getAllOnesValue(Result->getType())
+                           : ConstantInt::get(Result->getType(), 1);
+    Result = B.CreateSelect(Cmp2, True, Result);
+  }
+
+  return Result;
+}
+
+/// @brief Emit the body of a relational builtin function.
+///
+/// This function handles relational builtins that accept a single argument,
+/// such as the builtins checking if the argument is infinite or not. Only
+/// float and double arguments (scalar or vector) are supported.
+/// @param[in] BuiltinID Identifier of the builtin to emit the body inline.
+/// @param[in] B Builder used to emit instructions.
+/// @param[in] Arg Argument passed to the function.
+///
+/// @return Value returned by the builtin implementation or null on failure.
+Value *CLBuiltinInfo::emitBuiltinInlineRelationalsWithOneArgument(
+    BuiltinID BuiltinID, IRBuilder<> &B, Value *Arg) {
+  Value *Result = nullptr;
+  // The types (and misc info) that we will be using
+  Type *ArgTy = Arg->getType();
+  const bool isVectorTy = ArgTy->isVectorTy();
+  const unsigned Width =
+      isVectorTy ? multi_llvm::getVectorNumElements(ArgTy) : 0;
+  Type *ArgEleTy = isVectorTy ? multi_llvm::getVectorElementType(ArgTy) : ArgTy;
+  Type *SignedTy = ArgEleTy == B.getFloatTy() ? B.getInt32Ty() : B.getInt64Ty();
+  Type *ReturnTy = (ArgEleTy == B.getDoubleTy() && isVectorTy) ? B.getInt64Ty()
+                                                               : B.getInt32Ty();
+  // Anything other than float or double elements is not inlined.
+  if (ArgEleTy != B.getFloatTy() && ArgEleTy != B.getDoubleTy()) {
+    return nullptr;
+  }
+  // Create all the masks we are going to be using
+  Constant *ExponentMask = nullptr;
+  Constant *MantissaMask = nullptr;
+  Constant *NonSignMask = nullptr;
+  Constant *Zero = nullptr;
+  if (ArgEleTy == B.getFloatTy()) {
+    ExponentMask = B.getInt32(0x7F800000u);
+    MantissaMask = B.getInt32(0x007FFFFFu);
+    NonSignMask = B.getInt32(0x7FFFFFFFu);
+    Zero = B.getInt32(0u);
+  } else if (ArgEleTy == B.getDoubleTy()) {
+    ExponentMask = B.getInt64(0x7FF0000000000000u);
+    MantissaMask = B.getInt64(0x000FFFFFFFFFFFFFu);
+    NonSignMask = B.getInt64(0x7FFFFFFFFFFFFFFFu);
+    Zero = B.getInt64(0u);
+  }
+
+  // For the vector versions, we need to create vector types and values
+  if (isVectorTy) {
+    SignedTy = FixedVectorType::get(SignedTy, Width);
+    ReturnTy = FixedVectorType::get(ReturnTy, Width);
+    const auto EC = ElementCount::getFixed(Width);
+    ExponentMask = ConstantVector::getSplat(EC, ExponentMask);
+    MantissaMask = ConstantVector::getSplat(EC, MantissaMask);
+    NonSignMask = ConstantVector::getSplat(EC, NonSignMask);
+    Zero = ConstantVector::getSplat(EC, Zero);
+  }
+
+  // We will be needing access to the argument as an integer (bitcast) value
+  Value *STArg = B.CreateBitCast(Arg, SignedTy);
+
+  // Emit the IR that will calculate the result
+  switch (BuiltinID) {
+    default:
+      llvm_unreachable("Invalid Builtin ID");
+      break;
+    case eCLBuiltinIsFinite:
+      Result = B.CreateAnd(STArg, NonSignMask);
+      Result = B.CreateICmpSLT(Result, ExponentMask);
+      break;
+    case eCLBuiltinIsInf:
+      Result = B.CreateAnd(STArg, NonSignMask);
+      Result = B.CreateICmpEQ(Result, ExponentMask);
+      break;
+    case eCLBuiltinIsNan: {
+      Result = B.CreateAnd(STArg, NonSignMask);
+      // This checks if the exponent is all ones (the same as the ExponentMask)
+      // and also if the significand (the mantissa) is not zero. If the mantissa
+      // is zero then it would be infinite, not NaN.
+      Value *ExponentAllOnes =
+          B.CreateICmpEQ(ExponentMask, B.CreateAnd(ExponentMask, Result));
+      Value *MantissaNotZero =
+          B.CreateICmpSGT(B.CreateAnd(MantissaMask, Result), Zero);
+      Result = B.CreateAnd(ExponentAllOnes, MantissaNotZero);
+      break;
+    }
+    case eCLBuiltinIsNormal: {
+      Result = B.CreateAnd(STArg, NonSignMask);
+      Value *ExponentBitsNotAllSet = B.CreateICmpSLT(Result, ExponentMask);
+      Value *ExponentBitsNonZero = B.CreateICmpSGT(Result, MantissaMask);
+      Result = B.CreateAnd(ExponentBitsNotAllSet, ExponentBitsNonZero);
+      break;
+    }
+    case eCLBuiltinSignBit:
+      Result = B.CreateICmpSLT(STArg, Zero);
+      break;
+  }
+
+  // Convert the i1 result from the comparison instruction to the type that the
+  // builtin returns
+  if (isVectorTy) {
+    // 0 for false, -1 (all 1s) for true
+    Result = B.CreateSExt(Result, ReturnTy);
+  } else {
+    // 0 for false, 1 for true
+    Result = B.CreateZExt(Result, ReturnTy);
+  }
+
+  return Result;
+}
+
+/// @brief Emit the body of a vector shuffle builtin function.
+///
+/// @param[in] BuiltinID Identifier of the builtin to emit the body inline.
+/// @param[in] B Builder used to emit instructions.
+/// @param[in] Args Arguments passed to the function.
+///
+/// @return The shufflevector result, or null if the mask is not a constant.
+Value *CLBuiltinInfo::emitBuiltinInlineShuffle(BuiltinID BuiltinID,
+                                               IRBuilder<> &B,
+                                               ArrayRef<Value *> Args) {
+  // Make sure we have the correct number of arguments.
+  assert(((BuiltinID == eCLBuiltinShuffle && Args.size() == 2) ||
+          (BuiltinID == eCLBuiltinShuffle2 && Args.size() == 3)) &&
+         "Wrong number of arguments!");
+
+  // It is not worth splitting shuffle and shuffle2 into two functions as a lot
+  // of the code is the same.
+  const bool isShuffle2 = (BuiltinID == eCLBuiltinShuffle2);
+
+  // Get the mask and the mask type.
+  Value *Mask = Args[isShuffle2 ? 2 : 1];
+  auto MaskVecTy = cast<FixedVectorType>(Mask->getType());
+  IntegerType *MaskTy = cast<IntegerType>(MaskVecTy->getElementType());
+  const int MaskWidth = MaskVecTy->getNumElements();
+
+  // TODO: Support non-constant masks (in a less efficient way)
+  if (!isa<Constant>(Mask)) {
+    return nullptr;
+  }
+
+  // We need to mask the mask elements, since the OpenCL standard specifies that
+  // only the ilogb(2N-1) least significant bits of each mask element are used
+  // by shuffle (one bit more for shuffle2, which indexes two vectors), where N
+  // is the number of elements in the vector according to vec_step.
+  auto ShuffleTy = cast<FixedVectorType>(Args[0]->getType());
+  const int Width = ShuffleTy->getNumElements();
+  // Vectors of size 3 are not supported by the shuffle builtin.
+  assert(Width != 3 && "Invalid vector width of 3!");
+  const int N = (Width == 3 ? 4 : Width);
+  const int SignificantBits =
+      stdcompat::ilogb(2 * N - 1) + (isShuffle2 ? 1 : 0);
+  const unsigned BitMask = ~((~0u) << SignificantBits);
+  Value *BitMaskV = ConstantVector::getSplat(ElementCount::getFixed(MaskWidth),
+                                             ConstantInt::get(MaskTy, BitMask));
+  // Mask the mask first, then convert it: the builtin's mask may use other
+  // integer types, while the LLVM instruction only supports i32 (the cast is
+  // unsigned, i.e. zero-extending when widening).
+  Value *MaskedMask = B.CreateAnd(Mask, BitMaskV, "mask");
+  MaskedMask = B.CreateIntCast(
+      MaskedMask, FixedVectorType::get(B.getInt32Ty(), MaskWidth), false);
+
+  // Create the shufflevector instruction.
+  Value *Arg1 = (isShuffle2 ? Args[1] : UndefValue::get(ShuffleTy));
+  return B.CreateShuffleVector(Args[0], Arg1, MaskedMask, "shuffle");
+}
+
+Value *CLBuiltinInfo::emitBuiltinInlinePrintf(BuiltinID, IRBuilder<> &B,
+                                              ArrayRef<Value *> Args) {
+  // Forward the call to a module-level 'printf', declaring it on first use
+  // as the variadic SPIR function 'i32 (i8*, ...)'.
+  Module *const Mod = B.GetInsertBlock()->getModule();
+  Function *Callee = Mod->getFunction("printf");
+  if (Callee == nullptr) {
+    auto *const FormatTy = PointerType::getUnqual(B.getInt8Ty());
+    auto *const CalleeTy =
+        FunctionType::get(B.getInt32Ty(), {FormatTy}, /*isVarArg=*/true);
+    Callee = Function::Create(CalleeTy, GlobalValue::ExternalLinkage, "printf",
+                              Mod);
+    Callee->setCallingConv(CallingConv::SPIR_FUNC);
+  }
+  return CreateBuiltinCall(B, Callee, Args);
+}
+
+// cl_mem_fence_flags bits. Must be kept in sync with our OpenCL headers!
+enum : uint32_t {
+  CLK_LOCAL_MEM_FENCE = 1,
+  CLK_GLOBAL_MEM_FENCE = 2,
+  // FIXME: We don't support image fences in our headers
+};
+
+// memory_scope values. Must be kept in sync with our OpenCL headers!
+enum : uint32_t {
+  memory_scope_work_item = 1,
+  memory_scope_sub_group = 2,
+  memory_scope_work_group = 3,
+  memory_scope_device = 4,
+  memory_scope_all_svm_devices = 5,
+  memory_scope_all_devices = 6,
+};
+
+// memory_order values. Must be kept in sync with our OpenCL headers!
+enum : uint32_t {
+  memory_order_relaxed = 0,
+  memory_order_acquire = 1,
+  memory_order_release = 2,
+  memory_order_acq_rel = 3,
+  memory_order_seq_cst = 4,
+};
+
+static std::optional<unsigned> parseMemFenceFlagsParam(Value *const P) {
+  auto *const FlagsC = dyn_cast<ConstantInt>(P);
+  if (!FlagsC) {
+    return std::nullopt;
+  }
+  // The 'flags' operand is a cl_mem_fence_flags bitfield: either 0 or some
+  // CLK_(GLOBAL|LOCAL|IMAGE)_MEM_FENCE values ORed together.
+  switch (FlagsC->getZExtValue()) {
+    case 0:
+      return std::nullopt;
+    case CLK_LOCAL_MEM_FENCE:
+      return BIMuxInfoConcept::MemSemanticsWorkGroupMemory;
+    case CLK_GLOBAL_MEM_FENCE:
+      return BIMuxInfoConcept::MemSemanticsCrossWorkGroupMemory;
+    case CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE:
+      return (BIMuxInfoConcept::MemSemanticsWorkGroupMemory |
+              BIMuxInfoConcept::MemSemanticsCrossWorkGroupMemory);
+    default:
+      llvm_unreachable("unhandled memory fence flags");
+  }
+}
+
+static std::optional<unsigned> parseMemoryScopeParam(Value *const P) {
+  auto *const ScopeC = dyn_cast<ConstantInt>(P);
+  if (!ScopeC) {
+    return std::nullopt;
+  }
+  switch (ScopeC->getZExtValue()) {
+    case memory_scope_work_item:
+      return BIMuxInfoConcept::MemScopeWorkItem;
+    case memory_scope_sub_group:
+      return BIMuxInfoConcept::MemScopeSubGroup;
+    case memory_scope_work_group:
+      return BIMuxInfoConcept::MemScopeWorkGroup;
+    case memory_scope_device:
+      return BIMuxInfoConcept::MemScopeDevice;
+    // 3.3.5: memory_scope_all_devices aliases memory_scope_all_svm_devices.
+    case memory_scope_all_svm_devices:
+    case memory_scope_all_devices:
+      return BIMuxInfoConcept::MemScopeCrossDevice;
+    default:
+      llvm_unreachable("unhandled memory scope");
+  }
+}
+
+static std::optional<unsigned> parseMemoryOrderParam(Value *const P) {
+  if (const auto *const OrderC = dyn_cast<ConstantInt>(P)) {
+    switch (OrderC->getZExtValue()) {
+      default:
+        llvm_unreachable("unhandled memory order");
+      case memory_order_relaxed:
+        return BIMuxInfoConcept::MemSemanticsRelaxed;
+      case memory_order_acquire:
+        return BIMuxInfoConcept::MemSemanticsAcquire;
+      case memory_order_release:
+        return BIMuxInfoConcept::MemSemanticsRelease;
+      case memory_order_acq_rel:
+        return BIMuxInfoConcept::MemSemanticsAcquireRelease;
+      case memory_order_seq_cst:
+        return BIMuxInfoConcept::MemSemanticsSequentiallyConsistent;
+    }
+  }
+  return std::nullopt;
+}
+
+// Returns the mux builtin ID that a CL builtin ID lowers to when the lowering
+// is a straightforward 1:1 replacement and the function types of the two
+// builtins are identical; returns std::nullopt for every other builtin.
+static std::optional<BuiltinID> get1To1BuiltinLowering(BuiltinID CLBuiltinID) {
+  switch (CLBuiltinID) {
+    default:
+      return std::nullopt;
+    case eCLBuiltinGetWorkDim:
+      return eMuxBuiltinGetWorkDim;
+    case eCLBuiltinGetGroupId:
+      return eMuxBuiltinGetGroupId;
+    case eCLBuiltinGetGlobalSize:
+      return eMuxBuiltinGetGlobalSize;
+    case eCLBuiltinGetGlobalOffset:
+      return eMuxBuiltinGetGlobalOffset;
+    case eCLBuiltinGetLocalId:
+      return eMuxBuiltinGetLocalId;
+    case eCLBuiltinGetLocalSize:
+      return eMuxBuiltinGetLocalSize;
+    case eCLBuiltinGetEnqueuedLocalSize:
+      return eMuxBuiltinGetEnqueuedLocalSize;
+    case eCLBuiltinGetNumGroups:
+      return eMuxBuiltinGetNumGroups;
+    case eCLBuiltinGetGlobalId:
+      return eMuxBuiltinGetGlobalId;
+    case eCLBuiltinGetLocalLinearId:
+      return eMuxBuiltinGetLocalLinearId;
+    case eCLBuiltinGetGlobalLinearId:
+      return eMuxBuiltinGetGlobalLinearId;
+    case eCLBuiltinGetSubgroupSize:
+      return eMuxBuiltinGetSubGroupSize;
+    case eCLBuiltinGetMaxSubgroupSize:
+      return eMuxBuiltinGetMaxSubGroupSize;
+    case eCLBuiltinGetSubgroupLocalId:
+      return eMuxBuiltinGetSubGroupLocalId;
+    case eCLBuiltinGetNumSubgroups:
+      return eMuxBuiltinGetNumSubGroups;
+    case eCLBuiltinGetEnqueuedNumSubgroups:
+      // Note - this is mapping to the same builtin as
+      // eCLBuiltinGetNumSubgroups, as we don't currently support
+      // non-uniform work-group sizes.
+      return eMuxBuiltinGetNumSubGroups;
+    case eCLBuiltinGetSubgroupId:
+      return eMuxBuiltinGetSubGroupId;
+  }
+}
+
+/// @brief Lowers a call to an OpenCL builtin to the equivalent mux builtin.
+///
+/// New instructions are inserted immediately before `CI` and the replacement
+/// call takes over `CI`'s name. `CI` is not erased and its uses are not
+/// rewritten by this function.
+///
+/// Handled directly: builtins with a 1:1 mux equivalent (see
+/// get1To1BuiltinLowering), control barriers, memory fences, async work-group
+/// copies and wait_group_events. Any other builtin is forwarded to
+/// lowerGroupBuiltinToMuxBuiltin.
+///
+/// @param CI Call to lower; must be a direct call (asserted).
+/// @param BIMuxImpl Used to get or declare the required mux builtins.
+/// @return The newly created instruction, or nullptr if the builtin was not
+/// lowered.
+Instruction *CLBuiltinInfo::lowerBuiltinToMuxBuiltin(
+    CallInst &CI, BIMuxInfoConcept &BIMuxImpl) {
+  auto &M = *CI.getModule();
+  auto *const F = CI.getCalledFunction();
+  assert(F && "No calling function?");
+  const auto ID = identifyBuiltin(*F);
+
+  // Handle straightforward 1:1 mappings.
+  if (auto MuxID = get1To1BuiltinLowering(ID)) {
+    auto *const MuxBuiltinFn = BIMuxImpl.getOrDeclareMuxBuiltin(*MuxID, M);
+    assert(MuxBuiltinFn && "Could not get/declare mux builtin");
+    // The mux builtin is called with exactly the original call's arguments.
+    const SmallVector<Value *> Args(CI.args());
+    auto *const NewCI = CallInst::Create(MuxBuiltinFn, Args, CI.getName(), &CI);
+    NewCI->takeName(&CI);
+    NewCI->setAttributes(MuxBuiltinFn->getAttributes());
+    return NewCI;
+  }
+
+  IRBuilder<> B(&CI);
+  LLVMContext &Ctx = M.getContext();
+  auto *const I32Ty = Type::getInt32Ty(Ctx);
+
+  // Defaults for the barrier and fence lowerings. Individual cases below
+  // refine these before falling through to the shared lowering code.
+  auto CtrlBarrierID = eMuxBuiltinWorkGroupBarrier;
+  unsigned DefaultMemScope = BIMuxInfoConcept::MemScopeWorkGroup;
+  unsigned DefaultMemOrder =
+      BIMuxInfoConcept::MemSemanticsSequentiallyConsistent;
+
+  switch (ID) {
+    default:
+      // Sub-group and work-group builtins need lowering to their mux
+      // equivalents. This returns nullptr for anything it doesn't recognize.
+      return lowerGroupBuiltinToMuxBuiltin(CI, ID, BIMuxImpl);
+    case eCLBuiltinSubGroupBarrier:
+      CtrlBarrierID = eMuxBuiltinSubGroupBarrier;
+      DefaultMemScope = BIMuxInfoConcept::MemScopeSubGroup;
+      LLVM_FALLTHROUGH;
+    case eCLBuiltinBarrier:
+    case eCLBuiltinWorkGroupBarrier: {
+      // Memory Scope which the barrier controls. Defaults to 'workgroup' or
+      // 'subgroup' scope depending on the barrier, but sub_group_barrier and
+      // work_group_barrier can optionally provide a scope.
+      unsigned ScopeVal = DefaultMemScope;
+      if ((ID == eCLBuiltinSubGroupBarrier ||
+           ID == eCLBuiltinWorkGroupBarrier) &&
+          F->arg_size() == 2) {
+        if (auto Scope = parseMemoryScopeParam(CI.getOperand(1))) {
+          ScopeVal = *Scope;
+        }
+      }
+
+      // The fence flags (operand 0) are OR'd into the default ordering; if
+      // they can't be parsed they contribute nothing.
+      const unsigned SemanticsVal =
+          DefaultMemOrder |
+          parseMemFenceFlagsParam(CI.getOperand(0)).value_or(0);
+
+      auto *const CtrlBarrier =
+          BIMuxImpl.getOrDeclareMuxBuiltin(CtrlBarrierID, M);
+      assert(CtrlBarrier && "Could not get/declare mux control barrier");
+
+      auto *const BarrierID = ConstantInt::get(I32Ty, 0);
+      auto *const Scope = ConstantInt::get(I32Ty, ScopeVal);
+      auto *const Semantics = ConstantInt::get(I32Ty, SemanticsVal);
+      auto *const NewCI = B.CreateCall(
+          CtrlBarrier, {BarrierID, Scope, Semantics}, CI.getName());
+      NewCI->setAttributes(CtrlBarrier->getAttributes());
+      NewCI->takeName(&CI);
+      return NewCI;
+    }
+    case eCLBuiltinAtomicWorkItemFence:
+      // atomic_work_item_fence has two parameters which we can parse: the
+      // memory order (operand 1) and the memory scope (operand 2). Unparsable
+      // values leave the defaults in place.
+      DefaultMemOrder =
+          parseMemoryOrderParam(CI.getOperand(1)).value_or(DefaultMemOrder);
+      DefaultMemScope =
+          parseMemoryScopeParam(CI.getOperand(2)).value_or(DefaultMemScope);
+      LLVM_FALLTHROUGH;
+    case eCLBuiltinMemFence:
+    case eCLBuiltinReadMemFence:
+    case eCLBuiltinWriteMemFence: {
+      // The deprecated 'fence' builtins default to memory_scope_work_group and
+      // have one possible order each.
+      if (ID == eCLBuiltinMemFence) {
+        DefaultMemOrder = BIMuxInfoConcept::MemSemanticsAcquireRelease;
+      } else if (ID == eCLBuiltinReadMemFence) {
+        DefaultMemOrder = BIMuxInfoConcept::MemSemanticsAcquire;
+      } else if (ID == eCLBuiltinWriteMemFence) {
+        DefaultMemOrder = BIMuxInfoConcept::MemSemanticsRelease;
+      }
+      const unsigned SemanticsVal =
+          DefaultMemOrder |
+          parseMemFenceFlagsParam(CI.getOperand(0)).value_or(0);
+      auto *const MemBarrier =
+          BIMuxImpl.getOrDeclareMuxBuiltin(eMuxBuiltinMemBarrier, M);
+      assert(MemBarrier && "Could not get/declare mux memory barrier");
+      auto *const Scope = ConstantInt::get(I32Ty, DefaultMemScope);
+      auto *const Semantics = ConstantInt::get(I32Ty, SemanticsVal);
+      auto *const NewCI =
+          B.CreateCall(MemBarrier, {Scope, Semantics}, CI.getName());
+      NewCI->setAttributes(MemBarrier->getAttributes());
+      NewCI->takeName(&CI);
+      return NewCI;
+    }
+    case eCLBuiltinAsyncWorkGroupCopy:
+    case eCLBuiltinAsyncWorkGroupStridedCopy:
+    case eCLBuiltinAsyncWorkGroupCopy2D2D:
+    case eCLBuiltinAsyncWorkGroupCopy3D3D:
+      return lowerAsyncBuiltinToMuxBuiltin(CI, ID, BIMuxImpl);
+    case eCLBuiltinWaitGroupEvents: {
+      auto *const MuxWait =
+          BIMuxImpl.getOrDeclareMuxBuiltin(eMuxBuiltinDMAWait, M);
+      assert(MuxWait && "Could not get/declare __mux_dma_wait");
+      auto *const Count = CI.getArgOperand(0);
+      auto *Events = CI.getArgOperand(1);
+
+      assert(Events->getType()->isPointerTy() &&
+             (Events->getType()->getPointerAddressSpace() ==
+                  compiler::utils::AddressSpace::Private ||
+              Events->getType()->getPointerAddressSpace() ==
+                  compiler::utils::AddressSpace::Generic) &&
+             "Pointer to event must be in address space 0 or 4.");
+
+      // The mux builtin is called with a pointer in the default address
+      // space, so cast the event list to that type first.
+      Events = B.CreatePointerBitCastOrAddrSpaceCast(
+          Events, PointerType::getUnqual(Ctx), "mux.events");
+      auto *const NewCI = B.CreateCall(MuxWait, {Count, Events}, CI.getName());
+      NewCI->setAttributes(MuxWait->getAttributes());
+      NewCI->takeName(&CI);
+      return NewCI;
+    }
+  }
+}
+
+/// @brief Lowers a sub-group or work-group collective builtin call (any, all,
+/// broadcast, reduction or scan) to a call to the equivalent mux builtin.
+///
+/// New instructions are inserted immediately before `CI` and the replacement
+/// call takes over `CI`'s name. `CI` is not erased and its uses are not
+/// rewritten by this function.
+///
+/// @param CI Call to lower; must be a direct call (asserted).
+/// @param ID Builtin ID previously identified for the callee of `CI`.
+/// @param BIMuxImpl Used to get or declare the required mux builtin.
+/// @return The instruction producing the lowered result (for any/all this is
+/// a sign-extension of the new call), or nullptr if `ID` is not a collective
+/// builtin handled here.
+Instruction *CLBuiltinInfo::lowerGroupBuiltinToMuxBuiltin(
+    CallInst &CI, BuiltinID ID, BIMuxInfoConcept &BIMuxImpl) {
+  auto &M = *CI.getModule();
+  auto *const F = CI.getCalledFunction();
+  assert(F && "No calling function?");
+
+  // Some ops need extra checking to determine their mux ID:
+  // * add/mul operations are split into integer/float
+  // * min/max operations are split into signed/unsigned/float
+  // So we set a 'base' builtin ID for these operations to the (unsigned)
+  // integer variant and do a checking step afterwards where we refine the
+  // builtin ID.
+  bool RecheckOpType = false;
+  BaseBuiltinID MuxBuiltinID = eBuiltinInvalid;
+  switch (ID) {
+    default:
+      return nullptr;
+    case eCLBuiltinSubgroupAll:
+      MuxBuiltinID = eMuxBuiltinSubgroupAll;
+      break;
+    case eCLBuiltinSubgroupAny:
+      MuxBuiltinID = eMuxBuiltinSubgroupAny;
+      break;
+    case eCLBuiltinSubgroupBroadcast:
+      MuxBuiltinID = eMuxBuiltinSubgroupBroadcast;
+      break;
+    case eCLBuiltinSubgroupReduceAdd:
+      RecheckOpType = true;
+      MuxBuiltinID = eMuxBuiltinSubgroupReduceAdd;
+      break;
+    case eCLBuiltinSubgroupReduceMin:
+      RecheckOpType = true;
+      MuxBuiltinID = eMuxBuiltinSubgroupReduceUMin;
+      break;
+    case eCLBuiltinSubgroupReduceMax:
+      RecheckOpType = true;
+      MuxBuiltinID = eMuxBuiltinSubgroupReduceUMax;
+      break;
+    case eCLBuiltinSubgroupReduceMul:
+      RecheckOpType = true;
+      MuxBuiltinID = eMuxBuiltinSubgroupReduceMul;
+      break;
+    case eCLBuiltinSubgroupReduceAnd:
+      MuxBuiltinID = eMuxBuiltinSubgroupReduceAnd;
+      break;
+    case eCLBuiltinSubgroupReduceOr:
+      MuxBuiltinID = eMuxBuiltinSubgroupReduceOr;
+      break;
+    case eCLBuiltinSubgroupReduceXor:
+      MuxBuiltinID = eMuxBuiltinSubgroupReduceXor;
+      break;
+    case eCLBuiltinSubgroupReduceLogicalAnd:
+      MuxBuiltinID = eMuxBuiltinSubgroupReduceLogicalAnd;
+      break;
+    case eCLBuiltinSubgroupReduceLogicalOr:
+      MuxBuiltinID = eMuxBuiltinSubgroupReduceLogicalOr;
+      break;
+    case eCLBuiltinSubgroupReduceLogicalXor:
+      MuxBuiltinID = eMuxBuiltinSubgroupReduceLogicalXor;
+      break;
+    case eCLBuiltinSubgroupScanAddInclusive:
+      RecheckOpType = true;
+      MuxBuiltinID = eMuxBuiltinSubgroupScanAddInclusive;
+      break;
+    case eCLBuiltinSubgroupScanAddExclusive:
+      RecheckOpType = true;
+      MuxBuiltinID = eMuxBuiltinSubgroupScanAddExclusive;
+      break;
+    case eCLBuiltinSubgroupScanMinInclusive:
+      RecheckOpType = true;
+      MuxBuiltinID = eMuxBuiltinSubgroupScanUMinInclusive;
+      break;
+    case eCLBuiltinSubgroupScanMinExclusive:
+      RecheckOpType = true;
+      MuxBuiltinID = eMuxBuiltinSubgroupScanUMinExclusive;
+      break;
+    case eCLBuiltinSubgroupScanMaxInclusive:
+      RecheckOpType = true;
+      MuxBuiltinID = eMuxBuiltinSubgroupScanUMaxInclusive;
+      break;
+    case eCLBuiltinSubgroupScanMaxExclusive:
+      RecheckOpType = true;
+      MuxBuiltinID = eMuxBuiltinSubgroupScanUMaxExclusive;
+      break;
+    case eCLBuiltinSubgroupScanMulInclusive:
+      RecheckOpType = true;
+      MuxBuiltinID = eMuxBuiltinSubgroupScanMulInclusive;
+      break;
+    case eCLBuiltinSubgroupScanMulExclusive:
+      RecheckOpType = true;
+      MuxBuiltinID = eMuxBuiltinSubgroupScanMulExclusive;
+      break;
+    case eCLBuiltinSubgroupScanAndInclusive:
+      MuxBuiltinID = eMuxBuiltinSubgroupScanAndInclusive;
+      break;
+    case eCLBuiltinSubgroupScanAndExclusive:
+      MuxBuiltinID = eMuxBuiltinSubgroupScanAndExclusive;
+      break;
+    case eCLBuiltinSubgroupScanOrInclusive:
+      MuxBuiltinID = eMuxBuiltinSubgroupScanOrInclusive;
+      break;
+    case eCLBuiltinSubgroupScanOrExclusive:
+      MuxBuiltinID = eMuxBuiltinSubgroupScanOrExclusive;
+      break;
+    case eCLBuiltinSubgroupScanXorInclusive:
+      MuxBuiltinID = eMuxBuiltinSubgroupScanXorInclusive;
+      break;
+    case eCLBuiltinSubgroupScanXorExclusive:
+      MuxBuiltinID = eMuxBuiltinSubgroupScanXorExclusive;
+      break;
+    case eCLBuiltinSubgroupScanLogicalAndInclusive:
+      MuxBuiltinID = eMuxBuiltinSubgroupScanLogicalAndInclusive;
+      break;
+    case eCLBuiltinSubgroupScanLogicalAndExclusive:
+      MuxBuiltinID = eMuxBuiltinSubgroupScanLogicalAndExclusive;
+      break;
+    case eCLBuiltinSubgroupScanLogicalOrInclusive:
+      MuxBuiltinID = eMuxBuiltinSubgroupScanLogicalOrInclusive;
+      break;
+    case eCLBuiltinSubgroupScanLogicalOrExclusive:
+      MuxBuiltinID = eMuxBuiltinSubgroupScanLogicalOrExclusive;
+      break;
+    case eCLBuiltinSubgroupScanLogicalXorInclusive:
+      MuxBuiltinID = eMuxBuiltinSubgroupScanLogicalXorInclusive;
+      break;
+    case eCLBuiltinSubgroupScanLogicalXorExclusive:
+      MuxBuiltinID = eMuxBuiltinSubgroupScanLogicalXorExclusive;
+      break;
+    case eCLBuiltinWorkgroupAll:
+      MuxBuiltinID = eMuxBuiltinWorkgroupAll;
+      break;
+    case eCLBuiltinWorkgroupAny:
+      MuxBuiltinID = eMuxBuiltinWorkgroupAny;
+      break;
+    case eCLBuiltinWorkgroupBroadcast:
+      MuxBuiltinID = eMuxBuiltinWorkgroupBroadcast;
+      break;
+    case eCLBuiltinWorkgroupReduceAdd:
+      RecheckOpType = true;
+      MuxBuiltinID = eMuxBuiltinWorkgroupReduceAdd;
+      break;
+    case eCLBuiltinWorkgroupReduceMin:
+      RecheckOpType = true;
+      MuxBuiltinID = eMuxBuiltinWorkgroupReduceUMin;
+      break;
+    case eCLBuiltinWorkgroupReduceMax:
+      RecheckOpType = true;
+      MuxBuiltinID = eMuxBuiltinWorkgroupReduceUMax;
+      break;
+    case eCLBuiltinWorkgroupReduceMul:
+      RecheckOpType = true;
+      MuxBuiltinID = eMuxBuiltinWorkgroupReduceMul;
+      break;
+    case eCLBuiltinWorkgroupReduceAnd:
+      MuxBuiltinID = eMuxBuiltinWorkgroupReduceAnd;
+      break;
+    case eCLBuiltinWorkgroupReduceOr:
+      MuxBuiltinID = eMuxBuiltinWorkgroupReduceOr;
+      break;
+    case eCLBuiltinWorkgroupReduceXor:
+      MuxBuiltinID = eMuxBuiltinWorkgroupReduceXor;
+      break;
+    case eCLBuiltinWorkgroupReduceLogicalAnd:
+      MuxBuiltinID = eMuxBuiltinWorkgroupReduceLogicalAnd;
+      break;
+    case eCLBuiltinWorkgroupReduceLogicalOr:
+      MuxBuiltinID = eMuxBuiltinWorkgroupReduceLogicalOr;
+      break;
+    case eCLBuiltinWorkgroupReduceLogicalXor:
+      MuxBuiltinID = eMuxBuiltinWorkgroupReduceLogicalXor;
+      break;
+    case eCLBuiltinWorkgroupScanAddInclusive:
+      RecheckOpType = true;
+      MuxBuiltinID = eMuxBuiltinWorkgroupScanAddInclusive;
+      break;
+    case eCLBuiltinWorkgroupScanAddExclusive:
+      RecheckOpType = true;
+      MuxBuiltinID = eMuxBuiltinWorkgroupScanAddExclusive;
+      break;
+    case eCLBuiltinWorkgroupScanMinInclusive:
+      RecheckOpType = true;
+      MuxBuiltinID = eMuxBuiltinWorkgroupScanUMinInclusive;
+      break;
+    case eCLBuiltinWorkgroupScanMinExclusive:
+      RecheckOpType = true;
+      MuxBuiltinID = eMuxBuiltinWorkgroupScanUMinExclusive;
+      break;
+    case eCLBuiltinWorkgroupScanMaxInclusive:
+      RecheckOpType = true;
+      MuxBuiltinID = eMuxBuiltinWorkgroupScanUMaxInclusive;
+      break;
+    case eCLBuiltinWorkgroupScanMaxExclusive:
+      RecheckOpType = true;
+      MuxBuiltinID = eMuxBuiltinWorkgroupScanUMaxExclusive;
+      break;
+    case eCLBuiltinWorkgroupScanMulInclusive:
+      RecheckOpType = true;
+      MuxBuiltinID = eMuxBuiltinWorkgroupScanMulInclusive;
+      break;
+    case eCLBuiltinWorkgroupScanMulExclusive:
+      RecheckOpType = true;
+      MuxBuiltinID = eMuxBuiltinWorkgroupScanMulExclusive;
+      break;
+    case eCLBuiltinWorkgroupScanAndInclusive:
+      MuxBuiltinID = eMuxBuiltinWorkgroupScanAndInclusive;
+      break;
+    case eCLBuiltinWorkgroupScanAndExclusive:
+      MuxBuiltinID = eMuxBuiltinWorkgroupScanAndExclusive;
+      break;
+    case eCLBuiltinWorkgroupScanOrInclusive:
+      MuxBuiltinID = eMuxBuiltinWorkgroupScanOrInclusive;
+      break;
+    case eCLBuiltinWorkgroupScanOrExclusive:
+      MuxBuiltinID = eMuxBuiltinWorkgroupScanOrExclusive;
+      break;
+    case eCLBuiltinWorkgroupScanXorInclusive:
+      MuxBuiltinID = eMuxBuiltinWorkgroupScanXorInclusive;
+      break;
+    case eCLBuiltinWorkgroupScanXorExclusive:
+      MuxBuiltinID = eMuxBuiltinWorkgroupScanXorExclusive;
+      break;
+    case eCLBuiltinWorkgroupScanLogicalAndInclusive:
+      MuxBuiltinID = eMuxBuiltinWorkgroupScanLogicalAndInclusive;
+      break;
+    case eCLBuiltinWorkgroupScanLogicalAndExclusive:
+      MuxBuiltinID = eMuxBuiltinWorkgroupScanLogicalAndExclusive;
+      break;
+    case eCLBuiltinWorkgroupScanLogicalOrInclusive:
+      MuxBuiltinID = eMuxBuiltinWorkgroupScanLogicalOrInclusive;
+      break;
+    case eCLBuiltinWorkgroupScanLogicalOrExclusive:
+      MuxBuiltinID = eMuxBuiltinWorkgroupScanLogicalOrExclusive;
+      break;
+    case eCLBuiltinWorkgroupScanLogicalXorInclusive:
+      MuxBuiltinID = eMuxBuiltinWorkgroupScanLogicalXorInclusive;
+      break;
+    case eCLBuiltinWorkgroupScanLogicalXorExclusive:
+      MuxBuiltinID = eMuxBuiltinWorkgroupScanLogicalXorExclusive;
+      break;
+  }
+
+  if (RecheckOpType) {
+    // We've assumed (unsigned) integer operations, but we may actually have
+    // signed integer, or floating point, operations. Refine the builtin ID to
+    // the correct 'overload' now.
+    compiler::utils::NameMangler Mangler(&F->getContext());
+    SmallVector<Type *, 4> ArgumentTypes;
+    SmallVector<compiler::utils::TypeQualifiers, 4> Qualifiers;
+
+    // Only the demangled argument types and qualifiers are needed; the name
+    // itself is just checked to confirm that demangling succeeded.
+    [[maybe_unused]] const auto DemangledName =
+        Mangler.demangleName(F->getName(), ArgumentTypes, Qualifiers);
+
+    assert(!DemangledName.empty() && Qualifiers.size() == 1 &&
+           ArgumentTypes.size() == 1 && "Unknown collective builtin");
+    auto &Qual = Qualifiers[0];
+
+    // The operand counts as signed if any of its qualifiers is 'signed int'.
+    // Note that this consumes the qualifier list.
+    bool IsSignedInt = false;
+    while (!IsSignedInt && Qual.getCount()) {
+      IsSignedInt |= Qual.pop_front() == compiler::utils::eTypeQualSignedInt;
+    }
+
+    const bool IsFP = ArgumentTypes[0]->isFloatingPointTy();
+    switch (MuxBuiltinID) {
+      default:
+        llvm_unreachable("unknown group operation for which to check the type");
+      case eMuxBuiltinSubgroupReduceAdd:
+        MuxBuiltinID = IsFP ? eMuxBuiltinSubgroupReduceFAdd : MuxBuiltinID;
+        break;
+      case eMuxBuiltinSubgroupReduceMul:
+        MuxBuiltinID = IsFP ? eMuxBuiltinSubgroupReduceFMul : MuxBuiltinID;
+        break;
+      case eMuxBuiltinSubgroupReduceUMin:
+        MuxBuiltinID =
+            IsFP ? eMuxBuiltinSubgroupReduceFMin
+                 : (IsSignedInt ? eMuxBuiltinSubgroupReduceSMin : MuxBuiltinID);
+        break;
+      case eMuxBuiltinSubgroupReduceUMax:
+        MuxBuiltinID =
+            IsFP ? eMuxBuiltinSubgroupReduceFMax
+                 : (IsSignedInt ? eMuxBuiltinSubgroupReduceSMax : MuxBuiltinID);
+        break;
+      case eMuxBuiltinSubgroupScanAddInclusive:
+        MuxBuiltinID =
+            IsFP ? eMuxBuiltinSubgroupScanFAddInclusive : MuxBuiltinID;
+        break;
+      case eMuxBuiltinSubgroupScanAddExclusive:
+        MuxBuiltinID =
+            IsFP ? eMuxBuiltinSubgroupScanFAddExclusive : MuxBuiltinID;
+        break;
+      case eMuxBuiltinSubgroupScanMulInclusive:
+        MuxBuiltinID =
+            IsFP ? eMuxBuiltinSubgroupScanFMulInclusive : MuxBuiltinID;
+        break;
+      case eMuxBuiltinSubgroupScanMulExclusive:
+        MuxBuiltinID =
+            IsFP ? eMuxBuiltinSubgroupScanFMulExclusive : MuxBuiltinID;
+        break;
+      case eMuxBuiltinSubgroupScanUMinInclusive:
+        MuxBuiltinID = IsFP
+                           ? eMuxBuiltinSubgroupScanFMinInclusive
+                           : (IsSignedInt ? eMuxBuiltinSubgroupScanSMinInclusive
+                                          : MuxBuiltinID);
+        break;
+      case eMuxBuiltinSubgroupScanUMinExclusive:
+        MuxBuiltinID = IsFP
+                           ? eMuxBuiltinSubgroupScanFMinExclusive
+                           : (IsSignedInt ? eMuxBuiltinSubgroupScanSMinExclusive
+                                          : MuxBuiltinID);
+        break;
+      case eMuxBuiltinSubgroupScanUMaxInclusive:
+        MuxBuiltinID = IsFP
+                           ? eMuxBuiltinSubgroupScanFMaxInclusive
+                           : (IsSignedInt ? eMuxBuiltinSubgroupScanSMaxInclusive
+                                          : MuxBuiltinID);
+        break;
+      case eMuxBuiltinSubgroupScanUMaxExclusive:
+        MuxBuiltinID = IsFP
+                           ? eMuxBuiltinSubgroupScanFMaxExclusive
+                           : (IsSignedInt ? eMuxBuiltinSubgroupScanSMaxExclusive
+                                          : MuxBuiltinID);
+        break;
+      case eMuxBuiltinWorkgroupReduceAdd:
+        MuxBuiltinID = IsFP ? eMuxBuiltinWorkgroupReduceFAdd : MuxBuiltinID;
+        break;
+      case eMuxBuiltinWorkgroupReduceMul:
+        MuxBuiltinID = IsFP ? eMuxBuiltinWorkgroupReduceFMul : MuxBuiltinID;
+        break;
+      case eMuxBuiltinWorkgroupReduceUMin:
+        MuxBuiltinID = IsFP ? eMuxBuiltinWorkgroupReduceFMin
+                            : (IsSignedInt ? eMuxBuiltinWorkgroupReduceSMin
+                                           : MuxBuiltinID);
+        break;
+      case eMuxBuiltinWorkgroupReduceUMax:
+        MuxBuiltinID = IsFP ? eMuxBuiltinWorkgroupReduceFMax
+                            : (IsSignedInt ? eMuxBuiltinWorkgroupReduceSMax
+                                           : MuxBuiltinID);
+        break;
+      case eMuxBuiltinWorkgroupScanAddInclusive:
+        MuxBuiltinID =
+            IsFP ? eMuxBuiltinWorkgroupScanFAddInclusive : MuxBuiltinID;
+        break;
+      case eMuxBuiltinWorkgroupScanAddExclusive:
+        MuxBuiltinID =
+            IsFP ? eMuxBuiltinWorkgroupScanFAddExclusive : MuxBuiltinID;
+        break;
+      case eMuxBuiltinWorkgroupScanMulInclusive:
+        MuxBuiltinID =
+            IsFP ? eMuxBuiltinWorkgroupScanFMulInclusive : MuxBuiltinID;
+        break;
+      case eMuxBuiltinWorkgroupScanMulExclusive:
+        MuxBuiltinID =
+            IsFP ? eMuxBuiltinWorkgroupScanFMulExclusive : MuxBuiltinID;
+        break;
+      case eMuxBuiltinWorkgroupScanUMinInclusive:
+        MuxBuiltinID =
+            IsFP ? eMuxBuiltinWorkgroupScanFMinInclusive
+                 : (IsSignedInt ? eMuxBuiltinWorkgroupScanSMinInclusive
+                                : MuxBuiltinID);
+        break;
+      case eMuxBuiltinWorkgroupScanUMinExclusive:
+        MuxBuiltinID =
+            IsFP ? eMuxBuiltinWorkgroupScanFMinExclusive
+                 : (IsSignedInt ? eMuxBuiltinWorkgroupScanSMinExclusive
+                                : MuxBuiltinID);
+        break;
+      case eMuxBuiltinWorkgroupScanUMaxInclusive:
+        MuxBuiltinID =
+            IsFP ? eMuxBuiltinWorkgroupScanFMaxInclusive
+                 : (IsSignedInt ? eMuxBuiltinWorkgroupScanSMaxInclusive
+                                : MuxBuiltinID);
+        break;
+      case eMuxBuiltinWorkgroupScanUMaxExclusive:
+        MuxBuiltinID =
+            IsFP ? eMuxBuiltinWorkgroupScanFMaxExclusive
+                 : (IsSignedInt ? eMuxBuiltinWorkgroupScanSMaxExclusive
+                                : MuxBuiltinID);
+        break;
+    }
+  }
+
+  const bool IsAnyAll = MuxBuiltinID == eMuxBuiltinSubgroupAny ||
+                        MuxBuiltinID == eMuxBuiltinSubgroupAll ||
+                        MuxBuiltinID == eMuxBuiltinWorkgroupAny ||
+                        MuxBuiltinID == eMuxBuiltinWorkgroupAll;
+  // The mux builtin is overloaded on the type of the value operand; any/all
+  // are always overloaded on i1.
+  SmallVector<Type *, 2> OverloadInfo;
+  if (!IsAnyAll) {
+    OverloadInfo.push_back(CI.getOperand(0)->getType());
+  } else {
+    OverloadInfo.push_back(IntegerType::getInt1Ty(M.getContext()));
+  }
+
+  auto *const MuxBuiltinFn =
+      BIMuxImpl.getOrDeclareMuxBuiltin(MuxBuiltinID, M, OverloadInfo);
+
+  assert(MuxBuiltinFn && "Missing mux builtin");
+  auto *const SizeTy = getSizeType(M);
+  auto *const I32Ty = Type::getInt32Ty(M.getContext());
+
+  SmallVector<Value *, 4> Args;
+  if (MuxBuiltinID >= eFirstMuxWorkgroupCollectiveBuiltin &&
+      MuxBuiltinID <= eLastMuxWorkgroupCollectiveBuiltin) {
+    // Work-group operations have a barrier ID first.
+    Args.push_back(ConstantInt::get(I32Ty, 0));
+  }
+  // Then the arg itself
+  // If it's an any/all operation, we must first reduce to i1 because that's how
+  // the mux builtins expect their arguments.
+  auto *Val = CI.getOperand(0);
+  if (!IsAnyAll) {
+    Args.push_back(Val);
+  } else {
+    assert(Val->getType()->isIntegerTy());
+    auto *NEZero =
+        ICmpInst::Create(Instruction::ICmp, ICmpInst::ICMP_NE, Val,
+                         ConstantInt::getNullValue(Val->getType()), "", &CI);
+    Args.push_back(NEZero);
+  }
+
+  if (MuxBuiltinID == eMuxBuiltinSubgroupBroadcast) {
+    // Pass on the ID parameter
+    Args.push_back(CI.getOperand(1));
+  }
+  if (MuxBuiltinID == eMuxBuiltinWorkgroupBroadcast) {
+    // The mux version always has three indices. Any missing ones are replaced
+    // with zeros
+    for (unsigned i = 0, e = CI.arg_size(); i != 3; i++) {
+      Args.push_back(1 + i < e ? CI.getOperand(1 + i)
+                               : ConstantInt::getNullValue(SizeTy));
+    }
+  }
+
+  auto *const NewCI = CallInst::Create(MuxBuiltinFn, Args, CI.getName(), &CI);
+  NewCI->takeName(&CI);
+  NewCI->setAttributes(MuxBuiltinFn->getAttributes());
+
+  if (!IsAnyAll) {
+    return NewCI;
+  }
+  // For any/all we need to recreate the original i32 return value.
+  return SExtInst::Create(Instruction::SExt, NewCI, CI.getType(), "sext", &CI);
+}
+
+/// @brief Lowers a call to one of the async work-group copy builtins to a call
+/// to a mux DMA read/write builtin.
+///
+/// New instructions are inserted immediately before `CI` and the new call
+/// takes over `CI`'s name. `CI` is not erased and its uses are not rewritten
+/// by this function.
+///
+/// The transfer is treated as a DMA 'read' when the destination pointer is in
+/// the local address space, and as a 'write' otherwise.
+///
+/// @param CI Call to the async copy builtin.
+/// @param ID Builtin ID of the callee; must be one of the four async copy IDs
+/// (asserted).
+/// @param BIMuxImpl Used to get or declare the required mux DMA builtin.
+/// @return The call to the mux DMA builtin.
+Instruction *CLBuiltinInfo::lowerAsyncBuiltinToMuxBuiltin(
+    CallInst &CI, BuiltinID ID, BIMuxInfoConcept &BIMuxImpl) {
+  assert((ID == eCLBuiltinAsyncWorkGroupCopy ||
+          ID == eCLBuiltinAsyncWorkGroupStridedCopy ||
+          ID == eCLBuiltinAsyncWorkGroupCopy2D2D ||
+          ID == eCLBuiltinAsyncWorkGroupCopy3D3D) &&
+         "Invalid ID");
+
+  IRBuilder<> B(&CI);
+  auto &M = *CI.getModule();
+  LLVMContext &Ctx = M.getContext();
+  const auto &DL = M.getDataLayout();
+
+  switch (ID) {
+    default:
+      llvm_unreachable("Unhandled builtin");
+    case eCLBuiltinAsyncWorkGroupCopy:
+    case eCLBuiltinAsyncWorkGroupStridedCopy: {
+      NameMangler Mangler(&Ctx);
+
+      // Do a full demangle to determine the pointer element type of the first
+      // argument.
+      SmallVector<Type *, 4> BuiltinArgTypes, BuiltinArgPointeeTypes;
+      SmallVector<compiler::utils::TypeQualifiers, 4> BuiltinArgQuals;
+
+      [[maybe_unused]] const StringRef BuiltinName = Mangler.demangleName(
+          CI.getCalledFunction()->getName(), BuiltinArgTypes,
+          BuiltinArgPointeeTypes, BuiltinArgQuals);
+      assert(!BuiltinName.empty() && BuiltinArgTypes[0]->isPointerTy() &&
+             BuiltinArgPointeeTypes[0] && "Could not demangle async builtin");
+
+      auto *const DataTy = BuiltinArgPointeeTypes[0];
+      const bool IsStrided = ID == eCLBuiltinAsyncWorkGroupStridedCopy;
+
+      auto *const Dst = CI.getArgOperand(0);
+      auto *const Src = CI.getArgOperand(1);
+      auto *const NumElements = CI.getArgOperand(2);
+      // The strided variant has an extra stride operand ahead of the event.
+      auto *const EventIn = CI.getArgOperand(3 + IsStrided);
+
+      // Find out which way the DMA is going and declare the appropriate mux
+      // builtin.
+      const bool IsRead = Dst->getType()->getPointerAddressSpace() ==
+                          compiler::utils::AddressSpace::Local;
+      const auto ElementTypeWidthInBytes =
+          DL.getTypeAllocSize(DataTy).getFixedValue();
+      auto *const ElementSize =
+          ConstantInt::get(NumElements->getType(), ElementTypeWidthInBytes);
+
+      // A strided copy is lowered to a 2D transfer in which each 'line' is a
+      // single element; a plain copy is one contiguous run of all elements.
+      auto *const WidthInBytes =
+          IsStrided ? ElementSize
+                    : B.CreateMul(ElementSize, NumElements, "width.bytes");
+
+      const BuiltinID MuxBuiltinID =
+          IsRead ? (IsStrided ? eMuxBuiltinDMARead2D : eMuxBuiltinDMARead1D)
+                 : (IsStrided ? eMuxBuiltinDMAWrite2D : eMuxBuiltinDMAWrite1D);
+
+      auto *const MuxDMA =
+          BIMuxImpl.getOrDeclareMuxBuiltin(MuxBuiltinID, M, EventIn->getType());
+      assert(MuxDMA && "Could not get/declare mux dma read/write");
+
+      CallInst *NewCI = nullptr;
+      if (!IsStrided) {
+        NewCI = B.CreateCall(MuxDMA, {Dst, Src, WidthInBytes, EventIn},
+                             "mux.out.event");
+      } else {
+        // The stride from async_work_group_strided_copy is in elements, but the
+        // stride in the __mux builtins are in bytes so we need to scale the
+        // value.
+        auto *const Stride = CI.getArgOperand(3);
+        auto *const StrideInBytes =
+            B.CreateMul(ElementSize, Stride, "stride.bytes");
+
+        // For async_work_group_strided_copy, the stride only applies to the
+        // global memory, as we are doing scatters/gathers.
+        auto *const DstStride = IsRead ? ElementSize : StrideInBytes;
+        auto *const SrcStride = IsRead ? StrideInBytes : ElementSize;
+
+        NewCI = B.CreateCall(MuxDMA,
+                             {Dst, Src, WidthInBytes, DstStride, SrcStride,
+                              NumElements, EventIn},
+                             "mux.out.event");
+      }
+      NewCI->setAttributes(MuxDMA->getAttributes());
+      NewCI->takeName(&CI);
+      return NewCI;
+    }
+    case eCLBuiltinAsyncWorkGroupCopy2D2D: {
+      // Unpack the arguments for ease of access.
+      auto *const Dst = CI.getArgOperand(0);
+      auto *const DstOffset = CI.getArgOperand(1);
+      auto *const Src = CI.getArgOperand(2);
+      auto *const SrcOffset = CI.getArgOperand(3);
+      auto *const NumBytesPerEl = CI.getArgOperand(4);
+      auto *const NumElsPerLine = CI.getArgOperand(5);
+      auto *const NumLines = CI.getArgOperand(6);
+      auto *const SrcTotalLineLength = CI.getArgOperand(7);
+      auto *const DstTotalLineLength = CI.getArgOperand(8);
+      auto *const EventIn = CI.getArgOperand(9);
+
+      // Find out which way the DMA is going and declare the appropriate mux
+      // builtin.
+      const bool IsRead = Dst->getType()->getPointerAddressSpace() ==
+                          compiler::utils::AddressSpace::Local;
+      auto *const MuxDMA = BIMuxImpl.getOrDeclareMuxBuiltin(
+          IsRead ? eMuxBuiltinDMARead2D : eMuxBuiltinDMAWrite2D, M,
+          EventIn->getType());
+      assert(MuxDMA && "Could not get/declare mux dma read/write");
+
+      // Scale the element-based offsets, line size and strides by the element
+      // size to get byte quantities, and apply the offsets to the pointers
+      // with byte-wise GEPs.
+      auto *const DstOffsetBytes = B.CreateMul(DstOffset, NumBytesPerEl);
+      auto *const SrcOffsetBytes = B.CreateMul(SrcOffset, NumBytesPerEl);
+      auto *const LineSizeBytes = B.CreateMul(NumElsPerLine, NumBytesPerEl);
+      auto *const ByteTy = B.getInt8Ty();
+      auto *const DstWithOffset = B.CreateGEP(ByteTy, Dst, DstOffsetBytes);
+      auto *const SrcWithOffset = B.CreateGEP(ByteTy, Src, SrcOffsetBytes);
+      auto *const SrcStrideBytes =
+          B.CreateMul(SrcTotalLineLength, NumBytesPerEl);
+      auto *const DstStrideBytes =
+          B.CreateMul(DstTotalLineLength, NumBytesPerEl);
+      auto *const NewCI = B.CreateCall(
+          MuxDMA, {DstWithOffset, SrcWithOffset, LineSizeBytes, DstStrideBytes,
+                   SrcStrideBytes, NumLines, EventIn});
+      NewCI->setAttributes(MuxDMA->getAttributes());
+      NewCI->takeName(&CI);
+      return NewCI;
+    }
+    case eCLBuiltinAsyncWorkGroupCopy3D3D: {
+      // Unpack the arguments for ease of access.
+      auto *const Dst = CI.getArgOperand(0);
+      auto *const DstOffset = CI.getArgOperand(1);
+      auto *const Src = CI.getArgOperand(2);
+      auto *const SrcOffset = CI.getArgOperand(3);
+      auto *const NumBytesPerEl = CI.getArgOperand(4);
+      auto *const NumElsPerLine = CI.getArgOperand(5);
+      auto *const NumLines = CI.getArgOperand(6);
+      auto *const NumPlanes = CI.getArgOperand(7);
+      auto *const SrcTotalLineLength = CI.getArgOperand(8);
+      auto *const SrcTotalPlaneArea = CI.getArgOperand(9);
+      auto *const DstTotalLineLength = CI.getArgOperand(10);
+      auto *const DstTotalPlaneArea = CI.getArgOperand(11);
+      auto *const EventIn = CI.getArgOperand(12);
+
+      // Find out which way the DMA is going and declare the appropriate mux
+      // builtin.
+      const bool IsRead = Dst->getType()->getPointerAddressSpace() ==
+                          compiler::utils::AddressSpace::Local;
+      auto *const MuxDMA = BIMuxImpl.getOrDeclareMuxBuiltin(
+          IsRead ? eMuxBuiltinDMARead3D : eMuxBuiltinDMAWrite3D, M,
+          EventIn->getType());
+      assert(MuxDMA && "Could not get/declare mux dma read/write");
+
+      // As for the 2D case, scale every element-based quantity by the element
+      // size to get bytes; the 3D variant additionally has plane strides.
+      auto *const DstOffsetBytes = B.CreateMul(DstOffset, NumBytesPerEl);
+      auto *const SrcOffsetBytes = B.CreateMul(SrcOffset, NumBytesPerEl);
+      auto *const LineSizeBytes = B.CreateMul(NumElsPerLine, NumBytesPerEl);
+      auto *const ByteTy = B.getInt8Ty();
+      auto *const DstWithOffset = B.CreateGEP(ByteTy, Dst, DstOffsetBytes);
+      auto *const SrcWithOffset = B.CreateGEP(ByteTy, Src, SrcOffsetBytes);
+      auto *const SrcLineStrideBytes =
+          B.CreateMul(SrcTotalLineLength, NumBytesPerEl);
+      auto *const DstLineStrideBytes =
+          B.CreateMul(DstTotalLineLength, NumBytesPerEl);
+      auto *const SrcPlaneStrideBytes =
+          B.CreateMul(SrcTotalPlaneArea, NumBytesPerEl);
+      auto *const DstPlaneStrideBytes =
+          B.CreateMul(DstTotalPlaneArea, NumBytesPerEl);
+      auto *const NewCI =
+          B.CreateCall(MuxDMA, {DstWithOffset, SrcWithOffset, LineSizeBytes,
+                                DstLineStrideBytes, SrcLineStrideBytes,
+                                NumLines, DstPlaneStrideBytes,
+                                SrcPlaneStrideBytes, NumPlanes, EventIn});
+      NewCI->setAttributes(MuxDMA->getAttributes());
+      NewCI->takeName(&CI);
+      return NewCI;
+    }
+  }
+
+  // Not reached: every case above either returns or is unreachable.
+  return nullptr;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// @brief Looks up a builtin in the builtins module and optionally imports it
+/// into another module.
+///
+/// Without eBuiltinMatDefinition in `Flags` only a declaration is wanted: the
+/// function from the builtins module is returned if `DestM` is null,
+/// otherwise a function with the same name and type is obtained from (or
+/// inserted into) `DestM` and given the builtin's attributes and calling
+/// convention.
+///
+/// With eBuiltinMatDefinition the builtin and every function reachable from
+/// it through direct calls are materialized in the builtins module. If
+/// `DestM` is non-null those functions are then declared in and cloned into
+/// `DestM`, along with the initializers of any global variables recorded by
+/// the materializer during cloning.
+///
+/// @param BuiltinName Name of the builtin to look up.
+/// @param DestM Module to import into, or null to stay in the builtins
+/// module.
+/// @param Flags Materialization flags; see above.
+/// @return The requested function, or nullptr if there is no builtins module,
+/// the builtin is not found, materialization fails, or a recorded global
+/// variable has no counterpart in `DestM`.
+Function *CLBuiltinLoader::materializeBuiltin(StringRef BuiltinName,
+                                              Module *DestM,
+                                              BuiltinMatFlags Flags) {
+  auto *const BuiltinModule = this->getBuiltinsModule();
+
+  // Retrieve it from the builtin module.
+  if (!BuiltinModule) {
+    return nullptr;
+  }
+  Function *SrcBuiltin = BuiltinModule->getFunction(BuiltinName);
+  if (!SrcBuiltin) {
+    return nullptr;
+  }
+
+  // The user only wants a declaration.
+  if (!(Flags & eBuiltinMatDefinition)) {
+    if (!DestM) {
+      return SrcBuiltin;
+    }
+    FunctionType *FT = SrcBuiltin->getFunctionType();
+    Function *BuiltinDecl = cast<Function>(
+        DestM->getOrInsertFunction(BuiltinName, FT).getCallee());
+    BuiltinDecl->copyAttributesFrom(SrcBuiltin);
+    BuiltinDecl->setCallingConv(SrcBuiltin->getCallingConv());
+    return BuiltinDecl;
+  }
+
+  // Materialize the builtin and its callees.
+  std::set<Function *> Callees;
+  std::vector<Function *> Worklist;
+  Worklist.push_back(SrcBuiltin);
+  // Walk the work list in FIFO order using an index: new callees are appended
+  // while iterating, and this avoids repeatedly erasing from the front.
+  for (size_t Idx = 0; Idx != Worklist.size(); ++Idx) {
+    Function *Current = Worklist[Idx];
+    if (!Callees.insert(Current).second) {
+      continue;
+    }
+    // Module::materialize returns an llvm::Error, which converts to 'true'
+    // on *failure*. A failure must also be consumed before it is destroyed.
+    if (llvm::Error Err = BuiltinModule->materialize(Current)) {
+      llvm::consumeError(std::move(Err));
+      return nullptr;
+    }
+
+#if LLVM_VERSION_GREATER_EQUAL(19, 0)
+    // Keep the function's debug-info format in sync with its module's.
+    if (Current->IsNewDbgInfoFormat != BuiltinModule->IsNewDbgInfoFormat) {
+      if (BuiltinModule->IsNewDbgInfoFormat) {
+        Current->convertToNewDbgValues();
+      } else {
+        Current->convertFromNewDbgValues();
+      }
+    }
+#endif
+
+    // Find any callees in the function and add them to the list. Indirect
+    // calls have no known callee and are skipped.
+    for (BasicBlock &BB : *Current) {
+      for (Instruction &I : BB) {
+        CallInst *CI = dyn_cast<CallInst>(&I);
+        if (!CI) {
+          continue;
+        }
+        Function *callee = CI->getCalledFunction();
+        if (!callee) {
+          continue;
+        }
+        Worklist.push_back(callee);
+      }
+    }
+  }
+
+  if (!DestM) {
+    return SrcBuiltin;
+  }
+
+  // Copy builtin and callees to the target module if requested by the user.
+  ValueToValueMapTy ValueMap;
+  SmallVector<ReturnInst *, 4> Returns;
+  // Avoid linking errors.
+  // NOTE(review): this linkage is also applied to callees that remain
+  // declarations in DestM - confirm that is intended.
+  const GlobalValue::LinkageTypes Linkage = GlobalValue::LinkOnceAnyLinkage;
+
+  // Declare the callees in the module if they don't already exist.
+  for (Function *Callee : Callees) {
+    Function *NewCallee = DestM->getFunction(Callee->getName());
+    if (!NewCallee) {
+      FunctionType *FT = Callee->getFunctionType();
+      NewCallee = Function::Create(FT, Linkage, Callee->getName(), DestM);
+    } else {
+      NewCallee->setLinkage(Linkage);
+    }
+    // Map each source argument to its counterpart so that cloned bodies refer
+    // to the destination function's arguments.
+    Function::arg_iterator NewArgI = NewCallee->arg_begin();
+    for (Argument &Arg : Callee->args()) {
+      NewArgI->setName(Arg.getName());
+      ValueMap[&Arg] = &*(NewArgI++);
+    }
+    NewCallee->copyAttributesFrom(Callee);
+    ValueMap[Callee] = NewCallee;
+  }
+
+  // Clone the callees' bodies into the module.
+  GlobalValueMaterializer Materializer(*DestM);
+  for (Function *Callee : Callees) {
+    if (Callee->isDeclaration()) {
+      continue;
+    }
+    Function *NewCallee = cast<Function>(ValueMap[Callee]);
+    const auto CloneType = DestM == Callee->getParent()
+                               ? CloneFunctionChangeType::LocalChangesOnly
+                               : CloneFunctionChangeType::DifferentModule;
+    CloneFunctionInto(NewCallee, Callee, ValueMap, CloneType, Returns, "",
+                      nullptr, nullptr, &Materializer);
+    Returns.clear();
+  }
+
+  // Clone global variable initializers.
+  for (GlobalVariable *var : Materializer.variables()) {
+    GlobalVariable *newVar = dyn_cast_or_null<GlobalVariable>(ValueMap[var]);
+    if (!newVar) {
+      return nullptr;
+    }
+    // A global without an initializer (i.e. a declaration) has nothing to
+    // copy, and calling getInitializer() on it is invalid.
+    if (!var->hasInitializer()) {
+      continue;
+    }
+    Constant *oldInit = var->getInitializer();
+    Constant *newInit = MapValue(oldInit, ValueMap);
+    newVar->setInitializer(newInit);
+  }
+
+  return cast<Function>(ValueMap[SrcBuiltin]);
+}
+}  // namespace utils
+}  // namespace compiler
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/define_mux_builtins_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/define_mux_builtins_pass.cpp
new file mode 100644
index 0000000000000..75cf246e6065a
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/define_mux_builtins_pass.cpp
@@ -0,0 +1,62 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <compiler/utils/builtin_info.h>
+#include <compiler/utils/define_mux_builtins_pass.h>
+
+#define DEBUG_TYPE "define-mux-builtins"
+
+using namespace llvm;
+
+// Defines every declared, non-intrinsic mux builtin in M through BuiltinInfo.
+// Claims all analyses preserved only if no builtin ended up being defined.
+PreservedAnalyses compiler::utils::DefineMuxBuiltinsPass::run(
+    Module &M, ModuleAnalysisManager &AM) {
+  bool Changed = false;
+  auto &BI = AM.getResult<BuiltinInfoAnalysis>(M);
+
+  auto functionNeedsDefining = [&BI](Function &F) {
+    return F.isDeclaration() && !F.isIntrinsic() &&
+           BI.isMuxBuiltinID(BI.analyzeBuiltin(F).ID);
+  };
+
+  // Define all mux builtins
+  for (auto &F : M.functions()) {
+    if (!functionNeedsDefining(F)) {
+      continue;
+    }
+    LLVM_DEBUG(dbgs() << "  Defining mux builtin: " << F.getName() << "\n";);
+
+    // Define the builtin. If it declares any new dependent builtins, those
+    // will be appended to the module's function list and so will be
+    // encountered by later iterations.
+    auto Builtin = BI.analyzeBuiltin(F);
+    if (BI.defineMuxBuiltin(Builtin.ID, M, Builtin.mux_overload_info)) {
+      Changed = true;
+    }
+  }
+
+  // While declaring any builtins should go to the end of the module's list of
+  // functions, it's not technically impossible for something else to happen.
+  // As such, assert that we are leaving the module in the state we are
+  // contractually obliged to: with all functions that need defining having
+  // been defined.
+  assert(all_of(M.functions(),
+                [&](Function &F) { return !functionNeedsDefining(F); }) &&
+         "Did not define a function that requires it");
+
+  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/dma.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/dma.cpp
new file mode 100644
index 0000000000000..4b1b656aeae46
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/dma.cpp
@@ -0,0 +1,74 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <compiler/utils/builtin_info.h>
+#include <compiler/utils/dma.h>
+#include <compiler/utils/pass_functions.h>
+#include <llvm/IR/Constants.h>
+#include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/Instructions.h>
+#include <llvm/IR/Module.h>
+#include <multi_llvm/multi_llvm.h>
+
+#include <array>
+
+namespace compiler {
+namespace utils {
+
+// Appends to `bb` calls to `LocalIDFn` with indices 0, 1 and 2 and returns a
+// value that is true when the three results equal x, y and z respectively.
+llvm::Value *isThreadEQ(llvm::BasicBlock *bb, unsigned x, unsigned y,
+                        unsigned z, llvm::Function &LocalIDFn) {
+  llvm::IRBuilder<> irb(bb);
+  LocalIDFn.setCallingConv(llvm::CallingConv::SPIR_FUNC);
+  llvm::Type *const dimTy = LocalIDFn.arg_begin()->getType();
+  llvm::Value *allEqual = llvm::ConstantInt::getTrue(bb->getContext());
+  const std::array<unsigned, 3> expected{x, y, z};
+  for (unsigned dim = 0; dim != expected.size(); ++dim) {
+    llvm::Constant *const dimIdx = llvm::ConstantInt::get(dimTy, dim);
+    llvm::CallInst *const id = irb.CreateCall(&LocalIDFn, dimIdx);
+    id->setCallingConv(LocalIDFn.getCallingConv());
+    llvm::Value *const want =
+        llvm::ConstantInt::get(LocalIDFn.getReturnType(), expected[dim]);
+    llvm::Value *const same = irb.CreateICmpEQ(id, want);
+    // The first comparison seeds the chain; later ones are AND-ed onto it.
+    allEqual = (dim == 0) ? same : irb.CreateAnd(allEqual, same);
+  }
+  return allEqual;
+}
+
+llvm::Value *isThreadZero(llvm::BasicBlock *BB, llvm::Function &LocalIDFn) {
+  return isThreadEQ(BB, 0, 0, 0, LocalIDFn);  // compare against (0, 0, 0)
+}
+
+void buildThreadCheck(llvm::BasicBlock *entryBlock, llvm::BasicBlock *trueBlock,
+                      llvm::BasicBlock *falseBlock, llvm::Function &LocalIDFn) {
+  // End entryBlock with a branch: the thread whose IDs are all zero goes to
+  // trueBlock (only it should execute the DMA), the others to falseBlock.
+  llvm::Value *const isZero = isThreadZero(entryBlock, LocalIDFn);
+  llvm::IRBuilder<>(entryBlock).CreateCondBr(isZero, trueBlock, falseBlock);
+}
+
+llvm::StructType *getOrCreateMuxDMAEventType(llvm::Module &m) {
+  // Reuse the struct type registered under MuxBuiltins::dma_event_type in the
+  // module's context if there is one, otherwise create it under that name.
+  llvm::LLVMContext &ctx = m.getContext();
+  llvm::StructType *ty =
+      llvm::StructType::getTypeByName(ctx, MuxBuiltins::dma_event_type);
+  return ty ? ty : llvm::StructType::create(ctx, MuxBuiltins::dma_event_type);
+}
+}  // namespace utils
+}  // namespace compiler
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/encode_kernel_metadata_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/encode_kernel_metadata_pass.cpp
new file mode 100644
index 0000000000000..cec28d87c6322
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/encode_kernel_metadata_pass.cpp
@@ -0,0 +1,51 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <compiler/utils/attributes.h>
+#include <compiler/utils/encode_kernel_metadata_pass.h>
+#include <compiler/utils/metadata.h>
+
+using namespace llvm;
+
+PreservedAnalyses compiler::utils::TransferKernelMetadataPass::run(
+    Module &M, ModuleAnalysisManager &) {
+  SmallVector<KernelInfo, 4> Kernels;
+  populateKernelList(M, Kernels);
+  // Annotate each listed kernel for which M has a function of the same name.
+  for (const auto &Kernel : Kernels) {
+    if (auto *F = M.getFunction(Kernel.Name)) {
+      setOrigFnName(*F);
+      setIsKernelEntryPt(*F);
+      if (Kernel.ReqdWGSize) {
+        encodeLocalSizeMetadata(*F, *Kernel.ReqdWGSize);
+      }
+    }
+  }
+  // NOTE(review): always reports every analysis as preserved - confirm.
+  return PreservedAnalyses::all();
+}
+
+PreservedAnalyses compiler::utils::EncodeKernelMetadataPass::run(
+    Module &M, ModuleAnalysisManager &) {
+  if (auto *F = M.getFunction(KernelName)) {  // skipped if M has no such fn
+    setOrigFnName(*F);
+    setIsKernelEntryPt(*F);
+    if (LocalSizes) {  // local sizes are optional
+      encodeLocalSizeMetadata(*F, *LocalSizes);
+    }
+  }
+  return PreservedAnalyses::all();
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/group_collective_helpers.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/group_collective_helpers.cpp
new file mode 100644
index 0000000000000..e808e0494f716
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/group_collective_helpers.cpp
@@ -0,0 +1,73 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <compiler/utils/group_collective_helpers.h>
+#include <compiler/utils/mangling.h>
+#include <llvm/ADT/StringSwitch.h>
+#include <llvm/IR/Constant.h>
+#include <llvm/IR/Type.h>
+#include <llvm/IR/Value.h>
+
+using namespace llvm;
+// Yields the constant of type Ty used as the neutral/identity element for the
+// operation Kind, or nullptr for kinds not handled below. UseNaN selects NaN
+// over infinity for FMin/FMax; UseFZero selects +0.0 over -0.0 for FAdd.
+static llvm::Constant *getNeutralIdentityHelper(RecurKind Kind, Type *Ty,
+                                                bool UseNaN, bool UseFZero) {
+  const auto Bits = [Ty] { return Ty->getScalarSizeInBits(); };
+  switch (Kind) {
+    case RecurKind::Or:
+    case RecurKind::Xor:
+    case RecurKind::Add:
+      return ConstantInt::getNullValue(Ty);
+    case RecurKind::And:
+      return ConstantInt::getAllOnesValue(Ty);
+    case RecurKind::Mul:
+      return ConstantInt::get(Ty, 1);
+    case RecurKind::UMin:
+      return ConstantInt::get(Ty, APInt::getMaxValue(Bits()));
+    case RecurKind::UMax:
+      return ConstantInt::get(Ty, APInt::getMinValue(Bits()));
+    case RecurKind::SMin:
+      return ConstantInt::get(Ty, APInt::getSignedMaxValue(Bits()));
+    case RecurKind::SMax:
+      return ConstantInt::get(Ty, APInt::getSignedMinValue(Bits()));
+    case RecurKind::FMul:
+      return ConstantFP::get(Ty, 1.0);
+    case RecurKind::FAdd:
+      // -0.0 + 0.0 = 0.0, so -0.0 (not 0.0) is the neutral value for floats
+      // under addition; +0.0 is handed out only when UseFZero asks for it.
+      return ConstantFP::get(Ty, UseFZero ? 0.0 : -0.0);
+    case RecurKind::FMin:
+      return UseNaN ? ConstantFP::getQNaN(Ty, /*Negative*/ false)
+                    : ConstantFP::getInfinity(Ty, /*Negative*/ false);
+    case RecurKind::FMax:
+      return UseNaN ? ConstantFP::getQNaN(Ty, /*Negative*/ true)
+                    : ConstantFP::getInfinity(Ty, /*Negative*/ true);
+    default:
+      return nullptr;
+  }
+}
+
+llvm::Constant *compiler::utils::getNeutralVal(RecurKind Kind, Type *Ty) {
+  return getNeutralIdentityHelper(Kind, Ty, /*UseNaN*/ true,
+                                  /*UseFZero*/ false);  // NaN / -0.0 flavour
+}
+
+llvm::Constant *compiler::utils::getIdentityVal(RecurKind Kind, Type *Ty) {
+  return getNeutralIdentityHelper(Kind, Ty, /*UseNaN*/ false,
+                                  /*UseFZero*/ true);  // inf / +0.0 flavour
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mangling.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mangling.cpp
new file mode 100644
index 0000000000000..45dc86ec4edaa
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mangling.cpp
@@ -0,0 +1,912 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <compiler/utils/mangling.h>
+#include <compiler/utils/target_extension_types.h>
+#include <llvm/IR/Constants.h>
+#include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/Module.h>
+#include <llvm/Support/TypeSize.h>
+#include <llvm/Support/raw_ostream.h>
+#include <multi_llvm/multi_llvm.h>
+#include <multi_llvm/vector_type_helper.h>
+
+#include <cstring>
+#include <optional>
+
+namespace compiler {
+namespace utils {
+using namespace llvm;
+
+NameMangler::NameMangler(LLVMContext *context) : Context(context) {}
+
+std::string NameMangler::mangleName(StringRef Name, ArrayRef<Type *> Tys,
+                                    ArrayRef<TypeQualifiers> Quals) {
+  std::string MangledName;
+  raw_string_ostream O(MangledName);
+  O << "_Z" << Name.size() << Name;  // "_Z" + length-prefixed name
+  for (unsigned i = 0; i < Tys.size(); i++) {  // assumes |Quals| >= |Tys|
+    const ArrayRef<Type *> PrevTys = Tys.slice(0, i);  // earlier arguments
+    const ArrayRef<TypeQualifiers> PrevQuals = Quals.slice(0, i);
+    if (!mangleType(O, Tys[i], Quals[i], PrevTys, PrevQuals)) {
+      return std::string();  // an argument type could not be mangled
+    }
+  }
+  O.flush();  // make sure everything written to O has reached MangledName
+  return MangledName;
+}
+
+StringRef NameMangler::demangleName(
+    StringRef Name, SmallVectorImpl<llvm::Type *> &Types,
+    SmallVectorImpl<llvm::Type *> &PointerElementTypes,
+    SmallVectorImpl<TypeQualifiers> &Quals) {
+  // Parse the name part.
+  Lexer L(Name);
+  Name = demangleName(L);
+  if (Name.empty()) {
+    return StringRef{};
+  }
+  // Parse the argument part, appending one entry per argument to each output
+  // vector. Entries appended before a failing argument are not rolled back.
+  while (L.Left() > 0) {
+    Type *ArgTy = nullptr;
+    Type *ArgEltTy = nullptr;
+    TypeQualifiers ArgQuals;
+    if (!demangleType(L, ArgTy, &ArgEltTy, ArgQuals, Types, Quals)) {
+      return StringRef{};
+    }
+    Types.push_back(ArgTy);
+    PointerElementTypes.push_back(ArgEltTy);
+    Quals.push_back(ArgQuals);
+  }
+  return Name;
+}
+
+StringRef NameMangler::demangleName(StringRef Name,
+                                    SmallVectorImpl<llvm::Type *> &Types,
+                                    SmallVectorImpl<TypeQualifiers> &Quals) {
+  SmallVector<llvm::Type *, 4> EltTys;  // pointee types are discarded
+  return demangleName(Name, Types, EltTys, Quals);
+}
+
+StringRef NameMangler::demangleName(StringRef Name) {
+  // Only the identifier is extracted here; whatever follows it in the mangled
+  // string is ignored. Input for which that extraction yields nothing is
+  // handed back unchanged.
+  Lexer L(Name);
+  const StringRef Parsed = demangleName(L);
+  return Parsed.empty() ? Name : Parsed;
+}
+
+// Maps substitution number SubID to an index into Tys. Only non-builtin
+// entries are numbered (builtin types cannot be substituted), starting from
+// zero in argument order. Returns -1 if there are too few such entries.
+int NameMangler::resolveSubstitution(unsigned SubID,
+                                     SmallVectorImpl<Type *> &Tys,
+                                     SmallVectorImpl<TypeQualifiers> &Quals) {
+  unsigned Remaining = SubID;
+  for (unsigned Idx = 0; Idx < Tys.size(); ++Idx) {
+    if (isTypeBuiltin(Tys[Idx], Quals[Idx])) {
+      continue;
+    }
+    // Idx is a candidate; it is the answer once SubID earlier candidates
+    // have been stepped over.
+    if (Remaining == 0) {
+      return static_cast<int>(Idx);
+    }
+    --Remaining;
+  }
+  // Ran out of candidates before reaching SubID.
+  return -1;
+}
+
+// Writes an Itanium-style substitution ("S_", "S0_", "S1_", ...) to O when Ty
+// repeats an earlier non-builtin argument; returns false if nothing written.
+bool NameMangler::emitSubstitution(raw_ostream &O, Type *Ty,
+                                   TypeQualifiers Quals,
+                                   ArrayRef<Type *> PrevTys,
+                                   ArrayRef<TypeQualifiers> PrevQuals) {
+  if (isTypeBuiltin(Ty, Quals)) {
+    return false;
+  }
+  // Look for a previously-mangled non-builtin type we could use as a
+  // substitution. SubstitutionID counts non-builtin candidates from zero.
+  int SubstitutionID = -1;
+  bool FoundMatch = false;
+  for (unsigned j = 0; j < PrevTys.size(); j++) {
+    Type *PrevTy = PrevTys[j];
+    TypeQualifiers PrevQual = PrevQuals[j];
+    if (!isTypeBuiltin(PrevTy, PrevQual)) {
+      SubstitutionID++;
+      if ((PrevTy == Ty) && (PrevQual == Quals)) {
+        FoundMatch = true;
+        break;
+      }
+    }
+  }
+  if (!FoundMatch) {
+    return false;
+  }
+  // Itanium numbering: candidate 0 is "S_", candidate N > 0 is "S<N-1>_".
+  O << "S";
+  if (SubstitutionID > 0) {
+    O << (SubstitutionID - 1);
+  }
+  O << "_";
+  return true;
+}
+
+bool NameMangler::isTypeBuiltin(Type *Ty, TypeQualifiers &Quals) {
+  (void)Quals;  // qualifiers do not affect the classification
+  switch (Ty->getTypeID()) {
+    default:  // any type ID not listed below is treated as non-builtin
+    case Type::StructTyID:
+    case Type::ArrayTyID:
+    case Type::PointerTyID:
+    case Type::FixedVectorTyID:
+      return false;
+    case Type::VoidTyID:
+    case Type::HalfTyID:
+    case Type::FloatTyID:
+    case Type::DoubleTyID:
+    case Type::IntegerTyID:
+      return true;  // void, floating-point and integer types
+  }
+}
+
+const char *NameMangler::mangleSimpleType(Type *Ty, TypeQualifier Qual) {
+  const bool IsSigned = (Qual & eTypeQualSignedInt);
+  switch (Ty->getTypeID()) {
+    default:
+      break;  // not a simple type; ends at the nullptr return below
+    case Type::VoidTyID:
+      return "v";
+    case Type::HalfTyID:
+      return "Dh";
+    case Type::FloatTyID:
+      return "f";
+    case Type::DoubleTyID:
+      return "d";
+    case Type::IntegerTyID:
+      switch (cast<IntegerType>(Ty)->getBitWidth()) {
+        default:
+          break;  // other integer widths have no mangling here
+        case 1:
+          return "b";  // bool
+        case 8:
+          return IsSigned ? "c" : "h";  // char / unsigned char
+        case 16:
+          return IsSigned ? "s" : "t";  // short / unsigned short
+        case 32:
+          return IsSigned ? "i" : "j";  // int / unsigned int
+        case 64:
+          return IsSigned ? "l" : "m";  // long / unsigned long
+      }
+  }
+  return nullptr;
+}
+
+bool NameMangler::mangleType(raw_ostream &O, Type *Ty, TypeQualifiers Qual) {
+  return mangleType(O, Ty, Qual, ArrayRef<Type *>(),
+                    ArrayRef<TypeQualifiers>());  // no prior types: no S_ refs
+}
+
+static void manglePointerQuals(raw_ostream &O, TypeQualifier Qual,
+                               unsigned AddressSpace) {
+  if (Qual & eTypeQualPointerRestrict) {
+    O << 'r';  // restrict
+  }
+  if (Qual & eTypeQualPointerVolatile) {
+    O << 'V';  // volatile
+  }
+  if (Qual & eTypeQualPointerConst) {
+    O << 'K';  // const
+  }
+  if (AddressSpace > 0) {
+    O << "U3AS" << AddressSpace;  // e.g. "U3AS1"; omitted for space 0
+  }
+}
+
+bool NameMangler::mangleType(raw_ostream &O, Type *Ty, TypeQualifiers Quals,
+                             ArrayRef<Type *> PrevTys,
+                             ArrayRef<TypeQualifiers> PrevQuals) {
+  if (emitSubstitution(O, Ty, Quals, PrevTys, PrevQuals)) {
+    return true;
+  }
+  // Pop this level's qualifier; Quals is a copy, the remainder is passed on.
+  const TypeQualifier Qual = Quals.pop_front();
+  if (const char *SimpleName = mangleSimpleType(Ty, Qual)) {
+    O << SimpleName;
+    return true;
+  } else if (isa<ScalableVectorType>(Ty)) {
+    std::string tmp;
+    raw_string_ostream Otmp(tmp);  // NOTE(review): tmp is read without flush()
+    auto *VecTy = cast<llvm::VectorType>(Ty);
+    Otmp << "nxv"
+         << multi_llvm::getVectorElementCount(VecTy).getKnownMinValue();
+    if (!mangleType(Otmp, VecTy->getElementType(), Quals, PrevTys, PrevQuals)) {
+      return false;
+    }
+    O << "u" << tmp.size() << tmp;  // "u" + length-prefixed "nxv<N><elt>"
+    return true;
+  } else if (Ty->isVectorTy()) {
+    auto *VecTy = cast<FixedVectorType>(Ty);
+    O << "Dv" << VecTy->getNumElements() << "_";  // then the element type
+    return mangleType(O, VecTy->getElementType(), Quals, PrevTys, PrevQuals);
+  } else if (Ty->isPointerTy()) {
+    PointerType *PtrTy = cast<PointerType>(Ty);
+    const unsigned AddressSpace = PtrTy->getAddressSpace();
+#if LLVM_VERSION_LESS(17, 0)
+    assert(PtrTy->isOpaque() && "No support for typed pointers past LLVM 15");
+#endif
+    O << "u3ptr";  // only qualifiers follow; no pointee type is written
+    manglePointerQuals(O, Qual, AddressSpace);
+    return true;
+#if LLVM_VERSION_GREATER_EQUAL(17, 0)
+  } else if (Ty->isTargetExtTy()) {
+    if (auto Name = mangleBuiltinType(Ty)) {
+      O << *Name;
+      return true;
+    }
+    return false;
+#endif
+  } else {
+    return false;  // no mangling for any other kind of type
+  }
+}
+
+bool NameMangler::demangleSimpleType(Lexer &L, Type *&Ty, TypeQualifier &Qual) {
+  const int c = L.Current();
+  Ty = nullptr;
+  Qual = eTypeQualNone;
+  if ((c < 0) || !Context) {
+    return false;
+  }
+  // Dispatch on the type code; every code is one character except "Dh".
+  switch (c) {
+    default:
+      return false;
+    case 'v':
+      Ty = llvm::Type::getVoidTy(*Context);
+      break;
+    case 'D':
+      if (!L.Consume("Dh")) {
+        return false;
+      }
+      Ty = llvm::Type::getHalfTy(*Context);
+      return true;  // "Dh" is already consumed; skip the Consume() below
+    case 'f':
+      Ty = llvm::Type::getFloatTy(*Context);
+      break;
+    case 'd':
+      Ty = llvm::Type::getDoubleTy(*Context);
+      break;
+    case 'b':
+      Ty = llvm::Type::getInt1Ty(*Context);
+      break;
+    case 'c':
+    case 'h':
+      Ty = llvm::Type::getInt8Ty(*Context);
+      if (c == 'c') {  // 'c' is the signed spelling, 'h' the unsigned one
+        Qual = eTypeQualSignedInt;
+      }
+      break;
+    case 's':
+    case 't':
+      Ty = llvm::Type::getInt16Ty(*Context);
+      if (c == 's') {  // 's' signed, 't' unsigned
+        Qual = eTypeQualSignedInt;
+      }
+      break;
+    case 'i':
+    case 'j':
+      Ty = llvm::Type::getInt32Ty(*Context);
+      if (c == 'i') {  // 'i' signed, 'j' unsigned
+        Qual = eTypeQualSignedInt;
+      }
+      break;
+    case 'l':
+    case 'm':
+      Ty = llvm::Type::getInt64Ty(*Context);
+      if (c == 'l') {  // 'l' signed, 'm' unsigned
+        Qual = eTypeQualSignedInt;
+      }
+      break;
+  }
+  L.Consume();  // step over the single-character code
+  return true;
+}
+
+std::optional<std::string> NameMangler::mangleBuiltinType(Type *Ty) {
+  // Returns the length-prefixed OpenCL name for a target extension type, or
+  // std::nullopt if the type has no known mangling.
+  // Before LLVM 17 there are no target extension types and, with opaque
+  // pointers, our APIs cannot mangle a pointer based on its element type, so
+  // nothing can be mangled here. This is never a problem in the compiler as we
+  // don't generate such functions on the fly, but it is a weakness in the API
+  // until LLVM 17 becomes the minimum version.
+#if LLVM_VERSION_LESS(17, 0)
+  (void)Ty;
+  return std::nullopt;
+#else
+  auto *const TgtTy = cast<TargetExtType>(Ty);
+  const StringRef Name = TgtTy->getName();
+
+  if (Name == "spirv.Event") {
+    return "9ocl_event";
+  }
+
+  if (Name == "spirv.Sampler") {
+    return "11ocl_sampler";
+  }
+
+  if (Name != "spirv.Image") {
+    // FIXME: Some types don't have official target extension types.
+    // "opencl.clk_event_t" -> "12ocl_clkevent"
+    // "opencl.queue_t" -> "9ocl_queue"
+    // "opencl.ndrange_t" -> "11ocl_ndrange"
+    // "opencl.reserve_id_t" -> "13ocl_reserveid"
+    return std::nullopt;
+  }
+  // Images: assemble the name from the type's integer parameters.
+  auto Dim = TgtTy->getIntParameter(tgtext::ImageTyDimensionalityIdx);
+  auto Depth = TgtTy->getIntParameter(tgtext::ImageTyDepthIdx);
+  auto Arrayed = TgtTy->getIntParameter(tgtext::ImageTyArrayedIdx);
+  auto MS = TgtTy->getIntParameter(tgtext::ImageTyMSIdx);
+
+  std::string MangledName = "ocl_image";
+
+  switch (Dim) {
+    default:
+      return std::nullopt;
+    case tgtext::ImageDim1D:
+      MangledName += "1d";
+      break;
+    case tgtext::ImageDim2D:
+      MangledName += "2d";
+      break;
+    case tgtext::ImageDim3D:
+      MangledName += "3d";
+      break;
+    case tgtext::ImageDimBuffer:
+      MangledName += "1dbuffer";
+      break;
+  }
+  // Suffixes are appended in the fixed order: array, msaa, depth.
+  if (Arrayed == tgtext::ImageArrayed) {
+    MangledName += "array";
+  }
+
+  if (MS == tgtext::ImageMSMultiSampled) {
+    MangledName += "msaa";
+  }
+
+  if (Depth == tgtext::ImageDepth) {
+    MangledName += "depth";
+  }
+  // Prefix the identifier with its length, e.g. "11ocl_image2d".
+  return std::to_string(MangledName.size()) + MangledName;
+#endif
+}
+
+// Demangles a length-prefixed OpenCL builtin type name at the lexer position
+// into Ty. Returns false if no known name matches or there is no context.
+bool NameMangler::demangleOpenCLBuiltinType(Lexer &L, Type *&Ty) {
+  if (!Context) {  // as in demangleSimpleType(): no context, no types
+    return false;
+  }
+  if (L.Consume("12memory_scope") || L.Consume("12memory_order")) {
+    Ty = IntegerType::getInt32Ty(*Context);
+    return true;
+  }
+#if LLVM_VERSION_GREATER_EQUAL(17, 0)
+  if (auto *TargetExtTy = [this, &L]() -> Type * {
+        if (L.Consume("11ocl_image1d")) {
+          return compiler::utils::tgtext::getImage1DTy(*Context);
+        } else if (L.Consume("16ocl_image1darray")) {
+          return compiler::utils::tgtext::getImage1DArrayTy(*Context);
+        } else if (L.Consume("17ocl_image1dbuffer")) {
+          return compiler::utils::tgtext::getImage1DBufferTy(*Context);
+        } else if (L.Consume("11ocl_image2d")) {
+          return compiler::utils::tgtext::getImage2DTy(*Context);
+        } else if (L.Consume("16ocl_image2darray")) {
+          return compiler::utils::tgtext::getImage2DArrayTy(*Context);
+        } else if (L.Consume("16ocl_image2ddepth")) {
+          return compiler::utils::tgtext::getImage2DTy(*Context, /*Depth*/ true,
+                                                       /*MS*/ false);
+        } else if (L.Consume("21ocl_image2darraydepth")) {
+          return compiler::utils::tgtext::getImage2DArrayTy(
+              *Context, /*Depth*/ true, /*MS*/ false);
+        } else if (L.Consume("15ocl_image2dmsaa")) {
+          return compiler::utils::tgtext::getImage2DTy(
+              *Context, /*Depth*/ false, /*MS*/ true);
+        } else if (L.Consume("20ocl_image2darraymsaa")) {
+          return compiler::utils::tgtext::getImage2DArrayTy(
+              *Context, /*Depth*/ false, /*MS*/ true);
+        } else if (L.Consume("20ocl_image2dmsaadepth")) {
+          return compiler::utils::tgtext::getImage2DTy(*Context, /*Depth*/ true,
+                                                       /*MS*/ true);
+        } else if (L.Consume("25ocl_image2darraymsaadepth")) {
+          return compiler::utils::tgtext::getImage2DArrayTy(
+              *Context, /*Depth*/ true, /*MS*/ true);
+        } else if (L.Consume("11ocl_image3d")) {
+          return compiler::utils::tgtext::getImage3DTy(*Context);
+        } else if (L.Consume("11ocl_sampler")) {
+          return compiler::utils::tgtext::getSamplerTy(*Context);
+        } else if (L.Consume("9ocl_event")) {
+          return compiler::utils::tgtext::getEventTy(*Context);
+        }
+        return nullptr;
+      }()) {
+    Ty = TargetExtTy;
+    return true;
+  }
+#endif
+  StringRef Name;
+  // TODO: Avoid hard coded name. See redmine issue #8656 please.
+  if (L.Consume("11ocl_image1d")) {
+    Name = "opencl.image1d_t";
+  } else if (L.Consume("16ocl_image1darray")) {
+    Name = "opencl.image1d_array_t";
+  } else if (L.Consume("17ocl_image1dbuffer")) {
+    Name = "opencl.image1d_buffer_t";
+  } else if (L.Consume("11ocl_image2d")) {
+    Name = "opencl.image2d_t";
+  } else if (L.Consume("16ocl_image2darray")) {
+    Name = "opencl.image2d_array_t";
+  } else if (L.Consume("16ocl_image2ddepth")) {
+    Name = "opencl.image2d_depth_t";
+  } else if (L.Consume("21ocl_image2darraydepth")) {
+    Name = "opencl.image2d_array_depth_t";
+  } else if (L.Consume("15ocl_image2dmsaa")) {
+    Name = "opencl.image2d_msaa_t";
+  } else if (L.Consume("20ocl_image2darraymsaa")) {
+    Name = "opencl.image2d_array_msaa_t";
+  } else if (L.Consume("20ocl_image2dmsaadepth")) {
+    Name = "opencl.image2d_msaa_depth_t";
+  } else if (L.Consume("25ocl_image2darraymsaadepth")) {
+    Name = "opencl.image2d_array_msaa_depth_t";
+  } else if (L.Consume("11ocl_image3d")) {
+    Name = "opencl.image3d_t";
+  } else if (L.Consume("11ocl_sampler")) {
+    Name = "opencl.sampler_t";
+  } else if (L.Consume("9ocl_event")) {
+    Name = "opencl.event_t";
+  } else if (L.Consume("12ocl_clkevent")) {
+    Name = "opencl.clk_event_t";
+  } else if (L.Consume("9ocl_queue")) {
+    Name = "opencl.queue_t";
+  } else if (L.Consume("11ocl_ndrange")) {
+    Name = "opencl.ndrange_t";
+  } else if (L.Consume("13ocl_reserveid")) {
+    Name = "opencl.reserve_id_t";
+  } else {
+    return false;
+  }
+  if (auto *const OpenCLType =
+          llvm::StructType::getTypeByName(*Context, Name)) {
+    Ty = OpenCLType;
+  } else {
+    Ty = llvm::StructType::create(*Context, Name);
+  }
+  return true;
+}
+
+struct PointerASQuals {
+  unsigned AS;         // address space number
+  TypeQualifier Qual;  // pointer qualifier
+};
+
+static std::optional<PointerASQuals> demanglePointerQuals(Lexer &L) {
+  TypeQualifier PointerQual = eTypeQualNone;
+
+  // Fail if the lexer has no current character (presumably end of input).
+  if (L.Current() < 0) {
+    return std::nullopt;
+  }
+
+  // Parse the optional address space qualifier.
+  bool DemangledAS = false;
+  unsigned AddressSpace = 0;
+
+  if (L.Consume("U3AS")) {
+    if (!L.ConsumeInteger(AddressSpace)) {
+      return std::nullopt;
+    }
+    DemangledAS = true;
+  }
+  // Parse the optional pointer qualifier; at most one of K/r/V is accepted.
+  switch (L.Current()) {
+    default:
+      break;
+    case 'K':
+      PointerQual = eTypeQualPointerConst;
+      L.Consume();
+      break;
+    case 'r':
+      PointerQual = eTypeQualPointerRestrict;
+      L.Consume();
+      break;
+    case 'V':
+      PointerQual = eTypeQualPointerVolatile;
+      L.Consume();
+      break;
+  }
+  // The address space may instead follow the qualifier.
+  if (!DemangledAS && L.Consume("U3AS") && !L.ConsumeInteger(AddressSpace)) {
+    return std::nullopt;
+  }
+
+  return PointerASQuals{AddressSpace, PointerQual};
+}
+
+// Demangles one type at the lexer position into Ty and appends its
+// qualifier(s) to Quals. CtxTypes/CtxQuals are the already-demangled arguments
+// that "S..._" substitutions refer to. Returns false on malformed input.
+bool NameMangler::demangleType(Lexer &L, Type *&Ty, Type **PointerEltTy,
+                               TypeQualifiers &Quals,
+                               SmallVectorImpl<llvm::Type *> &CtxTypes,
+                               SmallVectorImpl<TypeQualifiers> &CtxQuals) {
+  Ty = nullptr;
+  if (L.Left() < 1) {
+    return false;
+  }
+
+  // Assume the element type is null, and set it if we find a pointer.
+  if (PointerEltTy) {
+    *PointerEltTy = nullptr;
+  }
+
+  // Match vector types.
+  if (L.Consume("Dv")) {
+    const TypeQualifier VectorQual = eTypeQualNone;
+    unsigned NumElements = 0;
+    Quals.push_back(VectorQual);
+    if (!L.ConsumeInteger(NumElements) || !L.Consume("_")) {
+      return false;
+    }
+
+    // Parse the vector element type.
+    Type *ElementType = nullptr;
+    if (!demangleType(L, ElementType, nullptr, Quals, CtxTypes, CtxQuals)) {
+      return false;
+    }
+    Ty = FixedVectorType::get(ElementType, NumElements);
+    return true;
+  }
+
+  // Match opaque pointer types
+  if (L.Consume("u3ptr")) {
+    const auto QualsAS = demanglePointerQuals(L);
+    if (!QualsAS) {
+      return false;
+    }
+    Quals.push_back(QualsAS->Qual);
+    // Opaque pointers carry no pointee type, so build one from the context.
+    Ty = PointerType::get(*Context, QualsAS->AS);
+    return true;
+  }
+
+  // Match scalable vector types.
+  if (L.Consume("u")) {
+    unsigned TypeNameLength = 0;
+    if (!L.ConsumeInteger(TypeNameLength) || !L.Consume("nxv")) {
+      return false;
+    }
+    if (TypeNameLength > L.Left()) {
+      return false;
+    }
+    const TypeQualifier VectorQual = eTypeQualNone;
+    unsigned NumElements = 0;
+    Quals.push_back(VectorQual);
+    if (!L.ConsumeInteger(NumElements)) {
+      return false;
+    }
+
+    // Parse the vector element type.
+    Type *ElementType = nullptr;
+    if (!demangleType(L, ElementType, nullptr, Quals, CtxTypes, CtxQuals)) {
+      return false;
+    }
+    Ty = llvm::VectorType::get(ElementType,
+                               ElementCount::getScalable(NumElements));
+    return true;
+  }
+
+  // Match pointer types.
+  if (L.Consume("P")) {
+    const auto QualsAS = demanglePointerQuals(L);
+    if (!QualsAS) {
+      return false;
+    }
+    Quals.push_back(QualsAS->Qual);
+    // Parse the element type.
+    Type *ElementType = nullptr;
+    if (!demangleType(L, ElementType, nullptr, Quals, CtxTypes, CtxQuals)) {
+      return false;
+    }
+    assert(ElementType);
+    if (PointerEltTy) {
+      *PointerEltTy = ElementType;
+    }
+    if (ElementType->isVoidTy()) {
+      Ty = llvm::PointerType::get(Type::getInt8Ty(*Context), QualsAS->AS);
+    } else {
+      Ty = llvm::PointerType::get(ElementType, QualsAS->AS);
+    }
+    return true;
+  }
+
+  // Match simple types.
+  TypeQualifier SimpleQual = eTypeQualNone;
+  if (demangleSimpleType(L, Ty, SimpleQual)) {
+    Quals.push_back(SimpleQual);
+    return true;
+  }
+
+  // Handle substitutions.
+  if (L.Consume("S")) {
+    unsigned SubID = 0;
+    if (L.ConsumeInteger(SubID)) {
+      SubID++;
+    }
+    if (!L.Consume("_")) {
+      return false;
+    }
+
+    // Resolve it, using a previous type and qualifier.
+    const int entryIndex = resolveSubstitution(SubID, CtxTypes, CtxQuals);
+    if ((entryIndex < 0) || ((unsigned)entryIndex >= CtxTypes.size())) {
+      return false;
+    }
+    Ty = CtxTypes[entryIndex];
+    Quals.push_back(CtxQuals[entryIndex]);
+    return true;
+  }
+
+  // Finally, try the OpenCL builtin type names.
+  return demangleOpenCLBuiltinType(L, Ty);
+}
+
+StringRef NameMangler::demangleName(Lexer &L) {
+  // Expected shape: "_Z", a decimal length, then that many characters of
+  // name. Any deviation yields an empty StringRef.
+  unsigned Len = 0;
+  const bool HasHeader = L.Consume("_Z") && L.ConsumeInteger(Len);
+  if (!HasHeader || Len > L.Left()) {
+    return StringRef();
+  }
+  // Take the view first, then step the lexer past the name.
+  const StringRef Ident = L.TextLeft().substr(0, Len);
+  L.Consume(Len);
+  return Ident;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Every constructor starts from an empty list (storage_ == 0) and appends
+// its arguments in order through push_back().
+TypeQualifiers::TypeQualifiers() : storage_(0) {}
+
+TypeQualifiers::TypeQualifiers(TypeQualifier Qual) : TypeQualifiers() {
+  push_back(Qual);
+}
+TypeQualifiers::TypeQualifiers(TypeQualifier Qual1, TypeQualifier Qual2)
+    : TypeQualifiers(Qual1) {
+  push_back(Qual2);
+}
+TypeQualifiers::TypeQualifiers(unsigned Qual) : TypeQualifiers() {
+  push_back(Qual);
+}
+TypeQualifiers::TypeQualifiers(unsigned Qual1, unsigned Qual2)
+    : TypeQualifiers(Qual1) {
+  push_back(Qual2);
+}
+
+TypeQualifiers::StorageT TypeQualifiers::getCount() const {
+  // The element count lives in the lowest NumCountBits bits of storage_.
+  return storage_ & static_cast<StorageT>((1 << NumCountBits) - 1);
+}
+
+void TypeQualifiers::setCount(StorageT NewCount) {
+  // The count occupies the low NumCountBits bits; everything above it holds
+  // the packed qualifiers and must be left untouched.
+  const StorageT CountMask = ((1 << NumCountBits) - 1);
+  const StorageT Kept = storage_ & ~CountMask;
+  storage_ = Kept | (NewCount & CountMask);
+}
+
+TypeQualifier TypeQualifiers::front() const {
+  // An empty list has no first element; report "no qualifier".
+  if (getCount() == 0) {
+    return eTypeQualNone;
+  }
+  // The first qualifier sits directly above the count bits.
+  const unsigned QualMask = (1 << NumQualBits) - 1;
+  return static_cast<TypeQualifier>((storage_ >> NumCountBits) & QualMask);
+}
+
+TypeQualifier TypeQualifiers::pop_front() {
+  const StorageT OldCount = getCount();
+  const TypeQualifier Head = front();
+  if (OldCount == 0) {
+    return Head;  // Empty list: front() already gave eTypeQualNone.
+  }
+  // The shift drops the head but clobbers the count bits, so rewrite them.
+  storage_ >>= NumQualBits;
+  setCount(OldCount - 1);
+  return Head;
+}
+
+TypeQualifier TypeQualifiers::at(unsigned Idx) const {
+  // Out-of-range indices read as "no qualifier".
+  if (Idx >= getCount()) {
+    return eTypeQualNone;
+  }
+  const unsigned QualMask = (1 << NumQualBits) - 1;
+  const unsigned BitPos = NumCountBits + (Idx * NumQualBits);
+  return static_cast<TypeQualifier>((storage_ >> BitPos) & QualMask);
+}
+
+bool TypeQualifiers::push_back(TypeQualifier Qual) {
+  const StorageT Count = getCount();
+  if (Count == MaxSize) {
+    return false;  // Full: the list is left unchanged.
+  }
+  // OR the masked qualifier into the first free slot above the count bits.
+  const unsigned Slot = NumCountBits + (Count * NumQualBits);
+  storage_ |= static_cast<StorageT>(Qual & ((1 << NumQualBits) - 1)) << Slot;
+  setCount(Count + 1);
+  return true;
+}
+
+bool TypeQualifiers::push_back(unsigned Qual) {
+  return push_back(static_cast<TypeQualifier>(Qual));  // Enum overload.
+}
+
+bool TypeQualifiers::push_back(TypeQualifiers Quals) {
+  for (StorageT Left = Quals.getCount(); Left > 0; --Left) {
+    if (!push_back(Quals.pop_front())) {
+      return false;  // Qualifiers appended so far are kept.
+    }
+  }
+  return true;  // Quals was a by-value copy; the caller's list is intact.
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Starts lexing at the beginning of 'text'.
+Lexer::Lexer(StringRef text) : Text(text), Pos(0) {}
+// Number of characters not yet consumed.
+unsigned Lexer::Left() const { return Text.size() - Pos; }
+// Index of the next character to be consumed.
+unsigned Lexer::CurrentPos() const { return Pos; }
+// View of the unconsumed remainder of the text.
+StringRef Lexer::TextLeft() const { return Text.substr(Pos); }
+// Next character as a non-negative value (so 0xFF != EOF), or -1 at the end.
+int Lexer::Current() const { return Left() ? (unsigned char)Text[Pos] : -1; }
+bool Lexer::Consume() { return Consume(1); }
+
+bool Lexer::Consume(unsigned Size) {
+  // Only advance when at least Size characters remain; otherwise the
+  // position is left where it was.
+  const bool Fits = Size <= Left();
+  Pos += Fits ? Size : 0;
+  return Fits;
+}
+
+bool Lexer::Consume(StringRef Pattern) {
+  // Succeeds, and steps past Pattern, only when the unconsumed text begins
+  // with it; on a mismatch the position does not move.
+  if (Left() < Pattern.size() || !TextLeft().starts_with(Pattern)) {
+    return false;
+  }
+  Pos += Pattern.size();
+  return true;
+}
+
+// Consumes a run of decimal digits and stores its value in Result. Returns
+// false, without moving, if there are no digits or getAsInteger() rejects
+// them (e.g. the value does not fit in an unsigned).
+bool Lexer::ConsumeInteger(unsigned &Result) {
+  size_t End = Pos;
+  // Cast to unsigned char: isdigit() is undefined for negative arguments,
+  // which plain char can produce for bytes >= 0x80.
+  while ((End < Text.size()) && isdigit((unsigned char)Text[End])) {
+    ++End;
+  }
+  const StringRef NumText = Text.substr(Pos, End - Pos);
+  if (NumText.empty() || NumText.getAsInteger(10, Result)) {
+    return false;
+  }
+  Pos = End;
+  return true;
+}
+
+// Like ConsumeInteger(), but accepts an optional leading '-'. Returns false,
+// without moving, if no valid integer starts at the current position.
+bool Lexer::ConsumeSignedInteger(int &Result) {
+  size_t End = Pos;
+  // Guard the sign check: at end of input there is no character to inspect
+  // and indexing Text there would be out of bounds.
+  if ((End < Text.size()) && Text[End] == '-') {
+    ++End;
+  }
+  // Cast to unsigned char: isdigit() is undefined for negative arguments.
+  while ((End < Text.size()) && isdigit((unsigned char)Text[End])) {
+    ++End;
+  }
+  const StringRef NumText = Text.substr(Pos, End - Pos);
+  // A lone "-" is rejected here by getAsInteger().
+  if (NumText.empty() || NumText.getAsInteger(10, Result)) {
+    return false;
+  }
+  Pos = End;
+  return true;
+}
+
+// Consumes a non-empty run of alphabetic characters into Result.
+bool Lexer::ConsumeAlpha(StringRef &Result) {
+  size_t End = Pos;
+  // Cast to unsigned char: isalpha() is undefined for negative arguments.
+  while ((End < Text.size()) && isalpha((unsigned char)Text[End])) {
+    ++End;
+  }
+  if (End == Pos) {
+    return false;  // Nothing matched; Result and the position are untouched.
+  }
+  Result = Text.substr(Pos, End - Pos);
+  Pos = End;
+  return true;
+}
+
+// Consumes a non-empty run of letters and digits into Result.
+bool Lexer::ConsumeAlphanumeric(StringRef &Result) {
+  size_t End = Pos;
+  // Cast to unsigned char: isalnum() is undefined for negative arguments.
+  while ((End < Text.size()) && isalnum((unsigned char)Text[End])) {
+    ++End;
+  }
+  if (End == Pos) {
+    return false;  // Nothing matched; Result and the position are untouched.
+  }
+  Result = Text.substr(Pos, End - Pos);
+  Pos = End;
+  return true;
+}
+
+bool Lexer::ConsumeUntil(char C, StringRef &Result) {
+  // Looks for C at or after the current position. When found, Result is the
+  // text in front of it and the lexer stops on C itself; otherwise Result is
+  // emptied and the position stays put.
+  const size_t Hit = Text.find_first_of(C, Pos);
+  const bool Found = Hit != std::string::npos;
+  Result = Found ? Text.substr(Pos, Hit - Pos) : StringRef();
+  Pos = Found ? Hit : Pos;
+  return Found;
+}
+
+// Skips whitespace at the current position; true if any was skipped.
+bool Lexer::ConsumeWhitespace() {
+  const auto Start = Pos;
+  // Cast to unsigned char: isspace() is undefined for negative arguments.
+  while (Pos < Text.size() && isspace((unsigned char)Text[Pos])) {
+    ++Pos;
+  }
+  return Pos != Start;
+}
+}  // namespace utils
+}  // namespace compiler
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/metadata.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/metadata.cpp
new file mode 100644
index 0000000000000..179bf2480266e
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/metadata.cpp
@@ -0,0 +1,394 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <compiler/utils/metadata.h>
+#include <llvm/IR/Constants.h>
+#include <llvm/IR/Function.h>
+#include <llvm/IR/Module.h>
+
+using namespace llvm;
+
+namespace compiler {
+namespace utils {
+
+uint32_t getOpenCLVersion(const llvm::Module &m) {
+  // One "opencl.ocl.version" node of {major, minor}; otherwise OpenCLC12.
+  auto *const md = m.getNamedMetadata("opencl.ocl.version");
+  if (!md || md->getNumOperands() != 1) {
+    return OpenCLC12;
+  }
+  auto *const ver = md->getOperand(0);
+  if (ver->getNumOperands() != 2) {
+    return OpenCLC12;
+  }
+  const auto part = [ver](unsigned i) {
+    return mdconst::extract<ConstantInt>(ver->getOperand(i))->getZExtValue();
+  };
+  return (part(0) * 100 + part(1)) * 1000;
+}
+
+static constexpr const char *ReqdWGSizeMD = "reqd_work_group_size";
+
+static MDTuple *encodeVectorizationInfo(const VectorizationInfo &info,
+                                        LLVMContext &Ctx) {
+  // Layout: {known-min width, is-scalable, SIMD dimension, is-predicated},
+  // each stored as an i32 constant.
+  auto *const i32Ty = Type::getInt32Ty(Ctx);
+  const auto asMD = [i32Ty](uint64_t v) -> Metadata * {
+    return ConstantAsMetadata::get(ConstantInt::get(i32Ty, v));
+  };
+  return MDTuple::get(Ctx, {asMD(info.vf.getKnownMin()),
+                            asMD(info.vf.isScalable()), asMD(info.simdDimIdx),
+                            asMD(info.IsVectorPredicated)});
+}
+
+// Decodes the 4-operand tuple written by encodeVectorizationInfo(). Returns
+// std::nullopt if 'md' is null (callers pass the result of a dyn_cast) or
+// does not have exactly four operands.
+static std::optional<VectorizationInfo> extractVectorizationInfo(MDTuple *md) {
+  if (!md || md->getNumOperands() != 4) {
+    return std::nullopt;
+  }
+  auto *const widthMD = mdconst::extract<ConstantInt>(md->getOperand(0));
+  auto *const isScalableMD = mdconst::extract<ConstantInt>(md->getOperand(1));
+  auto *const simdDimIdxMD = mdconst::extract<ConstantInt>(md->getOperand(2));
+  auto *const isVPMD = mdconst::extract<ConstantInt>(md->getOperand(3));
+  VectorizationInfo info;
+  info.vf.setKnownMin(widthMD->getZExtValue());
+  info.vf.setIsScalable(isScalableMD->equalsInt(1));
+  info.simdDimIdx = simdDimIdxMD->getZExtValue();
+  info.IsVectorPredicated = isVPMD->equalsInt(1);
+  return info;
+}
+
+static std::optional<LinkMetadataResult> parseVectorLinkMD(MDNode *mdnode) {
+  // A non-tuple (or null) operand 0 yields nullopt, not a null dereference.
+  auto *const infoMD = dyn_cast_or_null<MDTuple>(mdnode->getOperand(0));
+  if (auto info = infoMD ? extractVectorizationInfo(infoMD) : std::nullopt) {
+    Function *vecFn = mdconst::extract_or_null<Function>(mdnode->getOperand(1));
+    return LinkMetadataResult(vecFn, *info);  // vecFn may well be null.
+  }
+  return std::nullopt;
+}
+
+void encodeVectorizationFailedMetadata(Function &f,
+                                       const VectorizationInfo &info) {
+  auto *veczInfo = encodeVectorizationInfo(info, f.getContext());
+  f.addMetadata("codeplay_ca_vecz.base.fail", *veczInfo);  // appends a node
+}
+
+void linkOrigToVeczFnMetadata(Function &origF, Function &vectorF,
+                              const VectorizationInfo &info) {
+  Metadata *const link[] = {encodeVectorizationInfo(info, origF.getContext()),
+                            ValueAsMetadata::get(&vectorF)};  // {info, vecFn}
+  origF.addMetadata("codeplay_ca_vecz.base",
+                    *MDTuple::get(origF.getContext(), link));
+}
+
+void linkVeczToOrigFnMetadata(Function &vectorizedF, Function &origF,
+                              const VectorizationInfo &info) {
+  auto *veczInfo = encodeVectorizationInfo(info, vectorizedF.getContext());
+  auto *const mdTuple = MDTuple::get(origF.getContext(),  // {info, origF}
+                                     {veczInfo, ValueAsMetadata::get(&origF)});
+  vectorizedF.addMetadata("codeplay_ca_vecz.derived", *mdTuple);  // appends
+}
+
+static bool parseVectorizedFunctionLinkMetadata(
+    Function &f, StringRef mdName,
+    SmallVectorImpl<LinkMetadataResult> &results) {
+  // False if nothing is attached under mdName or any node fails to parse.
+  SmallVector<MDNode *, 1> attached;
+  f.getMetadata(mdName, attached);
+  if (attached.empty()) {
+    return false;
+  }
+  results.reserve(results.size() + attached.size());
+  for (MDNode *const node : attached) {
+    const auto parsed = parseVectorLinkMD(node);
+    if (!parsed) {
+      return false;  // Links parsed before this node stay in 'results'.
+    }
+    results.emplace_back(*parsed);
+  }
+  return true;
+}
+
+// Collects every "codeplay_ca_vecz.base" link attached to 'f' into VFs.
+bool parseOrigToVeczFnLinkMetadata(Function &f,
+                                   SmallVectorImpl<LinkMetadataResult> &VFs) {
+  return parseVectorizedFunctionLinkMetadata(f, "codeplay_ca_vecz.base", VFs);
+}
+
+// Parses the "codeplay_ca_vecz.derived" link attached to 'f'.
+std::optional<LinkMetadataResult> parseVeczToOrigFnLinkMetadata(Function &f) {
+  auto *const derived = f.getMetadata("codeplay_ca_vecz.derived");
+  // No attachment means no link; otherwise parse it.
+  return derived ? parseVectorLinkMD(derived) : std::nullopt;
+}
+
+void dropVeczOrigMetadata(Function &f) {
+  f.setMetadata("codeplay_ca_vecz.base", nullptr);  // null clears the kind
+}
+
+void dropVeczDerivedMetadata(Function &f) {
+  f.setMetadata("codeplay_ca_vecz.derived", nullptr);  // null clears the kind
+}
+
+void encodeWrapperFnMetadata(Function &f, const VectorizationInfo &mainInfo,
+                             std::optional<VectorizationInfo> tailInfo) {
+  // The wrapper tuple is always {main, tail}; a missing tail is stored as a
+  // null operand so the operand count stays at two.
+  LLVMContext &ctx = f.getContext();
+  MDTuple *const mainMD = encodeVectorizationInfo(mainInfo, ctx);
+  MDTuple *const tailMD =
+      tailInfo ? encodeVectorizationInfo(*tailInfo, ctx) : nullptr;
+  Metadata *const ops[] = {mainMD, tailMD};
+
+  f.setMetadata("codeplay_ca_wrapper", MDTuple::get(ctx, ops));
+}
+
+std::optional<std::pair<VectorizationInfo, std::optional<VectorizationInfo>>>
+parseWrapperFnMetadata(Function &f) {
+  // The wrapper node must exist and hold exactly {main, tail}.
+  auto *const wrapperMD = f.getMetadata("codeplay_ca_wrapper");
+  if (!wrapperMD || wrapperMD->getNumOperands() != 2) {
+    return std::nullopt;
+  }
+
+  // The main entry is mandatory: it has to be a tuple that decodes cleanly.
+  auto *const mainMD = dyn_cast_or_null<MDTuple>(wrapperMD->getOperand(0));
+  if (!mainMD) {
+    return std::nullopt;
+  }
+
+  const std::optional<VectorizationInfo> mainInfo =
+      extractVectorizationInfo(mainMD);
+  if (!mainInfo) {
+    return std::nullopt;
+  }
+
+  // The tail entry is optional: a null, non-tuple or undecodable operand
+  // simply leaves it unset.
+  std::optional<VectorizationInfo> tailInfo;
+  auto *const tailMD = dyn_cast_or_null<MDTuple>(wrapperMD->getOperand(1));
+  if (tailMD) {
+    tailInfo = extractVectorizationInfo(tailMD);
+  }
+
+  return std::make_pair(*mainInfo, tailInfo);
+}
+
+void copyFunctionMetadata(Function &fromF, Function &toF, bool includeDebug) {
+  if (includeDebug) {
+    toF.copyMetadata(&fromF, 0);
+    return;
+  }
+  // Otherwise copy selectively: only MDTuple attachments are carried over,
+  // which is how debug-info nodes get left behind.
+  // NOTE(review): setMetadata() holds one node per kind, so a kind attached
+  // several times to fromF presumably keeps only its last node - confirm.
+  SmallVector<std::pair<unsigned, MDNode *>, 5> attachments;
+  fromF.getAllMetadata(attachments);
+  for (const auto &[kind, node] : attachments) {
+    if (auto *const tuple = dyn_cast_or_null<MDTuple>(node)) {
+      toF.setMetadata(kind, tuple);
+    }
+  }
+}
+
+void encodeLocalSizeMetadata(Function &f, const std::array<uint64_t, 3> &size) {
+  // Each dimension is stored as an i32. Local sizes are not expected to need
+  // more than 32 bits, so the narrowing from uint64_t is deliberate.
+  auto *const i32Ty = Type::getInt32Ty(f.getContext());
+  Metadata *dims[3];
+  for (unsigned i = 0; i != 3; ++i) {
+    dims[i] = ConstantAsMetadata::get(ConstantInt::get(i32Ty, size[i]));
+  }
+  // The three-element tuple is attached under ReqdWGSizeMD.
+  f.setMetadata(ReqdWGSizeMD, MDTuple::get(f.getContext(), dims));
+}
+
+std::optional<std::array<uint64_t, 3>> getLocalSizeMetadata(const Function &f) {
+  if (auto *md = f.getMetadata(ReqdWGSizeMD)) {  // nullopt when not attached
+    return std::array<uint64_t, 3>{  // reads operands 0..2 unconditionally
+        mdconst::extract<ConstantInt>(md->getOperand(0))->getZExtValue(),
+        mdconst::extract<ConstantInt>(md->getOperand(1))->getZExtValue(),
+        mdconst::extract<ConstantInt>(md->getOperand(2))->getZExtValue()};
+  }
+  return std::nullopt;
+}
+
+static constexpr const char *MuxScheduledFnMD = "mux_scheduled_fn";
+
+void dropSchedulingParameterMetadata(Function &f) {
+  f.setMetadata(MuxScheduledFnMD, nullptr);  // null clears the kind
+}
+
+SmallVector<int, 4> getSchedulingParameterFunctionMetadata(const Function &f) {
+  SmallVector<int, 4> indices;  // Stays empty when nothing is attached.
+  auto *const md = f.getMetadata(MuxScheduledFnMD);
+  for (unsigned i = 0, e = md ? md->getNumOperands() : 0; i != e; ++i) {
+    auto *const c = mdconst::extract<ConstantInt>(md->getOperand(i));
+    indices.push_back(c->getSExtValue());  // One signed index per operand.
+  }
+  return indices;
+}
+
+void setSchedulingParameterFunctionMetadata(Function &f, ArrayRef<int> idxs) {
+  // An empty index list attaches nothing; existing metadata is left alone.
+  if (idxs.empty()) {
+    return;
+  }
+  auto *const i32Ty = Type::getInt32Ty(f.getContext());
+  SmallVector<Metadata *, 4> ops;
+  for (const int idx : idxs) {
+    ops.push_back(ConstantAsMetadata::get(ConstantInt::get(i32Ty, idx)));
+  }
+  f.setMetadata(MuxScheduledFnMD, MDTuple::get(f.getContext(), ops));
+}
+
+static constexpr const char *MuxSchedulingParamsMD = "mux-scheduling-params";
+
+void setSchedulingParameterModuleMetadata(Module &m,
+                                          ArrayRef<std::string> names) {
+  SmallVector<Metadata *, 4> nameMDs;
+  for (const std::string &name : names) {
+    nameMDs.push_back(MDString::get(m.getContext(), name));
+  }
+  auto *const named = m.getOrInsertNamedMetadata(MuxSchedulingParamsMD);
+  named->clearOperands();  // Replace any previous list rather than append.
+  named->addOperand(MDNode::get(m.getContext(), nameMDs));
+}
+
+NamedMDNode *getSchedulingParameterModuleMetadata(const Module &m) {
+  return m.getNamedMetadata(MuxSchedulingParamsMD);  // null if never set
+}
+
+std::optional<unsigned> isSchedulingParameter(const Function &f, unsigned idx) {
+  auto *const md = f.getMetadata(MuxScheduledFnMD);
+  for (unsigned pos = 0, e = md ? md->getNumOperands() : 0; pos != e; ++pos) {
+    auto *const c = mdconst::extract<ConstantInt>(md->getOperand(pos));
+    const auto stored = c->getSExtValue();
+    if (stored >= 0 && (unsigned)stored == idx) {
+      return pos;  // Position within the metadata, not the argument index.
+    }
+  }
+  return std::nullopt;  // No metadata, or no non-negative operand matches.
+}
+
+// Uses the format of a metadata node directly applied to a function.
+std::optional<std::array<uint64_t, 3>> parseRequiredWGSMetadata(
+    const Function &f) {
+  auto *const md = f.getMetadata(ReqdWGSizeMD);
+  if (!md) return std::nullopt;
+  assert(md->getNumOperands() >= 1 && md->getNumOperands() <= 3 &&
+         "Unsupported number of operands in reqd_work_group_size");
+  std::array<uint64_t, 3> wgs = {0, 1, 1};  // Unlisted dimensions stay at 1.
+  // Also bound by wgs.size(): a malformed node must not overflow 'wgs'.
+  for (unsigned i = 0; i < md->getNumOperands() && i < wgs.size(); ++i) {
+    wgs[i] = mdconst::extract<ConstantInt>(md->getOperand(i))->getZExtValue();
+  }
+  return wgs;
+}
+
+// Uses the format of a metadata node that's a part of the opencl.kernels node.
+std::optional<std::array<uint64_t, 3>> parseRequiredWGSMetadata(
+    const MDNode &node) {
+  for (uint32_t i = 1; i < node.getNumOperands(); ++i) {
+    auto *const attr = cast<MDNode>(node.getOperand(i));
+    if (cast<MDString>(attr->getOperand(0))->getString() != ReqdWGSizeMD) {
+      continue;  // Some other attribute; keep looking.
+    }
+    auto *const x = mdconst::extract<ConstantInt>(attr->getOperand(1));
+    auto *const y = mdconst::extract<ConstantInt>(attr->getOperand(2));
+    auto *const z = mdconst::extract<ConstantInt>(attr->getOperand(3));
+    // KLOCWORK "UNINIT.STACK.ARRAY.MUST" possible false positive:
+    // initialization of 'wgs' looks like an uninitialized access to Klocwork.
+    std::array<uint64_t, 3> wgs = {
+        {x->getZExtValue(), y->getZExtValue(), z->getZExtValue()}};
+    return wgs;
+  }
+  return std::nullopt;
+}
+
+std::optional<uint32_t> parseMaxWorkDimMetadata(const Function &f) {
+  auto *const md = f.getMetadata("max_work_dim");
+  if (!md) {
+    return std::nullopt;
+  }
+  // The value is the node's first operand, narrowed to 32 bits on return.
+  return mdconst::extract<ConstantInt>(md->getOperand(0))->getZExtValue();
+}
+
+void populateKernelList(Module &m, SmallVectorImpl<KernelInfo> &results) {
+  // Preferred source: the "opencl.kernels" named metadata, whose entries
+  // carry the kernel function as operand 0.
+  if (auto *const kernelsMD = m.getNamedMetadata("opencl.kernels")) {
+    for (MDNode *const kernelNode : kernelsMD->operands()) {
+      auto *const kernelVal =
+          cast<ValueAsMetadata>(kernelNode->getOperand(0))->getValue();
+      KernelInfo info{kernelVal->getName()};
+      if (auto wgs = parseRequiredWGSMetadata(*kernelNode)) {
+        info.ReqdWGSize = *wgs;
+      }
+      results.push_back(info);
+    }
+    return;
+  }
+  // Fallback when that metadata is absent: treat every named function with
+  // the SPIR_KERNEL calling convention as a kernel.
+  for (auto &f : m) {
+    if (!f.hasName() || f.getCallingConv() != CallingConv::SPIR_KERNEL) {
+      continue;
+    }
+    KernelInfo info(f.getName());
+    if (auto wgs = parseRequiredWGSMetadata(f)) {
+      info.ReqdWGSize = *wgs;
+    }
+    results.push_back(info);
+  }
+}
+
+void replaceKernelInOpenCLKernelsMetadata(Function &fromF, Function &toF,
+                                          Module &M) {
+  auto *const kernelsMD = M.getNamedMetadata("opencl.kernels");
+  const unsigned numEntries = kernelsMD ? kernelsMD->getNumOperands() : 0;
+  for (unsigned i = 0; i != numEntries; ++i) {
+    MDNode *const entry = kernelsMD->getOperand(i);  // may be null
+    if (entry && entry->getOperand(0) == ValueAsMetadata::get(&fromF)) {
+      entry->replaceOperandWith(0, ValueAsMetadata::get(&toF));  // retarget
+    }
+  }
+}
+
+static constexpr const char *ReqdSGSizeMD = "intel_reqd_sub_group_size";
+
+void encodeReqdSubgroupSizeMetadata(Function &f, uint32_t size) {
+  // Stored as a one-element tuple holding the size as an i32 constant.
+  Metadata *const sizeMD = ConstantAsMetadata::get(
+      ConstantInt::get(Type::getInt32Ty(f.getContext()), size));
+  f.setMetadata(ReqdSGSizeMD, MDTuple::get(f.getContext(), sizeMD));
+}
+
+std::optional<uint32_t> getReqdSubgroupSize(const Function &f) {
+  if (auto *md = f.getMetadata(ReqdSGSizeMD)) {  // nullopt when not attached
+    return mdconst::extract<ConstantInt>(md->getOperand(0))->getZExtValue();
+  }
+  return std::nullopt;
+}
+
+}  // namespace utils
+}  // namespace compiler
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mux_builtin_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mux_builtin_info.cpp
new file mode 100644
index 0000000000000..b1120d26c1444
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mux_builtin_info.cpp
@@ -0,0 +1,1331 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <compiler/utils/address_spaces.h>
+#include <compiler/utils/builtin_info.h>
+#include <compiler/utils/dma.h>
+#include <compiler/utils/group_collective_helpers.h>
+#include <compiler/utils/metadata.h>
+#include <compiler/utils/pass_functions.h>
+#include <compiler/utils/scheduling.h>
+#include <compiler/utils/target_extension_types.h>
+#include <llvm/IR/DerivedTypes.h>
+#include <llvm/Support/ModRef.h>
+#include <multi_llvm/llvm_version.h>
+#include <multi_llvm/multi_llvm.h>
+
+#include <optional>
+
+using namespace llvm;
+
+namespace compiler {
+namespace utils {
+
+namespace SchedParamIndices {
+enum {
+  WI = 0,  // Index of the work-item info scheduling parameter.
+  WG = 1,  // Index of the work-group info scheduling parameter.
+  TOTAL,   // Number of indices listed above.
+};
+}
+
+static Function *defineLocalWorkItemBuiltin(BIMuxInfoConcept &BI, BuiltinID ID,
+                                            Module &M) {
+  // Simple 'local' work-item getters and setters; other IDs return nullptr.
+  bool IsSetter = false;    // Set by the eMuxBuiltinSet* cases below.
+  bool HasRankArg = false;  // Only the local-id builtins set this.
+  std::optional<WorkItemInfoStructField::Type> WIFieldIdx;
+  switch (ID) {
+    default:
+      return nullptr;  // Not one of the builtins defined here.
+    case eMuxBuiltinSetLocalId:
+      IsSetter = true;
+      LLVM_FALLTHROUGH;  // Setter and getter share the same field.
+    case eMuxBuiltinGetLocalId:
+      HasRankArg = true;
+      WIFieldIdx = WorkItemInfoStructField::local_id;
+      break;
+    case eMuxBuiltinSetSubGroupId:
+      IsSetter = true;
+      LLVM_FALLTHROUGH;
+    case eMuxBuiltinGetSubGroupId:
+      WIFieldIdx = WorkItemInfoStructField::sub_group_id;
+      break;
+    case eMuxBuiltinSetNumSubGroups:
+      IsSetter = true;
+      LLVM_FALLTHROUGH;
+    case eMuxBuiltinGetNumSubGroups:
+      WIFieldIdx = WorkItemInfoStructField::num_sub_groups;
+      break;
+    case eMuxBuiltinSetMaxSubGroupSize:
+      IsSetter = true;
+      LLVM_FALLTHROUGH;
+    case eMuxBuiltinGetMaxSubGroupSize:
+      WIFieldIdx = WorkItemInfoStructField::max_sub_group_size;
+      break;
+  }
+
+  Function *F = M.getFunction(BuiltinInfo::getMuxBuiltinName(ID));
+  assert(F && WIFieldIdx);  // The builtin must already be declared in M.
+
+  // Ask BI for the scheduling parameters of this builtin.
+  const auto &SchedParams = BI.getFunctionSchedulingParameters(*F);
+  assert(SchedParamIndices::WI < SchedParams.size());
+
+  // The work-item info parameter must be an argument of the expected type.
+  const auto &SchedParam = SchedParams[SchedParamIndices::WI];
+  auto *const StructTy = dyn_cast<StructType>(SchedParam.ParamPointeeTy);
+  assert(SchedParam.ArgVal && StructTy == getWorkItemInfoStructTy(M) &&
+         "Inconsistent scheduling parameter data");
+
+  if (IsSetter) {
+    populateStructSetterFunction(*F, *SchedParam.ArgVal, StructTy, *WIFieldIdx,
+                                 HasRankArg);
+  } else {
+    populateStructGetterFunction(*F, *SchedParam.ArgVal, StructTy, *WIFieldIdx,
+                                 HasRankArg);
+  }
+
+  return F;
+}
+
+static Function *defineLocalWorkGroupBuiltin(BIMuxInfoConcept &BI, BuiltinID ID,
+                                             Module &M) {
+  // Simple work-group getters; other IDs return nullptr.
+  bool HasRankArg = true;  // Cleared only for get_work_dim.
+  size_t DefaultVal = 0;   // Forwarded to populateStructGetterFunction().
+  std::optional<WorkGroupInfoStructField::Type> WGFieldIdx;
+  switch (ID) {
+    default:
+      return nullptr;  // Not one of the builtins defined here.
+    case eMuxBuiltinGetLocalSize:
+      DefaultVal = 1;
+      WGFieldIdx = WorkGroupInfoStructField::local_size;
+      break;
+    case eMuxBuiltinGetGroupId:
+      DefaultVal = 0;
+      WGFieldIdx = WorkGroupInfoStructField::group_id;
+      break;
+    case eMuxBuiltinGetNumGroups:
+      DefaultVal = 1;
+      WGFieldIdx = WorkGroupInfoStructField::num_groups;
+      break;
+    case eMuxBuiltinGetGlobalOffset:
+      DefaultVal = 0;
+      WGFieldIdx = WorkGroupInfoStructField::global_offset;
+      break;
+    case eMuxBuiltinGetWorkDim:
+      DefaultVal = 1;
+      HasRankArg = false;
+      WGFieldIdx = WorkGroupInfoStructField::work_dim;
+      break;
+  }
+
+  Function *F = M.getFunction(BuiltinInfo::getMuxBuiltinName(ID));
+  assert(F && WGFieldIdx);  // The builtin must already be declared in M.
+
+  // Ask BI for the scheduling parameters of this builtin.
+  const auto &SchedParams = BI.getFunctionSchedulingParameters(*F);
+  assert(SchedParamIndices::WG < SchedParams.size());
+
+  // The work-group info parameter must be an argument of the expected type.
+  const auto &SchedParam = SchedParams[SchedParamIndices::WG];
+  auto *const StructTy = dyn_cast<StructType>(SchedParam.ParamPointeeTy);
+  assert(SchedParam.ArgVal && StructTy == getWorkGroupInfoStructTy(M) &&
+         "Inconsistent scheduling parameter data");
+
+  populateStructGetterFunction(*F, *SchedParam.ArgVal, StructTy, *WGFieldIdx,
+                               HasRankArg, DefaultVal);
+  return F;
+}
+
+// FIXME: Assumes a sub-group size of 1. Non-sub-group scopes return nullptr.
+static Function *defineSubGroupGroupOpBuiltin(Function &F,
+                                              GroupCollective GroupOp,
+                                              ArrayRef<Type *> OverloadInfo) {
+  if (!GroupOp.isSubGroupScope()) {
+    return nullptr;
+  }
+
+  auto *Arg = F.getArg(0);
+
+  IRBuilder<> B(BasicBlock::Create(F.getContext(), "entry", &F));
+
+  switch (GroupOp.Op) {
+    default:
+      llvm_unreachable("Unhandled group operation");
+    case GroupCollective::OpKind::Any:
+    case GroupCollective::OpKind::All:
+    case GroupCollective::OpKind::Broadcast:
+    case GroupCollective::OpKind::Reduction:
+    case GroupCollective::OpKind::ScanInclusive:
+      // In the trivial size=1 case, all of these operations just return the
+      // argument back again
+      B.CreateRet(Arg);
+      break;
+    case GroupCollective::OpKind::ScanExclusive: {
+      // In the trivial size=1 case, exclusive scans return the identity.
+      assert(!OverloadInfo.empty());
+      auto *const IdentityVal =
+          getIdentityVal(GroupOp.Recurrence, OverloadInfo[0]);
+      assert(IdentityVal && "Unable to deduce identity val");
+      B.CreateRet(IdentityVal);
+      break;
+    }
+    case GroupCollective::OpKind::Shuffle:
+    case GroupCollective::OpKind::ShuffleXor:
+      // In the trivial size=1 case, all of these operations just return the
+      // argument back again. Any computed shuffle index other than the only
+      // one in the sub-group would be out of bounds anyway.
+      B.CreateRet(Arg);
+      break;
+    case GroupCollective::OpKind::ShuffleUp: {
+      auto *const Prev = F.getArg(0);
+      auto *const Curr = F.getArg(1);
+      auto *const Delta = F.getArg(2);
+      // In the trivial size=1 case, negative delta is the desired index (since
+      // we're subtracting it from zero). If it's at least zero and less
+      // than the size, we return 'current', else if it's less than zero and
+      // greater than or equal to the negative size, we return 'prev'. So if
+      // 'delta' is zero, return 'current', else return 'prev'. Anything else
+      // is out of bounds so we can simplify things here.
+      auto *const EqZero = B.CreateICmpEQ(Delta, B.getInt32(0), "eqzero");
+      auto *const Sel = B.CreateSelect(EqZero, Curr, Prev, "sel");
+      B.CreateRet(Sel);
+      break;
+    }
+    case GroupCollective::OpKind::ShuffleDown: {
+      auto *const Curr = F.getArg(0);
+      auto *const Next = F.getArg(1);
+      auto *const Delta = F.getArg(2);
+      // In the trivial size=1 case, the delta is the desired index (since
+      // we're adding it to zero). If it's less than the size, we return
+      // 'current', else if it's greater or equal to the size but less than
+      // twice the size, we return 'next'. So if 'delta' is zero, return
+      // 'current', else return 'next'. Anything else is out of bounds so we
+      // can simplify things here.
+      auto *const EqZero = B.CreateICmpEQ(Delta, B.getInt32(0), "eqzero");
+      auto *const Sel = B.CreateSelect(EqZero, Curr, Next, "sel");
+      B.CreateRet(Sel);
+      break;
+    }
+  }
+
+  return &F;
+}
+
+static Value *createCallHelper(IRBuilder<> &B, Function &F,
+                               ArrayRef<Value *> Args) {
+  auto *const CI = B.CreateCall(&F, Args);
+  CI->setAttributes(F.getAttributes());    // Mirror the callee's attributes
+  CI->setCallingConv(F.getCallingConv());  // ... and its calling convention.
+  return CI;
+}
+
+void BIMuxInfoConcept::setDefaultBuiltinAttributes(Function &F,
+                                                   bool AlwaysInline) {
+  // ComputeMux supports neither exceptions nor recursion, so every builtin
+  // is marked nounwind and norecurse.
+  F.addFnAttr(Attribute::NoUnwind);
+  F.addFnAttr(Attribute::NoRecurse);
+  // Inlining is forced on request, but an explicit noinline marking on the
+  // function takes precedence.
+  if (AlwaysInline && !F.hasFnAttribute(Attribute::NoInline)) {
+    F.addFnAttr(Attribute::AlwaysInline);
+  }
+}
+
+Function *BIMuxInfoConcept::defineGetGlobalId(Module &M) {
+  Function *const F =
+      M.getFunction(BuiltinInfo::getMuxBuiltinName(eMuxBuiltinGetGlobalId));
+  assert(F);
+  // Standard builtin attributes; the definition is internal to this module.
+  setDefaultBuiltinAttributes(*F);
+  F->setLinkage(GlobalValue::InternalLinkage);
+
+  // The whole body lives in a single basic block named "entry".
+  IRBuilder<> B(BasicBlock::Create(M.getContext(), "entry", F));
+
+  // Declare the four builtins that get_global_id is composed from.
+  auto *const GroupIdFn = getOrDeclareMuxBuiltin(eMuxBuiltinGetGroupId, M);
+  auto *const OffsetFn =
+      getOrDeclareMuxBuiltin(eMuxBuiltinGetGlobalOffset, M);
+  auto *const LocalIdFn = getOrDeclareMuxBuiltin(eMuxBuiltinGetLocalId, M);
+  auto *const LocalSizeFn =
+      getOrDeclareMuxBuiltin(eMuxBuiltinGetLocalSize, M);
+  assert(GroupIdFn && OffsetFn && LocalIdFn && LocalSizeFn);
+
+  // Every argument of F is forwarded untouched: the dependent builtins are
+  // expected to share F's prototype, whether or not scheduling parameters
+  // have been added.
+  const SmallVector<Value *, 4> Args(make_pointer_range(F->args()));
+  const auto Call = [&B, &Args](Function *Fn) {
+    return createCallHelper(B, *Fn, Args);
+  };
+
+  // Emit the calls in this order: group id, global offset, local id, local
+  // size.
+  Value *const GroupId = Call(GroupIdFn);
+  Value *const Offset = Call(OffsetFn);
+  Value *const LocalId = Call(LocalIdFn);
+  Value *const LocalSize = Call(LocalSizeFn);
+
+  // get_global_id(i) = (get_group_id(i) * get_local_size(i)) +
+  //                    get_local_id(i) + get_global_offset(i)
+  Value *const Scaled = B.CreateMul(GroupId, LocalSize);
+  Value *const WithLocalId = B.CreateAdd(Scaled, LocalId);
+  Value *const GlobalId = B.CreateAdd(WithLocalId, Offset);
+
+  B.CreateRet(GlobalId);
+  return F;
+}
+
+// FIXME: Assumes a sub-group size of 1.
+Function *BIMuxInfoConcept::defineGetSubGroupSize(Function &F) {
+  setDefaultBuiltinAttributes(F);
+  F.setLinkage(GlobalValue::InternalLinkage);
+
+  // The body is a single "entry" block returning the constant i32 1.
+  auto *const Entry = BasicBlock::Create(F.getContext(), "entry", &F);
+  IRBuilder<> B(Entry);
+  assert(F.getReturnType() == B.getInt32Ty());
+  B.CreateRet(B.getInt32(1));
+  return &F;
+}
+
+// FIXME: Assumes a sub-group size of 1.
+// Gives the sub-group local id builtin F an internal-linkage body consisting
+// of a single block that returns the 32-bit constant 0. Returns F.
+Function *BIMuxInfoConcept::defineGetSubGroupLocalId(Function &F) {
+  setDefaultBuiltinAttributes(F);
+  F.setLinkage(GlobalValue::InternalLinkage);
+
+  auto *const EntryBB = BasicBlock::Create(F.getContext(), "entry", &F);
+  IRBuilder<> Builder(EntryBB);
+
+  // The constant we return is an i32, so the declaration must agree.
+  assert(F.getReturnType() == Builder.getInt32Ty());
+  Builder.CreateRet(Builder.getInt32(0));
+
+  return &F;
+}
+
+// Defines the (pre-declared) get_global_size mux builtin in M as
+//   get_global_size(i) = get_num_groups(i) * get_local_size(i)
+// and returns the now-defined function.
+Function *BIMuxInfoConcept::defineGetGlobalSize(Module &M) {
+  Function *const GlobalSizeFn =
+      M.getFunction(BuiltinInfo::getMuxBuiltinName(eMuxBuiltinGetGlobalSize));
+  assert(GlobalSizeFn);
+  setDefaultBuiltinAttributes(*GlobalSizeFn);
+  GlobalSizeFn->setLinkage(GlobalValue::InternalLinkage);
+
+  // The two builtins our result is computed from.
+  Function *const NumGroupsFn =
+      getOrDeclareMuxBuiltin(eMuxBuiltinGetNumGroups, M);
+  Function *const LocalSizeFn =
+      getOrDeclareMuxBuiltin(eMuxBuiltinGetLocalSize, M);
+  assert(NumGroupsFn && LocalSizeFn);
+
+  // The whole body lives in one unnamed basic block.
+  auto *const BodyBB = BasicBlock::Create(M.getContext(), "", GlobalSizeFn);
+  IRBuilder<> Builder(BodyBB);
+
+  // Every argument is forwarded untouched: the callee prototypes are
+  // expected to match ours whether or not scheduling parameters were added.
+  const SmallVector<Value *, 4> ForwardedArgs(
+      make_pointer_range(GlobalSizeFn->args()));
+
+  auto *const NumGroups =
+      createCallHelper(Builder, *NumGroupsFn, ForwardedArgs);
+  auto *const LocalSize =
+      createCallHelper(Builder, *LocalSizeFn, ForwardedArgs);
+
+  Builder.CreateRet(Builder.CreateMul(NumGroups, LocalSize));
+  return GlobalSizeFn;
+}
+
+// Defines the (pre-declared) get_local_linear_id mux builtin in M as
+//   get_local_id(2) * get_local_size(1) * get_local_size(0) +
+//   get_local_id(1) * get_local_size(0) + get_local_id(0)
+// and returns the now-defined function.
+Function *BIMuxInfoConcept::defineGetLocalLinearId(Module &M) {
+  Function *const LinearIdFn = M.getFunction(
+      BuiltinInfo::getMuxBuiltinName(eMuxBuiltinGetLocalLinearId));
+  assert(LinearIdFn);
+  setDefaultBuiltinAttributes(*LinearIdFn);
+  LinearIdFn->setLinkage(GlobalValue::InternalLinkage);
+
+  Function *const LocalIdFn = getOrDeclareMuxBuiltin(eMuxBuiltinGetLocalId, M);
+  Function *const LocalSizeFn =
+      getOrDeclareMuxBuiltin(eMuxBuiltinGetLocalSize, M);
+  assert(LocalIdFn && LocalSizeFn);
+
+  // The whole body lives in one unnamed basic block.
+  IRBuilder<> Builder(BasicBlock::Create(M.getContext(), "", LinearIdFn));
+
+  // Our own arguments are forwarded to the ranked builtins after the index
+  // operand; apart from that index the prototypes are expected to match,
+  // whether or not scheduling parameters were added.
+  SmallVector<Value *, 4> OwnArgs(make_pointer_range(LinearIdFn->args()));
+  auto ArgsForDim = [&](unsigned Dim) {
+    SmallVector<Value *, 4> DimArgs = {Builder.getInt32(Dim)};
+    append_range(DimArgs, OwnArgs);
+    return DimArgs;
+  };
+  const SmallVector<Value *, 4> XArgs = ArgsForDim(0);
+  const SmallVector<Value *, 4> YArgs = ArgsForDim(1);
+  const SmallVector<Value *, 4> ZArgs = ArgsForDim(2);
+
+  // Emit the calls: ids for all three dimensions first, then the sizes.
+  auto *const IdX = createCallHelper(Builder, *LocalIdFn, XArgs);
+  auto *const IdY = createCallHelper(Builder, *LocalIdFn, YArgs);
+  auto *const IdZ = createCallHelper(Builder, *LocalIdFn, ZArgs);
+  auto *const SizeX = createCallHelper(Builder, *LocalSizeFn, XArgs);
+  auto *const SizeY = createCallHelper(Builder, *LocalSizeFn, YArgs);
+
+  // z contribution: get_local_id(2) * get_local_size(1) * get_local_size(0)
+  auto *const ZTimesSizeY = Builder.CreateMul(IdZ, SizeY);
+  auto *const ZContribution = Builder.CreateMul(ZTimesSizeY, SizeX);
+
+  // y contribution: get_local_id(1) * get_local_size(0)
+  auto *const YContribution = Builder.CreateMul(IdY, SizeX);
+
+  // Sum z and y, then add the x contribution, which is get_local_id(0).
+  auto *const ZPlusY = Builder.CreateAdd(ZContribution, YContribution);
+  auto *const LinearId = Builder.CreateAdd(ZPlusY, IdX);
+
+  Builder.CreateRet(LinearId);
+  return LinearIdFn;
+}
+
+// Defines the (pre-declared) get_global_linear_id mux builtin in M and
+// returns the now-defined function.
+//
+// Writing gid(i), goff(i) and gsz(i) for get_global_id(i),
+// get_global_offset(i) and get_global_size(i), the value computed is
+//   ((gid(2) - goff(2)) * gsz(1) + (gid(1) - goff(1))) * gsz(0) +
+//   (gid(0) - goff(0))
+// which is the factored form of
+//   (gid(2) - goff(2)) * gsz(1) * gsz(0) + (gid(1) - goff(1)) * gsz(0) +
+//   (gid(0) - goff(0)).
+Function *BIMuxInfoConcept::defineGetGlobalLinearId(Module &M) {
+  Function *const LinearIdFn = M.getFunction(
+      BuiltinInfo::getMuxBuiltinName(eMuxBuiltinGetGlobalLinearId));
+  assert(LinearIdFn);
+  setDefaultBuiltinAttributes(*LinearIdFn);
+  LinearIdFn->setLinkage(GlobalValue::InternalLinkage);
+
+  Function *const GlobalIdFn =
+      getOrDeclareMuxBuiltin(eMuxBuiltinGetGlobalId, M);
+  Function *const GlobalOffsetFn =
+      getOrDeclareMuxBuiltin(eMuxBuiltinGetGlobalOffset, M);
+  Function *const GlobalSizeFn =
+      getOrDeclareMuxBuiltin(eMuxBuiltinGetGlobalSize, M);
+  assert(GlobalIdFn && GlobalOffsetFn && GlobalSizeFn);
+
+  // The whole body lives in one unnamed basic block.
+  IRBuilder<> Builder(BasicBlock::Create(M.getContext(), "", LinearIdFn));
+
+  // Our own arguments are forwarded to the ranked builtins after the index
+  // operand; apart from that index the prototypes are expected to match,
+  // whether or not scheduling parameters were added.
+  SmallVector<Value *, 4> OwnArgs(make_pointer_range(LinearIdFn->args()));
+  auto ArgsForDim = [&](unsigned Dim) {
+    SmallVector<Value *, 4> DimArgs = {Builder.getInt32(Dim)};
+    append_range(DimArgs, OwnArgs);
+    return DimArgs;
+  };
+  const SmallVector<Value *, 4> XArgs = ArgsForDim(0);
+  const SmallVector<Value *, 4> YArgs = ArgsForDim(1);
+  const SmallVector<Value *, 4> ZArgs = ArgsForDim(2);
+
+  // Emit the calls: ids, then offsets, then the two sizes we need.
+  auto *const IdX = createCallHelper(Builder, *GlobalIdFn, XArgs);
+  auto *const IdY = createCallHelper(Builder, *GlobalIdFn, YArgs);
+  auto *const IdZ = createCallHelper(Builder, *GlobalIdFn, ZArgs);
+
+  auto *const OffsetX = createCallHelper(Builder, *GlobalOffsetFn, XArgs);
+  auto *const OffsetY = createCallHelper(Builder, *GlobalOffsetFn, YArgs);
+  auto *const OffsetZ = createCallHelper(Builder, *GlobalOffsetFn, ZArgs);
+
+  auto *const SizeX = createCallHelper(Builder, *GlobalSizeFn, XArgs);
+  auto *const SizeY = createCallHelper(Builder, *GlobalSizeFn, YArgs);
+
+  // (gid(2) - goff(2)) * gsz(1)
+  auto *const RelZ = Builder.CreateSub(IdZ, OffsetZ);
+  auto *const ScaledZ = Builder.CreateMul(RelZ, SizeY);
+
+  // gid(1) - goff(1)
+  auto *const RelY = Builder.CreateSub(IdY, OffsetY);
+
+  // ((gid(2) - goff(2)) * gsz(1) + gid(1) - goff(1)) * gsz(0)
+  auto *const ZPlusY = Builder.CreateAdd(ScaledZ, RelY);
+  auto *const ScaledZY = Builder.CreateMul(ZPlusY, SizeX);
+
+  // gid(0) - goff(0)
+  auto *const RelX = Builder.CreateSub(IdX, OffsetX);
+
+  // Final sum of the x term and the combined, scaled y/z terms.
+  auto *const LinearId = Builder.CreateAdd(RelX, ScaledZY);
+
+  Builder.CreateRet(LinearId);
+  return LinearIdFn;
+}
+
+// Defines the (pre-declared) get_enqueued_local_size mux builtin in M as a
+// plain forward to get_local_size and returns the now-defined function.
+Function *BIMuxInfoConcept::defineGetEnqueuedLocalSize(Module &M) {
+  Function *const EnqueuedSizeFn = M.getFunction(
+      BuiltinInfo::getMuxBuiltinName(eMuxBuiltinGetEnqueuedLocalSize));
+  assert(EnqueuedSizeFn);
+  setDefaultBuiltinAttributes(*EnqueuedSizeFn);
+  EnqueuedSizeFn->setLinkage(GlobalValue::InternalLinkage);
+
+  Function *const LocalSizeFn =
+      getOrDeclareMuxBuiltin(eMuxBuiltinGetLocalSize, M);
+  assert(LocalSizeFn);
+
+  // The whole body lives in one unnamed basic block.
+  IRBuilder<> Builder(BasicBlock::Create(M.getContext(), "", EnqueuedSizeFn));
+
+  // Every argument is forwarded untouched: the callee prototype is expected
+  // to match ours whether or not scheduling parameters were added.
+  const SmallVector<Value *, 4> ForwardedArgs(
+      make_pointer_range(EnqueuedSizeFn->args()));
+
+  // No distinction is made between the enqueued and the actual local size
+  // (non-uniform work-group sizes are not supported), so
+  // get_enqueued_local_size(x) == get_local_size(x).
+  Builder.CreateRet(createCallHelper(Builder, *LocalSizeFn, ForwardedArgs));
+  return EnqueuedSizeFn;
+}
+
+// Defines the body of a barrier-like builtin F as a switch over its memory
+// semantics operand: each recognised ordering gets its own block containing
+// a single-threaded fence of that ordering, and any other value falls
+// through to a default block that emits no fence at all.
+//
+// The first (unnamed) index parameter is unused here; judging by the call
+// sites it is the index of the 'scope' operand - TODO confirm.
+// SemanticsIdx is the index of F's memory semantics argument.
+//
+// Returns F.
+Function *BIMuxInfoConcept::defineMemBarrier(Function &F, unsigned,
+                                             unsigned SemanticsIdx) {
+  // FIXME: We're ignoring some operands here. We're dropping the 'scope' but
+  // our default set of targets can't make use of anything but a
+  // single-threaded fence. We're also ignoring the kind of memory being
+  // controlled by the barrier.
+  // See CA-2997 and CA-3042 for related discussions.
+  auto &M = *F.getParent();
+  setDefaultBuiltinAttributes(F);
+  F.setLinkage(GlobalValue::InternalLinkage);
+  IRBuilder<> B(BasicBlock::Create(M.getContext(), "", &F));
+
+  // Grab the semantics argument.
+  Value *Semantics = F.getArg(SemanticsIdx);
+  // Mask out only the memory ordering value.
+  Semantics = B.CreateAnd(Semantics, B.getInt32(MemSemanticsMask));
+
+  // Don't insert this exit block just yet: it is appended once all the case
+  // blocks exist, so that it ends up last in the function.
+  auto *const ExitBB = BasicBlock::Create(M.getContext(), "exit");
+
+  auto *const DefaultBB =
+      BasicBlock::Create(M.getContext(), "case.default", &F);
+  auto *const Switch = B.CreateSwitch(Semantics, DefaultBB);
+
+  // One entry per handled ordering: the block name, the masked semantics
+  // value selecting it, and the fence ordering to emit.
+  const struct {
+    StringRef Name;
+    unsigned SwitchVal;
+    AtomicOrdering Ordering;
+  } Data[4] = {
+      {"case.acquire", MemSemanticsAcquire, AtomicOrdering::Acquire},
+      {"case.release", MemSemanticsRelease, AtomicOrdering::Release},
+      {"case.acq_rel", MemSemanticsAcquireRelease,
+       AtomicOrdering::AcquireRelease},
+      {"case.seq_cst", MemSemanticsSequentiallyConsistent,
+       AtomicOrdering::SequentiallyConsistent},
+  };
+
+  // Each case block is just "fence; br exit".
+  for (const auto &D : Data) {
+    auto *const BB = BasicBlock::Create(M.getContext(), D.Name, &F);
+
+    Switch->addCase(B.getInt32(D.SwitchVal), BB);
+    B.SetInsertPoint(BB);
+    B.CreateFence(D.Ordering, SyncScope::SingleThread);
+    B.CreateBr(ExitBB);
+  }
+
+  // The default case assumes a 'relaxed' ordering and emits no fence
+  // whatsoever.
+  B.SetInsertPoint(DefaultBB);
+  B.CreateBr(ExitBB);
+
+  // Now that every case block has been created, append the exit block.
+  ExitBB->insertInto(&F);
+  B.SetInsertPoint(ExitBB);
+  B.CreateRetVoid();
+
+  return &F;
+}
+
+// Emits, starting at ParentBB, a loop that copies NumBytes bytes from SrcPtr
+// to DstPtr one byte per iteration. The source and destination pointers are
+// carried through the loop as induction variables, each advanced by one byte
+// per trip. Returns the block handed back by createLoop.
+static BasicBlock *copy1D(Module &M, BasicBlock &ParentBB, Value *DstPtr,
+                          Value *SrcPtr, Value *NumBytes) {
+  Type *const ByteTy = IntegerType::get(M.getContext(), 8);
+
+  assert(SrcPtr->getType()->isPointerTy() &&
+         "Mux DMA builtins are always byte-accessed");
+  assert(DstPtr->getType()->isPointerTy() &&
+         "Mux DMA builtins are always byte-accessed");
+
+  compiler::utils::CreateLoopOpts LoopOpts;
+  LoopOpts.IVs = {SrcPtr, DstPtr};
+  LoopOpts.loopIVNames = {"dma.src", "dma.dst"};
+
+  // Loop body: load one byte, store it, then bump both pointers.
+  auto EmitByteCopy = [&](BasicBlock *BodyBB, Value *Counter,
+                          ArrayRef<Value *> IVsCurr,
+                          MutableArrayRef<Value *> IVsNext) {
+    IRBuilder<> Builder(BodyBB);
+    Value *const Src = IVsCurr[0];
+    Value *const Dst = IVsCurr[1];
+
+    Value *const Byte = Builder.CreateLoad(ByteTy, Src);
+    Builder.CreateStore(Byte, Dst);
+
+    // The pointer increment uses the same integer type as the loop counter.
+    auto *const One = ConstantInt::get(Counter->getType(), 1);
+    IVsNext[0] = Builder.CreateGEP(ByteTy, Src, One);
+    IVsNext[1] = Builder.CreateGEP(ByteTy, Dst, One);
+    return BodyBB;
+  };
+
+  auto *const Zero = ConstantInt::get(getSizeType(M), 0);
+  return compiler::utils::createLoop(&ParentBB, nullptr, Zero, NumBytes,
+                                     LoopOpts, EmitByteCopy);
+}
+
+// Emits, starting at ParentBB, a loop over NumLines lines. Each iteration
+// performs a 1D copy of LineSizeBytes bytes from the current source line to
+// the current destination line; the line pointers are carried as induction
+// variables and advanced by LineStrideSrc / LineStrideDst bytes respectively.
+// Returns the block handed back by the outer createLoop.
+static BasicBlock *copy2D(Module &M, BasicBlock &ParentBB, Value *DstPtr,
+                          Value *SrcPtr, Value *LineSizeBytes,
+                          Value *LineStrideDst, Value *LineStrideSrc,
+                          Value *NumLines) {
+  Type *const ByteTy = IntegerType::get(M.getContext(), 8);
+
+  assert(SrcPtr->getType()->isPointerTy() &&
+         "Mux DMA builtins are always byte-accessed");
+  assert(DstPtr->getType()->isPointerTy() &&
+         "Mux DMA builtins are always byte-accessed");
+
+  compiler::utils::CreateLoopOpts LoopOpts;
+  LoopOpts.IVs = {SrcPtr, DstPtr};
+  LoopOpts.loopIVNames = {"dma.src", "dma.dst"};
+
+  // Loop body: compute the next line pointers first, then emit the nested
+  // per-line copy; the block where that nested loop finishes is where the
+  // outer loop continues from.
+  auto EmitLineCopy = [&](BasicBlock *LineBB, Value *,
+                          ArrayRef<Value *> IVsCurr,
+                          MutableArrayRef<Value *> IVsNext) {
+    IRBuilder<> Builder(LineBB);
+    Value *const SrcLine = IVsCurr[0];
+    Value *const DstLine = IVsCurr[1];
+
+    IVsNext[0] = Builder.CreateGEP(ByteTy, SrcLine, LineStrideSrc);
+    IVsNext[1] = Builder.CreateGEP(ByteTy, DstLine, LineStrideDst);
+    return copy1D(M, *LineBB, DstLine, SrcLine, LineSizeBytes);
+  };
+
+  auto *const Zero = ConstantInt::get(getSizeType(M), 0);
+  return compiler::utils::createLoop(&ParentBB, nullptr, Zero, NumLines,
+                                     LoopOpts, EmitLineCopy);
+}
+
+// Defines the body of a 1D DMA builtin F as a software byte-copy loop.
+//
+// F's arguments are, in order: destination pointer, source pointer, number
+// of bytes to copy, and an event. The event argument is returned unchanged.
+//
+// The function is laid out as entry -> loop_entry -> ... -> exit, where the
+// entry block is filled in by buildThreadCheck using get_local_id
+// (presumably so that only one work-item performs the copy - see
+// buildThreadCheck for the exact condition).
+//
+// Returns F.
+Function *BIMuxInfoConcept::defineDMA1D(Function &F) {
+  Argument *const ArgDstPtr = F.getArg(0);
+  Argument *const ArgSrcPtr = F.getArg(1);
+  Argument *const ArgWidth = F.getArg(2);
+  Argument *const ArgEvent = F.getArg(3);
+
+  auto &M = *F.getParent();
+  auto &Ctx = F.getContext();
+  // Blocks are created back-to-front, each inserted before the previous one,
+  // so that they end up ordered entry, loop_entry, exit.
+  auto *const ExitBB = BasicBlock::Create(Ctx, "exit", &F);
+  auto *const LoopEntryBB = BasicBlock::Create(Ctx, "loop_entry", &F, ExitBB);
+  auto *const EntryBB = BasicBlock::Create(Ctx, "entry", &F, LoopEntryBB);
+
+  auto *const GetLocalIDFn = getOrDeclareMuxBuiltin(eMuxBuiltinGetLocalId, M);
+  // Guard the dereference below, as the other define* helpers do.
+  assert(GetLocalIDFn && "Could not get or declare the local id builtin");
+  compiler::utils::buildThreadCheck(EntryBB, LoopEntryBB, ExitBB,
+                                    *GetLocalIDFn);
+
+  // Emit the byte-copy loop and branch from where it finishes to the exit.
+  BasicBlock *const LoopExitBB =
+      copy1D(M, *LoopEntryBB, ArgDstPtr, ArgSrcPtr, ArgWidth);
+  IRBuilder<> LoopIRB(LoopExitBB);
+  LoopIRB.CreateBr(ExitBB);
+
+  IRBuilder<> ExitIRB(ExitBB);
+  ExitIRB.CreateRet(ArgEvent);
+
+  return &F;
+}
+
+// Defines the body of a 2D DMA builtin F as a software copy: a loop over
+// lines, each of which is copied with a byte-copy loop.
+//
+// F's arguments are, in order: destination pointer, source pointer, number
+// of bytes per line, destination line stride, source line stride, number of
+// lines, and an event. The event argument is returned unchanged.
+//
+// The function is laid out as entry -> loop_entry -> ... -> exit, where the
+// entry block is filled in by buildThreadCheck using get_local_id
+// (presumably so that only one work-item performs the copy - see
+// buildThreadCheck for the exact condition).
+//
+// Returns F.
+Function *BIMuxInfoConcept::defineDMA2D(Function &F) {
+  Argument *const ArgDstPtr = F.getArg(0);
+  Argument *const ArgSrcPtr = F.getArg(1);
+  Argument *const ArgWidth = F.getArg(2);
+  Argument *const ArgDstStride = F.getArg(3);
+  Argument *const ArgSrcStride = F.getArg(4);
+  Argument *const ArgNumLines = F.getArg(5);
+  Argument *const ArgEvent = F.getArg(6);
+
+  auto &M = *F.getParent();
+  auto &Ctx = F.getContext();
+  // Blocks are created back-to-front, each inserted before the previous one,
+  // so that they end up ordered entry, loop_entry, exit.
+  auto *const ExitBB = BasicBlock::Create(Ctx, "exit", &F);
+  auto *const LoopEntryBB = BasicBlock::Create(Ctx, "loop_entry", &F, ExitBB);
+  auto *const EntryBB = BasicBlock::Create(Ctx, "entry", &F, LoopEntryBB);
+
+  auto *const GetLocalIDFn = getOrDeclareMuxBuiltin(eMuxBuiltinGetLocalId, M);
+  // Guard the dereference below, as the other define* helpers do.
+  assert(GetLocalIDFn && "Could not get or declare the local id builtin");
+  compiler::utils::buildThreadCheck(EntryBB, LoopEntryBB, ExitBB,
+                                    *GetLocalIDFn);
+
+  // Create a loop around 1D DMA memcpy, adding strides each time.
+  BasicBlock *const LoopExitBB =
+      copy2D(M, *LoopEntryBB, ArgDstPtr, ArgSrcPtr, ArgWidth, ArgDstStride,
+             ArgSrcStride, ArgNumLines);
+
+  IRBuilder<> LoopIRB(LoopExitBB);
+  LoopIRB.CreateBr(ExitBB);
+
+  IRBuilder<> ExitIRB(ExitBB);
+  ExitIRB.CreateRet(ArgEvent);
+
+  return &F;
+}
+
+// Defines the body of a 3D DMA builtin F as a software copy: a loop over
+// planes, each of which is copied with the 2D (lines of bytes) copy.
+//
+// F's arguments are, in order: destination pointer, source pointer, number
+// of bytes per line, destination line stride, source line stride, number of
+// lines per plane, destination plane stride, source plane stride, number of
+// planes, and an event. The event argument is returned unchanged.
+//
+// The function is laid out as entry -> loop_entry -> ... -> exit, where the
+// entry block is filled in by buildThreadCheck using get_local_id
+// (presumably so that only one work-item performs the copy - see
+// buildThreadCheck for the exact condition).
+//
+// Returns F.
+Function *BIMuxInfoConcept::defineDMA3D(Function &F) {
+  Argument *const ArgDstPtr = F.getArg(0);
+  Argument *const ArgSrcPtr = F.getArg(1);
+  Argument *const ArgLineSize = F.getArg(2);
+  Argument *const ArgDstLineStride = F.getArg(3);
+  Argument *const ArgSrcLineStride = F.getArg(4);
+  Argument *const ArgNumLinesPerPlane = F.getArg(5);
+  Argument *const ArgDstPlaneStride = F.getArg(6);
+  Argument *const ArgSrcPlaneStride = F.getArg(7);
+  Argument *const ArgNumPlanes = F.getArg(8);
+  Argument *const ArgEvent = F.getArg(9);
+
+  auto &M = *F.getParent();
+  auto &Ctx = F.getContext();
+  Type *const I8Ty = IntegerType::get(Ctx, 8);
+
+  // Blocks are created back-to-front, each inserted before the previous one,
+  // so that they end up ordered entry, loop_entry, exit.
+  auto *const ExitBB = BasicBlock::Create(Ctx, "exit", &F);
+  auto *const LoopEntryBB = BasicBlock::Create(Ctx, "loop_entry", &F, ExitBB);
+  auto *const EntryBB = BasicBlock::Create(Ctx, "entry", &F, LoopEntryBB);
+
+  auto *const GetLocalIDFn = getOrDeclareMuxBuiltin(eMuxBuiltinGetLocalId, M);
+  // Guard the dereference below, as the other define* helpers do.
+  assert(GetLocalIDFn && "Could not get or declare the local id builtin");
+  compiler::utils::buildThreadCheck(EntryBB, LoopEntryBB, ExitBB,
+                                    *GetLocalIDFn);
+
+  assert(ArgSrcPtr->getType()->isPointerTy() &&
+         "Mux DMA builtins are always byte-accessed");
+  assert(ArgDstPtr->getType()->isPointerTy() &&
+         "Mux DMA builtins are always byte-accessed");
+
+  // The per-plane source and destination pointers are carried through the
+  // loop as induction variables.
+  compiler::utils::CreateLoopOpts opts;
+  opts.IVs = {ArgSrcPtr, ArgDstPtr};
+  opts.loopIVNames = {"dma.src", "dma.dst"};
+
+  // Create a loop over the planes: each iteration advances the plane
+  // pointers by their plane strides and performs a 2D copy of one plane.
+  BasicBlock *LoopExitBB = compiler::utils::createLoop(
+      LoopEntryBB, nullptr, ConstantInt::get(getSizeType(M), 0), ArgNumPlanes,
+      opts,
+      [&](BasicBlock *BB, Value *, ArrayRef<Value *> IVsCurr,
+          MutableArrayRef<Value *> IVsNext) {
+        IRBuilder<> loopIr(BB);
+        Value *CurrentDmaPlaneSrcPtrPhi = IVsCurr[0];
+        Value *CurrentDmaPlaneDstPtrPhi = IVsCurr[1];
+
+        IVsNext[0] =
+            loopIr.CreateGEP(I8Ty, CurrentDmaPlaneSrcPtrPhi, ArgSrcPlaneStride);
+        IVsNext[1] =
+            loopIr.CreateGEP(I8Ty, CurrentDmaPlaneDstPtrPhi, ArgDstPlaneStride);
+
+        return copy2D(M, *BB, CurrentDmaPlaneDstPtrPhi,
+                      CurrentDmaPlaneSrcPtrPhi, ArgLineSize, ArgDstLineStride,
+                      ArgSrcLineStride, ArgNumLinesPerPlane);
+      });
+
+  IRBuilder<> LoopExitIRB(LoopExitBB);
+  LoopExitIRB.CreateBr(ExitBB);
+
+  IRBuilder<> ExitIRB(ExitBB);
+  ExitIRB.CreateRet(ArgEvent);
+
+  return &F;
+}
+
+// Defines the DMA wait builtin F as a no-op: a single "entry" block holding
+// nothing but a void return. Returns F.
+Function *BIMuxInfoConcept::defineDMAWait(Function &F) {
+  auto *const EntryBB = BasicBlock::Create(F.getContext(), "entry", &F);
+  IRBuilder<> Builder(EntryBB);
+  Builder.CreateRetVoid();
+
+  return &F;
+}
+
+// Provides a definition for the mux builtin identified by ID (and
+// OverloadInfo) in module M. The builtin must already be declared in M.
+//
+// Returns the function if it was already defined or has just been defined,
+// or nullptr if none of the handlers below knows how to define it.
+Function *BIMuxInfoConcept::defineMuxBuiltin(BuiltinID ID, Module &M,
+                                             ArrayRef<Type *> OverloadInfo) {
+  assert(BuiltinInfo::isMuxBuiltinID(ID) && "Only handling mux builtins");
+  Function *const Builtin =
+      M.getFunction(BuiltinInfo::getMuxBuiltinName(ID, OverloadInfo));
+  // FIXME: We'd ideally want to declare it here to reduce pass
+  // inter-dependencies.
+  assert(Builtin && "Function should have been pre-declared");
+
+  // Nothing to do for builtins that already have a body.
+  if (!Builtin->isDeclaration()) {
+    return Builtin;
+  }
+
+  // First, the builtins with a dedicated definition routine.
+  switch (ID) {
+    case eMuxBuiltinGetGlobalId:
+      return defineGetGlobalId(M);
+    case eMuxBuiltinGetGlobalSize:
+      return defineGetGlobalSize(M);
+    case eMuxBuiltinGetLocalLinearId:
+      return defineGetLocalLinearId(M);
+    case eMuxBuiltinGetGlobalLinearId:
+      return defineGetGlobalLinearId(M);
+    case eMuxBuiltinGetEnqueuedLocalSize:
+      return defineGetEnqueuedLocalSize(M);
+    // For every kind of barrier only the memory synchronization part is
+    // emitted; the control-flow side of work-group and sub-group barriers
+    // is assumed to have been dealt with by earlier passes.
+    case eMuxBuiltinMemBarrier:
+      return defineMemBarrier(*Builtin, 0, 1);
+    case eMuxBuiltinSubGroupBarrier:
+    case eMuxBuiltinWorkGroupBarrier:
+      return defineMemBarrier(*Builtin, 1, 2);
+    case eMuxBuiltinDMARead1D:
+    case eMuxBuiltinDMAWrite1D:
+      return defineDMA1D(*Builtin);
+    case eMuxBuiltinDMARead2D:
+    case eMuxBuiltinDMAWrite2D:
+      return defineDMA2D(*Builtin);
+    case eMuxBuiltinDMARead3D:
+    case eMuxBuiltinDMAWrite3D:
+      return defineDMA3D(*Builtin);
+    case eMuxBuiltinDMAWait:
+      return defineDMAWait(*Builtin);
+    case eMuxBuiltinGetSubGroupSize:
+      return defineGetSubGroupSize(*Builtin);
+    case eMuxBuiltinGetSubGroupLocalId:
+      return defineGetSubGroupLocalId(*Builtin);
+    default:
+      break;
+  }
+
+  // Next, give the work-item and then the work-group helpers a chance.
+  if (auto *const Defined = defineLocalWorkItemBuiltin(*this, ID, M)) {
+    return Defined;
+  }
+
+  if (auto *const Defined = defineLocalWorkGroupBuiltin(*this, ID, M)) {
+    return Defined;
+  }
+
+  // Finally, group collectives; anything else is left undefined.
+  auto GroupOp = BuiltinInfo::isMuxGroupCollective(ID);
+  if (!GroupOp) {
+    return nullptr;
+  }
+  return defineSubGroupGroupOpBuiltin(*Builtin, *GroupOp, OverloadInfo);
+}
+
+// Returns true if the mux builtin ID is one of the builtins listed below,
+// all of which require scheduling parameters; returns false for every
+// other ID.
+bool BIMuxInfoConcept::requiresSchedulingParameters(BuiltinID ID) {
+  switch (ID) {
+    // Builtins needing only the work-item struct
+    case eMuxBuiltinGetLocalId:
+    case eMuxBuiltinSetLocalId:
+    case eMuxBuiltinGetSubGroupId:
+    case eMuxBuiltinSetSubGroupId:
+    case eMuxBuiltinGetNumSubGroups:
+    case eMuxBuiltinSetNumSubGroups:
+    case eMuxBuiltinGetMaxSubGroupSize:
+    case eMuxBuiltinSetMaxSubGroupSize:
+    case eMuxBuiltinGetLocalLinearId:
+    // Builtins needing only the work-group struct
+    case eMuxBuiltinGetWorkDim:
+    case eMuxBuiltinGetGroupId:
+    case eMuxBuiltinGetNumGroups:
+    case eMuxBuiltinGetGlobalSize:
+    case eMuxBuiltinGetLocalSize:
+    case eMuxBuiltinGetGlobalOffset:
+    case eMuxBuiltinGetEnqueuedLocalSize:
+    // Builtins needing both the work-item and the work-group structs
+    case eMuxBuiltinGetGlobalId:
+    case eMuxBuiltinGetGlobalLinearId:
+      return true;
+    default:
+      return false;
+  }
+}
+
+// Returns the default replacement type for the target extension type Ty, or
+// nullptr if Ty has no default remapping. When built against LLVM versions
+// older than 17 no remapping is performed and nullptr is always returned.
+Type *BIMuxInfoConcept::getRemappedTargetExtTy(Type *Ty, Module &M) {
+#if LLVM_VERSION_LESS(17, 0)
+  (void)M;
+  (void)Ty;
+#else
+  // Only target extension types are ever remapped.
+  assert(Ty && Ty->isTargetExtTy() && "Only expecting target extension types");
+  auto &Ctx = Ty->getContext();
+  auto *const ExtTy = cast<TargetExtType>(Ty);
+
+  // By default both samplers and events become size_t.
+  if (ExtTy == compiler::utils::tgtext::getSamplerTy(Ctx) ||
+      ExtTy == compiler::utils::tgtext::getEventTy(Ctx)) {
+    return getSizeType(M);
+  }
+
+  // By default every image, whatever its dimensions etc., becomes a pointer
+  // in the default address space to one shared structure type, which is
+  // created on first use.
+  if (ExtTy->getName() == "spirv.Image") {
+    const char *MuxImageTyName = "MuxImage";
+    auto *ImageStructTy = StructType::getTypeByName(Ctx, MuxImageTyName);
+    if (!ImageStructTy) {
+      ImageStructTy = StructType::create(Ctx, MuxImageTyName);
+    }
+    return PointerType::getUnqual(ImageStructTy);
+  }
+
+#endif
+  return nullptr;
+}
+
+// Returns the function for the mux builtin identified by ID (and
+// OverloadInfo, for overloadable builtins) in module M, declaring it first
+// if M doesn't contain a function of that name yet.
+//
+// A newly created declaration has external linkage, named parameters, the
+// builtin-specific function attributes chosen below plus the default
+// builtin attributes, and, if the builtin requires them and the module
+// carries scheduling parameter metadata, the scheduling parameters appended
+// to its parameter list (their indices are recorded as function metadata).
+//
+// OverloadInfo must provide the event type as its first element for the DMA
+// read/write builtins, and the operated-on type as its first element for
+// the group collective builtins.
+//
+// Returns nullptr if ID is not a mux builtin this function can declare.
+Function *BIMuxInfoConcept::getOrDeclareMuxBuiltin(
+    BuiltinID ID, Module &M, ArrayRef<Type *> OverloadInfo) {
+  assert(BuiltinInfo::isMuxBuiltinID(ID) && "Only handling mux builtins");
+  auto FnName = BuiltinInfo::getMuxBuiltinName(ID, OverloadInfo);
+  if (auto *const F = M.getFunction(FnName)) {
+    return F;
+  }
+  auto &Ctx = M.getContext();
+  AttrBuilder AB(Ctx);
+  auto *const SizeTy = getSizeType(M);
+  auto *const Int32Ty = Type::getInt32Ty(Ctx);
+  auto *const VoidTy = Type::getVoidTy(Ctx);
+
+  Type *RetTy = nullptr;
+  SmallVector<Type *, 4> ParamTys;
+  SmallVector<std::string, 4> ParamNames;
+
+  // Fills in the signature shared by all DMA read/write builtins:
+  //   event (dst ptr, src ptr, <size_t operands...>, event)
+  // A read copies from global to local memory and a write copies from local
+  // to global memory. Only the list of size_t operands differs between the
+  // 1D, 2D and 3D variants.
+  auto AddDMASignature = [&](bool IsRead,
+                             ArrayRef<const char *> SizeParamNames) {
+    // We need to be told the target event type to declare this builtin.
+    assert(!OverloadInfo.empty() && "Missing event type");
+    auto *const EventTy = OverloadInfo[0];
+    RetTy = EventTy;
+
+    PointerType *const LocalPtrTy =
+        PointerType::get(Ctx, AddressSpace::Local);
+    PointerType *const GlobalPtrTy =
+        PointerType::get(Ctx, AddressSpace::Global);
+
+    ParamTys.push_back(IsRead ? LocalPtrTy : GlobalPtrTy);
+    ParamNames.push_back("dst");
+
+    ParamTys.push_back(IsRead ? GlobalPtrTy : LocalPtrTy);
+    ParamNames.push_back("src");
+
+    for (const char *P : SizeParamNames) {
+      ParamTys.push_back(SizeTy);
+      ParamNames.push_back(P);
+    }
+
+    ParamTys.push_back(EventTy);
+    ParamNames.push_back("event");
+  };
+
+  switch (ID) {
+    // Ranked Getters
+    case eMuxBuiltinGetLocalId:
+    case eMuxBuiltinGetGlobalId:
+    case eMuxBuiltinGetLocalSize:
+    case eMuxBuiltinGetGlobalSize:
+    case eMuxBuiltinGetGlobalOffset:
+    case eMuxBuiltinGetNumGroups:
+    case eMuxBuiltinGetGroupId:
+    case eMuxBuiltinGetEnqueuedLocalSize:
+      ParamTys.push_back(Int32Ty);
+      ParamNames.push_back("idx");
+      LLVM_FALLTHROUGH;
+    // Unranked Getters
+    case eMuxBuiltinGetWorkDim:
+    case eMuxBuiltinGetSubGroupId:
+    case eMuxBuiltinGetNumSubGroups:
+    case eMuxBuiltinGetSubGroupSize:
+    case eMuxBuiltinGetMaxSubGroupSize:
+    case eMuxBuiltinGetSubGroupLocalId:
+    case eMuxBuiltinGetLocalLinearId:
+    case eMuxBuiltinGetGlobalLinearId: {
+      // Some builtins return uint, others return size_t
+      RetTy = (ID == eMuxBuiltinGetWorkDim || ID == eMuxBuiltinGetSubGroupId ||
+               ID == eMuxBuiltinGetNumSubGroups ||
+               ID == eMuxBuiltinGetSubGroupSize ||
+               ID == eMuxBuiltinGetMaxSubGroupSize ||
+               ID == eMuxBuiltinGetSubGroupLocalId)
+                  ? Int32Ty
+                  : SizeTy;
+      // All of our mux getters are readonly - they may never write data
+      AB.addMemoryAttr(MemoryEffects::readOnly());
+      break;
+    }
+    // Ranked Setters
+    case eMuxBuiltinSetLocalId:
+      ParamTys.push_back(Int32Ty);
+      ParamNames.push_back("idx");
+      LLVM_FALLTHROUGH;
+    // Unranked Setters
+    case eMuxBuiltinSetSubGroupId:
+    case eMuxBuiltinSetNumSubGroups:
+    case eMuxBuiltinSetMaxSubGroupSize: {
+      RetTy = VoidTy;
+      // The local id is a size_t; the other settable values are uint.
+      ParamTys.push_back(ID == eMuxBuiltinSetLocalId ? SizeTy : Int32Ty);
+      ParamNames.push_back("val");
+      break;
+    }
+    case eMuxBuiltinMemBarrier: {
+      RetTy = VoidTy;
+      for (auto PName : {"scope", "semantics"}) {
+        ParamTys.push_back(Int32Ty);
+        ParamNames.push_back(PName);
+      }
+      AB.addAttribute(Attribute::NoMerge);
+      AB.addAttribute(Attribute::NoDuplicate);
+      AB.addAttribute(Attribute::Convergent);
+      break;
+    }
+    case eMuxBuiltinSubGroupBarrier:
+    case eMuxBuiltinWorkGroupBarrier: {
+      RetTy = VoidTy;
+      for (auto PName : {"id", "scope", "semantics"}) {
+        ParamTys.push_back(Int32Ty);
+        ParamNames.push_back(PName);
+      }
+      AB.addAttribute(Attribute::NoMerge);
+      AB.addAttribute(Attribute::NoDuplicate);
+      AB.addAttribute(Attribute::Convergent);
+      break;
+    }
+    case eMuxBuiltinDMAWait:
+      RetTy = VoidTy;
+      // Num events
+      ParamTys.push_back(Int32Ty);
+      ParamNames.push_back("num_events");
+      // The events list
+      ParamTys.push_back(PointerType::getUnqual(Ctx));
+      ParamNames.push_back("events");
+      AB.addAttribute(Attribute::Convergent);
+      break;
+    case eMuxBuiltinDMARead1D:
+    case eMuxBuiltinDMAWrite1D:
+      AddDMASignature(ID == eMuxBuiltinDMARead1D, {"num_bytes"});
+      break;
+    case eMuxBuiltinDMARead2D:
+    case eMuxBuiltinDMAWrite2D:
+      AddDMASignature(ID == eMuxBuiltinDMARead2D,
+                      {"num_bytes", "dst_stride", "src_stride", "height"});
+      break;
+    case eMuxBuiltinDMARead3D:
+    case eMuxBuiltinDMAWrite3D:
+      AddDMASignature(
+          ID == eMuxBuiltinDMARead3D,
+          {"num_bytes", "dst_line_stride", "src_line_stride", "height",
+           "dst_plane_stride", "src_plane_stride", "depth"});
+      break;
+    default:
+      // Group builtins are more easily found using this helper rather than
+      // explicitly enumerating each switch case.
+      if (auto Group = BuiltinInfo::isMuxGroupCollective(ID)) {
+        // We need to be told the type being operated on to declare this
+        // builtin.
+        assert(!OverloadInfo.empty() && "Missing group operation type");
+        RetTy = OverloadInfo.front();
+        AB.addAttribute(Attribute::Convergent);
+        switch (Group->Op) {
+          default:
+            ParamTys.push_back(RetTy);
+            ParamNames.push_back("val");
+            break;
+          case GroupCollective::OpKind::Broadcast:
+            ParamTys.push_back(RetTy);
+            ParamNames.push_back("val");
+            // Broadcasts additionally add ID parameters
+            if (Group->isSubGroupScope()) {
+              ParamTys.push_back(Int32Ty);
+              ParamNames.push_back("lid");
+            } else {
+              ParamTys.push_back(SizeTy);
+              ParamNames.push_back("lidx");
+              ParamTys.push_back(SizeTy);
+              ParamNames.push_back("lidy");
+              ParamTys.push_back(SizeTy);
+              ParamNames.push_back("lidz");
+            }
+            break;
+          case GroupCollective::OpKind::Shuffle:
+            ParamTys.push_back(RetTy);
+            ParamNames.push_back("val");
+            ParamTys.push_back(Int32Ty);
+            ParamNames.push_back("lid");
+            break;
+          case GroupCollective::OpKind::ShuffleXor:
+            ParamTys.push_back(RetTy);
+            ParamNames.push_back("val");
+            ParamTys.push_back(Int32Ty);
+            ParamNames.push_back("xor_val");
+            break;
+          case GroupCollective::OpKind::ShuffleUp:
+            ParamTys.push_back(RetTy);
+            ParamNames.push_back("prev");
+            ParamTys.push_back(RetTy);
+            ParamNames.push_back("curr");
+            ParamTys.push_back(Int32Ty);
+            ParamNames.push_back("delta");
+            break;
+          case GroupCollective::OpKind::ShuffleDown:
+            ParamTys.push_back(RetTy);
+            ParamNames.push_back("curr");
+            ParamTys.push_back(RetTy);
+            ParamNames.push_back("next");
+            ParamTys.push_back(Int32Ty);
+            ParamNames.push_back("delta");
+            break;
+        }
+        // All work-group operations have a 'barrier id' operand as their first
+        // parameter.
+        if (Group->isWorkGroupScope()) {
+          ParamTys.insert(ParamTys.begin(), Int32Ty);
+          ParamNames.insert(ParamNames.begin(), "id");
+        }
+      } else {
+        // Unknown mux builtin
+        return nullptr;
+      }
+  }
+
+  assert(RetTy);
+  assert(ParamTys.size() == ParamNames.size());
+
+  SmallVector<int, 4> SchedParamIdxs;
+  // Fill up the scalar parameters with the default attributes.
+  SmallVector<AttributeSet, 4> ParamAttrs(ParamTys.size(), AttributeSet());
+
+  // Scheduling parameters, if any, always come after the builtin's own
+  // parameters; remember where each one ended up.
+  if (requiresSchedulingParameters(ID) &&
+      getSchedulingParameterModuleMetadata(M)) {
+    for (const auto &P : getMuxSchedulingParameters(M)) {
+      ParamTys.push_back(P.ParamTy);
+      ParamNames.push_back(P.ParamName);
+      ParamAttrs.push_back(P.ParamAttrs);
+      SchedParamIdxs.push_back(ParamTys.size() - 1);
+    }
+  }
+
+  auto *const FnTy = FunctionType::get(RetTy, ParamTys, /*isVarArg*/ false);
+  auto *const F = Function::Create(FnTy, Function::ExternalLinkage, FnName, &M);
+  F->addFnAttrs(AB);
+
+  // Add some extra attributes we know are always true.
+  setDefaultBuiltinAttributes(*F);
+
+  for (unsigned i = 0, e = ParamNames.size(); i != e; i++) {
+    F->getArg(i)->setName(ParamNames[i]);
+    // Named distinctly from the function-level attribute builder above.
+    auto ParamAB = AttrBuilder(Ctx, ParamAttrs[i]);
+    F->getArg(i)->addAttrs(ParamAB);
+  }
+
+  setSchedulingParameterFunctionMetadata(*F, SchedParamIdxs);
+
+  return F;
+}
+
+// By default we use two scheduling parameters, each passed by pointer:
+// * "wi-info": one structure containing local work-group data
+// * "wg-info": one structure containing non-local work-group data
+SmallVector<BuiltinInfo::SchedParamInfo, 4>
+BIMuxInfoConcept::getMuxSchedulingParameters(Module &M) {
+  auto &Ctx = M.getContext();
+  auto &DL = M.getDataLayout();
+  const auto CommonAttrs = AttributeSet()
+                               .addAttribute(Ctx, Attribute::NonNull)
+                               .addAttribute(Ctx, Attribute::NoAlias);
+
+  BuiltinInfo::SchedParamInfo WIInfo;
+  {
+    auto *const StructTy = getWorkItemInfoStructTy(M);
+    AttrBuilder Attrs(Ctx, CommonAttrs);
+    Attrs.addAlignmentAttr(DL.getABITypeAlign(StructTy));
+    Attrs.addDereferenceableAttr(DL.getTypeAllocSize(StructTy));
+
+    WIInfo.ID = SchedParamIndices::WI;
+    WIInfo.ParamName = "wi-info";
+    WIInfo.ParamDebugName = StructTy->getStructName().str();
+    WIInfo.ParamPointeeTy = StructTy;
+    WIInfo.ParamTy = StructTy->getPointerTo();
+    WIInfo.ParamAttrs = AttributeSet::get(Ctx, Attrs);
+    WIInfo.PassedExternally = false;
+  }
+
+  BuiltinInfo::SchedParamInfo WGInfo;
+  {
+    auto *const StructTy = getWorkGroupInfoStructTy(M);
+    AttrBuilder Attrs(Ctx, CommonAttrs);
+    Attrs.addAlignmentAttr(DL.getABITypeAlign(StructTy));
+    Attrs.addDereferenceableAttr(DL.getTypeAllocSize(StructTy));
+
+    WGInfo.ID = SchedParamIndices::WG;
+    WGInfo.ParamName = "wg-info";
+    WGInfo.ParamDebugName = StructTy->getStructName().str();
+    WGInfo.ParamPointeeTy = StructTy;
+    WGInfo.ParamTy = StructTy->getPointerTo();
+    WGInfo.ParamAttrs = AttributeSet::get(Ctx, Attrs);
+    WGInfo.PassedExternally = true;
+  }
+
+  return {WIInfo, WGInfo};
+}
+
+SmallVector<BuiltinInfo::SchedParamInfo, 4>
+BIMuxInfoConcept::getFunctionSchedulingParameters(Function &F) {
+  // A function whose scheduling-parameter metadata yields no indices has no
+  // scheduling parameters to report.
+  auto ArgIdxs = getSchedulingParameterFunctionMetadata(F);
+  if (ArgIdxs.empty()) {
+    return {};
+  }
+
+  auto Params = getMuxSchedulingParameters(*F.getParent());
+  // A function is not allowed to carry just a subset of the global
+  // scheduling parameters.
+  assert(ArgIdxs.size() >= Params.size());
+  // Bind each scheduling parameter to its concrete function argument.
+  for (auto [Param, ArgIdx] : zip(Params, ArgIdxs)) {
+    // A negative index (-1) marks a scheduling parameter that is not present
+    // on this function; its argument value is left unset.
+    if (ArgIdx >= 0) {
+      Param.ArgVal = F.getArg(ArgIdx);
+    }
+  }
+
+  return Params;
+}
+
+Value *BIMuxInfoConcept::initializeSchedulingParamForWrappedKernel(
+    const BuiltinInfo::SchedParamInfo &Info, IRBuilder<> &B, Function &IntoF,
+    Function &) {
+  // Only the work-item info is expected to need initializing here; the
+  // work-group info is passed straight through.
+  (void)IntoF;
+  assert(Info.ID == SchedParamIndices::WI && !Info.PassedExternally &&
+         Info.ParamName == "wi-info" &&
+         Info.ParamPointeeTy == getWorkItemInfoStructTy(*IntoF.getParent()));
+  auto *const WIInfoTy = Info.ParamPointeeTy;
+  return B.CreateAlloca(WIInfoTy, /*ArraySize*/ nullptr, Info.ParamName);
+}
+
+std::optional<llvm::ConstantRange> BIMuxInfoConcept::getBuiltinRange(
+    llvm::CallInst &CI, BuiltinID ID,
+    std::array<std::optional<uint64_t>, 3> MaxLocalSizes,
+    std::array<std::optional<uint64_t>, 3> MaxGlobalSizes) const {
+  assert(CI.getCalledFunction() && CI.getType()->isIntegerTy() &&
+         "Unexpected builtin");
+
+  auto Bits = CI.getType()->getIntegerBitWidth();
+  // Index the global sizes array unless a case below selects the local one.
+  std::array<std::optional<uint64_t>, 3> *SizesPtr = &MaxGlobalSizes;
+
+  switch (ID) {
+    default:
+      return std::nullopt;  // No range is computed for any other builtin.
+    case eMuxBuiltinGetWorkDim:
+      return ConstantRange::getNonEmpty(APInt(Bits, 1), APInt(Bits, 4));
+    case eMuxBuiltinGetLocalId:
+    case eMuxBuiltinGetLocalSize:
+    case eMuxBuiltinGetEnqueuedLocalSize:
+      // Use the local sizes array, and fall through to common handling.
+      SizesPtr = &MaxLocalSizes;
+      [[fallthrough]];
+    case eMuxBuiltinGetGlobalSize: {
+      auto *DimIdx = CI.getOperand(0);
+      if (!isa<ConstantInt>(DimIdx)) {
+        return std::nullopt;  // The dimension index isn't a constant.
+      }
+      const uint64_t DimVal = cast<ConstantInt>(DimIdx)->getZExtValue();
+      if (DimVal >= SizesPtr->size()) {
+        return std::nullopt;  // The dimension index is out of bounds.
+      }
+      const std::optional<uint64_t> Size = (*SizesPtr)[DimVal];
+      if (!Size) {
+        return std::nullopt;  // No maximum size was given for this dimension.
+      }
+      // ID builtins range [0,size) (exclusive), and size builtins [1,size]
+      // (inclusive). Thus offset the range by 1 at each low/high end when
+      // returning the range for a size builtin.
+      const int SizeAdjust = ID == eMuxBuiltinGetLocalSize ||
+                             ID == eMuxBuiltinGetEnqueuedLocalSize ||
+                             ID == eMuxBuiltinGetGlobalSize;
+      return ConstantRange::getNonEmpty(APInt(Bits, SizeAdjust),
+                                        APInt(Bits, Size.value() + SizeAdjust));
+    }
+  }
+}
+
+}  // namespace utils
+}  // namespace compiler
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/optimal_builtin_replacement_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/optimal_builtin_replacement_pass.cpp
new file mode 100644
index 0000000000000..17b5f0ebd77fb
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/optimal_builtin_replacement_pass.cpp
@@ -0,0 +1,312 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// This pass replaces builtin functions with optimal equivalents.
+
+#include <compiler/utils/builtin_info.h>
+#include <compiler/utils/optimal_builtin_replacement_pass.h>
+#include <llvm/ADT/PriorityWorklist.h>
+#include <llvm/ADT/STLExtras.h>
+#include <llvm/Analysis/CGSCCPassManager.h>
+#include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/InstIterator.h>
+#include <llvm/IR/Instructions.h>
+#include <llvm/IR/IntrinsicInst.h>
+#include <llvm/IR/Intrinsics.h>
+#include <multi_llvm/triple.h>
+#include <multi_llvm/vector_type_helper.h>
+
+#define DEBUG_TYPE "ca-optimal-builtins"
+
+using namespace llvm;
+
+namespace {
+
+void removeCallSite(CallBase &CB, LazyCallGraph &CG) {
+  // Drop the caller -> callee edge; the callee is the *called* function.
+  Function *Callee = CB.getCalledFunction();
+  auto CallerNode = CG.get(*CB.getCaller());
+  auto CalleeNode = CG.get(*Callee);
+  if (auto *CallerRef = CG.lookupRefSCC(CallerNode)) {
+    CallerRef->removeOutgoingEdge(CallerNode, CalleeNode);
+  }
+}
+
+}  // namespace
+
+namespace compiler {
+namespace utils {
+
+Value *OptimalBuiltinReplacementPass::replaceAbacusCLZ(
+    CallBase &CB, StringRef BaseName, const SmallVectorImpl<Type *> &,
+    const SmallVectorImpl<TypeQualifiers> &) {
+  if (BaseName != "__abacus_clz") {
+    return nullptr;
+  }
+  Module *M = CB.getModule();
+  SmallVector<Value *, 4> Args(CB.args());
+  // Get the declaration of llvm.ctlz overloaded on the argument's type
+  auto *const ArgTy = Args[0]->getType();
+  auto *const Intrinsic = Intrinsic::getDeclaration(M, Intrinsic::ctlz, ArgTy);
+  // If we didn't find the intrinsic or its return type doesn't match that of
+  // the builtin being replaced, skip this optimization
+  Function *Callee = CB.getCalledFunction();
+  assert(Callee);
+  if (!Intrinsic || Intrinsic->getReturnType() != Callee->getReturnType()) {
+    return nullptr;
+  }
+
+  // On 32-bit ARM, the llvm.ctlz intrinsic on 64-bit types is expanded using
+  // compiler-rt. Without online linking, we can't support that.
+  const Triple TT(CB.getModule()->getTargetTriple());
+  if (TT.getArch() == Triple::arm && ArgTy->isIntOrIntVectorTy(64)) {
+    return nullptr;
+  }
+
+  // LLVM's ctlz takes a second argument; passing false requests a defined
+  // result when the first argument is zero.
+  LLVMContext &Ctx = M->getContext();
+  Args.push_back(ConstantInt::getFalse(Ctx));
+
+  return CallInst::Create(Intrinsic, Args, "", &CB);  // Inserted before CB.
+}
+
+Value *OptimalBuiltinReplacementPass::replaceAbacusMulhi(
+    CallBase &CB, StringRef BaseName, const SmallVectorImpl<Type *> &,
+    const SmallVectorImpl<TypeQualifiers> &Quals) {
+  if (BaseName != "__abacus_mul_hi") {
+    return nullptr;
+  }
+  IRBuilder<> B(&CB);
+
+  auto ArgIt = CB.arg_begin();
+  Value *const LHS = *ArgIt++;
+  Value *const RHS = *ArgIt++;
+
+  Type *const OpTy = LHS->getType();
+  const auto BitWidth = OpTy->getScalarType()->getIntegerBitWidth();
+
+  // The product is formed at twice the operand width. Skip 64-bit operands,
+  // as the 128-bit types that would need aren't generally well supported.
+  if (BitWidth == 64) {
+    return nullptr;
+  }
+
+  unsigned NumElts = 1;
+  if (const auto *VecTy = dyn_cast<VectorType>(OpTy)) {
+    NumElts = multi_llvm::getVectorNumElements(VecTy);
+  }
+
+  // Build the double-width type and the shift amount that moves the high
+  // half of the product into the low half, splatting both for vectors.
+  Type *UpTy = B.getIntNTy(BitWidth * 2);
+  Constant *ShiftAmt = B.getIntN(BitWidth * 2, BitWidth);
+  if (NumElts != 1) {
+    UpTy = FixedVectorType::get(UpTy, NumElts);
+    ShiftAmt = ConstantDataVector::getSplat(NumElts, ShiftAmt);
+  }
+
+  // Sign-extend if any qualifier of the first argument is a signed integer;
+  // zero-extend otherwise.
+  const TypeQualifiers &LHSQuals = Quals[0];
+  bool IsSigned = false;
+  for (unsigned Q = 0, E = LHSQuals.getCount(); Q != E && !IsSigned; ++Q) {
+    IsSigned = LHSQuals.at(Q) == eTypeQualSignedInt;
+  }
+  const auto ExtOp = IsSigned ? Instruction::SExt : Instruction::ZExt;
+
+  // Widen both operands and multiply at double width.
+  auto *const WideLHS = B.CreateCast(ExtOp, LHS, UpTy);
+  auto *const WideRHS = B.CreateCast(ExtOp, RHS, UpTy);
+  auto *const Product = B.CreateMul(WideLHS, WideRHS);
+
+  // Take the high half of the product and narrow back to the operand type.
+  auto *const HighHalf = B.CreateAShr(Product, ShiftAmt);
+
+  return B.CreateTrunc(HighHalf, OpTy);
+}
+
+Value *OptimalBuiltinReplacementPass::replaceAbacusFMinFMax(
+    CallBase &CB, StringRef BaseName, const SmallVectorImpl<Type *> &,
+    const SmallVectorImpl<TypeQualifiers> &) {
+  const bool IsFMax = BaseName == "__abacus_fmax";
+  if (!IsFMax && BaseName != "__abacus_fmin") {
+    return nullptr;
+  }
+
+  // The minnum/maxnum intrinsics fail CTS on arm targets, so leave the
+  // builtin calls alone there. See https://llvm.org/PR27363.
+  const Triple TT(CB.getModule()->getTargetTriple());
+  if (TT.getArch() == Triple::arm || TT.getArch() == Triple::aarch64) {
+    return nullptr;
+  }
+
+  IRBuilder<> B(&CB);
+
+  auto ArgIt = CB.arg_begin();
+  Value *LHS = *ArgIt++;
+  Value *RHS = *ArgIt++;
+
+  const bool LHSIsVec = LHS->getType()->isVectorTy();
+  const bool RHSIsVec = RHS->getType()->isVectorTy();
+
+  // Mixed scalar/vector call: splat the scalar up to the vector's width.
+  if (LHSIsVec != RHSIsVec) {
+    auto *const VecTy = LHSIsVec ? LHS->getType() : RHS->getType();
+    const auto VectorEC = multi_llvm::getVectorElementCount(VecTy);
+    if (LHSIsVec) {
+      RHS = B.CreateVectorSplat(VectorEC, RHS);
+    } else {
+      LHS = B.CreateVectorSplat(VectorEC, LHS);
+    }
+  }
+  const auto IntrID = IsFMax ? Intrinsic::maxnum : Intrinsic::minnum;
+  return B.CreateBinaryIntrinsic(IntrID, LHS, RHS);
+}
+
+OptimalBuiltinReplacementPass::OptimalBuiltinReplacementPass() {
+  replacements.emplace_back(replaceAbacusCLZ);       // __abacus_clz
+  replacements.emplace_back(replaceAbacusMulhi);     // __abacus_mul_hi
+  replacements.emplace_back(replaceAbacusFMinFMax);  // __abacus_fmin/fmax
+}
+
+Value *OptimalBuiltinReplacementPass::replaceBuiltinWithInlineIR(
+    CallBase &CB) const {
+  Function *const Callee = CB.getCalledFunction();
+  assert(Callee);
+  // Demangle the callee's name into a base name, types and qualifiers.
+  SmallVector<Type *, 4> Types;
+  SmallVector<TypeQualifiers, 4> Quals;
+  NameMangler Mangler(&CB.getModule()->getContext());
+  const StringRef BaseName =
+      Mangler.demangleName(Callee->getName(), Types, Quals);
+
+  // Try each replacement in order; the first to produce a value wins.
+  for (const auto &ReplaceFn : replacements) {
+    if (!ReplaceFn) {
+      continue;
+    }
+    if (auto *V = ReplaceFn(CB, BaseName, Types, Quals)) {
+      return V;
+    }
+  }
+  return nullptr;
+}
+
+PreservedAnalyses OptimalBuiltinReplacementPass::run(LazyCallGraph::SCC &C,
+                                                     CGSCCAnalysisManager &AM,
+                                                     LazyCallGraph &CG,
+                                                     CGSCCUpdateResult &) {
+  // Without the possibility of recursion, we can expect all meaningful
+  // OpenCL/ComputeMux programs to be contained within a single singular SCC
+  // serving as the entry point. We use this as the root.
+  if (C.size() != 1) {
+    return PreservedAnalyses::all();
+  }
+  Module &M = *C.begin()->getFunction().getParent();
+
+  // Bail out unless at least one node in this SCC is a SPIR kernel.
+  if (none_of(C, [](const LazyCallGraph::Node &N) {
+        return N.getFunction().getCallingConv() == CallingConv::SPIR_KERNEL;
+      })) {
+    return PreservedAnalyses::all();
+  }
+
+  const auto &MAMProxy = AM.getResult<ModuleAnalysisManagerCGSCCProxy>(C, CG);
+  if (auto *BI = MAMProxy.getCachedResult<BuiltinInfoAnalysis>(M)) {
+    replacements.emplace_back(  // NOTE(review): appended on each run() call
+        [BI](CallBase &CB, StringRef, const SmallVectorImpl<Type *> &,
+             const SmallVectorImpl<TypeQualifiers> &) -> Value * {
+          Function *Callee = CB.getCalledFunction();
+          auto const Props = BI->analyzeBuiltin(*Callee).properties;
+          if (Props & eBuiltinPropertyCanEmitInline) {
+            IRBuilder<> B(&CB);
+            const SmallVector<Value *, 4> Args(CB.args());
+            if (Value *Impl = BI->emitBuiltinInline(Callee, B, Args)) {
+              assert(
+                  Impl->getType() == CB.getType() &&
+                  "The inlinined function type must match that of the original "
+                  "function");
+              return Impl;
+            }
+          }
+          return nullptr;
+        });
+  }
+
+  if (adjustReplacements) {
+    adjustReplacements(replacements);
+  }
+
+  // If there are no replacements to run, for whatever reason, we can bail
+  // early.
+  if (replacements.empty()) {
+    return PreservedAnalyses::all();
+  }
+
+  SmallVector<CallBase *, 4> ToDelete;
+  // The SmallPriorityWorklist prioritises nodes which have been inserted
+  // multiple times, and avoids duplication of already-inserted items, but
+  // *not* ones already visited and popped off.
+  SmallPriorityWorklist<LazyCallGraph::Node *, 4> Worklist;
+  // Assuming we only have one node to begin with (see above), start off with
+  // that.
+  Worklist.insert(&*C.begin());
+  // The worklist only de-duplicates entries that are still pending, so a
+  // function could be queued again after it has been popped and visited.
+  // Track the nodes already queued separately to guard against recursion.
+  SmallPtrSet<LazyCallGraph::Node *, 4> Visited;
+
+  // Now visit all nodes in this "root" graph in order. We will visit
+  // outer-most functions (kernels) first before descending the call graph.
+  // This gives precedence to "outer-most" replacements.
+  while (!Worklist.empty()) {
+    LazyCallGraph::Node *N = Worklist.pop_back_val();
+    LLVM_DEBUG(dbgs() << "OptimalBuiltinReplacement: visiting " << *N << "\n");
+    for (Instruction &I : instructions(N->getFunction())) {
+      if (auto *CB = dyn_cast<CallBase>(&I)) {
+        if (CB->getCalledFunction() && !isa<IntrinsicInst>(I)) {
+          if (Value *New = replaceBuiltinWithInlineIR(*CB)) {
+            LLVM_DEBUG(dbgs()
+                       << "\tOptimalBuiltinReplacement: replacing call to "
+                       << CB->getCalledFunction()->getName() << "\n");
+            ToDelete.push_back(CB);
+            removeCallSite(*CB, CG);
+            // Assume that replacements don't introduce new calls, and we can
+            // simply mark this one as gone and move on.
+            CB->replaceAllUsesWith(New);
+          } else if (auto *CalledN = CG.lookup(*CB->getCalledFunction())) {
+            if (Visited.insert(CalledN).second) {
+              Worklist.insert(CalledN);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  const bool Modified = !ToDelete.empty();
+
+  // Erase the replaced calls now that no instruction iteration is in flight.
+  while (!ToDelete.empty()) {
+    Instruction *I = ToDelete.pop_back_val();
+    I->eraseFromParent();
+  }
+
+  return Modified ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
+}  // namespace utils
+}  // namespace compiler
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp
new file mode 100644
index 0000000000000..35cf68a70dd35
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp
@@ -0,0 +1,756 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <compiler/utils/address_spaces.h>
+#include <compiler/utils/attributes.h>
+#include <compiler/utils/builtin_info.h>
+#include <compiler/utils/device_info.h>
+#include <compiler/utils/metadata.h>
+#include <compiler/utils/pass_functions.h>
+#include <llvm/IR/Attributes.h>
+#include <llvm/IR/Constants.h>
+#include <llvm/IR/DIBuilder.h>
+#include <llvm/IR/DebugInfoMetadata.h>
+#include <llvm/IR/DerivedTypes.h>
+#include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/Instructions.h>
+#include <llvm/IR/IntrinsicInst.h>
+#include <llvm/IR/Module.h>
+#include <llvm/Transforms/Utils/Cloning.h>
+#include <multi_llvm/llvm_version.h>
+#include <multi_llvm/multi_llvm.h>
+#include <multi_llvm/vector_type_helper.h>
+
+#include <cassert>
+
+llvm::AnalysisKey compiler::utils::DeviceInfoAnalysis::Key;
+
+namespace compiler {
+namespace utils {
+
+uint64_t computeApproximatePrivateMemoryUsage(const llvm::Function &fn) {
+  const llvm::Module *const parent = fn.getParent();
+  const auto &DL = parent->getDataLayout();
+  uint64_t total = 0;
+
+  // Only the entry block is scanned: BarrierPass asserts that `allocas` only
+  // exist there.
+  for (const llvm::Instruction &I : fn.getEntryBlock()) {
+    const auto *const AI = llvm::dyn_cast<llvm::AllocaInst>(&I);
+    if (!AI) {
+      continue;
+    }
+    // Only allocas in the private address space are counted.
+    if (AI->getType()->getAddressSpace() != AddressSpace::Private) {
+      continue;
+    }
+    const auto elt_size = DL.getTypeAllocSize(AI->getAllocatedType());
+    if (!AI->isArrayAllocation()) {
+      total += elt_size;
+      continue;
+    }
+    // Array allocations contribute the element size times a constant count.
+    const auto *const count =
+        llvm::dyn_cast<llvm::ConstantInt>(AI->getArraySize());
+    assert(count != nullptr &&
+           "OpenCL or Vulkan Array Allocation of dynamic size");
+    const uint64_t num_elts = count->getUniqueInteger().getLimitedValue();
+    total += num_elts * elt_size;
+  }
+  return total;
+}
+
+static llvm::SmallVector<llvm::Constant *> getNewOps(llvm::Constant *constant,
+                                                     llvm::Constant *from,
+                                                     llvm::Constant *to) {
+  // Build the operand list of `constant`, substituting `to` for every operand
+  // that is exactly `from` and keeping all other operands unchanged.
+  llvm::SmallVector<llvm::Constant *> remapped;
+  for (llvm::Use &operand : constant->operands()) {
+    llvm::Value *const op = operand.get();
+    if (op == from) {
+      remapped.push_back(to);
+      continue;
+    }
+    remapped.push_back(llvm::cast<llvm::Constant>(op));
+  }
+  return remapped;
+}
+
+void remapConstantArray(llvm::ConstantArray *arr, llvm::Constant *from,
+                        llvm::Constant *to) {
+  // Rebuild the array with `from` swapped for `to` among its operands, point
+  // all users at the rebuilt array, then destroy the original.
+  llvm::Constant *const remapped =
+      llvm::ConstantArray::get(arr->getType(), getNewOps(arr, from, to));
+  arr->replaceAllUsesWith(remapped);
+  arr->destroyConstant();
+}
+
+void remapConstantExpr(llvm::ConstantExpr *expr, llvm::Constant *from,
+                       llvm::Constant *to) {
+  // Rebuild the expression with `from` swapped for `to` among its operands,
+  // point all users at the rebuilt expression, then destroy the original.
+  auto *const remapped = expr->getWithOperands(getNewOps(expr, from, to));
+  expr->replaceAllUsesWith(remapped);
+  expr->destroyConstant();
+}
+
+bool funcContainsDebugMetadata(const llvm::Function &func,
+                               llvm::ValueToValueMapTy &vmap) {
+  // Set as soon as any debug info referenced by func is recorded in vmap.
+  bool foundDI = false;
+
+  // Map the DISubprogram attached to the function, if any, to itself.
+  if (auto DISubprogram = func.getSubprogram()) {
+    vmap.MD()[DISubprogram].reset(DISubprogram);
+    foundDI = true;
+  }
+
+  for (auto &BB : func) {
+    for (auto &Inst : BB) {
+      if (auto DL = Inst.getDebugLoc()) {
+        llvm::DILocation *loc = DL.get();
+        vmap.MD()[loc].reset(loc);  // Debug locations map to themselves too.
+        foundDI = true;
+      }
+
+      if (auto DebugIntrinsic = llvm::dyn_cast<llvm::DbgInfoIntrinsic>(&Inst)) {
+        llvm::DILocalVariable *DIVar = nullptr;
+        if (auto DbgVarIntrinsic =
+                llvm::dyn_cast<llvm::DbgVariableIntrinsic>(DebugIntrinsic)) {
+          DIVar = DbgVarIntrinsic->getVariable();
+        } else {
+          continue;  // TODO CA-1115 - we don't handle DbgLabelInsts yet
+        }
+        if (DIVar) {
+          vmap.MD()[DIVar].reset(DIVar);
+          auto varLoc = DIVar->getScope();
+          vmap.MD()[varLoc].reset(varLoc);  // As does the variable's scope.
+          foundDI = true;
+        }
+      }
+    }
+  }
+
+  return foundDI;
+}
+
+void replaceConstantExpressionWithInstruction(llvm::Constant *const constant) {
+  // remove all dead constant users (sometimes these are left over by previous
+  // passes)
+  constant->removeDeadConstantUsers();
+
+  // Only handle constants which are ConstantExpr or ConstantVector
+  assert((llvm::isa<llvm::ConstantExpr>(constant) ||
+          llvm::isa<llvm::ConstantVector>(constant)) &&
+         "Unsupported constant type in IR");
+
+  // For each user of a constant we will check to see if they in turn are
+  // constants. If they are, convert them to instructions first (still
+  // referencing this constant). We are then clear to convert the current
+  // constant to an instruction as the only users left are instructions.
+
+  llvm::SmallVector<llvm::User *, 8> users;
+  // Create the list of users of this constant. We don't want duplicates here,
+  // which often happens with ConstantVectors, as we only want to convert them
+  // to an instruction once. We want determinism here so use a vector to
+  // maintain order.
+  for (auto *constantUser : constant->users()) {
+    if (std::find(users.begin(), users.end(), constantUser) == users.end()) {
+      users.push_back(constantUser);
+    }
+  }
+
+  for (auto *constantUser : users) {
+    if (llvm::isa<llvm::Instruction>(constantUser)) {
+      // instructions are our best case, do nothing!
+    } else if (llvm::Constant *subConstant =
+                   llvm::dyn_cast<llvm::Constant>(constantUser)) {
+      replaceConstantExpressionWithInstruction(subConstant);
+    } else {
+      constantUser->print(llvm::errs());
+      llvm_unreachable("Constant user is not a constant or instruction!!");
+    }
+  }
+
+  // we record each use up front, as the uses are rewritten in the loop below
+  llvm::SmallVector<llvm::Use *, 8> uses;
+
+  for (auto &use : constant->uses()) {
+    uses.push_back(&use);
+  }
+
+  for (auto *use : uses) {
+    // get the instruction that is the user of the use
+    auto inst = llvm::cast<llvm::Instruction>(use->getUser());
+
+    // get the function for this use
+    auto useFunc = inst->getFunction();
+
+    llvm::Instruction *newInst = nullptr;
+    // create a new instruction that matches the constant expression
+    if (llvm::ConstantExpr *constantExpr =
+            llvm::dyn_cast<llvm::ConstantExpr>(constant)) {
+      newInst = constantExpr->getAsInstruction();
+      // insert the instruction at the beginning of the entry block
+      newInst->insertBefore(useFunc->getEntryBlock().getFirstNonPHI());
+    } else if (llvm::ConstantVector *constantVec =
+                   llvm::dyn_cast<llvm::ConstantVector>(constant)) {
+      // If it is a ConstantVector then only handle the case where it is
+      // a single splatted value. This is the only kind generated at present.
+      auto splatVal = constantVec->getSplatValue();
+      assert(splatVal &&
+             "ConstantVector does not contained identical constants so cannot "
+             "be splatted!");
+      // Take the splatted Value and create two instructions. The first is
+      // InsertElement to place it in a new vector and the second is a
+      // ShuffleVector to duplicate the value across the vector.
+      auto numEls = constantVec->getNumOperands();
+      llvm::Value *undef = llvm::UndefValue::get(
+          llvm::FixedVectorType::get(splatVal->getType(), numEls));
+      llvm::Type *i32Ty = llvm::Type::getInt32Ty(constant->getContext());
+      auto insert = llvm::InsertElementInst::Create(
+          undef, splatVal, llvm::ConstantInt::get(i32Ty, 0));
+      insert->insertBefore(useFunc->getEntryBlock().getFirstNonPHI());
+      llvm::Value *zeros = llvm::ConstantAggregateZero::get(
+          llvm::FixedVectorType::get(i32Ty, numEls));
+      newInst = new llvm::ShuffleVectorInst(insert, undef, zeros);
+      newInst->insertAfter(insert);
+    }
+
+    // replace the use of the constant with the instruction
+    use->set(newInst);
+  }
+
+  // lastly, destroy the constant we just replaced
+  constant->destroyConstant();
+}
+
+llvm::AttributeList getCopiedFunctionAttrs(const llvm::Function &oldFn,
+                                           int numParams) {
+  // A negative count means "copy the attributes of every parameter".
+  const unsigned count =
+      numParams < 0 ? oldFn.arg_size() : static_cast<unsigned>(numParams);
+  const llvm::AttributeList srcAttrs = oldFn.getAttributes();
+  // Function::copyAttributes can't simply be used here, as not all arguments
+  // are present in the new function; instead gather the attribute sets of
+  // the leading `count` parameters by hand.
+  llvm::SmallVector<llvm::AttributeSet, 4> paramAttrs;
+  for (unsigned idx = 0; idx != count; ++idx) {
+    paramAttrs.push_back(srcAttrs.getParamAttrs(idx));
+  }
+  return llvm::AttributeList::get(oldFn.getContext(), srcAttrs.getFnAttrs(),
+                                  srcAttrs.getRetAttrs(), paramAttrs);
+}
+
+void copyFunctionAttrs(const llvm::Function &oldFn, llvm::Function &newFn,
+                       int numParams) {  // negative: copy all params
+  newFn.setAttributes(getCopiedFunctionAttrs(oldFn, numParams));
+}
+
+bool cloneFunctionsAddArg(
+    llvm::Module &module,
+    std::function<ParamTypeAttrsPair(llvm::Module &)> paramTypeFunc,
+    std::function<void(const llvm::Function &, bool &ClonedWithBody,
+                       bool &ClonedNoBody)>
+        toBeCloned,
+    const UpdateMDCallbackFn &updateMetaDataCallback) {
+  // mapping from new -> old function
+  llvm::ValueMap<llvm::Function *, llvm::Function *> newToOldMap;
+
+  // Preserve the value map across all function clones
+  llvm::ValueToValueMapTy vmap;
+
+  const ParamTypeAttrsPair paramInfo = paramTypeFunc(module);
+
+  // For each function we run the function toBeCloned to set the bools
+  // doCloneNoBody and doCloneWithBody
+  // first, run through our functions and make copies of all functions that:
+  //   1) are not declarations (these will be builtins we expand later) or
+  //   doCloneNoBody is set (don't clone but flesh out)
+  //   2) are not new functions that we just added
+  //   3) Functions marked by doCloneWithBody
+  for (auto &func : module.functions()) {
+    bool doCloneWithBody = false;
+    bool doCloneNoBody = false;
+
+    toBeCloned(func, doCloneWithBody, doCloneNoBody);
+    const bool isDecl = func.isDeclaration();
+    bool processFunc = (0 == newToOldMap.count(&func));
+
+    if (!isDecl) {
+      processFunc = processFunc && doCloneWithBody;
+    } else {
+      processFunc = processFunc && doCloneNoBody;
+    }
+
+    if (processFunc) {
+      auto funcTy = func.getFunctionType();
+
+      const unsigned numParams = funcTy->getNumParams();
+      llvm::SmallVector<llvm::Type *, 8> newParamTypes(numParams + 1);
+
+      // add each param from the original function to the new one
+      for (unsigned i = 0; i < numParams; i++) {
+        newParamTypes[i] = funcTy->getParamType(i);
+      }
+      // and lastly add our extra arg as the last param
+      newParamTypes[numParams] = paramInfo.first;
+
+      auto newFuncTy = llvm::FunctionType::get(funcTy->getReturnType(),
+                                               newParamTypes, false);
+
+      // create our new function, using the linkage from the old one
+      auto newFunc =
+          llvm::Function::Create(newFuncTy, func.getLinkage(), "", &module);
+
+      // set the correct calling convention
+      newFunc->setCallingConv(func.getCallingConv());
+
+      // take the name of the old function
+      newFunc->takeName(&func);
+
+      // Copy names over for the parameters
+      llvm::Function::arg_iterator DestI = newFunc->arg_begin();
+      for (const auto &I : func.args()) {
+        (*DestI).setName(I.getName());  // Copy the name over...
+        DestI++;
+      }
+
+      if (isDecl) {
+        // copy debug info for function over; CloneFunctionInto takes care of
+        // this if this function has a body
+        newFunc->setSubprogram(func.getSubprogram());
+        // copy the metadata into the new function, ignoring any debug info.
+        copyFunctionMetadata(func, *newFunc);
+      } else {
+        // map all original function arguments to the new function arguments
+        for (auto iter = func.arg_begin(), iter_end = func.arg_end(),
+                  new_iter = newFunc->arg_begin();
+             iter != iter_end; ++iter, ++new_iter) {
+          vmap[(&*iter)] = (&*new_iter);
+        }
+
+        llvm::SmallVector<llvm::ReturnInst *, 8> returns;
+
+        // we have module changes if our function contains any debug info
+        assert(newFunc->getParent() &&
+               "assumed newFunc has an associated module");
+        const bool hasDbgMetadata = funcContainsDebugMetadata(func, vmap);
+        const bool differentModules = newFunc->getParent() != func.getParent();
+        auto changeType = differentModules
+                              ? llvm::CloneFunctionChangeType::DifferentModule
+                              : llvm::CloneFunctionChangeType::LocalChangesOnly;
+        if (hasDbgMetadata) {
+          changeType = std::max(changeType,
+                                llvm::CloneFunctionChangeType::GlobalChanges);
+        }
+        CloneFunctionInto(newFunc, &func, vmap, changeType, returns);
+      }
+
+      // Add in the new parameter attributes here, because CloneFunctionInto
+      // wipes out pre-existing attributes on newFunc which aren't in oldFunc.
+      newFunc->addParamAttrs(numParams, llvm::AttrBuilder(newFunc->getContext(),
+                                                          paramInfo.second));
+
+      // map new func -> old func
+      newToOldMap[newFunc] = &func;
+
+      // remove the body of the old function that we are going to delete
+      // anyway, so that none of its callsites get processed in the remainder
+      // of this pass
+      func.deleteBody();
+    }
+  }
+
+  // next, remap all callsites that would have called the old function, to the
+  // new function we just created
+  for (auto pair : newToOldMap) {
+    llvm::Function *newFunc = pair.first;
+    llvm::Function *oldFunc = pair.second;
+
+    remapClonedCallsites(*oldFunc, *newFunc, true);
+
+    // next, let the caller update any metadata.
+    if (updateMetaDataCallback) {
+      updateMetaDataCallback(*oldFunc, *newFunc,
+                             newFunc->getFunctionType()->getNumParams() - 1);
+    }
+  }
+
+  // lastly, remove all the old functions we no longer need
+  for (auto pair : newToOldMap) {
+    // the old function, no longer used
+    llvm::Function *const oldFunc = pair.second;
+
+    // then destroy the function
+    oldFunc->eraseFromParent();
+  }
+
+  return true;
+}
+
+// Replaces the callsites of `oldFunc` with calls to `newFunc`.
+//
+// Every user of `oldFunc` is expected to be either a call instruction or a
+// constant expression; any other kind of user is treated as unreachable.
+// Constant expressions are handed to remapConstantExpr. Each call instruction
+// is rebuilt as a call to `newFunc` with the same operands, name, debug
+// location and calling convention.
+//
+// If `extraArg` is true, the new call receives one additional trailing
+// operand: the last argument of the function containing the callsite.
+void remapClonedCallsites(llvm::Function &oldFunc, llvm::Function &newFunc,
+                          bool extraArg) {
+  // Old calls are only queued here while walking the users; they are erased
+  // in one go once every replacement has been created.
+  llvm::SmallVector<llvm::CallInst *, 32> staleCalls;
+
+  // Visit everything that refers to the old function.
+  for (auto *user : oldFunc.users()) {
+    auto *const oldCall = llvm::dyn_cast<llvm::CallInst>(user);
+
+    // Anything that is not a call instruction must be a constant expression,
+    // which is remapped by a dedicated helper.
+    if (!oldCall) {
+      if (auto *const expr = llvm::dyn_cast<llvm::ConstantExpr>(user)) {
+        remapConstantExpr(expr, &oldFunc, &newFunc);
+        continue;
+      }
+      llvm_unreachable(
+          "UNHANDLED user for Function not a CallInst or ConstantExpr\n");
+    }
+
+    // Keep a copy of the old call's name to give to its replacement.
+    const std::string callName = oldCall->getName().str();
+
+    // Start from the operands of the old callsite, in order.
+    llvm::SmallVector<llvm::Value *, 8> callArgs(oldCall->arg_begin(),
+                                                 oldCall->arg_end());
+
+    // The extra parameter always comes last; its value is forwarded from the
+    // last argument of the function containing the callsite.
+    if (extraArg) {
+      callArgs.push_back(getLastArgument(oldCall->getFunction()));
+    }
+
+    // Build the replacement call immediately before the old one.
+    auto *const newCall =
+        llvm::CallInst::Create(&newFunc, callArgs, callName, oldCall);
+
+    // Carry over the debug location (if any) and the calling convention.
+    newCall->setDebugLoc(oldCall->getDebugLoc());
+    newCall->setCallingConv(oldCall->getCallingConv());
+
+    // Point all users of the old call's result at the new call.
+    oldCall->replaceAllUsesWith(newCall);
+
+    // Remember the old call so that it can be erased after the walk.
+    staleCalls.push_back(oldCall);
+  }
+
+  // Finally destroy the calls that have been superseded.
+  for (llvm::CallInst *stale : staleCalls) {
+    stale->eraseFromParent();
+  }
+}
+
+bool addParamToAllFunctions(llvm::Module &module,
+                            llvm::Type *const newParamType,
+                            const llvm::AttributeSet &newParamAttrs,
+                            const UpdateMDCallbackFn &updateMetaDataCallback) {
+  // Every function is offered the same extra parameter type and attributes.
+  auto paramInfoFn = [newParamType, newParamAttrs](llvm::Module &) {
+    return ParamTypeAttrsPair{newParamType, newParamAttrs};
+  };
+  // Names starting with "__llvm" are reserved for clang-generated functions
+  // (e.g. profiling): no with-body clone for those. No-body cloning is off.
+  auto selectFn = [](const llvm::Function &func, bool &ClonedWithBody,
+                     bool &ClonedNoBody) {
+    ClonedWithBody = !func.getName().starts_with("__llvm");
+    ClonedNoBody = false;
+  };
+  return cloneFunctionsAddArg(module, paramInfoFn, selectFn,
+                              updateMetaDataCallback);
+}
+
+// Emits a counted loop after `entry`. A new header block holds a PHI for the
+// loop counter (starting at `indexStart`) and one PHI per entry of opts.IVs.
+// `body` emits the loop body and returns the block it ends in; there the
+// counter is incremented and the loop repeats while the new value is
+// unsigned-less-than `indexEnd`. Returns the exit block (created if null).
+llvm::BasicBlock *createLoop(llvm::BasicBlock *entry, llvm::BasicBlock *exit,
+                             llvm::Value *indexStart, llvm::Value *indexEnd,
+                             const CreateLoopOpts &opts,
+                             CreateLoopBodyFn body) {
+  llvm::LLVMContext &ctx = entry->getContext();
+
+  // The counter advances by opts.indexInc, or by 1 when none was supplied.
+  llvm::Value *step = opts.indexInc;
+  if (!step) {
+    step = llvm::ConstantInt::get(indexStart->getType(), 1);
+  }
+
+  // Initial values of the user-provided IVs, plus one slot per IV for its
+  // next-iteration value; both are passed to the body callback.
+  llvm::SmallVector<llvm::Value *, 4> currIVs(opts.IVs.begin(), opts.IVs.end());
+  llvm::SmallVector<llvm::Value *, 4> nextIVs(opts.IVs.size());
+
+  // Create the loop header and branch to it from the end of the entry block.
+  llvm::BasicBlock *const header =
+      llvm::BasicBlock::Create(ctx, opts.headerName, entry->getParent());
+  llvm::IRBuilder<> entryIR(entry);
+  llvm::IRBuilder<> loopIR(header);
+  entryIR.CreateBr(header);
+
+  // The loop counter is the first PHI in the header; it starts at indexStart
+  // when control arrives from the entry block.
+  auto *const counter = loopIR.CreatePHI(step->getType(), 2);
+  counter->addIncoming(indexStart, entry);
+
+  // Each user IV gets its own PHI, seeded with its initial value.
+  for (unsigned i = 0, e = currIVs.size(); i != e; i++) {
+    auto *const ivPhi = loopIR.CreatePHI(currIVs[i]->getType(), 2);
+    ivPhi->addIncoming(currIVs[i], entry);
+    // Set IV names if they've been given to us.
+    if (i < opts.loopIVNames.size()) {
+      ivPhi->setName(opts.loopIVNames[i]);
+    }
+    currIVs[i] = ivPhi;
+  }
+
+  // Run the callback that emits the loop body; it returns the block in which
+  // the body finished, which becomes the latch.
+  llvm::BasicBlock *const latch = body(header, counter, currIVs, nextIVs);
+  llvm::IRBuilder<> bodyIR(latch);
+
+  // Advance the counter in the latch and feed it back into the counter PHI.
+  auto *const next = bodyIR.CreateAdd(counter, step);
+  counter->addIncoming(next, latch);
+
+  // Likewise close the back edge of every user IV.
+  for (unsigned i = 0, e = currIVs.size(); i != e; i++) {
+    llvm::cast<llvm::PHINode>(currIVs[i])->addIncoming(nextIVs[i], latch);
+  }
+
+  // Without a caller-provided exit block, create one for the loop to leave to.
+  if (!exit) {
+    exit = llvm::BasicBlock::Create(ctx, "exitIR", entry->getParent());
+  }
+
+  // Loop again while the incremented counter is (unsigned) below indexEnd,
+  // otherwise leave through the exit block.
+  auto *const keepGoing = bodyIR.CreateICmpULT(next, indexEnd);
+  auto *const termBR = bodyIR.CreateCondBr(keepGoing, header, exit);
+
+  if (opts.disableVectorize) {
+    auto *const vecDisable = llvm::MDNode::get(
+        ctx, {llvm::MDString::get(ctx, "llvm.loop.vectorize.enable"),
+              llvm::ConstantAsMetadata::get(
+                  llvm::ConstantInt::get(llvm::Type::getInt1Ty(ctx), false))});
+    // LLVM loop metadata -- for legacy reasons -- must have a reference to
+    // itself as its first operand. See
+    // https://llvm.org/docs/LangRef.html#llvm-loop.
+    auto *loopID = llvm::MDNode::get(ctx, {nullptr, vecDisable});
+    loopID->replaceOperandWith(0, loopID);
+    termBR->setMetadata(llvm::LLVMContext::MD_loop, loopID);
+  }
+
+  // we stopped executing in the exit block, so return that
+  return exit;
+}
+
+llvm::Argument *getLastArgument(llvm::Function *f) {
+  assert(f->arg_size() != 0 &&
+         "Can't get last argument if there are no arguments");
+  return f->getArg(f->arg_size() - 1);
+}
+
+unsigned getSizeTypeBytes(const llvm::Module &m) {
+  return m.getDataLayout().getPointerSize(0);  // bytes, address space 0
+}
+
+llvm::IntegerType *getSizeType(const llvm::Module &m) {
+  // an integer exactly as wide as a pointer in address space 0
+  const unsigned sizeBits = m.getDataLayout().getPointerSizeInBits(0);
+  return llvm::IntegerType::get(m.getContext(), sizeBits);
+}
+
+static llvm::Function *createKernelWrapperFunctionImpl(
+    llvm::Function &F, llvm::Function &NewFunction, llvm::StringRef Suffix,
+    llvm::StringRef OldSuffix) {
+  // Make sure we take a copy of the basename as we're going to change the
+  // original function's name from underneath the StringRef.
+  const std::string baseName = getOrSetBaseFnName(NewFunction, F).str();
+
+  if (!OldSuffix.empty()) {
+    if (getBaseFnName(F).empty()) {
+      setBaseFnName(F, F.getName());
+    }
+    F.setName(F.getName() + OldSuffix);
+  }
+
+  NewFunction.setName(baseName + Suffix);
+
+  // we don't use exceptions, so the wrapper is marked nounwind
+  NewFunction.addFnAttr(llvm::Attribute::NoUnwind);
+
+  // the wrapper uses the same calling convention as the old function
+  NewFunction.setCallingConv(F.getCallingConv());
+
+  // copy the metadata into the new kernel ignoring any debug info.
+  copyFunctionMetadata(F, NewFunction);
+
+  // drop kernel (+ entry point) information from the old function: we've
+  // copied it over to the new one.
+  dropIsKernel(F);
+
+  // give the wrapper an artificial copy of the old function's subprogram
+  if (auto *SP = F.getSubprogram()) {
+    const llvm::DIBuilder DIB(*F.getParent());
+    llvm::DISubprogram *const NewSP = DIB.createArtificialSubprogram(SP);
+#if LLVM_VERSION_GREATER_EQUAL(17, 0)
+    // Wipe the list of retained nodes, as this new function is a wrapper over
+    // the old one and does not itself contain/retain any of the wrapped
+    // function's nodes.
+    NewSP->replaceRetainedNodes({});
+#else
+    // This does the same as the above, but there's no cleaner API with which
+    // to do it.
+    NewSP->replaceOperandWith(7, nullptr);
+#endif
+    NewFunction.setSubprogram(NewSP);
+  }
+
+  // mark the old (wrapped) function always-inline, unless it is 'noinline'
+  if (!F.hasFnAttribute(llvm::Attribute::NoInline)) {
+    F.addFnAttr(llvm::Attribute::AlwaysInline);
+  }
+
+  // lastly give the old (wrapped) function internal linkage
+  F.setLinkage(llvm::GlobalValue::InternalLinkage);
+
+  return &NewFunction;
+}
+
+llvm::Function *createKernelWrapperFunction(llvm::Function &F,
+                                            llvm::StringRef Suffix,
+                                            llvm::StringRef OldSuffix) {
+  // The wrapper has F's function type and lives in F's module.
+  llvm::Function *const Wrapper = llvm::Function::Create(
+      F.getFunctionType(), llvm::Function::ExternalLinkage, "", F.getParent());
+
+  // Function attributes, including parameter attributes, come from F.
+  copyFunctionAttrs(F, *Wrapper);
+
+  // Parameter names are carried over position by position.
+  for (unsigned I = 0, E = F.arg_size(); I != E; ++I) {
+    Wrapper->getArg(I)->setName(F.getArg(I)->getName());
+  }
+
+  return createKernelWrapperFunctionImpl(F, *Wrapper, Suffix, OldSuffix);
+}
+
+llvm::Function *createKernelWrapperFunction(
+    llvm::Module &M, llvm::Function &F, llvm::ArrayRef<llvm::Type *> ArgTypes,
+    llvm::StringRef Suffix, llvm::StringRef OldSuffix) {
+  // The wrapper keeps F's return type but takes ArgTypes as its parameters.
+  auto *const WrapperTy = llvm::FunctionType::get(F.getReturnType(), ArgTypes,
+                                                  /*isVarArg=*/false);
+
+  // Create the (still unnamed) wrapper in M.
+  llvm::Function *const Wrapper = llvm::Function::Create(
+      WrapperTy, llvm::Function::ExternalLinkage, "", &M);
+
+  // No parameter attributes are copied: the parameter mapping is unknown.
+  copyFunctionAttrs(F, *Wrapper, 0);
+
+  return createKernelWrapperFunctionImpl(F, *Wrapper, Suffix, OldSuffix);
+}
+
+llvm::CallInst *createCallToWrappedFunction(
+    llvm::Function &WrappedF, const llvm::SmallVectorImpl<llvm::Value *> &Args,
+    llvm::BasicBlock *BB, llvm::BasicBlock::iterator InsertPt,
+    llvm::StringRef Name) {
+  // The call mirrors WrappedF's calling convention and attributes.
+  llvm::CallInst *const Call =
+      llvm::CallInst::Create(WrappedF.getFunctionType(), &WrappedF, Args);
+  Call->setName(Name);
+  Call->setCallingConv(WrappedF.getCallingConv());
+  Call->setAttributes(getCopiedFunctionAttrs(WrappedF));
+
+  // Without a block, the call is returned without being inserted anywhere.
+  if (!BB) {
+    return Call;
+  }
+  Call->insertInto(BB, InsertPt);
+
+  // An inlinable function call in a function with debug info *must* be given
+  // a debug location, so attach a line-0 location in the parent's subprogram.
+  llvm::Function *const ParentF = BB->getParent();
+  if (auto *const SP = ParentF ? ParentF->getSubprogram() : nullptr) {
+    Call->setDebugLoc(llvm::DILocation::get(ParentF->getContext(),
+                                            /*line*/ 0, /*col*/ 0, SP));
+  }
+
+  return Call;
+}
+
+llvm::Value *createBinOpForRecurKind(llvm::IRBuilderBase &B, llvm::Value *LHS,
+                                     llvm::Value *RHS, llvm::RecurKind Kind) {
+  // Kinds with a direct instruction equivalent are dealt with first.
+  switch (Kind) {
+    case llvm::RecurKind::None:
+      return nullptr;
+    case llvm::RecurKind::Add:
+      return B.CreateAdd(LHS, RHS);
+    case llvm::RecurKind::Mul:
+      return B.CreateMul(LHS, RHS);
+    case llvm::RecurKind::Or:
+      return B.CreateOr(LHS, RHS);
+    case llvm::RecurKind::And:
+      return B.CreateAnd(LHS, RHS);
+    case llvm::RecurKind::Xor:
+      return B.CreateXor(LHS, RHS);
+    case llvm::RecurKind::FAdd:
+      return B.CreateFAdd(LHS, RHS);
+    case llvm::RecurKind::FMul:
+      return B.CreateFMul(LHS, RHS);
+    default:
+      break;
+  }
+
+  const bool IsFMin = Kind == llvm::RecurKind::FMin;
+  const bool IsFMax = Kind == llvm::RecurKind::FMax;
+  const bool IsSMin = Kind == llvm::RecurKind::SMin;
+  const bool IsSMax = Kind == llvm::RecurKind::SMax;
+  const bool IsUMin = Kind == llvm::RecurKind::UMin;
+  assert((IsFMin || IsFMax || IsSMin || IsSMax || IsUMin ||
+          Kind == llvm::RecurKind::UMax) &&
+         "Unexpected min/max Kind");
+  // Every remaining kind maps to a min/max intrinsic; umax is what is left.
+  const llvm::Intrinsic::ID MinMaxID = IsFMin   ? llvm::Intrinsic::minnum
+                                       : IsFMax ? llvm::Intrinsic::maxnum
+                                       : IsSMin ? llvm::Intrinsic::smin
+                                       : IsUMin ? llvm::Intrinsic::umin
+                                       : IsSMax ? llvm::Intrinsic::smax
+                                                : llvm::Intrinsic::umax;
+  return B.CreateBinaryIntrinsic(MinMaxID, LHS, RHS);
+}
+
+}  // namespace utils
+}  // namespace compiler
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_machinery.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_machinery.cpp
new file mode 100644
index 0000000000000..02527c0309406
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_machinery.cpp
@@ -0,0 +1,136 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <compiler/utils/pass_machinery.h>
+#include <llvm/ADT/StringMap.h>
+#include <llvm/Analysis/AliasAnalysis.h>
+#include <multi_llvm/llvm_version.h>
+
+using namespace llvm;
+
+namespace compiler {
+namespace utils {
+
+// Note that Clang has three on/off options for debugging pass managers:
+// `-fdebug-pass-manager`, `-fdebug-pass-structure`, and
+// `-fdebug-pass-arguments`.
+// LLVM's `opt` tool combines them all into one:
+//   --debug-pass-manager (Normal)
+//   --debug-pass-manager=verbose (Verbose)
+//   --debug-pass-manager=quiet (Quiet)
+// However, the mapping is not one-to-one:
+// opt:
+//   PrintPassOptions PrintPassOpts;
+//   PrintPassOpts.Verbose = DebugPM == DebugLogging::Verbose;
+//   PrintPassOpts.SkipAnalyses = DebugPM == DebugLogging::Quiet;
+//   StandardInstrumentations SI(DebugPM != DebugLogging::None, VerifyEachPass,
+//                               PrintPassOpts);
+// clang:
+//   bool DebugPassStructure = CodeGenOpts.DebugPass == "Structure";
+//   PrintPassOptions PrintPassOpts;
+//   PrintPassOpts.Indent = DebugPassStructure;
+//   PrintPassOpts.SkipAnalyses = DebugPassStructure;
+//   StandardInstrumentations SI(CodeGenOpts.DebugPassManager ||
+//                                   DebugPassStructure,
+//                               false, PrintPassOpts);
+// While clang also pushes `mdebug-pass` onto LLVM, it only works for the
+// legacy pass manager, and so we choose to only support and model the
+// `debug-pass-manager` form.
+static cl::opt<DebugLogging> DebugPM(
+    "debug-pass-manager", cl::Hidden, cl::ValueOptional,
+    cl::desc("Print pass management debugging information"),
+    cl::init(DebugLogging::None),  // logging is off unless the flag is given
+    cl::values(
+        clEnumValN(DebugLogging::Normal, "", ""),  // flag given with no value
+        clEnumValN(DebugLogging::Quiet, "quiet",
+                   "Skip printing info about analyses"),
+        clEnumValN(
+            DebugLogging::Verbose, "verbose",
+            "Print extra information about adaptors and pass managers")));
+
+static cl::opt<bool> VerifyEach("verify-each",
+                                cl::desc("Verify after each transform"));
+
+PassMachinery::PassMachinery(LLVMContext &Ctx, TargetMachine *TM,
+                             bool VerifyEach, DebugLogging debugLogLevel)
+    : TM(TM) {  // NB: VerifyEach below is the parameter, not the cl::opt
+  llvm::PrintPassOptions PrintPassOpts;
+  PrintPassOpts.Verbose = DebugPM == DebugLogging::Verbose;  // DebugPM option
+  PrintPassOpts.SkipAnalyses = DebugPM == DebugLogging::Quiet;  // ditto
+  PrintPassOpts.Indent = debugLogLevel != DebugLogging::None;  // argument
+  SI = std::make_unique<StandardInstrumentations>(
+      Ctx, debugLogLevel != DebugLogging::None, VerifyEach, PrintPassOpts);
+}
+
+PassMachinery::~PassMachinery() = default;
+
+void PassMachinery::initializeStart(PipelineTuningOptions PTO) {
+  // (re)create the pass builder, without any profile-guided options
+  PB = PassBuilder(TM, PTO, /*PGOOpt=*/std::nullopt, &PIC);
+}
+
+void PassMachinery::registerPasses() {
+  buildDefaultAAPipeline();  // the default alias analysis pipeline first...
+  registerLLVMAnalyses();    // ...then the standard LLVM analyses
+}
+
+void PassMachinery::initializeFinish() {
+  // Register LLVM analyses now, with the knowledge that users have had the
+  // chance to register their own versions of LLVM analyses first.
+  registerPasses();
+  // With all analyses registered, cross-register the analysis manager proxies
+  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
+
+  // Allow registration of callbacks and instrumentation machinery
+  addClassToPassNames();
+  registerPassCallbacks();
+
+  // Register pass instrumentation (given MAM from LLVM 17 on, FAM before)
+#if LLVM_VERSION_GREATER_EQUAL(17, 0)
+  SI->registerCallbacks(PIC, &MAM);
+#else
+  SI->registerCallbacks(PIC, &FAM);
+#endif
+}
+
+void PassMachinery::buildDefaultAAPipeline() {
+  FAM.registerPass([this] { return PB.buildDefaultAAPipeline(); });
+}
+
+void PassMachinery::registerLLVMAnalyses() {
+  // Register the pass builder's analyses with each analysis manager in turn
+  PB.registerModuleAnalyses(MAM);
+  PB.registerCGSCCAnalyses(CGAM);
+  PB.registerFunctionAnalyses(FAM);
+  PB.registerLoopAnalyses(LAM);
+}
+
+}  // namespace utils
+}  // namespace compiler
+
+namespace compiler {
+namespace utils {
+/// Printing helper: writes a pass name on its own line, indented two spaces.
+void printPassName(StringRef PassName, raw_ostream &OS) {
+  OS << "  " << PassName << "\n";
+}
+
+void printPassName(StringRef PassName, StringRef Params, raw_ostream &OS) {
+  OS << "  " << PassName << "<" << Params << ">\n";  // "  name<params>"
+}
+
+}  // namespace utils
+}  // namespace compiler
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/prepare_barriers_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/prepare_barriers_pass.cpp
new file mode 100644
index 0000000000000..9220e0c9d2261
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/prepare_barriers_pass.cpp
@@ -0,0 +1,127 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <compiler/utils/attributes.h>
+#include <compiler/utils/builtin_info.h>
+#include <compiler/utils/prepare_barriers_pass.h>
+#include <llvm/ADT/SmallPtrSet.h>
+#include <llvm/IR/Instructions.h>
+#include <llvm/Transforms/Utils/Cloning.h>
+#include <multi_llvm/multi_llvm.h>
+
+#include <functional>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ca-barriers"
+
+// Inlines non-kernel functions containing barriers, then numbers barriers.
+PreservedAnalyses compiler::utils::PrepareBarriersPass::run(
+    Module &M, ModuleAnalysisManager &AM) {
+  SmallPtrSet<Function *, 4> Kernels;
+  auto &BI = AM.getResult<BuiltinInfoAnalysis>(M);
+  for (auto &F : M.functions()) {
+    if (isKernelEntryPt(F)) {
+      Kernels.insert(&F);
+    }
+  }
+
+  // Live, non-kernel functions that call a builtin taking a barrier ID.
+  SmallPtrSet<Function *, 4> FuncsWithBarriers;
+  for (Function &F : M) {
+    const auto B = BI.analyzeBuiltin(F);
+    // Only builtins that take a barrier ID are of interest.
+    if (!BI.isMuxBuiltinWithBarrierID(B.ID)) {
+      continue;
+    }
+
+    for (User *U : F.users()) {
+      if (auto *const CI = dyn_cast<CallInst>(U)) {
+        // The function containing the call, i.e. the barrier's caller.
+        auto *const Caller = CI->getFunction();
+
+        // If it's one of our kernels don't inline it, and definitely don't
+        // delete it either. No need to inline already dead functions, either!
+        if (!Caller->isDefTriviallyDead() && Kernels.count(Caller) == 0) {
+          FuncsWithBarriers.insert(Caller);
+        }
+      }
+    }
+  }
+
+  bool Changed = false;
+
+  // Inline the worklist functions into their callers until none are left.
+  while (!FuncsWithBarriers.empty()) {
+    auto *F = *FuncsWithBarriers.begin();
+    FuncsWithBarriers.erase(F);
+
+    // Iterate over a copy of F's users: InlineFunction changes F's use list,
+    // which would otherwise invalidate the range being walked.
+    const SmallVector<User *, 8> Users{F->user_begin(), F->user_end()};
+
+    for (User *U : Users) {
+      // Only call instructions can be inlined; skip any other kind of user.
+      auto *const CI = dyn_cast<CallInst>(U);
+      if (!CI) {
+        continue;
+      }
+
+      auto *const Caller = CI->getFunction();
+      InlineFunctionInfo IFI;
+      auto InlineResult =
+          InlineFunction(*CI, IFI, /*MergeAttributes*/ false,
+                         /*CalleeAAR*/ nullptr, /*InsertLifetime*/ true,
+                         /*ForwardVarArgsTo*/ nullptr);
+      if (InlineResult.isSuccess()) {
+        Changed = true;
+        // The function we inlined into now contains a barrier, so add it
+        // to the set.
+        if (!Caller->isDefTriviallyDead() && Kernels.count(Caller) == 0) {
+          FuncsWithBarriers.insert(Caller);
+        }
+      } else {
+        LLVM_DEBUG(dbgs() << "Could not inline: " << *U << '\n';);
+      }
+    }
+
+    // Delete the now-dead inlined function
+    if (F->isDefTriviallyDead()) {
+      F->eraseFromParent();
+    }
+  }
+
+  // Assign all barriers a unique ID
+  unsigned ID = 0U;
+  auto *const I32Ty = IntegerType::get(M.getContext(), 32);
+  for (auto *F : Kernels) {
+    for (BasicBlock &BB : *F) {
+      for (Instruction &I : BB) {
+        if (auto *const CI = dyn_cast<CallInst>(&I)) {
+          Function *Callee = CI->getCalledFunction();
+          if (Callee &&
+              BI.isMuxBuiltinWithBarrierID(BI.analyzeBuiltin(*Callee).ID)) {
+            CI->setOperand(0, ConstantInt::get(I32Ty, ID++));
+            // Rewriting the ID operand modifies the IR, so it must be reported.
+            Changed = true;
+          }
+        }
+      }
+    }
+  }
+
+  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp
new file mode 100644
index 0000000000000..bfd12374f960b
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp
@@ -0,0 +1,641 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <compiler/utils/address_spaces.h>
+#include <compiler/utils/attributes.h>
+#include <compiler/utils/metadata.h>
+#include <compiler/utils/pass_functions.h>
+#include <compiler/utils/replace_local_module_scope_variables_pass.h>
+#include <llvm/IR/Constants.h>
+#include <llvm/IR/DIBuilder.h>
+#include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/Instructions.h>
+#include <llvm/Transforms/Utils/Cloning.h>
+#include <multi_llvm/vector_type_helper.h>
+
+#include <cassert>
+#include <functional>
+
+using namespace llvm;
+
+namespace {
+using AlignIntTy = uint64_t;
+
+// Creates and returns a new GEP instruction, inserted before input parameter
+// 'inst'. This GEP points to the element at 'index' of the struct living at
+// the final argument of each function.
+GetElementPtrInst *generateStructGEP(Instruction &inst,
+                                     StructType *funcsStructTy,
+                                     unsigned index) {
+  // The struct of local module-scope variables is reached through the last
+  // argument of the function containing 'inst'; it must be a pointer.
+  Argument *const structArg =
+      compiler::utils::getLastArgument(inst.getFunction());
+  assert(structArg->getType()->isPointerTy());
+
+  // Two i32 indices: 0 steps through the pointer itself, 'index' then
+  // selects the member of the struct.
+  IntegerType *const i32Ty = Type::getInt32Ty(inst.getModule()->getContext());
+  Value *const gepIndices[] = {
+      ConstantInt::get(i32Ty, 0),
+      ConstantInt::get(i32Ty, index),
+  };
+
+  // Emit the in-bounds GEP immediately before 'inst'.
+  return GetElementPtrInst::CreateInBounds(funcsStructTy, structArg,
+                                           gepIndices, "", &inst);
+}
+
+// Given the type of a __local variable about to be added to the
+// struct function calculates and returns the alignment of the type.
+AlignIntTy calculateTypeAlign(Type *type, const DataLayout &layout) {
+  // Arrays are aligned like their innermost element type.
+  while (auto *const arrayTy = dyn_cast<ArrayType>(type)) {
+    type = arrayTy->getElementType();
+  }
+
+  // A pointer is aligned to the pointer size of the default address space.
+  if (type->isPointerTy()) {
+    return layout.getPointerSize();
+  }
+
+  // 3 component wide vectors have the size of 4 components according to the
+  // OpenCL spec section 6.1.5 'Alignment of Types'
+  unsigned int lanes = 1;
+  if (type->isVectorTy()) {
+    lanes = multi_llvm::getVectorNumElements(type);
+    if (lanes == 3) {
+      lanes = 4;
+    }
+  }
+
+  // Each lane counts for at least 8 bits, so that integer types narrower
+  // than i8 do not end up with a zero alignment.
+  const unsigned int laneBits = std::max(type->getScalarSizeInBits(), 8u);
+  return (laneBits * lanes) / 8;
+}
+
+// Local address space variables that are not kernel arguments can only be
+// declared at the outermost scope of a kernel. Returns the function holding
+// the global's first user (directly, or through a constant vector).
+Function *determineKernel(GlobalVariable &global) {
+  User *const firstUser = *global.user_begin();
+  if (auto *const inst = dyn_cast<Instruction>(firstUser)) {
+    return inst->getFunction();
+  }
+  if (auto *const vec = dyn_cast<ConstantVector>(firstUser)) {
+    return cast<Instruction>(*vec->user_begin())->getFunction();
+  }
+  if (firstUser) {
+    firstUser->print(errs());
+    llvm_unreachable("Unknown user used the local module-scope variable!");
+  }
+  return nullptr;
+}
+
+// Information associated with a local address space module-scope variable
+// that is needed to update its debug info metadata
+struct GlobalVarDebugInfoWrapper final {
+  // Byte offset into struct of replacement variables
+  unsigned offset;
+  // Associated debug info metadata entry
+  DIGlobalVariable *DIGlobal;
+  // Kernel function variable was defined in
+  Function *function;
+};
+
+}  // namespace
+
+PreservedAnalyses compiler::utils::ReplaceLocalModuleScopeVariablesPass::run(
+    Module &M, ModuleAnalysisManager &) {
+  // Moves every used __local (address space 3) global into one struct that a
+  // new per-kernel wrapper allocas and passes to the kernel as an argument.
+
+  // the element types of the struct that will hold the replacements for the
+  // local module-scope variables (including any padding members)
+  SmallVector<Type *, 8> structElementTypes;
+
+  // ordered list of kernel names which are used to find cached function
+  // types. StringRef is safe here because the names will be taken over from
+  // the old functions to the new ones.
+  SmallVector<StringRef, 4> names;
+
+  // unmodified function types of the kernels in the module
+  DenseMap<StringRef, FunctionType *> functionTypes;
+
+  for (auto &F : M.functions()) {
+    if (isKernel(F)) {
+      names.push_back(F.getName());
+      functionTypes[F.getName()] = F.getFunctionType();
+    }
+  }
+
+  // a map from the original global variable to the index into
+  // structElementTypes
+  ValueMap<GlobalVariable *, unsigned> index_map;
+
+  // the global variables we need to process and remove
+  SmallVector<GlobalVariable *, 8> globals;
+
+  // maps variables in `globals` we're processing to helper information
+  // needed for updating debug info
+  DenseMap<GlobalVariable *, GlobalVarDebugInfoWrapper> debug_info_map;
+
+  // __local address space automatic variables are represented in the LLVM
+  // module as global variables with address space 3.
+  //
+  // This pass identifies these variables and places them into a struct
+  // allocated in a newly created wrapper function. A pointer to the struct
+  // is then passed via a parameter to the original kernel.
+  for (auto &global : M.globals()) {
+    // get the type of the global variable
+    const auto type = global.getType();
+
+    if (global.use_empty()) {
+      continue;
+    }
+
+    if (type->isPointerTy() &&
+        type->getPointerAddressSpace() == AddressSpace::Local) {
+      // and save that this is a global we care about
+      globals.push_back(&global);
+    }
+  }
+
+  // if we found no local module-scope variables to be replaced...
+  if (globals.empty()) {
+    // ... then we're done!
+    return PreservedAnalyses::all();
+  }
+
+  // Pad struct so that members are aligned.
+  //
+  // Unlike x86, ARM architecture alignment can be different from the
+  // member size. So that __local alignment is OpenCL conformant
+  // we need to manually pad our struct.
+  //
+  // To do this we keep track of each local module-scope element's
+  // offset in the struct, and ensure that it is a multiple of
+  // that element's alignment. Finally we then align the whole struct
+  // to the largest alignment found out of all our __local members.
+
+  // track largest member alignment found so far.
+  unsigned int maxAlignment = 0;
+  // byte offset in struct of current member
+  unsigned int offset = 0;
+  const auto &dl = M.getDataLayout();
+  for (auto &global : globals) {
+    auto memberType = global->getValueType();
+
+    // alignment of the new struct member, in the case where we can't
+    // calculate this, e.g. struct types, use the alignment of the llvm
+    // global. This is also needed if '__attribute__(aligned)' was used to
+    // set a specific alignment.
+    const unsigned int alignment =
+        std::max(global->getAlignment(), calculateTypeAlign(memberType, dl));
+    assert(alignment > 0 && "'0' is an impossible alignment");
+
+    // remember the largest alignment seen so far
+    maxAlignment = std::max(maxAlignment, alignment);
+
+    // check if member is not already aligned
+    const unsigned int remainder = offset % alignment;
+    if (0 != remainder) {
+      // calculate number of padding bytes
+      const unsigned int padding = alignment - remainder;
+
+      // Use a byte array to pad struct rather than trying to create
+      // an arbitrary intNTy, since this may not be supported by the backend.
+      const auto padByteType = Type::getInt8Ty(M.getContext());
+      const auto padByteArrayType = ArrayType::get(padByteType, padding);
+      structElementTypes.push_back(padByteArrayType);
+
+      // bump offset by padding size
+      offset += padding;
+    }
+
+    // we need the byte-offset when generating debug info
+    debug_info_map[global] = {offset, nullptr, nullptr};
+
+    // map the global variable to its index in structElementTypes
+    index_map[global] = structElementTypes.size();
+
+    // then add our element type to the struct
+    structElementTypes.push_back(memberType);
+
+    // update the offset based on the type's size
+    const auto allocSize = dl.getTypeAllocSize(memberType);
+    if (allocSize.isScalable()) {
+      // Not an assert because this can happen in user-supplied IR
+      report_fatal_error("Scalable types in local memory are not supported");
+    }
+    offset += allocSize.getFixedValue();
+  }
+
+  // create a struct containing all the local module-scope variables
+  auto structTy = StructType::create(structElementTypes, "localVarTypes");
+
+  // change all our functions to take a pointer to the new structTy we created
+  const AttributeSet defaultAttrs;
+  addParamToAllFunctions(M, structTy->getPointerTo(), defaultAttrs);
+
+  // Check if we have debug info, if so we need to fix it up to turn global
+  // variable entries into local variable ones.
+  if (const auto NMD = M.getNamedMetadata("llvm.dbg.cu")) {
+    // This first step only unlinks the replaced globals from their compile
+    // unit; the replacement local variables are created with the wrappers.
+    for (auto *CUOp : NMD->operands()) {
+      // Find module compilation unit
+      DICompileUnit *CU = cast<DICompileUnit>(CUOp);
+
+      // Check if there are any debug info global variables, as the DMA
+      // pass can create global variables without debug metadata attached.
+      auto DIGlobalVariables = CU->getGlobalVariables();
+      if (DIGlobalVariables.empty()) {
+        continue;
+      }
+      // Updated list of global debug info variables so that it no longer
+      // contains entries we will later replace with DILocalVariable metadata
+      SmallVector<Metadata *, 2> CU_DIExprs;
+      for (auto &global : M.globals()) {
+        // Get debug info expression for global variable
+        SmallVector<DIGlobalVariableExpression *, 1> Global_DIExprs;
+        global.getDebugInfo(Global_DIExprs);
+
+        if (Global_DIExprs.empty()) {
+          continue;
+        }
+
+        if (!is_contained(globals, &global)) {
+          // This is not a __local address space variable we will
+          // replace, so retain its debug info in the CU MDNode
+          CU_DIExprs.append(Global_DIExprs.begin(), Global_DIExprs.end());
+        } else {
+          // We will replace this debug info variable later
+          assert(Global_DIExprs.size() == 1 &&
+                 "Only expecting a single debug info variable");
+          debug_info_map[&global].DIGlobal = Global_DIExprs[0]->getVariable();
+        }
+      }
+      CU->replaceGlobalVariables(MDTuple::get(M.getContext(), CU_DIExprs));
+    }
+  }
+
+  for (auto &global : globals) {
+    const SmallVector<User *, 8> users(global->users());
+
+    for (auto *user : users) {
+      // if we have a constant expression, we need to force it back to a
+      // normal instruction, as we are removing the constant that the
+      // constant expression was associated with (we are removing the global
+      // variable), we can't use a constant expression to calculate the
+      // result.
+      if (auto *constant = dyn_cast<ConstantExpr>(user)) {
+        replaceConstantExpressionWithInstruction(constant);
+      }
+    }
+  }
+
+  for (auto &global : globals) {
+    if (debug_info_map[global].DIGlobal) {
+      // If global variable has debug info, find out what kernel the __local
+      // variable was defined in so we can use that information later.
+      debug_info_map[global].function = determineKernel(*global);
+      assert(debug_info_map[global].function);
+    }
+
+    // For each user that matches a specific kind of instruction, we do 3
+    // different things:
+    // 1) Create a GEP instruction to retrieve the address of the local
+    // version of 'global' in the newly created local struct.
+    // 2) Create a cast instruction to cast the type of the GEP created
+    // in 1) to the type of the global variable.
+    // 3) Replace the use of the global variable with the instruction
+    // created in 2).
+    const SmallVector<User *, 4> users(global->users());
+    for (auto *user : users) {
+      // if we have a GEP instruction...
+      if (GetElementPtrInst *gep = dyn_cast<GetElementPtrInst>(user)) {
+        auto local = generateStructGEP(*gep, structTy, index_map[global]);
+
+        auto castedLocal =
+            CastInst::CreatePointerCast(local, global->getType(), "", gep);
+
+        gep->setOperand(0, castedLocal);
+        gep->setIsInBounds();
+      } else if (CastInst *cast = dyn_cast<CastInst>(user)) {
+        auto local = generateStructGEP(*cast, structTy, index_map[global]);
+
+        auto castedLocal =
+            CastInst::CreatePointerCast(local, global->getType(), "", cast);
+
+        cast->setOperand(0, castedLocal);
+      } else if (LoadInst *load = dyn_cast<LoadInst>(user)) {
+        auto local = generateStructGEP(*load, structTy, index_map[global]);
+
+        auto castedLocal =
+            CastInst::CreatePointerCast(local, global->getType(), "", load);
+
+        load->setOperand(0, castedLocal);
+      } else if (StoreInst *store = dyn_cast<StoreInst>(user)) {
+        auto local = generateStructGEP(*store, structTy, index_map[global]);
+
+        auto castedLocal =
+            CastInst::CreatePointerCast(local, global->getType(), "", store);
+        // global could be pointer or value operand of the store
+        if (store->getValueOperand() == global) {
+          store->setOperand(0, castedLocal);
+        } else {
+          store->setOperand(1, castedLocal);
+        }
+      } else if (ConstantVector *cv = dyn_cast<ConstantVector>(user)) {
+        // Because 'cv' is not an instruction, we have to iterate over all its
+        // users and do the work for all of them individually.
+        for (auto cvIt = cv->user_begin(); cvIt != cv->user_end();) {
+          auto cvUser = *cvIt++;
+          auto inst = ::cast<Instruction>(cvUser);
+          auto local = generateStructGEP(*inst, structTy, index_map[global]);
+
+          auto castedLocal =
+              CastInst::CreatePointerCast(local, global->getType(), "", inst);
+
+          auto indexTy = Type::getInt32Ty(M.getContext());
+          Value *newCv = UndefValue::get(cv->getType());
+
+          // We can't simply 'setOperand' in a 'ConstantVector'. We have to
+          // recreate it from scratch.
+          for (unsigned i = 0; i < cv->getNumOperands(); ++i) {
+            if (cv->getOperand(i) == global) {
+              newCv = InsertElementInst::Create(
+                  newCv, castedLocal, ConstantInt::get(indexTy, i), "", inst);
+            } else {
+              newCv = InsertElementInst::Create(newCv, cv->getOperand(i),
+                                                ConstantInt::get(indexTy, i),
+                                                "", inst);
+            }
+          }
+
+          // And don't forget to replace 'cv' by 'newCv'.
+          inst->replaceUsesOfWith(cv, newCv);
+        }
+      } else if (PHINode *phi = dyn_cast<PHINode>(user)) {
+        // Because we can't create 1) before a phi node, we have to create it
+        // before the terminator of the incoming block.
+        for (unsigned i = 0; i < phi->getNumIncomingValues(); ++i) {
+          if (phi->getIncomingValue(i) == global) {
+            auto incomingBlock = phi->getIncomingBlock(i);
+            auto incomingBlockT = incomingBlock->getTerminator();
+            auto local =
+                generateStructGEP(*incomingBlockT, structTy, index_map[global]);
+
+            auto castedLocal = CastInst::CreatePointerCast(
+                local, global->getType(), "", incomingBlockT);
+
+            phi->setIncomingValue(i, castedLocal);
+          }
+        }
+      } else if (AtomicRMWInst *atomic = dyn_cast<AtomicRMWInst>(user)) {
+        auto local = generateStructGEP(*atomic, structTy, index_map[global]);
+
+        auto castedLocal =
+            CastInst::CreatePointerCast(local, global->getType(), "", atomic);
+
+        // global could be pointer or value operand of the atomic
+        if (atomic->getPointerOperand() == global) {
+          atomic->setOperand(0, castedLocal);
+        } else {
+          atomic->setOperand(1, castedLocal);
+        }
+      } else if (auto *atomic = dyn_cast<AtomicCmpXchgInst>(user)) {
+        const auto local =
+            generateStructGEP(*atomic, structTy, index_map[global]);
+        const auto castedLocal =
+            CastInst::CreatePointerCast(local, global->getType(), "", atomic);
+
+        // global could be the pointer
+        if (atomic->getPointerOperand() == global) {
+          atomic->setOperand(0, castedLocal);
+        }
+        // the comparison value
+        if (atomic->getCompareOperand() == global) {
+          atomic->setOperand(1, castedLocal);
+        }
+        // the new value
+        if (atomic->getNewValOperand() == global) {
+          atomic->setOperand(2, castedLocal);
+        }
+      } else if (SelectInst *select = dyn_cast<SelectInst>(user)) {
+        auto local = generateStructGEP(*select, structTy, index_map[global]);
+
+        auto castedLocal =
+            CastInst::CreatePointerCast(local, global->getType(), "", select);
+
+        // global could be the true or false value of the select
+        if (select->getTrueValue() == global) {
+          select->setOperand(1, castedLocal);
+        } else {
+          select->setOperand(2, castedLocal);
+        }
+      } else if (CallInst *call = dyn_cast<CallInst>(user)) {
+        auto local = generateStructGEP(*call, structTy, index_map[global]);
+
+        auto castedLocal =
+            CastInst::CreatePointerCast(local, global->getType(), "", call);
+
+        // the global may be passed as more than one operand of the call
+        for (unsigned i = 0; i < call->getNumOperands(); ++i) {
+          if (call->getOperand(i) == global) {
+            call->setOperand(i, castedLocal);
+          }
+        }
+      } else if (InsertElementInst *insertIns =
+                     dyn_cast<InsertElementInst>(user)) {
+        auto local = generateStructGEP(*insertIns, structTy, index_map[global]);
+        auto castedLocal = CastInst::CreatePointerCast(local, global->getType(),
+                                                       "", insertIns);
+        // Update middle operand as the others are the vector and index
+        insertIns->setOperand(1, castedLocal);
+      } else if (auto *cmpIns = dyn_cast<CmpInst>(user)) {
+        const auto local =
+            generateStructGEP(*cmpIns, structTy, index_map[global]);
+        const auto castedLocal =
+            CastInst::CreatePointerCast(local, global->getType(), "", cmpIns);
+        // global could be either side of the compare
+        if (cmpIns->getOperand(0) == global) {
+          cmpIns->setOperand(0, castedLocal);
+        }
+        if (cmpIns->getOperand(1) == global) {
+          cmpIns->setOperand(1, castedLocal);
+        }
+      } else {
+        user->print(errs());
+        llvm_unreachable("Unknown user used the local module-scope variable!");
+      }
+    }
+  }
+
+  // lastly, we create a wrapper function with the original kernel signature
+  // of each kernel, which will alloca the struct for the remapped local
+  // module-scope variables
+  for (const auto &name : names) {
+    // the original kernel function
+    auto *kernelFunc = M.getFunction(name);
+
+    // the original kernel function type, saved earlier
+    auto kernelFuncTy = functionTypes[name];
+
+    auto newFunc =
+        Function::Create(kernelFuncTy, kernelFunc->getLinkage(), "", &M);
+
+    // copy over function parameter names
+    for (unsigned i = 0, e = newFunc->arg_size(); i != e; i++) {
+      newFunc->getArg(i)->setName(kernelFunc->getArg(i)->getName());
+    }
+    // copy over function/parameter/ret attributes
+    copyFunctionAttrs(*kernelFunc, *newFunc, newFunc->arg_size());
+
+    auto baseName = getOrSetBaseFnName(*newFunc, *kernelFunc);
+    newFunc->setName(baseName + ".mux-local-var-wrapper");
+
+    // copy over function metadata
+    copyFunctionMetadata(*kernelFunc, *newFunc);
+    // drop the old function's kernel information - we've stolen it.
+    dropIsKernel(*kernelFunc);
+
+    // copy the calling convention too
+    newFunc->setCallingConv(kernelFunc->getCallingConv());
+
+    // we don't use exceptions
+    newFunc->addFnAttr(Attribute::NoUnwind);
+
+    // next, set the original kernel to always inline unless it has a
+    // noinline attribute.
+    if (!kernelFunc->hasFnAttribute(Attribute::NoInline)) {
+      kernelFunc->addFnAttr(Attribute::AlwaysInline);
+    }
+
+    // lastly set the linkage to internal
+    kernelFunc->setLinkage(GlobalValue::InternalLinkage);
+
+    // move debug info for function over
+    newFunc->setSubprogram(kernelFunc->getSubprogram());
+    kernelFunc->setSubprogram(nullptr);
+
+    // create an irbuilder and basic block for our new function
+    IRBuilder<> ir(BasicBlock::Create(newFunc->getContext(), "", newFunc));
+
+    // stack allocate the local module-scope variables struct
+    auto alloca = ir.CreateAlloca(structTy);
+    alloca->setAlignment(MaybeAlign(maxAlignment).valueOrOne());
+
+    // Generate debug info metadata for the globals we have replaced
+    // which previously had debug info attached
+    for (auto global : globals) {
+      auto debug_info_wrapper = debug_info_map[global];
+      auto DIGlobal = debug_info_wrapper.DIGlobal;
+      if (!DIGlobal) {
+        // No debug info for GlobalVariable
+        continue;
+      }
+
+      // Expression for byte offset in newly allocated struct where our
+      // replacement variable lives
+      const unsigned offset = debug_info_wrapper.offset;
+      const uint64_t dwPlusOp = dwarf::DW_OP_plus_uconst;
+      DIBuilder DIB(M, /*AllowUnresolved*/ false);
+      auto offset_expr =
+          DIB.createExpression(ArrayRef<uint64_t>{dwPlusOp, offset});
+
+      // enqueued_kernel_scope is true if the variable was originally defined
+      // in kernelFunc, the kernel being enqueued by the user, rather than
+      // another kernel function being called by kernelFunc.
+      auto func = debug_info_wrapper.function;
+      const bool enqueued_kernel_scope = !func->getSubprogram();
+      auto DISubprogram = enqueued_kernel_scope ? newFunc->getSubprogram()
+                                                : func->getSubprogram();
+
+      // We can't guarantee a subprogram for all functions.
+      // FIXME: Should we be able to? Do we need to clone subprograms somehow?
+      // See CA-4241.
+      if (!DISubprogram) {
+        continue;
+      }
+
+      // Create replacement debug metadata entry representing the global
+      // as a DILocalVariable in the kernel function scope.
+      auto DILocal = DIB.createAutoVariable(
+          DISubprogram, DIGlobal->getName(), DIGlobal->getFile(),
+          DIGlobal->getLine(), dyn_cast<DIType>(DIGlobal->getType()));
+
+      // Insert debug declare intrinsic pointing to the location of
+      // the variable in our allocated struct
+      auto *location =
+          DILocation::get(DISubprogram->getContext(), DIGlobal->getLine(),
+                          /*Column*/ 0, DISubprogram);
+      if (enqueued_kernel_scope) {
+        DIB.insertDeclare(alloca, DILocal, offset_expr, location,
+                          alloca->getParent());
+      } else {
+        // A pointer to our struct is passed as the last argument to each
+        // function, use this argument if the global came from another kernel
+        // function which is called by kernelFunc.
+        auto last_arg = func->arg_end() - 1;
+        DIB.insertDeclare(last_arg, DILocal, offset_expr, location,
+                          func->getEntryBlock().getFirstNonPHIOrDbg());
+      }
+    }
+
+    // create a buffer for our args
+    SmallVector<Value *, 8> args;
+
+    for (auto &arg : newFunc->args()) {
+      args.push_back(&arg);
+    }
+
+    // add the new alloca for the local module-scope variables struct
+    args.push_back(alloca);
+
+    // call the original function
+    auto ci = ir.CreateCall(kernelFunc, args);
+    ci->setCallingConv(kernelFunc->getCallingConv());
+    ci->setAttributes(getCopiedFunctionAttrs(*kernelFunc));
+
+    // and return void
+    ir.CreateRetVoid();
+  }
+
+  // erase all the global variables that we have removed all uses for
+  for (auto global : globals) {
+    // Vecz generates constant vectors that reference local-scope global
+    // variables. If we erased such a global, LLVM would assert because the
+    // constant vector held by the LLVMContext still uses it. As a result,
+    // if a constant vector uses a global variable with local scope, keep
+    // it.
+    bool keepIt = false;
+    for (auto *user : global->users()) {
+      if (isa<ConstantVector>(user)) {
+        keepIt = true;
+        break;
+      }
+    }
+
+    if (!keepIt) {
+      global->eraseFromParent();
+    }
+  }
+
+  return PreservedAnalyses::none();
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/scheduling.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/scheduling.cpp
new file mode 100644
index 0000000000000..96657c945cc83
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/scheduling.cpp
@@ -0,0 +1,154 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <compiler/utils/metadata.h>
+#include <compiler/utils/pass_functions.h>
+#include <compiler/utils/scheduling.h>
+#include <llvm/IR/Attributes.h>
+#include <llvm/IR/Function.h>
+#include <llvm/IR/IRBuilder.h>
+#include <sys/types.h>
+
+using namespace llvm;
+
+namespace compiler {
+namespace utils {
+// Names under which the scheduling info struct types are created/looked up.
+static constexpr const char *WorkItemParamName = "MuxWorkItemInfo";
+static constexpr const char *WorkGroupParamName = "MuxWorkGroupInfo";
+
+StructType *getWorkItemInfoStructTy(llvm::Module &M) {
+  LLVMContext &Ctx = M.getContext();
+  // Reuse the named struct if an earlier call already created it.
+  StructType *StructTy = StructType::getTypeByName(Ctx, WorkItemParamName);
+  if (StructTy) {
+    return StructTy;
+  }
+  // Field types: 32-bit integers and a three-element array of the size type.
+  Type *const I32Ty = Type::getInt32Ty(Ctx);
+  Type *const Size3Ty = ArrayType::get(getSizeType(M), 3);
+
+  // Fill one slot per field, addressed by its WorkItemInfoStructField index.
+  SmallVector<Type *, WorkItemInfoStructField::total> Fields(
+      WorkItemInfoStructField::total);
+  Fields[WorkItemInfoStructField::local_id] = Size3Ty;
+  Fields[WorkItemInfoStructField::sub_group_id] = I32Ty;
+  Fields[WorkItemInfoStructField::num_sub_groups] = I32Ty;
+  Fields[WorkItemInfoStructField::max_sub_group_size] = I32Ty;
+  return StructType::create(Fields, WorkItemParamName);
+}
+
+StructType *getWorkGroupInfoStructTy(llvm::Module &M) {
+  LLVMContext &Ctx = M.getContext();
+  // Reuse the named struct if an earlier call already created it.
+  StructType *StructTy = StructType::getTypeByName(Ctx, WorkGroupParamName);
+  if (StructTy) {
+    return StructTy;
+  }
+  // Field types: a 32-bit integer and a three-element array of the size type.
+  Type *const I32Ty = Type::getInt32Ty(Ctx);
+  Type *const Size3Ty = ArrayType::get(getSizeType(M), 3);
+
+  // Fill one slot per field, addressed by its WorkGroupInfoStructField index.
+  SmallVector<Type *, WorkGroupInfoStructField::total> Fields(
+      WorkGroupInfoStructField::total);
+  Fields[WorkGroupInfoStructField::group_id] = Size3Ty;
+  Fields[WorkGroupInfoStructField::num_groups] = Size3Ty;
+  Fields[WorkGroupInfoStructField::global_offset] = Size3Ty;
+  Fields[WorkGroupInfoStructField::local_size] = Size3Ty;
+  Fields[WorkGroupInfoStructField::work_dim] = I32Ty;
+  return StructType::create(Fields, WorkGroupParamName);
+}
+
+void populateStructSetterFunction(Function &F, Argument &structPtrArg,
+                                  StructType *const structTy,
+                                  uint32_t structFieldIdx, bool hasRankArg) {
+  // Only a body-less declaration may be populated here.
+  assert(F.isDeclaration() && "Scrubbing existing function");
+
+  // The generated setter gets internal linkage and is always inlined.
+  F.addFnAttr(Attribute::AlwaysInline);
+  F.setLinkage(GlobalValue::InternalLinkage);
+
+  // Leading arguments: the optional rank index, then the value to store.
+  auto nextArg = F.arg_begin();
+  Value *rankIdx = nullptr;
+  if (hasRankArg) {
+    rankIdx = nextArg++;
+  }
+  Value *const newValue = nextArg++;
+
+  IRBuilder<> builder(BasicBlock::Create(F.getContext(), "", &F));
+  // Index the struct field and, when ranked, the element within that field.
+  SmallVector<Value *, 3> indices{builder.getInt32(0),
+                                  builder.getInt32(structFieldIdx)};
+  if (hasRankArg) {
+    indices.push_back(rankIdx);
+  }
+  assert(structPtrArg.getType()->isPointerTy() &&
+         "Assuming a pointer type as the last argument");
+
+  builder.CreateStore(newValue,
+                      builder.CreateGEP(structTy, &structPtrArg, indices));
+  builder.CreateRetVoid();
+}
+
+void populateStructGetterFunction(llvm::Function &F, Argument &structPtrArg,
+                                  llvm::StructType *const structTy,
+                                  uint32_t structFieldIdx, bool hasRankArg,
+                                  size_t defaultValue) {
+  // Only a body-less declaration may be populated here.
+  assert(F.isDeclaration() && "Scrubbing existing function");
+
+  // The generated getter gets internal linkage and is always inlined.
+  F.addFnAttr(Attribute::AlwaysInline);
+  F.setLinkage(GlobalValue::InternalLinkage);
+
+  // When present, the rank index is the function's first argument.
+  Argument *rankIdx = nullptr;
+  if (hasRankArg) {
+    rankIdx = F.arg_begin();
+  }
+
+  assert(structPtrArg.getType()->isPointerTy() &&
+         "Assuming a pointer type as the last argument");
+
+  IRBuilder<> builder(BasicBlock::Create(F.getContext(), "", &F));
+
+  SmallVector<Value *, 3> indices{builder.getInt32(0),
+                                  builder.getInt32(structFieldIdx)};
+  // Set only for ranked getters: true when the rank is below 3 (x, y or z).
+  Value *inRange = nullptr;
+  if (hasRankArg) {
+    inRange =
+        builder.CreateICmp(CmpInst::ICMP_ULT, rankIdx, builder.getInt32(3));
+    // An out-of-range rank indexes element 0 instead.
+    indices.push_back(
+        builder.CreateSelect(inRange, rankIdx, builder.getInt32(0)));
+  }
+
+  Value *result = builder.CreateLoad(
+      F.getReturnType(), builder.CreateGEP(structTy, &structPtrArg, indices));
+  if (hasRankArg) {
+    // An out-of-range rank yields the default rather than the loaded value.
+    result = builder.CreateSelect(
+        inRange, result, ConstantInt::get(F.getReturnType(), defaultValue));
+  }
+  builder.CreateRet(result);
+}
+
+}  // namespace utils
+}  // namespace compiler
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/sub_group_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/sub_group_analysis.cpp
new file mode 100644
index 0000000000000..a4b20adc3b18e
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/sub_group_analysis.cpp
@@ -0,0 +1,168 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <compiler/utils/attributes.h>
+#include <compiler/utils/builtin_info.h>
+#include <compiler/utils/sub_group_analysis.h>
+#include <llvm/ADT/PriorityWorklist.h>
+#include <llvm/ADT/SetOperations.h>
+
+using namespace llvm;
+
+namespace compiler {
+namespace utils {
+
+GlobalSubgroupInfo::GlobalSubgroupInfo(Module &M, BuiltinInfo &BI) : BI(BI) {
+  // Functions known to use sub-groups, directly or through their callees.
+  SmallPtrSet<Function *, 8> Seen;
+  // Functions whose builtin sets still need propagating to their callers.
+  SmallPriorityWorklist<Function *, 4> Pending;
+
+  // Step 1: record the sub-group builtins each defined function calls itself.
+  for (auto &F : M) {
+    if (F.isDeclaration()) {
+      continue;
+    }
+    auto Info = std::make_unique<SubgroupInfo>();
+
+    // Trust the 'mux-no-subgroups' attribute and skip scanning such functions.
+    // A pass that introduces sub-group uses is expected to remove the
+    // attribute itself.
+    if (!hasNoExplicitSubgroups(F)) {
+      for (auto &BB : F) {
+        for (const auto &I : BB) {
+          auto *const Call = dyn_cast<CallInst>(&I);
+          if (!Call) {
+            continue;
+          }
+          auto Found = isMuxSubgroupBuiltin(Call->getCalledFunction());
+          if (!Found) {
+            continue;
+          }
+          // Queue each function at most once
+          if (Seen.insert(&F).second) {
+            Pending.insert(&F);
+          }
+          // Remember that this function uses this builtin
+          Info->UsedSubgroupBuiltins.insert(Found->ID);
+        }
+      }
+    }
+    FunctionMap.insert({&F, std::move(Info)});
+  }
+
+  // Step 2: propagate each function's builtins to the functions calling it,
+  // until no caller's set changes any more.
+  while (!Pending.empty()) {
+    auto *const Callee = Pending.pop_back_val();
+    const auto &CalleeSet = FunctionMap[Callee]->UsedSubgroupBuiltins;
+    // Callers already merged for this callee; its set is fixed while we visit
+    // it, so one merge per caller suffices however many calls it makes.
+    SmallPtrSet<Function *, 4> Merged;
+    for (auto *const U : Callee->users()) {
+      auto *const Call = dyn_cast<CallInst>(U);
+      if (!Call) {
+        continue;
+      }
+      auto *const Caller = Call->getFunction();
+      // A caller seen for the first time must have its own users visited.
+      if (Seen.insert(Caller).second) {
+        Pending.insert(Caller);
+      }
+      if (!Merged.insert(Caller).second) {
+        continue;
+      }
+      // If the merge grew the caller's set, revisit the caller so the new
+      // builtins reach its own users too.
+      if (set_union(FunctionMap[Caller]->UsedSubgroupBuiltins, CalleeSet)) {
+        Pending.insert(Caller);
+      }
+    }
+  }
+}
+
+bool GlobalSubgroupInfo::usesSubgroups(const llvm::Function &F) const {
+  const auto Entry = FunctionMap.find(&F);
+  assert(Entry != FunctionMap.end() && "Missing entry for function");
+  return Entry->second->UsedSubgroupBuiltins.size() != 0;
+}
+
+std::optional<Builtin> GlobalSubgroupInfo::isMuxSubgroupBuiltin(
+    const Function *F) const {
+  // Without a function to analyze there is nothing to classify.
+  if (!F) {
+    return std::nullopt;
+  }
+  auto Analyzed = BI.analyzeBuiltin(*F);
+  switch (Analyzed.ID) {
+    case eMuxBuiltinSubGroupBarrier:
+    case eMuxBuiltinGetSubGroupSize:
+    case eMuxBuiltinGetMaxSubGroupSize:
+    case eMuxBuiltinGetNumSubGroups:
+    case eMuxBuiltinGetSubGroupId:
+    case eMuxBuiltinGetSubGroupLocalId:
+      return Analyzed;
+    default:
+      break;
+  }
+
+  // Any other builtin qualifies only as a sub-group scoped group collective.
+  auto Collective = BI.isMuxGroupCollective(Analyzed.ID);
+  if (!Collective || !Collective->isSubGroupScope()) {
+    return std::nullopt;
+  }
+  return Analyzed;
+}
+
+AnalysisKey SubgroupAnalysis::Key;
+// Builds the module's GlobalSubgroupInfo from the BuiltinInfoAnalysis result.
+SubgroupAnalysis::Result SubgroupAnalysis::run(Module &M,
+                                               ModuleAnalysisManager &AM) {
+  return GlobalSubgroupInfo(M, AM.getResult<BuiltinInfoAnalysis>(M));
+}
+
+PreservedAnalyses SubgroupAnalysisPrinterPass::run(Module &M,
+                                                   ModuleAnalysisManager &AM) {
+  const auto &SGInfo = AM.getResult<SubgroupAnalysis>(M);
+  for (auto &F : M) {
+    if (F.isDeclaration()) {
+      continue;
+    }
+    OS << "Function '" << F.getName() << "' uses";
+    if (!SGInfo.usesSubgroups(F)) {
+      OS << " no sub-group builtins\n";
+      continue;
+    }
+    auto *PerFunction = SGInfo[&F];
+    assert(PerFunction && "Missing function info");
+    const auto &Builtins = PerFunction->UsedSubgroupBuiltins;
+    const auto Count = Builtins.size();
+    // Note: this output isn't stable and shouldn't be relied upon. It's mostly
+    // for developer analysis. The builtin IDs are printed comma-separated.
+    OS << " " << Count << " sub-group builtin" << (Count == 1 ? "" : "s")
+       << ": ";
+    const char *Separator = "";
+    for (auto B : Builtins) {
+      OS << Separator << static_cast<unsigned>(B);
+      Separator = ",";
+    }
+    OS << "\n";
+  }
+
+  return PreservedAnalyses::all();
+}
+}  // namespace utils
+}  // namespace compiler
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/target_extension_types.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/target_extension_types.cpp
new file mode 100644
index 0000000000000..90dfbaad05744
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/target_extension_types.cpp
@@ -0,0 +1,162 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <compiler/utils/target_extension_types.h>
+#include <llvm/IR/DerivedTypes.h>
+#include <llvm/IR/Type.h>
+#include <multi_llvm/llvm_version.h>
+
+using namespace compiler::utils;
+using namespace llvm;
+
+namespace compiler {
+namespace utils {
+namespace tgtext {
+
+Type *getEventTy(LLVMContext &Ctx) {  // The "spirv.Event" target ext type.
+#if LLVM_VERSION_LESS(17, 0)
+  (void)Ctx;  // Parameter is unused on this path.
+  llvm_unreachable("Can't use target extension types before LLVM 17");
+#else
+  return TargetExtType::get(Ctx, "spirv.Event");
+#endif
+}
+
+Type *getSamplerTy(LLVMContext &Ctx) {  // The "spirv.Sampler" target ext type.
+#if LLVM_VERSION_LESS(17, 0)
+  (void)Ctx;  // Parameter is unused on this path.
+  llvm_unreachable("Can't use target extension types before LLVM 17");
+#else
+  return TargetExtType::get(Ctx, "spirv.Sampler");
+#endif
+}
+
+[[maybe_unused]] static Type *getImageTyHelper(  // Builds a "spirv.Image" type.
+    LLVMContext &Ctx, ImageTyDimensionalityParam Dim, ImageTyDepthParam Depth,
+    ImageTyArrayedParam Arrayed, ImageTyMSParam MS, ImageTySampledParam Sampled,
+    ImageTyAccessQualParam AccessQual) {
+#if LLVM_VERSION_LESS(17, 0)
+  (void)Ctx;  // All parameters are unused on this path.
+  (void)Dim;
+  (void)Depth;
+  (void)Arrayed;
+  (void)MS;
+  (void)Sampled;
+  (void)AccessQual;
+  llvm_unreachable("Can't use target extension types before LLVM 17");
+#else
+  unsigned IntParams[7];  // NOTE(review): assumes the *Idx values span 0..6.
+  IntParams[ImageTyDimensionalityIdx] = Dim;
+  IntParams[ImageTyDepthIdx] = Depth;
+  IntParams[ImageTyArrayedIdx] = Arrayed;
+  IntParams[ImageTyMSIdx] = MS;
+  IntParams[ImageTySampledIdx] = Sampled;
+  IntParams[ImageTyFormatIdx] = /*Unknown*/ 0;
+  IntParams[ImageTyAccessQualIdx] = AccessQual;
+  return TargetExtType::get(Ctx, "spirv.Image", Type::getVoidTy(Ctx),
+                            IntParams);  // void is the single type parameter.
+#endif
+}
+
+[[maybe_unused]] static Type *getOpenCLImageTyHelper(
+    LLVMContext &Ctx, ImageTyDimensionalityParam Dim,
+    ImageTyArrayedParam Arrayed, ImageTyDepthParam Depth, ImageTyMSParam MS,
+    ImageTyAccessQualParam AccessQual) {
+  // Note the order: Arrayed precedes Depth here, the reverse of the callee.
+  return getImageTyHelper(Ctx, Dim, Depth, Arrayed, MS, ImageSampledRuntime,
+                          AccessQual);  // Sampled is fixed to "runtime".
+}
+
+[[maybe_unused]] static Type *getOpenCLImageTyHelper(
+    LLVMContext &Ctx, ImageTyDimensionalityParam Dim,
+    ImageTyArrayedParam Arrayed, ImageTyAccessQualParam AccessQual) {
+  // Convenience overload: no depth, single-sampled.
+  return getOpenCLImageTyHelper(Ctx, Dim, Arrayed, ImageDepthNone,
+                                ImageMSSingleSampled, AccessQual);
+}
+
+Type *getImage1DTy(LLVMContext &Ctx, ImageTyAccessQualParam AccessQual) {
+#if LLVM_VERSION_LESS(17, 0)
+  (void)Ctx;  // Parameters are unused on this path.
+  (void)AccessQual;
+  llvm_unreachable("Can't use target extension types before LLVM 17");
+#else
+  // 1D, non-arrayed image.
+  return getOpenCLImageTyHelper(Ctx, ImageDim1D, ImageNonArrayed, AccessQual);
+#endif
+}
+
+Type *getImage1DArrayTy(LLVMContext &Ctx, ImageTyAccessQualParam AccessQual) {
+#if LLVM_VERSION_LESS(17, 0)
+  (void)Ctx;  // Parameters are unused on this path.
+  (void)AccessQual;
+  llvm_unreachable("Can't use target extension types before LLVM 17");
+#else
+  // 1D, arrayed image.
+  return getOpenCLImageTyHelper(Ctx, ImageDim1D, ImageArrayed, AccessQual);
+#endif
+}
+
+Type *getImage1DBufferTy(LLVMContext &Ctx, ImageTyAccessQualParam AccessQual) {
+#if LLVM_VERSION_LESS(17, 0)
+  (void)Ctx;  // Parameters are unused on this path.
+  (void)AccessQual;
+  llvm_unreachable("Can't use target extension types before LLVM 17");
+#else
+  // Buffer dimensionality, non-arrayed.
+  return getOpenCLImageTyHelper(Ctx, ImageDimBuffer, ImageNonArrayed,
+                                AccessQual);
+#endif
+}
+
+Type *getImage2DTy(LLVMContext &Ctx, bool Depth, bool MS,
+                   ImageTyAccessQualParam AccessQual) {
+#if LLVM_VERSION_LESS(17, 0)
+  (void)Ctx;  // Parameters are unused on this path.
+  (void)Depth;
+  (void)MS;
+  (void)AccessQual;
+  llvm_unreachable("Can't use target extension types before LLVM 17");
+#else
+  // 2D, non-arrayed; the two bools select the depth and multi-sample params.
+  return getOpenCLImageTyHelper(
+      Ctx, ImageDim2D, ImageNonArrayed, Depth ? ImageDepth : ImageDepthNone,
+      MS ? ImageMSMultiSampled : ImageMSSingleSampled, AccessQual);
+#endif
+}
+
+Type *getImage2DArrayTy(LLVMContext &Ctx, bool Depth, bool MS,
+                        ImageTyAccessQualParam AccessQual) {
+#if LLVM_VERSION_LESS(17, 0)
+  (void)Ctx;  // Parameters are unused on this path.
+  (void)Depth;
+  (void)MS;
+  (void)AccessQual;
+  llvm_unreachable("Can't use target extension types before LLVM 17");
+#else
+  // 2D, arrayed; the two bools select the depth and multi-sample params.
+  return getOpenCLImageTyHelper(
+      Ctx, ImageDim2D, ImageArrayed, Depth ? ImageDepth : ImageDepthNone,
+      MS ? ImageMSMultiSampled : ImageMSSingleSampled, AccessQual);
+#endif
+}
+
+Type *getImage3DTy(LLVMContext &Ctx, ImageTyAccessQualParam AccessQual) {
+#if LLVM_VERSION_LESS(17, 0)
+  (void)Ctx;  // Parameters are unused on this path.
+  (void)AccessQual;
+  llvm_unreachable("Can't use target extension types before LLVM 17");
+#else
+  // 3D, non-arrayed image.
+  return getOpenCLImageTyHelper(Ctx, ImageDim3D, ImageNonArrayed, AccessQual);
+#endif
+}
+
+}  // namespace tgtext
+}  // namespace utils
+}  // namespace compiler
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/unique_opaque_structs_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/unique_opaque_structs_pass.cpp
new file mode 100644
index 0000000000000..eff301b86f1f5
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/unique_opaque_structs_pass.cpp
@@ -0,0 +1,282 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Defines the UniqueOpaqueStructsPass.
+
+#include <compiler/utils/StructTypeRemapper.h>
+#include <compiler/utils/unique_opaque_structs_pass.h>
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/IR/Function.h>
+#include <llvm/IR/Instructions.h>
+#include <llvm/Support/Casting.h>
+#include <llvm/Support/raw_ostream.h>
+#include <llvm/Transforms/Utils/Cloning.h>
+#include <multi_llvm/multi_llvm.h>
+
+using namespace compiler::utils;
+using namespace llvm;
+
+/// @brief Indicates whether a function needs to be cloned.
+///
+/// There are a few ways the undesirable types can exist in a function:
+/// * As a return type.
+/// * As a parameter type.
+/// * As a call to a function returning undesirable type.
+/// * The result of an alloca.
+/// * Result of a cast of some type.
+/// * Reference to a global (not checked yet, see the TODO in the body).
+///
+/// @param[in] StructTypeRemapper Map from suffixed opaque structs to
+/// unsuffixed opaque structs.
+/// @param[in] Func Function to be checked for cloning.
+///
+/// @return Whether function should be cloned.
+/// @retval true if function should be cloned.
+/// @retval false otherwise.
+static bool shouldClone(compiler::utils::StructTypeRemapper &StructTypeRemapper,
+                        const Function &Func) {
+  // First check the return type.
+  if (StructTypeRemapper.isRemapped(Func.getReturnType())) {
+    return true;
+  }
+
+  // Then the arguments.
+  for (const Argument &Arg : Func.args()) {
+    if (StructTypeRemapper.isRemapped(Arg.getType())) {
+      return true;
+    }
+  }
+
+  // Now look for specific instructions that could introduce the type.
+  for (auto &BB : Func) {
+    for (auto &I : BB) {
+      // We can catch any instruction that produces an undesirable type by
+      // just checking its type.
+      if (StructTypeRemapper.isRemapped(I.getType())) {
+        return true;
+      }
+    }
+  }
+
+  // TODO: Check globals (see CA-3833).
+
+  // The original authors' reasoning: an instruction that uses a remapped
+  // type without being of that type (e.g. a cast) must use a value that
+  // produced the type, and that producer is caught by the loop above.
+  // NOTE(review): this does not cover values coming from globals (see TODO).
+
+  // If we've got here, we've checked all the cases, so no need to clone.
+  return false;
+}
+
+/// @brief Constructs a map of suffixed opaque structure types to their
+/// unsuffixed versions.
+///
+/// If a module references opaque structs whose names differ only by trailing
+/// ".<digits>" groups, e.g. opencl.event_t and opencl.event_t.0, this
+/// function returns a map from the suffixed versions to the unsuffixed
+/// versions, e.g. map[opencl.event_t.0] = opencl.event_t.
+///
+/// @param module Module referencing the types in the context.
+///
+/// @return The map of suffixed structures to the unsuffixed structures.
+static compiler::utils::StructMap uniqueOpaqueSuffixedStructs(
+    llvm::Module &module) {
+  StructMap map;
+  for (auto *structTy : module.getIdentifiedStructTypes()) {
+    if (!structTy->isOpaque()) {
+      continue;
+    }
+
+    // Strip complete ".<digits>" groups only: "foo.0.1" -> "foo", but a name
+    // merely ending in a digit (e.g. "struct.foo2") is left untouched.
+    auto baseName = structTy->getName();
+    for (auto head = baseName.rtrim("0123456789");
+         head.size() != baseName.size() && head.consume_back(".");
+         head = baseName.rtrim("0123456789")) {
+      baseName = head;
+    }
+    // Look for an opaque type in the context with the unsuffixed name.
+    if (auto *ctxStructTy = llvm::StructType::getTypeByName(
+            module.getContext(), baseName)) {
+      if (!ctxStructTy->isOpaque()) {
+        continue;
+      }
+      // Only map when the lookup found a different type than structTy itself.
+      if (ctxStructTy != structTy) {
+        map[structTy] = ctxStructTy;
+      }
+    }
+  }
+  return map;
+}
+
+/// @brief Populates list of functions that need to be cloned.
+///
+/// @param[in] Module module containing the functions to be inspected.
+/// @param[in] StructTypeRemapper Map from suffixed opaque structs to
+/// unsuffixed opaque structs.
+/// @param[out] WorkList vector that selected functions are appended to.
+static void populateWorkList(
+    Module &Module, compiler::utils::StructTypeRemapper &StructTypeRemapper,
+    SmallVectorImpl<Function *> &WorkList) {
+  for (auto &Function : Module) {
+    // We don't need to touch intrinsics.
+    if (Function.isIntrinsic()) {
+      continue;
+    }
+
+    // Queue the function if shouldClone() finds a remapped type in it.
+    if (shouldClone(StructTypeRemapper, Function)) {
+      WorkList.push_back(&Function);
+    }
+  }
+}
+
+static void removeOldFunctions(const SmallVectorImpl<Function *> &OldFuncs) {
+  // First delete the bodies of all the functions; erasing a function while
+  // another body still refers to it would leave uses missing their defs.
+  for (auto &OldFunc : OldFuncs) {
+    OldFunc->deleteBody();
+  }
+
+  // Now that no body remains, erase the functions from their module.
+  for (auto &OldFunc : OldFuncs) {
+    OldFunc->eraseFromParent();
+  }
+}
+
+/// @brief Clones a list of functions updating types within the function.
+///
+/// Clones a list of functions updating the types of any instances of the
+/// undesirable types according to the map that was passed to this pass. A new
+/// call graph is constructed and the old functions' names are taken by the
+/// new functions. The old functions are erased before returning.
+///
+/// @param[in] StructTypeRemapper Map from suffixed opaque structs to
+/// unsuffixed opaque structs.
+/// @param[in] OldFuncs list of functions to clone and update.
+static void replaceRemappedTypeRefs(
+    compiler::utils::StructTypeRemapper &StructTypeRemapper,
+    const SmallVectorImpl<Function *> &OldFuncs) {
+  // Maps the old functions to their new versions with updated types.
+  // Note: it is important we do this before cloning to catch the case that
+  // functions A and B both need updating, but function A calls function B and
+  // A is processed before B, otherwise function calls won't be updated during
+  // the clone.
+  SmallDenseMap<Function *, Function *> FFMap;
+  for (auto &OldFunc : OldFuncs) {
+    auto *OldFuncTy = OldFunc->getFunctionType();
+    // First map the return type.
+    auto *RetTy = StructTypeRemapper.remapType(OldFuncTy->getReturnType());
+
+    // Then map the parameter types.
+    SmallVector<Type *, 4> ParamTys;
+    for (auto ParamTy : OldFuncTy->params()) {
+      ParamTys.push_back(StructTypeRemapper.remapType(ParamTy));
+    }
+
+    // Create the new function with updated types (linkage + calling conv).
+    auto *NewFuncTy = FunctionType::get(RetTy, ParamTys, OldFuncTy->isVarArg());
+    auto *NewFunc = Function::Create(NewFuncTy, OldFunc->getLinkage(), "",
+                                     OldFunc->getParent());
+    NewFunc->setCallingConv(OldFunc->getCallingConv());
+
+    FFMap[OldFunc] = NewFunc;
+  }
+
+  // Here we actually do the cloning.
+  for (auto &OldFunc : OldFuncs) {
+    // We construct a new value map on each iteration to avoid entries in the
+    // value map potentially being overwritten during cloning which would then
+    // be used by subsequent loop iterations.
+    ValueToValueMapTy ValueMap;
+    for (auto &pair : FFMap) {
+      ValueMap[pair.getFirst()] = pair.getSecond();
+    }
+    auto *NewFunc = FFMap[OldFunc];
+    auto NewArgIterator = NewFunc->arg_begin();
+    for (llvm::Argument &Arg : OldFunc->args()) {
+      NewArgIterator->setName(Arg.getName());
+      ValueMap[&Arg] = &*(NewArgIterator++);
+    }
+    NewFunc->takeName(OldFunc);
+
+    if (OldFunc->isDeclaration()) {
+      // Everything that follows requires a body.
+      continue;
+    }
+
+    SmallVector<ReturnInst *, 4> Returns;
+    CloneFunctionInto(NewFunc, OldFunc, ValueMap,
+                      CloneFunctionChangeType::GlobalChanges, Returns, "",
+                      /* CodeInfo */ nullptr, &StructTypeRemapper);
+    Returns.clear();
+
+    // It's possible we still have references to the old types in our
+    // new function, this can happen via allocas and casts as well as
+    // references to global variables.
+    for (auto &BB : *NewFunc) {
+      for (auto &I : BB) {
+        // Anything that defines an undesirable instance will get caught
+        // here.
+        I.mutateType(StructTypeRemapper.remapType(I.getType()));
+
+        // GEP instructions need to be handled separately.
+        if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
+          if (StructTypeRemapper.isRemapped(GEP->getSourceElementType())) {
+            GEP->setSourceElementType(
+                StructTypeRemapper.remapType(GEP->getSourceElementType()));
+          }
+        }
+      }
+    }
+  }
+
+  // We can now remove the old functions that used the misnamed types; only
+  // functions are erased here, not the types themselves.
+  removeOldFunctions(OldFuncs);
+}
+
+namespace compiler {
+namespace utils {
+PreservedAnalyses UniqueOpaqueStructsPass::run(Module &Module,
+                                               ModuleAnalysisManager &) {
+  // Find the opaque types in the module that have suffixes and map them to
+  // their unsuffixed versions.
+  auto StructMap = uniqueOpaqueSuffixedStructs(Module);
+  StructTypeRemapper StructTypeRemapper(StructMap);
+
+  // Build the list of functions we need to process.
+  SmallVector<Function *, 8> WorkList;
+  populateWorkList(Module, StructTypeRemapper, WorkList);
+
+  // If the list is empty nothing was changed, so all analyses are preserved.
+  if (WorkList.empty()) {
+    return PreservedAnalyses::all();
+  }
+
+  // Otherwise, clone the functions, updating the types.
+  replaceRemappedTypeRefs(StructTypeRemapper, WorkList);
+
+  // Functions were replaced by this point, so the module has been modified
+  // and no analysis is reported as preserved.
+  return PreservedAnalyses::none();
+}
+}  // namespace utils
+}  // namespace compiler
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
new file mode 100644
index 0000000000000..b1c6a5b896a0a
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
@@ -0,0 +1,1980 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <compiler/utils/attributes.h>
+#include <compiler/utils/barrier_regions.h>
+#include <compiler/utils/builtin_info.h>
+#include <compiler/utils/group_collective_helpers.h>
+#include <compiler/utils/metadata.h>
+#include <compiler/utils/pass_functions.h>
+#include <compiler/utils/sub_group_analysis.h>
+#include <compiler/utils/vectorization_factor.h>
+#include <compiler/utils/work_item_loops_pass.h>
+#include <llvm/IR/DIBuilder.h>
+#include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/Module.h>
+#include <llvm/Transforms/Utils/Local.h>
+#include <multi_llvm/multi_llvm.h>
+
+#include <algorithm>
+#include <functional>
+#include <memory>
+#include <optional>
+
+using namespace llvm;
+
+#define NDEBUG_WI_LOOPS
+#define DEBUG_TYPE "work-item-loops"
+
+namespace compiler {
+namespace utils {
+
+/// @brief A subclass of the generic Barrier which is used by the
+/// WorkItemLoopsPass.
+///
+/// It adds additional fields used when creating wrapper kernels.
+class BarrierWithLiveVars : public Barrier {
+ public:
+  BarrierWithLiveVars(llvm::Module &m, llvm::Function &f,
+                      VectorizationInfo vf_info, bool IsDebug)
+      : Barrier(m, f, IsDebug), vf_info(vf_info) {}
+  // Plain accessors; every pointer field below is null until its setter runs.
+  VectorizationInfo getVFInfo() const { return vf_info; }
+
+  AllocaInst *getMemSpace() const { return mem_space; }
+  void setMemSpace(AllocaInst *ai) { mem_space = ai; }
+
+  void setSize0(Value *v) { size0 = v; }
+  Value *getSize0() const { return size0; }
+
+  void setTotalSize(Value *v) { totalSize = v; }
+  Value *getTotalSize() const { return totalSize; }
+
+  Value *getStructSize() const { return structSize; }
+  void setStructSize(Value *v) { structSize = v; }
+
+  AllocaInst *getDebugAddr() const { return debug_addr; }
+  void setDebugAddr(AllocaInst *ai) { debug_addr = ai; }
+
+ private:
+  VectorizationInfo vf_info;  // Copied in at construction; never modified here.
+
+  // Alloca representing the memory for the live variables for a given kernel,
+  // with enough space for each individual work-item in a work-group to have
+  // its own view.
+  //
+  // This is typically used to hold Z*Y*(X/vec_width) individual instances of
+  // the live-variables structure.
+  AllocaInst *mem_space = nullptr;
+
+  // Alloca holding the address of the live vars struct for the
+  // currently executing work item.
+  AllocaInst *debug_addr = nullptr;
+
+  // The number of items along the primary dimension
+  Value *size0 = nullptr;
+
+  // The total number of items
+  Value *totalSize = nullptr;
+
+  /// @brief The size of the struct in bytes, if the barrier contains
+  /// scalables; null otherwise.
+  Value *structSize = nullptr;
+};
+
+}  // namespace utils
+}  // namespace compiler
+
+namespace {
+#ifndef NDEBUG_WI_LOOPS
+/// @brief Generate an IR-level printf call. Debug helper only.
+///
+/// @param[in] format Format string.
+/// @param[in] module Current module.
+/// @param[in] v Value for printing.
+/// @param[in] bb Basic block the call is appended to.
+///
+/// @return The created call instruction.
+Instruction *IRPrintf(const std::string format, Module &module, Value *v,
+                      BasicBlock *bb) {
+  LLVMContext &context = module.getContext();
+  PointerType *ptr_type = PointerType::getUnqual(IntegerType::get(context, 8));
+
+  SmallVector<Type *, 16> args;
+  args.push_back(ptr_type);
+  FunctionType *printf_type =
+      FunctionType::get(IntegerType::get(context, 32), args, true);
+
+  bool isDeclared = true;
+  Function *func_printf = module.getFunction("printf");
+  if (!func_printf) {
+    func_printf = Function::Create(printf_type, GlobalValue::ExternalLinkage,
+                                   "printf", &module);
+    isDeclared = false;
+  }
+  // NOTE(review): the string's address space (2 vs 0) follows isDeclared.
+  ArrayType *array_type =
+      ArrayType::get(IntegerType::get(context, 8), format.size() + 1);
+  GlobalVariable *str;
+  if (isDeclared) {
+    str = new GlobalVariable(
+        module, array_type, true, GlobalValue::PrivateLinkage, nullptr, ".str",
+        nullptr, GlobalValue::ThreadLocalMode::NotThreadLocal, 2, false);
+  } else {
+    str = new GlobalVariable(
+        module, array_type, true, GlobalValue::PrivateLinkage, nullptr, ".str",
+        nullptr, GlobalValue::ThreadLocalMode::NotThreadLocal, 0, false);
+  }
+  str->setAlignment(MaybeAlign(1));
+
+  Constant *const_array = ConstantDataArray::getString(context, format, true);
+  SmallVector<Constant *, 16> indices;
+  ConstantInt *zero = ConstantInt::get(context, APInt(64, StringRef("0"), 10));
+  indices.push_back(zero);
+  indices.push_back(zero);
+  // The GEP source element type must be given explicitly; null is invalid.
+  Constant *cst_ptr = ConstantExpr::getGetElementPtr(array_type, str, indices);
+  str->setInitializer(const_array);
+
+  SmallVector<Value *, 8> call_params;
+  call_params.push_back(cst_ptr);
+  call_params.push_back(v);
+
+  CallInst *call = CallInst::Create(func_printf, call_params, "", bb);
+
+  return call;
+}
+#endif  // NDEBUG_WI_LOOPS
+
+Value *materializeVF(IRBuilder<> &builder,  // Emits vf as a size-typed value.
+                     compiler::utils::VectorizationFactor vf) {
+  auto &parentModule = *builder.GetInsertBlock()->getModule();
+  auto *const sizeTy = compiler::utils::getSizeType(parentModule);
+  Constant *const knownMin = ConstantInt::get(sizeTy, vf.getKnownMin());
+  return vf.isScalable() ? builder.CreateVScale(knownMin) : knownMin;
+}
+
+struct ScheduleGenerator {
+  ScheduleGenerator(Module &m,  // barrierTail is a pointer and may be null.
+                    const compiler::utils::BarrierWithLiveVars &barrierMain,
+                    const compiler::utils::BarrierWithLiveVars *barrierTail,
+                    compiler::utils::BuiltinInfo &BI)
+      : module(m),
+        context(m.getContext()),
+        barrierMain(barrierMain),
+        barrierTail(barrierTail),
+        BI(BI),
+        i32Ty(Type::getInt32Ty(context)) {
+    set_local_id =  // Resolved once here; called from the loop bodies.
+        BI.getOrDeclareMuxBuiltin(compiler::utils::eMuxBuiltinSetLocalId, m);
+    set_subgroup_id =
+        BI.getOrDeclareMuxBuiltin(compiler::utils::eMuxBuiltinSetSubGroupId, m);
+    assert(set_local_id && set_subgroup_id && "Missing mux builtins");
+  }
+  Module &module;
+  LLVMContext &context;
+  const compiler::utils::BarrierWithLiveVars &barrierMain;
+  const compiler::utils::BarrierWithLiveVars *barrierTail;
+  compiler::utils::BuiltinInfo &BI;
+
+  SmallVector<Value *, 8> args;
+  Function *set_local_id = nullptr;
+  Function *set_subgroup_id = nullptr;
+  Type *const i32Ty;
+
+  uint32_t workItemDim0 = 0;
+  uint32_t workItemDim1 = 1;
+  uint32_t workItemDim2 = 2;
+  Value *localSizeDim[3];
+
+  AllocaInst *nextID = nullptr;
+  Value *mainLoopLimit = nullptr;
+  Value *peel = nullptr;
+  bool emitTail = true;
+  bool isVectorPredicated = false;
+  bool wrapperHasMain = false;
+  bool wrapperHasTail = false;
+
+  DILocation *wrapperDbgLoc = nullptr;
+
+  Value *createLinearLiveVarsPtr(
+      const compiler::utils::BarrierWithLiveVars &barrier, IRBuilder<> &ir,
+      Value *index) {
+    Value *const mem_space = barrier.getMemSpace();
+    if (!mem_space) {
+      return nullptr;
+    }
+
+    // Compute the address of the live-variables struct for the work item at
+    // the already-linearized position `index` within mem_space. Returns null
+    // (above) when the barrier has no live-variable storage.
+    // Without a struct size, index directly in units of the live-vars type;
+    // otherwise scale to a byte offset first.
+
+    Value *live_var_ptr;
+    if (!barrier.getStructSize()) {
+      Value *const live_var_mem_idxs[] = {index};
+      live_var_ptr = ir.CreateInBoundsGEP(barrier.getLiveVarsType(), mem_space,
+                                          live_var_mem_idxs);
+    } else {
+      // index into the byte buffer: offset = index * struct size in bytes
+      auto *const byteOffset = ir.CreateMul(index, barrier.getStructSize());
+      Value *const live_var_mem_idxs[] = {byteOffset};
+      live_var_ptr =
+          ir.CreateInBoundsGEP(ir.getInt8Ty(), mem_space, live_var_mem_idxs);
+
+      // cast to the live mem type, keeping the pointer's address space
+      live_var_ptr = ir.CreatePointerCast(
+          live_var_ptr,
+          PointerType::get(
+              barrier.getLiveVarsType(),
+              cast<PointerType>(live_var_ptr->getType())->getAddressSpace()));
+    }
+
+    return live_var_ptr;
+  }
+
+  Value *createLiveVarsPtr(const compiler::utils::BarrierWithLiveVars &barrier,
+                           IRBuilder<> &ir, Value *dim_0, Value *dim_1,
+                           Value *dim_2, Value *VF = nullptr) {
+    Value *const mem_space = barrier.getMemSpace();
+    if (!mem_space) {
+      return nullptr;
+    }
+
+    // Calculate the offset for where the live variables of the current
+    // work item (within the nested loops) are stored:
+    //   ((dim_2 * localSize[dim1]) + dim_1) * size0 + k
+    // where k is dim_0, divided by VF when a vectorization factor is given,
+    // then defer to createLinearLiveVarsPtr for the address computation.
+    auto *const i_offset = ir.CreateMul(dim_2, localSizeDim[workItemDim1]);
+    auto *const j_offset =
+        ir.CreateMul(ir.CreateAdd(i_offset, dim_1), barrier.getSize0());
+    auto *const k_offset = VF ? ir.CreateUDiv(dim_0, VF) : dim_0;
+    auto *const offset = ir.CreateAdd(j_offset, k_offset);
+
+    return createLinearLiveVarsPtr(barrier, ir, offset);
+  }
+
+  void recreateDebugIntrinsics(
+      const compiler::utils::BarrierWithLiveVars &barrier, BasicBlock *block,
+      StoreInst *SI) {
+    DIBuilder DIB(module, /*AllowUnresolved*/ false);
+    auto RecreateDebugIntrinsic = [&](DILocalVariable *const old_var,
+                                      const unsigned live_var_offset) {
+      const uint64_t dwPlusOp = dwarf::DW_OP_plus_uconst;
+      // Use a DWARF expression to point to byte offset in struct where
+      // the variable lives. This involves dereferencing the pointer
+      // stored in `live_vars_debug_addr` to get the start of the live
+      // vars struct, then using a byte offset into the struct for the
+      // particular variable.
+      auto expr = DIB.createExpression(
+          ArrayRef<uint64_t>{dwarf::DW_OP_deref, dwPlusOp, live_var_offset});
+      // Remap this debug variable to its new scope (block's parent function).
+      auto *new_var = DIB.createAutoVariable(
+          block->getParent()->getSubprogram(), old_var->getName(),
+          old_var->getFile(), old_var->getLine(), old_var->getType(),
+          /*AlwaysPreserve=*/false, DINode::FlagZero,
+          old_var->getAlignInBits());
+      // Create the declare, as an intrinsic or as a DbgVariableRecord.
+#if LLVM_VERSION_GREATER_EQUAL(19, 0)
+      if (!module.IsNewDbgInfoFormat) {
+        auto *const DII = DIB.insertDeclare(barrier.getDebugAddr(), new_var,
+                                            expr, wrapperDbgLoc, block)
+                              .get<Instruction *>();
+
+        // Bit of a HACK to produce the same debug output as the Mem2Reg
+        // pass used to do.
+        auto *const DVIntrinsic = cast<DbgVariableIntrinsic>(DII);
+        ConvertDebugDeclareToDebugValue(DVIntrinsic, SI, DIB);
+      } else {
+        auto *const DVR = static_cast<DbgVariableRecord *>(
+            DIB.insertDeclare(barrier.getDebugAddr(), new_var, expr,
+                              wrapperDbgLoc, block)
+                .get<DbgRecord *>());
+
+        // This is nasty, but LLVM errors out on trailing debug info, we need a
+        // subsequent instruction even if we delete it immediately afterwards.
+        auto *DummyInst = new UnreachableInst(module.getContext(), block);
+
+        // Bit of a HACK to produce the same debug output as the Mem2Reg
+        // pass used to do.
+        ConvertDebugDeclareToDebugValue(DVR, SI, DIB);
+
+        DummyInst->eraseFromParent();
+      }
+#else
+      auto *const DII = DIB.insertDeclare(barrier.getDebugAddr(), new_var, expr,
+                                          wrapperDbgLoc, block);
+
+      // Bit of a HACK to produce the same debug output as the Mem2Reg
+      // pass used to do.
+      auto *const DVIntrinsic = cast<DbgVariableIntrinsic>(DII);
+      ConvertDebugDeclareToDebugValue(DVIntrinsic, SI, DIB);
+#endif
+    };
+    for (auto debug_pair : barrier.getDebugIntrinsics()) {  // (debug use, offset)
+      RecreateDebugIntrinsic(debug_pair.first->getVariable(),
+                             debug_pair.second);
+    }
+#if LLVM_VERSION_GREATER_EQUAL(19, 0)
+    for (auto debug_pair : barrier.getDebugDbgVariableRecords()) {
+      RecreateDebugIntrinsic(debug_pair.first->getVariable(),
+                             debug_pair.second);
+    }
+#endif
+  }
+
+  void createWorkItemLoopBody(
+      const compiler::utils::BarrierWithLiveVars &barrier, IRBuilder<> &ir,
+      BasicBlock *block, unsigned i, Value *dim_0, Value *dim_1, Value *dim_2,
+      Value *accumulator = nullptr, Value *VF = nullptr,
+      Value *offset = nullptr) {
+    auto new_kernel_args = args;  // Copy: extra arguments are appended below.
+    if (accumulator) {
+      new_kernel_args.push_back(accumulator);
+    }
+
+    // If the work item ID is a nullptr we take it to mean this barrier region
+    // doesn't need to use the barrier struct.
+    if (dim_0) {
+      assert(dim_1 && dim_2 && "unexpected null Work item IDs");
+
+      // set our local id (shifted by `offset` when one is given)
+      auto *const local_id = offset ? ir.CreateAdd(offset, dim_0) : dim_0;
+      ir.CreateCall(set_local_id,
+                    {ConstantInt::get(i32Ty, workItemDim0), local_id})
+          ->setCallingConv(set_local_id->getCallingConv());
+
+      auto *const live_var_ptr =
+          createLiveVarsPtr(barrier, ir, dim_0, dim_1, dim_2, VF);
+      if (live_var_ptr) {
+        new_kernel_args.push_back(live_var_ptr);
+
+        if (auto *debug_addr = barrier.getDebugAddr()) {
+          // Update the alloca holding the address of the live vars struct for
+          // currently executing work item.
+          auto *const live_var_ptr_cast =
+              ir.CreatePointerBitCastOrAddrSpaceCast(
+                  live_var_ptr, debug_addr->getAllocatedType());
+          auto *const SI = ir.CreateStore(live_var_ptr_cast, debug_addr);
+
+          // Recreate all the debug intrinsics pointing at location in live
+          // variables struct. We only need to do this once before the first
+          // barrier.
+          if (i == compiler::utils::kBarrier_FirstID) {
+            recreateDebugIntrinsics(barrier, block, SI);
+          }
+        }
+      }
+    }
+
+    auto &subkernel = *barrier.getSubkernel(i);
+
+    // call the original function now we've setup all the info!
+    CallInst *ci = ir.CreateCall(&subkernel, new_kernel_args);
+    // add a debug location for this call so that later inlining correctly
+    // updates the debug metadata of all inlined instructions.
+    if (wrapperDbgLoc) {
+      ci->setDebugLoc(wrapperDbgLoc);
+    }
+    ci->setCallingConv(subkernel.getCallingConv());
+    ci->setAttributes(compiler::utils::getCopiedFunctionAttrs(subkernel));
+
+#ifndef NDEBUG_WI_LOOPS
+    IRPrintf(std::string("return.kernel.body=%d\x0A"), module, ci, block);
+#endif  // NDEBUG_WI_LOOPS
+
+    // Store the call's result into nextID, but only when this barrier has
+    // more than one successor (otherwise there is nothing to choose between).
+    const auto &successors = barrier.getSuccessorIds(i);
+    if (successors.size() > 1) {
+      ir.CreateStore(ci, nextID);
+    }
+  }
+
+  // Emit a single linear loop over every work item recorded in 'barrier',
+  // folding each item's reloaded 'op' into a running accumulator using the
+  // recurrence kind of 'WGC'. Returns the block in which emission continues
+  // together with the final reduced value.
+  std::pair<BasicBlock *, Value *> makeReductionLoop(
+      const compiler::utils::BarrierWithLiveVars &barrier,
+      const compiler::utils::GroupCollective &WGC, BasicBlock *block, Value *op,
+      Value *accumulator) {
+    Function *const fn = block->getParent();
+    auto *const accumTy = accumulator->getType();
+
+    // The loop counts from zero up to the barrier's total size.
+    auto *const itemCount = barrier.getTotalSize();
+    auto *const sizeZero =
+        Constant::getNullValue(compiler::utils::getSizeType(module));
+
+    // The accumulator is carried around the loop as its only extra IV.
+    compiler::utils::CreateLoopOpts reduceOpts;
+    reduceOpts.IVs = {accumulator};
+    reduceOpts.disableVectorize = true;
+
+    BasicBlock *loopEntryBB = block;
+    BasicBlock *loopExitBB = nullptr;
+    PHINode *reducedPhi = nullptr;
+
+    auto *const constCount = dyn_cast<Constant>(itemCount);
+    if (constCount && constCount->isZeroValue()) {
+      // Statically empty: the incoming accumulator is already the result.
+      return {block, accumulator};
+    }
+
+    if (!constCount) {
+      // The count is only known at runtime, so guard the loop with a
+      // zero-trip check and merge both paths in a dedicated exit block.
+      loopEntryBB =
+          BasicBlock::Create(context, "ca_work_group_reduce_preheader", fn);
+
+      loopExitBB =
+          BasicBlock::Create(context, "ca_work_group_reduce_exit", fn);
+      loopEntryBB->moveAfter(block);
+      loopExitBB->moveAfter(loopEntryBB);
+
+      auto *const hasItems = CmpInst::Create(
+          Instruction::ICmp, CmpInst::ICMP_NE, sizeZero, itemCount, "", block);
+
+      BranchInst::Create(loopEntryBB, loopExitBB, hasItems, block);
+
+      // The bypass edge contributes the untouched incoming accumulator.
+      reducedPhi = PHINode::Create(accumTy, 2, "WGC_reduce", loopExitBB);
+      reducedPhi->addIncoming(accumulator, block);
+    }
+
+    // Remembers the block the loop body was emitted into, so the result phi
+    // can name it as a predecessor.
+    BasicBlock *lastBodyBB = nullptr;
+
+    // Per-iteration body: reload this item's operand from its live-variables
+    // slot and combine it with the running value.
+    const auto emitBody = [&](BasicBlock *bodyBB, Value *index,
+                              ArrayRef<Value *> ivs,
+                              MutableArrayRef<Value *> ivsNext)
+        -> BasicBlock * {
+      IRBuilder<> ir(bodyBB);
+      auto *const itemVars = createLinearLiveVarsPtr(barrier, ir, index);
+      compiler::utils::Barrier::LiveValuesHelper reloader(barrier, bodyBB,
+                                                          itemVars);
+
+      IRBuilder<> reloadIR(bodyBB);
+      auto *const itemValue =
+          reloader.getReload(op, reloadIR, "_load", /*reuse*/ true);
+
+      // Do the reduction here..
+      accumulator = compiler::utils::createBinOpForRecurKind(
+          ir, ivs[0], itemValue, WGC.Recurrence);
+      ivsNext[0] = accumulator;
+      lastBodyBB = bodyBB;
+
+      return bodyBB;
+    };
+
+    // linearly looping through the work items
+    loopExitBB = compiler::utils::createLoop(loopEntryBB, loopExitBB, sizeZero,
+                                             itemCount, reduceOpts, emitBody);
+
+    if (!reducedPhi) {
+      // No bypass path was built, so the loop body is the only predecessor.
+      assert(loopExitBB != lastBodyBB && "createLoop didn't create a loop");
+      reducedPhi = PHINode::Create(accumTy, 1, "WGC_reduce", loopExitBB);
+    }
+    reducedPhi->addIncoming(accumulator, lastBodyBB);
+    return {loopExitBB, reducedPhi};
+  }
+
+  // Replace every entry of 'values' in place with a reload of that value,
+  // emitted at the end of 'block', taken from the live-variables slot at
+  // index zero of the barrier's memory space.
+  void getUniformValues(BasicBlock *block,
+                        const compiler::utils::BarrierWithLiveVars &barrier,
+                        MutableArrayRef<Value *> values) {
+    IRBuilder<> builder(block);
+    auto *const sizeZero =
+        Constant::getNullValue(compiler::utils::getSizeType(module));
+
+    // Address of the first live-variables struct.
+    auto *const firstItemPtr = builder.CreateInBoundsGEP(
+        barrier.getLiveVarsType(), barrier.getMemSpace(), {sizeZero});
+    compiler::utils::Barrier::LiveValuesHelper reloader(barrier, block,
+                                                        firstItemPtr);
+
+    for (size_t idx = 0, count = values.size(); idx != count; ++idx) {
+      values[idx] =
+          reloader.getReload(values[idx], builder, "_load", /*reuse*/ true);
+    }
+  }
+
+  // Classify the barrier call with ID 'BarrierID' in 'Barrier' as a mux group
+  // collective.
+  //
+  // Returns std::nullopt if the barrier has no call for that ID or if the
+  // call has no statically known callee; otherwise returns whatever the
+  // builtin info reports for the callee's builtin ID.
+  std::optional<compiler::utils::GroupCollective> getBarrierGroupCollective(
+      const compiler::utils::BarrierWithLiveVars &Barrier, unsigned BarrierID) {
+    auto *const BarrierCall = Barrier.getBarrierCall(BarrierID);
+    if (!BarrierCall) {
+      return std::nullopt;
+    }
+
+    // getCalledFunction() is null for indirect calls; such a call cannot be
+    // analyzed as a builtin, so don't dereference it.
+    auto *const Callee = BarrierCall->getCalledFunction();
+    if (!Callee) {
+      return std::nullopt;
+    }
+
+    auto Builtin = BI.analyzeBuiltin(*Callee);
+    return BI.isMuxGroupCollective(Builtin.ID);
+  }
+
+  // Emit whatever code must run *before* the work-item loops of barrier
+  // 'barrierID' when that barrier is a work-group collective operation.
+  //
+  // Returns a tuple of:
+  //  * the block in which subsequent code should be emitted,
+  //  * the value produced for the collective (or its initial accumulator), or
+  //    nullptr if nothing was produced,
+  //  * the collective's description, or std::nullopt if the barrier is not a
+  //    work-group-scope collective handled here.
+  //
+  // Per operation kind:
+  //  * Reduction/All/Any: reduction loops are emitted over the main barrier
+  //    and then, if present, the tail barrier; the final accumulator is
+  //    returned.
+  //  * ScanInclusive/ScanExclusive: no code is emitted; the identity value is
+  //    returned as the scan's starting accumulator.
+  //  * Broadcast: the broadcast value is loaded from the live-variables struct
+  //    of the work item selected by the call's ID operands.
+  std::tuple<BasicBlock *, Value *,
+             std::optional<compiler::utils::GroupCollective>>
+  makeWorkGroupCollectiveLoops(BasicBlock *block, unsigned barrierID) {
+    auto *const groupCall = barrierMain.getBarrierCall(barrierID);
+    if (!groupCall) {
+      return {block, nullptr, std::nullopt};
+    }
+
+    auto Info = getBarrierGroupCollective(barrierMain, barrierID);
+    if (!Info || !Info->isWorkGroupScope()) {
+      return {block, nullptr, std::nullopt};
+    }
+
+    switch (Info->Op) {
+      case compiler::utils::GroupCollective::OpKind::Reduction:
+      case compiler::utils::GroupCollective::OpKind::All:
+      case compiler::utils::GroupCollective::OpKind::Any: {
+        auto *const ty = groupCall->getType();
+        auto *const accumulator =
+            compiler::utils::getNeutralVal(Info->Recurrence, ty);
+        // Reduce over the main items first, then carry the running value
+        // through the tail items.
+        auto [loop_exit_block, accum] = makeReductionLoop(
+            barrierMain, *Info, block, groupCall->getOperand(1), accumulator);
+        if (barrierTail) {
+          auto *const groupTailInst = barrierTail->getBarrierCall(barrierID);
+          assert(groupTailInst &&
+                 "No corresponding work group reduction in tail kernel");
+          std::tie(loop_exit_block, accum) =
+              makeReductionLoop(*barrierTail, *Info, loop_exit_block,
+                                groupTailInst->getOperand(1), accum);
+        }
+        if (groupCall->hasName()) {
+          accum->takeName(groupCall);
+        }
+        return std::make_tuple(loop_exit_block, accum, Info);
+      }
+      case compiler::utils::GroupCollective::OpKind::ScanInclusive:
+      case compiler::utils::GroupCollective::OpKind::ScanExclusive: {
+        // Scans are computed inside the work-item loops themselves; only the
+        // starting value is provided here.
+        auto *const ty = groupCall->getType();
+        auto *const accumulator =
+            compiler::utils::getIdentityVal(Info->Recurrence, ty);
+        return {block, accumulator, Info};
+      }
+      case compiler::utils::GroupCollective::OpKind::Broadcast: {
+        // First we need to get the item ID values from the barrier struct.
+        // These should be uniform but they may still be variables. It should
+        // be safe to get them from the barrier struct at index zero.
+        auto *const zero =
+            Constant::getNullValue(compiler::utils::getSizeType(module));
+
+        Function *const func = block->getParent();
+        BasicBlock *mainUniformBlock = block;
+        BasicBlock *tailUniformBlock = nullptr;
+
+        auto *const totalSize = barrierMain.getTotalSize();
+        if (auto *const loopLimitConst = dyn_cast<Constant>(totalSize)) {
+          // If we know for a fact that the main struct has at least one item,
+          // we can just use that. Otherwise, we need to use the tail struct.
+          if (loopLimitConst->isZeroValue()) {
+            mainUniformBlock = nullptr;
+            if (barrierTail) {
+              tailUniformBlock = block;
+            }
+          }
+        } else if (barrierTail) {
+          // If we have a variable number of main items, it could be zero at
+          // runtime, so we need an alternative way to get the values.
+          mainUniformBlock =
+              BasicBlock::Create(context, "ca_main_uniform_load", func);
+          tailUniformBlock =
+              BasicBlock::Create(context, "ca_tail_uniform_load", func);
+
+          auto *const needTail = CmpInst::Create(
+              Instruction::ICmp, CmpInst::ICMP_EQ, totalSize, zero, "", block);
+          BranchInst::Create(tailUniformBlock, mainUniformBlock, needTail,
+                             block);
+        }
+
+        // Neither struct is guaranteed to hold an item: nothing to load from.
+        if (!mainUniformBlock && !tailUniformBlock) {
+          return {block, nullptr, std::nullopt};
+        }
+
+        Value *idsMain[] = {zero, zero, zero};
+        Value *idsTail[] = {zero, zero, zero};
+        if (mainUniformBlock) {
+          idsMain[0] = groupCall->getOperand(2);
+          idsMain[1] = groupCall->getOperand(3);
+          idsMain[2] = groupCall->getOperand(4);
+          getUniformValues(mainUniformBlock, barrierMain, idsMain);
+        }
+
+        if (tailUniformBlock) {
+          auto *const tailGroupCall = barrierTail->getBarrierCall(barrierID);
+          assert(tailGroupCall &&
+                 "No corresponding work group broadcast in tail kernel");
+          idsTail[0] = tailGroupCall->getOperand(2);
+          idsTail[1] = tailGroupCall->getOperand(3);
+          idsTail[2] = tailGroupCall->getOperand(4);
+          getUniformValues(tailUniformBlock, *barrierTail, idsTail);
+        }
+
+        // If both barrier structs had to be used, we need to merge the result.
+        if (mainUniformBlock && tailUniformBlock) {
+          block = BasicBlock::Create(context, "ca_merge_uniform_load", func);
+          BranchInst::Create(block, tailUniformBlock);
+          BranchInst::Create(block, mainUniformBlock);
+
+          for (size_t i = 0; i != 3; ++i) {
+            auto *mergePhi = PHINode::Create(idsMain[i]->getType(), 2,
+                                             "uniform_merge", block);
+            mergePhi->addIncoming(idsMain[i], mainUniformBlock);
+            mergePhi->addIncoming(idsTail[i], tailUniformBlock);
+            idsMain[i] = mergePhi;
+          }
+        } else if (tailUniformBlock) {
+          // Only the tail struct was read (the main struct is statically
+          // empty). Everything below addresses the source item through
+          // idsMain, so it must carry the IDs loaded from the tail struct
+          // rather than the zero placeholders.
+          for (size_t i = 0; i != 3; ++i) {
+            idsMain[i] = idsTail[i];
+          }
+        }
+
+        IRBuilder<> ir(block);
+        auto *const op = groupCall->getOperand(1);
+
+        // Compute the address of the value in the main barrier struct
+        auto *const VF = materializeVF(ir, barrierMain.getVFInfo().vf);
+        auto *const liveVars = createLiveVarsPtr(barrierMain, ir, idsMain[0],
+                                                 idsMain[1], idsMain[2], VF);
+        compiler::utils::Barrier::LiveValuesHelper live_values(barrierMain,
+                                                               block, liveVars);
+        auto *const GEPmain = live_values.getGEP(op);
+        assert(GEPmain && "Could not get broadcasted value");
+
+        if (barrierTail) {
+          const bool VP = barrierTail->getVFInfo().IsVectorPredicated;
+
+          // Compute the address of the value in the tail barrier struct
+          auto *const offsetDim0 = ir.CreateSub(idsMain[0], mainLoopLimit);
+          auto *const liveVarsTail =
+              createLiveVarsPtr(*barrierTail, ir, offsetDim0, idsMain[1],
+                                idsMain[2], VP ? VF : nullptr);
+          compiler::utils::Barrier::LiveValuesHelper live_values(
+              *barrierTail, block, liveVarsTail);
+
+          auto *const opTail =
+              barrierTail->getBarrierCall(barrierID)->getOperand(1);
+          auto *const GEPtail = live_values.getGEP(opTail);
+          assert(GEPtail && "Could not get tail-broadcasted value");
+
+          // Select the main GEP or the tail GEP to load from
+          auto *const cond = ir.CreateICmpUGE(idsMain[0], mainLoopLimit);
+
+          auto *const select = ir.CreateSelect(cond, GEPtail, GEPmain);
+
+          auto *const result = ir.CreateLoad(op->getType(), select);
+          result->takeName(groupCall);
+
+          return {block, result, Info};
+        } else {
+          auto *const result = ir.CreateLoad(op->getType(), GEPmain);
+          result->takeName(groupCall);
+          return {block, result, Info};
+        }
+      }
+      default:
+        break;
+    }
+    return {block, nullptr, std::nullopt};
+  }
+
+  // Create loops to execute all the main work items, and then all the
+  // left-over tail work items at the end.
+  //
+  // Any work-group collective attached to barrier 'barrierID' is emitted
+  // first (scans are not supported on this path). Each of the main and tail
+  // phases is a three-deep loop nest (dimension 2 outermost, dimension 0
+  // innermost) that sets the local IDs and, when a set_subgroup_id function is
+  // available, the subgroup ID, before emitting the work-item body. A subgroup
+  // counter is threaded through all loop levels and from the main nest into
+  // the tail nest.
+  //
+  // Returns the block in which code following the loops should be emitted.
+  BasicBlock *makeWorkItemLoops(BasicBlock *block, unsigned barrierID) {
+    Value *accum = nullptr;
+    std::optional<compiler::utils::GroupCollective> collective;
+    std::tie(block, accum, collective) =
+        makeWorkGroupCollectiveLoops(block, barrierID);
+
+    // Work-group scans should be using linear work-item loops.
+    assert((!collective || !collective->isScan()) && "No support for scans");
+
+    auto *const zero =
+        Constant::getNullValue(compiler::utils::getSizeType(module));
+    auto *const i32Zero = Constant::getNullValue(i32Ty);
+    auto *const func = block->getParent();
+
+    // The subgroup induction variable, set to the value of the subgroup ID at
+    // the end of the last loop (i.e. beginning of the next loop)
+    Value *nextSubgroupIV = i32Zero;
+
+    // looping through num groups in the first (innermost)
+    // dimension
+    BasicBlock *mainPreheaderBB = block;
+    BasicBlock *mainExitBB = nullptr;
+
+    // We need to ensure any subgroup IV is defined on the path in which
+    // the vector loop is skipped.
+    PHINode *subgroupMergePhi = nullptr;
+
+    // If we are emitting a tail, we might need to bypass the vector loop (if
+    // the local size is less than the vector width).
+    if (emitTail) {
+      if (auto *const loopLimitConst = dyn_cast<Constant>(mainLoopLimit)) {
+        if (loopLimitConst->isZeroValue()) {
+          // No vector iterations at all!
+          mainPreheaderBB = nullptr;
+          mainExitBB = block;
+        }
+      } else {
+        mainPreheaderBB = BasicBlock::Create(
+            context, "ca_work_item_x_vector_preheader", func);
+
+        mainExitBB =
+            BasicBlock::Create(context, "ca_work_item_x_vector_exit", func);
+        mainPreheaderBB->moveAfter(block);
+        mainExitBB->moveAfter(mainPreheaderBB);
+
+        // On the bypass edge no subgroup has been executed yet.
+        subgroupMergePhi = PHINode::Create(i32Ty, 2, "", mainExitBB);
+        subgroupMergePhi->addIncoming(i32Zero, block);
+
+        auto *const needMain =
+            CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_NE, zero,
+                            mainLoopLimit, "", block);
+
+        BranchInst::Create(mainPreheaderBB, mainExitBB, needMain, block);
+      }
+    }
+
+    assert((mainPreheaderBB || !wrapperHasMain) &&
+           "Vector loops in one barrier block but not another?");
+
+    if (mainPreheaderBB) {
+      wrapperHasMain = true;
+      // Subgroup induction variables
+      compiler::utils::CreateLoopOpts outer_opts;
+      outer_opts.IVs = {i32Zero};
+
+      // looping through num groups in the third (outermost) dimension
+      mainExitBB = compiler::utils::createLoop(
+          mainPreheaderBB, mainExitBB, zero, localSizeDim[workItemDim2],
+          outer_opts,
+          [&](BasicBlock *block, Value *dim_2, ArrayRef<Value *> ivs2,
+              MutableArrayRef<Value *> ivsNext2) -> BasicBlock * {
+            // if we need to set the local id, do so here.
+            IRBuilder<> ir(block);
+            ir.CreateCall(set_local_id,
+                          {ConstantInt::get(i32Ty, workItemDim2), dim_2})
+                ->setCallingConv(set_local_id->getCallingConv());
+
+            compiler::utils::CreateLoopOpts middle_opts;
+            middle_opts.IVs = ivs2.vec();
+
+            // looping through num groups in the second dimension
+            BasicBlock *exit1 = compiler::utils::createLoop(
+                block, nullptr, zero, localSizeDim[workItemDim1], middle_opts,
+                [&](BasicBlock *block, Value *dim_1, ArrayRef<Value *> ivs1,
+                    MutableArrayRef<Value *> ivsNext1) -> BasicBlock * {
+                  IRBuilder<> ir(block);
+                  ir.CreateCall(set_local_id,
+                                {ConstantInt::get(i32Ty, workItemDim1), dim_1})
+                      ->setCallingConv(set_local_id->getCallingConv());
+
+                  // Materialize the scale factor at the beginning of the
+                  // preheader
+                  IRBuilder<> irph(mainPreheaderBB,
+                                   mainPreheaderBB->getFirstInsertionPt());
+                  auto *VF = materializeVF(irph, barrierMain.getVFInfo().vf);
+
+                  compiler::utils::CreateLoopOpts inner_opts;
+                  inner_opts.indexInc = VF;
+                  inner_opts.IVs = ivs1.vec();
+
+                  BasicBlock *exit0 = compiler::utils::createLoop(
+                      block, nullptr, zero, mainLoopLimit, inner_opts,
+                      [&](BasicBlock *block, Value *dim_0,
+                          ArrayRef<Value *> ivs0,
+                          MutableArrayRef<Value *> ivsNext0) -> BasicBlock * {
+                        IRBuilder<> ir(block);
+
+                        // The subgroup ID setter is optional; guard it the
+                        // same way the tail loop below does.
+                        if (set_subgroup_id) {
+                          // set our subgroup id
+                          ir.CreateCall(set_subgroup_id, {ivs0[0]})
+                              ->setCallingConv(
+                                  set_subgroup_id->getCallingConv());
+                        }
+
+                        createWorkItemLoopBody(barrierMain, ir, block,
+                                               barrierID, dim_0, dim_1, dim_2,
+                                               accum, VF);
+
+                        nextSubgroupIV =
+                            ir.CreateAdd(ivs0[0], ConstantInt::get(i32Ty, 1));
+                        ivsNext0[0] = nextSubgroupIV;
+
+                        return block;
+                      });
+
+                  // Don't forget to update the subgroup IV phi.
+                  ivsNext1[0] = nextSubgroupIV;
+
+                  return exit0;
+                });
+
+            // Don't forget to update the subgroup IV phi.
+            ivsNext2[0] = nextSubgroupIV;
+
+            if (subgroupMergePhi) {
+              subgroupMergePhi->addIncoming(nextSubgroupIV, exit1);
+            }
+
+            return exit1;
+          });
+    }
+
+    // looping through num groups in the first
+    // (innermost) dimension
+    BasicBlock *tailPreheaderBB = mainExitBB;
+    BasicBlock *tailExitBB = nullptr;
+
+    if (emitTail && peel) {
+      // We might need to bypass the tail loop.
+      if (auto *const peelConst = dyn_cast<Constant>(peel)) {
+        if (peelConst->isZeroValue()) {
+          // No tail iterations at all!
+          tailPreheaderBB = nullptr;
+          tailExitBB = mainExitBB;
+        }
+      } else {
+        tailPreheaderBB = BasicBlock::Create(
+            context, "ca_work_item_x_scalar_preheader", func);
+
+        tailExitBB =
+            BasicBlock::Create(context, "ca_work_item_x_scalar_exit", func);
+        tailPreheaderBB->moveAfter(mainExitBB);
+        tailExitBB->moveAfter(tailPreheaderBB);
+
+        auto *const needPeeling = CmpInst::Create(
+            Instruction::ICmp, CmpInst::ICMP_NE, zero, peel, "", mainExitBB);
+
+        BranchInst::Create(tailPreheaderBB, tailExitBB, needPeeling,
+                           mainExitBB);
+      }
+    } else {
+      tailPreheaderBB = nullptr;
+      tailExitBB = mainExitBB;
+    }
+
+    assert((tailPreheaderBB || !wrapperHasTail) &&
+           "Tail loops in one barrier block but not another?");
+
+    if (tailPreheaderBB) {
+      assert(barrierTail);
+      wrapperHasTail = true;
+      // Subgroup induction variables: continue counting from wherever the
+      // main loops (or their bypass) left off.
+      compiler::utils::CreateLoopOpts outer_opts;
+      outer_opts.IVs = {subgroupMergePhi ? subgroupMergePhi : nextSubgroupIV};
+
+      // looping through num groups in the third (outermost) dimension
+      tailExitBB = compiler::utils::createLoop(
+          tailPreheaderBB, tailExitBB, zero, localSizeDim[workItemDim2],
+          outer_opts,
+          [&](BasicBlock *block, Value *dim_2, ArrayRef<Value *> ivs2,
+              MutableArrayRef<Value *> ivsNext2) -> BasicBlock * {
+            // set the local id
+            IRBuilder<> ir(block);
+            ir.CreateCall(set_local_id,
+                          {ConstantInt::get(i32Ty, workItemDim2), dim_2})
+                ->setCallingConv(set_local_id->getCallingConv());
+
+            compiler::utils::CreateLoopOpts middle_opts;
+            middle_opts.IVs = ivs2.vec();
+
+            // looping through num groups in the second dimension
+            BasicBlock *exit1 = compiler::utils::createLoop(
+                block, nullptr, zero, localSizeDim[workItemDim1], middle_opts,
+                [&](BasicBlock *block, Value *dim_1, ArrayRef<Value *> ivs1,
+                    MutableArrayRef<Value *> ivsNext1) -> BasicBlock * {
+                  IRBuilder<> ir(block);
+                  ir.CreateCall(set_local_id,
+                                {ConstantInt::get(i32Ty, workItemDim1), dim_1})
+                      ->setCallingConv(set_local_id->getCallingConv());
+
+                  compiler::utils::CreateLoopOpts inner_opts;
+                  inner_opts.IVs = ivs1.vec();
+                  inner_opts.disableVectorize = true;
+
+                  BasicBlock *exit0 = compiler::utils::createLoop(
+                      block, nullptr, zero, peel, inner_opts,
+                      [&](BasicBlock *block, Value *dim_0,
+                          ArrayRef<Value *> ivs0,
+                          MutableArrayRef<Value *> ivsNext0) -> BasicBlock * {
+                        IRBuilder<> ir(block);
+
+                        if (set_subgroup_id) {
+                          // set our subgroup id
+                          ir.CreateCall(set_subgroup_id, {ivs0[0]})
+                              ->setCallingConv(
+                                  set_subgroup_id->getCallingConv());
+                        }
+
+                        // The tail items' dimension-0 IDs are offset by the
+                        // main loop limit.
+                        createWorkItemLoopBody(
+                            *barrierTail, ir, block, barrierID, dim_0, dim_1,
+                            dim_2, accum, /*VF*/ nullptr, mainLoopLimit);
+
+                        nextSubgroupIV =
+                            ir.CreateAdd(ivs0[0], ConstantInt::get(i32Ty, 1));
+                        ivsNext0[0] = nextSubgroupIV;
+
+                        return block;
+                      });
+
+                  // Don't forget to update the subgroup IV phi.
+                  ivsNext1[0] = nextSubgroupIV;
+
+                  return exit0;
+                });
+
+            // Don't forget to update the subgroup IV phi.
+            ivsNext2[0] = nextSubgroupIV;
+
+            return exit1;
+          });
+    }
+    return tailExitBB;
+  }
+
+  // Create loops to execute all work items in local linear ID order.
+  BasicBlock *makeLinearWorkItemLoops(BasicBlock *block, unsigned barrierID) {
+    Value *accum = nullptr;
+    std::optional<compiler::utils::GroupCollective> collective;
+    std::tie(block, accum, collective) =
+        makeWorkGroupCollectiveLoops(block, barrierID);
+
+    bool isScan = collective && collective->isScan();
+    bool isExclusiveScan =
+        isScan && collective->Op ==
+                      compiler::utils::GroupCollective::OpKind::ScanExclusive;
+    // The scan types can differ between 'main' and 'tail' kernels.
+    bool isTailExclusiveScan = false;
+    if (isScan && barrierTail) {
+      const auto tailInfo = getBarrierGroupCollective(*barrierTail, barrierID);
+      assert(tailInfo && "No corresponding work group scan in tail kernel");
+      isTailExclusiveScan =
+          tailInfo->Op ==
+          compiler::utils::GroupCollective::OpKind::ScanExclusive;
+    }
+
+    auto *const zero =
+        Constant::getNullValue(compiler::utils::getSizeType(module));
+    auto *const i32Zero = Constant::getNullValue(i32Ty);
+    auto *const func = block->getParent();
+
+    // The subgroup induction variable, set to the value of the subgroup ID at
+    // the end of the last loop (i.e. beginning of the next loop)
+    Value *nextSubgroupIV = i32Zero;
+
+    // The work-group scan induction variable, set to the current scan value at
+    // the end of the last loop (i.e. beginning of the next loop)
+    Value *nextScanIV = accum;
+
+    // We need to ensure any subgroup IV is defined on the path in which
+    // the vector loop is skipped.
+    PHINode *subgroupMergePhi = nullptr;
+    // Same with the scan IV
+    PHINode *scanMergePhi = nullptr;
+
+    compiler::utils::CreateLoopOpts outer_opts;
+    outer_opts.IVs.push_back(i32Zero);
+    outer_opts.loopIVNames.push_back("sg.z");
+    if (isScan) {
+      outer_opts.IVs.push_back(nextScanIV);
+      outer_opts.loopIVNames.push_back("scan.z");
+    }
+
+    // looping through num groups in the third (outermost) dimension
+    return compiler::utils::createLoop(
+        block, nullptr, zero, localSizeDim[workItemDim2], outer_opts,
+        [&](BasicBlock *block, Value *dim_2, ArrayRef<Value *> ivs2,
+            MutableArrayRef<Value *> ivsNext2) -> BasicBlock * {
+          // set the local id
+          IRBuilder<> ir(block);
+          ir.CreateCall(set_local_id,
+                        {ConstantInt::get(i32Ty, workItemDim2), dim_2})
+              ->setCallingConv(set_local_id->getCallingConv());
+
+          compiler::utils::CreateLoopOpts middle_opts;
+          middle_opts.IVs = ivs2.vec();
+          middle_opts.loopIVNames.push_back("sg.y");
+          if (isScan) {
+            middle_opts.loopIVNames.push_back("scan.y");
+          }
+
+          // looping through num groups in the second dimension
+          BasicBlock *exit1 = compiler::utils::createLoop(
+              block, nullptr, zero, localSizeDim[workItemDim1], middle_opts,
+              [&](BasicBlock *block, Value *dim_1, ArrayRef<Value *> ivs1,
+                  MutableArrayRef<Value *> ivsNext1) -> BasicBlock * {
+                IRBuilder<> ir(block);
+                ir.CreateCall(set_local_id,
+                              {ConstantInt::get(i32Ty, workItemDim1), dim_1})
+                    ->setCallingConv(set_local_id->getCallingConv());
+
+                // looping through num groups in the first (innermost)
+                // dimension
+                BasicBlock *mainPreheaderBB = block;
+                BasicBlock *mainExitBB = nullptr;
+
+                // If we are emitting a tail, we might need to bypass the
+                // main loop (if the local size is less than the main loop
+                // width).
+                if (emitTail) {
+                  if (auto *const loopLimitConst =
+                          dyn_cast<Constant>(mainLoopLimit)) {
+                    if (loopLimitConst->isZeroValue()) {
+                      // No main iterations at all!
+                      mainPreheaderBB = nullptr;
+                      mainExitBB = block;
+                    }
+                  } else {
+                    mainPreheaderBB = BasicBlock::Create(
+                        context, "ca_work_item_x_main_preheader", func);
+
+                    mainExitBB = BasicBlock::Create(
+                        context, "ca_work_item_x_main_exit", func);
+                    mainPreheaderBB->moveAfter(block);
+                    mainExitBB->moveAfter(mainPreheaderBB);
+
+                    subgroupMergePhi =
+                        PHINode::Create(i32Ty, 2, "sg.merge", mainExitBB);
+                    subgroupMergePhi->addIncoming(ivs1[0], block);
+
+                    if (isScan) {
+                      scanMergePhi = PHINode::Create(accum->getType(), 2,
+                                                     "scan.merge", mainExitBB);
+                      scanMergePhi->addIncoming(ivs1[1], block);
+                    }
+
+                    auto *const needMain =
+                        CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_NE,
+                                        zero, mainLoopLimit, "", block);
+
+                    BranchInst::Create(mainPreheaderBB, mainExitBB, needMain,
+                                       block);
+                  }
+                }
+
+                assert((mainPreheaderBB || !wrapperHasMain) &&
+                       "Main loops in one barrier block but not another?");
+
+                if (mainPreheaderBB) {
+                  wrapperHasMain = true;
+                  BasicBlock *mainLoopBB = nullptr;
+
+                  // Materialize the scale factor at the beginning of the
+                  // preheader
+                  IRBuilder<> irph(mainPreheaderBB,
+                                   mainPreheaderBB->getFirstInsertionPt());
+                  auto *VF = materializeVF(irph, barrierMain.getVFInfo().vf);
+
+                  compiler::utils::CreateLoopOpts inner_vf_opts;
+                  inner_vf_opts.indexInc = VF;
+                  inner_vf_opts.IVs = ivs1.vec();
+                  inner_vf_opts.loopIVNames.push_back("sg.x.main");
+                  if (isScan) {
+                    inner_vf_opts.loopIVNames.push_back("scan.y.main");
+                  }
+
+                  mainExitBB = compiler::utils::createLoop(
+                      mainPreheaderBB, mainExitBB, zero, mainLoopLimit,
+                      inner_vf_opts,
+                      [&](BasicBlock *block, Value *dim_0,
+                          ArrayRef<Value *> ivs0,
+                          MutableArrayRef<Value *> ivsNext0) -> BasicBlock * {
+                        IRBuilder<> ir(block);
+
+                        if (set_subgroup_id) {
+                          // set our subgroup id
+                          ir.CreateCall(set_subgroup_id, {ivs0[0]})
+                              ->setCallingConv(
+                                  set_subgroup_id->getCallingConv());
+                        }
+
+                        if (isScan) {
+                          auto *const barrierCall =
+                              barrierMain.getBarrierCall(barrierID);
+                          auto *const liveVars = createLiveVarsPtr(
+                              barrierMain, ir, dim_0, dim_1, dim_2, VF);
+                          compiler::utils::Barrier::LiveValuesHelper
+                              live_values(barrierMain, block, liveVars);
+                          auto *const itemOp = live_values.getReload(
+                              barrierCall->getOperand(1), ir, "_load",
+                              /*reuse*/ true);
+                          nextScanIV = compiler::utils::createBinOpForRecurKind(
+                              ir, ivs0[1], itemOp, collective->Recurrence);
+                          accum = isExclusiveScan ? ivs0[1] : nextScanIV;
+                          ivsNext0[1] = nextScanIV;
+                        }
+
+                        createWorkItemLoopBody(barrierMain, ir, block,
+                                               barrierID, dim_0, dim_1, dim_2,
+                                               accum, VF);
+
+                        nextSubgroupIV =
+                            ir.CreateAdd(ivs0[0], ConstantInt::get(i32Ty, 1),
+                                         "sg.x.main.inc");
+                        ivsNext0[0] = nextSubgroupIV;
+
+                        // Move the exit after the loop block, as it reads more
+                        // logically.
+                        mainLoopBB = block;
+                        if (mainExitBB) {
+                          mainExitBB->moveAfter(mainLoopBB);
+                        }
+
+                        return block;
+                      });
+
+                  if (subgroupMergePhi) {
+                    subgroupMergePhi->addIncoming(nextSubgroupIV, mainLoopBB);
+                  }
+
+                  if (scanMergePhi) {
+                    scanMergePhi->addIncoming(nextScanIV, mainLoopBB);
+                  }
+                }
+                assert(mainExitBB && "didn't create a loop exit block!");
+
+                // looping through num groups in the first
+                // (innermost) dimension
+                BasicBlock *tailPreheaderBB = mainExitBB;
+                BasicBlock *tailExitBB = nullptr;
+
+                if (emitTail && peel) {
+                  // We might need to bypass the tail loop.
+                  if (auto *const peelConst = dyn_cast<Constant>(peel)) {
+                    if (peelConst->isZeroValue()) {
+                      // No tail iterations at all!
+                      tailPreheaderBB = nullptr;
+                      tailExitBB = mainExitBB;
+                    }
+                  } else {
+                    tailPreheaderBB = BasicBlock::Create(
+                        context, "ca_work_item_x_tail_preheader", func);
+
+                    tailExitBB = BasicBlock::Create(
+                        context, "ca_work_item_x_tail_exit", func);
+                    tailPreheaderBB->moveAfter(mainExitBB);
+                    tailExitBB->moveAfter(tailPreheaderBB);
+
+                    auto *const needPeeling =
+                        CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_NE,
+                                        zero, peel, "", mainExitBB);
+
+                    BranchInst::Create(tailPreheaderBB, tailExitBB, needPeeling,
+                                       mainExitBB);
+                  }
+                } else {
+                  tailPreheaderBB = nullptr;
+                  tailExitBB = mainExitBB;
+                }
+
+                assert((tailPreheaderBB || !wrapperHasTail) &&
+                       "Tail loops in one barrier block but not another?");
+
+                if (tailPreheaderBB) {
+                  assert(barrierTail);
+                  wrapperHasTail = true;
+                  // Subgroup induction variables
+                  SmallVector<Value *, 2> subgroupIVs0 = {
+                      subgroupMergePhi ? subgroupMergePhi : nextSubgroupIV};
+                  if (isScan) {
+                    subgroupIVs0.push_back(scanMergePhi ? scanMergePhi
+                                                        : nextScanIV);
+                  }
+
+                  BasicBlock *tailLoopBB = nullptr;
+                  if (barrierTail->getVFInfo().IsVectorPredicated) {
+                    IRBuilder<> ir(tailPreheaderBB);
+                    if (set_subgroup_id) {
+                      // set our subgroup id
+                      ir.CreateCall(set_subgroup_id, {subgroupIVs0[0]})
+                          ->setCallingConv(set_subgroup_id->getCallingConv());
+                    }
+
+                    if (isScan) {
+                      assert(barrierTail);
+                      auto *const barrierCall =
+                          barrierTail->getBarrierCall(barrierID);
+                      auto *const liveVars = createLiveVarsPtr(
+                          *barrierTail, ir, zero, dim_1, dim_2, nullptr);
+                      compiler::utils::Barrier::LiveValuesHelper live_values(
+                          *barrierTail, tailPreheaderBB, liveVars);
+                      auto *const itemOp = live_values.getReload(
+                          barrierCall->getOperand(1), ir, "_load",
+                          /*reuse*/ true);
+                      nextScanIV = compiler::utils::createBinOpForRecurKind(
+                          ir, subgroupIVs0[1], itemOp, collective->Recurrence);
+                      accum =
+                          isTailExclusiveScan ? subgroupIVs0[1] : nextScanIV;
+                    }
+
+                    createWorkItemLoopBody(*barrierTail, ir, tailPreheaderBB,
+                                           barrierID, zero, dim_1, dim_2, accum,
+                                           /*VF*/ nullptr, mainLoopLimit);
+
+                    nextSubgroupIV = ir.CreateAdd(subgroupIVs0[0],
+                                                  ConstantInt::get(i32Ty, 1),
+                                                  "sg.x.tail.inc");
+                    assert(tailExitBB);
+                    ir.CreateBr(tailExitBB);
+                    tailLoopBB = tailPreheaderBB;
+                  } else {
+                    compiler::utils::CreateLoopOpts inner_scalar_opts;
+                    inner_scalar_opts.disableVectorize = true;
+                    inner_scalar_opts.IVs.assign(subgroupIVs0.begin(),
+                                                 subgroupIVs0.end());
+                    inner_scalar_opts.loopIVNames.push_back("sg.x.tail");
+                    if (isScan) {
+                      inner_scalar_opts.loopIVNames.push_back("scan.x.tail");
+                    }
+
+                    tailExitBB = compiler::utils::createLoop(
+                        tailPreheaderBB, tailExitBB, zero, peel,
+                        inner_scalar_opts,
+                        [&](BasicBlock *block, Value *dim_0,
+                            ArrayRef<Value *> ivs0,
+                            MutableArrayRef<Value *> ivsNext0) -> BasicBlock * {
+                          IRBuilder<> ir(block);
+
+                          if (set_subgroup_id) {
+                            // set our subgroup id
+                            ir.CreateCall(set_subgroup_id, {ivs0[0]})
+                                ->setCallingConv(
+                                    set_subgroup_id->getCallingConv());
+                          }
+
+                          if (isScan) {
+                            assert(barrierTail);
+                            auto *const barrierCall =
+                                barrierTail->getBarrierCall(barrierID);
+                            auto *const liveVars = createLiveVarsPtr(
+                                *barrierTail, ir, dim_0, dim_1, dim_2, nullptr);
+                            compiler::utils::Barrier::LiveValuesHelper
+                                live_values(*barrierTail, block, liveVars);
+                            auto *const itemOp = live_values.getReload(
+                                barrierCall->getOperand(1), ir, "_load",
+                                /*reuse*/ true);
+                            nextScanIV =
+                                compiler::utils::createBinOpForRecurKind(
+                                    ir, ivs0[1], itemOp,
+                                    collective->Recurrence);
+                            accum = isTailExclusiveScan ? ivs0[1] : nextScanIV;
+                            ivsNext0[1] = nextScanIV;
+                          }
+
+                          createWorkItemLoopBody(
+                              *barrierTail, ir, block, barrierID, dim_0, dim_1,
+                              dim_2, accum, /*VF*/ nullptr, mainLoopLimit);
+
+                          nextSubgroupIV =
+                              ir.CreateAdd(ivs0[0], ConstantInt::get(i32Ty, 1),
+                                           "sg.x.tail.inc");
+                          ivsNext0[0] = nextSubgroupIV;
+
+                          tailLoopBB = block;
+                          // Move the exit after the loop block, as it reads
+                          // more logically.
+                          if (tailExitBB) {
+                            tailExitBB->moveAfter(tailLoopBB);
+                          }
+
+                          return block;
+                        });
+                  }
+
+                  // Merge the main and tail subgroup IVs together in the
+                  // tail exit, since we may have skipped either main or
+                  // tail loops.
+                  if (subgroupMergePhi) {
+                    auto *scalarSubgroupIV = nextSubgroupIV;
+                    nextSubgroupIV = PHINode::Create(
+                        i32Ty, 2, "sg.main.tail.merge", tailExitBB);
+                    cast<PHINode>(nextSubgroupIV)
+                        ->addIncoming(scalarSubgroupIV, tailLoopBB);
+                    cast<PHINode>(nextSubgroupIV)
+                        ->addIncoming(subgroupMergePhi, mainExitBB);
+                  }
+
+                  if (scanMergePhi) {
+                    auto *scalarScanIV = nextScanIV;
+                    nextScanIV =
+                        PHINode::Create(accum->getType(), 2,
+                                        "scan.main.tail.merge", tailExitBB);
+                    cast<PHINode>(nextScanIV)
+                        ->addIncoming(scalarScanIV, tailLoopBB);
+                    cast<PHINode>(nextScanIV)
+                        ->addIncoming(scanMergePhi, mainExitBB);
+                  }
+                }
+                // Don't forget to update the subgroup IV phi.
+                ivsNext1[0] = nextSubgroupIV;
+                if (isScan) {
+                  // ... or the scan IV phi.
+                  ivsNext1[1] = nextScanIV;
+                }
+                return tailExitBB;
+              });
+
+          // Don't forget to update the subgroup IV phi.
+          ivsNext2[0] = nextSubgroupIV;
+          if (isScan) {
+            // ... or the scan IV phi.
+            ivsNext2[1] = nextScanIV;
+          }
+          return exit1;
+        });
+  }
+
+  // Emits the body of barrier region `barrierID` exactly once into `block`,
+  // with no work-item loops around it, and returns `block` so the caller can
+  // append the branch to the next region. It executes only the first work
+  // item in the work group.
+  BasicBlock *makeRunOneWorkItem(BasicBlock *block, unsigned barrierID) {
+    // Use the tail kernel's barrier when there is one, else the main one.
+    auto &onceBarrier = barrierTail ? *barrierTail : barrierMain;
+    IRBuilder<> builder(block);
+    // "Once" scheduled barriers shouldn't need the local id set, so neither
+    // dimension indices nor an accumulator are passed down.
+    createWorkItemLoopBody(onceBarrier, builder, block, barrierID,
+                           /*dim_0*/ nullptr, /*dim_1*/ nullptr,
+                           /*dim_2*/ nullptr, /*accum*/ nullptr);
+    return block;
+  }
+};
+
+// Emits the stack allocation backing a barrier's live-variables structure and
+// records it (plus the sizes involved) on `barrier`.
+//
+// Room is reserved for sizeZ * sizeY * sizeX work-items. Note that Z/Y/X here
+// name the current outermost to innermost vectorized dimensions, rather than
+// the dimensions in their absolutist sense.
+//
+// When the live-variables struct has no scalable part, an array of
+// sizeZ * sizeY * sizeX structs is allocated directly. Otherwise the size of
+// one struct is only known at runtime (fixed bytes + vscale * scalable bytes),
+// so a raw i8 buffer of the total byte size is allocated instead and the
+// runtime struct size is stored on the barrier.
+//
+// When `isDebug` is set, an additional slot of the allocation's pointer type
+// is created and registered as the barrier's debug address.
+void setUpLiveVarsAlloca(compiler::utils::BarrierWithLiveVars &barrier,
+                         IRBuilder<> &B, Value *const sizeZ, Value *const sizeY,
+                         Value *const sizeX, StringRef name, bool isDebug) {
+  barrier.setSize0(sizeX);
+  Value *const sizeZY = B.CreateMul(sizeY, sizeZ);
+  Value *const numItems = B.CreateMul(sizeX, sizeZY);
+  barrier.setTotalSize(numItems);
+
+  auto &module = *B.GetInsertBlock()->getModule();
+  auto *const sizeTy = compiler::utils::getSizeType(module);
+  const auto scalableBytes = barrier.getLiveVarMemSizeScalable();
+
+  AllocaInst *storage = nullptr;
+  // Only set when the struct has a scalable part.
+  Value *runtimeStructSize = nullptr;
+  if (scalableBytes != 0) {
+    const auto fixedBytes = barrier.getLiveVarMemSizeFixed();
+    // We ensure that the VFs are the same between the main and tail.
+    auto *const scaledBytes =
+        B.CreateVScale(ConstantInt::get(sizeTy, scalableBytes));
+    runtimeStructSize =
+        B.CreateAdd(scaledBytes, ConstantInt::get(sizeTy, fixedBytes));
+    auto *const totalBytes = B.CreateMul(runtimeStructSize, numItems);
+
+    storage = B.CreateAlloca(B.getInt8Ty(), totalBytes, name);
+  } else {
+    storage = B.CreateAlloca(barrier.getLiveVarsType(), numItems, name);
+  }
+
+  // Common to both layouts: align to the strictest live variable (at least
+  // 1) and publish the storage on the barrier.
+  storage->setAlignment(
+      MaybeAlign(barrier.getLiveVarMaxAlignment()).valueOrOne());
+  barrier.setMemSpace(storage);
+  if (runtimeStructSize) {
+    barrier.setStructSize(runtimeStructSize);
+  }
+
+  if (isDebug) {
+    auto *const debugSlot =
+        B.CreateAlloca(storage->getType(), nullptr, "live_vars_peel_dbg");
+    barrier.setDebugAddr(debugSlot);
+  }
+}
+
+}  // namespace
+
+// Builds and returns the wrapper kernel which executes every barrier region
+// of a kernel across the whole work-group.
+//
+// The wrapper is laid out as:
+//  * an entry block which materializes the vectorization factor and the local
+//    sizes, publishes the sub-group count and the maximum sub-group size, and
+//    allocates any live-variable storage;
+//  * one "sw.bb" block per barrier region, each expanded according to that
+//    region's schedule (work-item loops, linear work-item loops, or a single
+//    invocation) and terminated by a branch - or a switch on the stored
+//    next-barrier id - to the successor region(s);
+//  * a final "kernel.exit" block which returns.
+//
+// Parameters:
+//  barrierMain - barrier information for the 'main' kernel.
+//  barrierTail - barrier information for the 'tail' kernel which handles the
+//                work-items left over by the main loop, or null when no tail
+//                is to be emitted.
+//  baseName    - base kernel name; the wrapper is named
+//                "<baseName>.mux-barrier-wrapper".
+//  M           - the module being transformed.
+//  BI          - used to declare and analyze the mux builtins used here.
+//
+// As a side effect the wrapped main (and tail) functions are given internal
+// linkage, and ConstantExpr/ConstantArray users of the reference function
+// are remapped to the new wrapper.
+Function *compiler::utils::WorkItemLoopsPass::makeWrapperFunction(
+    BarrierWithLiveVars &barrierMain, BarrierWithLiveVars *barrierTail,
+    StringRef baseName, Module &M, compiler::utils::BuiltinInfo &BI) {
+  Function &mainF = barrierMain.getFunc();
+
+  // The reference function is that which we expect to hold the reference
+  // version of various pieces of data, such as metadata. It's the tail
+  // function if one exists, else it's the main function.
+  Function &refF = barrierTail ? barrierTail->getFunc() : barrierMain.getFunc();
+
+  const bool emitTail = barrierTail != nullptr;
+
+  auto mainInfo = barrierMain.getVFInfo();
+  auto tailInfo =
+      emitTail ? barrierTail->getVFInfo() : std::optional<VectorizationInfo>();
+
+  // Indices into localSizeDim for the three work-item dimensions. Dimension 0
+  // is the one whose local size is divided by the vectorization factor.
+  const auto workItemDim0 = 0;
+  const auto workItemDim1 = 1;
+  const auto workItemDim2 = 2;
+
+  LLVMContext &context = M.getContext();
+
+  Function *new_wrapper =
+      createKernelWrapperFunction(mainF, ".mux-barrier-wrapper");
+
+  new_wrapper->setName(baseName + ".mux-barrier-wrapper");
+  // Ensure the base name is recorded
+  setBaseFnName(*new_wrapper, baseName);
+
+  // An inlinable function call in a function with debug info *must* be given
+  // a debug location. Use an artificial line-0 location scoped to the
+  // wrapper's subprogram, if it has one.
+  DILocation *wrapperDbgLoc = nullptr;
+  if (auto *const SP = new_wrapper->getSubprogram()) {
+    wrapperDbgLoc = DILocation::get(context, /*line*/ 0, /*col*/ 0, SP);
+  }
+
+  IRBuilder<> entryIR(BasicBlock::Create(context, "entry", new_wrapper));
+
+  auto *const i32Ty = Type::getInt32Ty(context);
+
+  auto sizeTyBytes = getSizeTypeBytes(M);
+
+  auto *VF = materializeVF(entryIR, barrierMain.getVFInfo().vf);
+  Value *localSizeDim[3];
+
+  if (auto wgs = parseRequiredWGSMetadata(refF)) {
+    // A required work-group size is known at compile time: use constants.
+    localSizeDim[0] = entryIR.getIntN(8 * sizeTyBytes, (*wgs)[0]);
+    localSizeDim[1] = entryIR.getIntN(8 * sizeTyBytes, (*wgs)[1]);
+    localSizeDim[2] = entryIR.getIntN(8 * sizeTyBytes, (*wgs)[2]);
+  } else {
+    const uint32_t max_work_dim = parseMaxWorkDimMetadata(refF).value_or(3);
+
+    // Fill out a default local size of 1x1x1. Dimensions at or beyond
+    // max_work_dim keep this default rather than querying the builtin.
+    std::fill(std::begin(localSizeDim), std::end(localSizeDim),
+              entryIR.getIntN(8 * sizeTyBytes, 1));
+
+    auto *const get_local_size =
+        BI.getOrDeclareMuxBuiltin(eMuxBuiltinGetLocalSize, M);
+    assert(get_local_size && "Missing __mux_get_local_size");
+
+    auto ci0 =
+        entryIR.CreateCall(get_local_size, entryIR.getInt32(0), "local_size.x");
+    ci0->setCallingConv(get_local_size->getCallingConv());
+    localSizeDim[0] = ci0;
+
+    if (max_work_dim > 1) {
+      auto ci1 = entryIR.CreateCall(get_local_size, entryIR.getInt32(1),
+                                    "local_size.y");
+      ci1->setCallingConv(get_local_size->getCallingConv());
+      localSizeDim[1] = ci1;
+    }
+
+    if (max_work_dim > 2) {
+      auto ci2 = entryIR.CreateCall(get_local_size, entryIR.getInt32(2),
+                                    "local_size.z");
+      ci2->setCallingConv(get_local_size->getCallingConv());
+      localSizeDim[2] = ci2;
+    }
+  }
+
+  // Assume that local sizes are never zero. This prevents LLVM "saving" our
+  // loops by inserting llvm.umax (or its equivalent) to stop the loops we're
+  // about to create from causing headaches:
+  //   %iv.next = add i64 nuw %iv, 1
+  //   %exit = icmp eq i64 %iv.next, %localsizeY
+  //   br i1 %exit, label %exit.the.loop, %continue.the.loop
+  // If LLVM doesn't know that %localsizeY is never zero, it rightly determines
+  // that a zero size would cause problems, since we'd have to overflow our i64
+  // to exit the loop, but we've marked the increment as 'nuw'. So it inserts
+  // an llvm.umax to ensure the size is at least 1. Since we know our local
+  // sizes are never zero, an llvm.assume intrinsic prevents this from
+  // happening.
+  // We want to insert a call to __mux_set_max_sub_group_size next to these
+  // assumptions later on, so keep track of the last one we've inserted.
+  Instruction *setMaxSubgroupSizeInsertPt = nullptr;
+  for (auto i = 0; i < 3; i++) {
+    auto *const nonZero = entryIR.CreateICmpNE(
+        localSizeDim[i], ConstantInt::get(localSizeDim[i]->getType(), 0));
+    setMaxSubgroupSizeInsertPt = entryIR.CreateAssumption(nonZero);
+  }
+
+  const bool isVectorPredicated = barrierMain.getVFInfo().IsVectorPredicated;
+
+  // With a tail, the main loop only covers the largest multiple of VF that
+  // fits in the local size; 'peel' is the number of work-items left over.
+  // NOTE(review): local sizes are treated as unsigned elsewhere in this
+  // function (UDiv/ICmpULT) yet a signed remainder is used here - confirm
+  // that is intentional.
+  Value *mainLoopLimit = localSizeDim[workItemDim0];
+  Value *peel = nullptr;
+  if (emitTail) {
+    peel = entryIR.CreateSRem(mainLoopLimit, VF, "peel");
+    mainLoopLimit = entryIR.CreateSub(mainLoopLimit, peel, "mainLoopLimit");
+  }
+
+  // Set the number of subgroups in this kernel
+  {
+    auto setNumSubgroupsFn =
+        BI.getOrDeclareMuxBuiltin(eMuxBuiltinSetNumSubGroups, M);
+    assert(setNumSubgroupsFn && "Missing __mux_set_num_sub_groups");
+    // First, compute Z * Y
+    auto *const numSubgroupsZY = entryIR.CreateMul(
+        localSizeDim[workItemDim2], localSizeDim[workItemDim1], "sg.zy");
+    // Now multiply by the number of subgroups in the X dimension.
+    auto *numSubgroupsX = entryIR.CreateUDiv(mainLoopLimit, VF, "sg.main.x");
+    // Add on any tail iterations here.
+    if (peel) {
+      // A scalar tail runs one subgroup per leftover work-item...
+      Value *numTailSubgroupsX = peel;
+      if (tailInfo && tailInfo->IsVectorPredicated) {
+        // ... whereas a vector-predicated tail mops up all of the leftover
+        // work-items in a single subgroup, or none if there is no remainder.
+        auto *const hasTail = entryIR.CreateICmp(
+            CmpInst::ICMP_NE, peel, ConstantInt::get(peel->getType(), 0),
+            "sg.tail.has.vp");
+        numTailSubgroupsX = entryIR.CreateZExt(
+            hasTail, numSubgroupsX->getType(), "sg.tail.x");
+      }
+      numSubgroupsX =
+          entryIR.CreateAdd(numSubgroupsX, numTailSubgroupsX, "sg.x");
+    } else if (isVectorPredicated) {
+      // Vector predication will use an extra subgroup to mop up any remainder.
+      auto *const leftover = entryIR.CreateSRem(mainLoopLimit, VF, "peel");
+      auto *hasLeftover = entryIR.CreateICmp(
+          CmpInst::ICMP_NE, leftover, ConstantInt::get(leftover->getType(), 0),
+          "sg.has.vp");
+      hasLeftover = entryIR.CreateZExt(hasLeftover, numSubgroupsX->getType());
+      numSubgroupsX = entryIR.CreateAdd(numSubgroupsX, hasLeftover, "sg.x");
+    }
+    auto *numSubgroups =
+        entryIR.CreateMul(numSubgroupsZY, numSubgroupsX, "sg.zyx");
+    // The builtin is called with an i32; narrow the size-typed product.
+    if (numSubgroups->getType() != i32Ty) {
+      numSubgroups = entryIR.CreateTrunc(numSubgroups, i32Ty);
+    }
+    entryIR.CreateCall(setNumSubgroupsFn, {numSubgroups});
+  }
+
+  if (barrierMain.hasLiveVars()) {
+    // The size in the first dimension is divided by the vectorization factor.
+    // When vector-predicated, this result is rounded up: (LIM + VF - 1) / VF.
+    // This catches cases where we need two loop iterations, e.g., VF=4 and
+    // size=7, where rounding down would give one.
+    Value *numerator = mainLoopLimit;
+    if (isVectorPredicated) {
+      Value *const vf_minus_1 =
+          entryIR.CreateSub(VF, ConstantInt::get(VF->getType(), 1));
+      numerator = entryIR.CreateAdd(mainLoopLimit, vf_minus_1);
+    }
+    Value *const size0 = entryIR.CreateUDiv(numerator, VF);
+
+    setUpLiveVarsAlloca(barrierMain, entryIR, localSizeDim[workItemDim2],
+                        localSizeDim[workItemDim1], size0, "live_variables",
+                        IsDebug);
+  }
+
+  // Amazingly, it's possible for the tail kernel to have live vars in its
+  // barriers, even when the main kernel does not.
+  if (emitTail && barrierTail->hasLiveVars()) {
+    Value *size0 = peel;
+    if (barrierTail->getVFInfo().IsVectorPredicated) {
+      // If the tail is predicated, it will only have a single (vectorized) item
+      // along the X axis, or none.
+      auto *const hasLeftover = entryIR.CreateICmp(
+          CmpInst::ICMP_NE, peel, ConstantInt::get(peel->getType(), 0),
+          "tail.has.vp");
+      size0 = entryIR.CreateZExt(hasLeftover, peel->getType());
+    }
+    setUpLiveVarsAlloca(*barrierTail, entryIR, localSizeDim[workItemDim2],
+                        localSizeDim[workItemDim1], size0,
+                        "live_variables_peel", IsDebug);
+  }
+
+  // 'next' means the id of the next barrier region to run. This variable is
+  // uninitialized to begin with, and is set by the first pass below. It is
+  // loaded again whenever a region has more than one successor.
+  IntegerType *index_type = i32Ty;
+  AllocaInst *nextID =
+      entryIR.CreateAlloca(index_type, nullptr, "next_barrier_id");
+
+  SmallVector<BasicBlock *, 8> bbs;
+  const unsigned num_blocks = barrierMain.getNumSubkernels();
+  assert(!emitTail || barrierTail->getNumSubkernels() == num_blocks);
+
+  // One block per barrier id, indexed by that id; the block at
+  // kBarrier_EndID is the wrapper's exit.
+  for (unsigned i = kBarrier_EndID; i <= num_blocks; i++) {
+    BasicBlock *bb = BasicBlock::Create(context, "sw.bb", new_wrapper);
+    bbs.push_back(bb);
+  }
+
+  // Hand everything computed in the entry block over to the generator which
+  // emits the per-region work-item loops.
+  ScheduleGenerator schedule(M, barrierMain, barrierTail, BI);
+  schedule.workItemDim0 = workItemDim0;
+  schedule.workItemDim1 = workItemDim1;
+  schedule.workItemDim2 = workItemDim2;
+  schedule.localSizeDim[0] = localSizeDim[0];
+  schedule.localSizeDim[1] = localSizeDim[1];
+  schedule.localSizeDim[2] = localSizeDim[2];
+  schedule.wrapperDbgLoc = wrapperDbgLoc;
+  schedule.nextID = nextID;
+  schedule.mainLoopLimit = mainLoopLimit;
+  schedule.emitTail = emitTail;
+  schedule.isVectorPredicated = isVectorPredicated;
+  schedule.peel = peel;
+
+  // Make call instruction for first new kernel. It follows wrapper function's
+  // parameters.
+  for (auto &arg : new_wrapper->args()) {
+    schedule.args.push_back(&arg);
+  }
+
+  // Branch directly into the first basic block.
+  entryIR.CreateBr(bbs[kBarrier_FirstID]);
+
+  for (unsigned i = kBarrier_EndID; i <= num_blocks; i++) {
+    // Keep it linear
+    BasicBlock *const block = bbs[i];
+    block->moveAfter(&new_wrapper->back());
+
+    if (i == kBarrier_EndID) {
+      // This basic block breaks us out of our function, thus we return!
+      ReturnInst::Create(context, block);
+    } else {
+      // Re-issue the barrier's memory fence before the work-item loops
+      if (auto *const CI = barrierMain.getBarrierCall(i)) {
+        auto *const callee = CI->getCalledFunction();
+        const auto builtin = BI.analyzeBuiltin(*callee);
+        if (builtin.ID == compiler::utils::eMuxBuiltinWorkGroupBarrier) {
+          IRBuilder<> B(block);
+          auto *MemBarrier =
+              BI.getOrDeclareMuxBuiltin(eMuxBuiltinMemBarrier, M);
+          assert(MemBarrier);
+          // Forward operands 1 and 2 of the work-group barrier call to the
+          // memory barrier.
+          Value *Ops[2] = {CI->getOperand(1), CI->getOperand(2)};
+
+          auto *const Call = B.CreateCall(MemBarrier, Ops);
+
+          // Patch up any operands that were non-constants by fetching them from
+          // the barrier struct. We do this after creating the call because we
+          // need an instruction to function as an insert point.
+          if (!isa<Constant>(Ops[0]) || !isa<Constant>(Ops[1])) {
+            // We expect these values to be uniform so it should be safe to get
+            // from the barrier struct at index zero. Barriers are convergent,
+            // so there should be no chance that the value does not exist.
+            auto *const zero =
+                Constant::getNullValue(compiler::utils::getSizeType(M));
+            IRBuilder<> ir(Call);
+            auto *const barrier0 =
+                ir.CreateInBoundsGEP(barrierMain.getLiveVarsType(),
+                                     barrierMain.getMemSpace(), {zero});
+
+            Barrier::LiveValuesHelper live_values(barrierMain, Call, barrier0);
+
+            size_t op_index = 0;
+            for (auto *const op : Ops) {
+              if (!isa<Constant>(op)) {
+                auto *const new_op =
+                    live_values.getReload(op, ir, "_load", /*reuse*/ true);
+                Call->setArgOperand(op_index, new_op);
+              }
+              ++op_index;
+            }
+          }
+          Call->setDebugLoc(wrapperDbgLoc);
+        }
+      }
+
+      // Expand this region according to its schedule. The returned block is
+      // the one in which the region's loops finish, i.e., where the branch to
+      // the successor region(s) must be placed.
+      auto *const exitBlock = [&]() {
+        switch (barrierMain.getSchedule(i)) {
+          default:
+            assert(!"Unexpected barrier schedule enum");
+            LLVM_FALLTHROUGH;
+          case BarrierSchedule::Unordered:
+          case BarrierSchedule::ScalarTail:
+            // A vector-predicated tail forces the linear loop structure.
+            if (tailInfo && tailInfo->IsVectorPredicated) {
+              return schedule.makeLinearWorkItemLoops(block, i);
+            }
+            return schedule.makeWorkItemLoops(block, i);
+
+          case BarrierSchedule::Once:
+            return schedule.makeRunOneWorkItem(block, i);
+
+          case BarrierSchedule::Linear:
+            return schedule.makeLinearWorkItemLoops(block, i);
+        }
+      }();
+
+      // the last basic block in our function!
+      IRBuilder<> exitIR(exitBlock);
+
+      const auto &successors = barrierMain.getSuccessorIds(i);
+      const auto num_succ = successors.size();
+
+      if (num_succ == 1) {
+        // If there is only one successor, we can branch directly to it
+        exitIR.CreateBr(bbs[successors.front()]);
+      } else if (num_succ == 2) {
+        // If there are exactly two successors, we can use a conditional branch
+        // on whether the stored next-barrier id equals the first successor.
+        auto *const bb_id = ConstantInt::get(index_type, successors[0]);
+        auto *const br_block =
+            BasicBlock::Create(context, "barrier.branch", new_wrapper);
+        auto *const ld_next_id = new LoadInst(index_type, nextID, "", br_block);
+        auto *const cmp_id =
+            CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, ld_next_id,
+                            bb_id, "", br_block);
+        BranchInst::Create(bbs[successors[0]], bbs[successors[1]], cmp_id,
+                           br_block);
+
+        exitIR.CreateBr(br_block);
+      } else if (num_succ == 0) {
+        // If a barrier region has no successor, we just emit a call to
+        // llvm.trap and unreachable. A barrier region can have zero successors
+        // if all its terminators end in unreachable. Since there are no
+        // successors, it is not possible to continue and therefore we emit an
+        // unreachable here.
+
+        // TODO: we should be flagging up unreachables sooner, so that we avoid
+        // wrapping barrier regions with no successors with work item loops,
+        // and we should also make sure that the barrier region has no
+        // successors because of all its terminators ending in unreachable.
+        // If it's not the case we may want to handle that differently.
+        auto trap =
+            M.getOrInsertFunction("llvm.trap", Type::getVoidTy(context));
+        exitIR.CreateCall(trap);
+        exitIR.CreateUnreachable();
+      } else {
+        // Make a basic block with a switch to jump to the next subkernel. The
+        // first successor doubles as the switch's default destination.
+        auto *const switch_body =
+            BasicBlock::Create(context, "barrier.switch", new_wrapper);
+        LoadInst *const ld_next_id =
+            new LoadInst(index_type, nextID, "", switch_body);
+        SwitchInst *const sw = SwitchInst::Create(
+            ld_next_id, bbs[successors[0]], num_succ, switch_body);
+        for (const auto succ_id : successors) {
+          sw->addCase(ConstantInt::get(index_type, succ_id), bbs[succ_id]);
+        }
+        exitIR.CreateBr(switch_body);
+      }
+    }
+  }
+
+  bbs[kBarrier_EndID]->moveAfter(&new_wrapper->back());
+  bbs[kBarrier_EndID]->setName("kernel.exit");
+
+  // Set the subgroup maximum size in this kernel wrapper.
+  // There are three cases:
+  //
+  // 1. With no vectorization:
+  //    get_max_sub_group_size() = mux sub-group size
+  //
+  // 2. With predicated vectorization:
+  //    get_max_sub_group_size() = min(vector_width,
+  //    local_size_in_vectorization_dimension)
+  //
+  // 3. Without predicated vectorization:
+  //    get_max_sub_group_size() = local_size_in_vectorization_dimension
+  //    < vector_width ? mux sub-group size : vector_width
+  //
+  // This is done last because whether the wrapper ended up with a main loop
+  // (schedule.wrapperHasMain) is only known once the loops have been emitted.
+  {
+    // Reset the insertion point back into the wrapper entry block, at the
+    // last local-size assumption (new instructions go immediately before
+    // it). Both VF and the local sizes were materialized before that point.
+    entryIR.SetInsertPoint(setMaxSubgroupSizeInsertPt);
+    auto setMaxSubgroupSizeFn =
+        BI.getOrDeclareMuxBuiltin(eMuxBuiltinSetMaxSubGroupSize, M);
+    assert(setMaxSubgroupSizeFn && "Missing __mux_set_max_sub_group_size");
+    // Assume no vectorization to begin with i.e. get_max_sub_group_size() = mux
+    // sub-group size.
+    Value *maxSubgroupSize = entryIR.getInt32(getMuxSubgroupSize(refF));
+    if (schedule.wrapperHasMain) {
+      auto *localSizeInVecDim = localSizeDim[workItemDim0];
+      auto *cmp = entryIR.CreateICmpULT(localSizeInVecDim, VF);
+      if (isVectorPredicated) {
+        maxSubgroupSize = entryIR.CreateSelect(cmp, localSizeInVecDim, VF);
+      } else {
+        maxSubgroupSize = entryIR.CreateSelect(
+            cmp, ConstantInt::get(VF->getType(), getMuxSubgroupSize(refF)), VF);
+      }
+      if (maxSubgroupSize->getType() != i32Ty) {
+        maxSubgroupSize = entryIR.CreateTrunc(maxSubgroupSize, i32Ty);
+      }
+    }
+    entryIR.CreateCall(setMaxSubgroupSizeFn, {maxSubgroupSize});
+  }
+
+  // Remap any constant expression which take a reference to the old function
+  // FIXME: What about the main function?
+  for (auto *user : make_early_inc_range(refF.users())) {
+    if (ConstantExpr *constant = dyn_cast<ConstantExpr>(user)) {
+      remapConstantExpr(constant, &refF, new_wrapper);
+    } else if (ConstantArray *ca = dyn_cast<ConstantArray>(user)) {
+      remapConstantArray(ca, &refF, new_wrapper);
+    } else if (!isa<CallInst>(user)) {
+      llvm_unreachable(
+          "Cannot handle user of function being anything other than a "
+          "ConstantExpr, ConstantArray or CallInst");
+    }
+  }
+  // We output the number of uses here to lit test that the number of uses was
+  // not increased by the remap functions.
+  LLVM_DEBUG(dbgs() << "Uses of " << refF.getName() << ": " << refF.getNumUses()
+                    << "\n");
+
+  // Forcibly disable the tail info if we know we've omitted it.
+  if (!schedule.wrapperHasMain || !schedule.wrapperHasTail) {
+    // If we're missing a main loop then the tail loop becomes the main from
+    // the perspective of the metadata: have that steal the tail loop info. We
+    // should always have a main loop with an optional tail.
+    if (!schedule.wrapperHasMain) {
+      if (schedule.wrapperHasTail && tailInfo) {
+        mainInfo = *tailInfo;
+      } else {
+        // If we have neither a main nor a tail (which may happen at kernel
+        // compile time but we should never actually execute such a kernel -
+        // we already assume the local sizes are never zero, see elsewhere in
+        // this pass) then encode a token info metadata of 1.
+        mainInfo =
+            VectorizationInfo{VectorizationFactor::getScalar(), workItemDim0,
+                              /*isVectorPredicated*/ false};
+      }
+    }
+    tailInfo = std::nullopt;
+  }
+
+  encodeWrapperFnMetadata(*new_wrapper, mainInfo, tailInfo);
+
+  // The subkernels can be marked as internal since its external uses have been
+  // superseded by this wrapper. This will help it get DCE'd once inlined. Any
+  // existing calls to this subkernel (e.g., another kernel calling this
+  // kernel) will prevent it from being removed unnecessarily.
+  barrierMain.getFunc().setLinkage(Function::InternalLinkage);
+  if (barrierTail) {
+    barrierTail->getFunc().setLinkage(Function::InternalLinkage);
+  }
+
+  return new_wrapper;
+}
+
+// Describes one barrier wrapper to be created: the 'main' kernel to wrap
+// plus, optionally, a 'tail' kernel.
+//
+// Instances are aggregate-initialized, so the member order is significant.
+struct BarrierWrapperInfo {
+  // Base name of the kernel, as given by getBaseFnNameOrFnName.
+  StringRef BaseName;
+  // The 'main' kernel and its vectorization information.
+  Function *MainF;
+  compiler::utils::VectorizationInfo MainInfo;
+  // The optional 'tail' kernel and its vectorization information; both are
+  // left unset when no tail is to be emitted.
+  Function *TailF{nullptr};
+  std::optional<compiler::utils::VectorizationInfo> TailInfo{};
+  // A 'tail' kernel which was explicitly omitted.
+  Function *SkippedTailF{nullptr};
+};
+
+PreservedAnalyses compiler::utils::WorkItemLoopsPass::run(
+    Module &M, ModuleAnalysisManager &MAM) {
+  // Cache the functions we're interested in as this pass introduces new ones
+  // which we don't want to run over.
+  SmallVector<BarrierWrapperInfo, 4> MainTailPairs;
+  const auto &GSGI = MAM.getResult<compiler::utils::SubgroupAnalysis>(M);
+
+  for (auto &F : M.functions()) {
+    if (!isKernelEntryPt(F)) {
+      continue;
+    }
+
+    const auto BaseName = getBaseFnNameOrFnName(F);
+    auto VeczToOrigFnData = parseVeczToOrigFnLinkMetadata(F);
+
+    const auto WorkItemDim0 = 0;
+
+    const VectorizationInfo scalarTailInfo{VectorizationFactor::getScalar(),
+                                           WorkItemDim0,
+                                           /*IsVectorPredicated*/ false};
+
+    if (!VeczToOrigFnData) {
+      // If there was no vectorization metadata, it's a scalar kernel.
+      MainTailPairs.push_back({BaseName, &F, scalarTailInfo});
+      continue;
+    }
+
+    // If we got a vectorized kernel, wrap it using the vectorization factor.
+    const auto MainInfo = VeczToOrigFnData->second;
+
+    // Start out assuming scalar tail, which is the default behaviour...
+    auto TailInfo = scalarTailInfo;
+    auto *TailFunc = VeczToOrigFnData->first;
+    // ... and search for a linked vector-predicated tail, which we prefer.
+    if (!MainInfo.IsVectorPredicated && TailFunc) {
+      SmallVector<LinkMetadataResult, 4> LinkedFns;
+      parseOrigToVeczFnLinkMetadata(*TailFunc, LinkedFns);
+      for (const auto &Link : LinkedFns) {
+        // Restrict our option to strict VF==VF matches.
+        if (Link.first != &F && Link.second.vf == MainInfo.vf &&
+            Link.second.IsVectorPredicated) {
+          TailFunc = Link.first;
+          TailInfo = Link.second;
+          break;
+        }
+      }
+    }
+
+    std::optional<size_t> LocalSizeInVecDim;
+    if (auto WGS = parseRequiredWGSMetadata(F)) {
+      LocalSizeInVecDim = (*WGS)[WorkItemDim0];
+    }
+
+    // We can skip the tail in the following circumstances:
+    // * If we have no tail function (trusting that this is okay)
+    // * Vector-predicated kernels handle their own tails
+    // * The user has explicitly forced us to omit tails
+    // * We can prove that the vectorization factor fits the required/known
+    //   local work-group size
+    if (!TailFunc || MainInfo.IsVectorPredicated || ForceNoTail ||
+        (LocalSizeInVecDim && !MainInfo.vf.isScalable() &&
+         *LocalSizeInVecDim % MainInfo.vf.getKnownMin() == 0)) {
+      MainTailPairs.push_back({BaseName, &F, MainInfo, /*TailF*/ nullptr,
+                               /*TailInfo*/ std::nullopt,
+                               /*SkippedTailF*/ TailFunc});
+    } else {
+      // Else, emit a tail using the tail function.
+      MainTailPairs.push_back({BaseName, &F, MainInfo, TailFunc, TailInfo});
+    }
+  }
+
+  if (MainTailPairs.empty()) {
+    return PreservedAnalyses::all();
+  }
+
+  // Prune redundant wrappers we don't want to create for the sake of compile
+  // time.
+  SmallPtrSet<const Function *, 4> RedundantMains;
+  for (const auto &P : MainTailPairs) {
+    // If we're creating a wrapper with a skipped 'tail' or a scalar 'tail', we
+    // don't want to create another wrapper where that tail is the 'main',
+    // unless the tail is useful as a fallback sub-group kernel. The tail is
+    // NOT needed as a fallback (and so is pruned) when any of these holds:
+    // * The 'main' is a degenerate sub-group kernel. These are always safe
+    // to run so the fallback is unnecessary.
+    // * The 'main' has a required sub-group size that isn't the scalar size.
+    // * Neither the 'main' nor the 'tail' kernel makes use of sub-group
+    // builtins, so there's no need for the fallback.
+    // * The 'main' is not vector-predicated and its vectorization factor
+    // cleanly divides the required local work-group size in dimension 0.
+    if (P.SkippedTailF || (P.TailInfo && P.TailInfo->vf.isScalar())) {
+      const auto *TailF = P.SkippedTailF ? P.SkippedTailF : P.TailF;
+      if (hasDegenerateSubgroups(*P.MainF) ||
+          getReqdSubgroupSize(*P.MainF).value_or(1) != 1 ||
+          (!GSGI.usesSubgroups(*P.MainF) && !GSGI.usesSubgroups(*TailF))) {
+        RedundantMains.insert(TailF);
+      } else if (auto wgs = parseRequiredWGSMetadata(*P.MainF)) {
+        const uint64_t local_size_x = wgs.value()[0];
+        if (!P.MainInfo.IsVectorPredicated &&
+            !(local_size_x % P.MainInfo.vf.getKnownMin())) {
+          RedundantMains.insert(TailF);
+        }
+      }
+    }
+    // If we're creating a wrapper with a VP 'tail', we don't want to create
+    // another wrapper where the VP kernel is the 'main'.
+    if (!P.MainInfo.IsVectorPredicated && P.TailInfo &&
+        P.TailInfo->IsVectorPredicated) {
+      RedundantMains.insert(P.TailF);
+    }
+  }
+
+  MainTailPairs.erase(
+      std::remove_if(MainTailPairs.begin(), MainTailPairs.end(),
+                     [&RedundantMains](const BarrierWrapperInfo &I) {
+                       return RedundantMains.contains(I.MainF);
+                     }),
+      MainTailPairs.end());
+
+  SmallPtrSet<Function *, 4> Wrappers;
+  auto &BI = MAM.getResult<BuiltinInfoAnalysis>(M);
+
+  for (const auto &P : MainTailPairs) {
+    assert(P.MainF && "Missing main function");
+    // Construct the main barrier
+    BarrierWithLiveVars MainBarrier(M, *P.MainF, P.MainInfo, IsDebug);
+    MainBarrier.Run(MAM);
+
+    // Tail kernels are optional
+    if (!P.TailF) {
+      Wrappers.insert(
+          makeWrapperFunction(MainBarrier, nullptr, P.BaseName, M, BI));
+    } else {
+      // Construct the tail barrier
+      assert(P.TailInfo && "Missing tail info");
+      BarrierWithLiveVars TailBarrier(M, *P.TailF, *P.TailInfo, IsDebug);
+      TailBarrier.Run(MAM);
+
+      Wrappers.insert(
+          makeWrapperFunction(MainBarrier, &TailBarrier, P.BaseName, M, BI));
+    }
+  }
+
+  // At this point we mandate that any kernels that haven't been wrapped with
+  // work-item loops can't be kernels, nor entry points.
+  for (auto &F : M) {
+    if (isKernelEntryPt(F) && !Wrappers.contains(&F)) {
+      dropIsKernel(F);
+      // FIXME: Also mark them as internal in case they contain symbols we
+      // haven't resolved as part of the work-item loop wrapping process. We
+      // rely on GlobalOptPass to remove such functions; this is the same root
+      // issue as CA-4126.
+      F.setLinkage(GlobalValue::InternalLinkage);
+    }
+  }
+
+  return PreservedAnalyses::none();
+}

From cca9b23912a69ac7390f63e009537af7c1b13d71 Mon Sep 17 00:00:00 2001
From: PietroGhg <pietro.ghiglio@codeplay.com>
Date: Mon, 6 May 2024 14:52:29 +0100
Subject: [PATCH 108/182] Use LLVM passes to output modules

---
 .../compiler_passes/vecz/tools/source/veczc.cpp          | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp
index 102f2d1f0ad20..4fb058699f515 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp
@@ -27,8 +27,9 @@
 #include <llvm/Analysis/CGSCCPassManager.h>
 #include <llvm/Analysis/TargetTransformInfo.h>
 #include <llvm/Bitcode/BitcodeReader.h>
-#include <llvm/Bitcode/BitcodeWriter.h>
+#include <llvm/Bitcode/BitcodeWriterPass.h>
 #include <llvm/IR/LegacyPassManager.h>
+#include <llvm/IRPrinter/IRPrintingPasses.h>
 #include <llvm/IRReader/IRReader.h>
 #include <llvm/InitializePasses.h>
 #include <llvm/MC/TargetRegistry.h>
@@ -443,11 +444,13 @@ int main(const int argc, const char *const argv[]) {
   }
 
   // Write the resulting module.
+  llvm::ModulePassManager printMPM;
   if (WriteTextual) {
-    Out->os() << *module;
+    printMPM.addPass(llvm::PrintModulePass(Out->os()));
   } else {
-    llvm::WriteBitcodeToFile(*module, Out->os());
+    printMPM.addPass(llvm::BitcodeWriterPass(Out->os()));
   }
+  printMPM.run(*module, passMach.getMAM());
 
   Out->keep();
 

From 2dc22836de4964d90a731dbc6a3bac7379d02830 Mon Sep 17 00:00:00 2001
From: Colin Davidson <colin.davidson@codeplay.com>
Date: Thu, 9 May 2024 10:23:20 +0100
Subject: [PATCH 109/182] Remove all code specific to LLVM 16

We have now switched to a minimum of LLVM 17, so there is no need
for specific code to handle LLVM 16.
---
 .../include/multi_llvm/multi_llvm.h           |  1 -
 .../include/multi_llvm/triple.h               | 27 --------
 .../source/barrier_regions.cpp                |  4 +-
 .../source/cl_builtin_info.cpp                |  1 +
 .../compiler_pipeline/source/mangling.cpp     | 19 ------
 .../source/mux_builtin_info.cpp               |  5 --
 .../optimal_builtin_replacement_pass.cpp      |  2 +-
 .../source/pass_functions.cpp                 |  6 --
 .../source/pass_machinery.cpp                 |  4 --
 .../source/target_extension_types.cpp         | 61 -------------------
 .../source/transform/basic_mem2reg_pass.cpp   |  4 +-
 .../vecz/source/vector_target_info.cpp        |  2 +-
 .../vecz/source/vector_target_info_riscv.cpp  |  4 --
 .../vecz/tools/source/veczc.cpp               |  3 -
 14 files changed, 5 insertions(+), 138 deletions(-)
 delete mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/triple.h

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
index 85ffd2f72715b..be4169aed2120 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
@@ -20,6 +20,5 @@
 #include <multi_llvm/enums.h>
 #include <multi_llvm/llvm_version.h>
 #include <multi_llvm/loop_utils.h>
-#include <multi_llvm/triple.h>
 
 #endif  // MULTI_LLVM_MULTI_LLVM_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/triple.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/triple.h
deleted file mode 100644
index 1b069001f7019..0000000000000
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/triple.h
+++ /dev/null
@@ -1,27 +0,0 @@
-// Copyright (C) Codeplay Software Limited
-//
-// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
-// Exceptions; you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-// License for the specific language governing permissions and limitations
-// under the License.
-//
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#ifndef MULTI_LLVM_TRIPLE_H_INCLUDED
-#define MULTI_LLVM_TRIPLE_H_INCLUDED
-
-#include <multi_llvm/llvm_version.h>
-
-#if LLVM_VERSION_MAJOR >= 17
-#include <llvm/TargetParser/Triple.h>
-#else
-#include <llvm/ADT/Triple.h>
-#endif
-
-#endif  // MULTI_LLVM_TRIPLE_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
index 9d3dd8eb7b2f5..3affc145019cb 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
@@ -972,10 +972,8 @@ void compiler::utils::Barrier::MakeLiveVariableMemType() {
     // so record this and the matching byte offset into the struct.
 #if LLVM_VERSION_GREATER_EQUAL(18, 0)
     auto DbgIntrinsics = findDbgDeclares(member.value);
-#elif LLVM_VERSION_GREATER_EQUAL(17, 0)
-    auto DbgIntrinsics = FindDbgDeclareUses(member.value);
 #else
-    auto DbgIntrinsics = FindDbgAddrUses(member.value);
+    auto DbgIntrinsics = FindDbgDeclareUses(member.value);
 #endif
     for (auto DII : DbgIntrinsics) {
       if (auto dbgDeclare = dyn_cast<DbgDeclareInst>(DII)) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp
index 92faf3552cbd4..9f02245b369e9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp
@@ -28,6 +28,7 @@
 #include <llvm/Support/Debug.h>
 #include <llvm/Support/Error.h>
 #include <llvm/Support/MathExtras.h>
+#include <llvm/TargetParser/Triple.h>
 #include <llvm/Transforms/Utils/Cloning.h>
 #include <llvm/Transforms/Utils/ValueMapper.h>
 #include <multi_llvm/multi_llvm.h>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mangling.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mangling.cpp
index 45dc86ec4edaa..e2f41d3b39017 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mangling.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mangling.cpp
@@ -249,20 +249,15 @@ bool NameMangler::mangleType(raw_ostream &O, Type *Ty, TypeQualifiers Quals,
   } else if (Ty->isPointerTy()) {
     PointerType *PtrTy = cast<PointerType>(Ty);
     const unsigned AddressSpace = PtrTy->getAddressSpace();
-#if LLVM_VERSION_LESS(17, 0)
-    assert(PtrTy->isOpaque() && "No support for typed pointers past LLVM 15");
-#endif
     O << "u3ptr";
     manglePointerQuals(O, Qual, AddressSpace);
     return true;
-#if LLVM_VERSION_GREATER_EQUAL(17, 0)
   } else if (Ty->isTargetExtTy()) {
     if (auto Name = mangleBuiltinType(Ty)) {
       O << *Name;
       return true;
     }
     return false;
-#endif
   } else {
     return false;
   }
@@ -331,17 +326,6 @@ bool NameMangler::demangleSimpleType(Lexer &L, Type *&Ty, TypeQualifier &Qual) {
 }
 
 std::optional<std::string> NameMangler::mangleBuiltinType(Type *Ty) {
-  // With opaque pointers, before LLVM 17 we can't actually mangle OpenCL
-  // builtin types because our APIs don't expose the ability to mangle a pointer
-  // based on its element type.
-  // This is never a problem in the compiler as we don't generate such functions
-  // on the fly, but it is a weakness in the API. We could fix this, or wait it
-  // out until LLVM 17 becomes the minimum version, at which point target
-  // extension types save the day.
-#if LLVM_VERSION_LESS(17, 0)
-  (void)Ty;
-  return nullptr;
-#else
   auto *const TgtTy = cast<TargetExtType>(Ty);
   const StringRef Name = TgtTy->getName();
 
@@ -399,7 +383,6 @@ std::optional<std::string> NameMangler::mangleBuiltinType(Type *Ty) {
   }
 
   return std::to_string(MangledName.size()) + MangledName;
-#endif
 }
 
 bool NameMangler::demangleOpenCLBuiltinType(Lexer &L, Type *&Ty) {
@@ -408,7 +391,6 @@ bool NameMangler::demangleOpenCLBuiltinType(Lexer &L, Type *&Ty) {
     return true;
   }
 
-#if LLVM_VERSION_GREATER_EQUAL(17, 0)
   if (auto *TargetExtTy = [this, &L]() -> Type * {
         if (L.Consume("11ocl_image1d")) {
           return compiler::utils::tgtext::getImage1DTy(*Context);
@@ -449,7 +431,6 @@ bool NameMangler::demangleOpenCLBuiltinType(Lexer &L, Type *&Ty) {
     Ty = TargetExtTy;
     return true;
   }
-#endif
 
   StringRef Name;
   //
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mux_builtin_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mux_builtin_info.cpp
index b1120d26c1444..64644428940af 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mux_builtin_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mux_builtin_info.cpp
@@ -875,10 +875,6 @@ bool BIMuxInfoConcept::requiresSchedulingParameters(BuiltinID ID) {
 }
 
 Type *BIMuxInfoConcept::getRemappedTargetExtTy(Type *Ty, Module &M) {
-#if LLVM_VERSION_LESS(17, 0)
-  (void)M;
-  (void)Ty;
-#else
   // We only map target extension types
   assert(Ty && Ty->isTargetExtTy() && "Only expecting target extension types");
   auto &Ctx = Ty->getContext();
@@ -907,7 +903,6 @@ Type *BIMuxInfoConcept::getRemappedTargetExtTy(Type *Ty, Module &M) {
     }());
   }
 
-#endif
   return nullptr;
 }
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/optimal_builtin_replacement_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/optimal_builtin_replacement_pass.cpp
index 17b5f0ebd77fb..b834a15eb6c3a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/optimal_builtin_replacement_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/optimal_builtin_replacement_pass.cpp
@@ -26,7 +26,7 @@
 #include <llvm/IR/Instructions.h>
 #include <llvm/IR/IntrinsicInst.h>
 #include <llvm/IR/Intrinsics.h>
-#include <multi_llvm/triple.h>
+#include <llvm/TargetParser/Triple.h>
 #include <multi_llvm/vector_type_helper.h>
 
 #define DEBUG_TYPE "ca-optimal-builtins"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp
index 35cf68a70dd35..025a5daa16fb2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp
@@ -623,16 +623,10 @@ static llvm::Function *createKernelWrapperFunctionImpl(
   if (auto *SP = F.getSubprogram()) {
     const llvm::DIBuilder DIB(*F.getParent());
     llvm::DISubprogram *const NewSP = DIB.createArtificialSubprogram(SP);
-#if LLVM_VERSION_GREATER_EQUAL(17, 0)
     // Wipe the list of retained nodes, as this new function is a wrapper over
     // the old one and does not itself contain/retain any of the wrapped
     // function's nodes.
     NewSP->replaceRetainedNodes({});
-#else
-    // This does the same as the above, but there's no cleaner API with which
-    // to do it.
-    NewSP->replaceOperandWith(7, nullptr);
-#endif
     NewFunction.setSubprogram(NewSP);
   }
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_machinery.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_machinery.cpp
index 02527c0309406..bcb564de9c784 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_machinery.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_machinery.cpp
@@ -99,11 +99,7 @@ void PassMachinery::initializeFinish() {
   registerPassCallbacks();
 
   // Register pass instrumentation
-#if LLVM_VERSION_GREATER_EQUAL(17, 0)
   SI->registerCallbacks(PIC, &MAM);
-#else
-  SI->registerCallbacks(PIC, &FAM);
-#endif
 }
 
 void PassMachinery::buildDefaultAAPipeline() {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/target_extension_types.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/target_extension_types.cpp
index 90dfbaad05744..54a8097653edf 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/target_extension_types.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/target_extension_types.cpp
@@ -27,37 +27,17 @@ namespace utils {
 namespace tgtext {
 
 Type *getEventTy(LLVMContext &Ctx) {
-#if LLVM_VERSION_LESS(17, 0)
-  (void)Ctx;
-  llvm_unreachable("Can't use target extension types before LLVM 17");
-#else
   return TargetExtType::get(Ctx, "spirv.Event");
-#endif
 }
 
 Type *getSamplerTy(LLVMContext &Ctx) {
-#if LLVM_VERSION_LESS(17, 0)
-  (void)Ctx;
-  llvm_unreachable("Can't use target extension types before LLVM 17");
-#else
   return TargetExtType::get(Ctx, "spirv.Sampler");
-#endif
 }
 
 [[maybe_unused]] static Type *getImageTyHelper(
     LLVMContext &Ctx, ImageTyDimensionalityParam Dim, ImageTyDepthParam Depth,
     ImageTyArrayedParam Arrayed, ImageTyMSParam MS, ImageTySampledParam Sampled,
     ImageTyAccessQualParam AccessQual) {
-#if LLVM_VERSION_LESS(17, 0)
-  (void)Ctx;
-  (void)Dim;
-  (void)Depth;
-  (void)Arrayed;
-  (void)MS;
-  (void)Sampled;
-  (void)AccessQual;
-  llvm_unreachable("Can't use target extension types before LLVM 17");
-#else
   unsigned IntParams[7];
   IntParams[ImageTyDimensionalityIdx] = Dim;
   IntParams[ImageTyDepthIdx] = Depth;
@@ -68,7 +48,6 @@ Type *getSamplerTy(LLVMContext &Ctx) {
   IntParams[ImageTyAccessQualIdx] = AccessQual;
   return TargetExtType::get(Ctx, "spirv.Image", Type::getVoidTy(Ctx),
                             IntParams);
-#endif
 }
 
 [[maybe_unused]] static Type *getOpenCLImageTyHelper(
@@ -87,74 +66,34 @@ Type *getSamplerTy(LLVMContext &Ctx) {
 }
 
 Type *getImage1DTy(LLVMContext &Ctx, ImageTyAccessQualParam AccessQual) {
-#if LLVM_VERSION_LESS(17, 0)
-  (void)Ctx;
-  (void)AccessQual;
-  llvm_unreachable("Can't use target extension types before LLVM 17");
-#else
   return getOpenCLImageTyHelper(Ctx, ImageDim1D, ImageNonArrayed, AccessQual);
-#endif
 }
 
 Type *getImage1DArrayTy(LLVMContext &Ctx, ImageTyAccessQualParam AccessQual) {
-#if LLVM_VERSION_LESS(17, 0)
-  (void)Ctx;
-  (void)AccessQual;
-  llvm_unreachable("Can't use target extension types before LLVM 17");
-#else
   return getOpenCLImageTyHelper(Ctx, ImageDim1D, ImageArrayed, AccessQual);
-#endif
 }
 
 Type *getImage1DBufferTy(LLVMContext &Ctx, ImageTyAccessQualParam AccessQual) {
-#if LLVM_VERSION_LESS(17, 0)
-  (void)Ctx;
-  (void)AccessQual;
-  llvm_unreachable("Can't use target extension types before LLVM 17");
-#else
   return getOpenCLImageTyHelper(Ctx, ImageDimBuffer, ImageNonArrayed,
                                 AccessQual);
-#endif
 }
 
 Type *getImage2DTy(LLVMContext &Ctx, bool Depth, bool MS,
                    ImageTyAccessQualParam AccessQual) {
-#if LLVM_VERSION_LESS(17, 0)
-  (void)Ctx;
-  (void)Depth;
-  (void)MS;
-  (void)AccessQual;
-  llvm_unreachable("Can't use target extension types before LLVM 17");
-#else
   return getOpenCLImageTyHelper(
       Ctx, ImageDim2D, ImageNonArrayed, Depth ? ImageDepth : ImageDepthNone,
       MS ? ImageMSMultiSampled : ImageMSSingleSampled, AccessQual);
-#endif
 }
 
 Type *getImage2DArrayTy(LLVMContext &Ctx, bool Depth, bool MS,
                         ImageTyAccessQualParam AccessQual) {
-#if LLVM_VERSION_LESS(17, 0)
-  (void)Ctx;
-  (void)Depth;
-  (void)MS;
-  (void)AccessQual;
-  llvm_unreachable("Can't use target extension types before LLVM 17");
-#else
   return getOpenCLImageTyHelper(
       Ctx, ImageDim2D, ImageArrayed, Depth ? ImageDepth : ImageDepthNone,
       MS ? ImageMSMultiSampled : ImageMSSingleSampled, AccessQual);
-#endif
 }
 
 Type *getImage3DTy(LLVMContext &Ctx, ImageTyAccessQualParam AccessQual) {
-#if LLVM_VERSION_LESS(17, 0)
-  (void)Ctx;
-  (void)AccessQual;
-  llvm_unreachable("Can't use target extension types before LLVM 17");
-#else
   return getOpenCLImageTyHelper(Ctx, ImageDim3D, ImageNonArrayed, AccessQual);
-#endif
 }
 
 }  // namespace tgtext
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp
index cd2bb4f3c0dcf..af5e0f1737ddc 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp
@@ -185,10 +185,8 @@ bool BasicMem2RegPass::promoteAlloca(AllocaInst *Alloca) const {
       DIBuilder DIB(*Alloca->getModule(), /*AllowUnresolved*/ false);
 #if LLVM_VERSION_GREATER_EQUAL(18, 0)
       auto DbgIntrinsics = findDbgDeclares(Alloca);
-#elif LLVM_VERSION_GREATER_EQUAL(17, 0)
-      auto DbgIntrinsics = FindDbgDeclareUses(Alloca);
 #else
-      auto DbgIntrinsics = FindDbgAddrUses(Alloca);
+      auto DbgIntrinsics = FindDbgDeclareUses(Alloca);
 #endif
       for (auto oldDII : DbgIntrinsics) {
         ConvertDebugDeclareToDebugValue(oldDII, Store, DIB);
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
index c6dfc00e904e4..78c223bb766a0 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
@@ -19,7 +19,7 @@
 #include <llvm/IR/Instructions.h>
 #include <llvm/MC/TargetRegistry.h>
 #include <llvm/Target/TargetMachine.h>
-#include <multi_llvm/triple.h>
+#include <llvm/TargetParser/Triple.h>
 #include <multi_llvm/vector_type_helper.h>
 
 #include "debugging.h"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp
index 0ebb2edc816db..748991d555454 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp
@@ -747,11 +747,7 @@ Value *TargetInfoRISCV::createVPKernelWidth(IRBuilder<> &B,
   auto *const I64Ty = Type::getInt64Ty(B.getContext());
 
   auto *const VL =
-#if LLVM_VERSION_GREATER_EQUAL(17, 0)
       B.CreateIntrinsic(Intrinsic::RISCVIntrinsics::riscv_vsetvli, {I64Ty},
-#else
-      B.CreateIntrinsic(Intrinsic::RISCVIntrinsics::riscv_vsetvli_opt, {I64Ty},
-#endif
                         {RemainingIters, VSEW, VLMul});
 
   return B.CreateTrunc(VL, I32Ty);
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp
index 4fb058699f515..491086842da18 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp
@@ -294,9 +294,6 @@ int main(const int argc, const char *const argv[]) {
 
   llvm::SMDiagnostic err;
   llvm::LLVMContext context;
-#if LLVM_VERSION_LESS(17, 0)
-  context.setOpaquePointers(true);
-#endif
 
   std::unique_ptr<llvm::Module> module =
       llvm::parseIRFile(InputFilename, err, context);

From 80d71ac17fd98114cb3d3b5ba169aa305133c8b8 Mon Sep 17 00:00:00 2001
From: Colin Davidson <colin.davidson@codeplay.com>
Date: Mon, 6 May 2024 17:49:42 +0100
Subject: [PATCH 110/182] Fix replace_local_module_scope_variables memcpy bug

replace_local_module_scope_variables was incorrectly trying to clone memcpy
and delete the original, even though it was referenced from an
llvm.compiler.used entry to ensure it continued to exist.

This is fixed by starting from each global and walking back through its user
chain until an Instruction is found, so that only the functions that actually
require the extra parameter, plus all kernels, are cloned. It also moves
`addParamToAllFunctions` into ReplaceLocalModuleScopeVariablesPass, modifying
it along the way, as it is only used there and can be tailored to that pass.

This change puts it more in line with other similar add parameter
pass_functions such as the scheduling parameters and is more efficient.
---
 .../include/compiler/utils/pass_functions.h   |  20 ----
 .../source/pass_functions.cpp                 |  19 ----
 ...lace_local_module_scope_variables_pass.cpp | 106 +++++++++++++++++-
 3 files changed, 105 insertions(+), 40 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/pass_functions.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/pass_functions.h
index ea7be99445996..cb0b457cc60f4 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/pass_functions.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/pass_functions.h
@@ -159,26 +159,6 @@ bool cloneFunctionsAddArg(
 void remapClonedCallsites(llvm::Function &oldFunc, llvm::Function &newFunc,
                           bool extraArg);
 
-/// @brief Clone all functions in a module, appending an extra parameter to
-/// them.
-///
-/// @param module llvm module containing the functions
-/// @param newParamType Type of the parameter to be added
-/// @param newParamAttrs Parameter attributes of the parameter to be added
-/// @param updateMetaDataCallback if set, is invokved with the old function,
-/// new function and new argument index.
-///
-/// @return bool if the module has changed (currently always true)
-///
-/// This iterates through all the functions in a module and clones all
-/// functions with a body and adds the extra param at the end of their parameter
-/// lists. Simpler version of `cloneFunctionsAddArg()` where the use case is
-/// more limited.
-bool addParamToAllFunctions(
-    llvm::Module &module, llvm::Type *const newParamType,
-    const llvm::AttributeSet &newParamAttrs,
-    const UpdateMDCallbackFn &updateMetaDataCallback = nullptr);
-
 using CreateLoopBodyFn = std::function<llvm::BasicBlock *(
     llvm::BasicBlock *, llvm::Value *, llvm::ArrayRef<llvm::Value *>,
     llvm::MutableArrayRef<llvm::Value *>)>;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp
index 025a5daa16fb2..4bcf4f39cb63b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp
@@ -469,25 +469,6 @@ void remapClonedCallsites(llvm::Function &oldFunc, llvm::Function &newFunc,
   }
 }
 
-bool addParamToAllFunctions(llvm::Module &module,
-                            llvm::Type *const newParamType,
-                            const llvm::AttributeSet &newParamAttrs,
-                            const UpdateMDCallbackFn &updateMetaDataCallback) {
-  return cloneFunctionsAddArg(
-      module,
-      [newParamType, newParamAttrs](llvm::Module &) {
-        return ParamTypeAttrsPair{newParamType, newParamAttrs};
-      },
-      [](const llvm::Function &func, bool &ClonedWithBody, bool &ClonedNoBody) {
-        // don't clone and add arg to special functions starting with __llvm.
-        // These are reserved for clang generated functions such as profile
-        // related ones
-        ClonedWithBody = !func.getName().starts_with("__llvm");
-        ClonedNoBody = false;
-      },
-      updateMetaDataCallback);
-}
-
 llvm::BasicBlock *createLoop(llvm::BasicBlock *entry, llvm::BasicBlock *exit,
                              llvm::Value *indexStart, llvm::Value *indexEnd,
                              const CreateLoopOpts &opts,
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp
index bfd12374f960b..95bf6fb008e60 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp
@@ -19,6 +19,8 @@
 #include <compiler/utils/metadata.h>
 #include <compiler/utils/pass_functions.h>
 #include <compiler/utils/replace_local_module_scope_variables_pass.h>
+#include <llvm/ADT/PriorityWorklist.h>
+#include <llvm/ADT/SmallPtrSet.h>
 #include <llvm/IR/Constants.h>
 #include <llvm/IR/DIBuilder.h>
 #include <llvm/IR/IRBuilder.h>
@@ -31,6 +33,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "replace-module-scope-vars"
+
 namespace {
 using AlignIntTy = uint64_t;
 
@@ -117,6 +121,106 @@ struct GlobalVarDebugInfoWrapper final {
   Function *function;
 };
 
+// If a user is an instruction, add its enclosing function to Visited, Worklist
+// and FuncsToClone (first sighting only); otherwise recurse into its own users
+void checkUsersForInstructions(
+    User *user, llvm::SmallPtrSet<llvm::Function *, 4> &Visited,
+    llvm::SmallVector<llvm::Function *, 4> &FuncsToClone,
+    llvm::SmallPriorityWorklist<llvm::Function *, 4> &Worklist) {
+  if (auto *I = dyn_cast<Instruction>(user)) {
+    auto *F = I->getFunction();
+    if (Visited.insert(F).second) {
+      Worklist.insert(F);
+      FuncsToClone.push_back(F);
+      LLVM_DEBUG(
+          dbgs() << "Function '" << F->getName()
+                 << "' requires additional local module struct parameter\n");
+    }
+  } else {
+    for (auto *user_of_user : user->users()) {
+      checkUsersForInstructions(user_of_user, Visited, FuncsToClone, Worklist);
+    }
+  }
+}
+
+/// @brief Clone all required functions in a module, appending an extra
+/// parameter to them if they are part of the call graph required for access to
+/// local variables.
+///
+/// @param module llvm module containing the functions
+/// @param newParamType Type of the parameter to be added
+/// @param newParamAttrs Parameter attributes of the parameter to be added
+/// @return bool if the module has changed (currently always true)
+///
+/// Kernel entry points always get the parameter. So does any function with an
+/// instruction that (transitively) uses a global of the module - every global
+/// is scanned, not only local ones - as does every caller of such a function.
+bool addParamToAllRequiredFunctions(llvm::Module &module,
+                                    llvm::Type *const newParamType,
+                                    const llvm::AttributeSet &newParamAttrs) {
+  llvm::SmallPtrSet<llvm::Function *, 4> Visited;
+  llvm::SmallVector<llvm::Function *, 4> FuncsToClone;
+  llvm::SmallPriorityWorklist<llvm::Function *, 4> Worklist;
+
+  // Iterate through the top level functions checking if they are kernels.
+  for (auto &F : module.functions()) {
+    // Kernel entry points must present a consistent ABI to external users
+    if (compiler::utils::isKernelEntryPt(F)) {
+      Visited.insert(&F);
+      Worklist.insert(&F);
+      FuncsToClone.push_back(&F);
+      LLVM_DEBUG(
+          dbgs() << "Function '" << F.getName()
+                 << "' requires additional local module struct parameter\n");
+      continue;
+    }
+  }
+
+  // Check each global's users if they are instructions or recurse up the user
+  // chain if not. If an Instruction is found we add its parent function to the
+  // functions to clone.
+  for (auto &global : module.globals()) {
+    for (auto *user : global.users()) {
+      checkUsersForInstructions(user, Visited, FuncsToClone, Worklist);
+    }
+  }
+
+  // Iterate over the functions that require local struct parameters and
+  // register all callers of those functions (transitively, via the worklist)
+  // as needing them too. A user that is not a CallBase is a fatal error.
+  while (!Worklist.empty()) {
+    Function *F = Worklist.pop_back_val();
+    for (auto *U : F->users()) {
+      if (auto *CB = dyn_cast<CallBase>(U)) {
+        auto *Caller = CB->getFunction();
+        if (Visited.insert(Caller).second) {
+          Worklist.insert(Caller);
+          FuncsToClone.push_back(Caller);
+          LLVM_DEBUG(dbgs() << "Function '" << Caller->getName()
+                            << "' requires local struct parameters\n");
+        }
+      } else {
+        report_fatal_error("unhandled user type");
+      }
+    }
+  }
+
+  // Ideally cloneFunctionsAddArg() would take a list of functions, but
+  // currently takes a std::function so we search the created vector of
+  // functions.
+  return compiler::utils::cloneFunctionsAddArg(
+      module,
+      [newParamType, newParamAttrs](llvm::Module &) {
+        return compiler::utils::ParamTypeAttrsPair{newParamType, newParamAttrs};
+      },
+      [&FuncsToClone](const llvm::Function &func, bool &ClonedWithBody,
+                      bool &ClonedNoBody) {
+        ClonedWithBody = llvm::is_contained(FuncsToClone, &func);
+        ClonedNoBody = false;
+      },
+      nullptr /*updateMetaDataCallback*/);
+}
+
 }  // namespace
 
 PreservedAnalyses compiler::utils::ReplaceLocalModuleScopeVariablesPass::run(
@@ -250,7 +354,7 @@ PreservedAnalyses compiler::utils::ReplaceLocalModuleScopeVariablesPass::run(
 
   // change all our functions to take a pointer to the new structTy we created
   const AttributeSet defaultAttrs;
-  addParamToAllFunctions(M, structTy->getPointerTo(), defaultAttrs);
+  addParamToAllRequiredFunctions(M, structTy->getPointerTo(), defaultAttrs);
 
   // Check if we have debug info, if so we need to fix it up to turn global
   // variable entries into local variable ones.

From d1f526d35079772f6419a07692a90a1e5508466b Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Wed, 22 May 2024 15:57:24 +0100
Subject: [PATCH 111/182] Avoid StringRef.equals()

LLVM 19 has deprecated StringRef::equals in favor of plain ==. This has
been supported by all versions of LLVM ever since StringRef was first
added, so we can just change them as suggested.
---
 .../compiler_pipeline/source/cl_builtin_info.cpp       | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp
index 9f02245b369e9..ec5c1356b2405 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp
@@ -877,7 +877,7 @@ BuiltinID CLBuiltinInfo::identifyBuiltin(const Function &F) const {
   const auto Version = getOpenCLVersion(*F.getParent());
   const StringRef DemangledName = Mangler.demangleName(Name);
   while (entry->ID != eBuiltinInvalid) {
-    if (Version >= entry->MinVer && DemangledName.equals(entry->OpenCLFnName)) {
+    if (Version >= entry->MinVer && DemangledName == entry->OpenCLFnName) {
       return entry->ID;
     }
     entry++;
@@ -1327,8 +1327,8 @@ Function *CLBuiltinInfo::getVectorEquivalent(const Builtin &B, unsigned Width,
   StringRef FirstChunk;
   Lexer L(BuiltinName);
   if (L.ConsumeUntil('_', FirstChunk)) {
-    const bool AsBuiltin = FirstChunk.equals("as");
-    const bool ConvertBuiltin = FirstChunk.equals("convert");
+    const bool AsBuiltin = FirstChunk == "as";
+    const bool ConvertBuiltin = FirstChunk == "convert";
     if (!L.Consume("_")) {
       return nullptr;
     }
@@ -1443,8 +1443,8 @@ Function *CLBuiltinInfo::getScalarEquivalent(const Builtin &B, Module *M) {
   StringRef FirstChunk;
   Lexer L(BuiltinName);
   if (L.ConsumeUntil('_', FirstChunk)) {
-    const bool AsBuiltin = FirstChunk.equals("as");
-    const bool ConvertBuiltin = FirstChunk.equals("convert");
+    const bool AsBuiltin = FirstChunk == "as";
+    const bool ConvertBuiltin = FirstChunk == "convert";
     if (!L.Consume("_")) {
       return nullptr;
     }

From 8ebf0b787c35d0342dc8590f9241aed50634b8ad Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Mon, 17 Jun 2024 12:14:59 +0100
Subject: [PATCH 112/182] Update tests for LLVM 19.

LLVM 19 now prints in the new debug info format by default, so update
tests to account for this.

Also update ca-lit to call python3 rather than python. The latter is
less likely to be available, and even if available, may be Python 2
rather than Python 3.
---
 .../lit/llvm/inlined_function_debug_info.ll   |  9 +++--
 .../lit/llvm/insert_element_debug_info.ll     |  9 +++--
 .../test/lit/llvm/packetization_debug_info.ll | 33 +++++++++-------
 .../vecz/test/lit/llvm/phi_node_debug_info.ll |  9 +++--
 .../test/lit/llvm/scalarization_debug_info.ll | 38 +++++++++++--------
 5 files changed, 61 insertions(+), 37 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/inlined_function_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/inlined_function_debug_info.ll
index c6641f8aee1dc..a96bb1e109f23 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/inlined_function_debug_info.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/inlined_function_debug_info.ll
@@ -16,7 +16,8 @@
 
 ; Check VECZ debug info for inlined DILocation metadata nodes
 
-; RUN: veczc -k functions_one -vecz-passes=builtin-inlining -vecz-simd-width=4 -S < %s | FileCheck %s
+; RUN: %pp-llvm-ver -o %t < %s --llvm-ver %LLVMVER
+; RUN: veczc -k functions_one -vecz-passes=builtin-inlining -vecz-simd-width=4 -S < %s | FileCheck %t
 
 ; ModuleID = '/tmp/inlined_function.ll'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -130,8 +131,10 @@ attributes #4 = { nobuiltin }
 
 ; CHECK: %[[LOAD1:[0-9]+]] = load i32, ptr addrspace(1) %{{.*}}, align 4
 ; CHECK: %[[LOAD2:[0-9]+]] = load i32, ptr addrspace(1) %{{.*}}, align 4
-; CHECK: call void @llvm.dbg.value(metadata i32 %[[LOAD1]], metadata !{{[0-9]+}}, metadata !DIExpression()), !dbg [[DI_LOC1:![0-9]+]]
-; CHECK: call void @llvm.dbg.value(metadata i32 %[[LOAD2]], metadata !{{[0-9]+}}, metadata !DIExpression()), !dbg [[DI_LOC1]]
+; CHECK-GE19: #dbg_value(i32 %[[LOAD1]], !{{[0-9]+}}, !DIExpression(), [[DI_LOC1:![0-9]+]]
+; CHECK-LT19: call void @llvm.dbg.value(metadata i32 %[[LOAD1]], metadata !{{[0-9]+}}, metadata !DIExpression()), !dbg [[DI_LOC1:![0-9]+]]
+; CHECK-GE19: #dbg_value(i32 %[[LOAD2]], !{{[0-9]+}}, !DIExpression(), [[DI_LOC1]]
+; CHECK-LT19: call void @llvm.dbg.value(metadata i32 %[[LOAD2]], metadata !{{[0-9]+}}, metadata !DIExpression()), !dbg [[DI_LOC1]]
 ; CHECK: %{{.*}} = mul nsw i32 %[[LOAD1]], %[[LOAD2]], !dbg [[DI_LOC2:![0-9]+]]
 
 ; CHECK: [[HELPER_SUBPROGRAM:![0-9]+]] = distinct !DISubprogram(name: "k_one",
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insert_element_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insert_element_debug_info.ll
index 727130f6f2e92..2327a0207e8a2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insert_element_debug_info.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insert_element_debug_info.ll
@@ -18,7 +18,8 @@
 ; intrinsics across all lanes even when scalarization masks disable some
 ; of the lanes. This occurs when we scalarize insertelement instructions.
 
-; RUN: veczc -k unaligned_load -vecz-passes="function(instcombine,adce),scalarize,packetizer,instcombine" -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s
+; RUN: %pp-llvm-ver -o %t < %s --llvm-ver %LLVMVER
+; RUN: veczc -k unaligned_load -vecz-passes="function(instcombine,adce),scalarize,packetizer,instcombine" -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %t
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -49,8 +50,10 @@ entry:
 ; FIXME: This llvm.dbg.value marks a 'kill location' and denotes the
 ; termination of the previous value assigned to %tmp - we could probably do
 ; better here by manifesting a vectorized value?
-; CHECK: call void @llvm.dbg.value(metadata i32 {{(poison|undef)}}, metadata [[VAR:![0-9]+]],
-; CHECK-SAME:   metadata !DIExpression({{.*}})), !dbg !{{[0-9]+}}
+; CHECK-GE19: #dbg_value(i32 {{(poison|undef)}}, [[VAR:![0-9]+]],
+; CHECK-LT19: call void @llvm.dbg.value(metadata i32 {{(poison|undef)}}, metadata [[VAR:![0-9]+]],
+; CHECK-SAME:   !DIExpression({{.*}}),
+; CHECK-SAME:   !{{[0-9]+}}
   %1 = load i32, i32* %tid, align 4, !dbg !32
   %mul = mul nsw i32 3, %1, !dbg !32
   %idx.ext = sext i32 %mul to i64, !dbg !32
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_debug_info.ll
index 0dc600e899bc1..688d5be8fd10c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_debug_info.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_debug_info.ll
@@ -17,7 +17,8 @@
 ; Check that debug info is preserved in the vectorized kernel.
 ; Specifically that the packetization pass creates vector types
 ; in the DI for the variables.
-; RUN: veczc -k add -S < %s | FileCheck %s
+; RUN: %pp-llvm-ver -o %t < %s --llvm-ver %LLVMVER
+; RUN: veczc -k add -S < %s | FileCheck %t
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -35,32 +36,38 @@ entry:
   %a = alloca i32, align 4
   %b = alloca i32, align 4
   store i32 addrspace(1)* %in1, i32 addrspace(1)** %in1.addr, align 8
-; CHECK: call void @llvm.dbg.value(metadata ptr addrspace(1) %in1, metadata [[DI_IN1:![0-9]+]], metadata [[EXPR:!DIExpression()]]
-; CHECK-SAME: !dbg [[PARAM_LOC:![0-9]+]]
+; CHECK-GE19: #dbg_value(ptr addrspace(1) %in1, [[DI_IN1:![0-9]+]], [[EXPR:!DIExpression()]]
+; CHECK-LT19: call void @llvm.dbg.value(metadata ptr addrspace(1) %in1, metadata [[DI_IN1:![0-9]+]], metadata [[EXPR:!DIExpression()]]
+; CHECK-SAME: [[PARAM_LOC:![0-9]+]]
   call void @llvm.dbg.declare(metadata i32 addrspace(1)** %in1.addr, metadata !11, metadata !29), !dbg !30
   store i32 addrspace(1)* %in2, i32 addrspace(1)** %in2.addr, align 8
-; CHECK: call void @llvm.dbg.value(metadata ptr addrspace(1) %in2, metadata [[DI_IN2:![0-9]+]], metadata [[EXPR]]
-; CHECK-SAME: !dbg [[PARAM_LOC]]
+; CHECK-GE19: #dbg_value(ptr addrspace(1) %in2, [[DI_IN2:![0-9]+]], [[EXPR]]
+; CHECK-LT19: call void @llvm.dbg.value(metadata ptr addrspace(1) %in2, metadata [[DI_IN2:![0-9]+]], metadata [[EXPR]]
+; CHECK-SAME: [[PARAM_LOC]]
   call void @llvm.dbg.declare(metadata i32 addrspace(1)** %in2.addr, metadata !12, metadata !29), !dbg !30
   store i32 addrspace(1)* %out, i32 addrspace(1)** %out.addr, align 8
-; CHECK: call void @llvm.dbg.value(metadata ptr addrspace(1) %out, metadata [[DI_OUT:![0-9]+]], metadata [[EXPR]]
-; CHECK-SAME: !dbg [[PARAM_LOC]]
+; CHECK-GE19: #dbg_value(ptr addrspace(1) %out, [[DI_OUT:![0-9]+]], [[EXPR]]
+; CHECK-LT19: call void @llvm.dbg.value(metadata ptr addrspace(1) %out, metadata [[DI_OUT:![0-9]+]], metadata [[EXPR]]
+; CHECK-SAME: [[PARAM_LOC]]
   call void @llvm.dbg.declare(metadata i32 addrspace(1)** %out.addr, metadata !13, metadata !29), !dbg !30
-; CHECK: call void @llvm.dbg.value(metadata i64 %call, metadata [[DI_TID:![0-9]+]], metadata [[EXPR]]
-; CHECK-SAME: !dbg [[TID_LOC:![0-9]+]]
+; CHECK-GE19: #dbg_value(i64 %call, [[DI_TID:![0-9]+]], [[EXPR]]
+; CHECK-LT19: call void @llvm.dbg.value(metadata i64 %call, metadata [[DI_TID:![0-9]+]], metadata [[EXPR]]
+; CHECK-SAME: [[TID_LOC:![0-9]+]]
   call void @llvm.dbg.declare(metadata i64* %tid, metadata !14, metadata !29), !dbg !31
   %call = call i64 @__mux_get_global_id(i32 0) #3, !dbg !31
   store i64 %call, i64* %tid, align 8, !dbg !31
-; CHECK: call void @llvm.dbg.value(metadata i32 undef, metadata [[DI_A:![0-9]+]], metadata !DIExpression())
-; CHECK-SAME: !dbg [[A_LOC:![0-9]+]]
+; CHECK-GE19: #dbg_value(i32 undef, [[DI_A:![0-9]+]], !DIExpression(),
+; CHECK-LT19: call void @llvm.dbg.value(metadata i32 undef, metadata [[DI_A:![0-9]+]], metadata !DIExpression())
+; CHECK-SAME: [[A_LOC:![0-9]+]]
   call void @llvm.dbg.declare(metadata i32* %a, metadata !19, metadata !29), !dbg !32
   %0 = load i64, i64* %tid, align 8, !dbg !32
   %1 = load i32 addrspace(1)*, i32 addrspace(1)** %in1.addr, align 8, !dbg !32
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %1, i64 %0, !dbg !32
   %2 = load i32, i32 addrspace(1)* %arrayidx, align 4, !dbg !32
   store i32 %2, i32* %a, align 4, !dbg !32
-; CHECK: call void @llvm.dbg.value(metadata i32 undef, metadata [[DI_B:![0-9]+]], metadata !DIExpression())
-; CHECK-SAME: !dbg [[B_LOC:![0-9]+]]
+; CHECK-GE19: #dbg_value(i32 undef, [[DI_B:![0-9]+]], !DIExpression(),
+; CHECK-LT19: call void @llvm.dbg.value(metadata i32 undef, metadata [[DI_B:![0-9]+]], metadata !DIExpression())
+; CHECK-SAME: [[B_LOC:![0-9]+]]
   call void @llvm.dbg.declare(metadata i32* %b, metadata !20, metadata !29), !dbg !33
   %3 = load i64, i64* %tid, align 8, !dbg !33
   %4 = load i32 addrspace(1)*, i32 addrspace(1)** %in2.addr, align 8, !dbg !33
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_node_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_node_debug_info.ll
index cbd7444ed6b8d..5f1ddd384e5ba 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_node_debug_info.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_node_debug_info.ll
@@ -17,7 +17,8 @@
 ; Check that debug info intrinsics are correctly placed after
 ; phi nodes.
 
-; RUN: veczc -vecz-simd-width=4 -S < %s | FileCheck %s
+; RUN: %pp-llvm-ver -o %t < %s --llvm-ver %LLVMVER
+; RUN: veczc -vecz-simd-width=4 -S < %s | FileCheck %t
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -47,8 +48,10 @@ entry:
 
 ; CHECK: for.cond:
 ; CHECK: %[[PHI1:.+]] = phi {{i[0-9]+}} [ %{{.+}}, %entry ], [ %{{.+}}, %for.cond ]
-; CHECK: call void @llvm.dbg.value(metadata i64 %[[PHI1]], metadata !{{[0-9]+}},
-; CHECK-SAME: metadata !DIExpression({{.*}})), !dbg !{{[0-9]+}}
+; CHECK-GE19: #dbg_value(i64 %[[PHI1]], !{{[0-9]+}},
+; CHECK-LT19: call void @llvm.dbg.value(metadata i64 %[[PHI1]], metadata !{{[0-9]+}},
+; CHECK-SAME: !DIExpression({{.*}}),
+; CHECK-SAME: !{{[0-9]+}}
 ; Check we haven't inserted a llvm.dbg.value intrinsic before the last of the PHIs.
 ; CHECK-NOT: phi
 for.cond:                                         ; preds = %for.inc, %entry
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_debug_info.ll
index 60febc22cb725..6c75e223b8cd4 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_debug_info.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_debug_info.ll
@@ -18,7 +18,8 @@
 ; Specifically that the scalarization pass doesn't destroy DI
 ; intrinsics attached to the vector instructions it scalarizes.
 
-; RUN: veczc -k mul2 -vecz-passes="scalarize,function(mem2reg)" -vecz-choices=FullScalarization -S < %s | FileCheck %s
+; RUN: %pp-llvm-ver -o %t < %s --llvm-ver %LLVMVER
+; RUN: veczc -k mul2 -vecz-passes="scalarize,function(mem2reg)" -vecz-choices=FullScalarization -S < %s | FileCheck %t
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -131,26 +132,33 @@ attributes #3 = { nobuiltin }
 ; CHECK: @__vecz_v[[WIDTH:[0-9]+]]_mul2({{.*}} !dbg [[VECZ_SUBPROG:![0-9]+]]
 
 ; Check that intrinsics for user variable locations are still present
-; CHECK: call void @llvm.dbg.value(metadata {{.*}} %in1, metadata [[DI_IN1:![0-9]+]], metadata [[EXPR:!DIExpression()]]
-; CHECK-SAME: !dbg [[PARAM_LOC:![0-9]+]]
+; CHECK-LT19: call void @llvm.dbg.value(metadata {{.*}} %in1, metadata [[DI_IN1:![0-9]+]], metadata [[EXPR:!DIExpression()]]
+; CHECK-GE19: #dbg_value({{.*}} %in1, [[DI_IN1:![0-9]+]], [[EXPR:!DIExpression()]]
+; CHECK-SAME: [[PARAM_LOC:![0-9]+]]
 
-; CHECK: call void @llvm.dbg.value(metadata {{.*}} %in2, metadata [[DI_IN2:![0-9]+]], metadata [[EXPR]]
-; CHECK-SAME: !dbg [[PARAM_LOC]]
+; CHECK-LT19: call void @llvm.dbg.value(metadata {{.*}} %in2, metadata [[DI_IN2:![0-9]+]], metadata [[EXPR]]
+; CHECK-GE19: #dbg_value({{.*}} %in2, [[DI_IN2:![0-9]+]], [[EXPR]]
+; CHECK-SAME: [[PARAM_LOC]]
 
-; CHECK: call void @llvm.dbg.value(metadata {{.*}} %out, metadata [[DI_OUT:![0-9]+]], metadata [[EXPR]]
-; CHECK-SAME: !dbg [[PARAM_LOC]]
+; CHECK-LT19: call void @llvm.dbg.value(metadata {{.*}} %out, metadata [[DI_OUT:![0-9]+]], metadata [[EXPR]]
+; CHECK-GE19: #dbg_value({{.*}} %out, [[DI_OUT:![0-9]+]], [[EXPR]]
+; CHECK-SAME: [[PARAM_LOC]]
 
-; CHECK: call void @llvm.dbg.value(metadata i64 %call, metadata [[DI_TID:![0-9]+]], metadata [[EXPR]]
-; CHECK-SAME: !dbg [[TID_LOC:![0-9]+]]
+; CHECK-LT19: call void @llvm.dbg.value(metadata i64 %call, metadata [[DI_TID:![0-9]+]], metadata [[EXPR]]
+; CHECK-GE19: #dbg_value(i64 %call, [[DI_TID:![0-9]+]], [[EXPR]]
+; CHECK-SAME: [[TID_LOC:![0-9]+]]
 
-; CHECK: call void @llvm.dbg.declare(metadata ptr %a, metadata [[DI_A:![0-9]+]], metadata [[EXPR]]
-; CHECK-SAME:!dbg [[A_LOC:![0-9]+]]
+; CHECK-LT19: call void @llvm.dbg.declare(metadata ptr %a, metadata [[DI_A:![0-9]+]], metadata [[EXPR]]
+; CHECK-GE19: #dbg_declare(ptr %a, [[DI_A:![0-9]+]], [[EXPR]]
+; CHECK-SAME: [[A_LOC:![0-9]+]]
 
-; CHECK: call void @llvm.dbg.declare(metadata ptr %b, metadata [[DI_B:![0-9]+]], metadata [[EXPR]]
-; CHECK-SAME:!dbg [[B_LOC:![0-9]+]]
+; CHECK-LT19: call void @llvm.dbg.declare(metadata ptr %b, metadata [[DI_B:![0-9]+]], metadata [[EXPR]]
+; CHECK-GE19: #dbg_declare(ptr %b, [[DI_B:![0-9]+]], [[EXPR]]
+; CHECK-SAME: [[B_LOC:![0-9]+]]
 
-; CHECK: call void @llvm.dbg.declare(metadata ptr %tmp, metadata [[DI_TMP:![0-9]+]], metadata [[EXPR]]
-; CHECK-SAME:!dbg [[TMP_LOC:![0-9]+]]
+; CHECK-LT19: call void @llvm.dbg.declare(metadata ptr %tmp, metadata [[DI_TMP:![0-9]+]], metadata [[EXPR]]
+; CHECK-GE19: #dbg_declare(ptr %tmp, [[DI_TMP:![0-9]+]], [[EXPR]]
+; CHECK-SAME: [[TMP_LOC:![0-9]+]]
 
 ; Debug info metadata entries
 ; CHECK:[[PTR_TYPE:![0-9]+]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: [[DI_INT2:![0-9]+]], size: 64, align: 64)

From f4aaca57384304125c2a4c13622e2b090e03e87b Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Fri, 21 Jun 2024 09:41:18 +0100
Subject: [PATCH 113/182] [LLVM 19] Drop debug info format conversions

We were converting between LLVM's old and new debug info format to work
around the incomplete new debug info format implementation in LLVM 19.
Now that it is (mostly) done, we no longer need that.
---
 .../compiler_pipeline/source/cl_builtin_info.cpp       | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp
index ec5c1356b2405..d56c4242ac391 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp
@@ -3585,16 +3585,6 @@ Function *CLBuiltinLoader::materializeBuiltin(StringRef BuiltinName,
       return nullptr;
     }
 
-#if LLVM_VERSION_GREATER_EQUAL(19, 0)
-    if (Current->IsNewDbgInfoFormat != BuiltinModule->IsNewDbgInfoFormat) {
-      if (BuiltinModule->IsNewDbgInfoFormat) {
-        Current->convertToNewDbgValues();
-      } else {
-        Current->convertFromNewDbgValues();
-      }
-    }
-#endif
-
     // Find any callees in the function and add them to the list.
     for (BasicBlock &BB : *Current) {
       for (Instruction &I : BB) {

From c5841ed6f9b1d9db594979d1e426e27b1edc51d1 Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Mon, 24 Jun 2024 11:14:36 +0100
Subject: [PATCH 114/182] Fix build with LLVM 19 again.

* Add missing header. We were using SmallString without including its
  header, relying on it being included implicitly through some other
  header. It no longer is included implicitly.
* Change llvm::unique_function to std::function. Again we were failing
  to include the appropriate header, but there was no reason for using
  llvm::unique_function in the first place. Though it has benefits over
  std::function in specific cases, that did not apply to what we were
  doing.
---
 .../compiler_passes/compiler_pipeline/source/barrier_regions.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
index 3affc145019cb..97b035374388a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
@@ -23,6 +23,7 @@
 #include <llvm/ADT/SetOperations.h>
 #include <llvm/ADT/SetVector.h>
 #include <llvm/ADT/SmallSet.h>
+#include <llvm/ADT/SmallString.h>
 #include <llvm/ADT/StringSet.h>
 #include <llvm/ADT/TinyPtrVector.h>
 #include <llvm/IR/Constants.h>

From bad94d18aa79e1d98f28532a19e3142500e1ef4e Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Mon, 1 Jul 2024 12:52:21 +0100
Subject: [PATCH 115/182] Fix build with LLVM 19 again.

We were using types such as llvm::Module without explicitly including
<llvm/IR/Module.h>, relying on some other header including that
implicitly. As a result of changes to LLVM 19's headers, we no longer
get the same implicit includes, so include the required headers (or add
the required declarations) explicitly.
---
 .../compiler_passes/compiler_pipeline/source/builtin_info.cpp | 1 +
 .../compiler_pipeline/source/cl_builtin_info.cpp              | 1 +
 .../compiler_pipeline/source/define_mux_builtins_pass.cpp     | 1 +
 .../compiler_pipeline/source/encode_kernel_metadata_pass.cpp  | 1 +
 .../compiler_pipeline/source/mux_builtin_info.cpp             | 1 +
 .../source/optimal_builtin_replacement_pass.cpp               | 1 +
 .../compiler_pipeline/source/prepare_barriers_pass.cpp        | 1 +
 .../source/replace_local_module_scope_variables_pass.cpp      | 1 +
 .../compiler_passes/compiler_pipeline/source/scheduling.cpp   | 1 +
 .../compiler_pipeline/source/sub_group_analysis.cpp           | 1 +
 .../compiler_pipeline/source/unique_opaque_structs_pass.cpp   | 1 +
 .../compiler_passes/vecz/include/vecz/pass.h                  | 1 +
 .../source/include/analysis/vectorizable_function_analysis.h  | 4 ++++
 .../compiler_passes/vecz/source/include/memory_operations.h   | 1 +
 .../source/include/transform/control_flow_conversion_pass.h   | 2 ++
 .../compiler_passes/vecz/tools/source/veczc.cpp               | 1 +
 16 files changed, 20 insertions(+)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/builtin_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/builtin_info.cpp
index 2a772bbd29b45..bdbc015e21e3f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/builtin_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/builtin_info.cpp
@@ -22,6 +22,7 @@
 #include <compiler/utils/scheduling.h>
 #include <llvm/ADT/StringExtras.h>
 #include <llvm/ADT/StringSwitch.h>
+#include <llvm/IR/Module.h>
 
 using namespace llvm;
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp
index d56c4242ac391..b37eea39b46eb 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp
@@ -24,6 +24,7 @@
 #include <llvm/IR/Attributes.h>
 #include <llvm/IR/IRBuilder.h>
 #include <llvm/IR/Intrinsics.h>
+#include <llvm/IR/Module.h>
 #include <llvm/Support/Compiler.h>
 #include <llvm/Support/Debug.h>
 #include <llvm/Support/Error.h>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/define_mux_builtins_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/define_mux_builtins_pass.cpp
index 75cf246e6065a..6acdfc09d02e4 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/define_mux_builtins_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/define_mux_builtins_pass.cpp
@@ -16,6 +16,7 @@
 
 #include <compiler/utils/builtin_info.h>
 #include <compiler/utils/define_mux_builtins_pass.h>
+#include <llvm/IR/Module.h>
 
 #define DEBUG_TYPE "define-mux-builtins"
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/encode_kernel_metadata_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/encode_kernel_metadata_pass.cpp
index cec28d87c6322..2bdc18a595d2a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/encode_kernel_metadata_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/encode_kernel_metadata_pass.cpp
@@ -17,6 +17,7 @@
 #include <compiler/utils/attributes.h>
 #include <compiler/utils/encode_kernel_metadata_pass.h>
 #include <compiler/utils/metadata.h>
+#include <llvm/IR/Module.h>
 
 using namespace llvm;
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mux_builtin_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mux_builtin_info.cpp
index 64644428940af..32c31877b0d35 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mux_builtin_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mux_builtin_info.cpp
@@ -23,6 +23,7 @@
 #include <compiler/utils/scheduling.h>
 #include <compiler/utils/target_extension_types.h>
 #include <llvm/IR/DerivedTypes.h>
+#include <llvm/IR/Module.h>
 #include <llvm/Support/ModRef.h>
 #include <multi_llvm/llvm_version.h>
 #include <multi_llvm/multi_llvm.h>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/optimal_builtin_replacement_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/optimal_builtin_replacement_pass.cpp
index b834a15eb6c3a..dec9f0958b229 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/optimal_builtin_replacement_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/optimal_builtin_replacement_pass.cpp
@@ -26,6 +26,7 @@
 #include <llvm/IR/Instructions.h>
 #include <llvm/IR/IntrinsicInst.h>
 #include <llvm/IR/Intrinsics.h>
+#include <llvm/IR/Module.h>
 #include <llvm/TargetParser/Triple.h>
 #include <multi_llvm/vector_type_helper.h>
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/prepare_barriers_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/prepare_barriers_pass.cpp
index 9220e0c9d2261..15b138e8a1af3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/prepare_barriers_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/prepare_barriers_pass.cpp
@@ -19,6 +19,7 @@
 #include <compiler/utils/prepare_barriers_pass.h>
 #include <llvm/ADT/SmallPtrSet.h>
 #include <llvm/IR/Instructions.h>
+#include <llvm/IR/Module.h>
 #include <llvm/Transforms/Utils/Cloning.h>
 #include <multi_llvm/multi_llvm.h>
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp
index 95bf6fb008e60..d45cdd5fcbce3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp
@@ -25,6 +25,7 @@
 #include <llvm/IR/DIBuilder.h>
 #include <llvm/IR/IRBuilder.h>
 #include <llvm/IR/Instructions.h>
+#include <llvm/IR/Module.h>
 #include <llvm/Transforms/Utils/Cloning.h>
 #include <multi_llvm/vector_type_helper.h>
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/scheduling.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/scheduling.cpp
index 96657c945cc83..4301ce93cabfc 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/scheduling.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/scheduling.cpp
@@ -20,6 +20,7 @@
 #include <llvm/IR/Attributes.h>
 #include <llvm/IR/Function.h>
 #include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/Module.h>
 #include <sys/types.h>
 
 using namespace llvm;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/sub_group_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/sub_group_analysis.cpp
index a4b20adc3b18e..bbbf353517ff3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/sub_group_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/sub_group_analysis.cpp
@@ -19,6 +19,7 @@
 #include <compiler/utils/sub_group_analysis.h>
 #include <llvm/ADT/PriorityWorklist.h>
 #include <llvm/ADT/SetOperations.h>
+#include <llvm/IR/Module.h>
 
 using namespace llvm;
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/unique_opaque_structs_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/unique_opaque_structs_pass.cpp
index eff301b86f1f5..0f630076d6884 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/unique_opaque_structs_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/unique_opaque_structs_pass.cpp
@@ -23,6 +23,7 @@
 #include <llvm/ADT/SmallVector.h>
 #include <llvm/IR/Function.h>
 #include <llvm/IR/Instructions.h>
+#include <llvm/IR/Module.h>
 #include <llvm/Support/Casting.h>
 #include <llvm/Support/raw_ostream.h>
 #include <llvm/Transforms/Utils/Cloning.h>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/pass.h
index 7d439fa8c4f06..af6742390abfc 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/pass.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/pass.h
@@ -22,6 +22,7 @@
 #define VECZ_PASS_H
 
 #include <compiler/utils/vectorization_factor.h>
+#include <llvm/IR/Function.h>
 #include <llvm/IR/PassManager.h>
 
 #include <cstdint>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorizable_function_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorizable_function_analysis.h
index 61fcfe4ed66ec..230e5aa883919 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorizable_function_analysis.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorizable_function_analysis.h
@@ -24,6 +24,10 @@
 #include <llvm/ADT/StringRef.h>
 #include <llvm/IR/PassManager.h>
 
+namespace llvm {
+class Value;
+}
+
 namespace vecz {
 
 /// @brief Determines whether vectorization of a function is possible.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/memory_operations.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/memory_operations.h
index ddb2bfcb9915b..d1e84515ce732 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/memory_operations.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/memory_operations.h
@@ -24,6 +24,7 @@
 #include <inttypes.h>
 #include <llvm/IR/IRBuilder.h>
 #include <llvm/IR/LLVMContext.h>
+#include <llvm/IR/Module.h>
 
 #include <optional>
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/control_flow_conversion_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/control_flow_conversion_pass.h
index 417f57a218516..2f2991280e692 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/control_flow_conversion_pass.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/control_flow_conversion_pass.h
@@ -27,7 +27,9 @@
 #include <memory>
 
 namespace llvm {
+class BasicBlock;
 class Function;
+class Instruction;
 class Value;
 class DominatorTree;
 class PostDominatorTree;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp
index 491086842da18..9ea491d9c1e66 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp
@@ -29,6 +29,7 @@
 #include <llvm/Bitcode/BitcodeReader.h>
 #include <llvm/Bitcode/BitcodeWriterPass.h>
 #include <llvm/IR/LegacyPassManager.h>
+#include <llvm/IR/Module.h>
 #include <llvm/IRPrinter/IRPrintingPasses.h>
 #include <llvm/IRReader/IRReader.h>
 #include <llvm/InitializePasses.h>

From 2007e5348b3829fe378137312dd656ef2d1407e6 Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Thu, 29 Aug 2024 14:03:11 +0100
Subject: [PATCH 116/182] [LLVM 20] Adjust irreducible_loop test

LLVM 20 is able to produce simpler IR. Handle this in our checks.
---
 .../vecz/test/lit/llvm/irreducible_loop.ll    | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/irreducible_loop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/irreducible_loop.ll
index 0cbf1c85c0ee5..e6d958c7e826d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/irreducible_loop.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/irreducible_loop.ll
@@ -49,20 +49,17 @@ declare i64 @__mux_get_global_id(i32)
 
 ; CHECK: define spir_kernel void @__vecz_v4_irreducible_loop
 ; CHECK: entry:
-; CHECK:   br label %irr.guard.outer
+; CHECK-LT20:   br label %irr.guard.outer
 
-; CHECK: irr.guard.outer:                                  ; preds = %irr.guard.pure_exit, %entry
+; CHECK-LT20: irr.guard.outer:                                  ; preds = %irr.guard.pure_exit, %entry
 ; CHECK:   br label %irr.guard
 
-; LLVM 16 re-orders the Basic Blocks, without any change to the CFG.
-; CHECK-LE15: irr.guard.pure_exit:                              ; preds = %irr.guard
-; CHECK-LE15:   br i1 %{{.+}}, label %do.end, label %irr.guard.outer
+; CHECK-LT20: do.end:                                           ; preds = %irr.guard.pure_exit
+; CHECK-LT20:   ret void
 
-; CHECK: do.end:                                           ; preds = %irr.guard.pure_exit
-; CHECK:   ret void
-
-; CHECK: irr.guard:                                        ; preds = %irr.guard, %irr.guard.outer
+; CHECK: irr.guard:
 ; CHECK:   br i1 %{{.+}}, label %irr.guard.pure_exit, label %irr.guard
 
-; CHECK-GT15: irr.guard.pure_exit:                              ; preds = %irr.guard
-; CHECK-GT15:   br i1 %{{.+}}, label %do.end, label %irr.guard.outer
+; CHECK: irr.guard.pure_exit:                              ; preds = %irr.guard
+; CHECK-LT20:   br i1 %{{.+}}, label %do.end, label %irr.guard.outer
+; CHECK-GE20:   ret void

From 8f8bef4e698a5c6c53e9530128022db5228ab3aa Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Thu, 29 Aug 2024 17:44:25 +0100
Subject: [PATCH 117/182] [LLVM 20] Adjust for non-experimental builtins

The llvm.experimental.stepvector intrinsics are no longer experimental
in LLVM 20 and are now named llvm.stepvector. Adjust tests to allow
both the llvm.stepvector and llvm.experimental.stepvector names.
---
 .../test/lit/llvm/ScalableVectors/broadcast_vector.ll     | 6 +++---
 .../lit/llvm/ScalableVectors/define_interleaved_store.ll  | 2 +-
 .../ScalableVectors/define_interleaved_store_as_masked.ll | 2 +-
 .../lit/llvm/ScalableVectors/define_subgroup_scans.ll     | 4 ++--
 .../lit/llvm/ScalableVectors/define_subgroup_scans_vp.ll  | 8 ++++----
 .../vecz/test/lit/llvm/ScalableVectors/extract_element.ll | 4 ++--
 .../vecz/test/lit/llvm/ScalableVectors/insert_element.ll  | 2 +-
 .../test/lit/llvm/ScalableVectors/interleaved_load.ll     | 2 +-
 .../lit/llvm/ScalableVectors/packetize_mask_varying.ll    | 2 +-
 .../test/lit/llvm/ScalableVectors/select_scalar_vector.ll | 2 +-
 .../vecz/test/lit/llvm/ScalableVectors/shuffle.ll         | 2 +-
 .../test/lit/llvm/ScalableVectors/subgroup_builtins.ll    | 2 +-
 .../vecz/test/lit/llvm/ScalableVectors/workitem_funcs.ll  | 2 +-
 .../VectorPredication/define_interleaved_load_store.ll    | 4 ++--
 14 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll
index d43996696bcb4..3fdb36fd09c5b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll
@@ -107,7 +107,7 @@ entry:
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[FIXLEN_ALLOC:%.*]] = alloca <4 x float>, align 16
 ; CHECK-NEXT:    store <4 x float> [[ADDEND:%.*]], ptr [[FIXLEN_ALLOC]], align 16
-; CHECK-NEXT:    [[IDX0:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
+; CHECK-NEXT:    [[IDX0:%.*]] = call <vscale x 16 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv16i32()
 ; CHECK-NEXT:    [[IDX1:%.*]] = and <vscale x 16 x i32> [[IDX0]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 3, {{(i32|i64)}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
 ; CHECK-NEXT:    [[TMP0:%.*]] = {{s|z}}ext{{( nneg)?}} <vscale x 16 x i32> [[IDX1]] to <vscale x 16 x i64>
 ; CHECK-NEXT:    [[VEC_ALLOC:%.*]] = getelementptr inbounds float, ptr [[FIXLEN_ALLOC]], <vscale x 16 x i64> [[TMP0]]
@@ -143,7 +143,7 @@ entry:
 ; CHECK-NEXT:    [[FIXLEN_ALLOC:%.*]] = alloca <4 x i32>, align 16
 ; CHECK-NEXT:    [[FIXLEN_ALLOC1:%.*]] = alloca <4 x float>, align 16
 ; CHECK-NEXT:    store <4 x float> [[ADDEND:%.*]], ptr [[FIXLEN_ALLOC1]], align 16
-; CHECK-NEXT:    [[IDX03:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
+; CHECK-NEXT:    [[IDX03:%.*]] = call <vscale x 16 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv16i32()
 ; CHECK-NEXT:    [[IDX14:%.*]] = and <vscale x 16 x i32> [[IDX03]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 3, {{i32|i64}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
 ; CHECK-NEXT:    [[TMP0:%.*]] = {{s|z}}ext{{( nneg)?}} <vscale x 16 x i32> [[IDX14]] to <vscale x 16 x i64>
 ; CHECK-NEXT:    [[VEC_ALLOC5:%.*]] = getelementptr inbounds float, ptr [[FIXLEN_ALLOC1]], <vscale x 16 x i64> [[TMP0]]
@@ -168,7 +168,7 @@ entry:
 ; CHECK-LABEL: @__vecz_nxv4_vector_mask_broadcast(
 ; CHECK-NEXT:  entry:
 ; CHECK:    [[FIXLEN_MASK_ALLOC:%.*]] = alloca <4 x i8>, align 4
-; CHECK:    [[IDX0:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
+; CHECK:    [[IDX0:%.*]] = call <vscale x 16 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv16i32()
 ; CHECK:    [[IDX1:%.*]] = and <vscale x 16 x i32> [[IDX0]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 3, {{i32|i64}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
 ; CHECK:    [[SEXT:%.*]] = sext <4 x i1> [[INPUT:%.*]] to <4 x i8>
 ; CHECK:    store <4 x i8> [[SEXT]], ptr [[FIXLEN_MASK_ALLOC]], align 4
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store.ll
index 1d8a25b9fc14f..6c0be75737891 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store.ll
@@ -58,7 +58,7 @@ declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double
 ; CHECK: entry:
 ; CHECK: %BroadcastAddr.splatinsert = insertelement <vscale x 4 x ptr addrspace(1)> {{poison|undef}}, ptr addrspace(1) %1, {{i32|i64}} 0
 ; CHECK: %BroadcastAddr.splat = shufflevector <vscale x 4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <vscale x 4 x ptr addrspace(1)> {{poison|undef}}, <vscale x 4 x i32> zeroinitializer
-; CHECK: %2 = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; CHECK: %2 = call <vscale x 4 x i64> @llvm.{{(experimental\.)?}}stepvector.nxv4i64()
 ; CHECK: %3 = mul <vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 4, {{i32|i64}} 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer), %2
 ; CHECK: %4 = getelementptr double, <vscale x 4 x ptr addrspace(1)> %BroadcastAddr.splat, <vscale x 4 x i64> %3
 ; CHECK: call void @llvm.masked.scatter.nxv4f64.nxv4p1(<vscale x 4 x double> %0, <vscale x 4 x ptr addrspace(1)> %4, i32{{( immarg)?}} 8, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, {{i32|i64}} 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store_as_masked.ll
index b1199cf8423d0..f6a350a65b384 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store_as_masked.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store_as_masked.ll
@@ -57,7 +57,7 @@ declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double
 ; CHECK: entry:
 ; CHECK:   %BroadcastAddr.splatinsert = insertelement <vscale x 4 x ptr addrspace(1)> poison, ptr addrspace(1) %1, {{i32|i64}} 0
 ; CHECK:   %BroadcastAddr.splat = shufflevector <vscale x 4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <vscale x 4 x ptr addrspace(1)> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK:   %2 = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; CHECK:   %2 = call <vscale x 4 x i64> @llvm.{{(experimental\.)?}}stepvector.nxv4i64()
 ; CHECK:   %3 = mul <vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 4, {{i32|i64}} 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer), %2
 ; CHECK:   %4 = getelementptr double, <vscale x 4 x ptr addrspace(1)> %BroadcastAddr.splat, <vscale x 4 x i64> %3
 ; CHECK:   call void @llvm.masked.scatter.nxv4f64.nxv4p1(<vscale x 4 x double> %0, <vscale x 4 x ptr addrspace(1)> %4, i32 immarg 8, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, {{i32|i64}} 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans.ll
index 830a0ae5d4370..12b2856ce481d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans.ll
@@ -42,7 +42,7 @@ declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_add_u5nxv4j(<vscal
 ; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_add_u5nxv4j(<vscale x 4 x i32>{{.*}}) {
 ; CHECK: entry:
 ; CHECK:   %[[SHUFFLE_ALLOC:.+]] = alloca <vscale x 4 x i32>
-; CHECK:   %[[STEP:.+]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; CHECK:   %[[STEP:.+]] = call <vscale x 4 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv4i32()
 ; CHECK:   %[[SCALE:.+]] = call i32 @llvm.vscale.i32()
 ; CHECK:   %[[SIZE:.+]] = mul i32 %[[SCALE]], 4
 ; CHECK:   br label %loop
@@ -77,7 +77,7 @@ declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_add_u5nxv4j(<vscal
 ; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_add_u5nxv4j(<vscale x 4 x i32>{{.*}}) {
 ; CHECK: entry:
 ; CHECK:   %[[SHUFFLE_ALLOC:.+]] = alloca <vscale x 4 x i32>
-; CHECK:   %[[STEP:.+]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; CHECK:   %[[STEP:.+]] = call <vscale x 4 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv4i32()
 ; CHECK:   %[[SCALE:.+]] = call i32 @llvm.vscale.i32()
 ; CHECK:   %[[SIZE:.+]] = mul i32 %[[SCALE]], 4
 ; CHECK:   br label %loop
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans_vp.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans_vp.ll
index abe654d8cdb0d..f99394619e9c6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans_vp.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans_vp.ll
@@ -42,7 +42,7 @@ declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4jj(<v
 ; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4jj(<vscale x 4 x i32>{{.*}}, i32{{.*}}) {
 ; CHECK: entry:
 ; CHECK:   %[[SHUFFLE_ALLOC:.+]] = alloca <vscale x 4 x i32>
-; CHECK:   %[[STEP:.+]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; CHECK:   %[[STEP:.+]] = call <vscale x 4 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv4i32()
 ; CHECK:   br label %loop
 ; CHECK: loop:
 ; CHECK:   %[[IV:.+]] = phi i32 [ 1, %entry ], [ %[[N2:.+]], %loop ]
@@ -56,7 +56,7 @@ declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4jj(<v
 ; CHECK:   store <vscale x 4 x i32> %[[VEC]], {{(<vscale x 4 x i32>\*)|(ptr)}} %[[SHUFFLE_ALLOC]]
 ;------- there will be a bitcast here if pointers are typed
 ; CHECK:   %[[INDEX:.+]] = getelementptr inbounds i32, [[PTRTY:(i32\*)|ptr]] %{{.+}}, <vscale x 4 x i32> %[[MASK]]
-; CHECK:   %[[VLSTEP:.+]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; CHECK:   %[[VLSTEP:.+]] = call <vscale x 4 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv4i32()
 ; CHECK:   %[[VLINS:.+]] = insertelement <vscale x 4 x i32> poison, i32 %1, {{i32|i64}} 0
 ; CHECK:   %[[VLSPLAT:.+]] = shufflevector <vscale x 4 x i32> %[[VLINS]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK:   %[[VLMASK:.+]] = icmp ult <vscale x 4 x i32> %3, %[[VLSPLAT]]
@@ -79,7 +79,7 @@ declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_add_vp_u5nxv4jj(<v
 ; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_add_vp_u5nxv4jj(<vscale x 4 x i32>{{.*}}, i32{{.*}}) {
 ; CHECK: entry:
 ; CHECK:   %[[SHUFFLE_ALLOC:.+]] = alloca <vscale x 4 x i32>
-; CHECK:   %[[STEP:.+]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; CHECK:   %[[STEP:.+]] = call <vscale x 4 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv4i32()
 ; CHECK:   br label %loop
 ; CHECK: loop:
 ; CHECK:   %[[IV:.+]] = phi i32 [ 1, %entry ], [ %[[N2:.+]], %loop ]
@@ -93,7 +93,7 @@ declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_add_vp_u5nxv4jj(<v
 ; CHECK:   store <vscale x 4 x i32> %[[VEC]], {{(<vscale x 4 x i32>\*)|(ptr)}} %[[SHUFFLE_ALLOC]]
 ;------- there will be a bitcast here if pointers are typed
 ; CHECK:   %[[INDEX:.+]] = getelementptr inbounds i32, [[PTRTY:(i32\*)|ptr]] %{{.+}}, <vscale x 4 x i32> %[[MASK]]
-; CHECK:   %[[VLSTEP:.+]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; CHECK:   %[[VLSTEP:.+]] = call <vscale x 4 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv4i32()
 ; CHECK:   %[[VLINS:.+]] = insertelement <vscale x 4 x i32> poison, i32 %1, {{i32|i64}} 0
 ; CHECK:   %[[VLSPLAT:.+]] = shufflevector <vscale x 4 x i32> %[[VLINS]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK:   %[[VLMASK:.+]] = icmp ult <vscale x 4 x i32> %3, %[[VLSPLAT]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/extract_element.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/extract_element.ll
index 27f7b54eb6442..d5a92d13b44ae 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/extract_element.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/extract_element.ll
@@ -112,7 +112,7 @@ entry:
 ; EE-UNI-VEC-LABEL: @__vecz_nxv4_extract_element_uniform_vec(
 ; EE-UNI-VEC: [[T3:%.*]] = insertelement <vscale x 4 x i64> poison, i64 %call, {{(i32|i64)}} 0
 ; EE-UNI-VEC: [[T4:%.*]] = shufflevector <vscale x 4 x i64> [[T3]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; EE-UNI-VEC: [[STEP:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; EE-UNI-VEC: [[STEP:%.*]] = call <vscale x 4 x i64> @llvm.{{(experimental\.)?}}stepvector.nxv4i64()
 ; EE-UNI-VEC: [[T5:%.*]] = add <vscale x 4 x i64> [[T4]], [[STEP]]
 ; EE-UNI-VEC: [[MOD:%.*]] = and <vscale x 4 x i64> [[T5]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> {{(undef|poison)}}, i64 3, {{(i32|i64)}} 0), <vscale x 4 x i64> {{(undef|poison)}}, <vscale x 4 x i32> zeroinitializer)
 ; EE-UNI-VEC: [[T6:%.*]] = shl <vscale x 4 x i64> [[STEP]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> {{(undef|poison)}}, i64 2, {{(i32|i64)}} 0), <vscale x 4 x i64> {{(undef|poison)}}, <vscale x 4 x i32> zeroinitializer)
@@ -130,7 +130,7 @@ entry:
 ; EE-INDICES: [[T2:%.*]] = load <vscale x 4 x i32>, ptr addrspace(1) [[T0]], align 4
 ; EE-INDICES: [[T3:%.*]] = and <vscale x 4 x i32> [[T2]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> {{(undef|poison)}}, i32 3, {{i32|i64}} 0), <vscale x 4 x i32> {{(undef|poison)}}, <vscale x 4 x i32> zeroinitializer)
 ; EE-INDICES: store <vscale x 16 x float> {{.*}}, ptr [[ALLOC]], align 64
-; EE-INDICES: [[STEP:%.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; EE-INDICES: [[STEP:%.*]] = call <vscale x 4 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv4i32()
 ; EE-INDICES: [[T4:%.*]] = shl <vscale x 4 x i32> [[STEP]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> {{(undef|poison)}}, i32 2, {{i32|i64}} 0), <vscale x 4 x i32> {{(undef|poison)}}, <vscale x 4 x i32> zeroinitializer)
 ; EE-INDICES: [[T5:%.*]] = {{add|or}} {{(disjoint )?}}<vscale x 4 x i32> [[T4]], [[T3]]
 ; EE-INDICES: [[IDX:%.*]] = sext <vscale x 4 x i32> [[T5]] to <vscale x 4 x i64>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/insert_element.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/insert_element.ll
index 1712e6dd4579a..8b4bbb4590935 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/insert_element.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/insert_element.ll
@@ -98,7 +98,7 @@ entry:
 ; IE-INDICES: [[ALLOC:%.*]] = alloca <vscale x 16 x float>, align 64
 ; IE-INDICES: [[VAL:%.*]] = uitofp <vscale x 4 x i64> {{%.*}} to <vscale x 4 x float>
 ; IE-INDICES: store <vscale x 16 x float> {{%.*}}, ptr [[ALLOC]], align 64
-; IE-INDICES: [[T1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; IE-INDICES: [[T1:%.*]] = call <vscale x 4 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv4i32()
 ; IE-INDICES: [[T2:%.*]] = shl <vscale x 4 x i32> [[T1]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> {{(undef|poison)}}, i32 2, {{(i32|i64)}} 0), <vscale x 4 x i32> {{(undef|poison)}}, <vscale x 4 x i32> zeroinitializer)
 
 ; LLVM 16 deduces add/or equivalence and uses `or` instead.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/interleaved_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/interleaved_load.ll
index 1bff13b48c65a..d4f4c5339754d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/interleaved_load.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/interleaved_load.ll
@@ -50,7 +50,7 @@ declare i64 @__mux_get_global_id(i32)
 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <vscale x 4 x ptr addrspace(1)> [[TMP0]], <vscale x 4 x ptr addrspace(1)> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[ARG2]], {{i32|i64}} 0
 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <vscale x 4 x i64> [[TMP2]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 4 x i64> @llvm.{{(experimental\.)?}}stepvector.nxv4i64()
 ; CHECK-NEXT: [[TMP5:%.*]] = mul <vscale x 4 x i64> [[TMP3]], [[TMP4]]
 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, <vscale x 4 x ptr addrspace(1)> [[TMP1]], <vscale x 4 x i64> [[TMP5]]
 ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p1(<vscale x 4 x i32> [[ARG0]], <vscale x 4 x ptr addrspace(1)> [[TMP6]], i32 immarg 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, {{i32|i64}} 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)) [[MASKED_ATTRS:#[0-9]+]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/packetize_mask_varying.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/packetize_mask_varying.ll
index b28ea35060d36..da232d896b93f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/packetize_mask_varying.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/packetize_mask_varying.ll
@@ -38,7 +38,7 @@ if.then:
 if.end:
   ret void
 ; CHECK: define spir_kernel void @__vecz_nxv4_mask_varying
-; CHECK: [[idx0:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
+; CHECK: [[idx0:%.*]] = call <vscale x 16 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv16i32()
 ; CHECK: [[idx1:%.*]] = lshr <vscale x 16 x i32> [[idx0]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 2, {{(i32|i64)}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
 
 ; Note that since we just did a lshr 2 on the input of the extend, it doesn't
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select_scalar_vector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select_scalar_vector.ll
index d6242cfe70392..0d2fb8b16c8ee 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select_scalar_vector.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select_scalar_vector.ll
@@ -43,7 +43,7 @@ entry:
 ; CHECK: [[cmp1:%.*]] = icmp slt <vscale x 4 x i32>
 ; CHECK: [[sext:%.*]] = sext <vscale x 4 x i1> [[cmp1]] to <vscale x 4 x i8>
 ; CHECK: store <vscale x 4 x i8> [[sext]], ptr [[alloc:%.*]], align 4
-; CHECK: [[idx0:%.*]] = call <vscale x 8 x i32> @llvm.experimental.stepvector.nxv8i32()
+; CHECK: [[idx0:%.*]] = call <vscale x 8 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv8i32()
 ; CHECK: [[idx1:%.*]] = lshr <vscale x 8 x i32> [[idx0]], shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> {{(undef|poison)}}, i32 1, {{(i32|i64)}} 0), <vscale x 8 x i32> {{(undef|poison)}}, <vscale x 8 x i32> zeroinitializer)
 
 ; Note that since we just did a lshr 1 on the input of the extend, it doesn't
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/shuffle.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/shuffle.ll
index 0c4a3b5a5b1d9..332769b3a0f60 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/shuffle.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/shuffle.ll
@@ -32,7 +32,7 @@ define spir_kernel void @do_shuffle_splat(i32* %aptr, <4 x i32>* %bptr, <4 x i32
   store <4 x i32> %splat, <4 x i32>* %arrayidxz
   ret void
 ; CHECK: define spir_kernel void @__vecz_nxv4_do_shuffle_splat
-; CHECK: [[idx0:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
+; CHECK: [[idx0:%.*]] = call <vscale x 16 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv16i32()
 ; CHECK: [[idx1:%.*]] = lshr <vscale x 16 x i32> [[idx0]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 2, {{(i32|i64)}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
 
 ; Note that since we just did a lshr 2 on the input of the extend, it doesn't
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll
index 55ff8ee65aaa1..6100a08e88757 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll
@@ -51,7 +51,7 @@ define spir_kernel void @get_sub_group_local_id(i32 addrspace(1)* %in, i32 addrs
 ; CHECK: [[MUL:%.*]] = mul i32 %call, [[SHL]]
 ; CHECK: [[SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[MUL]], i64 0
 ; CHECK: [[SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK: [[STEPVEC:%.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; CHECK: [[STEPVEC:%.*]] = call <vscale x 4 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv4i32()
 ; CHECK: [[LID:%.*]] = add <vscale x 4 x i32> [[SPLAT]], [[STEPVEC]]
 ; CHECK: [[EXT:%.*]] = sext i32 %call to i64
 ; CHECK: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %out, i64 [[EXT]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/workitem_funcs.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/workitem_funcs.ll
index 30d440b5ea5e2..b28940d1204a5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/workitem_funcs.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/workitem_funcs.ll
@@ -37,5 +37,5 @@ entry:
 declare i64 @__mux_get_global_id(i32)
 
 ; CHECK: define spir_kernel void @__vecz_nxv4_store_ult
-; CHECK:   [[step:%[0-9.a-z]+]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; CHECK:   [[step:%[0-9.a-z]+]] = call <vscale x 4 x i64> @llvm.{{(experimental\.)?}}stepvector.nxv4i64()
 ; CHECK:   %{{.*}} = add <vscale x 4 x i64> %{{.*}}, [[step]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_interleaved_load_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_interleaved_load_store.ll
index 2d01057a6170e..bf8a3f08a104c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_interleaved_load_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_interleaved_load_store.ll
@@ -58,7 +58,7 @@ declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double
 ; CHECK: entry:
 ; CHECK:   %BroadcastAddr.splatinsert = insertelement <vscale x 4 x ptr addrspace(1)> poison, ptr addrspace(1) %0, {{i32|i64}} 0
 ; CHECK:   %BroadcastAddr.splat = shufflevector <vscale x 4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <vscale x 4 x ptr addrspace(1)> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK:   %3 = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; CHECK:   %3 = call <vscale x 4 x i64> @llvm.{{(experimental\.)?}}stepvector.nxv4i64()
 ; CHECK:   %4 = mul <vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 4, {{i32|i64}} 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer), %3
 ; CHECK:   %5 = getelementptr double, <vscale x 4 x ptr addrspace(1)> %BroadcastAddr.splat, <vscale x 4 x i64> %4
 ; CHECK:   %6 = call <vscale x 4 x double> @llvm.vp.gather.nxv4f64.nxv4p1(<vscale x 4 x ptr addrspace(1)> %5, <vscale x 4 x i1> %1, i32 %2)
@@ -72,7 +72,7 @@ declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double
 ; CHECK: entry:
 ; CHECK:  %BroadcastAddr.splatinsert = insertelement <vscale x 4 x ptr addrspace(1)> poison, ptr addrspace(1) %1, {{i32|i64}} 0
 ; CHECK:  %BroadcastAddr.splat = shufflevector <vscale x 4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <vscale x 4 x ptr addrspace(1)> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK:  %4 = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; CHECK:  %4 = call <vscale x 4 x i64> @llvm.{{(experimental\.)?}}stepvector.nxv4i64()
 ; CHECK:  %5 = mul <vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 4, {{i32|i64}} 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer), %4
 ; CHECK:  %6 = getelementptr double, <vscale x 4 x ptr addrspace(1)> %BroadcastAddr.splat, <vscale x 4 x i64> %5
 ; CHECK:  call void @llvm.vp.scatter.nxv4f64.nxv4p1(<vscale x 4 x double> %0, <vscale x 4 x ptr addrspace(1)> %6, <vscale x 4 x i1> %2, i32 %3)

From a93c2cca8fafd76d847d66f5695184ef5f635547 Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Mon, 2 Sep 2024 14:12:10 +0100
Subject: [PATCH 118/182] [LLVM 20] Adjust packetize_mask_varying test

LLVM 20 is able to figure out that the mask for this load is
unnecessary, as it is always a splat of true. Adjust the test to allow
for this.
---
 .../llvm/VectorPredication/packetize_mask_varying.ll   | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/packetize_mask_varying.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/packetize_mask_varying.ll
index 6fc5db369a69f..91d9b9c3f22ec 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/packetize_mask_varying.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/packetize_mask_varying.ll
@@ -14,8 +14,8 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; REQUIRES: llvm-13+
-; RUN: veczc -k mask_varying -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | FileCheck %s
+; RUN: %pp-llvm-ver -o %t < %s --llvm-ver %LLVMVER
+; RUN: veczc -k mask_varying -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | FileCheck %t
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -40,8 +40,10 @@ if.end:
 ; CHECK: define spir_kernel void @__vecz_nxv4_vp_mask_varying
 ; CHECK: [[CMP:%.*]] = icmp slt <vscale x 4 x i64> %{{.*}},
 ; CHECK: [[RED:%.*]] = call i1 @llvm.vp.reduce.or.nxv4i1(i1 false, <vscale x 4 x i1> [[CMP]], {{.*}}, i32 {{.*}})
-; CHECK: [[REINS:%.*]] = insertelement <4 x i1> poison, i1 [[RED]], {{(i32|i64)}} 0
-; CHECK: [[RESPLAT:%.*]] = shufflevector <4 x i1> [[REINS]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-LT20: [[REINS:%.*]] = insertelement <4 x i1> poison, i1 [[RED]], {{(i32|i64)}} 0
+; CHECK-LT20: [[RESPLAT:%.*]] = shufflevector <4 x i1> [[REINS]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-LT20: [[VAL:%.*]] = call <4 x i32> @__vecz_b_masked_load16_Dv4_ju3ptrDv4_b(ptr %aptr, <4 x i1> [[RESPLAT]])
+; CHECK-GE20: [[VAL:%.*]] = load <4 x i32>, ptr %aptr
 }
 
 declare i64 @__mux_get_global_id(i32)

From 2e6fae4779aadb5ff201189a40ae5d7d921eee0b Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Thu, 5 Sep 2024 18:45:43 +0100
Subject: [PATCH 119/182] [vecz] Avoid scalarization using large ints

When we scalarize, when needed, we bitcast values to integer types,
shift and mask as needed, and then bitcast to the destination type. This
is not generally valid: when any source element is poison and it is
inserted into a full-width integer type, the full integer becomes
poison, and it is then not possible to mask out the poison bits.

This commit avoids creating the full-width integer by bitcasting
individual elements of the source vector, which not only avoids the bug
but also generally results in simpler shorter code.

This commit also removes some dead code from scalarizer.cpp. UndefValue
is already handled by the code immediately above.
---
 .../vecz/source/include/simd_packet.h         |   3 +-
 .../vecz/source/simd_packet.cpp               |   2 -
 .../vecz/source/transform/scalarizer.cpp      | 102 ++++++++-----
 .../vecz/test/lit/llvm/scalarize-bitcast.ll   | 135 ++++++++++++++++++
 4 files changed, 199 insertions(+), 43 deletions(-)
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-bitcast.ll

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/simd_packet.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/simd_packet.h
index 1c7bf61e4e2ab..220b74e8424c3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/simd_packet.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/simd_packet.h
@@ -72,8 +72,7 @@ struct PacketMask {
 
 /// @brief Packet of LLVM values (e.g. instructions), one for each SIMD lane.
 struct SimdPacket : public llvm::SmallVector<llvm::Value *, 4> {
-  /// @brief Create a new packet with no value and all lanes disabled.
-  SimdPacket();
+  using SmallVector::SmallVector;
 
   /// @brief Return the value at the given index.
   /// @param[in] Index Index of the value to return.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/simd_packet.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/simd_packet.cpp
index 7c834d3f5cb5d..0f31e329086d3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/simd_packet.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/simd_packet.cpp
@@ -21,8 +21,6 @@
 using namespace llvm;
 using namespace vecz;
 
-SimdPacket::SimdPacket() : Mask(0) {}
-
 llvm::Value *SimdPacket::at(unsigned Index) const {
   if (Index >= size()) {
     return nullptr;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
index 6b678b8249d3a..a95910f6e228b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
@@ -625,18 +625,6 @@ SimdPacket *Scalarizer::extractLanes(llvm::Value *V, PacketMask PM) {
     return P;
   }
 
-  if (isa<UndefValue>(V)) {
-    Value *ScalarUndef = UndefValue::get(VecTy->getElementType());
-    SimdPacket *P = getPacket(V, SimdWidth);
-    for (unsigned i = 0; i < SimdWidth; i++) {
-      if (!PM.isEnabled(i) || P->at(i)) {
-        continue;
-      }
-      P->set(i, ScalarUndef);
-    }
-    return P;
-  }
-
   Instruction *insert = nullptr;
 
   if (auto *Arg = dyn_cast<Argument>(V)) {
@@ -1090,46 +1078,82 @@ SimdPacket *Scalarizer::scalarizeBitCast(BitCastInst *BC, PacketMask PM) {
   VECZ_STAT_FAIL_IF(Vec3Src ^ Vec3Dst, VeczScalarizeFailBitcast);
 
   // Handle non-vector -> vector casts and vector casts with different widths.
-  // This is done by casting the source to an integer and doing bitwise
-  // extractions with ANDs and shifts.
   if (!VecSrcTy || (VecSrcTy->getNumElements() != SimdWidth)) {
-    Type *SrcAsIntTy = SrcTy;
-    Value *SrcAsInt = Src;
-    Type *DstEleTy = VecDstTy->getElementType();
-    Type *DstEleAsIntTy = DstEleTy;
-    const uint64_t SrcBits = SrcTy->getPrimitiveSizeInBits();
-    const uint64_t LaneBits = DstEleTy->getPrimitiveSizeInBits();
-    if (!SrcTy->isIntegerTy()) {
-      SrcAsIntTy = SrcTy->getIntNTy(BC->getContext(), SrcBits);
-      SrcAsInt = B.CreateBitCast(SrcAsInt, SrcAsIntTy);
-      SrcAsInt = scalarizeOperands(cast<Instruction>(SrcAsInt));
-    }
-    if (!DstEleTy->isIntegerTy()) {
-      DstEleAsIntTy = IntegerType::get(BC->getContext(), LaneBits);
-    }
-
+    VECZ_FAIL_IF(BC->getModule()->getDataLayout().isBigEndian());
+
+    // Treat scalars as vectors of length 1.
+    SimdPacket SrcScalar{Src};
+    SimdPacket &S =
+        VecSrcTy ? *getPacket(Src, VecSrcTy->getNumElements()) : SrcScalar;
+    Type *const SrcEleTy = VecSrcTy ? VecSrcTy->getElementType() : SrcTy;
+    // Source element need not be a primitive if it was a non-vector, but in
+    // that case we know the size must match the destination vector type.
+    const size_t SrcEleSize = VecSrcTy ? SrcEleTy->getPrimitiveSizeInBits()
+                                       : VecDstTy->getPrimitiveSizeInBits();
+    Type *const SrcEleIntTy =
+        SrcEleTy->isIntegerTy()
+            ? SrcEleTy
+            : SrcEleTy->getIntNTy(BC->getContext(),
+                                  SrcEleTy->getPrimitiveSizeInBits());
+    Type *const DstEleTy = VecDstTy->getElementType();
+    const size_t DstEleSize = DstEleTy->getPrimitiveSizeInBits();
+    Type *const DstEleIntTy =
+        DstEleTy->isIntegerTy()
+            ? DstEleTy
+            : DstEleTy->getIntNTy(BC->getContext(),
+                                  DstEleTy->getPrimitiveSizeInBits());
     SimdPacket *P = getPacket(BC, SimdWidth);
+    PacketMask SPM;
     for (unsigned i = 0; i < SimdWidth; i++) {
       if (!PM.isEnabled(i) || P->at(i)) {
         continue;
       }
-      APInt LaneMask(SrcBits, 1);
-      LaneMask = LaneMask.shl(LaneBits);
-      LaneMask -= APInt(SrcBits, 1);
-      LaneMask = LaneMask.shl(i * LaneBits);
-      Value *LaneMaskVal = ConstantInt::get(SrcAsIntTy, LaneMask);
-      Value *Lane = B.CreateAnd(SrcAsInt, LaneMaskVal);
-      Lane = B.CreateLShr(Lane, LaneBits * i);
-      Lane = B.CreateTrunc(Lane, DstEleAsIntTy);
-      if (!DstEleTy->isIntegerTy()) {
+      if (VecSrcTy) {
+        for (unsigned j = i * DstEleSize / SrcEleSize;
+             j * SrcEleSize < (i + 1) * DstEleSize; ++j) {
+          SPM.enable(j);
+        }
+        SimdPacket *SrcPacket = scalarize(Src, SPM);
+        VECZ_FAIL_IF(!SrcPacket);
+        assert(SrcPacket == &S &&
+               "Scalarization of Src should update existing packet");
+      }
+      Value *Lane = nullptr;
+      for (unsigned j = i * DstEleSize / SrcEleSize;
+           j * SrcEleSize < (i + 1) * DstEleSize; ++j) {
+        Value *SrcPart = S[j];
+        assert(
+            SrcPart &&
+            "Scalarization of Src failure should have been detected earlier");
+        if (SrcEleIntTy != SrcEleTy) {
+          SrcPart = B.CreateBitCast(SrcPart, SrcEleIntTy);
+        }
+        if (SrcEleIntTy->getIntegerBitWidth() <
+            DstEleIntTy->getIntegerBitWidth()) {
+          SrcPart = B.CreateZExt(SrcPart, DstEleIntTy);
+        }
+        if (i * DstEleSize > j * SrcEleSize) {
+          SrcPart = B.CreateLShr(SrcPart, i * DstEleSize - j * SrcEleSize);
+        } else if (j * SrcEleSize > i * DstEleSize) {
+          SrcPart = B.CreateShl(SrcPart, j * SrcEleSize - i * DstEleSize);
+        }
+        if (SrcEleIntTy->getIntegerBitWidth() >
+            DstEleIntTy->getIntegerBitWidth()) {
+          SrcPart = B.CreateTrunc(SrcPart, DstEleIntTy);
+        }
+        Lane = Lane ? B.CreateOr(Lane, SrcPart) : SrcPart;
+      }
+      assert(Lane && "No bits found for lane");
+      if (DstEleTy != DstEleIntTy) {
         Lane = B.CreateBitCast(Lane, DstEleTy);
       }
       P->set(i, Lane);
     }
     return P;
   }
 
-  // Handle vector -> vector casts, quite a more straighforward affair.
+  // Handle same-width vector -> vector casts, a much more straightforward
+  // affair.
   SimdPacket *SrcPacket = scalarize(Src, PM);
   VECZ_FAIL_IF(!SrcPacket);
   Type *DstEleTy = VecDstTy->getElementType();
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-bitcast.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-bitcast.ll
new file mode 100644
index 0000000000000..89556f91437a0
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-bitcast.ll
@@ -0,0 +1,135 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -vecz-simd-width=4 -vecz-passes=scalarize -vecz-choices=FullScalarization -S < %s | FileCheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define dso_local spir_kernel void @bitcast1(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry:
+  %gid = tail call i64 @__mux_get_global_id(i32 noundef 0)
+  %pin = getelementptr inbounds <2 x float>, ptr addrspace(1) %in, i64 %gid
+  %pout = getelementptr inbounds <4 x half>, ptr addrspace(1) %out, i64 %gid
+  %0 = load <2 x float>, ptr addrspace(1) %pin, align 4
+  %1 = bitcast <2 x float> %0 to <4 x half>
+  store <4 x half> %1, ptr addrspace(1) %pout, align 4
+  ret void
+}
+
+; CHECK-LABEL: define{{.*}}spir_kernel void @__vecz_v4_bitcast1
+; CHECK:      [[A0:%.+]] = load float,
+; CHECK-NEXT: [[C0:%.+]] = load float,
+; CHECK-NEXT: [[A1:%.+]] = bitcast float [[A0]] to i32
+; CHECK-NEXT: [[A2:%.+]] = trunc i32 [[A1]] to i16
+; CHECK-NEXT: [[A3:%.+]] = bitcast i16 [[A2]] to half
+; CHECK-NEXT: [[B1:%.+]] = bitcast float [[A0]] to i32
+; CHECK-NEXT: [[B2:%.+]] = lshr i32 [[B1]], 16
+; CHECK-NEXT: [[B3:%.+]] = trunc i32 [[B2]] to i16
+; CHECK-NEXT: [[B4:%.+]] = bitcast i16 [[B3]] to half
+; CHECK-NEXT: [[C1:%.+]] = bitcast float [[C0]] to i32
+; CHECK-NEXT: [[C2:%.+]] = trunc i32 [[C1]] to i16
+; CHECK-NEXT: [[C3:%.+]] = bitcast i16 [[C2]] to half
+; CHECK-NEXT: [[D1:%.+]] = bitcast float [[C0]] to i32
+; CHECK-NEXT: [[D2:%.+]] = lshr i32 [[D1]], 16
+; CHECK-NEXT: [[D3:%.+]] = trunc i32 [[D2]] to i16
+; CHECK-NEXT: [[D4:%.+]] = bitcast i16 [[D3]] to half
+; CHECK:      store half [[A3]],
+; CHECK-NEXT: store half [[B4]],
+; CHECK-NEXT: store half [[C3]],
+; CHECK-NEXT: store half [[D4]],
+; CHECK-NEXT: ret void
+
+define dso_local spir_kernel void @bitcast2(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry:
+  %gid = tail call i64 @__mux_get_global_id(i32 noundef 0)
+  %pin = getelementptr inbounds <4 x half>, ptr addrspace(1) %in, i64 %gid
+  %pout = getelementptr inbounds <2 x float>, ptr addrspace(1) %out, i64 %gid
+  %0 = load <4 x half>, ptr addrspace(1) %pin, align 4
+  %1 = bitcast <4 x half> %0 to <2 x float>
+  store <2 x float> %1, ptr addrspace(1) %pout, align 4
+  ret void
+}
+
+; CHECK-LABEL: define{{.*}}spir_kernel void @__vecz_v4_bitcast2
+; CHECK:      [[A0:%.+]] = load half,
+; CHECK-NEXT: [[B0:%.+]] = load half,
+; CHECK-NEXT: [[C0:%.+]] = load half,
+; CHECK-NEXT: [[D0:%.+]] = load half,
+; CHECK-NEXT: [[A1:%.+]] = bitcast half [[A0]] to i16
+; CHECK-NEXT: [[A2:%.+]] = zext i16 [[A1]] to i32
+; CHECK-NEXT: [[B1:%.+]] = bitcast half [[B0]] to i16
+; CHECK-NEXT: [[B2:%.+]] = zext i16 [[B1]] to i32
+; CHECK-NEXT: [[B3:%.+]] = shl i32 [[B2]], 16
+; CHECK-NEXT: [[AB4:%.+]] = or i32 [[A2]], [[B3]]
+; CHECK-NEXT: [[AB5:%.+]] = bitcast i32 [[AB4]] to float
+; CHECK-NEXT: [[C1:%.+]] = bitcast half [[C0]] to i16
+; CHECK-NEXT: [[C2:%.+]] = zext i16 [[C1]] to i32
+; CHECK-NEXT: [[D1:%.+]] = bitcast half [[D0]] to i16
+; CHECK-NEXT: [[D2:%.+]] = zext i16 [[D1]] to i32
+; CHECK-NEXT: [[D3:%.+]] = shl i32 [[D2]], 16
+; CHECK-NEXT: [[CD4:%.+]] = or i32 [[C2]], [[D3]]
+; CHECK-NEXT: [[CD5:%.+]] = bitcast i32 [[CD4]] to float
+; CHECK:      store float [[AB5]],
+; CHECK-NEXT: store float [[CD5]],
+; CHECK-NEXT: ret void
+
+define dso_local spir_kernel void @bitcast3(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry:
+  %gid = tail call i64 @__mux_get_global_id(i32 noundef 0)
+  %pin = getelementptr inbounds <2 x i32>, ptr addrspace(1) %in, i64 %gid
+  %pout = getelementptr inbounds <2 x float>, ptr addrspace(1) %out, i64 %gid
+  %0 = load <2 x i32>, ptr addrspace(1) %pin, align 4
+  %1 = bitcast <2 x i32> %0 to <2 x float>
+  store <2 x float> %1, ptr addrspace(1) %pout, align 4
+  ret void
+}
+
+; CHECK-LABEL: define{{.*}}spir_kernel void @__vecz_v4_bitcast3
+; CHECK:      [[A0:%.+]] = load i32,
+; CHECK-NEXT: [[B0:%.+]] = load i32,
+; CHECK-NEXT: [[A1:%.+]] = bitcast i32 [[A0]] to float
+; CHECK-NEXT: [[B1:%.+]] = bitcast i32 [[B0]] to float
+; CHECK:      store float [[A1]],
+; CHECK-NEXT: store float [[B1]],
+; CHECK-NEXT: ret void
+
+define dso_local spir_kernel void @bitcast4(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry:
+  %gid = tail call i64 @__mux_get_global_id(i32 noundef 0)
+  %pin = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %gid
+  %pout = getelementptr inbounds <4 x i16>, ptr addrspace(1) %out, i64 %gid
+  %0 = load i32, ptr addrspace(1) %pin, align 4
+  %1 = insertelement <2 x i32> poison, i32 %0, i32 0
+  %2 = bitcast <2 x i32> %1 to <4 x i16>
+  %3 = shufflevector <4 x i16> %2, <4 x i16> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  store <4 x i16> %3, ptr addrspace(1) %pout, align 4
+  ret void
+}
+
+; CHECK-LABEL: define{{.*}}spir_kernel void @__vecz_v4_bitcast4
+; CHECK:      [[A0:%.+]] = load i32,
+; CHECK-NEXT: [[A1:%.+]] = trunc i32 [[A0]] to i16
+; CHECK-NEXT: [[B0:%.+]] = lshr i32 [[A0]], 16
+; CHECK-NEXT: [[B1:%.+]] = trunc i32 [[B0]] to i16
+; CHECK:      store i16 [[A1]],
+; CHECK-NEXT: store i16 [[B1]],
+; CHECK-NEXT: store i16 [[A1]],
+; CHECK-NEXT: store i16 [[B1]],
+; CHECK-NEXT: ret void
+
+declare i64 @__mux_get_global_id(i32 noundef)

From 86798e6431741458a366ab4bcf7f1f4c94b6561c Mon Sep 17 00:00:00 2001
From: Colin Davidson <colin.davidson@codeplay.com>
Date: Fri, 6 Sep 2024 12:29:26 +0100
Subject: [PATCH 120/182] Fix compiler breakage on llvm 20 for
 createSimpleTargetReduction.

This has been renamed in llvm to createSimpleReduction, so modify
multi_llvm.
---
 .../compiler_pipeline/include/multi_llvm/loop_utils.h        | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/loop_utils.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/loop_utils.h
index 24ff0aed579c4..8301f9b639f0b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/loop_utils.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/loop_utils.h
@@ -24,7 +24,10 @@ namespace multi_llvm {
 inline llvm::Value *createSimpleTargetReduction(
     llvm::IRBuilderBase &B, const llvm::TargetTransformInfo *TTI,
     llvm::Value *Src, llvm::RecurKind RdxKind) {
-#if LLVM_VERSION_MAJOR >= 18
+#if LLVM_VERSION_MAJOR >= 20
+  (void)TTI;
+  return llvm::createSimpleReduction(B, Src, RdxKind);
+#elif LLVM_VERSION_MAJOR >= 18
   (void)TTI;
   return llvm::createSimpleTargetReduction(B, Src, RdxKind);
 #else

From d36680798f46f4d9f29abed49f5e84e4521c2390 Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Mon, 9 Sep 2024 14:13:37 +0100
Subject: [PATCH 121/182] [LLVM20] Handle USubCond, USubSat.

LLVM 20 adds usub_cond and usub_sat atomicrmw instructions. Handle them.
---
 .../vecz/source/vectorization_context.cpp             | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
index a8529c5a91cb4..23ac15927c9df 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
@@ -28,6 +28,7 @@
 #include <llvm/IR/Instructions.h>
 #include <llvm/Support/AtomicOrdering.h>
 #include <llvm/Target/TargetMachine.h>
+#include <multi_llvm/multi_llvm.h>
 #include <multi_llvm/vector_type_helper.h>
 
 #include <algorithm>
@@ -444,6 +445,12 @@ VectorizationContext::isMaskedAtomicFunction(const Function &F) const {
       AtomicInfo.BinOp = AtomicRMWInst::BinOp::UIncWrap;
     } else if (FnName.consume_front("udecwrap")) {
       AtomicInfo.BinOp = AtomicRMWInst::BinOp::UDecWrap;
+#if LLVM_VERSION_GREATER_EQUAL(20, 0)
+    } else if (FnName.consume_front("usubcond")) {
+      AtomicInfo.BinOp = AtomicRMWInst::BinOp::USubCond;
+    } else if (FnName.consume_front("usubsat")) {
+      AtomicInfo.BinOp = AtomicRMWInst::BinOp::USubSat;
+#endif
     } else {
       return std::nullopt;
     }
@@ -583,6 +590,10 @@ Function *VectorizationContext::getOrCreateMaskedAtomicFunction(
       BINOP_CASE(FMin, "fmin");
       BINOP_CASE(UIncWrap, "uincwrap");
       BINOP_CASE(UDecWrap, "udecwrap");
+#if LLVM_VERSION_GREATER_EQUAL(20, 0)
+      BINOP_CASE(USubCond, "usubcond");
+      BINOP_CASE(USubSat, "usubsat");
+#endif
       case llvm::AtomicRMWInst::BAD_BINOP:
         return nullptr;
     }

From 41658d9d00c1100ac182eb7b2b197d4568e3883f Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Wed, 11 Sep 2024 15:20:56 +0100
Subject: [PATCH 122/182] [vecz] Clear getelementptr flags.

If a getelementptr has inbounds/nusw/nuw flags, it is valid for the
result to be out of bounds or to wrap so long as the result is not used
to access any object. However, when we vectorize, multiple elements will
all be loaded based on the first element's address, and in doing so we
may add accesses, so we need to clear the inbounds/nusw/nuw flags unless
we can prove they are still valid.
---
 .../analysis/uniform_value_analysis.cpp       |  8 +++
 .../llvm/ScalableVectors/broadcast_vector.ll  | 16 +++---
 .../llvm/ScalableVectors/extract_element.ll   | 10 ++--
 .../llvm/ScalableVectors/insert_element.ll    |  6 +--
 .../ScalableVectors/packetize_mask_varying.ll |  2 +-
 .../ScalableVectors/select_scalar_vector.ll   |  2 +-
 .../test/lit/llvm/ScalableVectors/shuffle.ll  |  2 +-
 .../llvm/ScalableVectors/subgroup_builtins.ll |  2 +-
 .../control_flow_conversion_uniform_loop.ll   |  2 +-
 .../lit/llvm/extractelement_constant_index.ll |  2 +-
 .../vecz/test/lit/llvm/load_add_store.ll      |  6 +--
 .../lit/llvm/packetization_uniform_branch.ll  |  4 +-
 .../test/lit/llvm/packetize_struct_gep.ll     |  4 +-
 .../test/lit/llvm/pass_pipeline_printafter.ll |  4 +-
 .../lit/llvm/scalarization_instructions.ll    | 50 +++++++++----------
 .../vecz/test/lit/llvm/scalarize_mixed_gep.ll |  2 +-
 .../test/lit/llvm/squash_float2_gather.ll     |  6 +--
 .../vecz/test/lit/llvm/stride_analysis.ll     | 28 +++++------
 .../vecz/test/lit/llvm/subgroup_broadcast.ll  |  2 +-
 .../vecz/test/lit/llvm/subgroup_builtins.ll   |  2 +-
 .../test/lit/llvm/uniform_address_base.ll     |  4 +-
 .../test/lit/llvm/uniform_address_index.ll    |  4 +-
 22 files changed, 88 insertions(+), 80 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp
index ff64eccda0b97..558d225250cec 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp
@@ -385,6 +385,14 @@ void UniformValueResult::markVaryingValues(Value *V, Value *From) {
       markVaryingValues(Alloca);
     }
   } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(VIns)) {
+    // We need to clear the flags because the initial address may be out of
+    // bounds but masked out.
+#if LLVM_VERSION_GREATER_EQUAL(19, 0)
+    GEP->setNoWrapFlags(GEPNoWrapFlags::none());
+#else
+    GEP->setIsInBounds(false);
+#endif
+
     // Same as with the stores
     AllocaInst *Alloca = findAllocaFromPointer(GEP->getPointerOperand());
     if (Alloca) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll
index 3fdb36fd09c5b..519bb696cd52e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll
@@ -99,7 +99,7 @@ entry:
 ; CHECK-LABEL: @__vecz_nxv4_vector_broadcast_const(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0)
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]]
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]]
 ; CHECK-NEXT:    store <vscale x 16 x float> shufflevector (<vscale x 16 x float> insertelement (<vscale x 16 x float> {{(undef|poison)}}, float 0x7FF8000020000000, {{(i32|i64)}} 0), <vscale x 16 x float> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer), ptr addrspace(1) [[ARRAYIDX3]], align 16
 ; CHECK-NEXT:    ret void
 
@@ -113,17 +113,17 @@ entry:
 ; CHECK-NEXT:    [[VEC_ALLOC:%.*]] = getelementptr inbounds float, ptr [[FIXLEN_ALLOC]], <vscale x 16 x i64> [[TMP0]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 16 x float> @llvm.masked.gather.nxv16f32.nxv16p0(<vscale x 16 x ptr> [[VEC_ALLOC]], i32 4, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, {{(i32|i64)}} 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x float> undef)
 ; CHECK-NEXT:    [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0)
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <vscale x 16 x float>, ptr addrspace(1) [[ARRAYIDX]], align 16
 ; CHECK-NEXT:    [[TMP4:%.*]] = fadd <vscale x 16 x float> [[TMP3]], [[TMP1]]
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]]
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]]
 ; CHECK-NEXT:    store <vscale x 16 x float> [[TMP4]], ptr addrspace(1) [[ARRAYIDX3]], align 16
 ; CHECK-NEXT:    ret void
 
 ; CHECK-LABEL: @__vecz_nxv4_vector_broadcast_regression(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0)
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 16 x i32>, ptr addrspace(1) [[ARRAYIDX]], align 16
 ; CHECK-NEXT:    [[AND1_I_I_I1_I1:%.*]] = and <vscale x 16 x i32> [[TMP1]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 2139095040, {{i32|i64}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
 ; CHECK-NEXT:    [[CMP_I_I_I2_I2:%.*]] = icmp ne <vscale x 16 x i32> [[AND1_I_I_I1_I1]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 2139095040, {{i32|i64}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
@@ -132,7 +132,7 @@ entry:
 ; CHECK-NEXT:    [[TMP2:%.*]] = or <vscale x 16 x i1> [[CMP_I_I_I2_I2]], [[CMP3_I_I_I4_I4]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <vscale x 16 x i32> [[TMP1]] to <vscale x 16 x float>
 ; CHECK-NEXT:    [[TMP4:%.*]] = select <vscale x 16 x i1> [[TMP2]], <vscale x 16 x float> [[TMP3]], <vscale x 16 x float> shufflevector (<vscale x 16 x float> insertelement (<vscale x 16 x float> {{(undef|poison)}}, float 0x7FF0000020000000, {{i32|i64}} 0), <vscale x 16 x float> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]]
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]]
 ; CHECK-NEXT:    store <vscale x 16 x float> [[TMP4]], ptr addrspace(1) [[ARRAYIDX3]], align 16
 ; CHECK-NEXT:    ret void
 ;
@@ -156,12 +156,12 @@ entry:
 ; CHECK-NEXT:    [[TMP2:%.*]] = {{s|z}}ext{{( nneg)?}} <vscale x 16 x i32> [[IDX14]] to <vscale x 16 x i64>
 ; CHECK-NEXT:    [[VEC_ALLOC:%.*]] = getelementptr inbounds i32, ptr [[FIXLEN_ALLOC]], <vscale x 16 x i64> [[TMP2]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 16 x i32> @llvm.masked.gather.nxv16i32.nxv16p0(<vscale x 16 x ptr> [[VEC_ALLOC]], i32 4, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, {{i32|i64}} 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x i32> {{(undef|poison)}})
-; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[OUT2:%.*]], i64 [[CALL]]
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr <4 x i32>, ptr addrspace(1) [[OUT2:%.*]], i64 [[CALL]]
 ; CHECK-NEXT:    store <vscale x 16 x i32> [[TMP3]], ptr addrspace(1) [[ARRAYIDX4]], align 16
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <vscale x 16 x float>, ptr addrspace(1) [[ARRAYIDX]], align 16
 ; CHECK-NEXT:    [[V46:%.*]] = fadd <vscale x 16 x float> [[TMP6]], [[TMP1]]
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]]
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]]
 ; CHECK-NEXT:    store <vscale x 16 x float> [[V46]], ptr addrspace(1) [[ARRAYIDX3]], align 16
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/extract_element.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/extract_element.ll
index d5a92d13b44ae..91c989df3c499 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/extract_element.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/extract_element.ll
@@ -95,8 +95,8 @@ entry:
 ; EE: [[ALLOC:%.*]] = alloca <vscale x 16 x float>, align 64
 ; EE: store <vscale x 16 x float> {{.*}}, ptr [[ALLOC]], align 64
 ; EE: [[IDX:%.*]] = sext i32 %idx to i64
-; EE: [[ADDR:%.*]] = getelementptr inbounds float, ptr [[ALLOC]], i64 [[IDX]]
-; EE: [[GATHER:%.*]] = call <vscale x 4 x float> @__vecz_b_interleaved_load4_4_u5nxv4fu3ptr(ptr nonnull [[ADDR]])
+; EE: [[ADDR:%.*]] = getelementptr float, ptr [[ALLOC]], i64 [[IDX]]
+; EE: [[GATHER:%.*]] = call <vscale x 4 x float> @__vecz_b_interleaved_load4_4_u5nxv4fu3ptr(ptr [[ADDR]])
 
 ; Both the vector and index are uniform, so check we're not unnecessarily packetizing 
 
@@ -120,13 +120,13 @@ entry:
 ; LLVM 16 deduces add/or equivalence and uses `or` instead.
 ; EE-UNI-VEC: [[T7:%.*]] = {{add|or}} {{(disjoint )?}}<vscale x 4 x i64> [[T6]], [[MOD]]
 
-; EE-UNI-VEC: [[T8:%.*]] = getelementptr inbounds float, ptr {{%.*}}, <vscale x 4 x i64> [[T7]]
+; EE-UNI-VEC: [[T8:%.*]] = getelementptr float, ptr {{%.*}}, <vscale x 4 x i64> [[T7]]
 ; EE-UNI-VEC: [[T9:%.*]] = call <vscale x 4 x float> @__vecz_b_gather_load4_u5nxv4fu9nxv4u3ptr(<vscale x 4 x ptr> [[T8]])
 ; EE-UNI-VEC: store <vscale x 4 x float> [[T9]], ptr addrspace(1) {{%.*}}, align 4
 
 ; EE-INDICES-LABEL: @__vecz_nxv4_extract_element_varying_indices(
 ; EE-INDICES: [[ALLOC:%.*]] = alloca <vscale x 16 x float>, align 64
-; EE-INDICES: [[T0:%.*]] = getelementptr inbounds i32, ptr addrspace(1) %idxs, i64 %call
+; EE-INDICES: [[T0:%.*]] = getelementptr i32, ptr addrspace(1) %idxs, i64 %call
 ; EE-INDICES: [[T2:%.*]] = load <vscale x 4 x i32>, ptr addrspace(1) [[T0]], align 4
 ; EE-INDICES: [[T3:%.*]] = and <vscale x 4 x i32> [[T2]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> {{(undef|poison)}}, i32 3, {{i32|i64}} 0), <vscale x 4 x i32> {{(undef|poison)}}, <vscale x 4 x i32> zeroinitializer)
 ; EE-INDICES: store <vscale x 16 x float> {{.*}}, ptr [[ALLOC]], align 64
@@ -134,7 +134,7 @@ entry:
 ; EE-INDICES: [[T4:%.*]] = shl <vscale x 4 x i32> [[STEP]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> {{(undef|poison)}}, i32 2, {{i32|i64}} 0), <vscale x 4 x i32> {{(undef|poison)}}, <vscale x 4 x i32> zeroinitializer)
 ; EE-INDICES: [[T5:%.*]] = {{add|or}} {{(disjoint )?}}<vscale x 4 x i32> [[T4]], [[T3]]
 ; EE-INDICES: [[IDX:%.*]] = sext <vscale x 4 x i32> [[T5]] to <vscale x 4 x i64>
-; EE-INDICES: [[ADDR:%.*]] = getelementptr inbounds float, ptr [[ALLOC]], <vscale x 4 x i64> [[IDX]]
+; EE-INDICES: [[ADDR:%.*]] = getelementptr float, ptr [[ALLOC]], <vscale x 4 x i64> [[IDX]]
 ; EE-INDICES: [[GATHER:%.*]] = call <vscale x 4 x float> @__vecz_b_gather_load4_u5nxv4fu9nxv4u3ptr(<vscale x 4 x ptr> [[ADDR]])
 
 ; Check we promote from i1 to i8 before doing our memops
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/insert_element.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/insert_element.ll
index 8b4bbb4590935..b2dcb47b5aeb3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/insert_element.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/insert_element.ll
@@ -85,8 +85,8 @@ entry:
 ; IE: [[VAL1:%.*]] = shufflevector <vscale x 4 x float> [[VAL0]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
 ; IE: store <vscale x 16 x float> {{.*}}, ptr [[ALLOC]], align 64
 ; IE: [[IDX:%.*]] = sext i32 %idx to i64
-; IE: [[ADDR:%.*]] = getelementptr inbounds float, ptr [[ALLOC]], i64 [[IDX]]
-; IE: call void @__vecz_b_interleaved_store4_4_u5nxv4fu3ptr(<vscale x 4 x float> [[VAL1]], ptr nonnull [[ADDR]])
+; IE: [[ADDR:%.*]] = getelementptr float, ptr [[ALLOC]], i64 [[IDX]]
+; IE: call void @__vecz_b_interleaved_store4_4_u5nxv4fu3ptr(<vscale x 4 x float> [[VAL1]], ptr [[ADDR]])
 ; IE: = load <vscale x 16 x float>, ptr [[ALLOC]], align 64
 
 ; Both the vector and index are uniform, so check we're not unnecessarily packetizing
@@ -105,7 +105,7 @@ entry:
 ; IE-INDICES: [[T3:%.*]] = {{add|or}} {{(disjoint )?}}<vscale x 4 x i32> [[T2]], {{%.*}}
 
 ; IE-INDICES: [[T4:%.*]] = sext <vscale x 4 x i32> [[T3]] to <vscale x 4 x i64>
-; IE-INDICES: [[ADDR:%.*]] = getelementptr inbounds float, ptr %0, <vscale x 4 x i64> [[T4]]
+; IE-INDICES: [[ADDR:%.*]] = getelementptr float, ptr %0, <vscale x 4 x i64> [[T4]]
 ; IE-INDICES: call void @__vecz_b_scatter_store4_u5nxv4fu9nxv4u3ptr(<vscale x 4 x float> [[VAL]], <vscale x 4 x ptr> [[ADDR]])
 ; IE-INDICES: = load <vscale x 16 x float>, ptr [[ALLOC]], align 64
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/packetize_mask_varying.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/packetize_mask_varying.ll
index da232d896b93f..61682c1baff07 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/packetize_mask_varying.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/packetize_mask_varying.ll
@@ -45,7 +45,7 @@ if.end:
 ; make any difference whether it's a zext or sext, but LLVM 16 prefers zext.
 ; CHECK: [[idx2:%.*]] = {{s|z}}ext{{( nneg)?}} <vscale x 16 x i32> [[idx1]] to <vscale x 16 x i64>
 
-; CHECK: [[t1:%.*]] = getelementptr inbounds i8, ptr {{.*}}, <vscale x 16 x i64> [[idx2]]
+; CHECK: [[t1:%.*]] = getelementptr i8, ptr {{.*}}, <vscale x 16 x i64> [[idx2]]
 ; CHECK: [[t2:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[t1]],
 ; CHECK: [[splat:%.*]] = trunc <vscale x 16 x i8> [[t2]] to <vscale x 16 x i1>
 ; CHECK: call void @__vecz_b_masked_store16_u6nxv16ju3ptru6nxv16b(<vscale x 16 x i32> {{.*}}, ptr %arrayidxz, <vscale x 16 x i1> [[splat]])
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select_scalar_vector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select_scalar_vector.ll
index 0d2fb8b16c8ee..f4fa88cb151ff 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select_scalar_vector.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select_scalar_vector.ll
@@ -50,7 +50,7 @@ entry:
 ; make any difference whether it's a zext or sext, but LLVM 16 prefers zext.
 ; CHECK: [[sext2:%.*]] = {{s|z}}ext{{( nneg)?}} <vscale x 8 x i32> [[idx1]] to <vscale x 8 x i64>
 
-; CHECK: [[addrs:%.*]] = getelementptr inbounds i8, ptr [[alloc]], <vscale x 8 x i64> [[sext2]]
+; CHECK: [[addrs:%.*]] = getelementptr i8, ptr [[alloc]], <vscale x 8 x i64> [[sext2]]
 ; CHECK: [[gather:%.*]] = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0(<vscale x 8 x ptr> [[addrs]],
 ; CHECK: [[cmp:%.*]] = trunc <vscale x 8 x i8> [[gather]] to <vscale x 8 x i1>
 ; CHECK: [[sel:%.*]] = select <vscale x 8 x i1> [[cmp]], <vscale x 8 x i32> [[rhs]], <vscale x 8 x i32> shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> {{(undef|poison)}}, i32 4, {{(i32|i64)}} 0), <vscale x 8 x i32> {{(undef|poison)}}, <vscale x 8 x i32> zeroinitializer)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/shuffle.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/shuffle.ll
index 332769b3a0f60..0745027793052 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/shuffle.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/shuffle.ll
@@ -39,7 +39,7 @@ define spir_kernel void @do_shuffle_splat(i32* %aptr, <4 x i32>* %bptr, <4 x i32
 ; make any difference whether it's a zext or sext, but LLVM 16 prefers zext.
 ; CHECK: [[idx2:%.*]] = {{s|z}}ext{{( nneg)?}} <vscale x 16 x i32> [[idx1]] to <vscale x 16 x i64>
 
-; CHECK: [[alloc:%.*]] = getelementptr inbounds i32, ptr %{{.*}}, <vscale x 16 x i64> [[idx2]]
+; CHECK: [[alloc:%.*]] = getelementptr i32, ptr %{{.*}}, <vscale x 16 x i64> [[idx2]]
 ; CHECK: [[splat:%.*]] = call <vscale x 16 x i32> @llvm.masked.gather.nxv16i32.nxv16p0(<vscale x 16 x ptr> [[alloc]],
 ; CHECK: store <vscale x 16 x i32> [[splat]], ptr
 }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll
index 6100a08e88757..19093a2f13153 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll
@@ -54,7 +54,7 @@ define spir_kernel void @get_sub_group_local_id(i32 addrspace(1)* %in, i32 addrs
 ; CHECK: [[STEPVEC:%.*]] = call <vscale x 4 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv4i32()
 ; CHECK: [[LID:%.*]] = add <vscale x 4 x i32> [[SPLAT]], [[STEPVEC]]
 ; CHECK: [[EXT:%.*]] = sext i32 %call to i64
-; CHECK: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %out, i64 [[EXT]]
+; CHECK: %arrayidx = getelementptr i32, ptr addrspace(1) %out, i64 [[EXT]]
 ; CHECK: store <vscale x 4 x i32> [[LID]], ptr addrspace(1) %arrayidx
 }
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_uniform_loop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_uniform_loop.ll
index 2e9d562436b99..f3081873d808c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_uniform_loop.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_uniform_loop.ll
@@ -167,7 +167,7 @@ declare i64 @__mux_get_global_id(i32)
 ; CHECK: for.body:
 ; CHECK: %add = add nsw i32 %storemerge, %a
 ; CHECK: %idxprom = sext i32 %add2 to i64
-; CHECK: %arrayidx = getelementptr inbounds i32, ptr %b, i64 %idxprom
+; CHECK: %arrayidx = getelementptr i32, ptr %b, i64 %idxprom
 ; CHECK: store i32 %add, ptr %arrayidx, align 4
 ; CHECK: %inc = add nsw i32 %storemerge, 1
 ; CHECK: br label %for.cond
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/extractelement_constant_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/extractelement_constant_index.ll
index 60efefe3b05bf..f017218847a76 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/extractelement_constant_index.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/extractelement_constant_index.ll
@@ -35,6 +35,6 @@ declare i64 @__mux_get_global_id(i32) #1
 
 ; CHECK: define spir_kernel void @__vecz_v4_extract_constant_index
 ; CHECK: call <4 x float> @__vecz_b_interleaved_load4_4_Dv4
-; CHECK: getelementptr inbounds float
+; CHECK: getelementptr float
 ; CHECK: store <4 x float>
 ; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/load_add_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/load_add_store.ll
index abcbc465aae0e..436540a5163e9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/load_add_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/load_add_store.ll
@@ -34,9 +34,9 @@ entry:
   ret void
 ; CHECK-LABEL: @__vecz_v4_load_add_store(ptr %aptr, ptr %bptr, ptr %zptr)
 ; CHECK: %idx = call i64 @__mux_get_global_id(i32 0)
-; CHECK: %arrayidxa = getelementptr inbounds i32, ptr %aptr, i64 %idx
-; CHECK: %arrayidxb = getelementptr inbounds i32, ptr %bptr, i64 %idx
-; CHECK: %arrayidxz = getelementptr inbounds i32, ptr %zptr, i64 %idx
+; CHECK: %arrayidxa = getelementptr i32, ptr %aptr, i64 %idx
+; CHECK: %arrayidxb = getelementptr i32, ptr %bptr, i64 %idx
+; CHECK: %arrayidxz = getelementptr i32, ptr %zptr, i64 %idx
 ; CHECK: %[[TMP0:.*]] = load <4 x i32>, ptr %arrayidxa, align 4
 ; CHECK: %[[TMP1:.*]] = load <4 x i32>, ptr %arrayidxb, align 4
 ; CHECK: %sum1 = add <4 x i32> %[[TMP0]], %[[TMP1]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_uniform_branch.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_uniform_branch.ll
index 0b448e8f8eba8..7218a621cafc7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_uniform_branch.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_uniform_branch.ll
@@ -91,12 +91,12 @@ declare i64 @__mux_get_global_id(i32)
 ; CHECK: br i1 %cmp, label %if.then, label %if.else
 
 ; CHECK: if.then:
-; CHECK: %[[GEP1:.+]] = getelementptr inbounds i32, ptr %b, <4 x i64>
+; CHECK: %[[GEP1:.+]] = getelementptr i32, ptr %b, <4 x i64>
 ; CHECK: store <4 x i32> <i32 11, i32 11, i32 11, i32 11>, ptr %{{.+}}, align 4
 ; CHECK: br label %if.end
 
 ; CHECK: if.else:
-; CHECK: %[[GEP2:.+]] = getelementptr inbounds i32, ptr %b, <4 x i64>
+; CHECK: %[[GEP2:.+]] = getelementptr i32, ptr %b, <4 x i64>
 ; CHECK: store <4 x i32> <i32 13, i32 13, i32 13, i32 13>, ptr %{{.+}}, align 4
 ; CHECK: br label %if.end
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_struct_gep.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_struct_gep.ll
index 49b67c2053235..5046107c4ed33 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_struct_gep.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_struct_gep.ll
@@ -42,5 +42,5 @@ declare i64 @__mux_get_global_id(i32)
 ; Check if we can packetize GEPs on structs
 ; Note that we only need to packetize the non-uniform operands..
 ; CHECK: define spir_kernel void @__vecz_v4_test
-; CHECK: getelementptr inbounds %struct.T, ptr addrspace(1) %{{.+}}, <4 x i64> %{{.+}}, i32 2
-; CHECK: getelementptr inbounds %struct.T, ptr addrspace(1) %{{.+}}, <4 x i64> %{{.+}}, i32 2
+; CHECK: getelementptr %struct.T, ptr addrspace(1) %{{.+}}, <4 x i64> %{{.+}}, i32 2
+; CHECK: getelementptr %struct.T, ptr addrspace(1) %{{.+}}, <4 x i64> %{{.+}}, i32 2
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline_printafter.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline_printafter.ll
index 43079dfa9e05f..d861f5f930442 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline_printafter.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline_printafter.ll
@@ -25,14 +25,14 @@ declare i64 @__mux_get_global_id(i32)
 ; CHECK: IR Dump After Simplify masked memory operations{{( on __vecz_v2_foo)?}}
 ; CHECK-NEXT: define spir_kernel void @__vecz_v2_foo(ptr addrspace(1) %out) #0 {
 ; CHECK-NEXT:   %idx = call i64 @__mux_get_global_id(i32 0)
-; CHECK-NEXT:   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %idx
+; CHECK-NEXT:   %arrayidx = getelementptr i32, ptr addrspace(1) %out, i64 %idx
 ; CHECK-NEXT:   store i32 0, ptr addrspace(1) %arrayidx, align 4
 ; CHECK-NEXT:   ret void
 ; CHECK-NEXT: }
 
 ; CHECK: define spir_kernel void @__vecz_v2_foo(ptr addrspace(1) %out) {{.*}} {
 ; CHECK-NEXT:   %idx = call i64 @__mux_get_global_id(i32 0)
-; CHECK-NEXT:   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %idx
+; CHECK-NEXT:   %arrayidx = getelementptr i32, ptr addrspace(1) %out, i64 %idx
 ; CHECK-NEXT:   store <2 x i32> zeroinitializer, ptr addrspace(1) %arrayidx, align 4
 ; CHECK-NEXT:   ret void
 ; CHECK-NEXT: }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_instructions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_instructions.ll
index 1940065a68d44..f286d5c81b408 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_instructions.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_instructions.ll
@@ -78,20 +78,20 @@ entry:
 ; CHECK: store i32 %[[ADD2]], ptr %[[C_1]]
 ; CHECK: store i32 %[[ADD3]], ptr %[[C_2]]
 ; CHECK: store i32 %[[ADD4]], ptr %[[C_3]]
-; CHECK: %arrayidx3 = getelementptr inbounds <4 x i32>, ptr %a, i64 1
-; CHECK: %[[A1_0:.+]] = getelementptr inbounds i32, ptr %arrayidx3, i32 0
-; CHECK: %[[A1_1:.+]] = getelementptr inbounds i32, ptr %arrayidx3, i32 1
-; CHECK: %[[A1_2:.+]] = getelementptr inbounds i32, ptr %arrayidx3, i32 2
-; CHECK: %[[A1_3:.+]] = getelementptr inbounds i32, ptr %arrayidx3, i32 3
+; CHECK: %arrayidx3 = getelementptr <4 x i32>, ptr %a, i64 1
+; CHECK: %[[A1_0:.+]] = getelementptr i32, ptr %arrayidx3, i32 0
+; CHECK: %[[A1_1:.+]] = getelementptr i32, ptr %arrayidx3, i32 1
+; CHECK: %[[A1_2:.+]] = getelementptr i32, ptr %arrayidx3, i32 2
+; CHECK: %[[A1_3:.+]] = getelementptr i32, ptr %arrayidx3, i32 3
 ; CHECK: %[[LA1_0:.+]] = load i32, ptr %[[A1_0]]
 ; CHECK: %[[LA1_1:.+]] = load i32, ptr %[[A1_1]]
 ; CHECK: %[[LA1_2:.+]] = load i32, ptr %[[A1_2]]
 ; CHECK: %[[LA1_3:.+]] = load i32, ptr %[[A1_3]]
-; CHECK: %arrayidx4 = getelementptr inbounds <4 x i32>, ptr %b, i64 1
-; CHECK: %[[B1_0:.+]] = getelementptr inbounds i32, ptr %arrayidx4, i32 0
-; CHECK: %[[B1_1:.+]] = getelementptr inbounds i32, ptr %arrayidx4, i32 1
-; CHECK: %[[B1_2:.+]] = getelementptr inbounds i32, ptr %arrayidx4, i32 2
-; CHECK: %[[B1_3:.+]] = getelementptr inbounds i32, ptr %arrayidx4, i32 3
+; CHECK: %arrayidx4 = getelementptr <4 x i32>, ptr %b, i64 1
+; CHECK: %[[B1_0:.+]] = getelementptr i32, ptr %arrayidx4, i32 0
+; CHECK: %[[B1_1:.+]] = getelementptr i32, ptr %arrayidx4, i32 1
+; CHECK: %[[B1_2:.+]] = getelementptr i32, ptr %arrayidx4, i32 2
+; CHECK: %[[B1_3:.+]] = getelementptr i32, ptr %arrayidx4, i32 3
 ; CHECK: %[[LB1_0:.+]] = load i32, ptr %[[B1_0]]
 ; CHECK: %[[LB1_1:.+]] = load i32, ptr %[[B1_1]]
 ; CHECK: %[[LB1_2:.+]] = load i32, ptr %[[B1_2]]
@@ -104,20 +104,20 @@ entry:
 ; CHECK: %[[SEXT11:.+]] = sext i1 %[[CMP6]] to i32
 ; CHECK: %[[SEXT12:.+]] = sext i1 %[[CMP8]] to i32
 ; CHECK: %[[SEXT13:.+]] = sext i1 %[[CMP9]] to i32
-; CHECK: %arrayidx5 = getelementptr inbounds <4 x i32>, ptr %c, i64 1
-; CHECK: %[[C1_0:.+]] = getelementptr inbounds i32, ptr %arrayidx5, i32 0
-; CHECK: %[[C1_1:.+]] = getelementptr inbounds i32, ptr %arrayidx5, i32 1
-; CHECK: %[[C1_2:.+]] = getelementptr inbounds i32, ptr %arrayidx5, i32 2
-; CHECK: %[[C1_3:.+]] = getelementptr inbounds i32, ptr %arrayidx5, i32 3
+; CHECK: %arrayidx5 = getelementptr <4 x i32>, ptr %c, i64 1
+; CHECK: %[[C1_0:.+]] = getelementptr i32, ptr %arrayidx5, i32 0
+; CHECK: %[[C1_1:.+]] = getelementptr i32, ptr %arrayidx5, i32 1
+; CHECK: %[[C1_2:.+]] = getelementptr i32, ptr %arrayidx5, i32 2
+; CHECK: %[[C1_3:.+]] = getelementptr i32, ptr %arrayidx5, i32 3
 ; CHECK: store i32 %[[SEXT10]], ptr %[[C1_0]]
 ; CHECK: store i32 %[[SEXT11]], ptr %[[C1_1]]
 ; CHECK: store i32 %[[SEXT12]], ptr %[[C1_2]]
 ; CHECK: store i32 %[[SEXT13]], ptr %[[C1_3]]
-; CHECK: %arrayidx6 = getelementptr inbounds <4 x i32>, ptr %a, i64 2
-; CHECK: %[[A2_0:.+]] = getelementptr inbounds i32, ptr %arrayidx6, i32 0
-; CHECK: %[[A2_1:.+]] = getelementptr inbounds i32, ptr %arrayidx6, i32 1
-; CHECK: %[[A2_2:.+]] = getelementptr inbounds i32, ptr %arrayidx6, i32 2
-; CHECK: %[[A2_3:.+]] = getelementptr inbounds i32, ptr %arrayidx6, i32 3
+; CHECK: %arrayidx6 = getelementptr <4 x i32>, ptr %a, i64 2
+; CHECK: %[[A2_0:.+]] = getelementptr i32, ptr %arrayidx6, i32 0
+; CHECK: %[[A2_1:.+]] = getelementptr i32, ptr %arrayidx6, i32 1
+; CHECK: %[[A2_2:.+]] = getelementptr i32, ptr %arrayidx6, i32 2
+; CHECK: %[[A2_3:.+]] = getelementptr i32, ptr %arrayidx6, i32 3
 ; CHECK: %[[LA2_0:.+]] = load i32, ptr %[[A2_0]]
 ; CHECK: %[[LA2_1:.+]] = load i32, ptr %[[A2_1]]
 ; CHECK: %[[LA2_2:.+]] = load i32, ptr %[[A2_2]]
@@ -130,11 +130,11 @@ entry:
 ; CHECK: %[[SEXT819:.+]] = sext i1 %[[CMP715]] to i32
 ; CHECK: %[[SEXT820:.+]] = sext i1 %[[CMP716]] to i32
 ; CHECK: %[[SEXT821:.+]] = sext i1 %[[CMP717]] to i32
-; CHECK: %arrayidx9 = getelementptr inbounds <4 x i32>, ptr %c, i64 2
-; CHECK: %[[C2_0:.+]] = getelementptr inbounds i32, ptr %arrayidx9, i32 0
-; CHECK: %[[C2_1:.+]] = getelementptr inbounds i32, ptr %arrayidx9, i32 1
-; CHECK: %[[C2_2:.+]] = getelementptr inbounds i32, ptr %arrayidx9, i32 2
-; CHECK: %[[C2_3:.+]] = getelementptr inbounds i32, ptr %arrayidx9, i32 3
+; CHECK: %arrayidx9 = getelementptr <4 x i32>, ptr %c, i64 2
+; CHECK: %[[C2_0:.+]] = getelementptr i32, ptr %arrayidx9, i32 0
+; CHECK: %[[C2_1:.+]] = getelementptr i32, ptr %arrayidx9, i32 1
+; CHECK: %[[C2_2:.+]] = getelementptr i32, ptr %arrayidx9, i32 2
+; CHECK: %[[C2_3:.+]] = getelementptr i32, ptr %arrayidx9, i32 3
 ; CHECK: store i32 %[[SEXT818]], ptr %[[C2_0]]
 ; CHECK: store i32 %[[SEXT819]], ptr %[[C2_1]]
 ; CHECK: store i32 %[[SEXT820]], ptr %[[C2_2]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize_mixed_gep.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize_mixed_gep.ll
index 8abeed7bcdd12..d995fd652fd03 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize_mixed_gep.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize_mixed_gep.ll
@@ -42,5 +42,5 @@ define void @bar(i64** %ptrptrs, i64 %val) {
 ; gets scalarized/re-packetized correctly
 
 ; CHECK: define void @__vecz_v4_bar
-; CHECK: %[[ADDR:.+]] = getelementptr inbounds {{i64|i8}}, <4 x ptr> %{{.+}}, {{i64 2|i64 16}}
+; CHECK: %[[ADDR:.+]] = getelementptr {{i64|i8}}, <4 x ptr> %{{.+}}, {{i64 2|i64 16}}
 ; CHECK: call void @__vecz_b_scatter_store8_Dv4_mDv4_u3ptr(<4 x i64> %.splat{{.*}}, <4 x ptr> %[[ADDR]])
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_float2_gather.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_float2_gather.ll
index 8c2b80f72ce50..01c0d0d79fff2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_float2_gather.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_float2_gather.ll
@@ -44,12 +44,12 @@ attributes #2 = { nobuiltin nounwind }
 ;
 ; CHECK: void @__vecz_v4_squash
 ; CHECK:  %[[GID:.+]] = call i64 @__mux_get_global_id(i64 0) #[[ATTRS:[0-9]+]]
-; CHECK:  %[[IDX_PTR:.+]] = getelementptr inbounds i64, ptr addrspace(1) %idx, i64 %[[GID]]
+; CHECK:  %[[IDX_PTR:.+]] = getelementptr i64, ptr addrspace(1) %idx, i64 %[[GID]]
 ; CHECK:  %[[WIDE_LOAD:.+]] = load <4 x i64>, ptr addrspace(1) %[[IDX_PTR]], align 8
-; CHECK:  %[[DATA_PTR:.+]] = getelementptr inbounds <2 x float>, ptr addrspace(1) %data, <4 x i64> %[[WIDE_LOAD]]
+; CHECK:  %[[DATA_PTR:.+]] = getelementptr <2 x float>, ptr addrspace(1) %data, <4 x i64> %[[WIDE_LOAD]]
 ; CHECK:  %[[GATHER:.+]] = call <4 x i64> @__vecz_b_gather_load8_Dv4_mDv4_u3ptrU3AS1(<4 x ptr addrspace(1)> %[[DATA_PTR]])
 ; CHECK:  %[[UNSQUASH:.+]] = bitcast <4 x i64> %[[GATHER]] to <8 x float>
-; CHECK:  %[[OUTPUT_PTR:.+]] = getelementptr inbounds <2 x float>, ptr addrspace(1) %output, i64 %[[GID]]
+; CHECK:  %[[OUTPUT_PTR:.+]] = getelementptr <2 x float>, ptr addrspace(1) %output, i64 %[[GID]]
 ; CHECK:  store <8 x float> %[[UNSQUASH]], ptr addrspace(1) %[[OUTPUT_PTR]], align 8
 ; CHECK:  ret void
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_analysis.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_analysis.ll
index 5e1cf09efa858..3bf5a299acc07 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_analysis.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_analysis.ll
@@ -30,48 +30,48 @@ entry:
 ; CHECK-NEXT: uniform
   %lduniform = load i8, ptr addrspace(1) %input, align 1
 
-; CHECK: Stride for %arrayidx0 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %globalid0
+; CHECK: Stride for %arrayidx0 = getelementptr i8, ptr addrspace(1) %input, i64 %globalid0
 ; CHECK-NEXT: linear stride of 1
   %arrayidx0 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %globalid0
   %ld0 = load i8, ptr addrspace(1) %arrayidx0, align 1
 
   %truncglobalid0 = trunc i64 %globalid0 to i32
 
-; CHECK: Stride for %arrayidx1 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %sexttruncglobalid0
+; CHECK: Stride for %arrayidx1 = getelementptr i8, ptr addrspace(1) %input, i64 %sexttruncglobalid0
 ; CHECK-NEXT: linear stride of 1
   %sexttruncglobalid0 = sext i32 %truncglobalid0 to i64
   %arrayidx1 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %sexttruncglobalid0
   %ld1 = load i8, ptr addrspace(1) %arrayidx1, align 1
 
-; CHECK: Stride for %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %zexttruncglobalid0
+; CHECK: Stride for %arrayidx2 = getelementptr i8, ptr addrspace(1) %input, i64 %zexttruncglobalid0
 ; CHECK-NEXT: divergent
   %zexttruncglobalid0 = zext i32 %truncglobalid0 to i64
   %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %zexttruncglobalid0
   %ld2 = load i8, ptr addrspace(1) %arrayidx2, align 1
 
-; CHECK: Stride for %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %input, i64 %globalid0
+; CHECK: Stride for %arrayidx3 = getelementptr i32, ptr addrspace(1) %input, i64 %globalid0
 ; CHECK-NEXT: linear stride of 4
   %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %input, i64 %globalid0
   %ld3 = load i8, ptr addrspace(1) %arrayidx3, align 1
 
-; CHECK: Stride for %arrayidx4 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %globalid0mul8
+; CHECK: Stride for %arrayidx4 = getelementptr i8, ptr addrspace(1) %input, i64 %globalid0mul8
 ; CHECK-NEXT: linear stride of 8
   %globalid0mul8 = mul i64 %globalid0, 8
   %arrayidx4 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %globalid0mul8
   %ld4 = load i8, ptr addrspace(1) %arrayidx4, align 1
 
-; CHECK: Stride for %arrayidx5 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %globalid0mul16
+; CHECK: Stride for %arrayidx5 = getelementptr i8, ptr addrspace(1) %input, i64 %globalid0mul16
 ; CHECK-NEXT: linear stride of 16
   %globalid0mul16 = mul i64 %globalid0mul8, 2
   %arrayidx5 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %globalid0mul16
   %ld5 = load i8, ptr addrspace(1) %arrayidx5, align 1
 
-; CHECK: Stride for %arrayidx6 = getelementptr inbounds i32, ptr addrspace(1) %input, i64 %globalid0mul8
+; CHECK: Stride for %arrayidx6 = getelementptr i32, ptr addrspace(1) %input, i64 %globalid0mul8
 ; CHECK-NEXT: linear stride of 32
   %arrayidx6 = getelementptr inbounds i32, ptr addrspace(1) %input, i64 %globalid0mul8
   %ld6 = load i32, ptr addrspace(1) %arrayidx6, align 1
 
-; CHECK: Stride for %arrayidx7 = getelementptr inbounds i16, ptr addrspace(1) %input, i64 %idxprom7
+; CHECK: Stride for %arrayidx7 = getelementptr i16, ptr addrspace(1) %input, i64 %idxprom7
 ; CHECK-NEXT: linear stride of 2
   %mul7 = mul i64 %localsize0, %groupid0
   %add7 = add i64 %mul7, %localid0
@@ -81,7 +81,7 @@ entry:
   %arrayidx7 = getelementptr inbounds i16, ptr addrspace(1) %input, i64 %idxprom7
   %ld7 = load i16, ptr addrspace(1) %arrayidx7, align 1
 
-; CHECK: Stride for %arrayidx8 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %idxprom8
+; CHECK: Stride for %arrayidx8 = getelementptr i8, ptr addrspace(1) %input, i64 %idxprom8
 ; CHECK-NEXT: divergent
   %mul8 = mul i64 %localsize0, %groupid0
   %add8 = add i64 %mul8, %localid0
@@ -91,7 +91,7 @@ entry:
   %arrayidx8 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %idxprom8
   %ld8 = load i8, ptr addrspace(1) %arrayidx8, align 1
 
-; CHECK: Stride for %arrayidx9 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %idxprom9
+; CHECK: Stride for %arrayidx9 = getelementptr i8, ptr addrspace(1) %input, i64 %idxprom9
 ; CHECK-NEXT: divergent
   %mul9 = mul i64 %groupid0, %localsize0
   %add9 = add nuw nsw i64 %localid0, 4294967295
@@ -115,7 +115,7 @@ entry:
   %conv = add i32 %0, -1
   %trunclocalsize0 = trunc i64 %localsize0 to i32
 
-; CHECK: Stride for %arrayidx_pre = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %idxprom_pre
+; CHECK: Stride for %arrayidx_pre = getelementptr i8, ptr addrspace(1) %input, i64 %idxprom_pre
 ; CHECK-NEXT: divergent
   %idxprom_pre = zext i32 %conv to i64
   %arrayidx_pre = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %idxprom_pre
@@ -126,7 +126,7 @@ entry:
 for.body:
 ; The below is fundamentally the same stride calculation as %arrayidx_pre -
 ; make sure the loop and the PHI don't throw off the analysis.
-; CHECK: Stride for %arrayidx_loop = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %idxprom_loop
+; CHECK: Stride for %arrayidx_loop = getelementptr i8, ptr addrspace(1) %input, i64 %idxprom_loop
 ; CHECK-NEXT: divergent
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
   %gx2.050.us = phi i32 [ %conv, %entry ], [ %conv26.us, %for.body ]
@@ -154,7 +154,7 @@ entry:
   %add = add i64 %mul, %localid0
   %addtrunc = trunc i64 %add to i32
 
-; CHECK: Stride for %arrayidx0 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %idxprom0
+; CHECK: Stride for %arrayidx0 = getelementptr i8, ptr addrspace(1) %input, i64 %idxprom0
 ; CHECK-NEXT: divergent
   %idxprom0 = zext i32 %addtrunc to i64
   %arrayidx0 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %idxprom0
@@ -162,7 +162,7 @@ entry:
 
 ; The below is fundamentally the same stride calculation as %arrayidx0 - make
 ; sure the select doesn't throw off the analysis.
-; CHECK: Stride for %arrayidx1 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %idxprom1
+; CHECK: Stride for %arrayidx1 = getelementptr i8, ptr addrspace(1) %input, i64 %idxprom1
 ; CHECK-NEXT: divergent
   %sel1 = select i1 %cmp, i32 %addtrunc, i32 %addtrunc
   %idxprom1 = zext i32 %sel1 to i64
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_broadcast.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_broadcast.ll
index 2344f68691f2f..d2fc09ce1d187 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_broadcast.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_broadcast.ll
@@ -41,5 +41,5 @@ define spir_kernel void @sub_group_broadcast(i32 addrspace(1)* %in, i32 addrspac
 ; CHECK: [[BCAST:%.+]] = shufflevector <4 x i32> [[INS]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK: %idx = tail call i32 @__mux_get_sub_group_local_id()
 ; CHECK: [[EXT:%.*]] = sext i32 %idx to i64
-; CHECK: %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 [[EXT]]
+; CHECK: %arrayidx2 = getelementptr i32, ptr addrspace(1) %out, i64 [[EXT]]
 ; CHECK: store <4 x i32> [[BCAST]], ptr addrspace(1) %arrayidx2, align 4
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll
index a07f2b7dda6d0..6460d40e1acae 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll
@@ -50,7 +50,7 @@ define spir_kernel void @get_sub_group_local_id(i32 addrspace(1)* %in, i32 addrs
 ; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK: [[ID:%.*]] = or {{(disjoint )?}}<4 x i32> [[SPLAT]], <i32 0, i32 1, i32 2, i32 3>
 ; CHECK: [[EXT:%.*]] = sext i32 %call to i64
-; CHECK: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %out, i64 [[EXT]]
+; CHECK: %arrayidx = getelementptr i32, ptr addrspace(1) %out, i64 [[EXT]]
 ; CHECK: store <4 x i32> [[ID]], ptr addrspace(1) %arrayidx
 }
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_base.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_base.ll
index 8f5e9a83edffd..a5d9c7b811555 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_base.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_base.ll
@@ -49,8 +49,8 @@ declare i64 @__mux_get_global_id(i32) local_unnamed_addr #1
 ; CHECK: define spir_kernel void @__vecz_v4_uniform_address_index
 ; CHECK: entry:
 ; CHECK: call i64 @__mux_get_global_id(i32 0)
-; CHECK-DAG: %[[INA:.+]] = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %[[X:.+]]
+; CHECK-DAG: %[[INA:.+]] = getelementptr i32, ptr addrspace(1) %in, i32 %[[X:.+]]
 ; CHECK-DAG: %[[LOAD:.+]] = load <4 x i32>, ptr addrspace(1) %[[INA]]
-; CHECK-DAG: %[[OUTA:.+]] = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %[[X:.+]]
+; CHECK-DAG: %[[OUTA:.+]] = getelementptr i32, ptr addrspace(1) %out, i32 %[[X:.+]]
 ; CHECK-DAG: store <4 x i32> %[[LOAD]], ptr addrspace(1) %[[OUTA]]
 ; CHECK-NOT: call <4 x i32>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_index.ll
index 8f5e9a83edffd..a5d9c7b811555 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_index.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_index.ll
@@ -49,8 +49,8 @@ declare i64 @__mux_get_global_id(i32) local_unnamed_addr #1
 ; CHECK: define spir_kernel void @__vecz_v4_uniform_address_index
 ; CHECK: entry:
 ; CHECK: call i64 @__mux_get_global_id(i32 0)
-; CHECK-DAG: %[[INA:.+]] = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %[[X:.+]]
+; CHECK-DAG: %[[INA:.+]] = getelementptr i32, ptr addrspace(1) %in, i32 %[[X:.+]]
 ; CHECK-DAG: %[[LOAD:.+]] = load <4 x i32>, ptr addrspace(1) %[[INA]]
-; CHECK-DAG: %[[OUTA:.+]] = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %[[X:.+]]
+; CHECK-DAG: %[[OUTA:.+]] = getelementptr i32, ptr addrspace(1) %out, i32 %[[X:.+]]
 ; CHECK-DAG: store <4 x i32> %[[LOAD]], ptr addrspace(1) %[[OUTA]]
 ; CHECK-NOT: call <4 x i32>

From 18194f12e9281558626d568f4bbb0f2314d28cec Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Mon, 7 Oct 2024 18:44:03 +0100
Subject: [PATCH 123/182] Upgrade clang-format & clang-tidy.

We were on version 17 before; this moves us to version 19.
---
 .../include/compiler/utils/mangling.h         |   8 +-
 .../compiler/utils/sub_group_analysis.h       |   2 +-
 .../source/barrier_regions.cpp                |   3 +-
 .../source/cl_builtin_info.cpp                | 162 ++++++++++--------
 .../source/mux_builtin_info.cpp               |   2 +-
 .../optimal_builtin_replacement_pass.cpp      |   2 +-
 .../source/pass_functions.cpp                 |  34 ++--
 ...lace_local_module_scope_variables_pass.cpp |   5 +-
 .../source/work_item_loops_pass.cpp           |   2 +-
 .../vecz/include/vecz/vecz_target_info.h      |   6 +
 .../source/analysis/liveness_analysis.cpp     |   2 +-
 .../vecz/source/analysis/stride_analysis.cpp  |  17 +-
 .../vectorizable_function_analysis.cpp        |   2 +-
 .../include/analysis/divergence_analysis.h    |   2 +-
 .../vecz/source/include/transform/passes.h    |   2 +-
 .../vecz/source/ir_cleanup.cpp                |   3 +-
 .../compiler_passes/vecz/source/pass.cpp      |   4 +-
 .../control_flow_conversion_pass.cpp          |   2 +-
 .../interleaved_group_combine_pass.cpp        |   8 +-
 .../transform/packetization_helpers.cpp       |   5 +-
 .../vecz/source/transform/packetizer.cpp      |   6 +-
 .../source/transform/printf_scalarizer.cpp    |   2 +-
 .../vecz/source/transform/scalarizer.cpp      |  18 +-
 .../vecz/source/vector_target_info.cpp        |   8 -
 .../vecz/source/vector_target_info_riscv.cpp  |   5 +-
 .../vecz/source/vectorization_context.cpp     |  14 +-
 .../vecz/source/vectorization_heuristics.cpp  |   2 +-
 .../vecz/source/vectorizer.cpp                |   6 +-
 .../vecz/source/vecz_pass_builder.cpp         |   2 +-
 29 files changed, 172 insertions(+), 164 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/mangling.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/mangling.h
index cec95aeb0bc55..087acd2518549 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/mangling.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/mangling.h
@@ -132,16 +132,16 @@ class TypeQualifiers final {
   StorageT storage_;
 
   /// @brief Number of bits used to encode the size of the list.
-  const static unsigned NumCountBits = 4;
+  static const unsigned NumCountBits = 4;
 
   /// @brief Number of bits used to encode one qualifier in the list.
-  const static unsigned NumQualBits = 10;
+  static const unsigned NumQualBits = 10;
 
   /// @brief Number of bits that can be used to store the list.
-  const static unsigned NumStorageBits = sizeof(StorageT) * 8;
+  static const unsigned NumStorageBits = sizeof(StorageT) * 8;
 
   /// @brief Maximum size of the list.
-  const static unsigned MaxSize = (NumStorageBits - NumCountBits) / NumQualBits;
+  static const unsigned MaxSize = (NumStorageBits - NumCountBits) / NumQualBits;
 
   static_assert(MaxSize < (1 << NumCountBits) - 1, "MaxSize cannot be encoded");
 };
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/sub_group_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/sub_group_analysis.h
index fb19fc956027b..822bf9f7a2f47 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/sub_group_analysis.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/sub_group_analysis.h
@@ -63,7 +63,7 @@ class GlobalSubgroupInfo {
   /// @brief Returns the SubgroupInfo for the provided function.
   ///
   /// The function must already exist in the map.
-  inline const SubgroupInfo *operator[](const llvm::Function *F) const {
+  const SubgroupInfo *operator[](const llvm::Function *F) const {
     const const_iterator I = FunctionMap.find(F);
     assert(I != FunctionMap.end() && "Function not in sub-group info!");
     return I->second.get();
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
index 97b035374388a..03118c52da90e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
@@ -1050,8 +1050,7 @@ Function *compiler::utils::Barrier::GenerateNewKernel(BarrierRegion &region) {
   BasicBlock *entry_point = region.entry;
   LLVMContext &context = module_.getContext();
 
-  LLVM_DEBUG(dbgs() << "\n"; unsigned i = 0; for (auto *d
-                                                  : region.blocks) {
+  LLVM_DEBUG(dbgs() << "\n"; unsigned i = 0; for (auto *d : region.blocks) {
     dbgs() << "entry block: " << entry_point->getName() << "\n";
     dbgs() << "region visited path [" << i++ << "] = " << d->getName()
            << "\n\n";
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp
index b37eea39b46eb..08f7ef2f868e8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp
@@ -1541,10 +1541,13 @@ Value *CLBuiltinInfo::emitBuiltinInline(Function *F, IRBuilder<> &B,
           return nullptr;
         }
         const bool IsSigned = *IsParamSignedOrNone;
-        const Intrinsic::ID IntrinsicOpc =
-            BuiltinID == eCLBuiltinSubSat
-                ? (IsSigned ? Intrinsic::ssub_sat : Intrinsic::usub_sat)
-                : (IsSigned ? Intrinsic::sadd_sat : Intrinsic::uadd_sat);
+        const Intrinsic::ID IntrinsicOpc = [=] {
+          if (BuiltinID == eCLBuiltinSubSat) {
+            return IsSigned ? Intrinsic::ssub_sat : Intrinsic::usub_sat;
+          } else {
+            return IsSigned ? Intrinsic::sadd_sat : Intrinsic::uadd_sat;
+          }
+        }();
         return emitBuiltinInlineAsLLVMBinaryIntrinsic(B, Args[0], Args[1],
                                                       IntrinsicOpc);
       }
@@ -2621,7 +2624,7 @@ Value *CLBuiltinInfo::emitBuiltinInlineShuffle(BuiltinID BuiltinID,
   assert(Width != 3 && "Invalid vector width of 3!");
   const int N = (Width == 3 ? 4 : Width);
   const int SignificantBits =
-      stdcompat::ilogb(2 * N - 1) + (isShuffle2 ? 1 : 0);
+      stdcompat::ilogb((2 * N) - 1) + (isShuffle2 ? 1 : 0);
   const unsigned BitMask = ~((~0u) << SignificantBits);
   Value *BitMaskV = ConstantVector::getSplat(ElementCount::getFixed(MaskWidth),
                                              ConstantInt::get(MaskTy, BitMask));
@@ -3171,8 +3174,7 @@ Instruction *CLBuiltinInfo::lowerGroupBuiltinToMuxBuiltin(
     SmallVector<Type *, 4> ArgumentTypes;
     SmallVector<compiler::utils::TypeQualifiers, 4> Qualifiers;
 
-    const auto DemangledName = std::string(
-        Mangler.demangleName(F->getName(), ArgumentTypes, Qualifiers));
+    Mangler.demangleName(F->getName(), ArgumentTypes, Qualifiers);
 
     assert(Qualifiers.size() == 1 && ArgumentTypes.size() == 1 &&
            "Unknown collective builtin");
@@ -3188,116 +3190,124 @@ Instruction *CLBuiltinInfo::lowerGroupBuiltinToMuxBuiltin(
       default:
         llvm_unreachable("unknown group operation for which to check the type");
       case eMuxBuiltinSubgroupReduceAdd:
-        MuxBuiltinID = IsFP ? eMuxBuiltinSubgroupReduceFAdd : MuxBuiltinID;
+        if (IsFP) MuxBuiltinID = eMuxBuiltinSubgroupReduceFAdd;
         break;
       case eMuxBuiltinSubgroupReduceMul:
-        MuxBuiltinID = IsFP ? eMuxBuiltinSubgroupReduceFMul : MuxBuiltinID;
+        if (IsFP) MuxBuiltinID = eMuxBuiltinSubgroupReduceFMul;
         break;
       case eMuxBuiltinSubgroupReduceUMin:
-        MuxBuiltinID =
-            IsFP ? eMuxBuiltinSubgroupReduceFMin
-                 : (IsSignedInt ? eMuxBuiltinSubgroupReduceSMin : MuxBuiltinID);
+        if (IsFP) {
+          MuxBuiltinID = eMuxBuiltinSubgroupReduceFMin;
+        } else if (IsSignedInt) {
+          MuxBuiltinID = eMuxBuiltinSubgroupReduceSMin;
+        }
         break;
       case eMuxBuiltinSubgroupReduceUMax:
-        MuxBuiltinID =
-            IsFP ? eMuxBuiltinSubgroupReduceFMax
-                 : (IsSignedInt ? eMuxBuiltinSubgroupReduceSMax : MuxBuiltinID);
+        if (IsFP) {
+          MuxBuiltinID = eMuxBuiltinSubgroupReduceFMax;
+        } else if (IsSignedInt) {
+          MuxBuiltinID = eMuxBuiltinSubgroupReduceSMax;
+        }
         break;
       case eMuxBuiltinSubgroupScanAddInclusive:
-        MuxBuiltinID =
-            IsFP ? eMuxBuiltinSubgroupScanFAddInclusive : MuxBuiltinID;
+        if (IsFP) MuxBuiltinID = eMuxBuiltinSubgroupScanFAddInclusive;
         break;
       case eMuxBuiltinSubgroupScanAddExclusive:
-        MuxBuiltinID =
-            IsFP ? eMuxBuiltinSubgroupScanFAddExclusive : MuxBuiltinID;
+        if (IsFP) MuxBuiltinID = eMuxBuiltinSubgroupScanFAddExclusive;
         break;
       case eMuxBuiltinSubgroupScanMulInclusive:
-        MuxBuiltinID =
-            IsFP ? eMuxBuiltinSubgroupScanFMulInclusive : MuxBuiltinID;
+        if (IsFP) MuxBuiltinID = eMuxBuiltinSubgroupScanFMulInclusive;
         break;
       case eMuxBuiltinSubgroupScanMulExclusive:
-        MuxBuiltinID =
-            IsFP ? eMuxBuiltinSubgroupScanFMulExclusive : MuxBuiltinID;
+        if (IsFP) MuxBuiltinID = eMuxBuiltinSubgroupScanFMulExclusive;
         break;
       case eMuxBuiltinSubgroupScanUMinInclusive:
-        MuxBuiltinID = IsFP
-                           ? eMuxBuiltinSubgroupScanFMinInclusive
-                           : (IsSignedInt ? eMuxBuiltinSubgroupScanSMinInclusive
-                                          : MuxBuiltinID);
+        if (IsFP) {
+          MuxBuiltinID = eMuxBuiltinSubgroupScanFMinInclusive;
+        } else if (IsSignedInt) {
+          MuxBuiltinID = eMuxBuiltinSubgroupScanSMinInclusive;
+        }
         break;
       case eMuxBuiltinSubgroupScanUMinExclusive:
-        MuxBuiltinID = IsFP
-                           ? eMuxBuiltinSubgroupScanFMinExclusive
-                           : (IsSignedInt ? eMuxBuiltinSubgroupScanSMinExclusive
-                                          : MuxBuiltinID);
+        if (IsFP) {
+          MuxBuiltinID = eMuxBuiltinSubgroupScanFMinExclusive;
+        } else if (IsSignedInt) {
+          MuxBuiltinID = eMuxBuiltinSubgroupScanSMinExclusive;
+        }
         break;
       case eMuxBuiltinSubgroupScanUMaxInclusive:
-        MuxBuiltinID = IsFP
-                           ? eMuxBuiltinSubgroupScanFMaxInclusive
-                           : (IsSignedInt ? eMuxBuiltinSubgroupScanSMaxInclusive
-                                          : MuxBuiltinID);
+        if (IsFP) {
+          MuxBuiltinID = eMuxBuiltinSubgroupScanFMaxInclusive;
+        } else if (IsSignedInt) {
+          MuxBuiltinID = eMuxBuiltinSubgroupScanSMaxInclusive;
+        }
         break;
       case eMuxBuiltinSubgroupScanUMaxExclusive:
-        MuxBuiltinID = IsFP
-                           ? eMuxBuiltinSubgroupScanFMaxExclusive
-                           : (IsSignedInt ? eMuxBuiltinSubgroupScanSMaxExclusive
-                                          : MuxBuiltinID);
+        if (IsFP) {
+          MuxBuiltinID = eMuxBuiltinSubgroupScanFMaxExclusive;
+        } else if (IsSignedInt) {
+          MuxBuiltinID = eMuxBuiltinSubgroupScanSMaxExclusive;
+        }
         break;
       case eMuxBuiltinWorkgroupReduceAdd:
-        MuxBuiltinID = IsFP ? eMuxBuiltinWorkgroupReduceFAdd : MuxBuiltinID;
+        if (IsFP) MuxBuiltinID = eMuxBuiltinWorkgroupReduceFAdd;
         break;
       case eMuxBuiltinWorkgroupReduceMul:
-        MuxBuiltinID = IsFP ? eMuxBuiltinWorkgroupReduceFMul : MuxBuiltinID;
+        if (IsFP) MuxBuiltinID = eMuxBuiltinWorkgroupReduceFMul;
         break;
       case eMuxBuiltinWorkgroupReduceUMin:
-        MuxBuiltinID = IsFP ? eMuxBuiltinWorkgroupReduceFMin
-                            : (IsSignedInt ? eMuxBuiltinWorkgroupReduceSMin
-                                           : MuxBuiltinID);
+        if (IsFP) {
+          MuxBuiltinID = eMuxBuiltinWorkgroupReduceFMin;
+        } else if (IsSignedInt) {
+          MuxBuiltinID = eMuxBuiltinWorkgroupReduceSMin;
+        }
         break;
       case eMuxBuiltinWorkgroupReduceUMax:
-        MuxBuiltinID = IsFP ? eMuxBuiltinWorkgroupReduceFMax
-                            : (IsSignedInt ? eMuxBuiltinWorkgroupReduceSMax
-                                           : MuxBuiltinID);
+        if (IsFP) {
+          MuxBuiltinID = eMuxBuiltinWorkgroupReduceFMax;
+        } else if (IsSignedInt) {
+          MuxBuiltinID = eMuxBuiltinWorkgroupReduceSMax;
+        }
         break;
       case eMuxBuiltinWorkgroupScanAddInclusive:
-        MuxBuiltinID =
-            IsFP ? eMuxBuiltinWorkgroupScanFAddInclusive : MuxBuiltinID;
+        if (IsFP) MuxBuiltinID = eMuxBuiltinWorkgroupScanFAddInclusive;
         break;
       case eMuxBuiltinWorkgroupScanAddExclusive:
-        MuxBuiltinID =
-            IsFP ? eMuxBuiltinWorkgroupScanFAddExclusive : MuxBuiltinID;
+        if (IsFP) MuxBuiltinID = eMuxBuiltinWorkgroupScanFAddExclusive;
         break;
       case eMuxBuiltinWorkgroupScanMulInclusive:
-        MuxBuiltinID =
-            IsFP ? eMuxBuiltinWorkgroupScanFMulInclusive : MuxBuiltinID;
+        if (IsFP) MuxBuiltinID = eMuxBuiltinWorkgroupScanFMulInclusive;
         break;
       case eMuxBuiltinWorkgroupScanMulExclusive:
-        MuxBuiltinID =
-            IsFP ? eMuxBuiltinWorkgroupScanFMulExclusive : MuxBuiltinID;
+        if (IsFP) MuxBuiltinID = eMuxBuiltinWorkgroupScanFMulExclusive;
         break;
       case eMuxBuiltinWorkgroupScanUMinInclusive:
-        MuxBuiltinID =
-            IsFP ? eMuxBuiltinWorkgroupScanFMinInclusive
-                 : (IsSignedInt ? eMuxBuiltinWorkgroupScanSMinInclusive
-                                : MuxBuiltinID);
+        if (IsFP) {
+          MuxBuiltinID = eMuxBuiltinWorkgroupScanFMinInclusive;
+        } else if (IsSignedInt) {
+          MuxBuiltinID = eMuxBuiltinWorkgroupScanSMinInclusive;
+        }
         break;
       case eMuxBuiltinWorkgroupScanUMinExclusive:
-        MuxBuiltinID =
-            IsFP ? eMuxBuiltinWorkgroupScanFMinExclusive
-                 : (IsSignedInt ? eMuxBuiltinWorkgroupScanSMinExclusive
-                                : MuxBuiltinID);
+        if (IsFP) {
+          MuxBuiltinID = eMuxBuiltinWorkgroupScanFMinExclusive;
+        } else if (IsSignedInt) {
+          MuxBuiltinID = eMuxBuiltinWorkgroupScanSMinExclusive;
+        }
         break;
       case eMuxBuiltinWorkgroupScanUMaxInclusive:
-        MuxBuiltinID =
-            IsFP ? eMuxBuiltinWorkgroupScanFMaxInclusive
-                 : (IsSignedInt ? eMuxBuiltinWorkgroupScanSMaxInclusive
-                                : MuxBuiltinID);
+        if (IsFP) {
+          MuxBuiltinID = eMuxBuiltinWorkgroupScanFMaxInclusive;
+        } else if (IsSignedInt) {
+          MuxBuiltinID = eMuxBuiltinWorkgroupScanSMaxInclusive;
+        }
         break;
       case eMuxBuiltinWorkgroupScanUMaxExclusive:
-        MuxBuiltinID =
-            IsFP ? eMuxBuiltinWorkgroupScanFMaxExclusive
-                 : (IsSignedInt ? eMuxBuiltinWorkgroupScanSMaxExclusive
-                                : MuxBuiltinID);
+        if (IsFP) {
+          MuxBuiltinID = eMuxBuiltinWorkgroupScanFMaxExclusive;
+        } else if (IsSignedInt) {
+          MuxBuiltinID = eMuxBuiltinWorkgroupScanSMaxExclusive;
+        }
         break;
     }
   }
@@ -3416,9 +3426,13 @@ Instruction *CLBuiltinInfo::lowerAsyncBuiltinToMuxBuiltin(
           IsStrided ? ElementSize
                     : B.CreateMul(ElementSize, NumElements, "width.bytes");
 
-      const BuiltinID MuxBuiltinID =
-          IsRead ? (IsStrided ? eMuxBuiltinDMARead2D : eMuxBuiltinDMARead1D)
-                 : (IsStrided ? eMuxBuiltinDMAWrite2D : eMuxBuiltinDMAWrite1D);
+      const BuiltinID MuxBuiltinID = [&] {
+        if (IsRead) {
+          return IsStrided ? eMuxBuiltinDMARead2D : eMuxBuiltinDMARead1D;
+        } else {
+          return IsStrided ? eMuxBuiltinDMAWrite2D : eMuxBuiltinDMAWrite1D;
+        }
+      }();
 
       auto *const MuxDMA =
           BIMuxImpl.getOrDeclareMuxBuiltin(MuxBuiltinID, M, EventIn->getType());
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mux_builtin_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mux_builtin_info.cpp
index 32c31877b0d35..a84ee21df5e6f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mux_builtin_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mux_builtin_info.cpp
@@ -39,7 +39,7 @@ namespace SchedParamIndices {
 enum {
   WI = 0,
   WG = 1,
-  TOTAL,
+  TOTAL = 2,
 };
 }
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/optimal_builtin_replacement_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/optimal_builtin_replacement_pass.cpp
index dec9f0958b229..af89fc5f219bd 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/optimal_builtin_replacement_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/optimal_builtin_replacement_pass.cpp
@@ -232,7 +232,7 @@ PreservedAnalyses OptimalBuiltinReplacementPass::run(LazyCallGraph::SCC &C,
         [BI](CallBase &CB, StringRef, const SmallVectorImpl<Type *> &,
              const SmallVectorImpl<TypeQualifiers> &) -> Value * {
           Function *Callee = CB.getCalledFunction();
-          auto const Props = BI->analyzeBuiltin(*Callee).properties;
+          const auto Props = BI->analyzeBuiltin(*Callee).properties;
           if (Props & eBuiltinPropertyCanEmitInline) {
             IRBuilder<> B(&CB);
             const SmallVector<Value *, 4> Args(CB.args());
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp
index 4bcf4f39cb63b..43dea54c8c490 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp
@@ -122,7 +122,7 @@ bool funcContainsDebugMetadata(const llvm::Function &func,
 
   for (auto &BB : func) {
     for (auto &Inst : BB) {
-      if (auto DL = Inst.getDebugLoc()) {
+      if (const auto &DL = Inst.getDebugLoc()) {
         llvm::DILocation *loc = DL.get();
         vmap.MD()[loc].reset(loc);
         foundDI = true;
@@ -689,7 +689,7 @@ llvm::Value *createBinOpForRecurKind(llvm::IRBuilderBase &B, llvm::Value *LHS,
                                      llvm::Value *RHS, llvm::RecurKind Kind) {
   switch (Kind) {
     default:
-      break;
+      llvm_unreachable("Unexpected Kind");
     case llvm::RecurKind::None:
       return nullptr;
     case llvm::RecurKind::Add:
@@ -702,29 +702,23 @@ llvm::Value *createBinOpForRecurKind(llvm::IRBuilderBase &B, llvm::Value *LHS,
       return B.CreateAnd(LHS, RHS);
     case llvm::RecurKind::Xor:
       return B.CreateXor(LHS, RHS);
+    case llvm::RecurKind::SMin:
+      return B.CreateBinaryIntrinsic(llvm::Intrinsic::smin, LHS, RHS);
+    case llvm::RecurKind::UMin:
+      return B.CreateBinaryIntrinsic(llvm::Intrinsic::umin, LHS, RHS);
+    case llvm::RecurKind::SMax:
+      return B.CreateBinaryIntrinsic(llvm::Intrinsic::smax, LHS, RHS);
+    case llvm::RecurKind::UMax:
+      return B.CreateBinaryIntrinsic(llvm::Intrinsic::umax, LHS, RHS);
     case llvm::RecurKind::FAdd:
       return B.CreateFAdd(LHS, RHS);
     case llvm::RecurKind::FMul:
       return B.CreateFMul(LHS, RHS);
+    case llvm::RecurKind::FMin:
+      return B.CreateBinaryIntrinsic(llvm::Intrinsic::minnum, LHS, RHS);
+    case llvm::RecurKind::FMax:
+      return B.CreateBinaryIntrinsic(llvm::Intrinsic::maxnum, LHS, RHS);
   }
-  assert((Kind == llvm::RecurKind::FMin || Kind == llvm::RecurKind::FMax ||
-          Kind == llvm::RecurKind::SMin || Kind == llvm::RecurKind::SMax ||
-          Kind == llvm::RecurKind::UMin || Kind == llvm::RecurKind::UMax) &&
-         "Unexpected min/max Kind");
-  if (Kind == llvm::RecurKind::FMin || Kind == llvm::RecurKind::FMax) {
-    return B.CreateBinaryIntrinsic(Kind == llvm::RecurKind::FMin
-                                       ? llvm::Intrinsic::minnum
-                                       : llvm::Intrinsic::maxnum,
-                                   LHS, RHS);
-  }
-  const bool isMin =
-      Kind == llvm::RecurKind::SMin || Kind == llvm::RecurKind::UMin;
-  const bool isSigned =
-      Kind == llvm::RecurKind::SMin || Kind == llvm::RecurKind::SMax;
-  const llvm::Intrinsic::ID intrOpc =
-      isMin ? (isSigned ? llvm::Intrinsic::smin : llvm::Intrinsic::umin)
-            : (isSigned ? llvm::Intrinsic::smax : llvm::Intrinsic::umax);
-  return B.CreateBinaryIntrinsic(intrOpc, LHS, RHS);
 }
 
 }  // namespace utils
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp
index d45cdd5fcbce3..42e14bf8e80fb 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp
@@ -29,6 +29,7 @@
 #include <llvm/Transforms/Utils/Cloning.h>
 #include <multi_llvm/vector_type_helper.h>
 
+#include <algorithm>
 #include <cassert>
 #include <functional>
 
@@ -311,9 +312,7 @@ PreservedAnalyses compiler::utils::ReplaceLocalModuleScopeVariablesPass::run(
     assert(alignment > 0 && "'0' is an impossible alignment");
 
     // check if this is the largest alignment seen so far
-    if (alignment > maxAlignment) {
-      maxAlignment = alignment;
-    }
+    maxAlignment = std::max(alignment, maxAlignment);
 
     // check if member is not already aligned
     const unsigned int remainder = offset % alignment;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
index b1c6a5b896a0a..074b1eb95bde9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
@@ -1405,7 +1405,7 @@ Function *compiler::utils::WorkItemLoopsPass::makeWrapperFunction(
   // An inlinable function call in a function with debug info *must* be given
   // a debug location.
   DILocation *wrapperDbgLoc = nullptr;
-  if (auto *const SP = new_wrapper->getSubprogram()) {
+  if (new_wrapper->getSubprogram()) {
     wrapperDbgLoc = DILocation::get(context, /*line*/ 0, /*col*/ 0,
                                     new_wrapper->getSubprogram());
   }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_target_info.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_target_info.h
index 3b66d21951ac4..d922eb6e9bd7a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_target_info.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_target_info.h
@@ -684,6 +684,12 @@ class TargetInfoAnalysis : public llvm::AnalysisInfoMixin<TargetInfoAnalysis> {
   CallbackFn TICallback;
 };
 
+std::unique_ptr<TargetInfo> createTargetInfoArm(llvm::TargetMachine *tm);
+
+std::unique_ptr<TargetInfo> createTargetInfoAArch64(llvm::TargetMachine *tm);
+
+std::unique_ptr<TargetInfo> createTargetInfoRISCV(llvm::TargetMachine *tm);
+
 /// @brief Create a new vector target info instance.
 /// @param[in] tm LLVM target machine that will be used for compilation, can
 /// be NULL if no target data is available.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/liveness_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/liveness_analysis.cpp
index 22d1b267f6a31..d7ef1aecfa07e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/liveness_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/liveness_analysis.cpp
@@ -89,7 +89,7 @@ class LivenessResult::Impl {
   void calculateMaxRegistersInBlock(const llvm::BasicBlock *BB);
 
   // private utility method for code conciseness
-  inline BlockLivenessInfo &info(const BasicBlock *BB) const {
+  BlockLivenessInfo &info(const BasicBlock *BB) const {
     auto BIi = LR.BlockInfos.find(BB);
     assert(BIi != LR.BlockInfos.end() && "Block Liveness Info does not exist!");
     return BIi->second;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/stride_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/stride_analysis.cpp
index de1f24c6ae67c..745a3bd6f8381 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/stride_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/stride_analysis.cpp
@@ -102,13 +102,16 @@ PreservedAnalyses StrideAnalysisPrinterPass::run(Function &F,
           continue;
         }
         if (const OffsetInfo *Info = SAR.getInfo(Ptr)) {
-          OS << "* Stride for " << *Ptr << "\n";
-          OS << "  - "
-             << (Info->mayDiverge()
-                     ? "divergent"
-                     : (Info->hasStride()
-                            ? "linear"
-                            : (Info->isUniform() ? "uniform" : "unknown")));
+          OS << "* Stride for " << *Ptr << "\n  - ";
+          if (Info->mayDiverge()) {
+            OS << "divergent";
+          } else if (Info->hasStride()) {
+            OS << "linear";
+          } else if (Info->isUniform()) {
+            OS << "uniform";
+          } else {
+            OS << "unknown";
+          }
           if (Info->isStrideConstantInt()) {
             OS << " stride of " << Info->getStrideAsConstantInt();
           }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorizable_function_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorizable_function_analysis.cpp
index 988f3208ac213..96ee366e8fe86 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorizable_function_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorizable_function_analysis.cpp
@@ -36,7 +36,7 @@ llvm::AnalysisKey VectorizableFunctionAnalysis::Key;
 ///
 /// This flag is for testing and debugging purposes and it should not be used
 /// for normal code as instantiating undefined functions is not always valid.
-cl::opt<bool> HandleDeclOnlyCalls(
+static cl::opt<bool> HandleDeclOnlyCalls(
     "vecz-handle-declaration-only-calls",
     cl::desc("Go ahead and handle calls to declaration-only functions"));
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/divergence_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/divergence_analysis.h
index 49d2eafa5186c..4350da506c06a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/divergence_analysis.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/divergence_analysis.h
@@ -105,7 +105,7 @@ struct BlockQueue {
   index_list indices;
 
   /// @brief Constructs an empty BlockQueue
-  BlockQueue(const DivergenceResult &dr) : DR(dr){};
+  BlockQueue(const DivergenceResult &dr) : DR(dr) {};
 
   /// @brief Constructs a BlockQueue from a set of blocks.
   BlockQueue(const DivergenceResult &dr,
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/passes.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/passes.h
index a2d115a939589..1af52bb29a086 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/passes.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/passes.h
@@ -74,7 +74,7 @@ class BuiltinInliningPass : public llvm::PassInfoMixin<BuiltinInliningPass> {
 /// however much simpler than LLVM's.
 class BasicMem2RegPass : public llvm::PassInfoMixin<BasicMem2RegPass> {
  public:
-  BasicMem2RegPass(){};
+  BasicMem2RegPass() {};
 
   /// @brief The entry point to the pass.
   /// @param[in,out] F Function to optimize.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/ir_cleanup.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/ir_cleanup.cpp
index b8bcf24c46c0a..497cf0e8985de 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/ir_cleanup.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/ir_cleanup.cpp
@@ -131,8 +131,7 @@ void IRCleanup::deleteInstructions() {
   }
 
   // Remove remaining instructions from the list.
-  LLVM_DEBUG(for (Instruction *I
-                  : InstructionsToDelete) {
+  LLVM_DEBUG(for (Instruction *I : InstructionsToDelete) {
     dbgs() << "vecz: could not delete " << *I << "\n";
   });
   InstructionsToDelete.clear();
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp
index 8f90e78f17f86..0dc17a436e275 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp
@@ -49,7 +49,7 @@ using namespace llvm;
 /// @brief Provide debug logging for Vecz's PassManager
 ///
 /// This flag is intended for testing and debugging purposes.
-cl::opt<bool> DebugVeczPipeline(
+static cl::opt<bool> DebugVeczPipeline(
     "debug-vecz-pipeline",
     cl::desc("Enable debug logging of the vecz PassManager"));
 
@@ -57,7 +57,7 @@ cl::opt<bool> DebugVeczPipeline(
 ///
 /// This flag specifies a textual description of the optimization pass pipeline
 /// to run over the kernel.
-cl::opt<std::string> VeczPassPipeline(
+static cl::opt<std::string> VeczPassPipeline(
     "vecz-passes",
     cl::desc(
         "A textual description of the pass pipeline. To have analysis passes "
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
index ed1a5f9408d4a..38de9799dda40 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
@@ -2952,7 +2952,7 @@ bool ControlFlowConversionState::Impl::blendInstructions() {
     PHINode *PHI = PHINode::Create(T, numPreds, opDef->getName() + ".merge");
     multi_llvm::insertBefore(PHI, B->begin());
 
-    auto const *const LTag = DR->getTag(B).loop;
+    const auto *const LTag = DR->getTag(B).loop;
     bool hasVisitedPred = false;
     for (BasicBlock *pred : predecessors(B)) {
       Value *incomingV = nullptr;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/interleaved_group_combine_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/interleaved_group_combine_pass.cpp
index de1b2b5c1240e..9614bd6f4806d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/interleaved_group_combine_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/interleaved_group_combine_pass.cpp
@@ -275,9 +275,11 @@ PreservedAnalyses InterleavedGroupCombinePass::run(
       InterleavedOpInfo Info;
 
       const bool OpIsLoad = Op->isLoad();
-      Info.Kind = OpIsLoad
-                      ? (Mask ? eMaskedInterleavedLoad : eInterleavedLoad)
-                      : (Mask ? eMaskedInterleavedStore : eInterleavedStore);
+      if (OpIsLoad) {
+        Info.Kind = Mask ? eMaskedInterleavedLoad : eInterleavedLoad;
+      } else {
+        Info.Kind = Mask ? eMaskedInterleavedStore : eInterleavedStore;
+      }
       Info.Op = CI;
       Info.Stride = Stride;
       Info.Removed = false;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
index dd444cbffeee7..4afe43e375474 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
@@ -123,7 +123,7 @@ IRBuilder<> buildAfter(Value *V, Function &F, bool IsPhi) {
   return {&F.getEntryBlock(), it};
 }
 
-Constant *getShuffleMask(ShuffleVectorInst *shuffle) {
+static Constant *getShuffleMask(ShuffleVectorInst *shuffle) {
   // The mask value seems not to be a proper operand for LLVM 11.
   // NOTE this is marked as "temporary" in the docs!
   return shuffle->getShuffleMaskForBitcode();
@@ -520,7 +520,8 @@ void Packetizer::Result::getPacketValues(SmallVectorImpl<Value *> &vals) const {
   assert(info && "No packet info for this packetization result");
   const auto width = info->numInstances;
   if (width != 0) {
-    return getPacketValues(width, vals);
+    getPacketValues(width, vals);
+    return;
   }
 }
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
index 287e51508fd2e..69da33329b181 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -1124,7 +1124,7 @@ unsigned Packetizer::Impl::getPacketWidthForType(Type *ty,
   // Note that we don't really expect huge values here, over 16 is still
   // currently not officially supported, over 256 would be astonishing,
   // and over 65536 would be inconcievable, so we don't bother to >> 16.
-  unsigned width = fullWidth / maxWidth - 1;
+  unsigned width = (fullWidth / maxWidth) - 1;
   width |= width >> 1;
   width |= width >> 2;
   width |= width >> 4;
@@ -3997,7 +3997,7 @@ ValuePacket Packetizer::Impl::packetizeShuffleVector(
   const auto adjust =
       isa<UndefValue>(srcB) ? -scalarWidth : (width - 1) * scalarWidth;
   for (auto &idx : mask) {
-    if (idx != int(-1) && idx >= int(scalarWidth)) {
+    if (idx != -1 && idx >= int(scalarWidth)) {
       idx += adjust;
     }
   }
@@ -4008,7 +4008,7 @@ ValuePacket Packetizer::Impl::packetizeShuffleVector(
   for (unsigned i = 1, k = 0; i < width; ++i, k += size) {
     for (unsigned j = 0; j < size; ++j) {
       auto maskElem = mask[k + j];
-      if (maskElem != int(-1)) {
+      if (maskElem != -1) {
         maskElem += scalarWidth;
       }
       mask.push_back(maskElem);
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/printf_scalarizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/printf_scalarizer.cpp
index ff544ff69adaf..aa8ffedce3b9e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/printf_scalarizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/printf_scalarizer.cpp
@@ -75,7 +75,7 @@ std::string GetFormatStringAsString(Value *op) {
   return array_string->getAsString().str();
 }
 
-bool IncrementPtr(const char **fmt) {
+static bool IncrementPtr(const char **fmt) {
   if (*(++(*fmt)) == '\0') {
     return true;
   }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
index a95910f6e228b..b812d5b371f1e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
@@ -31,6 +31,8 @@
 #include <multi_llvm/multi_llvm.h>
 #include <multi_llvm/vector_type_helper.h>
 
+#include <algorithm>
+
 #include "debugging.h"
 #include "llvm_helpers.h"
 #include "memory_operations.h"
@@ -834,9 +836,7 @@ SimdPacket *Scalarizer::scalarizeLoad(LoadInst *Load, PacketMask PM) {
   // whole vector.
   const unsigned Alignment = Load->getAlign().value();
   unsigned EleAlign = ScalarEleTy->getPrimitiveSizeInBits() / 8;
-  if (Alignment < EleAlign) {
-    EleAlign = Alignment;
-  }
+  EleAlign = std::min(Alignment, EleAlign);
 
   // Emit scalarized loads.
   for (unsigned i = 0; i < SimdWidth; i++) {
@@ -919,9 +919,7 @@ SimdPacket *Scalarizer::scalarizeStore(StoreInst *Store, PacketMask PM) {
   // See comment at equivalent part of scalarizeLoad()
   const unsigned Alignment = Store->getAlign().value();
   unsigned EleAlign = ScalarEleTy->getPrimitiveSizeInBits() / 8;
-  if (Alignment < EleAlign) {
-    EleAlign = Alignment;
-  }
+  EleAlign = std::min(Alignment, EleAlign);
 
   // Emit scalarized stores.
   for (unsigned i = 0; i < SimdWidth; i++) {
@@ -1133,9 +1131,9 @@ SimdPacket *Scalarizer::scalarizeBitCast(BitCastInst *BC, PacketMask PM) {
           SrcPart = B.CreateZExt(SrcPart, DstEleIntTy);
         }
         if (i * DstEleSize > j * SrcEleSize) {
-          SrcPart = B.CreateLShr(SrcPart, i * DstEleSize - j * SrcEleSize);
+          SrcPart = B.CreateLShr(SrcPart, (i * DstEleSize) - (j * SrcEleSize));
         } else if (j * SrcEleSize > i * DstEleSize) {
-          SrcPart = B.CreateShl(SrcPart, j * SrcEleSize - i * DstEleSize);
+          SrcPart = B.CreateShl(SrcPart, (j * SrcEleSize) - (i * DstEleSize));
         }
         if (SrcEleIntTy->getIntegerBitWidth() >
             DstEleIntTy->getIntegerBitWidth()) {
@@ -1311,9 +1309,7 @@ SimdPacket *Scalarizer::scalarizeMaskedMemOp(CallInst *CI, PacketMask PM,
 
   const unsigned Alignment = MaskedOp.getAlignment();
   unsigned EleAlign = ScalarEleTy->getPrimitiveSizeInBits() / 8;
-  if (Alignment < EleAlign) {
-    EleAlign = Alignment;
-  }
+  EleAlign = std::min(Alignment, EleAlign);
 
   for (unsigned i = 0; i < SimdWidth; i++) {
     if (!PM.isEnabled(i) || P->at(i)) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
index 78c223bb766a0..172c6317c4f69 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
@@ -1331,14 +1331,6 @@ bool TargetInfo::canPacketize(const llvm::Value *, ElementCount) const {
   return true;
 }
 
-namespace vecz {
-std::unique_ptr<TargetInfo> createTargetInfoArm(TargetMachine *tm);
-
-std::unique_ptr<TargetInfo> createTargetInfoAArch64(TargetMachine *tm);
-
-std::unique_ptr<TargetInfo> createTargetInfoRISCV(TargetMachine *tm);
-}  // namespace vecz
-
 std::unique_ptr<TargetInfo> vecz::createTargetInfoFromTargetMachine(
     TargetMachine *tm) {
   // The TargetMachine is allowed to be null
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp
index 748991d555454..e1a42789c796f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp
@@ -746,9 +746,8 @@ Value *TargetInfoRISCV::createVPKernelWidth(IRBuilder<> &B,
   auto *const I32Ty = Type::getInt32Ty(B.getContext());
   auto *const I64Ty = Type::getInt64Ty(B.getContext());
 
-  auto *const VL =
-      B.CreateIntrinsic(Intrinsic::RISCVIntrinsics::riscv_vsetvli, {I64Ty},
-                        {RemainingIters, VSEW, VLMul});
+  auto *const VL = B.CreateIntrinsic(Intrinsic::RISCVIntrinsics::riscv_vsetvli,
+                                     {I64Ty}, {RemainingIters, VSEW, VLMul});
 
   return B.CreateTrunc(VL, I32Ty);
 }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
index 23ac15927c9df..c1a735bb35dbd 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
@@ -502,7 +502,7 @@ VectorizationContext::isMaskedAtomicFunction(const Function &F) const {
 
   if (IsCmpXchg) {
     if (auto Ordering = demangleOrdering()) {
-      AtomicInfo.CmpXchgFailureOrdering = *Ordering;
+      AtomicInfo.CmpXchgFailureOrdering = Ordering;
     } else {
       return std::nullopt;
     }
@@ -1115,10 +1115,14 @@ bool VectorizationContext::emitMaskedAtomicBody(
 
   Value *const IdxStart = B.getInt32(0);
   ConstantInt *const KnownMin = B.getInt32(MA.VF.getKnownMinValue());
-  Value *IdxEnd =
-      MA.IsVectorPredicated
-          ? F.getArg(3 + IsCmpXchg)
-          : (!MA.VF.isScalable() ? KnownMin : B.CreateVScale(KnownMin));
+  Value *IdxEnd;
+  if (MA.IsVectorPredicated) {
+    IdxEnd = F.getArg(3 + IsCmpXchg);
+  } else if (MA.VF.isScalable()) {
+    IdxEnd = B.CreateVScale(KnownMin);
+  } else {
+    IdxEnd = KnownMin;
+  }
 
   Value *RetVal = nullptr;
   Value *RetSuccessVal = nullptr;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_heuristics.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_heuristics.cpp
index beed2b6f3f38e..5859fbfff697d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_heuristics.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_heuristics.cpp
@@ -295,7 +295,7 @@ bool Heuristics::shouldVectorize() {
       } else if (CallInst *CI = dyn_cast<CallInst>(&I)) {
         const compiler::utils::BuiltinInfo &BI = Ctx.builtins();
         if (Function *Callee = CI->getCalledFunction()) {
-          auto const builtin = BI.analyzeBuiltin(*Callee);
+          const auto builtin = BI.analyzeBuiltin(*Callee);
           if (!(builtin.properties &
                 compiler::utils::eBuiltinPropertyWorkItem)) {
             weight++;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorizer.cpp
index 8c90389ef4be6..41feec41bd3c9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorizer.cpp
@@ -25,6 +25,7 @@
 #include <llvm/Support/raw_ostream.h>
 #include <multi_llvm/vector_type_helper.h>
 
+#include <algorithm>
 #include <memory>
 #include <unordered_set>
 
@@ -166,9 +167,8 @@ void collectStatistics(VectorizationUnit &VU, Function *Scalar,
       ScalarVectorInsts += isVectorInst(I);
       // Find out how wide is the widest vector used in the scalar kernel
       if (auto *VecTy = dyn_cast<FixedVectorType>(I.getType())) {
-        if (VecTy->getNumElements() > MaxScalarVectorWidth) {
-          MaxScalarVectorWidth = VecTy->getNumElements();
-        }
+        MaxScalarVectorWidth =
+            std::max(VecTy->getNumElements(), MaxScalarVectorWidth);
       }
     }
   }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vecz_pass_builder.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vecz_pass_builder.cpp
index bcb8f3e1596c6..f1a110b457448 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vecz_pass_builder.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vecz_pass_builder.cpp
@@ -140,7 +140,7 @@ void VeczPassMachinery::addClassToPassNames() {
     // FIXME: This is repeating the job of the VectorizationUnitAnalysis.
     // We should track 'failure' more directly in the
     // Function/VectorizationContext?
-    auto const *const VU = Ctx.getActiveVU(F);
+    const auto *const VU = Ctx.getActiveVU(F);
     if (!VU) {
       // Don't run on anything without a VU since it's not currently being
       // vectorized.

From 293dbccf62ebd348c311ee1a7a73ffb14f909fcd Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Mon, 14 Oct 2024 15:49:48 +0100
Subject: [PATCH 124/182] Remove LLVM 17 support.

We support the two most recent LLVM major releases. LLVM 19 has been
released, so switch from supporting LLVM 17+18 to supporting LLVM 18+19.
---
 .../include/multi_llvm/basicblock_helper.h    | 41 -----------------
 .../include/multi_llvm/enums.h                | 46 -------------------
 .../include/multi_llvm/loop_utils.h           | 13 ++----
 .../include/multi_llvm/multi_llvm.h           |  1 -
 .../source/barrier_regions.cpp                |  4 --
 .../vecz/source/control_flow_boscc.cpp        |  7 ++-
 .../include/transform/packetization_helpers.h |  9 ++--
 .../source/transform/basic_mem2reg_pass.cpp   |  4 --
 .../control_flow_conversion_pass.cpp          | 19 ++++----
 .../transform/packetization_helpers.cpp       |  7 ++-
 .../vecz/source/transform/packetizer.cpp      | 13 ++----
 .../source/transform/pre_linearize_pass.cpp   |  3 +-
 .../vecz/source/vecz_pass_builder.cpp         |  8 +---
 13 files changed, 29 insertions(+), 146 deletions(-)
 delete mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/basicblock_helper.h
 delete mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/enums.h

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/basicblock_helper.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/basicblock_helper.h
deleted file mode 100644
index e4175a27ebb3a..0000000000000
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/basicblock_helper.h
+++ /dev/null
@@ -1,41 +0,0 @@
-// Copyright (C) Codeplay Software Limited
-//
-// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
-// Exceptions; you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-// License for the specific language governing permissions and limitations
-// under the License.
-//
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#ifndef MULTI_LLVM_BASICBLOCK_HELPER_H_INCLUDED
-#define MULTI_LLVM_BASICBLOCK_HELPER_H_INCLUDED
-
-#include <llvm/IR/BasicBlock.h>
-#include <multi_llvm/llvm_version.h>
-
-namespace multi_llvm {
-inline void insertBefore(llvm::Instruction *const I,
-                         const llvm::BasicBlock::iterator InsertPos) {
-#if LLVM_VERSION_GREATER_EQUAL(18, 0)
-  I->insertBefore(InsertPos);
-#else
-  I->insertBefore(&*InsertPos);
-#endif
-}
-
-inline llvm::BasicBlock::iterator getFirstNonPHIIt(llvm::BasicBlock *const BB) {
-#if LLVM_VERSION_GREATER_EQUAL(18, 0)
-  return BB->getFirstNonPHIIt();
-#else
-  return BB->getFirstNonPHI()->getIterator();
-#endif
-}
-}  // namespace multi_llvm
-
-#endif  // MULTI_LLVM_BASICBLOCK_HELPER_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/enums.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/enums.h
deleted file mode 100644
index e44b7ac6da3a0..0000000000000
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/enums.h
+++ /dev/null
@@ -1,46 +0,0 @@
-// Copyright (C) Codeplay Software Limited
-//
-// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
-// Exceptions; you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-// License for the specific language governing permissions and limitations
-// under the License.
-//
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#ifndef MULTI_LLVM_ENUMS_H_INCLUDED
-#define MULTI_LLVM_ENUMS_H_INCLUDED
-
-#include <llvm/Support/CodeGen.h>
-#include <multi_llvm/llvm_version.h>
-
-namespace multi_llvm {
-#if LLVM_VERSION_MAJOR >= 18
-
-typedef llvm::CodeGenFileType CodeGenFileType;
-typedef llvm::CodeGenOptLevel CodeGenOptLevel;
-
-#else
-
-struct CodeGenFileType {
-  static constexpr auto AssemblyFile = llvm::CGFT_AssemblyFile;
-  static constexpr auto ObjectFile = llvm::CGFT_ObjectFile;
-  static constexpr auto Null = llvm::CGFT_Null;
-};
-
-struct CodeGenOptLevel {
-  static constexpr auto None = llvm::CodeGenOpt::None;
-  static constexpr auto Less = llvm::CodeGenOpt::Less;
-  static constexpr auto Default = llvm::CodeGenOpt::Default;
-  static constexpr auto Aggressive = llvm::CodeGenOpt::Aggressive;
-};
-
-#endif
-}  // namespace multi_llvm
-
-#endif  // MULTI_LLVM_ENUMS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/loop_utils.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/loop_utils.h
index 8301f9b639f0b..fecfbec7cabc2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/loop_utils.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/loop_utils.h
@@ -13,6 +13,7 @@
 // under the License.
 //
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
 #ifndef MULTI_LLVM_LOOP_UTILS_H_INCLUDED
 #define MULTI_LLVM_LOOP_UTILS_H_INCLUDED
 
@@ -21,17 +22,13 @@
 
 namespace multi_llvm {
 
-inline llvm::Value *createSimpleTargetReduction(
-    llvm::IRBuilderBase &B, const llvm::TargetTransformInfo *TTI,
-    llvm::Value *Src, llvm::RecurKind RdxKind) {
+inline llvm::Value *createSimpleReduction(llvm::IRBuilderBase &B,
+                                          llvm::Value *Src,
+                                          llvm::RecurKind RdxKind) {
 #if LLVM_VERSION_MAJOR >= 20
-  (void)TTI;
   return llvm::createSimpleReduction(B, Src, RdxKind);
-#elif LLVM_VERSION_MAJOR >= 18
-  (void)TTI;
-  return llvm::createSimpleTargetReduction(B, Src, RdxKind);
 #else
-  return llvm::createSimpleTargetReduction(B, TTI, Src, RdxKind);
+  return llvm::createSimpleTargetReduction(B, Src, RdxKind);
 #endif
 }
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
index be4169aed2120..e238e6465ac21 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
@@ -17,7 +17,6 @@
 #ifndef MULTI_LLVM_MULTI_LLVM_H_INCLUDED
 #define MULTI_LLVM_MULTI_LLVM_H_INCLUDED
 
-#include <multi_llvm/enums.h>
 #include <multi_llvm/llvm_version.h>
 #include <multi_llvm/loop_utils.h>
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
index 03118c52da90e..a54b2fd4d0b9b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
@@ -971,11 +971,7 @@ void compiler::utils::Barrier::MakeLiveVariableMemType() {
 
     // Check if the alloca has a debug info source variable attached. If
     // so record this and the matching byte offset into the struct.
-#if LLVM_VERSION_GREATER_EQUAL(18, 0)
     auto DbgIntrinsics = findDbgDeclares(member.value);
-#else
-    auto DbgIntrinsics = FindDbgDeclareUses(member.value);
-#endif
     for (auto DII : DbgIntrinsics) {
       if (auto dbgDeclare = dyn_cast<DbgDeclareInst>(DII)) {
         debug_intrinsics_.push_back(std::make_pair(dbgDeclare, offset));
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_boscc.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_boscc.cpp
index 0563b0d584479..04a36e8755999 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_boscc.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_boscc.cpp
@@ -23,7 +23,6 @@
 #include <llvm/Support/Debug.h>
 #include <llvm/Support/raw_ostream.h>
 #include <llvm/Transforms/Utils/Cloning.h>
-#include <multi_llvm/basicblock_helper.h>
 
 #include <numeric>
 #include <queue>
@@ -1026,7 +1025,7 @@ bool ControlFlowConversionState::BOSCCGadget::blendFinalize() {
 
       PHINode *blend = PHINode::Create(liveIn->getType(), 2,
                                        liveIn->getName() + ".boscc_blend");
-      multi_llvm::insertBefore(blend, blendPoint->begin());
+      blend->insertBefore(blendPoint->begin());
       bool replaceUniform = false;
       bool replacePredicate = false;
       // For each predecessor, if it can reach the instruction, set the
@@ -1096,7 +1095,7 @@ bool ControlFlowConversionState::BOSCCGadget::blendFinalize() {
 
             PHINode *blend = PHINode::Create(
                 incoming->getType(), 1, incoming->getName() + ".boscc_lcssa");
-            multi_llvm::insertBefore(blend, target->begin());
+            blend->insertBefore(target->begin());
             blend->addIncoming(incoming, runtimeCheckerBlock);
             PHI->setIncomingValue(idx, blend);
           }
@@ -1219,7 +1218,7 @@ bool ControlFlowConversionState::BOSCCGadget::updateLoopBlendValues(
   auto createLatchIncoming = [&from, &LTag, this] {
     auto *ret =
         PHINode::Create(from->getType(), 2, from->getName() + ".boscc_blend");
-    multi_llvm::insertBefore(ret, LTag->latch->begin());
+    ret->insertBefore(LTag->latch->begin());
     Value *uniform = getUniformV(from);
     Value *default_val = getDefaultValue(from->getType());
     for (BasicBlock *pred : predecessors(LTag->latch)) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_helpers.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_helpers.h
index 53141575a4280..da3a9ad2f1c20 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_helpers.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_helpers.h
@@ -24,7 +24,6 @@
 #include <llvm/ADT/DenseMap.h>
 #include <llvm/ADT/SmallVector.h>
 #include <llvm/Analysis/IVDescriptors.h>
-#include <llvm/Analysis/TargetTransformInfo.h>
 #include <llvm/IR/IRBuilder.h>
 #include <multi_llvm/llvm_version.h>
 #include <multi_llvm/multi_llvm.h>
@@ -98,11 +97,9 @@ bool createSubSplats(const vecz::TargetInfo &TI, llvm::IRBuilder<> &B,
 ///
 /// Only works on RecurKind::And, Or, Xor, Add, Mul, FAdd, FMul, {S,U,F}Min,
 /// {S,U,F}Max.
-llvm::Value *createMaybeVPTargetReduction(llvm::IRBuilderBase &B,
-                                          const llvm::TargetTransformInfo &TTI,
-                                          llvm::Value *Val,
-                                          llvm::RecurKind Kind,
-                                          llvm::Value *VL = nullptr);
+llvm::Value *createMaybeVPReduction(llvm::IRBuilderBase &B, llvm::Value *Val,
+                                    llvm::RecurKind Kind,
+                                    llvm::Value *VL = nullptr);
 
 /// @brief Utility function to obtain an indices vector to be used in a gather
 /// operation.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp
index af5e0f1737ddc..3aa856e5469f5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp
@@ -183,11 +183,7 @@ bool BasicMem2RegPass::promoteAlloca(AllocaInst *Alloca) const {
       StoredValue = Store->getValueOperand();
       ToDelete.push_back(Store);
       DIBuilder DIB(*Alloca->getModule(), /*AllowUnresolved*/ false);
-#if LLVM_VERSION_GREATER_EQUAL(18, 0)
       auto DbgIntrinsics = findDbgDeclares(Alloca);
-#else
-      auto DbgIntrinsics = FindDbgDeclareUses(Alloca);
-#endif
       for (auto oldDII : DbgIntrinsics) {
         ConvertDebugDeclareToDebugValue(oldDII, Store, DIB);
       }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
index 38de9799dda40..0481b118ba694 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
@@ -38,7 +38,6 @@
 #include <llvm/Support/Error.h>
 #include <llvm/Support/TypeSize.h>
 #include <llvm/Support/raw_ostream.h>
-#include <multi_llvm/basicblock_helper.h>
 
 #include <queue>
 #include <utility>
@@ -717,7 +716,7 @@ bool ControlFlowConversionState::Impl::createEntryMasks(BasicBlock &BB) {
 
     if (LTag->isLoopDivergent()) {
       PHINode *PHI = PHINode::Create(maskTy, 2, BB.getName() + ".entry_mask");
-      multi_llvm::insertBefore(PHI, BB.begin());
+      PHI->insertBefore(BB.begin());
       PHI->addIncoming(MaskInfos[preheader].exitMasks[&BB], preheader);
       maskInfo.entryMask = PHI;
       LLVM_DEBUG(dbgs() << "Loop divergent loop header " << BB.getName()
@@ -769,7 +768,7 @@ bool ControlFlowConversionState::Impl::createEntryMasks(BasicBlock &BB) {
     // A phi function of the predecessors otherwise.
     PHINode *PHI =
         PHINode::Create(maskTy, numPreds, BB.getName() + ".entry_mask");
-    multi_llvm::insertBefore(PHI, BB.begin());
+    PHI->insertBefore(BB.begin());
     for (auto it = pred_begin(&BB); it != pred_end(&BB); ++it) {
       PHI->addIncoming(MaskInfos[*it].exitMasks[&BB], *it);
     }
@@ -951,8 +950,7 @@ bool ControlFlowConversionState::Impl::createLoopExitMasks(LoopTag &LTag) {
       // between the mask update and the loop exit mask phi.
       auto *const exitMask =
           PHINode::Create(maskTy, 2, exitBlock->getName() + ".loop_exit_mask");
-      multi_llvm::insertBefore(exitMask,
-                               multi_llvm::getFirstNonPHIIt(LTag.header));
+      exitMask->insertBefore(LTag.header->getFirstNonPHIIt());
       LMask.persistedDivergentExitMasks[exitingBlock] = exitMask;
       if (BOSCC) {
         BOSCC->createReference(exitMask, getDefaultValue(maskTy));
@@ -2097,7 +2095,7 @@ bool ControlFlowConversionState::Impl::generateDivergentLoopResults(
                     << LTag.loop->getName() << "\n");
 
   // First create instructions to save the value of the last iteration ...
-  IRBuilder<> B(LTag.header, multi_llvm::getFirstNonPHIIt(LTag.header));
+  IRBuilder<> B(LTag.header, LTag.header->getFirstNonPHIIt());
   for (Value *LLV : LTag.loopLiveValues) {
     LTag.loopResultPrevs[LLV] =
         B.CreatePHI(LLV->getType(), 2, LLV->getName() + ".prev");
@@ -2123,8 +2121,7 @@ bool ControlFlowConversionState::Impl::generateDivergentLoopResults(
 
         uniformLRP->setIncomingValue(1, LLV);
 
-        multi_llvm::insertBefore(uniformLRP,
-                                 multi_llvm::getFirstNonPHIIt(uniformHeader));
+        uniformLRP->insertBefore(uniformHeader->getFirstNonPHIIt());
         BOSCC->createReference(LRP, uniformLRP, true);
       }
     }
@@ -2199,7 +2196,7 @@ bool ControlFlowConversionState::Impl::blendDivergentLoopLiveValues(
 
     PHINode *blend =
         PHINode::Create(LLV->getType(), 2, LLV->getName() + ".blend");
-    multi_llvm::insertBefore(blend, LTag.pureExit->begin());
+    blend->insertBefore(LTag.pureExit->begin());
 
     // Replace all uses outside the loop.
     VECZ_FAIL_IF(
@@ -2264,7 +2261,7 @@ bool ControlFlowConversionState::Impl::blendDivergentLoopExitMasks(
 
       PHINode *blend =
           PHINode::Create(prev->getType(), 2, prev->getName() + ".blend");
-      multi_llvm::insertBefore(blend, LTag.pureExit->begin());
+      blend->insertBefore(LTag.pureExit->begin());
 
       // Replace all uses outside the loop.
       VECZ_FAIL_IF(!replaceUsesOutsideDivergentLoop(LTag, update, blend,
@@ -2950,7 +2947,7 @@ bool ControlFlowConversionState::Impl::blendInstructions() {
     const unsigned numPreds = std::distance(pred_begin(B), pred_end(B));
     Value *blend = nullptr;
     PHINode *PHI = PHINode::Create(T, numPreds, opDef->getName() + ".merge");
-    multi_llvm::insertBefore(PHI, B->begin());
+    PHI->insertBefore(B->begin());
 
     const auto *const LTag = DR->getTag(B).loop;
     bool hasVisitedPred = false;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
index 4afe43e375474..4e31c18506adc 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
@@ -268,13 +268,12 @@ bool createSubSplats(const vecz::TargetInfo &TI, IRBuilder<> &B,
   return true;
 }
 
-Value *createMaybeVPTargetReduction(IRBuilderBase &B,
-                                    const TargetTransformInfo &TTI, Value *Val,
-                                    RecurKind Kind, Value *VL) {
+Value *createMaybeVPReduction(IRBuilderBase &B, Value *Val, RecurKind Kind,
+                              Value *VL) {
   assert(isa<VectorType>(Val->getType()) && "Must be vector type");
   // If VL is null, it's not a vector-predicated reduction.
   if (!VL) {
-    return multi_llvm::createSimpleTargetReduction(B, &TTI, Val, Kind);
+    return multi_llvm::createSimpleReduction(B, Val, Kind);
   }
   auto IntrinsicOp = Intrinsic::not_intrinsic;
   switch (Kind) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
index 69da33329b181..d578417f38311 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -921,7 +921,7 @@ Value *Packetizer::Impl::reduceBranchCond(Value *cond, Instruction *terminator,
   // value.
   Value *&f = conds.front();
 
-  return createMaybeVPTargetReduction(B, TTI, f, kind, VL);
+  return createMaybeVPReduction(B, f, kind, VL);
 }
 
 Packetizer::Result Packetizer::Impl::assign(Value *Scalar, Value *Vectorized) {
@@ -976,7 +976,7 @@ Packetizer::Result Packetizer::Impl::packetize(Value *V) {
       if (newCond->getType()->isVectorTy()) {
         IRBuilder<> B(Branch);
         const RecurKind kind = RecurKind::Or;
-        newCond = createMaybeVPTargetReduction(B, TTI, newCond, kind, VL);
+        newCond = createMaybeVPReduction(B, newCond, kind, VL);
       }
 
       Branch->setCondition(newCond);
@@ -1269,8 +1269,7 @@ Value *Packetizer::Impl::packetizeGroupReduction(Instruction *I) {
   }
 
   // Reduce to a scalar.
-  Value *v = createMaybeVPTargetReduction(B, TTI, opPackets.front(),
-                                          Info->Recurrence, VL);
+  Value *v = createMaybeVPReduction(B, opPackets.front(), Info->Recurrence, VL);
 
   // We leave the original reduction function and divert the vectorized
   // reduction through it, giving us a reduction over the full apparent
@@ -1943,8 +1942,7 @@ Value *Packetizer::Impl::packetizeMaskVarying(Instruction *I) {
       }
     }();
 
-    Value *anyOfMask =
-        createMaybeVPTargetReduction(B, TTI, vecMask, RecurKind::Or, VL);
+    Value *anyOfMask = createMaybeVPReduction(B, vecMask, RecurKind::Or, VL);
     anyOfMask->setName("any_of_mask");
 
     if (isVector) {
@@ -2389,8 +2387,7 @@ ValuePacket Packetizer::Impl::packetizeGroupScan(
   // Thus we essentially keep the original group scan, but change it to be an
   // exclusive one.
   auto *Reduction = Ops.front();
-  Reduction =
-      createMaybeVPTargetReduction(B, TTI, Reduction, Scan.Recurrence, VL);
+  Reduction = createMaybeVPReduction(B, Reduction, Scan.Recurrence, VL);
 
   // Now we defer to an *exclusive* scan over the group.
   auto ExclScan = Scan;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/pre_linearize_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/pre_linearize_pass.cpp
index fee752eb9df71..a98ec1855d3ca 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/pre_linearize_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/pre_linearize_pass.cpp
@@ -121,8 +121,7 @@ InstructionCost calculateBoolReductionCost(LLVMContext &context, Module *module,
   auto *F = Function::Create(new_fty, Function::InternalLinkage, "tmp", module);
   auto *BB = BasicBlock::Create(context, "reduce", F);
   IRBuilder<> B(BB);
-  multi_llvm::createSimpleTargetReduction(B, &TTI, &*F->arg_begin(),
-                                          RecurKind::And);
+  multi_llvm::createSimpleReduction(B, &*F->arg_begin(), RecurKind::And);
   const InstructionCost cost = calculateBlockCost(*BB, TTI);
 
   // We don't really need that function in the module anymore because it's
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vecz_pass_builder.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vecz_pass_builder.cpp
index f1a110b457448..883dd9330d0c1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vecz_pass_builder.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vecz_pass_builder.cpp
@@ -42,6 +42,7 @@
 #include <llvm/Transforms/Scalar/FlattenCFG.h>
 #include <llvm/Transforms/Scalar/GVN.h>
 #include <llvm/Transforms/Scalar/IndVarSimplify.h>
+#include <llvm/Transforms/Scalar/InferAlignment.h>
 #include <llvm/Transforms/Scalar/LoopPassManager.h>
 #include <llvm/Transforms/Scalar/SROA.h>
 #include <llvm/Transforms/Scalar/SimplifyCFG.h>
@@ -75,10 +76,6 @@
 #include "transform/scalarization_pass.h"
 #include "transform/ternary_transform_pass.h"
 
-#if LLVM_VERSION_GREATER_EQUAL(18, 0)
-#include <llvm/Transforms/Scalar/InferAlignment.h>
-#endif
-
 #define DEBUG_TYPE "vecz"
 using namespace llvm;
 using namespace vecz;
@@ -278,10 +275,7 @@ bool vecz::buildPassPipeline(ModulePassManager &PM) {
     FPM.addPass(InterleavedGroupCombinePass(eInterleavedStore));
     FPM.addPass(InterleavedGroupCombinePass(eInterleavedLoad));
     FPM.addPass(InstCombinePass());
-#if LLVM_VERSION_GREATER_EQUAL(18, 0)
-    // LLVM 18 split this pass out of InstCombine
     FPM.addPass(InferAlignmentPass());
-#endif
     FPM.addPass(DCEPass());
     FPM.addPass(SimplifyMaskedMemOpsPass());
 

From 0cc33f729c2fe45e13a081ab86d52e15c840159e Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Tue, 15 Oct 2024 17:09:22 +0100
Subject: [PATCH 125/182] Fix build with LLVM 20.

* LLVM 20 renames Intrinsic::getDeclaration to
  Intrinsic::getOrInsertDeclaration. Handle this with a
  multi_llvm::GetOrInsertIntrinsicDeclaration helper function.
* LLVM 20 deprecates instruction creation overloads that take an
  Instruction * as an insertion point, recommending the ones that take
  an iterator instead. In LLVM 18, for the most part, those overloads
  taking iterators do not yet exist, but Instruction::insertBefore does
  already take iterators, so rework the code to consistently use that.
  This removes some bitcast instructions in memory_operations.cpp that
  could not be updated this way, but that have been unnecessary ever
  since LLVM switched to opaque pointers.
---
 .../include/multi_llvm/intrinsic.h            |  35 +++++
 .../source/barrier_regions.cpp                |  11 +-
 .../compiler_pipeline/source/builtin_info.cpp |   3 +-
 .../source/cl_builtin_info.cpp                |  17 ++-
 .../optimal_builtin_replacement_pass.cpp      |   8 +-
 .../source/pass_functions.cpp                 |   4 +-
 ...lace_local_module_scope_variables_pass.cpp |  59 +++++---
 .../vecz/source/include/memory_operations.h   |  38 ++---
 .../transform/control_flow_conversion_pass.h  |   2 +-
 .../vecz/source/memory_operations.cpp         |  49 +++----
 .../source/transform/basic_mem2reg_pass.cpp   |   6 +-
 .../control_flow_conversion_pass.cpp          | 131 ++++++++++--------
 .../transform/packetization_helpers.cpp       |   6 +-
 .../vecz/source/transform/packetizer.cpp      |  53 ++++---
 .../source/transform/pre_linearize_pass.cpp   |  14 +-
 .../source/transform/remove_intptr_pass.cpp   |   3 +-
 .../transform/simplify_infinite_loop_pass.cpp |   5 +-
 .../transform/ternary_transform_pass.cpp      |  17 ++-
 .../transform/uniform_reassociation_pass.cpp  |   9 +-
 .../vecz/source/vector_target_info.cpp        |  28 ++--
 .../vecz/source/vector_target_info_arm.cpp    |   7 +-
 21 files changed, 291 insertions(+), 214 deletions(-)
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/intrinsic.h

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/intrinsic.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/intrinsic.h
new file mode 100644
index 0000000000000..3c1f1560ceda9
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/intrinsic.h
@@ -0,0 +1,35 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef MULTI_LLVM_MULTI_INTRINSIC_H_INCLUDED
+#define MULTI_LLVM_MULTI_INTRINSIC_H_INCLUDED
+
+#include <multi_llvm/llvm_version.h>
+
+namespace multi_llvm {
+static inline auto GetOrInsertIntrinsicDeclaration(
+    llvm::Module *M, llvm::Intrinsic::ID id,
+    llvm::ArrayRef<llvm::Type *> Tys = {}) {
+#if LLVM_VERSION_GREATER_EQUAL(20, 0)
+  return llvm::Intrinsic::getOrInsertDeclaration(M, id, Tys);
+#else
+  return llvm::Intrinsic::getDeclaration(M, id, Tys);
+#endif
+}
+
+}  // namespace multi_llvm
+
+#endif  // MULTI_LLVM_MULTI_INTRINSIC_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
index a54b2fd4d0b9b..07c8b97082a5c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
@@ -38,7 +38,7 @@
 #include <llvm/Transforms/Utils/Cloning.h>
 #include <llvm/Transforms/Utils/LCSSA.h>
 #include <llvm/Transforms/Utils/Local.h>
-#include <multi_llvm/multi_llvm.h>
+#include <multi_llvm/vector_type_helper.h>
 
 #include <optional>
 
@@ -580,14 +580,12 @@ void compiler::utils::Barrier::SplitBlockwithBarrier() {
       auto id = ConstantInt::get(Type::getInt32Ty(module_.getContext()),
                                  barrier_id - kBarrier_StartNewID);
       // Call invoking entry stub
-      auto entry_caller =
-          CallInst::Create(entry_stub, id, "", (Instruction *)nullptr);
+      auto entry_caller = CallInst::Create(entry_stub, id);
       entry_caller->setDebugLoc(split_point->getDebugLoc());
       entry_caller->setCallingConv(entry_stub->getCallingConv());
 
       // Call invoking exit stub
-      auto exit_caller =
-          CallInst::Create(exit_stub, id, "", (Instruction *)nullptr);
+      auto exit_caller = CallInst::Create(exit_stub, id);
       exit_caller->setDebugLoc(split_point->getDebugLoc());
       exit_caller->setCallingConv(exit_stub->getCallingConv());
 
@@ -1163,7 +1161,8 @@ Function *compiler::utils::Barrier::GenerateNewKernel(BarrierRegion &region) {
       // Change return instruction with end barrier number.
       ConstantInt *cst_zero =
           ConstantInt::get(Type::getInt32Ty(context), kBarrier_EndID);
-      ReturnInst *new_ret = ReturnInst::Create(context, cst_zero, ret);
+      ReturnInst *new_ret = ReturnInst::Create(context, cst_zero);
+      new_ret->insertBefore(ret->getIterator());
       ret->replaceAllUsesWith(new_ret);
       ret->eraseFromParent();
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/builtin_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/builtin_info.cpp
index bdbc015e21e3f..217d3ea306ba9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/builtin_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/builtin_info.cpp
@@ -23,6 +23,7 @@
 #include <llvm/ADT/StringExtras.h>
 #include <llvm/ADT/StringSwitch.h>
 #include <llvm/IR/Module.h>
+#include <multi_llvm/intrinsic.h>
 
 using namespace llvm;
 
@@ -538,7 +539,7 @@ Function *BuiltinInfo::getScalarEquivalent(const Builtin &B, Module *M) {
     Type *ScalarType = VecRetTy->getElementType();
     // Get the scalar version of the intrinsic
     Function *ScalarIntrinsic =
-        Intrinsic::getDeclaration(M, IntrinsicID, ScalarType);
+        multi_llvm::GetOrInsertIntrinsicDeclaration(M, IntrinsicID, ScalarType);
 
     return ScalarIntrinsic;
   }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp
index 08f7ef2f868e8..89449b218b139 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp
@@ -32,7 +32,6 @@
 #include <llvm/TargetParser/Triple.h>
 #include <llvm/Transforms/Utils/Cloning.h>
 #include <llvm/Transforms/Utils/ValueMapper.h>
-#include <multi_llvm/multi_llvm.h>
 #include <multi_llvm/vector_type_helper.h>
 
 #include <cmath>
@@ -2807,7 +2806,8 @@ Instruction *CLBuiltinInfo::lowerBuiltinToMuxBuiltin(
     auto *const MuxBuiltinFn = BIMuxImpl.getOrDeclareMuxBuiltin(*MuxID, M);
     assert(MuxBuiltinFn && "Could not get/declare mux builtin");
     const SmallVector<Value *> Args(CI.args());
-    auto *const NewCI = CallInst::Create(MuxBuiltinFn, Args, CI.getName(), &CI);
+    auto *const NewCI = CallInst::Create(MuxBuiltinFn, Args, CI.getName());
+    NewCI->insertBefore(CI.getIterator());
     NewCI->takeName(&CI);
     NewCI->setAttributes(MuxBuiltinFn->getAttributes());
     return NewCI;
@@ -3344,9 +3344,9 @@ Instruction *CLBuiltinInfo::lowerGroupBuiltinToMuxBuiltin(
     Args.push_back(Val);
   } else {
     assert(Val->getType()->isIntegerTy());
-    auto *NEZero =
-        ICmpInst::Create(Instruction::ICmp, ICmpInst::ICMP_NE, Val,
-                         ConstantInt::getNullValue(Val->getType()), "", &CI);
+    auto *NEZero = ICmpInst::Create(Instruction::ICmp, ICmpInst::ICMP_NE, Val,
+                                    ConstantInt::getNullValue(Val->getType()));
+    NEZero->insertBefore(CI.getIterator());
     Args.push_back(NEZero);
   }
 
@@ -3363,7 +3363,8 @@ Instruction *CLBuiltinInfo::lowerGroupBuiltinToMuxBuiltin(
     }
   }
 
-  auto *const NewCI = CallInst::Create(MuxBuiltinFn, Args, CI.getName(), &CI);
+  auto *const NewCI = CallInst::Create(MuxBuiltinFn, Args, CI.getName());
+  NewCI->insertBefore(CI.getIterator());
   NewCI->takeName(&CI);
   NewCI->setAttributes(MuxBuiltinFn->getAttributes());
 
@@ -3371,7 +3372,9 @@ Instruction *CLBuiltinInfo::lowerGroupBuiltinToMuxBuiltin(
     return NewCI;
   }
   // For any/all we need to recreate the original i32 return value.
-  return SExtInst::Create(Instruction::SExt, NewCI, CI.getType(), "sext", &CI);
+  auto *SExt = SExtInst::Create(Instruction::SExt, NewCI, CI.getType(), "sext");
+  SExt->insertBefore(CI.getIterator());
+  return SExt;
 }
 
 Instruction *CLBuiltinInfo::lowerAsyncBuiltinToMuxBuiltin(
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/optimal_builtin_replacement_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/optimal_builtin_replacement_pass.cpp
index af89fc5f219bd..bf6dfc786e353 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/optimal_builtin_replacement_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/optimal_builtin_replacement_pass.cpp
@@ -28,6 +28,7 @@
 #include <llvm/IR/Intrinsics.h>
 #include <llvm/IR/Module.h>
 #include <llvm/TargetParser/Triple.h>
+#include <multi_llvm/intrinsic.h>
 #include <multi_llvm/vector_type_helper.h>
 
 #define DEBUG_TYPE "ca-optimal-builtins"
@@ -61,7 +62,8 @@ Value *OptimalBuiltinReplacementPass::replaceAbacusCLZ(
   SmallVector<Value *, 4> Args(CB.args());
   // Get the declaration for the intrinsic
   auto *const ArgTy = Args[0]->getType();
-  auto *const Intrinsic = Intrinsic::getDeclaration(M, Intrinsic::ctlz, ArgTy);
+  auto *const Intrinsic =
+      multi_llvm::GetOrInsertIntrinsicDeclaration(M, Intrinsic::ctlz, ArgTy);
   // If we didn't find the intrinsic or the return type isn't what we
   // expect, skip this optimization
   Function *Callee = CB.getCalledFunction();
@@ -82,7 +84,9 @@ Value *OptimalBuiltinReplacementPass::replaceAbacusCLZ(
   LLVMContext &Ctx = M->getContext();
   Args.push_back(ConstantInt::getFalse(Ctx));
 
-  return CallInst::Create(Intrinsic, Args, "", &CB);
+  auto *Call = CallInst::Create(Intrinsic, Args);
+  Call->insertBefore(CB.getIterator());
+  return Call;
 }
 
 Value *OptimalBuiltinReplacementPass::replaceAbacusMulhi(
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp
index 43dea54c8c490..1225c4975aba8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp
@@ -31,7 +31,6 @@
 #include <llvm/IR/Module.h>
 #include <llvm/Transforms/Utils/Cloning.h>
 #include <multi_llvm/llvm_version.h>
-#include <multi_llvm/multi_llvm.h>
 #include <multi_llvm/vector_type_helper.h>
 
 #include <cassert>
@@ -440,7 +439,8 @@ void remapClonedCallsites(llvm::Function &oldFunc, llvm::Function &newFunc,
       }
 
       // create our new call instruction to replace the old one
-      auto newCi = llvm::CallInst::Create(&newFunc, args, name, ci);
+      auto newCi = llvm::CallInst::Create(&newFunc, args, name);
+      newCi->insertBefore(ci->getIterator());
 
       // use the debug location from the old call (if any)
       newCi->setDebugLoc(ci->getDebugLoc());
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp
index 42e14bf8e80fb..b3381b85ff360 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp
@@ -60,8 +60,8 @@ GetElementPtrInst *generateStructGEP(Instruction &inst,
   // create a new GEP just before the instruction
   auto GEP = GetElementPtrInst::CreateInBounds(
       funcsStructTy, funcsStruct,
-      {ConstantInt::get(indexTy, 0), ConstantInt::get(indexTy, index)}, "",
-      &inst);
+      {ConstantInt::get(indexTy, 0), ConstantInt::get(indexTy, index)});
+  GEP->insertBefore(inst.getIterator());
   return GEP;
 }
 
@@ -436,7 +436,8 @@ PreservedAnalyses compiler::utils::ReplaceLocalModuleScopeVariablesPass::run(
         auto local = generateStructGEP(*gep, structTy, index_map[global]);
 
         auto castedLocal =
-            CastInst::CreatePointerCast(local, global->getType(), "", gep);
+            CastInst::CreatePointerCast(local, global->getType());
+        castedLocal->insertBefore(gep->getIterator());
 
         gep->setOperand(0, castedLocal);
         gep->setIsInBounds();
@@ -444,21 +445,25 @@ PreservedAnalyses compiler::utils::ReplaceLocalModuleScopeVariablesPass::run(
         auto local = generateStructGEP(*cast, structTy, index_map[global]);
 
         auto castedLocal =
-            CastInst::CreatePointerCast(local, global->getType(), "", cast);
+            CastInst::CreatePointerCast(local, global->getType());
+        castedLocal->insertBefore(cast->getIterator());
 
         cast->setOperand(0, castedLocal);
       } else if (LoadInst *load = dyn_cast<LoadInst>(user)) {
         auto local = generateStructGEP(*load, structTy, index_map[global]);
 
         auto castedLocal =
-            CastInst::CreatePointerCast(local, global->getType(), "", load);
+            CastInst::CreatePointerCast(local, global->getType());
+        castedLocal->insertBefore(load->getIterator());
 
         load->setOperand(0, castedLocal);
       } else if (StoreInst *store = dyn_cast<StoreInst>(user)) {
         auto local = generateStructGEP(*store, structTy, index_map[global]);
 
         auto castedLocal =
-            CastInst::CreatePointerCast(local, global->getType(), "", store);
+            CastInst::CreatePointerCast(local, global->getType());
+        castedLocal->insertBefore(store->getIterator());
+
         // global could be pointer or value operand of the store
         if (store->getValueOperand() == global) {
           store->setOperand(0, castedLocal);
@@ -474,7 +479,8 @@ PreservedAnalyses compiler::utils::ReplaceLocalModuleScopeVariablesPass::run(
           auto local = generateStructGEP(*inst, structTy, index_map[global]);
 
           auto castedLocal =
-              CastInst::CreatePointerCast(local, global->getType(), "", inst);
+              CastInst::CreatePointerCast(local, global->getType());
+          castedLocal->insertBefore(inst->getIterator());
 
           auto indexTy = Type::getInt32Ty(M.getContext());
           Value *newCv = UndefValue::get(cv->getType());
@@ -482,14 +488,16 @@ PreservedAnalyses compiler::utils::ReplaceLocalModuleScopeVariablesPass::run(
           // We can't simply 'setOperand' in a 'ConstantVector'. We have to
           // recreate it from scratch.
           for (unsigned i = 0; i < cv->getNumOperands(); ++i) {
+            Instruction *newCvInst;
             if (cv->getOperand(i) == global) {
-              newCv = InsertElementInst::Create(
-                  newCv, castedLocal, ConstantInt::get(indexTy, i), "", inst);
+              newCvInst = InsertElementInst::Create(
+                  newCv, castedLocal, ConstantInt::get(indexTy, i));
             } else {
-              newCv = InsertElementInst::Create(newCv, cv->getOperand(i),
-                                                ConstantInt::get(indexTy, i),
-                                                "", inst);
+              newCvInst = InsertElementInst::Create(
+                  newCv, cv->getOperand(i), ConstantInt::get(indexTy, i));
             }
+            newCvInst->insertBefore(inst->getIterator());
+            newCv = newCvInst;
           }
 
           // And don't forget to replace 'cv' by 'newCv'.
@@ -505,8 +513,9 @@ PreservedAnalyses compiler::utils::ReplaceLocalModuleScopeVariablesPass::run(
             auto local =
                 generateStructGEP(*incomingBlockT, structTy, index_map[global]);
 
-            auto castedLocal = CastInst::CreatePointerCast(
-                local, global->getType(), "", incomingBlockT);
+            auto castedLocal =
+                CastInst::CreatePointerCast(local, global->getType());
+            castedLocal->insertBefore(incomingBlockT->getIterator());
 
             phi->setIncomingValue(i, castedLocal);
           }
@@ -515,7 +524,8 @@ PreservedAnalyses compiler::utils::ReplaceLocalModuleScopeVariablesPass::run(
         auto local = generateStructGEP(*atomic, structTy, index_map[global]);
 
         auto castedLocal =
-            CastInst::CreatePointerCast(local, global->getType(), "", atomic);
+            CastInst::CreatePointerCast(local, global->getType());
+        castedLocal->insertBefore(atomic->getIterator());
 
         // global could be pointer or value operand of the atomic
         if (atomic->getPointerOperand() == global) {
@@ -527,7 +537,8 @@ PreservedAnalyses compiler::utils::ReplaceLocalModuleScopeVariablesPass::run(
         const auto local =
             generateStructGEP(*atomic, structTy, index_map[global]);
         const auto castedLocal =
-            CastInst::CreatePointerCast(local, global->getType(), "", atomic);
+            CastInst::CreatePointerCast(local, global->getType());
+        castedLocal->insertBefore(atomic->getIterator());
 
         // global could be the pointer
         if (atomic->getPointerOperand() == global) {
@@ -545,7 +556,8 @@ PreservedAnalyses compiler::utils::ReplaceLocalModuleScopeVariablesPass::run(
         auto local = generateStructGEP(*select, structTy, index_map[global]);
 
         auto castedLocal =
-            CastInst::CreatePointerCast(local, global->getType(), "", select);
+            CastInst::CreatePointerCast(local, global->getType());
+        castedLocal->insertBefore(select->getIterator());
 
         // global could be the true or false value of the select
         if (select->getTrueValue() == global) {
@@ -557,7 +569,8 @@ PreservedAnalyses compiler::utils::ReplaceLocalModuleScopeVariablesPass::run(
         auto local = generateStructGEP(*call, structTy, index_map[global]);
 
         auto castedLocal =
-            CastInst::CreatePointerCast(local, global->getType(), "", call);
+            CastInst::CreatePointerCast(local, global->getType());
+        castedLocal->insertBefore(call->getIterator());
 
         unsigned i = 0;
         for (; i < call->getNumOperands(); ++i) {
@@ -568,15 +581,19 @@ PreservedAnalyses compiler::utils::ReplaceLocalModuleScopeVariablesPass::run(
       } else if (InsertElementInst *insertIns =
                      dyn_cast<InsertElementInst>(user)) {
         auto local = generateStructGEP(*insertIns, structTy, index_map[global]);
-        auto castedLocal = CastInst::CreatePointerCast(local, global->getType(),
-                                                       "", insertIns);
+        auto castedLocal =
+            CastInst::CreatePointerCast(local, global->getType());
+        castedLocal->insertBefore(insertIns->getIterator());
+
         // Update middle operand as the others are the vector and index
         insertIns->setOperand(1, castedLocal);
       } else if (auto *cmpIns = dyn_cast<CmpInst>(user)) {
         const auto local =
             generateStructGEP(*cmpIns, structTy, index_map[global]);
         const auto castedLocal =
-            CastInst::CreatePointerCast(local, global->getType(), "", cmpIns);
+            CastInst::CreatePointerCast(local, global->getType());
+        castedLocal->insertBefore(cmpIns->getIterator());
+
         // global could be either side of the compare
         if (cmpIns->getOperand(0) == global) {
           cmpIns->setOperand(0, castedLocal);
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/memory_operations.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/memory_operations.h
index d1e84515ce732..36d23b9d0958b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/memory_operations.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/memory_operations.h
@@ -70,14 +70,12 @@ llvm::Function *getOrCreateMaskedMemOpFn(VectorizationContext &Ctx,
 /// @param[in] EVL vector length as i32, else null (full width operation).
 /// @param[in] Alignment Alignment
 /// @param[in] Name Name to give to the call instruction.
-/// @param[in] InsertBefore Insertion point for the call instruction.
 ///
 /// @return Call instruction or null on error.
 llvm::CallInst *createMaskedLoad(VectorizationContext &Ctx, llvm::Type *Ty,
                                  llvm::Value *Ptr, llvm::Value *Mask,
                                  llvm::Value *EVL, unsigned Alignment,
-                                 llvm::Twine Name = "",
-                                 llvm::Instruction *InsertBefore = nullptr);
+                                 llvm::Twine Name = "");
 
 /// @brief Create a call to a masked store operation builtin function.
 ///
@@ -88,14 +86,12 @@ llvm::CallInst *createMaskedLoad(VectorizationContext &Ctx, llvm::Type *Ty,
 /// @param[in] EVL vector length as i32, else null (full width operation).
 /// @param[in] Alignment Alignment
 /// @param[in] Name Name to give to the call instruction.
-/// @param[in] InsertBefore Insertion point for the call instruction.
 ///
 /// @return Call instruction or null on error.
 llvm::CallInst *createMaskedStore(VectorizationContext &Ctx, llvm::Value *Data,
                                   llvm::Value *Ptr, llvm::Value *Mask,
                                   llvm::Value *EVL, unsigned Alignment,
-                                  llvm::Twine Name = "",
-                                  llvm::Instruction *InsertBefore = nullptr);
+                                  llvm::Twine Name = "");
 
 /// @brief Return or declare a (masked) interleaved memory operation builtin
 /// function.
@@ -130,14 +126,13 @@ llvm::Function *getOrCreateInterleavedMemOpFn(
 /// case an unmasked builtin is called.
 /// @param[in] Alignment Alignment of the operation.
 /// @param[in] Name Name to give to the call instruction.
-/// @param[in] InsertBefore Insertion point for the call instruction.
 ///
 /// @return Call instruction or null on error.
-llvm::CallInst *createInterleavedLoad(
-    VectorizationContext &Ctx, llvm::Type *Ty, llvm::Value *Ptr,
-    llvm::Value *Stride, llvm::Value *Mask, llvm::Value *EVL,
-    unsigned Alignment, llvm::Twine Name = "",
-    llvm::Instruction *InsertBefore = nullptr);
+llvm::CallInst *createInterleavedLoad(VectorizationContext &Ctx, llvm::Type *Ty,
+                                      llvm::Value *Ptr, llvm::Value *Stride,
+                                      llvm::Value *Mask, llvm::Value *EVL,
+                                      unsigned Alignment,
+                                      llvm::Twine Name = "");
 
 /// @brief Create a call to a (masked) interleaved store builtin function. Also
 /// known as a strided store.
@@ -152,14 +147,13 @@ llvm::CallInst *createInterleavedLoad(
 /// case an unmasked builtin is called.
 /// @param[in] Alignment Alignment of the operation.
 /// @param[in] Name Name to give to the call instruction.
-/// @param[in] InsertBefore Insertion point for the call instruction.
 ///
 /// @return Call instruction or null on error.
-llvm::CallInst *createInterleavedStore(
-    VectorizationContext &Ctx, llvm::Value *Data, llvm::Value *Ptr,
-    llvm::Value *Stride, llvm::Value *Mask, llvm::Value *EVL,
-    unsigned Alignment, llvm::Twine Name = "",
-    llvm::Instruction *InsertBefore = nullptr);
+llvm::CallInst *createInterleavedStore(VectorizationContext &Ctx,
+                                       llvm::Value *Data, llvm::Value *Ptr,
+                                       llvm::Value *Stride, llvm::Value *Mask,
+                                       llvm::Value *EVL, unsigned Alignment,
+                                       llvm::Twine Name = "");
 
 /// @brief Return or declare a (masked) scatter/gather memory operation builtin
 /// function.
@@ -196,14 +190,12 @@ llvm::Function *getOrCreateScatterGatherMemOpFn(vecz::VectorizationContext &Ctx,
 /// @param[in] Alignment Alignment of the operation.
 /// @param[in] EVL vector length as i32, else null (full width operation).
 /// @param[in] Name Name to give to the call instruction.
-/// @param[in] InsertBefore Insertion point for the call instruction.
 ///
 /// @return Call instruction or null on error.
 llvm::CallInst *createGather(VectorizationContext &Ctx, llvm::Type *Ty,
                              llvm::Value *VecPtr, llvm::Value *Mask,
                              llvm::Value *EVL, unsigned Alignment,
-                             llvm::Twine Name = "",
-                             llvm::Instruction *InsertBefore = nullptr);
+                             llvm::Twine Name = "");
 
 /// @brief Create a call to a (masked) scatter memory operation builtin
 /// function.
@@ -218,14 +210,12 @@ llvm::CallInst *createGather(VectorizationContext &Ctx, llvm::Type *Ty,
 /// @param[in] Alignment Alignment of the operation.
 /// @param[in] EVL vector length as i32, else null (full width operation).
 /// @param[in] Name Name to give to the call instruction.
-/// @param[in] InsertBefore Insertion point for the call instruction.
 ///
 /// @return Call instruction or null on error.
 llvm::CallInst *createScatter(VectorizationContext &Ctx, llvm::Value *VecData,
                               llvm::Value *VecPtr, llvm::Value *Mask,
                               llvm::Value *EVL, unsigned Alignment,
-                              llvm::Twine Name = "",
-                              llvm::Instruction *InsertBefore = nullptr);
+                              llvm::Twine Name = "");
 
 /// @brief an enum to distinguish between loads and stores, and between builtin
 /// memop calls and native IR memop instructions.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/control_flow_conversion_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/control_flow_conversion_pass.h
index 2f2991280e692..71a5807fb1ba2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/control_flow_conversion_pass.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/control_flow_conversion_pass.h
@@ -113,7 +113,7 @@ class ControlFlowConversionState {
     llvm::SmallDenseMap<llvm::BasicBlock *, llvm::Value *, 4> exitMasks;
     /// @brief Mask that describes which lanes are active at the start of the
     /// basic block.
-    llvm::Value *entryMask = nullptr;
+    llvm::Instruction *entryMask = nullptr;
   };
   llvm::DenseMap<llvm::BasicBlock *, MaskInfo> MaskInfos;
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/memory_operations.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/memory_operations.cpp
index ac06e3d84dd08..7a1087504d6e5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/memory_operations.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/memory_operations.cpp
@@ -108,17 +108,13 @@ Function *vecz::getOrCreateMaskedMemOpFn(VectorizationContext &Ctx,
 
 static CallInst *createMaskedMemOp(VectorizationContext &Ctx, Value *Data,
                                    Type *DataTy, Value *Ptr, Value *Mask,
-                                   Value *EVL, unsigned Alignment, Twine Name,
-                                   Instruction *InsertBefore) {
+                                   Value *EVL, unsigned Alignment, Twine Name) {
   VECZ_FAIL_IF(!DataTy);
   VECZ_FAIL_IF(!Ptr || !Ptr->getType()->isPointerTy());
   VECZ_FAIL_IF(!Mask);
   assert(!Data || Data->getType() == DataTy);
   auto *PtrTy =
       PointerType::get(DataTy, Ptr->getType()->getPointerAddressSpace());
-  if (Ptr->getType() != PtrTy) {
-    Ptr = BitCastInst::CreatePointerCast(Ptr, PtrTy, "", InsertBefore);
-  }
   Function *F =
       getOrCreateMaskedMemOpFn(Ctx, DataTy, PtrTy, Alignment,
                                /*IsLoad*/ Data == nullptr, EVL != nullptr);
@@ -132,23 +128,21 @@ static CallInst *createMaskedMemOp(VectorizationContext &Ctx, Value *Data,
   if (EVL) {
     Ops.push_back(EVL);
   }
-  return CallInst::Create(F, Ops, Name, InsertBefore);
+  return CallInst::Create(F, Ops, Name);
 }
 
 CallInst *vecz::createMaskedLoad(VectorizationContext &Ctx, Type *Ty,
                                  Value *Ptr, Value *Mask, Value *EVL,
-                                 unsigned Alignment, Twine Name,
-                                 Instruction *InsertBefore) {
+                                 unsigned Alignment, Twine Name) {
   return createMaskedMemOp(Ctx, /*Data*/ nullptr, Ty, Ptr, Mask, EVL, Alignment,
-                           Name, InsertBefore);
+                           Name);
 }
 
 CallInst *vecz::createMaskedStore(VectorizationContext &Ctx, Value *Data,
                                   Value *Ptr, Value *Mask, Value *EVL,
-                                  unsigned Alignment, Twine Name,
-                                  Instruction *InsertBefore) {
+                                  unsigned Alignment, Twine Name) {
   return createMaskedMemOp(Ctx, Data, Data->getType(), Ptr, Mask, EVL,
-                           Alignment, Name, InsertBefore);
+                           Alignment, Name);
 }
 
 static std::string getInterleavedMemOpName(Type *DataTy, PointerType *PtrTy,
@@ -244,16 +238,12 @@ Function *vecz::getOrCreateInterleavedMemOpFn(VectorizationContext &Ctx,
 static CallInst *createInterleavedMemOp(VectorizationContext &Ctx, Value *Data,
                                         Type *DataTy, Value *Ptr, Value *Stride,
                                         Value *Mask, Value *EVL,
-                                        unsigned Alignment, llvm::Twine Name,
-                                        llvm::Instruction *InsertBefore) {
+                                        unsigned Alignment, llvm::Twine Name) {
   VECZ_FAIL_IF(!DataTy);
   VECZ_FAIL_IF(!Ptr || !Ptr->getType()->isPointerTy());
   assert(!Data || Data->getType() == DataTy);
   auto *PtrTy = PointerType::get(DataTy->getScalarType(),
                                  Ptr->getType()->getPointerAddressSpace());
-  if (Ptr->getType() != PtrTy) {
-    Ptr = BitCastInst::CreatePointerCast(Ptr, PtrTy, "", InsertBefore);
-  }
   Type *MaskTy = Mask ? Mask->getType() : nullptr;
   Function *F = getOrCreateInterleavedMemOpFn(
       Ctx, DataTy, PtrTy, Stride, MaskTy, Alignment,
@@ -273,23 +263,23 @@ static CallInst *createInterleavedMemOp(VectorizationContext &Ctx, Value *Data,
   if (!isa<ConstantInt>(Stride)) {
     Ops.push_back(Stride);
   }
-  return CallInst::Create(F, Ops, Name, InsertBefore);
+  return CallInst::Create(F, Ops, Name);
 }
 
 CallInst *vecz::createInterleavedLoad(VectorizationContext &Ctx, Type *Ty,
                                       Value *Ptr, Value *Stride, Value *Mask,
                                       Value *EVL, unsigned Alignment,
-                                      Twine Name, Instruction *InsertBefore) {
+                                      Twine Name) {
   return createInterleavedMemOp(Ctx, /*Data*/ nullptr, Ty, Ptr, Stride, Mask,
-                                EVL, Alignment, Name, InsertBefore);
+                                EVL, Alignment, Name);
 }
 
 CallInst *vecz::createInterleavedStore(VectorizationContext &Ctx, Value *Data,
                                        Value *Ptr, Value *Stride, Value *Mask,
                                        Value *EVL, unsigned Alignment,
-                                       Twine Name, Instruction *InsertBefore) {
+                                       Twine Name) {
   return createInterleavedMemOp(Ctx, Data, Data->getType(), Ptr, Stride, Mask,
-                                EVL, Alignment, Name, InsertBefore);
+                                EVL, Alignment, Name);
 }
 
 static std::string getScatterGatherMemOpName(Type *DataTy, VectorType *VecPtrTy,
@@ -379,8 +369,7 @@ static CallInst *createScatterGatherMemOp(VectorizationContext &Ctx,
                                           Value *VecData, Type *DataTy,
                                           Value *VecPtr, Value *Mask,
                                           Value *EVL, unsigned Alignment,
-                                          Twine Name,
-                                          Instruction *InsertBefore) {
+                                          Twine Name) {
   VECZ_FAIL_IF(!DataTy);
   VECZ_FAIL_IF(!VecPtr || !VecPtr->getType()->isVectorTy() ||
                !VecPtr->getType()->getScalarType()->isPointerTy());
@@ -400,25 +389,23 @@ static CallInst *createScatterGatherMemOp(VectorizationContext &Ctx,
   if (EVL) {
     Ops.push_back(EVL);
   }
-  return CallInst::Create(F, Ops, Name, InsertBefore);
+  return CallInst::Create(F, Ops, Name);
 }
 
 llvm::CallInst *vecz::createGather(VectorizationContext &Ctx, llvm::Type *Ty,
                                    llvm::Value *VecPtr, llvm::Value *Mask,
                                    llvm::Value *EVL, unsigned Alignment,
-                                   llvm::Twine Name,
-                                   llvm::Instruction *InsertBefore) {
+                                   llvm::Twine Name) {
   return createScatterGatherMemOp(Ctx, /*Data*/ nullptr, Ty, VecPtr, Mask, EVL,
-                                  Alignment, Name, InsertBefore);
+                                  Alignment, Name);
 }
 
 llvm::CallInst *vecz::createScatter(VectorizationContext &Ctx,
                                     llvm::Value *VecData, llvm::Value *VecPtr,
                                     llvm::Value *Mask, llvm::Value *EVL,
-                                    unsigned Alignment, llvm::Twine Name,
-                                    llvm::Instruction *InsertBefore) {
+                                    unsigned Alignment, llvm::Twine Name) {
   return createScatterGatherMemOp(Ctx, VecData, VecData->getType(), VecPtr,
-                                  Mask, EVL, Alignment, Name, InsertBefore);
+                                  Mask, EVL, Alignment, Name);
 }
 
 MemOpDesc::MemOpDesc()
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp
index 3aa856e5469f5..89f9a0fecf5e9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp
@@ -22,7 +22,6 @@
 #include <llvm/Support/Debug.h>
 #include <llvm/Support/raw_ostream.h>
 #include <llvm/Transforms/Utils/Local.h>
-#include <multi_llvm/llvm_version.h>
 
 #include "debugging.h"
 #include "transform/passes.h"
@@ -225,8 +224,9 @@ bool BasicMem2RegPass::promoteAlloca(AllocaInst *Alloca) const {
           NewValue->getType()->getPrimitiveSizeInBits()) {
         return false;
       }
-      NewValue = CastInst::CreateBitOrPointerCast(StoredValue, Load->getType(),
-                                                  "", Load);
+      auto *CI = CastInst::CreateBitOrPointerCast(StoredValue, Load->getType());
+      CI->insertBefore(Load->getIterator());
+      NewValue = CI;
     }
     LLVM_DEBUG(dbgs() << "VM2R: Replaced :" << *Load << "\n");
     LLVM_DEBUG(dbgs() << "      |-> with :" << *NewValue << "\n");
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
index 0481b118ba694..67c256ffb4cec 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
@@ -38,6 +38,7 @@
 #include <llvm/Support/Error.h>
 #include <llvm/Support/TypeSize.h>
 #include <llvm/Support/raw_ostream.h>
+#include <multi_llvm/multi_llvm.h>
 
 #include <queue>
 #include <utility>
@@ -371,30 +372,33 @@ STATISTIC(VeczCFGFail,
 
 namespace {
 
-Instruction *getInsertionPt(BasicBlock &BB) {
+BasicBlock::iterator getInsertionPt(BasicBlock &BB) {
   // We have to insert instructions after any Allocas
   auto it = BB.getFirstInsertionPt();
   while (isa<AllocaInst>(*it)) {
     ++it;
   }
-  return &*it;
+  return it;
 }
 
-Instruction *copyMask(Value *mask, Twine name, Instruction *insertBefore) {
-  VECZ_ERROR_IF(!mask || !insertBefore,
-                "Trying to copy mask with invalid arguments");
+Instruction *copyMask(Value *mask, Twine name) {
+  VECZ_ERROR_IF(!mask, "Trying to copy mask with invalid arguments");
   return BinaryOperator::CreateAnd(mask, getDefaultValue(mask->getType(), 1),
-                                   name, insertBefore);
+                                   name);
 }
 
 Instruction *copyEntryMask(Value *mask, BasicBlock &BB) {
   VECZ_ERROR_IF(!mask, "Trying to copy entry mask with invalid arguments");
-  return copyMask(mask, BB.getName() + ".entry_mask", getInsertionPt(BB));
+  auto *EM = copyMask(mask, BB.getName() + ".entry_mask");
+  EM->insertBefore(getInsertionPt(BB));
+  return EM;
 }
 
 Instruction *copyExitMask(Value *mask, StringRef base, BasicBlock &BB) {
   VECZ_ERROR_IF(!mask, "Trying to copy exit mask with invalid arguments");
-  return copyMask(mask, base + ".exit_mask", BB.getTerminator());
+  auto *EM = copyMask(mask, base + ".exit_mask");
+  EM->insertBefore(BB.getTerminator()->getIterator());
+  return EM;
 }
 
 /// Wrap a string into an llvm::StringError, pointing to an instruction.
@@ -754,11 +758,11 @@ bool ControlFlowConversionState::Impl::createEntryMasks(BasicBlock &BB) {
         LLVM_DEBUG(dbgs() << "Blend block " << BB.getName()
                           << ": entry mask: " << *maskInfo.entryMask << "\n");
       } else {
-        Instruction *insertBefore =
-            cast<Instruction>(maskInfo.entryMask)->getNextNode();
+        auto InsertPt = std::next(maskInfo.entryMask->getIterator());
         maskInfo.entryMask = BinaryOperator::CreateOr(
             maskInfo.entryMask, MaskInfos[*it].exitMasks[&BB],
-            BB.getName() + ".entry_mask", insertBefore);
+            BB.getName() + ".entry_mask");
+        maskInfo.entryMask->insertBefore(InsertPt);
 
         LLVM_DEBUG(dbgs() << "Blend block " << BB.getName()
                           << ": entry mask: " << *maskInfo.entryMask << "\n");
@@ -1017,8 +1021,8 @@ bool ControlFlowConversionState::Impl::createLoopExitMasks(LoopTag &LTag) {
 
       BinaryOperator *maskUpdate = BinaryOperator::CreateOr(
           REM, maskUpdateOperand,
-          exitBlock->getName() + ".loop_exit_mask.update",
-          exitingBlock->getTerminator());
+          exitBlock->getName() + ".loop_exit_mask.update");
+      maskUpdate->insertBefore(exitingBlock->getTerminator()->getIterator());
 
       LMask.updatedPersistedDivergentExitMasks[exitingBlock] = maskUpdate;
 
@@ -1076,27 +1080,27 @@ bool ControlFlowConversionState::Impl::createCombinedLoopExitMask(
         LMask.combinedDivergentExitMask = copyMask(
             LMask.updatedPersistedDivergentExitMasks[exitingBlock]->getOperand(
                 1),
-            Loop->getName() + ".combined_divergent_exit_mask",
-            LTag.latch->getTerminator());
+            Loop->getName() + ".combined_divergent_exit_mask");
 
         LMask.persistedCombinedDivergentExitMask = copyMask(
             LMask.updatedPersistedDivergentExitMasks[exitingBlock],
-            Loop->getName() + ".persisted_combined_divergent_exit_mask",
-            LTag.latch->getTerminator());
+            Loop->getName() + ".persisted_combined_divergent_exit_mask");
       } else {
         LMask.combinedDivergentExitMask = BinaryOperator::CreateOr(
             LMask.combinedDivergentExitMask,
             LMask.updatedPersistedDivergentExitMasks[exitingBlock]->getOperand(
                 1),
-            Loop->getName() + ".combined_divergent_exit_mask",
-            LTag.latch->getTerminator());
+            Loop->getName() + ".combined_divergent_exit_mask");
 
         LMask.persistedCombinedDivergentExitMask = BinaryOperator::CreateOr(
             LMask.persistedCombinedDivergentExitMask,
             LMask.updatedPersistedDivergentExitMasks[exitingBlock],
-            Loop->getName() + ".persisted_combined_divergent_exit_mask",
-            LTag.latch->getTerminator());
+            Loop->getName() + ".persisted_combined_divergent_exit_mask");
       }
+      LMask.combinedDivergentExitMask->insertBefore(
+          LTag.latch->getTerminator()->getIterator());
+      LMask.persistedCombinedDivergentExitMask->insertBefore(
+          LTag.latch->getTerminator()->getIterator());
     }
   }
 
@@ -1195,7 +1199,8 @@ CallInst *ControlFlowConversionState::Impl::emitMaskedVersion(CallInst *CI,
   }
   fnArgs.push_back(entryBit);
 
-  CallInst *newCI = CallInst::Create(newFunction, fnArgs, "", CI);
+  CallInst *newCI = CallInst::Create(newFunction, fnArgs);
+  newCI->insertBefore(CI->getIterator());
   newCI->setCallingConv(CI->getCallingConv());
   newCI->setAttributes(CI->getAttributes());
 
@@ -1246,9 +1251,11 @@ bool ControlFlowConversionState::Impl::tryApplyMaskToBinOp(
               // to do anything.
               masked = divisor;
             } else {
-              masked = SelectInst::Create(
+              auto *SI = SelectInst::Create(
                   mask, divisor, ConstantInt::get(divisor->getType(), 1),
-                  divisor->getName() + ".masked", &I);
+                  divisor->getName() + ".masked");
+              SI->insertBefore(I.getIterator());
+              masked = SI;
             }
           }
 
@@ -1285,16 +1292,17 @@ bool ControlFlowConversionState::Impl::tryApplyMaskToMemOp(
   if (memOp.isLoadStoreInst()) {
     // Create a new mem-op the same as the original except for the addition
     // of the mask.
-    Value *newVal = nullptr;
+    Instruction *newVal = nullptr;
     if (memOp.isLoad()) {
       newVal = createMaskedLoad(
           Ctx, memOp.getDataType(), memOp.getPointerOperand(), wideMask,
-          /*VL*/ nullptr, memOp.getAlignment(), I->getName(), I);
+          /*VL*/ nullptr, memOp.getAlignment(), I->getName());
     } else {
       newVal = createMaskedStore(
           Ctx, memOp.getDataOperand(), memOp.getPointerOperand(), wideMask,
-          /*VL*/ nullptr, memOp.getAlignment(), I->getName(), I);
+          /*VL*/ nullptr, memOp.getAlignment(), I->getName());
     }
+    newVal->insertBefore(I->getIterator());
 
     VECZ_FAIL_IF(!newVal);
     if (!I->getType()->isVoidTy()) {
@@ -1305,8 +1313,9 @@ bool ControlFlowConversionState::Impl::tryApplyMaskToMemOp(
   }
 
   if (auto *opMask = memOp.getMaskOperand()) {
-    memOp.setMaskOperand(
-        BinaryOperator::CreateAnd(wideMask, opMask, "composite_mask", I));
+    auto *mask = BinaryOperator::CreateAnd(wideMask, opMask, "composite_mask");
+    mask->insertBefore(I->getIterator());
+    memOp.setMaskOperand(mask);
     return true;
   }
 
@@ -1436,8 +1445,9 @@ bool ControlFlowConversionState::Impl::applyMaskToAtomic(
         ConstantInt::get(IntegerType::getInt32Ty(I.getContext()), 1));
   }
 
-  CallInst *maskedCI = CallInst::Create(maskedAtomicFn, maskedFnArgs, "", &I);
+  CallInst *maskedCI = CallInst::Create(maskedAtomicFn, maskedFnArgs);
   VECZ_FAIL_IF(!maskedCI);
+  maskedCI->insertBefore(I.getIterator());
 
   I.replaceAllUsesWith(maskedCI);
   toDelete.emplace_back(&I, maskedCI);
@@ -1537,9 +1547,11 @@ bool ControlFlowConversionState::Impl::createBranchReductions() {
         if (auto *LTag = DR->getTag(&BB).loop;
             DR->isDivergent(BB) && (!LTag || LTag->isLoopDivergent())) {
           if (!isBranchCondTrulyUniform(cond, *UVR)) {
-            cond = BinaryOperator::Create(Instruction::BinaryOps::And, cond,
-                                          MaskInfos[&BB].entryMask,
-                                          cond->getName() + "_active", Branch);
+            auto *newcond = BinaryOperator::Create(
+                Instruction::BinaryOps::And, cond, MaskInfos[&BB].entryMask,
+                cond->getName() + "_active");
+            newcond->insertBefore(Branch->getIterator());
+            cond = newcond;
           }
         }
 
@@ -1548,8 +1560,9 @@ bool ControlFlowConversionState::Impl::createBranchReductions() {
             Twine(baseName).concat(name).str(), FT);
         VECZ_FAIL_IF(!F);
 
-        auto *const newCall = CallInst::Create(
-            F, {cond}, Twine(cond->getName()).concat(name), Branch);
+        auto *const newCall =
+            CallInst::Create(F, {cond}, Twine(cond->getName()).concat(name));
+        newCall->insertBefore(Branch->getIterator());
         Branch->setCondition(newCall);
       }
     } else if (isa<SwitchInst>(TI) &&
@@ -1855,7 +1868,8 @@ bool ControlFlowConversionState::Impl::rewireDivergentLoopExitBlocks(
         break;
       case Instruction::Br: {
         const unsigned keepIdx = succIdx == 0 ? 1 : 0;
-        auto *newT = BranchInst::Create(T->getSuccessor(keepIdx), T);
+        auto *newT = BranchInst::Create(T->getSuccessor(keepIdx));
+        newT->insertBefore(T->getIterator());
 
         updateMaps(T, newT);
 
@@ -2136,8 +2150,9 @@ bool ControlFlowConversionState::Impl::generateDivergentLoopResultUpdates(
   Value *mask = LMask.combinedDivergentExitMask;
   VECZ_ERROR_IF(!mask, "Divergent loop does not have an exit mask");
   PHINode *PHI = LTag.loopResultPrevs[LLV];
-  SelectInst *select = SelectInst::Create(
-      mask, LLV, PHI, LLV->getName() + ".update", LTag.latch->getTerminator());
+  SelectInst *select =
+      SelectInst::Create(mask, LLV, PHI, LLV->getName() + ".update");
+  select->insertBefore(LTag.latch->getTerminator()->getIterator());
   LTag.loopResultUpdates[LLV] = select;
 
   // The PHI function of each loop live value has one incoming value from
@@ -2632,7 +2647,8 @@ bool ControlFlowConversionState::Impl::linearizeCFG() {
         LLVM_DEBUG(dbgs() << "\tRemove successor: " << succ->getName() << "\n");
       }
 
-      auto *newT = BranchInst::Create(T->getSuccessor(0), T);
+      auto *newT = BranchInst::Create(T->getSuccessor(0));
+      newT->insertBefore(T->getIterator());
 
       updateMaps(T, newT);
 
@@ -2724,10 +2740,10 @@ bool ControlFlowConversionState::Impl::generateSelectFromPHI(PHINode *PHI,
     maskInfo.entryMask = copyEntryMask(PHI->getIncomingValue(0), *B);
     for (unsigned i = 1; i < phiNumIncVals; i++) {
       Value *V = PHI->getIncomingValue(i);
-      Instruction *insertBefore =
-          cast<Instruction>(maskInfo.entryMask)->getNextNode();
+      auto InsertPt = std::next(maskInfo.entryMask->getIterator());
       maskInfo.entryMask = BinaryOperator::CreateOr(
-          maskInfo.entryMask, V, B->getName() + ".entry_mask", insertBefore);
+          maskInfo.entryMask, V, B->getName() + ".entry_mask");
+      maskInfo.entryMask->insertBefore(InsertPt);
     }
     newVal = maskInfo.entryMask;
   } else {
@@ -2738,19 +2754,21 @@ bool ControlFlowConversionState::Impl::generateSelectFromPHI(PHINode *PHI,
       Value *cond = MaskInfos[PHIB].exitMasks[B];
       VECZ_ERROR_IF(!cond, "Exit mask does not exist");
 
-      Instruction *insertBefore = &*B->getFirstInsertionPt();
+      auto InsertPt = B->getFirstInsertionPt();
       if (i == 1) {
         if (Instruction *condI = dyn_cast<Instruction>(cond)) {
           BasicBlock *maskParent = condI->getParent();
           if (maskParent == B) {
-            insertBefore = condI->getNextNode();
+            InsertPt = std::next(condI->getIterator());
           }
         }
       } else {
-        insertBefore = cast<Instruction>(select)->getNextNode();
+        InsertPt = std::next(cast<Instruction>(select)->getIterator());
       }
-      select = SelectInst::Create(cond, V, select, PHI->getName() + ".blend",
-                                  insertBefore);
+      auto *selectInst =
+          SelectInst::Create(cond, V, select, PHI->getName() + ".blend");
+      selectInst->insertBefore(InsertPt);
+      select = selectInst;
     }
     newVal = select;
   }
@@ -2820,7 +2838,7 @@ bool ControlFlowConversionState::Impl::updatePHIsIncomings() {
       // Instruction that will combine the phi node and the select instructions
       // created from it if some incoming blocks are no longer predecessors.
       Instruction *newBlend = nullptr;
-      Instruction *insertBefore = getInsertionPt(*BB);
+      const BasicBlock::iterator InsertPt = getInsertionPt(*BB);
 
       auto &maskInfo = MaskInfos[BB];
       const bool isEntryMask = PHI == maskInfo.entryMask;
@@ -2837,24 +2855,25 @@ bool ControlFlowConversionState::Impl::updatePHIsIncomings() {
           // The entry mask of a blend value should be the conjunction of
           // the incoming masks, so change it.
           if (!newBlend) {
-            newBlend = BinaryOperator::CreateOr(
-                PHI, V, BB->getName() + ".entry_mask", insertBefore);
+            newBlend =
+                BinaryOperator::CreateOr(PHI, V, BB->getName() + ".entry_mask");
           } else {
-            newBlend = BinaryOperator::CreateOr(
-                newBlend, V, BB->getName() + ".entry_mask", insertBefore);
+            newBlend = BinaryOperator::CreateOr(newBlend, V,
+                                                BB->getName() + ".entry_mask");
           }
           maskInfo.entryMask = newBlend;
         } else {
           Value *cond = MaskInfos[incoming].exitMasks[BB];
           VECZ_ERROR_IF(!cond, "Exit mask does not exist");
           if (!newBlend) {
-            newBlend = SelectInst::Create(
-                cond, V, PHI, PHI->getName() + ".blend", insertBefore);
+            newBlend =
+                SelectInst::Create(cond, V, PHI, PHI->getName() + ".blend");
           } else {
-            newBlend = SelectInst::Create(
-                cond, V, newBlend, PHI->getName() + ".blend", insertBefore);
+            newBlend = SelectInst::Create(cond, V, newBlend,
+                                          PHI->getName() + ".blend");
           }
         }
+        newBlend->insertBefore(InsertPt);
         PHI->removeIncomingValue(idx--);
       }
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
index 4e31c18506adc..d515a87acd7c2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
@@ -30,7 +30,7 @@
 #include <llvm/IR/Instructions.h>
 #include <llvm/IR/Intrinsics.h>
 #include <llvm/Transforms/Utils/LoopUtils.h>
-#include <multi_llvm/multi_llvm.h>
+#include <multi_llvm/intrinsic.h>
 #include <multi_llvm/vector_type_helper.h>
 
 #include "debugging.h"
@@ -322,8 +322,8 @@ Value *createMaybeVPReduction(IRBuilderBase &B, Value *Val, RecurKind Kind,
       break;
   }
 
-  auto *const F = Intrinsic::getDeclaration(B.GetInsertBlock()->getModule(),
-                                            IntrinsicOp, Val->getType());
+  auto *const F = multi_llvm::GetOrInsertIntrinsicDeclaration(
+      B.GetInsertBlock()->getModule(), IntrinsicOp, Val->getType());
   assert(F && "Could not declare vector-predicated reduction intrinsic");
 
   auto *const VecTy = cast<VectorType>(Val->getType());
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
index d578417f38311..2299c8a22da7a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -2639,13 +2639,17 @@ ValuePacket Packetizer::Impl::packetizeMemOp(MemOp &op) {
     // Gather load or scatter store.
     for (unsigned i = 0; i != packetWidth; ++i) {
       if (op.isLoad()) {
-        results.push_back(createGather(Ctx, packetVecTy, ptrPacket[i],
-                                       maskPacket[i], EVL, op.getAlignment(),
-                                       name, op.getInstr()));
+        auto *gather =
+            createGather(Ctx, packetVecTy, ptrPacket[i], maskPacket[i], EVL,
+                         op.getAlignment(), name);
+        gather->insertBefore(op.getInstr()->getIterator());
+        results.push_back(gather);
       } else {
-        results.push_back(createScatter(Ctx, dataPacket[i], ptrPacket[i],
-                                        maskPacket[i], EVL, op.getAlignment(),
-                                        name, op.getInstr()));
+        auto *scatter =
+            createScatter(Ctx, dataPacket[i], ptrPacket[i], maskPacket[i], EVL,
+                          op.getAlignment(), name);
+        scatter->insertBefore(op.getInstr()->getIterator());
+        results.push_back(scatter);
       }
     }
   } else if (!constantStrideVal || constantStride != 1) {
@@ -2704,13 +2708,17 @@ ValuePacket Packetizer::Impl::packetizeMemOp(MemOp &op) {
                                   Twine(name, ".incr"));
       }
       if (op.isLoad()) {
-        results.push_back(
+        auto *newLoad =
             createInterleavedLoad(Ctx, packetVecTy, ptr, stride, maskPacket[i],
-                                  EVL, op.getAlignment(), name, op.getInstr()));
+                                  EVL, op.getAlignment(), name);
+        newLoad->insertBefore(op.getInstr()->getIterator());
+        results.push_back(newLoad);
       } else {
-        results.push_back(createInterleavedStore(
-            Ctx, dataPacket[i], ptr, stride, maskPacket[i], EVL,
-            op.getAlignment(), name, op.getInstr()));
+        auto *newStore =
+            createInterleavedStore(Ctx, dataPacket[i], ptr, stride,
+                                   maskPacket[i], EVL, op.getAlignment(), name);
+        newStore->insertBefore(op.getInstr()->getIterator());
+        results.push_back(newStore);
       }
     }
   } else {
@@ -2773,13 +2781,17 @@ ValuePacket Packetizer::Impl::packetizeMemOp(MemOp &op) {
                                     Twine(name, ".incr"));
         }
         if (op.isLoad()) {
-          results.push_back(createMaskedLoad(
-              Ctx, getWideType(dataTy, factor), ptr, maskPacket[i], EVL,
-              op.getAlignment(), name, op.getInstr()));
+          auto *newLoad =
+              createMaskedLoad(Ctx, getWideType(dataTy, factor), ptr,
+                               maskPacket[i], EVL, op.getAlignment(), name);
+          newLoad->insertBefore(op.getInstr()->getIterator());
+          results.push_back(newLoad);
         } else {
-          results.push_back(
+          auto *newStore =
               createMaskedStore(Ctx, dataPacket[i], ptr, maskPacket[i], EVL,
-                                op.getAlignment(), name, op.getInstr()));
+                                op.getAlignment(), name);
+          newStore->insertBefore(op.getInstr()->getIterator());
+          results.push_back(newStore);
         }
       }
     } else {
@@ -3367,13 +3379,14 @@ Value *Packetizer::Impl::vectorizeCall(CallInst *CI) {
     LoadInst *PointerRetResult =
         B.CreateLoad(PointerRetAlloca->getAllocatedType(), PointerRetAlloca);
     Value *Stride = getSizeInt(B, PointerRetStride);
-    auto *Store = createInterleavedStore(
-        Ctx, PointerRetResult, PointerRetAddr, Stride,
-        /*Mask*/ nullptr, /*EVL*/ nullptr, PointerRetAlloca->getAlign().value(),
-        "", &*B.GetInsertPoint());
+    auto *Store =
+        createInterleavedStore(Ctx, PointerRetResult, PointerRetAddr, Stride,
+                               /*Mask*/ nullptr, /*EVL*/ nullptr,
+                               PointerRetAlloca->getAlign().value());
     if (!Store) {
       return nullptr;
     }
+    Store->insertBefore(B.GetInsertPoint());
   }
   return NewCI;
 }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/pre_linearize_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/pre_linearize_pass.cpp
index a98ec1855d3ca..c76152a57f3b7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/pre_linearize_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/pre_linearize_pass.cpp
@@ -51,7 +51,6 @@
 #include <llvm/Pass.h>
 #include <llvm/Support/InstructionCost.h>
 #include <llvm/Transforms/Utils/LoopUtils.h>
-#include <multi_llvm/multi_llvm.h>
 #include <multi_llvm/vector_type_helper.h>
 
 #include "analysis/uniform_value_analysis.h"
@@ -192,15 +191,16 @@ bool hoistInstructions(BasicBlock &BB, BranchInst &Branch, bool exceptions) {
               Value *one = ConstantInt::get(divisor->getType(), 1);
               Value *cond = Branch.getCondition();
 
+              Instruction *SI;
               if (TrueBranch) {
-                masked =
-                    SelectInst::Create(cond, divisor, one,
-                                       divisor->getName() + ".hoist_guard", &I);
+                SI = SelectInst::Create(cond, divisor, one,
+                                        divisor->getName() + ".hoist_guard");
               } else {
-                masked =
-                    SelectInst::Create(cond, one, divisor,
-                                       divisor->getName() + ".hoist_guard", &I);
+                SI = SelectInst::Create(cond, one, divisor,
+                                        divisor->getName() + ".hoist_guard");
               }
+              SI->insertBefore(I.getIterator());
+              masked = SI;
             }
           }
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/remove_intptr_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/remove_intptr_pass.cpp
index bc563fbba7150..fa3fa92c1b418 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/remove_intptr_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/remove_intptr_pass.cpp
@@ -63,7 +63,8 @@ PreservedAnalyses RemoveIntPtrPass::run(Function &F,
         // hopefully be removed later.
         auto num_values = phi->getNumIncomingValues();
         PHINode *new_phi = PHINode::Create(int_ptr->getSrcTy(), num_values,
-                                           phi->getName() + ".intptr", phi);
+                                           phi->getName() + ".intptr");
+        new_phi->insertBefore(phi->getIterator());
 
         Instruction *insert = phi;
         while (isa<PHINode>(insert)) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/simplify_infinite_loop_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/simplify_infinite_loop_pass.cpp
index e3feb0573839e..f104f7b6e85ee 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/simplify_infinite_loop_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/simplify_infinite_loop_pass.cpp
@@ -20,7 +20,6 @@
 #include <unordered_set>
 
 #include "debugging.h"
-#include "multi_llvm/multi_llvm.h"
 #include "transform/passes.h"
 
 using namespace llvm;
@@ -100,8 +99,8 @@ PreservedAnalyses vecz::SimplifyInfiniteLoopPass::run(
     }
     // Add new phi nodes for instructions computed in `toBlend`.
     for (Instruction *I : toBlend) {
-      PHINode *PHI = PHINode::Create(I->getType(), 2, I->getName() + ".blend",
-                                     &target->front());
+      PHINode *PHI = PHINode::Create(I->getType(), 2, I->getName() + ".blend");
+      PHI->insertBefore(target->begin());
       for (BasicBlock *pred : predecessors(target)) {
         if (pred != virtualExit) {
           PHI->addIncoming(I, pred);
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/ternary_transform_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/ternary_transform_pass.cpp
index 69214a20da3a0..d6707529dfca5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/ternary_transform_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/ternary_transform_pass.cpp
@@ -19,6 +19,7 @@
 #include <llvm/IR/IRBuilder.h>
 #include <llvm/IR/Module.h>
 #include <llvm/IR/PassManager.h>
+#include <multi_llvm/multi_llvm.h>
 
 #include "analysis/stride_analysis.h"
 #include "analysis/uniform_value_analysis.h"
@@ -153,12 +154,14 @@ void Transform(SelectInst *Select, VectorizationContext &Ctx) {
     auto Alignment = Mem.getAlignment();
     if (isa<LoadInst>(Memop)) {
       // Transform load
-      Value *LoadTrue =
+      auto *LoadTrue =
           createMaskedLoad(Ctx, Mem.getDataType(), GepTrue, Condition,
-                           /*VL*/ nullptr, Alignment, "", Memop);
-      Value *LoadFalse =
+                           /*VL*/ nullptr, Alignment);
+      LoadTrue->insertBefore(Memop->getIterator());
+      auto *LoadFalse =
           createMaskedLoad(Ctx, Mem.getDataType(), GepFalse, InvCondition,
-                           /*VL*/ nullptr, Alignment, "", Memop);
+                           /*VL*/ nullptr, Alignment);
+      LoadFalse->insertBefore(Memop->getIterator());
       B.SetInsertPoint(Memop);
       Value *LoadResult = B.CreateSelect(Condition, LoadTrue, LoadFalse);
 
@@ -167,9 +170,11 @@ void Transform(SelectInst *Select, VectorizationContext &Ctx) {
     } else if (isa<StoreInst>(Memop)) {
       // Transform store
       createMaskedStore(Ctx, StoredValue, GepTrue, Condition, /*VL*/ nullptr,
-                        Alignment, "", Memop);
+                        Alignment)
+          ->insertBefore(Memop->getIterator());
       createMaskedStore(Ctx, StoredValue, GepFalse, InvCondition,
-                        /*VL*/ nullptr, Alignment, "", Memop);
+                        /*VL*/ nullptr, Alignment)
+          ->insertBefore(Memop->getIterator());
     }
   };
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/uniform_reassociation_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/uniform_reassociation_pass.cpp
index d4fe6be17dc18..18fbd5848c5c3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/uniform_reassociation_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/uniform_reassociation_pass.cpp
@@ -166,7 +166,8 @@ bool Reassociator::reassociate(llvm::BinaryOperator &Op) {
       // Transform (Varying Op Uniform) Op Varying
       // into (Varying Op Varying) Op Uniform
       auto *const P = BinaryOperator::Create(Opcode, A->getOperand(0), RHS,
-                                             "varying.reassoc", &Op);
+                                             "varying.reassoc");
+      P->insertBefore(Op.getIterator());
       UVR->setVarying(P);
       Op.setOperand(0, P);
       Op.setOperand(1, A->getOperand(1));
@@ -177,7 +178,8 @@ bool Reassociator::reassociate(llvm::BinaryOperator &Op) {
       // Transform (Varying Op Uniform) Op Uniform
       // into Varying Op (Uniform Op Uniform)
       auto *const P = BinaryOperator::Create(Opcode, A->getOperand(1), RHS,
-                                             "uniform.reassoc", &Op);
+                                             "uniform.reassoc");
+      P->insertBefore(Op.getIterator());
       Op.setOperand(0, A->getOperand(0));
       Op.setOperand(1, P);
       UVR->remove(A);
@@ -192,7 +194,8 @@ bool Reassociator::reassociate(llvm::BinaryOperator &Op) {
     // Transform Varying Op (Varying Op Uniform)
     // into (Varying Op Varying) Op Uniform
     auto *const P = BinaryOperator::Create(Opcode, B->getOperand(0), LHS,
-                                           "varying.reassoc", &Op);
+                                           "varying.reassoc");
+    P->insertBefore(Op.getIterator());
     Op.setOperand(0, P);
     Op.setOperand(1, B->getOperand(1));
     UVR->setVarying(P);
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
index 172c6317c4f69..50de808f10e45 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
@@ -20,6 +20,7 @@
 #include <llvm/MC/TargetRegistry.h>
 #include <llvm/Target/TargetMachine.h>
 #include <llvm/TargetParser/Triple.h>
+#include <multi_llvm/intrinsic.h>
 #include <multi_llvm/vector_type_helper.h>
 
 #include "debugging.h"
@@ -505,7 +506,7 @@ Value *TargetInfo::createMaskedGatherLoad(IRBuilder<> &B, Type *Ty, Value *Ptr,
       const SmallVector<llvm::Type *, 2> Tys = {Ty, VecPtrTy};
       return B.CreateIntrinsic(llvm::Intrinsic::vp_gather, Tys, Args);
     } else if (Legality.isMaskLegal()) {
-      Function *MaskedGather = Intrinsic::getDeclaration(
+      Function *MaskedGather = multi_llvm::GetOrInsertIntrinsicDeclaration(
           F->getParent(), Intrinsic::masked_gather, {Ty, VecPtrTy});
 
       if (MaskedGather) {
@@ -603,7 +604,7 @@ Value *TargetInfo::createMaskedScatterStore(IRBuilder<> &B, Value *Data,
       const SmallVector<llvm::Type *, 2> Tys = {Data->getType(), VecPtrTy};
       return B.CreateIntrinsic(llvm::Intrinsic::vp_scatter, Tys, Args);
     } else if (Legality.isMaskLegal()) {
-      Function *MaskedScatter = Intrinsic::getDeclaration(
+      Function *MaskedScatter = multi_llvm::GetOrInsertIntrinsicDeclaration(
           F->getParent(), Intrinsic::masked_scatter, {DataTy, VecPtrTy});
 
       if (MaskedScatter) {
@@ -699,7 +700,7 @@ Value *TargetInfo::createScalableExtractElement(IRBuilder<> &B,
   const unsigned fixedVecElts =
       multi_llvm::getVectorNumElements(origSrc->getType());
 
-  Value *load = nullptr;
+  Instruction *load = nullptr;
   if (!index->getType()->isVectorTy()) {
     // If the index remains a scalar (is uniform) then we can use a strided load
     // starting from the address '&alloc[index]', strided by the original vector
@@ -713,8 +714,7 @@ Value *TargetInfo::createScalableExtractElement(IRBuilder<> &B,
         B.CreateInBoundsGEP(eltTy, bcastalloc, index, "vec.alloc");
 
     load = ::createInterleavedLoad(Ctx, narrowTy, gep, stride, /*Mask*/ nullptr,
-                                   /*EVL*/ nullptr, alignment.value(), "",
-                                   &*B.GetInsertPoint());
+                                   /*EVL*/ nullptr, alignment.value());
   } else {
     // Else if we've got a varying, vector index, then we must use a gather.
     // Take our indices, and add them to a step multiplied by the original
@@ -731,8 +731,9 @@ Value *TargetInfo::createScalableExtractElement(IRBuilder<> &B,
         B.CreateInBoundsGEP(eltTy, bcastalloc, index, "vec.alloc");
 
     load = ::createGather(Ctx, narrowTy, gep, /*Mask*/ nullptr, /*EVL*/ nullptr,
-                          alignment.value(), "", &*B.GetInsertPoint());
+                          alignment.value());
   }
+  load->insertBefore(B.GetInsertPoint());
 
   return load;
 }
@@ -848,6 +849,7 @@ Value *TargetInfo::createScalableInsertElement(IRBuilder<> &B,
 
   // Construct the index, either by packetizing if (if varying) or by
   // splatting it and combining it with a step vector
+  Instruction *store;
   if (!index->getType()->isVectorTy()) {
     // If the index remains a scalar (is uniform) then we can use a strided
     // store starting from the address '&alloc[index]', strided by the original
@@ -861,10 +863,8 @@ Value *TargetInfo::createScalableInsertElement(IRBuilder<> &B,
     auto *const gep =
         B.CreateInBoundsGEP(scalarTy, bcastalloc, index, "vec.alloc");
 
-    Value *store = ::createInterleavedStore(
-        Ctx, elt, gep, stride, /*Mask*/ nullptr,
-        /*EVL*/ nullptr, alignment.value(), "", &*B.GetInsertPoint());
-    VECZ_FAIL_IF(!store);
+    store = ::createInterleavedStore(Ctx, elt, gep, stride, /*Mask*/ nullptr,
+                                     /*EVL*/ nullptr, alignment.value());
   } else {
     // Else if we've got a varying, vector index, then we must use a scatter.
     // Take our indices, and add them to a step multiplied by the original
@@ -886,11 +886,11 @@ Value *TargetInfo::createScalableInsertElement(IRBuilder<> &B,
     auto *const gep =
         B.CreateInBoundsGEP(scalarTy, bcastalloc, index, "vec.alloc");
 
-    Value *store = ::createScatter(Ctx, elt, gep, /*Mask*/ nullptr,
-                                   /*EVL*/ nullptr, alignment.value(), "",
-                                   &*B.GetInsertPoint());
-    VECZ_FAIL_IF(!store);
+    store = ::createScatter(Ctx, elt, gep, /*Mask*/ nullptr,
+                            /*EVL*/ nullptr, alignment.value());
   }
+  VECZ_FAIL_IF(!store);
+  store->insertBefore(B.GetInsertPoint());
 
   // Load the vector back from the stack
   return B.CreateLoad(intoTy, alloc);
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_arm.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_arm.cpp
index e6fa868d072f4..ea5c7284e2556 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_arm.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_arm.cpp
@@ -18,6 +18,7 @@
 #include <llvm/IR/Instructions.h>
 #include <llvm/IR/IntrinsicsAArch64.h>
 #include <llvm/IR/IntrinsicsARM.h>
+#include <multi_llvm/intrinsic.h>
 #include <multi_llvm/vector_type_helper.h>
 
 #include "debugging.h"
@@ -223,8 +224,8 @@ bool TargetInfoArm::optimizeInterleavedGroup(IRBuilder<> &B,
     Tys = {VecTy, PtrTy};
   }
 
-  Function *IntrFn =
-      Intrinsic::getDeclaration(Op0->getModule(), (Intrinsic::ID)IntrID, Tys);
+  Function *IntrFn = multi_llvm::GetOrInsertIntrinsicDeclaration(
+      Op0->getModule(), (Intrinsic::ID)IntrID, Tys);
   if (!IntrFn) {
     return false;
   }
@@ -378,7 +379,7 @@ bool TargetInfoAArch64::optimizeInterleavedGroup(
     VecTy = cast<FixedVectorType>(Op0->getType());
   }
 
-  Function *IntrFn = Intrinsic::getDeclaration(
+  Function *IntrFn = multi_llvm::GetOrInsertIntrinsicDeclaration(
       Op0->getModule(), (Intrinsic::ID)IntrID, {VecTy, PtrTy});
   if (!IntrFn) {
     return false;

From 1d452d5d31e7f9682521bd7cdaf18acec3b8272c Mon Sep 17 00:00:00 2001
From: Colin Davidson <colin.davidson@codeplay.com>
Date: Wed, 16 Oct 2024 14:02:23 +0100
Subject: [PATCH 126/182] Add support for ConstantArray in replace-module-scope
 pass.

Native CPU for aarch64 and risc-v was outputting constant arrays which
pointed to local variables. This was not handled by the replace-module-scope
pass.

Fixed by iterating through the ConstantArray and turning each element into
an instruction in replaceConstantExpressionWithInstruction().
---
 .../source/pass_functions.cpp                 | 22 +++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp
index 1225c4975aba8..1287e1009df0c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp
@@ -153,9 +153,11 @@ void replaceConstantExpressionWithInstruction(llvm::Constant *const constant) {
   // passes)
   constant->removeDeadConstantUsers();
 
-  // Only handle constants which are ConstantExpr or ConstantVector
+  // Only handle constants which are ConstantExpr, ConstantVector or
+  // ConstantArray
   assert((llvm::isa<llvm::ConstantExpr>(constant) ||
-          llvm::isa<llvm::ConstantVector>(constant)) &&
+          llvm::isa<llvm::ConstantVector>(constant) ||
+          llvm::isa<llvm::ConstantArray>(constant)) &&
          "Unsupported constant type in IR");
 
   // For each user of a constant we will check to see if they in turn are
@@ -229,6 +231,22 @@ void replaceConstantExpressionWithInstruction(llvm::Constant *const constant) {
           llvm::FixedVectorType::get(i32Ty, numEls));
       newInst = new llvm::ShuffleVectorInst(insert, undef, zeros);
       newInst->insertAfter(insert);
+    } else if (llvm::ConstantArray *constantArr =
+                   llvm::dyn_cast<llvm::ConstantArray>(constant)) {
+      auto numEls = constantArr->getNumOperands();
+      llvm::Value *undef = llvm::UndefValue::get(constantArr->getType());
+      llvm::Instruction *insertedIns = nullptr;
+      for (unsigned int i = 0; i < numEls; i++) {
+        auto *insertNext = llvm::InsertValueInst::Create(
+            insertedIns ? insertedIns : undef, constantArr->getOperand(i), {i});
+        if (insertedIns) {
+          insertNext->insertAfter(insertedIns);
+        } else {
+          insertNext->insertBefore(useFunc->getEntryBlock().getFirstNonPHI());
+        }
+        insertedIns = insertNext;
+      }
+      newInst = insertedIns;
     }
 
     // replace the use of the constant with the instruction

From 52f70b89ac6fee27cc197bcdf651e0d6c0ee6d24 Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Mon, 11 Nov 2024 20:14:54 +0000
Subject: [PATCH 127/182] [NFC] Avoid deprecated Type::getPointerTo.

Type::getPointerTo has been deprecated in favour of PointerType::get.
This commit updates the calls accordingly.

Exceptions:
- Some uses were to perform bitcasts between pointer types. These
  bitcasts did nothing ever since LLVM moved to opaque pointers, and are
  removed instead.
- Some uses were in a context where an IRBuilder was available, in which
  case IRBuilder::getPtrTy provides a simpler alternative.
---
 .../source/cl_builtin_info.cpp                |  6 ++--
 .../source/mux_builtin_info.cpp               |  4 +--
 ...lace_local_module_scope_variables_pass.cpp |  3 +-
 .../transform/builtin_inlining_pass.cpp       | 17 ++--------
 .../source/transform/remove_intptr_pass.cpp   |  5 +--
 .../vecz/source/vector_target_info.cpp        | 32 +++++++------------
 6 files changed, 21 insertions(+), 46 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp
index 89449b218b139..9490b236ac603 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp
@@ -2190,8 +2190,7 @@ Value *CLBuiltinInfo::emitBuiltinInlineVLoad(Function *F, unsigned Width,
       Data = B.CreateInsertElement(Data, Lane, Index, "vload_insert");
     }
   } else {
-    PointerType *VecPtrTy = DataTy->getPointerTo(PtrTy->getAddressSpace());
-    Value *VecBase = B.CreateBitCast(GEPBase, VecPtrTy, "vload_ptr");
+    Value *VecBase = B.CreateBitCast(GEPBase, PtrTy, "vload_ptr");
     auto *Load = B.CreateLoad(DataTy, VecBase, false, "vload");
 
     const unsigned Align = DataTy->getScalarSizeInBits() / 8;
@@ -2251,8 +2250,7 @@ Value *CLBuiltinInfo::emitBuiltinInlineVStore(Function *F, unsigned Width,
       Store = B.CreateStore(Lane, GEP, false);
     }
   } else {
-    PointerType *VecPtrTy = VecDataTy->getPointerTo(PtrTy->getAddressSpace());
-    Value *VecBase = B.CreateBitCast(GEPBase, VecPtrTy, "vstore_ptr");
+    Value *VecBase = B.CreateBitCast(GEPBase, PtrTy, "vstore_ptr");
     Store = B.CreateStore(Data, VecBase, false);
 
     const unsigned Align = VecDataTy->getScalarSizeInBits() / 8;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mux_builtin_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mux_builtin_info.cpp
index a84ee21df5e6f..a01c67fc005f6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mux_builtin_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mux_builtin_info.cpp
@@ -1208,7 +1208,7 @@ BIMuxInfoConcept::getMuxSchedulingParameters(Module &M) {
     auto *const WIInfoS = getWorkItemInfoStructTy(M);
     WIInfo.ID = SchedParamIndices::WI;
     WIInfo.ParamPointeeTy = WIInfoS;
-    WIInfo.ParamTy = WIInfoS->getPointerTo();
+    WIInfo.ParamTy = PointerType::get(WIInfoS, /*AddressSpace=*/0);
     WIInfo.ParamName = "wi-info";
     WIInfo.ParamDebugName = WIInfoS->getStructName().str();
     WIInfo.PassedExternally = false;
@@ -1224,7 +1224,7 @@ BIMuxInfoConcept::getMuxSchedulingParameters(Module &M) {
     auto *const WGInfoS = getWorkGroupInfoStructTy(M);
     WGInfo.ID = SchedParamIndices::WG;
     WGInfo.ParamPointeeTy = WGInfoS;
-    WGInfo.ParamTy = WGInfoS->getPointerTo();
+    WGInfo.ParamTy = PointerType::get(WGInfoS, /*AddressSpace=*/0);
     WGInfo.ParamName = "wg-info";
     WGInfo.ParamDebugName = WGInfoS->getStructName().str();
     WGInfo.PassedExternally = true;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp
index b3381b85ff360..e6b2de85afc4e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp
@@ -354,7 +354,8 @@ PreservedAnalyses compiler::utils::ReplaceLocalModuleScopeVariablesPass::run(
 
   // change all our functions to take a pointer to the new structTy we created
   const AttributeSet defaultAttrs;
-  addParamToAllRequiredFunctions(M, structTy->getPointerTo(), defaultAttrs);
+  addParamToAllRequiredFunctions(
+      M, PointerType::get(structTy, /*AddressSpace=*/0), defaultAttrs);
 
   // Check if we have debug info, if so we need to fix it up to turn global
   // variable entries into local variable ones.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/builtin_inlining_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/builtin_inlining_pass.cpp
index f8c49fa1fbe7b..39ff78796a3bc 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/builtin_inlining_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/builtin_inlining_pass.cpp
@@ -153,14 +153,10 @@ static Value *emitBuiltinMemSet(Function *F, IRBuilder<> &B,
   int64_t byte = 0;
   // Initially we use 64bit loads and stores, in order to avoid emitting too
   // many instructions.
-  // We can't just get an Int64PtrTy because we need the correct address space
-  Type *DstInt64PtrTy = B.getInt64Ty()->getPointerTo(
-      cast<PointerType>(DstPtr->getType())->getAddressSpace());
 
   for (; byte <= Bytes - 8; byte += 8) {
     Value *Idx = B.getIntN(PtrBits, byte);
-    Value *OffsetDstPtr = B.CreateBitCast(
-        B.CreateInBoundsGEP(Int8Ty, DstPtr, Idx), DstInt64PtrTy, DstName);
+    Value *OffsetDstPtr = B.CreateInBoundsGEP(Int8Ty, DstPtr, Idx);
     MS = B.CreateStore(StoredValue64, OffsetDstPtr, IsVolatile);
 
     // Set alignments for store to be minimum of that from
@@ -223,19 +219,12 @@ static Value *emitBuiltinMemCpy(Function *F, IRBuilder<> &B,
   int64_t byte = 0;
   // Initially we use 64bit loads and stores, in order to avoid emitting too
   // many instructions...
-  // We can't just get an Int64PtrTy because we need the correct address space
   Type *Int64Ty = B.getInt64Ty();
-  Type *SrcInt64PtrTy = Int64Ty->getPointerTo(
-      cast<PointerType>(SrcPtr->getType())->getAddressSpace());
-  Type *DstInt64PtrTy = Int64Ty->getPointerTo(
-      cast<PointerType>(DstPtr->getType())->getAddressSpace());
 
   for (; byte <= Length - 8; byte += 8) {
     Value *Idx = B.getIntN(PtrBits, byte);
-    Value *OffsetSrcPtr = B.CreateBitCast(
-        B.CreateInBoundsGEP(Int8Ty, SrcPtr, Idx), SrcInt64PtrTy);
-    Value *OffsetDstPtr = B.CreateBitCast(
-        B.CreateInBoundsGEP(Int8Ty, DstPtr, Idx), DstInt64PtrTy, DstName);
+    Value *OffsetSrcPtr = B.CreateInBoundsGEP(Int8Ty, SrcPtr, Idx);
+    Value *OffsetDstPtr = B.CreateInBoundsGEP(Int8Ty, DstPtr, Idx);
     LoadInst *LoadValue =
         B.CreateLoad(Int64Ty, OffsetSrcPtr, IsVolatile, SrcName);
     MC = B.CreateStore(LoadValue, OffsetDstPtr, IsVolatile);
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/remove_intptr_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/remove_intptr_pass.cpp
index fa3fa92c1b418..9d66e14e73eef 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/remove_intptr_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/remove_intptr_pass.cpp
@@ -105,10 +105,7 @@ PreservedAnalyses RemoveIntPtrPass::run(Function &F,
 
         if (index) {
           Value *operand = int_ptr->getOperand(0);
-          Value *cast_operand = B.CreateBitCast(
-              operand, i8_ty->getPointerTo(
-                           operand->getType()->getPointerAddressSpace()));
-          Value *new_gep = B.CreateGEP(i8_ty, cast_operand, index, name);
+          Value *new_gep = B.CreateGEP(i8_ty, operand, index, name);
           Value *new_cast = B.CreatePtrToInt(new_gep, bin_op->getType(), name);
           bin_op->replaceAllUsesWith(new_cast);
           bin_op->eraseFromParent();
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
index 50de808f10e45..4cf4132384e3a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
@@ -88,8 +88,6 @@ Value *TargetInfo::createLoad(IRBuilder<> &B, Type *Ty, Value *Ptr,
 
   // Trivial case: contiguous load.
   ConstantInt *CIntStride = dyn_cast<ConstantInt>(Stride);
-  PointerType *VecPtrTy = Ty->getPointerTo(PtrTy->getAddressSpace());
-  Value *VecPtr = B.CreateBitCast(Ptr, VecPtrTy);
   if (CIntStride && CIntStride->getSExtValue() == 1) {
     if (EVL) {
       const Function *F = B.GetInsertBlock()->getParent();
@@ -101,11 +99,11 @@ Value *TargetInfo::createLoad(IRBuilder<> &B, Type *Ty, Value *Ptr,
         VECZ_FAIL();
       }
       auto *Mask = createAllTrueMask(B, multi_llvm::getVectorElementCount(Ty));
-      const SmallVector<llvm::Value *, 2> Args = {VecPtr, Mask, EVL};
-      const SmallVector<llvm::Type *, 2> Tys = {Ty, VecPtr->getType()};
+      const SmallVector<llvm::Value *, 2> Args = {Ptr, Mask, EVL};
+      const SmallVector<llvm::Type *, 2> Tys = {Ty, Ptr->getType()};
       return B.CreateIntrinsic(llvm::Intrinsic::vp_load, Tys, Args);
     }
-    return B.CreateAlignedLoad(Ty, VecPtr, MaybeAlign(Alignment));
+    return B.CreateAlignedLoad(Ty, Ptr, MaybeAlign(Alignment));
   }
 
   if (EVL) {
@@ -157,8 +155,6 @@ Value *TargetInfo::createStore(IRBuilder<> &B, Value *Data, Value *Ptr,
   // Trivial case: contiguous store.
   ConstantInt *CIntStride = dyn_cast<ConstantInt>(Stride);
   if (CIntStride && CIntStride->getSExtValue() == 1) {
-    PointerType *VecPtrTy = VecTy->getPointerTo(PtrTy->getAddressSpace());
-    Value *VecPtr = B.CreateBitCast(Ptr, VecPtrTy);
     if (EVL) {
       const Function *F = B.GetInsertBlock()->getParent();
       const auto Legality = isVPStoreLegal(F, VecTy, Alignment);
@@ -170,12 +166,12 @@ Value *TargetInfo::createStore(IRBuilder<> &B, Value *Data, Value *Ptr,
       }
       auto *Mask =
           createAllTrueMask(B, multi_llvm::getVectorElementCount(VecTy));
-      const SmallVector<llvm::Value *, 3> Args = {Data, VecPtr, Mask, EVL};
+      const SmallVector<llvm::Value *, 3> Args = {Data, Ptr, Mask, EVL};
       const SmallVector<llvm::Type *, 2> Tys = {Data->getType(),
-                                                VecPtr->getType()};
+                                                Ptr->getType()};
       return B.CreateIntrinsic(llvm::Intrinsic::vp_store, Tys, Args);
     }
-    return B.CreateAlignedStore(Data, VecPtr, MaybeAlign(Alignment));
+    return B.CreateAlignedStore(Data, Ptr, MaybeAlign(Alignment));
   }
 
   if (EVL) {
@@ -231,8 +227,6 @@ Value *TargetInfo::createMaskedLoad(IRBuilder<> &B, Type *Ty, Value *Ptr,
 
   // Use LLVM intrinsics for masked vector loads.
   if (Ty->isVectorTy()) {
-    PtrTy = Ty->getPointerTo(PtrTy->getAddressSpace());
-    Ptr = B.CreateBitCast(Ptr, PtrTy);
     const Function *F = B.GetInsertBlock()->getParent();
     const auto Legality = isVPLoadLegal(F, Ty, Alignment);
     if (EVL && Legality.isVPLegal()) {
@@ -338,8 +332,6 @@ Value *TargetInfo::createMaskedStore(IRBuilder<> &B, Value *Data, Value *Ptr,
 
   // Use LLVM intrinsics for masked vector Stores.
   if (DataTy->isVectorTy()) {
-    PtrTy = DataTy->getPointerTo(PtrTy->getAddressSpace());
-    Ptr = B.CreateBitCast(Ptr, PtrTy);
     const Function *F = B.GetInsertBlock()->getParent();
     const auto Legality = isVPStoreLegal(F, DataTy, Alignment);
     if (EVL && Legality.isVPLegal()) {
@@ -693,7 +685,7 @@ Value *TargetInfo::createScalableExtractElement(IRBuilder<> &B,
   B.CreateStore(src, alloc);
 
   // Re-interpret the allocation as a pointer to the element type
-  auto *const eltptrTy = eltTy->getPointerTo();
+  auto *const eltptrTy = PointerType::get(eltTy, /*AddressSpace=*/0);
   auto *const bcastalloc =
       B.CreatePointerBitCastOrAddrSpaceCast(alloc, eltptrTy, "bcast.alloc");
 
@@ -777,7 +769,7 @@ Value *TargetInfo::createScalableBroadcast(IRBuilder<> &B, Value *vector,
 
   auto *const eltTy = cast<llvm::VectorType>(ty)->getElementType();
 
-  auto *const eltptrTy = eltTy->getPointerTo();
+  auto *const eltptrTy = PointerType::get(eltTy, /*AddressSpace=*/0);
   auto *const bcastalloc =
       B.CreatePointerBitCastOrAddrSpaceCast(alloc, eltptrTy, "bcast.alloc");
   auto *const stepsRem = TargetInfo::createBroadcastIndexVector(
@@ -840,7 +832,7 @@ Value *TargetInfo::createScalableInsertElement(IRBuilder<> &B,
   B.CreateStore(into, alloc);
 
   // Re-interpret the allocation as a pointer to the element type
-  auto *const eltptrTy = scalarTy->getPointerTo();
+  auto *const eltptrTy = PointerType::get(scalarTy, /*AddressSpace=*/0);
   auto *const bcastalloc =
       B.CreatePointerBitCastOrAddrSpaceCast(alloc, eltptrTy, "bcast.alloc");
 
@@ -915,9 +907,7 @@ TargetInfo::VPMemOpLegality TargetInfo::checkMemOpLegality(
   bool isVPLegal = isMaskLegal && isVPVectorLegal(*F, Ty);
   if (isVPLegal) {
     const unsigned PtrBitWidth =
-        TM_ ? TM_->createDataLayout().getPointerTypeSizeInBits(
-                  Ty->getPointerTo())
-            : 64;
+        TM_ ? TM_->createDataLayout().getPointerSizeInBits(/*AS=*/0) : 64;
     auto &Ctx = Ty->getContext();
     auto *const IntTy = IntegerType::get(Ctx, PtrBitWidth);
     auto *const IntVecTy =
@@ -982,7 +972,7 @@ llvm::Value *TargetInfo::createVectorShuffle(llvm::IRBuilder<> &B,
   auto *const eltTy = srcTy->getElementType();
 
   // Re-interpret the allocation as a pointer to the element type
-  auto *const eltptrTy = eltTy->getPointerTo();
+  auto *const eltptrTy = PointerType::get(eltTy, /*AddressSpace=*/0);
   auto *const bcastalloc =
       B.CreatePointerBitCastOrAddrSpaceCast(alloc, eltptrTy, "bcast.alloc");
 

From 626b71d360afa3927591ff92f025e379c60ec32d Mon Sep 17 00:00:00 2001
From: Colin Davidson <colin.davidson@codeplay.com>
Date: Fri, 15 Nov 2024 11:24:46 +0000
Subject: [PATCH 128/182] Fix vecz lit test for splat issues after llvm 20
 changes to output

---
 .../define_interleaved_store.ll               |  2 +-
 .../define_interleaved_store_as_masked.ll     |  2 +-
 .../vector_phi_uniform.ll                     |  2 +-
 .../vector_phi_varying.ll                     |  8 ++---
 .../llvm/VectorPredication/load_add_store.ll  | 16 ++++-----
 .../llvm/VectorWidening/interleaved_safety.ll |  2 +-
 .../lit/llvm/VectorWidening/widen_fshl.ll     |  2 +-
 .../lit/llvm/VectorWidening/widen_fshr.ll     |  2 +-
 .../vecz/test/lit/llvm/cmpxchg.ll             |  6 ++--
 .../vecz/test/lit/llvm/define_gather_load.ll  |  2 +-
 .../lit/llvm/define_gather_load_as_masked.ll  |  2 +-
 .../test/lit/llvm/define_interleaved_load.ll  |  2 +-
 .../llvm/define_interleaved_load_as_masked.ll |  2 +-
 .../test/lit/llvm/define_interleaved_store.ll |  2 +-
 .../define_interleaved_store_as_masked.ll     |  2 +-
 .../test/lit/llvm/define_scatter_store.ll     |  2 +-
 .../llvm/define_scatter_store_as_masked.ll    |  2 +-
 .../lit/llvm/insertelement_runtime_index.ll   | 14 ++++----
 .../vecz/test/lit/llvm/interleaved_safety.ll  |  4 +--
 .../vecz/test/lit/llvm/masked_atomics.ll      | 10 +++---
 .../vecz/test/lit/llvm/masked_cmpxchg.ll      |  6 ++--
 .../test/lit/llvm/packetization_branch.ll     |  2 +-
 .../lit/llvm/packetization_uniform_branch.ll  |  4 +--
 .../llvm/packetize_uniform_default_reduce.ll  |  2 +-
 .../llvm/packetize_uniform_loops_reduce.ll    |  2 +-
 .../test/lit/llvm/packetize_uniform_reduce.ll |  2 +-
 .../vecz/test/lit/llvm/squash_extract_sext.ll | 14 ++++----
 .../lit/llvm/squash_extract_sext_bigendian.ll | 14 ++++----
 .../lit/llvm/squash_extract_zext_bigendian.ll | 14 ++++----
 .../test/lit/llvm/subgroup_shuffle_down.ll    | 16 ++++-----
 .../vecz/test/lit/llvm/subgroup_shuffle_up.ll | 34 +++++++++----------
 .../test/lit/llvm/subgroup_shuffle_xor.ll     | 16 ++++-----
 ..._transform_uniform_condition_packetized.ll |  2 +-
 .../vecz/test/lit/llvm/vector_phi_uniform.ll  |  2 +-
 34 files changed, 108 insertions(+), 108 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store.ll
index 8f246189cd678..93fb9fa339d13 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store.ll
@@ -76,7 +76,7 @@ attributes #3 = { nobuiltin nounwind }
 ; CHECK: %BroadcastAddr.splatinsert = insertelement <4 x ptr addrspace(1)> {{poison|undef}}, ptr addrspace(1) %1, {{i32|i64}} 0
 ; CHECK: %BroadcastAddr.splat = shufflevector <4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <4 x ptr addrspace(1)> {{poison|undef}}, <4 x i32> zeroinitializer
 ; CHECK: %2 = getelementptr double, <4 x ptr addrspace(1)> %BroadcastAddr.splat, <4 x i64> <i64 0, i64 4, i64 8, i64 12>
-; CHECK: call void @llvm.masked.scatter.v4f64.v4p1(<4 x double> %0, <4 x ptr addrspace(1)> %2, i32{{( immarg)?}} 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) #[[ATTRS:[0-9]+]]
+; CHECK: call void @llvm.masked.scatter.v4f64.v4p1(<4 x double> %0, <4 x ptr addrspace(1)> %2, i32{{( immarg)?}} 8, <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}) #[[ATTRS:[0-9]+]]
 ; CHECK: ret void
 
 ; CHECK: attributes #[[ATTRS]] = {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store_as_masked.ll
index 1b49ee53f5bbe..96c5af4ff09ab 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store_as_masked.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store_as_masked.ll
@@ -76,7 +76,7 @@ attributes #3 = { nobuiltin nounwind }
 ; CHECK: %BroadcastAddr.splatinsert = insertelement <4 x ptr addrspace(1)> {{poison|undef}}, ptr addrspace(1) %1, {{i32|i64}} 0
 ; CHECK:  %BroadcastAddr.splat = shufflevector <4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <4 x ptr addrspace(1)> {{poison|undef}}, <4 x i32> zeroinitializer
 ; CHECK:  %2 = getelementptr double, <4 x ptr addrspace(1)> %BroadcastAddr.splat, <4 x i64> <i64 0, i64 4, i64 8, i64 12>
-; CHECK:  call void @llvm.masked.scatter.v4f64.v4p1(<4 x double> %0, <4 x ptr addrspace(1)> %2, i32{{( immarg)?}} 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) #[[ATTRS:[0-9]+]]
+; CHECK:  call void @llvm.masked.scatter.v4f64.v4p1(<4 x double> %0, <4 x ptr addrspace(1)> %2, i32{{( immarg)?}} 8, <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}) #[[ATTRS:[0-9]+]]
 ; CHECK: ret void
 
 ; CHECK: attributes #[[ATTRS]] = {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_uniform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_uniform.ll
index dca8e8649bd00..9f7e53bff1ac7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_uniform.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_uniform.ll
@@ -83,5 +83,5 @@ declare i64 @__mux_get_global_size(i32)
 ; This test checks if a uniform <4 x i32> phi is not scalarized
 ; CHECK: define spir_kernel void @__vecz_v4_vector_loop
 ; CHECK: %[[STOREMERGE:.+]] = phi <4 x i32> [ %[[INC:.+]], %for.body ], [ zeroinitializer, %entry.ROSCC ]
-; CHECK: %[[INC]] = add <4 x i32> %storemerge, <i32 1, i32 1, i32 1, i32 1>
+; CHECK: %[[INC]] = add <4 x i32> %storemerge, {{<(i32 1(, )?)+>|splat \(i32 1\)}}
 ; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_varying.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_varying.ll
index 010cafb0a3b70..5de908b0f3bd9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_varying.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_varying.ll
@@ -87,8 +87,8 @@ declare i64 @__mux_get_global_size(i32)
 ; CHECK: %storemerge{{[0-9]+}} = phi <4 x i32> [ %{{[0-9]+}}, %entry.ROSCC ], [ %inc{{[0-9]+}}, %for.cond ]
 ; CHECK: %storemerge{{[0-9]+}} = phi <4 x i32> [ %{{[0-9]+}}, %entry.ROSCC ], [ %inc{{[0-9]+}}, %for.cond ]
 ; CHECK: %storemerge{{[0-9]+}} = phi <4 x i32> [ %{{[0-9]+}}, %entry.ROSCC ], [ %inc{{[0-9]+}}, %for.cond ]
-; CHECK: %inc{{[0-9]+}} = add <4 x i32> %storemerge{{[0-9]+}}, <i32 1, i32 1, i32 1, i32 1>
-; CHECK: %inc{{[0-9]+}} = add <4 x i32> %storemerge{{[0-9]+}}, <i32 1, i32 1, i32 1, i32 1>
-; CHECK: %inc{{[0-9]+}} = add <4 x i32> %storemerge{{[0-9]+}}, <i32 1, i32 1, i32 1, i32 1>
-; CHECK: %inc{{[0-9]+}} = add <4 x i32> %storemerge{{[0-9]+}}, <i32 1, i32 1, i32 1, i32 1>
+; CHECK: %inc{{[0-9]+}} = add <4 x i32> %storemerge{{[0-9]+}}, {{<(i32 1(, )?)+>|splat \(i32 1\)}}
+; CHECK: %inc{{[0-9]+}} = add <4 x i32> %storemerge{{[0-9]+}}, {{<(i32 1(, )?)+>|splat \(i32 1\)}}
+; CHECK: %inc{{[0-9]+}} = add <4 x i32> %storemerge{{[0-9]+}}, {{<(i32 1(, )?)+>|splat \(i32 1\)}}
+; CHECK: %inc{{[0-9]+}} = add <4 x i32> %storemerge{{[0-9]+}}, {{<(i32 1(, )?)+>|splat \(i32 1\)}}
 ; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/load_add_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/load_add_store.ll
index 7b274d63d6a6b..8da3755e1f378 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/load_add_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/load_add_store.ll
@@ -45,10 +45,10 @@ entry:
 ; CHECK_4F: [[WREM:%.*]] = sub nuw nsw i64 [[LSIZE]], [[LID]]
 ; CHECK_4F: [[T0:%.*]] = call i64 @llvm.umin.i64(i64 [[WREM]], i64 4)
 ; CHECK_4F: [[VL:%.*]] = trunc {{(nuw )?(nsw )?}}i64 [[T0]] to i32
-; CHECK_4F: [[LHS:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p0(ptr {{%.*}}, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 [[VL]])
-; CHECK_4F: [[RHS:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p0(ptr {{%.*}}, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 [[VL]])
-; CHECK_4F: [[ADD:%.*]] = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> [[LHS]], <4 x i32> [[RHS]], <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 [[VL]])
-; CHECK_4F: call void @llvm.vp.store.v4i32.p0(<4 x i32> [[ADD]], ptr {{%.*}}, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 [[VL]])
+; CHECK_4F: [[LHS:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p0(ptr {{%.*}}, <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}, i32 [[VL]])
+; CHECK_4F: [[RHS:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p0(ptr {{%.*}}, <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}, i32 [[VL]])
+; CHECK_4F: [[ADD:%.*]] = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> [[LHS]], <4 x i32> [[RHS]], <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}, i32 [[VL]])
+; CHECK_4F: call void @llvm.vp.store.v4i32.p0(<4 x i32> [[ADD]], ptr {{%.*}}, <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}, i32 [[VL]])
 
 ; CHECK_1S: define spir_kernel void @__vecz_nxv4_vp_load_add_store_i32(
 ; CHECK_1S: [[LID:%.*]] = call i64 @__mux_get_local_id(i32 0)
@@ -84,10 +84,10 @@ entry:
 ; CHECK_V4_2F: [[VL:%.*]] = trunc {{(nuw )?(nsw )?}}i64 [[T0]] to i32
 ; Each WI performs 4 elements, so multiply the VL by 4
 ; CHECK_V4_2F: [[SVL:%.*]] = shl nuw nsw i32 [[VL]], 2
-; CHECK_V4_2F: [[LHS:%.*]] = call <8 x i32> @llvm.vp.load.v8i32.p0(ptr {{%.*}}, <8 x i1> <i1 true, i1 true, i1 true, i1  true, i1 true, i1 true, i1 true, i1 true>, i32 [[SVL]])
-; CHECK_V4_2F: [[RHS:%.*]] = call <8 x i32> @llvm.vp.load.v8i32.p0(ptr {{%.*}}, <8 x i1> <i1 true, i1 true, i1 true, i1  true, i1 true, i1 true, i1 true, i1 true>, i32 [[SVL]])
-; CHECK_V4_2F: [[ADD:%.*]] = call <8 x i32> @llvm.vp.add.v8i32(<8 x i32> [[LHS]], <8 x i32> [[RHS]], <8 x i1> <i1 true, i1 true, i1 true, i1  true, i1 true, i1 true, i1 true, i1 true>, i32 [[SVL]])
-; CHECK_V4_2F: call void @llvm.vp.store.v8i32.p0(<8 x i32> [[ADD]], ptr {{%.*}}, <8 x i1> <i1 true, i1 true, i1 true, i1  true, i1 true, i1 true, i1 true, i1 true>, i32 [[SVL]])
+; CHECK_V4_2F: [[LHS:%.*]] = call <8 x i32> @llvm.vp.load.v8i32.p0(ptr {{%.*}}, <8 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}, i32 [[SVL]])
+; CHECK_V4_2F: [[RHS:%.*]] = call <8 x i32> @llvm.vp.load.v8i32.p0(ptr {{%.*}}, <8 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}, i32 [[SVL]])
+; CHECK_V4_2F: [[ADD:%.*]] = call <8 x i32> @llvm.vp.add.v8i32(<8 x i32> [[LHS]], <8 x i32> [[RHS]], <8 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}, i32 [[SVL]])
+; CHECK_V4_2F: call void @llvm.vp.store.v8i32.p0(<8 x i32> [[ADD]], ptr {{%.*}}, <8 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}, i32 [[SVL]])
 
 ; CHECK_V4_1S: define spir_kernel void @__vecz_nxv4_vp_load_add_store_v4i32(
 ; CHECK_V4_1S: [[LID:%.*]] = call i64 @__mux_get_local_id(i32 0)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/interleaved_safety.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/interleaved_safety.ll
index 5d0789d9dd23e..7811de134bb56 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/interleaved_safety.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/interleaved_safety.ll
@@ -79,7 +79,7 @@ attributes #3 = { nobuiltin nounwind }
 
 ; And in between them there should be a barrier call
 ; CHECK: call void @__mux_work_group_barrier
-; CHECK: call void @__vecz_b_interleaved_store8_4_Dv4_du3ptrU3AS1(<4 x double> <double 1.600000e+01, double 1.600000e+01, double 1.600000e+01, double 1.600000e+01>
+; CHECK: call void @__vecz_b_interleaved_store8_4_Dv4_du3ptrU3AS1(<4 x double> {{<(double 1.600000e\+01(, )?)+>|splat \(double 1.600000e\+01\)}}
 ; CHECK: load <16 x double>
 ; CHECK: load <16 x double>
 ; CHECK: load <16 x double>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshl.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshl.ll
index b0300297dd961..bba75141b6eca 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshl.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshl.ll
@@ -42,7 +42,7 @@ declare i8 @llvm.fshl.i8(i8, i8, i8)
 ; It checks that the fshl intrinsic of i8 gets widened by a factor of 16
 ; CHECK: %[[LDA:.+]] = load <16 x i8>, ptr %{{.+}}
 ; CHECK: %[[LDB:.+]] = load <16 x i8>, ptr %{{.+}}
-; CHECK: %[[RES:.+]] = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %[[LDA]], <16 x i8> %[[LDB]], <16 x i8> <{{(i8 4, )+i8 4}}>)
+; CHECK: %[[RES:.+]] = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %[[LDA]], <16 x i8> %[[LDB]], <16 x i8> {{<(i8 4(, )?)+>|splat \(i8 4\)}})
 ; CHECK: store <16 x i8> %[[RES]], ptr %{{.+}}
 
 ; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshr.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshr.ll
index 270a1c69545e7..0f0cc9e699349 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshr.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshr.ll
@@ -42,7 +42,7 @@ declare i8 @llvm.fshr.i8(i8, i8, i8)
 ; It checks that the fshr intrinsic of i8 gets widened by a factor of 16
 ; CHECK: %[[LDA:.+]] = load <16 x i8>, ptr %{{.+}}
 ; CHECK: %[[LDB:.+]] = load <16 x i8>, ptr %{{.+}}
-; CHECK: %[[RES:.+]] = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %[[LDA]], <16 x i8> %[[LDB]], <16 x i8> <{{(i8 2, )+i8 2}}>)
+; CHECK: %[[RES:.+]] = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %[[LDA]], <16 x i8> %[[LDB]], <16 x i8> {{<(i8 2(, )?)+>|splat \(i8 2\)}})
 ; CHECK: store <16 x i8> %[[RES]], ptr %{{.+}}
 
 ; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/cmpxchg.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/cmpxchg.ll
index bf2175364861f..3871ad80a0efe 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/cmpxchg.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/cmpxchg.ll
@@ -28,9 +28,9 @@ entry:
 
 ; Test that this cmpxchg is packetized by generating a call to an all-true masked version.
 ; CHECK: [[A0:%.*]] = call { <4 x i32>, <4 x i1> } @__vecz_b_v4_masked_cmpxchg_align4_acquire_monotonic_1_Dv4_u3ptrDv4_jDv4_jDv4_b(
-; CHECK-SAME: <4 x ptr> [[SPLAT_PTR]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>,
-; CHECK-SAME: <4 x i32> <i32 2, i32 2, i32 2, i32 2>,
-; CHECK-SAME: <4 x i1> <i1 true, i1 true, i1 true, i1 true>
+; CHECK-SAME: <4 x ptr> [[SPLAT_PTR]], <4 x i32> {{<(i32 1(, )?)+>|splat \(i32 1\)}},
+; CHECK-SAME: <4 x i32> {{<(i32 2(, )?)+>|splat \(i32 2\)}},
+; CHECK-SAME: <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}
   %old0 = cmpxchg ptr %p, i32 1, i32 2 acquire monotonic
 ; CHECK: [[EXT0:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[A0]], 0
   %val0 = extractvalue { i32, i1 } %old0, 0
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load.ll
index eff4f12e6bfa7..1f1a971c98b85 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load.ll
@@ -39,7 +39,7 @@ declare i64 @__mux_get_global_id(i32)
 
 ; Test if the scatter store is defined correctly
 ; CHECK: define <4 x i64> @__vecz_b_gather_load4_Dv4_mDv4_u3ptr(<4 x ptr>{{( %0)?}}) [[ATTRS:#[0-9]+]] {
-; CHECK: %[[V1:[0-9]+]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> %0, i32{{( immarg)?}} 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>,
+; CHECK: %[[V1:[0-9]+]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> %0, i32{{( immarg)?}} 4, <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}},
 ; CHECK: ret <4 x i64> %[[V1]]
 
 ; CHECK: attributes [[ATTRS]] = { norecurse nounwind }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load_as_masked.ll
index c25c29af33ede..84c2c2cadcd1a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load_as_masked.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load_as_masked.ll
@@ -39,7 +39,7 @@ declare i64 @__mux_get_global_id(i32)
 
 ; Test if the scatter store is defined correctly
 ; CHECK: define <4 x i64> @__vecz_b_gather_load4_Dv4_mDv4_u3ptr(<4 x ptr>{{( %0)?}}) [[ATTRS:#[0-9]+]] {
-; CHECK: call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> %0, i32{{( immarg)?}} 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i64> undef)
+; CHECK: call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> %0, i32{{( immarg)?}} 4, <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}, <4 x i64> undef)
 ; CHECK: ret <4 x i64>
 
 ; CHECK: attributes [[ATTRS]] = { norecurse nounwind }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load.ll
index 1e059c4a8525d..ce30f09a99424 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load.ll
@@ -58,5 +58,5 @@ declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double
 ; CHECK: %BroadcastAddr.splatinsert = insertelement <4 x ptr addrspace(1)> {{poison|undef}}, ptr addrspace(1) %0, {{i32|i64}} 0
 ; CHECK: %BroadcastAddr.splat = shufflevector <4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <4 x ptr addrspace(1)> {{poison|undef}}, <4 x i32> zeroinitializer
 ; CHECK: %[[TMP1:.*]] = getelementptr double, <4 x ptr addrspace(1)> %BroadcastAddr.splat, <4 x i64> <i64 0, i64 4, i64 8, i64 12>
-; CHECK: %[[TMP2:.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p1(<4 x ptr addrspace(1)> %[[TMP1]], i32{{( immarg)?}} 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x double> undef)
+; CHECK: %[[TMP2:.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p1(<4 x ptr addrspace(1)> %[[TMP1]], i32{{( immarg)?}} 8, <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}, <4 x double> undef)
 ; CHECK: ret <4 x double> %[[TMP2]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load_as_masked.ll
index b1334c7aafb7d..d095cfe7d104d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load_as_masked.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load_as_masked.ll
@@ -75,5 +75,5 @@ attributes #3 = { nobuiltin nounwind }
 ; CHECK: %BroadcastAddr.splatinsert = insertelement <4 x ptr addrspace(1)> {{poison|undef}}, ptr addrspace(1) %0, {{i32|i64}} 0
 ; CHECK: %BroadcastAddr.splat = shufflevector <4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <4 x ptr addrspace(1)> {{poison|undef}}, <4 x i32> zeroinitializer
 ; CHECK: %1 = getelementptr double, <4 x ptr addrspace(1)> %BroadcastAddr.splat, <4 x i64> <i64 0, i64 4, i64 8, i64 12>
-; CHECK: %2 = call <4 x double> @llvm.masked.gather.v4f64.v4p1(<4 x ptr addrspace(1)> %1, i32{{( immarg)?}} 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x double> undef)
+; CHECK: %2 = call <4 x double> @llvm.masked.gather.v4f64.v4p1(<4 x ptr addrspace(1)> %1, i32{{( immarg)?}} 8, <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}, <4 x double> undef)
 ; CHECK: ret <4 x double> %2
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store.ll
index e46e743c04a80..265b4c9586159 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store.ll
@@ -59,5 +59,5 @@ declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double
 ; CHECK: %BroadcastAddr.splatinsert = insertelement <4 x ptr addrspace(1)> {{poison|undef}}, ptr addrspace(1) %1, {{i32|i64}} 0
 ; CHECK: %BroadcastAddr.splat = shufflevector <4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <4 x ptr addrspace(1)> {{poison|undef}}, <4 x i32> zeroinitializer
 ; CHECK: %2 = getelementptr double, <4 x ptr addrspace(1)> %BroadcastAddr.splat, <4 x i64> <i64 0, i64 4, i64 8, i64 12>
-; CHECK: call void @llvm.masked.scatter.v4f64.v4p1(<4 x double> %0, <4 x ptr addrspace(1)> %2, i32{{( immarg)?}} 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK: call void @llvm.masked.scatter.v4f64.v4p1(<4 x double> %0, <4 x ptr addrspace(1)> %2, i32{{( immarg)?}} 8, <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}})
 ; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store_as_masked.ll
index 60834e45633b8..b9440e1f9fd18 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store_as_masked.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store_as_masked.ll
@@ -76,5 +76,5 @@ attributes #3 = { nobuiltin nounwind }
 ; CHECK: %BroadcastAddr.splatinsert = insertelement <4 x ptr addrspace(1)> {{poison|undef}}, ptr addrspace(1) %1, {{i32|i64}} 0
 ; CHECK:  %BroadcastAddr.splat = shufflevector <4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <4 x ptr addrspace(1)> {{poison|undef}}, <4 x i32> zeroinitializer
 ; CHECK:  %2 = getelementptr double, <4 x ptr addrspace(1)> %BroadcastAddr.splat, <4 x i64> <i64 0, i64 4, i64 8, i64 12>
-; CHECK:  call void @llvm.masked.scatter.v4f64.v4p1(<4 x double> %0, <4 x ptr addrspace(1)> %2, i32{{( immarg)?}} 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK:  call void @llvm.masked.scatter.v4f64.v4p1(<4 x double> %0, <4 x ptr addrspace(1)> %2, i32{{( immarg)?}} 8, <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}})
 ; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store.ll
index a035ca05d3dc8..39cbd38d2cb5c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store.ll
@@ -40,7 +40,7 @@ declare i64 @__mux_get_global_id(i32)
 ; Test if the scatter store is defined correctly
 ; CHECK: define void @__vecz_b_scatter_store4_Dv4_mDv4_u3ptr(<4 x i64>{{( %0)?}}, <4 x ptr>{{( %1)?}}) [[ATTRS:#[0-9]+]] {
 ; CHECK: entry
-; CHECK: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> %0, <4 x ptr> %1, i32{{( immarg)?}} 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> %0, <4 x ptr> %1, i32{{( immarg)?}} 4, <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}})
 ; CHECK: ret void
 
 ; CHECK: attributes [[ATTRS]] = { norecurse nounwind }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store_as_masked.ll
index fd7a7570b2527..281afd0867c02 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store_as_masked.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store_as_masked.ll
@@ -40,7 +40,7 @@ declare i64 @__mux_get_global_id(i32)
 ; Test if the scatter store is defined correctly
 ; CHECK: define void @__vecz_b_scatter_store4_Dv4_mDv4_u3ptr(<4 x i64>{{( %0)?}}, <4 x ptr>{{( %1)?}}) [[ATTRS:#[0-9]+]] {
 ; CHECK: entry:
-; CHECK: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> %0, <4 x ptr> %1, i32{{( immarg)?}} 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> %0, <4 x ptr> %1, i32{{( immarg)?}} 4, <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}})
 ; CHECK: ret void
 
 ; CHECK: attributes [[ATTRS]] = { norecurse nounwind }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insertelement_runtime_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insertelement_runtime_index.ll
index 6ffafa29877cd..b6d927747a650 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insertelement_runtime_index.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insertelement_runtime_index.ll
@@ -40,13 +40,13 @@ entry:
 
 ; Four icmps and selects
 ; CHECK: icmp eq <4 x i32> %{{.+}}, zeroinitializer
-; CHECK: select <4 x i1> %{{.+}}, <4 x i32> <i32 42, i32 42, i32 42, i32 42>
-; CHECK: icmp eq <4 x i32> %{{.+}}, <i32 1, i32 1, i32 1, i32 1>
-; CHECK: select <4 x i1> %{{.+}}, <4 x i32> <i32 42, i32 42, i32 42, i32 42>
-; CHECK: icmp eq <4 x i32> %{{.+}}, <i32 2, i32 2, i32 2, i32 2>
-; CHECK: select <4 x i1> %{{.+}}, <4 x i32> <i32 42, i32 42, i32 42, i32 42>
-; CHECK: icmp eq <4 x i32> %{{.+}}, <i32 3, i32 3, i32 3, i32 3>
-; CHECK: select <4 x i1> %{{.+}}, <4 x i32> <i32 42, i32 42, i32 42, i32 42>
+; CHECK: select <4 x i1> %{{.+}}, <4 x i32> {{<(i32 42(, )?)+>|splat \(i32 42\)}}
+; CHECK: icmp eq <4 x i32> %{{.+}}, {{<(i32 1(, )?)+>|splat \(i32 1\)}}
+; CHECK: select <4 x i1> %{{.+}}, <4 x i32> {{<(i32 42(, )?)+>|splat \(i32 42\)}}
+; CHECK: icmp eq <4 x i32> %{{.+}}, {{<(i32 2(, )?)+>|splat \(i32 2\)}}
+; CHECK: select <4 x i1> %{{.+}}, <4 x i32> {{<(i32 42(, )?)+>|splat \(i32 42\)}}
+; CHECK: icmp eq <4 x i32> %{{.+}}, {{<(i32 3(, )?)+>|splat \(i32 3\)}}
+; CHECK: select <4 x i1> %{{.+}}, <4 x i32> {{<(i32 42(, )?)+>|splat \(i32 42\)}}
 
 ; Four stores
 ; CHECK: store <4 x i32>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_safety.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_safety.ll
index 0269e453ce22e..a2a3fc4023ce6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_safety.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_safety.ll
@@ -80,13 +80,13 @@ attributes #3 = { nobuiltin nounwind }
 
 ; And in between them there should be a barrier call
 ; CHECK: call void @__mux_work_group_barrier
-; CHECK: call void @__vecz_b_interleaved_store8_4_Dv4_du3ptrU3AS1(<4 x double> <double 1.600000e+01, double 1.600000e+01, double 1.600000e+01, double 1.600000e+01>
+; CHECK: call void @__vecz_b_interleaved_store8_4_Dv4_du3ptrU3AS1(<4 x double> {{<(double 1.600000e\+01(, )?)+>|splat \(double 1.600000e\+01\)}}
 ; CHECK: call <4 x double> @__vecz_b_interleaved_load8_4_Dv4_du3ptrU3AS1
 ; CHECK: call <4 x double> @__vecz_b_interleaved_load8_4_Dv4_du3ptrU3AS1
 
 ; There shouldn't be any more interleaved loads or stores left
 ; CHECK-NOT: call <4 x double> @__vecz_b_interleaved_load4_Dv4_du3ptrU3AS1
-; CHECK-NOT: call void @__vecz_b_interleaved_store8_4_Dv4_du3ptrU3AS1(<4 x double> <double 1.600000e+01, double 1.600000e+01, double 1.600000e+01, double 1.600000e+01>
+; CHECK-NOT: call void @__vecz_b_interleaved_store8_4_Dv4_du3ptrU3AS1(<4 x double> {{<(double 1.600000e\+01(, )?)+>|splat \(double 1.600000e\+01\)}}
 
 ; There should be some sufflevector instructions after the simplification
 ; CHECK: shufflevector
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_atomics.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_atomics.ll
index 7413f6ca6b345..452dcae3f22c3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_atomics.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_atomics.ll
@@ -24,7 +24,7 @@ define spir_kernel void @test_fn(ptr %p) {
 entry:
 ; CHECK: [[SPLAT_PTR_INS:%.*]] = insertelement <4 x ptr> poison, ptr %p, i64 0
 ; CHECK: [[SPLAT_PTR:%.*]] = shufflevector <4 x ptr> [[SPLAT_PTR_INS]], <4 x ptr> poison, <4 x i32> zeroinitializer
-; CHECK: [[CMP:%.*]] = icmp sgt <4 x i64> <i64 3, i64 3, i64 3, i64 3>, 
+; CHECK: [[CMP:%.*]] = icmp sgt <4 x i64> {{<(i64 3(, )?)+>|splat \(i64 3\)}}, 
   %call = call i64 @__mux_get_global_id(i32 0)
   %cmp = icmp sgt i64 3, %call
 ; CHECK: [[VEC_PTR:%.*]] = getelementptr i32, ptr %p, <4 x i64>
@@ -33,16 +33,16 @@ entry:
 
 if.then:                                          ; preds = %entry
 ; CHECK: = call <4 x i32> @__vecz_b_v4_masked_atomicrmw_add_align4_acquire_1_Dv4_u3ptrDv4_jDv4_b(
-; CHECK-SAME: <4 x ptr> [[SPLAT_PTR]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i1> [[CMP]]
+; CHECK-SAME: <4 x ptr> [[SPLAT_PTR]], <4 x i32> {{<(i32 1(, )?)+>|splat \(i32 1\)}}, <4 x i1> [[CMP]]
   %old0 = atomicrmw add ptr %p, i32 1 acquire
 ; CHECK: = call <4 x i32> @__vecz_b_v4_masked_atomicrmw_add_align4_acquire_1_Dv4_u3ptrDv4_jDv4_b(
-; CHECK-SAME: <4 x ptr> [[VEC_PTR]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i1> [[CMP]]
+; CHECK-SAME: <4 x ptr> [[VEC_PTR]], <4 x i32> {{<(i32 1(, )?)+>|splat \(i32 1\)}}, <4 x i1> [[CMP]]
   %old1 = atomicrmw add ptr %wi_p_i32, i32 1 acquire
 ; CHECK: = call <4 x i32> @__vecz_b_v4_masked_atomicrmw_umin_align2_monotonic_1_Dv4_u3ptrDv4_jDv4_b(
-; CHECK-SAME: <4 x ptr> [[VEC_PTR]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i1> [[CMP]]
+; CHECK-SAME: <4 x ptr> [[VEC_PTR]], <4 x i32> {{<(i32 1(, )?)+>|splat \(i32 1\)}}, <4 x i1> [[CMP]]
   %old2 = atomicrmw umin ptr %wi_p_i32, i32 1 monotonic, align 2
 ; CHECK: = call <4 x float> @__vecz_b_v4_masked_atomicrmw_volatile_fmax_align4_seqcst_0_Dv4_u3ptrDv4_fDv4_b(
-; CHECK-SAME: <4 x ptr> [[VEC_PTR]], <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x i1> [[CMP]]
+; CHECK-SAME: <4 x ptr> [[VEC_PTR]], <4 x float> {{<(float 1.000000e\+00(, )?)+>|splat \(float 1.000000e\+00\)}}, <4 x i1> [[CMP]]
   %old3 = atomicrmw volatile fmax ptr %wi_p_i32, float 1.0 syncscope("singlethread") seq_cst
   br label %if.end
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_cmpxchg.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_cmpxchg.ll
index 80576d6aa3f15..60d213f6879ba 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_cmpxchg.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_cmpxchg.ll
@@ -24,7 +24,7 @@ define spir_kernel void @test_fn(ptr %p, ptr %q, ptr %r) {
 entry:
 ; CHECK: [[SPLAT_PTR_INS:%.*]] = insertelement <4 x ptr> poison, ptr %p, i64 0
 ; CHECK: [[SPLAT_PTR:%.*]] = shufflevector <4 x ptr> [[SPLAT_PTR_INS]], <4 x ptr> poison, <4 x i32> zeroinitializer
-; CHECK: [[CMP:%.*]] = icmp sgt <4 x i64> <i64 3, i64 3, i64 3, i64 3>, 
+; CHECK: [[CMP:%.*]] = icmp sgt <4 x i64> {{<(i64 3(, )?)+>|splat \(i64 3\)}}, 
   %call = call i64 @__mux_get_global_id(i32 0)
   %cmp = icmp sgt i64 3, %call
 ; CHECK: [[VEC_PTR:%.*]] = getelementptr i32, ptr %p, <4 x i64>
@@ -33,8 +33,8 @@ entry:
 
 if.then:                                          ; preds = %entry
 ; CHECK: [[CALL:%.*]] = call { <4 x i32>, <4 x i1> } @__vecz_b_v4_masked_cmpxchg_align4_acquire_monotonic_1_Dv4_u3ptrDv4_jDv4_jDv4_b(
-; CHECK-SAME: <4 x ptr> [[SPLAT_PTR]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>,
-; CHECK-SAME: <4 x i32> <i32 2, i32 2, i32 2, i32 2>, <4 x i1> [[CMP]]
+; CHECK-SAME: <4 x ptr> [[SPLAT_PTR]], <4 x i32> {{<(i32 1(, )?)+>|splat \(i32 1\)}},
+; CHECK-SAME: <4 x i32> {{<(i32 2(, )?)+>|splat \(i32 2\)}}, <4 x i1> [[CMP]]
   %old0 = cmpxchg ptr %p, i32 1, i32 2 acquire monotonic
   %val0 = extractvalue { i32, i1 } %old0, 0
   %success0 = extractvalue { i32, i1 } %old0, 1
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_branch.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_branch.ll
index ddbcd7e1b220b..f6628bcf3f6e2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_branch.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_branch.ll
@@ -54,7 +54,7 @@ declare i64 @__mux_get_global_id(i32)
 ; CHECK: %[[GID_SPLAT:.+]] = shufflevector <4 x i64> %[[GID_SPLATINSERT:.+]], <4 x i64> {{poison|undef}}, <4 x i32> zeroinitializer
 ; CHECK: %[[GID:.+]] = add <4 x i64> %[[GID_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
 ; CHECK: %[[CMP3:.+]] = icmp eq <4 x i64> %[[A_SPLAT]], %[[GID]]
-; CHECK: %[[NOT_CMP4:.+]] = xor <4 x i1> %[[CMP3]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK: %[[NOT_CMP4:.+]] = xor <4 x i1> %[[CMP3]], {{<(i1 true(, )?)+>|splat \(i1 true\)}}
 
 ; CHECK: %[[IDX:.+]] = sext i32 %a to i64
 ; CHECK: %[[GEP1:.+]] = getelementptr inbounds i32, ptr %b, i64 %[[IDX]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_uniform_branch.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_uniform_branch.ll
index 7218a621cafc7..1d343b32bef17 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_uniform_branch.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_uniform_branch.ll
@@ -92,12 +92,12 @@ declare i64 @__mux_get_global_id(i32)
 
 ; CHECK: if.then:
 ; CHECK: %[[GEP1:.+]] = getelementptr i32, ptr %b, <4 x i64>
-; CHECK: store <4 x i32> <i32 11, i32 11, i32 11, i32 11>, ptr %{{.+}}, align 4
+; CHECK: store <4 x i32> {{<(i32 11(, )?)+>|splat \(i32 11\)}}, ptr %{{.+}}, align 4
 ; CHECK: br label %if.end
 
 ; CHECK: if.else:
 ; CHECK: %[[GEP2:.+]] = getelementptr i32, ptr %b, <4 x i64>
-; CHECK: store <4 x i32> <i32 13, i32 13, i32 13, i32 13>, ptr %{{.+}}, align 4
+; CHECK: store <4 x i32> {{<(i32 13(, )?)+>|splat \(i32 13\)}}, ptr %{{.+}}, align 4
 ; CHECK: br label %if.end
 
 ; CHECK: if.end:
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_reduce.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_reduce.ll
index c05abe83b1c59..d6399fa4ec372 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_reduce.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_reduce.ll
@@ -160,6 +160,6 @@ if.end:                                           ; preds = %entry, %if.then
 ; CHECK: phi i32
 ; CHECK: mul i32 %{{.+}}, 3
 ; CHECK: icmp eq <4 x i64> %{{.+}}, zeroinitializer
-; CHECK: call void @__vecz_b_masked_store4_Dv4_ju3ptrU3AS3Dv4_b(<4 x i32> <i32 5, i32 5, i32 5, i32 5>
+; CHECK: call void @__vecz_b_masked_store4_Dv4_ju3ptrU3AS3Dv4_b(<4 x i32> {{<(i32 5(, )?)+>|splat \(i32 5\)}}
 ; CHECK: shl i32 %{{.+}}, 1
 ; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_reduce.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_reduce.ll
index 0d216f4fbec55..02e0405b267b3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_reduce.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_reduce.ll
@@ -67,7 +67,7 @@ for.end:                                          ; preds = %for.cond
 ; CHECK: insertelement <4 x i64> {{poison|undef}}, i64 %{{.+}}, {{(i32|i64)}} 0
 ; CHECK: shufflevector <4 x i64> %{{.+}}, <4 x i64> {{poison|undef}}, <4 x i32> zeroinitializer
 ; CHECK: phi <4 x i32>
-; CHECK: mul <4 x i32> %{{.+}}, <i32 3, i32 3, i32 3, i32 3>
+; CHECK: mul <4 x i32> %{{.+}}, {{<(i32 3(, )?)+>|splat \(i32 3\)}}
 ; CHECK: urem <4 x i64>
 ; CHECK: icmp eq <4 x i64> %{{.+}}, zeroinitializer
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_reduce.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_reduce.ll
index d29a4b8d2cc53..0d71d96d7011e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_reduce.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_reduce.ll
@@ -67,7 +67,7 @@ for.end:                                          ; preds = %for.cond
 ; CHECK: insertelement <4 x i64> {{poison|undef}}, i64 %{{.+}}, {{(i32|i64)}} 0
 ; CHECK: shufflevector <4 x i64> %{{.+}}, <4 x i64> {{poison|undef}}, <4 x i32> zeroinitializer
 ; CHECK: phi <4 x i32>
-; CHECK: mul <4 x i32> %{{.+}}, <i32 3, i32 3, i32 3, i32 3>
+; CHECK: mul <4 x i32> %{{.+}}, {{<(i32 3(, )?)+>|splat \(i32 3\)}}
 ; CHECK: urem <4 x i64>
 ; CHECK: icmp eq <4 x i64> %{{.+}}, zeroinitializer
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext.ll
index d5e661c641d58..a8eb1b595f9ce 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext.ll
@@ -56,13 +56,13 @@ attributes #2 = { nobuiltin nounwind }
 ; CHECK-NOT: shufflevector
 ; CHECK:  %[[FREEZE:.+]] = freeze <16 x i8> %[[DATA]]
 ; CHECK:  %[[SQUASH:.+]] = bitcast <16 x i8> %[[FREEZE]] to <4 x i32>
-; CHECK:  %[[EXTR0:.+]] = shl <4 x i32> %[[SQUASH]], <i32 24, i32 24, i32 24, i32 24>
-; CHECK:  %[[SEXT0:.+]] = ashr <4 x i32> %[[EXTR0]], <i32 24, i32 24, i32 24, i32 24>
-; CHECK:  %[[EXTR1:.+]] = shl <4 x i32> %[[SQUASH]], <i32 16, i32 16, i32 16, i32 16>
-; CHECK:  %[[SEXT1:.+]] = ashr <4 x i32> %[[EXTR1]], <i32 24, i32 24, i32 24, i32 24>
-; CHECK:  %[[EXTR2:.+]] = shl <4 x i32> %[[SQUASH]], <i32 8, i32 8, i32 8, i32 8>
-; CHECK:  %[[SEXT2:.+]] = ashr <4 x i32> %[[EXTR2]], <i32 24, i32 24, i32 24, i32 24>
-; CHECK:  %[[SEXT3:.+]] = ashr <4 x i32> %[[SQUASH]], <i32 24, i32 24, i32 24, i32 24>
+; CHECK:  %[[EXTR0:.+]] = shl <4 x i32> %[[SQUASH]], {{<(i32 24(, )?)+>|splat \(i32 24\)}}
+; CHECK:  %[[SEXT0:.+]] = ashr <4 x i32> %[[EXTR0]], {{<(i32 24(, )?)+>|splat \(i32 24\)}}
+; CHECK:  %[[EXTR1:.+]] = shl <4 x i32> %[[SQUASH]], {{<(i32 16(, )?)+>|splat \(i32 16\)}}
+; CHECK:  %[[SEXT1:.+]] = ashr <4 x i32> %[[EXTR1]], {{<(i32 24(, )?)+>|splat \(i32 24\)}}
+; CHECK:  %[[EXTR2:.+]] = shl <4 x i32> %[[SQUASH]], {{<(i32 8(, )?)+>|splat \(i32 8\)}}
+; CHECK:  %[[SEXT2:.+]] = ashr <4 x i32> %[[EXTR2]], {{<(i32 24(, )?)+>|splat \(i32 24\)}}
+; CHECK:  %[[SEXT3:.+]] = ashr <4 x i32> %[[SQUASH]], {{<(i32 24(, )?)+>|splat \(i32 24\)}}
 ; CHECK:  %[[SUM1:.+]] = add <4 x i32> %[[SEXT0]], %[[SEXT1]]
 ; CHECK:  %[[SUM2:.+]] = xor <4 x i32> %[[SUM1]], %[[SEXT2]]
 ; CHECK:  %[[SUM3:.+]] = and <4 x i32> %[[SUM2]], %[[SEXT3]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext_bigendian.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext_bigendian.ll
index 59da087e4699c..b292e06626c50 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext_bigendian.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext_bigendian.ll
@@ -56,13 +56,13 @@ attributes #2 = { nobuiltin nounwind }
 ; CHECK-NOT: shufflevector
 ; CHECK:  %[[FREEZE:.+]] = freeze <16 x i8> %[[DATA]]
 ; CHECK:  %[[SQUASH:.+]] = bitcast <16 x i8> %[[FREEZE]] to <4 x i32>
-; CHECK:  %[[EXTR0:.+]] = shl <4 x i32> %[[SQUASH]], <i32 24, i32 24, i32 24, i32 24>
-; CHECK:  %[[SEXT0:.+]] = ashr <4 x i32> %[[EXTR0]], <i32 24, i32 24, i32 24, i32 24>
-; CHECK:  %[[EXTR1:.+]] = shl <4 x i32> %[[SQUASH]], <i32 16, i32 16, i32 16, i32 16>
-; CHECK:  %[[SEXT1:.+]] = ashr <4 x i32> %[[EXTR1]], <i32 24, i32 24, i32 24, i32 24>
-; CHECK:  %[[EXTR2:.+]] = shl <4 x i32> %[[SQUASH]], <i32 8, i32 8, i32 8, i32 8>
-; CHECK:  %[[SEXT2:.+]] = ashr <4 x i32> %[[EXTR2]], <i32 24, i32 24, i32 24, i32 24>
-; CHECK:  %[[SEXT3:.+]] = ashr <4 x i32> %[[SQUASH]], <i32 24, i32 24, i32 24, i32 24>
+; CHECK:  %[[EXTR0:.+]] = shl <4 x i32> %[[SQUASH]], {{<(i32 24(, )?)+>|splat \(i32 24\)}}
+; CHECK:  %[[SEXT0:.+]] = ashr <4 x i32> %[[EXTR0]], {{<(i32 24(, )?)+>|splat \(i32 24\)}}
+; CHECK:  %[[EXTR1:.+]] = shl <4 x i32> %[[SQUASH]], {{<(i32 16(, )?)+>|splat \(i32 16\)}}
+; CHECK:  %[[SEXT1:.+]] = ashr <4 x i32> %[[EXTR1]], {{<(i32 24(, )?)+>|splat \(i32 24\)}}
+; CHECK:  %[[EXTR2:.+]] = shl <4 x i32> %[[SQUASH]], {{<(i32 8(, )?)+>|splat \(i32 8\)}}
+; CHECK:  %[[SEXT2:.+]] = ashr <4 x i32> %[[EXTR2]], {{<(i32 24(, )?)+>|splat \(i32 24\)}}
+; CHECK:  %[[SEXT3:.+]] = ashr <4 x i32> %[[SQUASH]], {{<(i32 24(, )?)+>|splat \(i32 24\)}}
 ; CHECK:  %[[SUM1:.+]] = add <4 x i32> %[[SEXT0]], %[[SEXT1]]
 ; CHECK:  %[[SUM2:.+]] = xor <4 x i32> %[[SUM1]], %[[SEXT2]]
 ; CHECK:  %[[SUM3:.+]] = and <4 x i32> %[[SUM2]], %[[SEXT3]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext_bigendian.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext_bigendian.ll
index a695f54b37a53..a2ef40270c4ed 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext_bigendian.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext_bigendian.ll
@@ -56,13 +56,13 @@ attributes #2 = { nobuiltin nounwind }
 ; CHECK-NOT: shufflevector
 ; CHECK:  %[[FREEZE:.+]] = freeze <16 x i8> %[[DATA]]
 ; CHECK:  %[[SQUASH:.+]] = bitcast <16 x i8> %[[FREEZE]] to <4 x i32>
-; CHECK:  %[[ZEXT0:.+]] = and <4 x i32> %[[SQUASH]], <i32 255, i32 255, i32 255, i32 255>
-; CHECK:  %[[EXTR1:.+]] = lshr <4 x i32> %[[SQUASH]], <i32 8, i32 8, i32 8, i32 8>
-; CHECK:  %[[ZEXT1:.+]] = and <4 x i32> %[[EXTR1]], <i32 255, i32 255, i32 255, i32 255>
-; CHECK:  %[[EXTR2:.+]] = lshr <4 x i32> %[[SQUASH]], <i32 16, i32 16, i32 16, i32 16>
-; CHECK:  %[[ZEXT2:.+]] = and <4 x i32> %[[EXTR2]], <i32 255, i32 255, i32 255, i32 255>
-; CHECK:  %[[EXTR3:.+]] = lshr <4 x i32> %[[SQUASH]], <i32 24, i32 24, i32 24, i32 24>
-; CHECK:  %[[ZEXT3:.+]] = and <4 x i32> %[[EXTR3]], <i32 255, i32 255, i32 255, i32 255>
+; CHECK:  %[[ZEXT0:.+]] = and <4 x i32> %[[SQUASH]], {{<(i32 255(, )?)+>|splat \(i32 255\)}}
+; CHECK:  %[[EXTR1:.+]] = lshr <4 x i32> %[[SQUASH]], {{<(i32 8(, )?)+>|splat \(i32 8\)}}
+; CHECK:  %[[ZEXT1:.+]] = and <4 x i32> %[[EXTR1]], {{<(i32 255(, )?)+>|splat \(i32 255\)}}
+; CHECK:  %[[EXTR2:.+]] = lshr <4 x i32> %[[SQUASH]], {{<(i32 16(, )?)+>|splat \(i32 16\)}}
+; CHECK:  %[[ZEXT2:.+]] = and <4 x i32> %[[EXTR2]], {{<(i32 255(, )?)+>|splat \(i32 255\)}}
+; CHECK:  %[[EXTR3:.+]] = lshr <4 x i32> %[[SQUASH]], {{<(i32 24(, )?)+>|splat \(i32 24\)}}
+; CHECK:  %[[ZEXT3:.+]] = and <4 x i32> %[[EXTR3]], {{<(i32 255(, )?)+>|splat \(i32 255\)}}
 ; CHECK:  %[[SUM1:.+]] = add <4 x i32> %[[ZEXT0]], %[[ZEXT1]]
 ; CHECK:  %[[SUM2:.+]] = xor <4 x i32> %[[SUM1]], %[[ZEXT2]]
 ; CHECK:  %[[SUM3:.+]] = and <4 x i32> %[[SUM2]], %[[ZEXT3]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_down.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_down.ll
index 3e90d729f5b77..e2cc382506e6e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_down.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_down.ll
@@ -24,9 +24,9 @@ target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 ; CHECK: [[LHS:%.*]] = load <4 x float>, ptr %arrayidx.lhs, align 4
 ; CHECK: [[RHS:%.*]] = load <4 x float>, ptr %arrayidx.rhs, align 4
 
-; CHECK: [[DELTAS:%.*]] = add <4 x i32> {{%.*}}, <i32 1, i32 1, i32 1, i32 1>
-; CHECK: [[MUXIDS:%.*]] = udiv <4 x i32> [[DELTAS]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK: [[VECELTS:%.*]] = urem <4 x i32> [[DELTAS]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK: [[DELTAS:%.*]] = add <4 x i32> {{%.*}}, {{<(i32 1(, )?)+>|splat \(i32 1\)}}
+; CHECK: [[MUXIDS:%.*]] = udiv <4 x i32> [[DELTAS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}}
+; CHECK: [[VECELTS:%.*]] = urem <4 x i32> [[DELTAS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}}
 ; CHECK: [[MUXDELTAS:%.*]] = sub <4 x i32> [[MUXIDS]], {{%.*}}
 
 ; CHECK: [[DELTA0:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 0
@@ -65,9 +65,9 @@ define spir_kernel void @kernel(ptr %lhsptr, ptr %rhsptr, ptr %out) {
 }
 
 ; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_vec_data(ptr %lhsptr, ptr %rhsptr, ptr %out)
-; CHECK: [[DELTAS:%.*]] = add <4 x i32> {{%.*}}, <i32 2, i32 2, i32 2, i32 2>
-; CHECK: [[MUXIDS:%.*]] = udiv <4 x i32> [[DELTAS]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK: [[VECELTS:%.*]] = urem <4 x i32> [[DELTAS]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK: [[DELTAS:%.*]] = add <4 x i32> {{%.*}}, {{<(i32 2(, )?)+>|splat \(i32 2\)}}
+; CHECK: [[MUXIDS:%.*]] = udiv <4 x i32> [[DELTAS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}}
+; CHECK: [[VECELTS:%.*]] = urem <4 x i32> [[DELTAS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}}
 ; CHECK: [[MUXDELTAS:%.*]] = sub <4 x i32> [[MUXIDS]], {{%.*}}
 
 ; CHECK: [[DELTA0:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 0
@@ -159,8 +159,8 @@ define spir_kernel void @kernel_vec_data(ptr %lhsptr, ptr %rhsptr, ptr %out) {
 ; CHECK: [[DELTALD:%.*]] = load <4 x i32>, ptr %arrayidx.deltas, align 4
 
 ; CHECK: [[DELTAS:%.*]] = add <4 x i32> {{%.*}}, [[DELTALD]]
-; CHECK: [[MUXIDS:%.*]] = udiv <4 x i32> [[DELTAS]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK: [[VECELTS:%.*]] = urem <4 x i32> [[DELTAS]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK: [[MUXIDS:%.*]] = udiv <4 x i32> [[DELTAS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}}
+; CHECK: [[VECELTS:%.*]] = urem <4 x i32> [[DELTAS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}}
 ; CHECK: [[MUXDELTAS:%.*]] = sub <4 x i32> [[MUXIDS]], {{%.*}}
 
 ; CHECK: [[DELTA0:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 0
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_up.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_up.ll
index a3e645e88ac17..3c6650d26ac38 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_up.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_up.ll
@@ -24,17 +24,17 @@ target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 ; CHECK: [[LHS:%.*]] = load <4 x float>, ptr %arrayidx.lhs, align 4
 ; CHECK: [[RHS:%.*]] = load <4 x float>, ptr %arrayidx.rhs, align 4
 
-; CHECK: [[DELTAS:%.*]] = sub <4 x i32> {{%.*}}, <i32 1, i32 1, i32 1, i32 1>
-; CHECK: [[QUOTIENT:%.*]] = sdiv <4 x i32> [[DELTAS]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK: [[REMAINDER:%.*]] = srem <4 x i32> [[DELTAS]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK: [[DELTAS:%.*]] = sub <4 x i32> {{%.*}}, {{<(i32 1(, )?)+>|splat \(i32 1\)}}
+; CHECK: [[QUOTIENT:%.*]] = sdiv <4 x i32> [[DELTAS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}}
+; CHECK: [[REMAINDER:%.*]] = srem <4 x i32> [[DELTAS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}}
 
-; CHECK: [[ARGXOR:%.*]] = xor <4 x i32> [[DELTAS]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK: [[ARGXOR:%.*]] = xor <4 x i32> [[DELTAS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}}
 ; CHECK: [[SIGNDIFF:%.*]] = icmp slt <4 x i32> [[ARGXOR]], zeroinitializer
 ; CHECK: [[REMNONZERO:%.*]] = icmp ne <4 x i32> [[REMAINDER]], zeroinitializer
 ; CHECK: [[CONDITION:%.*]] = and <4 x i1> [[REMNONZERO]], [[SIGNDIFF]]
 
-; CHECK: [[MIN1:%.*]] = sub <4 x i32> [[QUOTIENT]], <i32 1, i32 1, i32 1, i32 1>
-; CHECK: [[PLUSR:%.*]] = add <4 x i32> [[REMAINDER]], <i32 4, i32 4, i32 4, i32 4> 
+; CHECK: [[MIN1:%.*]] = sub <4 x i32> [[QUOTIENT]], {{<(i32 1(, )?)+>|splat \(i32 1\)}}
+; CHECK: [[PLUSR:%.*]] = add <4 x i32> [[REMAINDER]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} 
 
 ; CHECK: [[MUXIDS:%.*]] = select <4 x i1> [[CONDITION]], <4 x i32> [[MIN1]], <4 x i32> [[QUOTIENT]]
 ; CHECK: [[VECELTS:%.*]] = select <4 x i1> [[CONDITION]], <4 x i32> [[PLUSR]], <4 x i32> [[REMAINDER]]
@@ -77,17 +77,17 @@ define spir_kernel void @kernel(ptr %lhsptr, ptr %rhsptr, ptr %out) {
 }
 
 ; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_vec_data(ptr %lhsptr, ptr %rhsptr, ptr %out)
-; CHECK: [[DELTAS:%.*]] = sub <4 x i32> {{%.*}}, <i32 2, i32 2, i32 2, i32 2>
-; CHECK: [[QUOTIENT:%.*]] = sdiv <4 x i32> [[DELTAS]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK: [[REMAINDER:%.*]] = srem <4 x i32> [[DELTAS]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK: [[DELTAS:%.*]] = sub <4 x i32> {{%.*}}, {{<(i32 2(, )?)+>|splat \(i32 2\)}}
+; CHECK: [[QUOTIENT:%.*]] = sdiv <4 x i32> [[DELTAS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}}
+; CHECK: [[REMAINDER:%.*]] = srem <4 x i32> [[DELTAS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}}
 
-; CHECK: [[ARGXOR:%.*]] = xor <4 x i32> [[DELTAS]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK: [[ARGXOR:%.*]] = xor <4 x i32> [[DELTAS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}}
 ; CHECK: [[SIGNDIFF:%.*]] = icmp slt <4 x i32> [[ARGXOR]], zeroinitializer
 ; CHECK: [[REMNONZERO:%.*]] = icmp ne <4 x i32> [[REMAINDER]], zeroinitializer
 ; CHECK: [[CONDITION:%.*]] = and <4 x i1> [[REMNONZERO]], [[SIGNDIFF]]
 
-; CHECK: [[MIN1:%.*]] = sub <4 x i32> [[QUOTIENT]], <i32 1, i32 1, i32 1, i32 1>
-; CHECK: [[PLUSR:%.*]] = add <4 x i32> [[REMAINDER]], <i32 4, i32 4, i32 4, i32 4> 
+; CHECK: [[MIN1:%.*]] = sub <4 x i32> [[QUOTIENT]], {{<(i32 1(, )?)+>|splat \(i32 1\)}}
+; CHECK: [[PLUSR:%.*]] = add <4 x i32> [[REMAINDER]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} 
 
 ; CHECK: [[MUXIDS:%.*]] = select <4 x i1> [[CONDITION]], <4 x i32> [[MIN1]], <4 x i32> [[QUOTIENT]]
 ; CHECK: [[VECELTS:%.*]] = select <4 x i1> [[CONDITION]], <4 x i32> [[PLUSR]], <4 x i32> [[REMAINDER]]
@@ -183,16 +183,16 @@ define spir_kernel void @kernel_vec_data(ptr %lhsptr, ptr %rhsptr, ptr %out) {
 ; CHECK: [[DELTALD:%.*]] = load <4 x i32>, ptr %arrayidx.deltas, align 4
 
 ; CHECK: [[DELTAS:%.*]] = sub <4 x i32> {{%.*}}, [[DELTALD]]
-; CHECK: [[QUOTIENT:%.*]] = sdiv <4 x i32> [[DELTAS]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK: [[REMAINDER:%.*]] = srem <4 x i32> [[DELTAS]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK: [[QUOTIENT:%.*]] = sdiv <4 x i32> [[DELTAS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}}
+; CHECK: [[REMAINDER:%.*]] = srem <4 x i32> [[DELTAS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}}
 
-; CHECK: [[ARGXOR:%.*]] = xor <4 x i32> [[DELTAS]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK: [[ARGXOR:%.*]] = xor <4 x i32> [[DELTAS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}}
 ; CHECK: [[SIGNDIFF:%.*]] = icmp slt <4 x i32> [[ARGXOR]], zeroinitializer
 ; CHECK: [[REMNONZERO:%.*]] = icmp ne <4 x i32> [[REMAINDER]], zeroinitializer
 ; CHECK: [[CONDITION:%.*]] = and <4 x i1> [[REMNONZERO]], [[SIGNDIFF]]
 
-; CHECK: [[MIN1:%.*]] = sub <4 x i32> [[QUOTIENT]], <i32 1, i32 1, i32 1, i32 1>
-; CHECK: [[PLUSR:%.*]] = add <4 x i32> [[REMAINDER]], <i32 4, i32 4, i32 4, i32 4> 
+; CHECK: [[MIN1:%.*]] = sub <4 x i32> [[QUOTIENT]], {{<(i32 1(, )?)+>|splat \(i32 1\)}}
+; CHECK: [[PLUSR:%.*]] = add <4 x i32> [[REMAINDER]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} 
 
 ; CHECK: [[MUXIDS:%.*]] = select <4 x i1> [[CONDITION]], <4 x i32> [[MIN1]], <4 x i32> [[QUOTIENT]]
 ; CHECK: [[VECELTS:%.*]] = select <4 x i1> [[CONDITION]], <4 x i32> [[PLUSR]], <4 x i32> [[REMAINDER]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_xor.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_xor.ll
index e84ec6ba216d0..5f3f1815805ce 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_xor.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_xor.ll
@@ -24,9 +24,9 @@ target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 ; The XOR'd sub-group local IDs
 ; CHECK: [[XORIDS:%.*]] = xor <4 x i32>
 ; Which mux sub-group each of the XOR'd sub-group local IDs correspond to
-; CHECK-DAG: [[MUXXORIDS:%.*]] = udiv <4 x i32> [[XORIDS]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-DAG: [[MUXXORIDS:%.*]] = udiv <4 x i32> [[XORIDS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}}
 ; Which vector group element each of the XOR'd sub-group local IDs correspond to
-; CHECK-DAG: [[VECXORIDS:%.*]] = urem <4 x i32> [[XORIDS]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-DAG: [[VECXORIDS:%.*]] = urem <4 x i32> [[XORIDS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}}
 
 ; Extract the first XOR'd vector-local sub-group local ID from the vector of vector indices
 ; CHECK: [[IDXELT0:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 0
@@ -72,8 +72,8 @@ define spir_kernel void @kernel_varying_data_const_value(ptr %in, ptr %out) {
 ; This should just be the same as the previous kernel. The uniform value doesn't change anything.
 ; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_varying_data_uniform_value(ptr %in, i32 %val, ptr %out)
 ; CHECK: [[XORIDS:%.*]] = xor <4 x i32>
-; CHECK-DAG: [[MUXXORIDS:%.*]] = udiv <4 x i32> [[XORIDS]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-DAG: [[VECXORIDS:%.*]] = urem <4 x i32> [[XORIDS]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-DAG: [[MUXXORIDS:%.*]] = udiv <4 x i32> [[XORIDS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}}
+; CHECK-DAG: [[VECXORIDS:%.*]] = urem <4 x i32> [[XORIDS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}}
 ; CHECK: [[IDXELT0:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 0
 ; CHECK: [[ELT0:%.*]] = extractelement <4 x half> [[DATA:%.*]], i32 [[IDXELT0]]
 ; CHECK: [[ID0:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], i32 0
@@ -120,8 +120,8 @@ define spir_kernel void @kernel_uniform_data_uniform_value(half %data, i32 %val,
 ; This should just be the same as the previous kernel. The varying value doesn't change anything.
 ; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_varying_data_varying_value(ptr %in, ptr %vals, ptr %out)
 ; CHECK: [[XORIDS:%.*]] = xor <4 x i32>
-; CHECK-DAG: [[MUXXORIDS:%.*]] = udiv <4 x i32> [[XORIDS]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-DAG: [[VECXORIDS:%.*]] = urem <4 x i32> [[XORIDS]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-DAG: [[MUXXORIDS:%.*]] = udiv <4 x i32> [[XORIDS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}}
+; CHECK-DAG: [[VECXORIDS:%.*]] = urem <4 x i32> [[XORIDS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}}
 ; CHECK: [[IDXELT0:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 0
 ; CHECK: [[ELT0:%.*]] = extractelement <4 x half> [[DATA:%.*]], i32 [[IDXELT0]]
 ; CHECK: [[ID0:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], i32 0
@@ -157,8 +157,8 @@ define spir_kernel void @kernel_varying_data_varying_value(ptr %in, ptr %vals, p
 
 ; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_varying_vec_data_varying_value(ptr %in, ptr %vals, ptr %out)
 ; CHECK: [[XORIDS:%.*]] = xor <4 x i32>
-; CHECK-DAG: [[MUXXORIDS:%.*]] = udiv <4 x i32> [[XORIDS]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-DAG: [[VECXORIDS:%.*]] = urem <4 x i32> [[XORIDS]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-DAG: [[MUXXORIDS:%.*]] = udiv <4 x i32> [[XORIDS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}}
+; CHECK-DAG: [[VECXORIDS:%.*]] = urem <4 x i32> [[XORIDS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}}
 
 ; CHECK: [[IDXELT0:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 0
 ; CHECK: [[MULIDXELT0:%.*]] = mul i32 [[IDXELT0]], 2
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_condition_packetized.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_condition_packetized.ll
index 9dc3c74401b57..36bf8e240fc54 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_condition_packetized.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_condition_packetized.ll
@@ -42,5 +42,5 @@ declare i64 @__mux_get_global_id(i32)
 
 ; CHECK: %[[SELECT:.+]] = select i1 %cond, ptr %c0, ptr %c1
 ; CHECK: %[[BASE:.+]] = getelementptr i64, ptr %[[SELECT]], i64 0
-; CHECK: store <4 x i64> <i64 1, i64 1, i64 1, i64 1>, ptr %[[BASE]], align 4
+; CHECK: store <4 x i64> {{<(i64 1(, )?)+>|splat \(i64 1\)}}, ptr %[[BASE]], align 4
 ; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_uniform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_uniform.ll
index dca8e8649bd00..9f7e53bff1ac7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_uniform.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_uniform.ll
@@ -83,5 +83,5 @@ declare i64 @__mux_get_global_size(i32)
 ; This test checks if a uniform <4 x i32> phi is not scalarized
 ; CHECK: define spir_kernel void @__vecz_v4_vector_loop
 ; CHECK: %[[STOREMERGE:.+]] = phi <4 x i32> [ %[[INC:.+]], %for.body ], [ zeroinitializer, %entry.ROSCC ]
-; CHECK: %[[INC]] = add <4 x i32> %storemerge, <i32 1, i32 1, i32 1, i32 1>
+; CHECK: %[[INC]] = add <4 x i32> %storemerge, {{<(i32 1(, )?)+>|splat \(i32 1\)}}
 ; CHECK: ret void

From 91d164efbf6714b5558f820c6fd5c2d65cc27005 Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Wed, 20 Nov 2024 13:06:06 +0000
Subject: [PATCH 129/182] [LLVM 20] Update tests.

loop_call_instantiation sees an extra flag (samesign) added to icmp.
Allow that.

struct_select hits a new optimization that happens because both
structure elements are the same type. Use different types to get the
same IR we got before.
---
 .../vecz/test/lit/llvm/loop_call_instantiation.ll             | 4 ++--
 .../compiler_passes/vecz/test/lit/llvm/struct_select.ll       | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/loop_call_instantiation.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/loop_call_instantiation.ll
index 200647016c69c..78c181ed64348 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/loop_call_instantiation.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/loop_call_instantiation.ll
@@ -39,7 +39,7 @@ declare extern_weak spir_func i32 @printf(i8 addrspace(2)*, ...)
 
 ; CHECK: [[LOOPHEADER1:instloop.header.*]]:
 ; CHECK: %[[INSTANCE1:instance.*]] = phi i32 [ 0, {{.+}} ], [ %[[V7:[0-9]+]], %[[LOOPBODY1:instloop.body.*]] ]
-; CHECK: %[[V3:[0-9]+]] = icmp ult i32 %[[INSTANCE1]], 4
+; CHECK: %[[V3:[0-9]+]] = icmp {{(samesign )?}}ult i32 %[[INSTANCE1]], 4
 ; CHECK: br i1 %[[V3]], label %[[LOOPBODY1]], label {{.+}}
 
 ; CHECK: [[LOOPBODY1]]:
@@ -51,7 +51,7 @@ declare extern_weak spir_func i32 @printf(i8 addrspace(2)*, ...)
 
 ; CHECK: [[LOOPHEADER2:instloop.header.*]]:
 ; CHECK: %[[INSTANCE3:.+]] = phi i32 [ %[[V11:[0-9]+]], %[[LOOPBODY2:instloop.body.*]] ], [ 0, {{.+}} ]
-; CHECK: %[[V8:[0-9]+]] = icmp ult i32 %[[INSTANCE3]], 4
+; CHECK: %[[V8:[0-9]+]] = icmp {{(samesign )?}}ult i32 %[[INSTANCE3]], 4
 ; CHECK: br i1 %[[V8]], label %[[LOOPBODY2]], label {{.+}}
 
 ; CHECK: [[LOOPBODY2]]:
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_select.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_select.ll
index ae3dedf111200..d9e4308e07701 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_select.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_select.ll
@@ -19,7 +19,7 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-%struct_type = type { i32, i32 }
+%struct_type = type { i32, i64 }
 
 define spir_kernel void @test(%struct_type* %in1, %struct_type* %in2, %struct_type* %out) {
 entry:

From d899e8f44b0371fa26f54aa5885aa0c00bd93d5d Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Fri, 22 Nov 2024 15:47:19 +0000
Subject: [PATCH 130/182] LLVM 20: Update lit tests.

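LLVM 20 detects more situations where a splat is simpler than what was
generated before. Update the CHECK lines to accept either the old
shufflevector constant expression or the new splat form.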
---
 .../llvm/ScalableVectors/broadcast_vector.ll  | 24 +++++++++----------
 .../test/lit/llvm/ScalableVectors/cmpxchg.ll  |  6 ++---
 .../define_interleaved_store.ll               |  4 ++--
 .../define_interleaved_store_as_masked.ll     |  4 ++--
 .../ScalableVectors/define_subgroup_scans.ll  |  4 ++--
 .../llvm/ScalableVectors/extract_element.ll   |  8 +++----
 .../llvm/ScalableVectors/insert_element.ll    |  2 +-
 .../llvm/ScalableVectors/interleaved_load.ll  |  2 +-
 .../ScalableVectors/packetize_mask_varying.ll |  2 +-
 .../test/lit/llvm/ScalableVectors/select.ll   |  4 ++--
 .../ScalableVectors/select_scalar_vector.ll   |  4 ++--
 .../test/lit/llvm/ScalableVectors/shuffle.ll  |  2 +-
 .../define_interleaved_load_store.ll          |  4 ++--
 .../llvm/VectorPredication/load_add_store.ll  |  4 ++--
 .../test/lit/llvm/VectorPredication/udiv.ll   |  2 +-
 15 files changed, 38 insertions(+), 38 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll
index 519bb696cd52e..5e0520a55f42b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll
@@ -100,7 +100,7 @@ entry:
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0)
 ; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]]
-; CHECK-NEXT:    store <vscale x 16 x float> shufflevector (<vscale x 16 x float> insertelement (<vscale x 16 x float> {{(undef|poison)}}, float 0x7FF8000020000000, {{(i32|i64)}} 0), <vscale x 16 x float> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer), ptr addrspace(1) [[ARRAYIDX3]], align 16
+; CHECK-NEXT:    store <vscale x 16 x float> {{shufflevector \(<vscale x 16 x float> insertelement \(<vscale x 16 x float> (undef|poison), float 0x7FF8000020000000, (i32|i64) 0\), <vscale x 16 x float> (undef|poison), <vscale x 16 x i32> zeroinitializer\)|splat \(float 0x7FF8000020000000\)}}, ptr addrspace(1) [[ARRAYIDX3]], align 16
 ; CHECK-NEXT:    ret void
 
 ; CHECK-LABEL: @__vecz_nxv4_vector_broadcast(
@@ -108,10 +108,10 @@ entry:
 ; CHECK-NEXT:    [[FIXLEN_ALLOC:%.*]] = alloca <4 x float>, align 16
 ; CHECK-NEXT:    store <4 x float> [[ADDEND:%.*]], ptr [[FIXLEN_ALLOC]], align 16
 ; CHECK-NEXT:    [[IDX0:%.*]] = call <vscale x 16 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv16i32()
-; CHECK-NEXT:    [[IDX1:%.*]] = and <vscale x 16 x i32> [[IDX0]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 3, {{(i32|i64)}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[IDX1:%.*]] = and <vscale x 16 x i32> [[IDX0]], {{shufflevector \(<vscale x 16 x i32> insertelement \(<vscale x 16 x i32> (undef|poison), i32 3, (i32|i64) 0\), <vscale x 16 x i32> (undef|poison), <vscale x 16 x i32> zeroinitializer\)|splat \(i32 3\)}}
 ; CHECK-NEXT:    [[TMP0:%.*]] = {{s|z}}ext{{( nneg)?}} <vscale x 16 x i32> [[IDX1]] to <vscale x 16 x i64>
 ; CHECK-NEXT:    [[VEC_ALLOC:%.*]] = getelementptr inbounds float, ptr [[FIXLEN_ALLOC]], <vscale x 16 x i64> [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 16 x float> @llvm.masked.gather.nxv16f32.nxv16p0(<vscale x 16 x ptr> [[VEC_ALLOC]], i32 4, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, {{(i32|i64)}} 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x float> undef)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 16 x float> @llvm.masked.gather.nxv16f32.nxv16p0(<vscale x 16 x ptr> [[VEC_ALLOC]], i32 4, <vscale x 16 x i1> {{shufflevector \(<vscale x 16 x i1> insertelement \(<vscale x 16 x i1> poison, i1 true, (i32|i64) 0\), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(i1 true\)}}, <vscale x 16 x float> undef)
 ; CHECK-NEXT:    [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0)
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <vscale x 16 x float>, ptr addrspace(1) [[ARRAYIDX]], align 16
@@ -125,13 +125,13 @@ entry:
 ; CHECK-NEXT:    [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0)
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 16 x i32>, ptr addrspace(1) [[ARRAYIDX]], align 16
-; CHECK-NEXT:    [[AND1_I_I_I1_I1:%.*]] = and <vscale x 16 x i32> [[TMP1]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 2139095040, {{i32|i64}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
-; CHECK-NEXT:    [[CMP_I_I_I2_I2:%.*]] = icmp ne <vscale x 16 x i32> [[AND1_I_I_I1_I1]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 2139095040, {{i32|i64}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
-; CHECK-NEXT:    [[AND2_I_I_I3_I3:%.*]] = and <vscale x 16 x i32> [[TMP1]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 8388607, {{i32|i64}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[AND1_I_I_I1_I1:%.*]] = and <vscale x 16 x i32> [[TMP1]], {{shufflevector \(<vscale x 16 x i32> insertelement \(<vscale x 16 x i32> (undef|poison), i32 2139095040, (i32|i64) 0\), <vscale x 16 x i32> (undef|poison), <vscale x 16 x i32> zeroinitializer\)|splat \(i32 2139095040\)}}
+; CHECK-NEXT:    [[CMP_I_I_I2_I2:%.*]] = icmp ne <vscale x 16 x i32> [[AND1_I_I_I1_I1]], {{shufflevector \(<vscale x 16 x i32> insertelement \(<vscale x 16 x i32> (undef|poison), i32 2139095040, (i32|i64) 0\), <vscale x 16 x i32> (undef|poison), <vscale x 16 x i32> zeroinitializer\)|splat \(i32 2139095040\)}}
+; CHECK-NEXT:    [[AND2_I_I_I3_I3:%.*]] = and <vscale x 16 x i32> [[TMP1]], {{shufflevector \(<vscale x 16 x i32> insertelement \(<vscale x 16 x i32> (undef|poison), i32 8388607, (i32|i64) 0\), <vscale x 16 x i32> (undef|poison), <vscale x 16 x i32> zeroinitializer\)|splat \(i32 8388607\)}}
 ; CHECK-NEXT:    [[CMP3_I_I_I4_I4:%.*]] = icmp eq <vscale x 16 x i32> [[AND2_I_I_I3_I3]], zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = or <vscale x 16 x i1> [[CMP_I_I_I2_I2]], [[CMP3_I_I_I4_I4]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <vscale x 16 x i32> [[TMP1]] to <vscale x 16 x float>
-; CHECK-NEXT:    [[TMP4:%.*]] = select <vscale x 16 x i1> [[TMP2]], <vscale x 16 x float> [[TMP3]], <vscale x 16 x float> shufflevector (<vscale x 16 x float> insertelement (<vscale x 16 x float> {{(undef|poison)}}, float 0x7FF0000020000000, {{i32|i64}} 0), <vscale x 16 x float> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP4:%.*]] = select <vscale x 16 x i1> [[TMP2]], <vscale x 16 x float> [[TMP3]], <vscale x 16 x float> {{shufflevector \(<vscale x 16 x float> insertelement \(<vscale x 16 x float> (undef|poison), float 0x7FF0000020000000, (i32|i64) 0\), <vscale x 16 x float> (undef|poison), <vscale x 16 x i32> zeroinitializer\)|splat \(float 0x7FF0000020000000\)}}
 ; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]]
 ; CHECK-NEXT:    store <vscale x 16 x float> [[TMP4]], ptr addrspace(1) [[ARRAYIDX3]], align 16
 ; CHECK-NEXT:    ret void
@@ -144,10 +144,10 @@ entry:
 ; CHECK-NEXT:    [[FIXLEN_ALLOC1:%.*]] = alloca <4 x float>, align 16
 ; CHECK-NEXT:    store <4 x float> [[ADDEND:%.*]], ptr [[FIXLEN_ALLOC1]], align 16
 ; CHECK-NEXT:    [[IDX03:%.*]] = call <vscale x 16 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv16i32()
-; CHECK-NEXT:    [[IDX14:%.*]] = and <vscale x 16 x i32> [[IDX03]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 3, {{i32|i64}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[IDX14:%.*]] = and <vscale x 16 x i32> [[IDX03]], {{shufflevector \(<vscale x 16 x i32> insertelement \(<vscale x 16 x i32> (undef|poison), i32 3, (i32|i64) 0\), <vscale x 16 x i32> (undef|poison), <vscale x 16 x i32> zeroinitializer\)|splat \(i32 3\)}}
 ; CHECK-NEXT:    [[TMP0:%.*]] = {{s|z}}ext{{( nneg)?}} <vscale x 16 x i32> [[IDX14]] to <vscale x 16 x i64>
 ; CHECK-NEXT:    [[VEC_ALLOC5:%.*]] = getelementptr inbounds float, ptr [[FIXLEN_ALLOC1]], <vscale x 16 x i64> [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 16 x float> @llvm.masked.gather.nxv16f32.nxv16p0(<vscale x 16 x ptr> [[VEC_ALLOC5]], i32 4, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, {{i32|i64}} 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x float> {{(undef|poison)}})
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 16 x float> @llvm.masked.gather.nxv16f32.nxv16p0(<vscale x 16 x ptr> [[VEC_ALLOC5]], i32 4, <vscale x 16 x i1> {{shufflevector \(<vscale x 16 x i1> insertelement \(<vscale x 16 x i1> poison, i1 true, (i32|i64) 0\), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(i1 true\)}}, <vscale x 16 x float> {{(undef|poison)}})
 ; CHECK-NEXT:    [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0)
 ; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr [[EXISTING_ALLOC]], align 16
 ; CHECK-NEXT:    store i32 1, ptr [[EXISTING_ALLOC]], align
@@ -155,7 +155,7 @@ entry:
 ; CHECK-NEXT:    store <4 x i32> [[V]], ptr [[FIXLEN_ALLOC]], align 16
 ; CHECK-NEXT:    [[TMP2:%.*]] = {{s|z}}ext{{( nneg)?}} <vscale x 16 x i32> [[IDX14]] to <vscale x 16 x i64>
 ; CHECK-NEXT:    [[VEC_ALLOC:%.*]] = getelementptr inbounds i32, ptr [[FIXLEN_ALLOC]], <vscale x 16 x i64> [[TMP2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 16 x i32> @llvm.masked.gather.nxv16i32.nxv16p0(<vscale x 16 x ptr> [[VEC_ALLOC]], i32 4, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, {{i32|i64}} 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x i32> {{(undef|poison)}})
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 16 x i32> @llvm.masked.gather.nxv16i32.nxv16p0(<vscale x 16 x ptr> [[VEC_ALLOC]], i32 4, <vscale x 16 x i1> {{shufflevector \(<vscale x 16 x i1> insertelement \(<vscale x 16 x i1> poison, i1 true, (i32|i64) 0\), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(i1 true\)}}, <vscale x 16 x i32> {{(undef|poison)}})
 ; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr <4 x i32>, ptr addrspace(1) [[OUT2:%.*]], i64 [[CALL]]
 ; CHECK-NEXT:    store <vscale x 16 x i32> [[TMP3]], ptr addrspace(1) [[ARRAYIDX4]], align 16
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]]
@@ -169,11 +169,11 @@ entry:
 ; CHECK-NEXT:  entry:
 ; CHECK:    [[FIXLEN_MASK_ALLOC:%.*]] = alloca <4 x i8>, align 4
 ; CHECK:    [[IDX0:%.*]] = call <vscale x 16 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv16i32()
-; CHECK:    [[IDX1:%.*]] = and <vscale x 16 x i32> [[IDX0]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 3, {{i32|i64}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
+; CHECK:    [[IDX1:%.*]] = and <vscale x 16 x i32> [[IDX0]], {{shufflevector \(<vscale x 16 x i32> insertelement \(<vscale x 16 x i32> (undef|poison), i32 3, (i32|i64) 0\), <vscale x 16 x i32> (undef|poison), <vscale x 16 x i32> zeroinitializer\)|splat \(i32 3\)}}
 ; CHECK:    [[SEXT:%.*]] = sext <4 x i1> [[INPUT:%.*]] to <4 x i8>
 ; CHECK:    store <4 x i8> [[SEXT]], ptr [[FIXLEN_MASK_ALLOC]], align 4
 ; CHECK:    [[TMP0:%.*]] = {{s|z}}ext{{( nneg)?}} <vscale x 16 x i32> [[IDX1]] to <vscale x 16 x i64>
 ; CHECK:    [[VEC_ALLOC:%.*]] = getelementptr inbounds i8, ptr [[FIXLEN_MASK_ALLOC]], <vscale x 16 x i64> [[TMP0]]
-; CHECK:    [[TMP1:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[VEC_ALLOC]], i32 1, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, {{i32|i64}} 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x i8> {{(undef|poison)}})
+; CHECK:    [[TMP1:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[VEC_ALLOC]], i32 1, <vscale x 16 x i1> {{shufflevector \(<vscale x 16 x i1> insertelement \(<vscale x 16 x i1> poison, i1 true, (i32|i64) 0\), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(i1 true\)}}, <vscale x 16 x i8> {{(undef|poison)}})
 ; CHECK:    [[BMASK:%.*]] = trunc <vscale x 16 x i8> [[TMP1]] to <vscale x 16 x i1>
 ; CHECK:    {{.*}} = and <vscale x 16 x i1> {{.*}}, [[BMASK]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/cmpxchg.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/cmpxchg.ll
index 85b4c865d0e07..bfa7f69334400 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/cmpxchg.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/cmpxchg.ll
@@ -29,9 +29,9 @@ entry:
 ; Test that this cmpxchg is packetized by generating a call to an all-true masked version.
 ; CHECK: [[A0:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i1> } @__vecz_b_nxv4_masked_cmpxchg_align4_acquire_monotonic_1_u9nxv4u3ptru5nxv4ju5nxv4ju5nxv4b(
 ; CHECK-SAME: <vscale x 4 x ptr> [[SPLAT_PTR]],
-; CHECK-SAME: <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-SAME: <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-SAME: <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-SAME: <vscale x 4 x i32> {{shufflevector \(<vscale x 4 x i32> insertelement \(<vscale x 4 x i32> poison, i32 1, i64 0\), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer\)|splat \(i32 1\)}}
+; CHECK-SAME: <vscale x 4 x i32> {{shufflevector \(<vscale x 4 x i32> insertelement \(<vscale x 4 x i32> poison, i32 2, i64 0\), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer\)|splat \(i32 2\)}}
+; CHECK-SAME: <vscale x 4 x i1> {{shufflevector \(<vscale x 4 x i1> insertelement \(<vscale x 4 x i1> poison, i1 true, i64 0\), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer\)|splat \(i1 true\)}}
   %old0 = cmpxchg ptr %p, i32 1, i32 2 acquire monotonic
 ; CHECK: [[EXT0:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i1> } [[A0]], 0
   %val0 = extractvalue { i32, i1 } %old0, 0
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store.ll
index 6c0be75737891..497e9a54c4e7a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store.ll
@@ -59,7 +59,7 @@ declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double
 ; CHECK: %BroadcastAddr.splatinsert = insertelement <vscale x 4 x ptr addrspace(1)> {{poison|undef}}, ptr addrspace(1) %1, {{i32|i64}} 0
 ; CHECK: %BroadcastAddr.splat = shufflevector <vscale x 4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <vscale x 4 x ptr addrspace(1)> {{poison|undef}}, <vscale x 4 x i32> zeroinitializer
 ; CHECK: %2 = call <vscale x 4 x i64> @llvm.{{(experimental\.)?}}stepvector.nxv4i64()
-; CHECK: %3 = mul <vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 4, {{i32|i64}} 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer), %2
+; CHECK: %3 = mul <vscale x 4 x i64> {{shufflevector \(<vscale x 4 x i64> insertelement \(<vscale x 4 x i64> poison, i64 4, (i32|i64) 0\), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer\)|splat \(i64 4\)}}, %2
 ; CHECK: %4 = getelementptr double, <vscale x 4 x ptr addrspace(1)> %BroadcastAddr.splat, <vscale x 4 x i64> %3
-; CHECK: call void @llvm.masked.scatter.nxv4f64.nxv4p1(<vscale x 4 x double> %0, <vscale x 4 x ptr addrspace(1)> %4, i32{{( immarg)?}} 8, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, {{i32|i64}} 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK: call void @llvm.masked.scatter.nxv4f64.nxv4p1(<vscale x 4 x double> %0, <vscale x 4 x ptr addrspace(1)> %4, i32{{( immarg)?}} 8, <vscale x 4 x i1> {{shufflevector \(<vscale x 4 x i1> insertelement \(<vscale x 4 x i1> poison, i1 true, (i32|i64) 0\), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer\)|splat \(i1 true\)}})
 ; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store_as_masked.ll
index f6a350a65b384..9d8a468504b3a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store_as_masked.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store_as_masked.ll
@@ -58,9 +58,9 @@ declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double
 ; CHECK:   %BroadcastAddr.splatinsert = insertelement <vscale x 4 x ptr addrspace(1)> poison, ptr addrspace(1) %1, {{i32|i64}} 0
 ; CHECK:   %BroadcastAddr.splat = shufflevector <vscale x 4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <vscale x 4 x ptr addrspace(1)> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK:   %2 = call <vscale x 4 x i64> @llvm.{{(experimental\.)?}}stepvector.nxv4i64()
-; CHECK:   %3 = mul <vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 4, {{i32|i64}} 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer), %2
+; CHECK:   %3 = mul <vscale x 4 x i64> {{shufflevector \(<vscale x 4 x i64> insertelement \(<vscale x 4 x i64> poison, i64 4, (i32|i64) 0\), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer\)|splat \(i64 4\)}}, %2
 ; CHECK:   %4 = getelementptr double, <vscale x 4 x ptr addrspace(1)> %BroadcastAddr.splat, <vscale x 4 x i64> %3
-; CHECK:   call void @llvm.masked.scatter.nxv4f64.nxv4p1(<vscale x 4 x double> %0, <vscale x 4 x ptr addrspace(1)> %4, i32 immarg 8, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, {{i32|i64}} 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK:   call void @llvm.masked.scatter.nxv4f64.nxv4p1(<vscale x 4 x double> %0, <vscale x 4 x ptr addrspace(1)> %4, i32 immarg 8, <vscale x 4 x i1> {{shufflevector \(<vscale x 4 x i1> insertelement \(<vscale x 4 x i1> poison, i1 true, (i32|i64) 0\), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer\)|splat \(i1 true\)}})
 ; CHECK:   ret void
 ; CHECK: }
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans.ll
index 12b2856ce481d..6a8a686d0903f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans.ll
@@ -58,7 +58,7 @@ declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_add_u5nxv4j(<vscal
 ; CHECK:   store <vscale x 4 x i32> %[[VEC]], {{(<vscale x 4 x i32>\*)|(ptr)}} %[[SHUFFLE_ALLOC]]
 ;------- there will be a bitcast here if pointers are typed
 ; CHECK:   %[[INDEX:.+]] = getelementptr inbounds i32, [[PTRTY:(i32\*)|ptr]] %{{.+}}, <vscale x 4 x i32> %[[MASK]]
-; CHECK:   %[[SHUFFLE:.+]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0{{(i32)?}}(<vscale x 4 x [[PTRTY]]> %[[INDEX]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, {{i32|i64}} 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> undef)
+; CHECK:   %[[SHUFFLE:.+]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0{{(i32)?}}(<vscale x 4 x [[PTRTY]]> %[[INDEX]], i32 4, <vscale x 4 x i1> {{shufflevector \(<vscale x 4 x i1> insertelement \(<vscale x 4 x i1> poison, i1 true, (i32|i64) 0\), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer\)|splat \(i1 true\)}}, <vscale x 4 x i32> undef)
 
 ; CHECK:   %[[ACCUM:.+]] = add <vscale x 4 x i32> %[[VEC]], %{{.+}}
 ; CHECK:   %[[BIT:.+]] = and <vscale x 4 x i32> %[[MASKPHI]], %[[N_SPLAT]]
@@ -93,7 +93,7 @@ declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_add_u5nxv4j(<vscal
 ; CHECK:   store <vscale x 4 x i32> %[[VEC]], {{(<vscale x 4 x i32>\*)|(ptr)}} %[[SHUFFLE_ALLOC]]
 ;------- there will be a bitcast here if pointers are typed
 ; CHECK:   %[[INDEX:.+]] = getelementptr inbounds i32, [[PTRTY:(i32\*)|ptr]] %{{.+}}, <vscale x 4 x i32> %[[MASK]]
-; CHECK:   %[[SHUFFLE:.+]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0{{(i32)?}}(<vscale x 4 x [[PTRTY]]> %[[INDEX]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, {{i32|i64}} 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> undef)
+; CHECK:   %[[SHUFFLE:.+]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0{{(i32)?}}(<vscale x 4 x [[PTRTY]]> %[[INDEX]], i32 4, <vscale x 4 x i1> {{shufflevector \(<vscale x 4 x i1> insertelement \(<vscale x 4 x i1> poison, i1 true, (i32|i64) 0\), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer\)|splat \(i1 true\)}}, <vscale x 4 x i32> undef)
 
 ; CHECK:   %[[ACCUM:.+]] = add <vscale x 4 x i32> %[[VEC]], %{{.+}}
 ; CHECK:   %[[BIT:.+]] = and <vscale x 4 x i32> %[[MASKPHI]], %[[N_SPLAT]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/extract_element.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/extract_element.ll
index 91c989df3c499..77ea2d3f7bea2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/extract_element.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/extract_element.ll
@@ -114,8 +114,8 @@ entry:
 ; EE-UNI-VEC: [[T4:%.*]] = shufflevector <vscale x 4 x i64> [[T3]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 ; EE-UNI-VEC: [[STEP:%.*]] = call <vscale x 4 x i64> @llvm.{{(experimental\.)?}}stepvector.nxv4i64()
 ; EE-UNI-VEC: [[T5:%.*]] = add <vscale x 4 x i64> [[T4]], [[STEP]]
-; EE-UNI-VEC: [[MOD:%.*]] = and <vscale x 4 x i64> [[T5]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> {{(undef|poison)}}, i64 3, {{(i32|i64)}} 0), <vscale x 4 x i64> {{(undef|poison)}}, <vscale x 4 x i32> zeroinitializer)
-; EE-UNI-VEC: [[T6:%.*]] = shl <vscale x 4 x i64> [[STEP]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> {{(undef|poison)}}, i64 2, {{(i32|i64)}} 0), <vscale x 4 x i64> {{(undef|poison)}}, <vscale x 4 x i32> zeroinitializer)
+; EE-UNI-VEC: [[MOD:%.*]] = and <vscale x 4 x i64> [[T5]], {{shufflevector \(<vscale x 4 x i64> insertelement \(<vscale x 4 x i64> (undef|poison), i64 3, (i32|i64) 0\), <vscale x 4 x i64> (undef|poison), <vscale x 4 x i32> zeroinitializer\)|splat \(i64 3\)}}
+; EE-UNI-VEC: [[T6:%.*]] = shl <vscale x 4 x i64> [[STEP]], {{shufflevector \(<vscale x 4 x i64> insertelement \(<vscale x 4 x i64> (undef|poison), i64 2, (i32|i64) 0\), <vscale x 4 x i64> (undef|poison), <vscale x 4 x i32> zeroinitializer\)|splat \(i64 2\)}}
 
 ; LLVM 16 deduces add/or equivalence and uses `or` instead.
 ; EE-UNI-VEC: [[T7:%.*]] = {{add|or}} {{(disjoint )?}}<vscale x 4 x i64> [[T6]], [[MOD]]
@@ -128,10 +128,10 @@ entry:
 ; EE-INDICES: [[ALLOC:%.*]] = alloca <vscale x 16 x float>, align 64
 ; EE-INDICES: [[T0:%.*]] = getelementptr i32, ptr addrspace(1) %idxs, i64 %call
 ; EE-INDICES: [[T2:%.*]] = load <vscale x 4 x i32>, ptr addrspace(1) [[T0]], align 4
-; EE-INDICES: [[T3:%.*]] = and <vscale x 4 x i32> [[T2]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> {{(undef|poison)}}, i32 3, {{i32|i64}} 0), <vscale x 4 x i32> {{(undef|poison)}}, <vscale x 4 x i32> zeroinitializer)
+; EE-INDICES: [[T3:%.*]] = and <vscale x 4 x i32> [[T2]], {{shufflevector \(<vscale x 4 x i32> insertelement \(<vscale x 4 x i32> (undef|poison), i32 3, (i32|i64) 0\), <vscale x 4 x i32> (undef|poison), <vscale x 4 x i32> zeroinitializer\)|splat \(i32 3\)}}
 ; EE-INDICES: store <vscale x 16 x float> {{.*}}, ptr [[ALLOC]], align 64
 ; EE-INDICES: [[STEP:%.*]] = call <vscale x 4 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv4i32()
-; EE-INDICES: [[T4:%.*]] = shl <vscale x 4 x i32> [[STEP]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> {{(undef|poison)}}, i32 2, {{i32|i64}} 0), <vscale x 4 x i32> {{(undef|poison)}}, <vscale x 4 x i32> zeroinitializer)
+; EE-INDICES: [[T4:%.*]] = shl <vscale x 4 x i32> [[STEP]], {{shufflevector \(<vscale x 4 x i32> insertelement \(<vscale x 4 x i32> (undef|poison), i32 2, (i32|i64) 0\), <vscale x 4 x i32> (undef|poison), <vscale x 4 x i32> zeroinitializer\)|splat \(i32 2\)}}
 ; EE-INDICES: [[T5:%.*]] = {{add|or}} {{(disjoint )?}}<vscale x 4 x i32> [[T4]], [[T3]]
 ; EE-INDICES: [[IDX:%.*]] = sext <vscale x 4 x i32> [[T5]] to <vscale x 4 x i64>
 ; EE-INDICES: [[ADDR:%.*]] = getelementptr float, ptr [[ALLOC]], <vscale x 4 x i64> [[IDX]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/insert_element.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/insert_element.ll
index b2dcb47b5aeb3..d2ed9cef94e72 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/insert_element.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/insert_element.ll
@@ -99,7 +99,7 @@ entry:
 ; IE-INDICES: [[VAL:%.*]] = uitofp <vscale x 4 x i64> {{%.*}} to <vscale x 4 x float>
 ; IE-INDICES: store <vscale x 16 x float> {{%.*}}, ptr [[ALLOC]], align 64
 ; IE-INDICES: [[T1:%.*]] = call <vscale x 4 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv4i32()
-; IE-INDICES: [[T2:%.*]] = shl <vscale x 4 x i32> [[T1]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> {{(undef|poison)}}, i32 2, {{(i32|i64)}} 0), <vscale x 4 x i32> {{(undef|poison)}}, <vscale x 4 x i32> zeroinitializer)
+; IE-INDICES: [[T2:%.*]] = shl <vscale x 4 x i32> [[T1]], {{shufflevector \(<vscale x 4 x i32> insertelement \(<vscale x 4 x i32> (undef|poison), i32 2, (i32|i64) 0\), <vscale x 4 x i32> (undef|poison), <vscale x 4 x i32> zeroinitializer\)|splat \(i32 2\)}}
 
 ; LLVM 16 deduces add/or equivalence and uses `or` instead.
 ; IE-INDICES: [[T3:%.*]] = {{add|or}} {{(disjoint )?}}<vscale x 4 x i32> [[T2]], {{%.*}}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/interleaved_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/interleaved_load.ll
index d4f4c5339754d..708edd894d717 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/interleaved_load.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/interleaved_load.ll
@@ -53,7 +53,7 @@ declare i64 @__mux_get_global_id(i32)
 ; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 4 x i64> @llvm.{{(experimental\.)?}}stepvector.nxv4i64()
 ; CHECK-NEXT: [[TMP5:%.*]] = mul <vscale x 4 x i64> [[TMP3]], [[TMP4]]
 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, <vscale x 4 x ptr addrspace(1)> [[TMP1]], <vscale x 4 x i64> [[TMP5]]
-; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p1(<vscale x 4 x i32> [[ARG0]], <vscale x 4 x ptr addrspace(1)> [[TMP6]], i32 immarg 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, {{i32|i64}} 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)) [[MASKED_ATTRS:#[0-9]+]]
+; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p1(<vscale x 4 x i32> [[ARG0]], <vscale x 4 x ptr addrspace(1)> [[TMP6]], i32 immarg 4, <vscale x 4 x i1> {{shufflevector \(<vscale x 4 x i1> insertelement \(<vscale x 4 x i1> poison, i1 true, (i32|i64) 0\), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer\)|splat \(i1 true\)}}) [[MASKED_ATTRS:#[0-9]+]]
 ; CHECK-NEXT: ret void
 ; CHECK-NEXT: }
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/packetize_mask_varying.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/packetize_mask_varying.ll
index 61682c1baff07..b391a57c27ba3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/packetize_mask_varying.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/packetize_mask_varying.ll
@@ -39,7 +39,7 @@ if.end:
   ret void
 ; CHECK: define spir_kernel void @__vecz_nxv4_mask_varying
 ; CHECK: [[idx0:%.*]] = call <vscale x 16 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv16i32()
-; CHECK: [[idx1:%.*]] = lshr <vscale x 16 x i32> [[idx0]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 2, {{(i32|i64)}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
+; CHECK: [[idx1:%.*]] = lshr <vscale x 16 x i32> [[idx0]], {{shufflevector \(<vscale x 16 x i32> insertelement \(<vscale x 16 x i32> (undef|poison), i32 2, (i32|i64) 0\), <vscale x 16 x i32> (undef|poison), <vscale x 16 x i32> zeroinitializer\)|splat \(i32 2\)}}
 
 ; Note that since we just did a lshr 2 on the input of the extend, it doesn't
 ; make any difference whether it's a zext or sext, but LLVM 16 prefers zext.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select.ll
index f6a8addb32062..9a693646c7ad2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select.ll
@@ -55,7 +55,7 @@ declare i64 @__mux_get_global_id(i32)
 ; CHECK: [[lhs:%[0-9a-z]+]] = load <vscale x 4 x i32>, ptr
 ; CHECK: [[rhs:%[0-9a-z]+]] = load <vscale x 4 x i32>, ptr
 ; CHECK: [[cmp:%[0-9a-z]+]] = icmp slt <vscale x 4 x i32> [[lhs]], [[rhs]]
-; CHECK: [[sel:%[0-9a-z]+]] = select <vscale x 4 x i1> [[cmp]], <vscale x 4 x i32> [[rhs]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> {{(undef|poison)}}, i32 4, {{(i32|i64)}} 0), <vscale x 4 x i32> {{(undef|poison)}}, <vscale x 4 x i32> zeroinitializer)
+; CHECK: [[sel:%[0-9a-z]+]] = select <vscale x 4 x i1> [[cmp]], <vscale x 4 x i32> [[rhs]], <vscale x 4 x i32> {{shufflevector \(<vscale x 4 x i32> insertelement \(<vscale x 4 x i32> (undef|poison), i32 4, (i32|i64) 0\), <vscale x 4 x i32> (undef|poison), <vscale x 4 x i32> zeroinitializer\)|splat \(i32 4\)}}
 ; CHECK: store <vscale x 4 x i32> [[sel]],
 
 ; CHECK: define spir_kernel void @__vecz_nxv4_select_vector_vector
@@ -63,5 +63,5 @@ declare i64 @__mux_get_global_id(i32)
 ; CHECK: [[y:%[0-9a-z]+]] = load <vscale x 8 x i32>, ptr
 ; CHECK: [[z:%[0-9a-z]+]] = load <vscale x 8 x i32>, ptr
 ; CHECK: [[cmp:%[0-9a-z]+]] = icmp slt <vscale x 8 x i32> [[x]], [[y]]
-; CHECK: [[sel:%[0-9a-z]+]] = select <vscale x 8 x i1> [[cmp]], <vscale x 8 x i32> [[z]], <vscale x 8 x i32> shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> {{(undef|poison)}}, i32 4, {{(i32|i64)}} 0), <vscale x 8 x i32> {{(undef|poison)}}, <vscale x 8 x i32> zeroinitializer)
+; CHECK: [[sel:%[0-9a-z]+]] = select <vscale x 8 x i1> [[cmp]], <vscale x 8 x i32> [[z]], <vscale x 8 x i32> {{shufflevector \(<vscale x 8 x i32> insertelement \(<vscale x 8 x i32> (undef|poison), i32 4, (i32|i64) 0\), <vscale x 8 x i32> (undef|poison), <vscale x 8 x i32> zeroinitializer\)|splat \(i32 4\)}}
 ; CHECK: store <vscale x 8 x i32> [[sel]],
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select_scalar_vector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select_scalar_vector.ll
index f4fa88cb151ff..0d58887f98584 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select_scalar_vector.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select_scalar_vector.ll
@@ -44,7 +44,7 @@ entry:
 ; CHECK: [[sext:%.*]] = sext <vscale x 4 x i1> [[cmp1]] to <vscale x 4 x i8>
 ; CHECK: store <vscale x 4 x i8> [[sext]], ptr [[alloc:%.*]], align 4
 ; CHECK: [[idx0:%.*]] = call <vscale x 8 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv8i32()
-; CHECK: [[idx1:%.*]] = lshr <vscale x 8 x i32> [[idx0]], shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> {{(undef|poison)}}, i32 1, {{(i32|i64)}} 0), <vscale x 8 x i32> {{(undef|poison)}}, <vscale x 8 x i32> zeroinitializer)
+; CHECK: [[idx1:%.*]] = lshr <vscale x 8 x i32> [[idx0]], {{shufflevector \(<vscale x 8 x i32> insertelement \(<vscale x 8 x i32> (undef|poison), i32 1, (i32|i64) 0\), <vscale x 8 x i32> (undef|poison), <vscale x 8 x i32> zeroinitializer\)|splat \(i32 1\)}}
 
 ; Note that since we just did a lshr 1 on the input of the extend, it doesn't
 ; make any difference whether it's a zext or sext, but LLVM 16 prefers zext.
@@ -53,5 +53,5 @@ entry:
 ; CHECK: [[addrs:%.*]] = getelementptr i8, ptr [[alloc]], <vscale x 8 x i64> [[sext2]]
 ; CHECK: [[gather:%.*]] = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0(<vscale x 8 x ptr> [[addrs]],
 ; CHECK: [[cmp:%.*]] = trunc <vscale x 8 x i8> [[gather]] to <vscale x 8 x i1>
-; CHECK: [[sel:%.*]] = select <vscale x 8 x i1> [[cmp]], <vscale x 8 x i32> [[rhs]], <vscale x 8 x i32> shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> {{(undef|poison)}}, i32 4, {{(i32|i64)}} 0), <vscale x 8 x i32> {{(undef|poison)}}, <vscale x 8 x i32> zeroinitializer)
+; CHECK: [[sel:%.*]] = select <vscale x 8 x i1> [[cmp]], <vscale x 8 x i32> [[rhs]], <vscale x 8 x i32> {{shufflevector \(<vscale x 8 x i32> insertelement \(<vscale x 8 x i32> (undef|poison), i32 4, (i32|i64) 0\), <vscale x 8 x i32> (undef|poison), <vscale x 8 x i32> zeroinitializer\)|splat \(i32 4\)}}
 ; CHECK: store <vscale x 8 x i32> [[sel]],
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/shuffle.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/shuffle.ll
index 0745027793052..110fc935b9e5c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/shuffle.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/shuffle.ll
@@ -33,7 +33,7 @@ define spir_kernel void @do_shuffle_splat(i32* %aptr, <4 x i32>* %bptr, <4 x i32
   ret void
 ; CHECK: define spir_kernel void @__vecz_nxv4_do_shuffle_splat
 ; CHECK: [[idx0:%.*]] = call <vscale x 16 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv16i32()
-; CHECK: [[idx1:%.*]] = lshr <vscale x 16 x i32> [[idx0]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 2, {{(i32|i64)}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
+; CHECK: [[idx1:%.*]] = lshr <vscale x 16 x i32> [[idx0]], {{shufflevector \(<vscale x 16 x i32> insertelement \(<vscale x 16 x i32> (undef|poison), i32 2, (i32|i64) 0\), <vscale x 16 x i32> (undef|poison), <vscale x 16 x i32> zeroinitializer\)|splat \(i32 2\)}}
 
 ; Note that since we just did a lshr 2 on the input of the extend, it doesn't
 ; make any difference whether it's a zext or sext, but LLVM 16 prefers zext.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_interleaved_load_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_interleaved_load_store.ll
index bf8a3f08a104c..4ea882d804124 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_interleaved_load_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_interleaved_load_store.ll
@@ -59,7 +59,7 @@ declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double
 ; CHECK:   %BroadcastAddr.splatinsert = insertelement <vscale x 4 x ptr addrspace(1)> poison, ptr addrspace(1) %0, {{i32|i64}} 0
 ; CHECK:   %BroadcastAddr.splat = shufflevector <vscale x 4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <vscale x 4 x ptr addrspace(1)> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK:   %3 = call <vscale x 4 x i64> @llvm.{{(experimental\.)?}}stepvector.nxv4i64()
-; CHECK:   %4 = mul <vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 4, {{i32|i64}} 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer), %3
+; CHECK:   %4 = mul <vscale x 4 x i64> {{shufflevector \(<vscale x 4 x i64> insertelement \(<vscale x 4 x i64> poison, i64 4, (i32|i64) 0\), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer\)|splat \(i64 4\)}}, %3
 ; CHECK:   %5 = getelementptr double, <vscale x 4 x ptr addrspace(1)> %BroadcastAddr.splat, <vscale x 4 x i64> %4
 ; CHECK:   %6 = call <vscale x 4 x double> @llvm.vp.gather.nxv4f64.nxv4p1(<vscale x 4 x ptr addrspace(1)> %5, <vscale x 4 x i1> %1, i32 %2)
 ; CHECK:   ret <vscale x 4 x double> %6
@@ -73,7 +73,7 @@ declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double
 ; CHECK:  %BroadcastAddr.splatinsert = insertelement <vscale x 4 x ptr addrspace(1)> poison, ptr addrspace(1) %1, {{i32|i64}} 0
 ; CHECK:  %BroadcastAddr.splat = shufflevector <vscale x 4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <vscale x 4 x ptr addrspace(1)> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK:  %4 = call <vscale x 4 x i64> @llvm.{{(experimental\.)?}}stepvector.nxv4i64()
-; CHECK:  %5 = mul <vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 4, {{i32|i64}} 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer), %4
+; CHECK:  %5 = mul <vscale x 4 x i64> {{shufflevector \(<vscale x 4 x i64> insertelement \(<vscale x 4 x i64> poison, i64 4, (i32|i64) 0\), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer\)|splat \(i64 4\)}}, %4
 ; CHECK:  %6 = getelementptr double, <vscale x 4 x ptr addrspace(1)> %BroadcastAddr.splat, <vscale x 4 x i64> %5
 ; CHECK:  call void @llvm.vp.scatter.nxv4f64.nxv4p1(<vscale x 4 x double> %0, <vscale x 4 x ptr addrspace(1)> %6, <vscale x 4 x i1> %2, i32 %3)
 ; CHECK:  ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/load_add_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/load_add_store.ll
index 8da3755e1f378..7e8f0770dc215 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/load_add_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/load_add_store.ll
@@ -58,7 +58,7 @@ entry:
 ; CHECK_1S: [[T1:%.*]] = shl i64 [[T0]], 2
 ; CHECK_1S: [[T2:%.*]] = call i64 @llvm.umin.i64(i64 [[WREM]], i64 [[T1]])
 ; CHECK_1S: [[VL:%.*]] = trunc {{(nuw )?(nsw )?}}i64 [[T2]] to i32
-; CHECK_1S: [[LHS:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr {{%.*}}, [[TRUEMASK:<vscale x 4 x i1> shufflevector \(<vscale x 4 x i1> insertelement \(<vscale x 4 x i1> (undef|poison), i1 true, (i32|i64) 0\), <vscale x 4 x i1> (undef|poison), <vscale x 4 x i32> zeroinitializer\)]], i32 [[VL]])
+; CHECK_1S: [[LHS:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr {{%.*}}, [[TRUEMASK:<vscale x 4 x i1> (shufflevector \(<vscale x 4 x i1> insertelement \(<vscale x 4 x i1> (undef|poison), i1 true, (i32|i64) 0\), <vscale x 4 x i1> (undef|poison), <vscale x 4 x i32> zeroinitializer\)|splat \(i1 true\))]], i32 [[VL]])
 ; CHECK_1S: [[RHS:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr {{%.*}}, [[TRUEMASK]], i32 [[VL]])
 ; CHECK_1S: [[ADD:%.*]] = call <vscale x 4 x i32> @llvm.vp.add.nxv4i32(<vscale x 4 x i32> [[LHS]], <vscale x 4 x i32> [[RHS]], [[TRUEMASK]], i32 [[VL]])
 ; CHECK_1S: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[ADD]], ptr {{%.*}}, [[TRUEMASK]], i32 [[VL]])
@@ -99,7 +99,7 @@ entry:
 ; CHECK_V4_1S: [[VL:%.*]] = trunc {{(nuw )?(nsw )?}}i64 [[T2]] to i32
 ; Each WI performs 4 elements, so multiply the VL by 4
 ; CHECK_V4_1S: [[SVL:%.*]] = shl i32 [[VL]], 2
-; CHECK_V4_1S: [[LHS:%.*]] = call <vscale x 16 x i32> @llvm.vp.load.nxv16i32.p0(ptr {{%.*}}, [[TRUEMASK:<vscale x 16 x i1> shufflevector \(<vscale x 16 x i1> insertelement \(<vscale x 16 x i1> (undef|poison), i1 true, (i32|i64) 0\), <vscale x 16 x i1> (undef|poison), <vscale x 16 x i32> zeroinitializer\)]], i32 [[SVL]])
+; CHECK_V4_1S: [[LHS:%.*]] = call <vscale x 16 x i32> @llvm.vp.load.nxv16i32.p0(ptr {{%.*}}, [[TRUEMASK:<vscale x 16 x i1> (shufflevector \(<vscale x 16 x i1> insertelement \(<vscale x 16 x i1> (undef|poison), i1 true, (i32|i64) 0\), <vscale x 16 x i1> (undef|poison), <vscale x 16 x i32> zeroinitializer\)|splat \(i1 true\))]], i32 [[SVL]])
 ; CHECK_V4_1S: [[RHS:%.*]] = call <vscale x 16 x i32> @llvm.vp.load.nxv16i32.p0(ptr {{%.*}}, [[TRUEMASK]], i32 [[SVL]])
 ; CHECK_V4_1S: [[ADD:%.*]] = call <vscale x 16 x i32> @llvm.vp.add.nxv16i32(<vscale x 16 x i32> [[LHS]], <vscale x 16 x i32> [[RHS]], [[TRUEMASK]], i32 [[SVL]])
 ; CHECK_V4_1S: call void @llvm.vp.store.nxv16i32.p0(<vscale x 16 x i32> [[ADD]], ptr {{%.*}}, [[TRUEMASK]], i32 [[SVL]])
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/udiv.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/udiv.ll
index 7cd87a3cd55a1..bf082b4530bc8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/udiv.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/udiv.ll
@@ -44,7 +44,7 @@ entry:
 ; CHECK: [[T1:%.*]] = shl i64 [[T0]], 1
 ; CHECK: [[T2:%.*]] = call i64 @llvm.umin.i64(i64 [[WREM]], i64 [[T1]])
 ; CHECK: [[VL:%.*]] = trunc i64 [[T2]] to i32
-; CHECK: [[LHS:%.*]] = call <vscale x 2 x i32> @llvm.vp.load.nxv2i32.p0(ptr {{%.*}}, [[TRUEMASK:<vscale x 2 x i1> shufflevector \(<vscale x 2 x i1> insertelement \(<vscale x 2 x i1> (undef|poison), i1 true, (i32|i64) 0\), <vscale x 2 x i1> (undef|poison), <vscale x 2 x i32> zeroinitializer\)]], i32 [[VL]])
+; CHECK: [[LHS:%.*]] = call <vscale x 2 x i32> @llvm.vp.load.nxv2i32.p0(ptr {{%.*}}, [[TRUEMASK:<vscale x 2 x i1> (shufflevector \(<vscale x 2 x i1> insertelement \(<vscale x 2 x i1> (undef|poison), i1 true, (i32|i64) 0\), <vscale x 2 x i1> (undef|poison), <vscale x 2 x i32> zeroinitializer\)|splat \(i1 true\))]], i32 [[VL]])
 ; CHECK: [[RHS:%.*]] = call <vscale x 2 x i32> @llvm.vp.load.nxv2i32.p0(ptr {{%.*}}, [[TRUEMASK]], i32 [[VL]])
 ; CHECK: [[ADD:%.*]] = call <vscale x 2 x i32> @llvm.vp.udiv.nxv2i32(<vscale x 2 x i32> [[LHS]], <vscale x 2 x i32> [[RHS]], [[TRUEMASK]], i32 [[VL]])
 ; CHECK: call void @llvm.vp.store.nxv2i32.p0(<vscale x 2 x i32> [[ADD]], ptr {{%.*}}, [[TRUEMASK]], i32 [[VL]])

From 3524c8cf1d168063a9f97781d887ae2711aa41ba Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Fri, 6 Dec 2024 16:05:15 +0000
Subject: [PATCH 131/182] [LLVM 20] Allow getelementptr nuw flag.

LLVM 20 adds more flags to getelementptr. Allow this.
---
 .../vecz/test/lit/llvm/constant_address.ll    |  2 +-
 .../lit/llvm/constant_address_with_uniform.ll |  2 +-
 .../vecz/test/lit/llvm/emit_memintrinsics.ll  | 60 +++++++++----------
 .../vecz/test/lit/llvm/gep_duplication.ll     |  2 +-
 4 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address.ll
index a191e7314efc8..76354dce7a39f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address.ll
@@ -54,5 +54,5 @@ attributes #1 = { nounwind readnone }
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: %gid = call i64 @__mux_get_global_id(i32 0)
 ; CHECK-NEXT: %conv = trunc i64 %gid to i32
-; CHECK-NEXT: %arrayidx = getelementptr inbounds {{i32|i8}}, ptr addrspace(1) %out, i64 {{3|12}}
+; CHECK-NEXT: %arrayidx = getelementptr inbounds {{(nuw )?}}{{i32|i8}}, ptr addrspace(1) %out, i64 {{3|12}}
 ; CHECK-NEXT: store i32 %conv, ptr addrspace(1) %arrayidx, align 4
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address_with_uniform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address_with_uniform.ll
index c58dfa1e0229d..02d303d887a2d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address_with_uniform.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address_with_uniform.ll
@@ -36,6 +36,6 @@ entry:
 ; CHECK: define spir_kernel void @__vecz_v4_test
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: %gid = call i32 @__mux_get_global_id(i32 0)
-; CHECK-NEXT: %arrayidx = getelementptr inbounds {{i32|i8}}, ptr addrspace(1) %out, i32 {{3|12}}
+; CHECK-NEXT: %arrayidx = getelementptr inbounds {{(nuw )?}}{{i32|i8}}, ptr addrspace(1) %out, i32 {{3|12}}
 ; CHECK: store i32 %gid, ptr addrspace(1) %arrayidx, align 4
 ; CHECK: store <4 x ptr addrspace(1)> %{{.+}}, ptr addrspace(1) %{{.+}}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_memintrinsics.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_memintrinsics.ll
index 1b37af3f30ab0..2aa8fefac7a28 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_memintrinsics.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_memintrinsics.ll
@@ -98,45 +98,45 @@ declare i64 @__mux_get_local_id(i32)
 ; Check if the generated loads and stores are in place
 ; Check the stores for the first memset
 ; CHECK: store i64 %ms64val, ptr %sa
-; CHECK: %[[V14:[0-9]+]] = getelementptr inbounds i8, ptr %sa, i64 8
+; CHECK: %[[V14:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %sa, i64 8
 ; CHECK: store i64 %ms64val, ptr %[[V14]]
-; CHECK: %[[V15:[0-9]+]] = getelementptr inbounds i8, ptr %sa, i64 16
+; CHECK: %[[V15:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %sa, i64 16
 ; CHECK: store i64 %ms64val, ptr %[[V15]]
-; CHECK: %[[V16:[0-9]+]] = getelementptr inbounds i8, ptr %sa, i64 24
+; CHECK: %[[V16:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %sa, i64 24
 ; CHECK: store i64 %ms64val, ptr %[[V16]]
-; CHECK: %[[V17:[0-9]+]] = getelementptr inbounds i8, ptr %sa, i64 32
+; CHECK: %[[V17:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %sa, i64 32
 ; CHECK: store i64 %ms64val, ptr %[[V17]]
-; CHECK: %[[V18:[0-9]+]] = getelementptr inbounds i8, ptr %sa, i64 40
+; CHECK: %[[V18:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %sa, i64 40
 ; CHECK: store i64 %ms64val, ptr %[[V18]]
-; CHECK: %[[V19:[0-9]+]] = getelementptr inbounds i8, ptr %sa, i64 48
+; CHECK: %[[V19:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %sa, i64 48
 ; CHECK: store i64 %ms64val, ptr %[[V19]]
-; CHECK: %[[V20:[0-9]+]] = getelementptr inbounds i8, ptr %sa, i64 56
-; CHECK-EQ14: %[[V20:[0-9]+]] = getelementptr inbounds %struct.S2, %struct.S2* %sa, i64 0, i32 3, i64 8
-; CHECK: %[[V21:[0-9]+]] = getelementptr inbounds i8, ptr %sa, i64 64
-; CHECK: %[[V22:[0-9]+]] = getelementptr inbounds i8, ptr %sa, i64 72
+; CHECK: %[[V20:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %sa, i64 56
+; CHECK-EQ14: %[[V20:[0-9]+]] = getelementptr inbounds {{(nuw )?}}%struct.S2, %struct.S2* %sa, i64 0, i32 3, i64 8
+; CHECK: %[[V21:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %sa, i64 64
+; CHECK: %[[V22:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %sa, i64 72
 
 ; Check the stores for the second memset
 ; CHECK: store i64 0, ptr addrspace(1) %[[SB_I8AS]]
-; CHECK: %[[V24:[0-9]+]] = getelementptr inbounds i8, ptr addrspace(1) %[[SB_I8AS]], i64 8
+; CHECK: %[[V24:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr addrspace(1) %[[SB_I8AS]], i64 8
 ; CHECK: store i64 0, ptr addrspace(1) %[[V24]]
-; CHECK: %[[V26:[0-9]+]] = getelementptr inbounds i8, ptr addrspace(1) %[[SB_I8AS]], i64 16
+; CHECK: %[[V26:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr addrspace(1) %[[SB_I8AS]], i64 16
 ; CHECK: store i64 0, ptr addrspace(1) %[[V26]]
-; CHECK: %[[V28:[0-9]+]] = getelementptr inbounds i8, ptr addrspace(1) %[[SB_I8AS]], i64 24
+; CHECK: %[[V28:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr addrspace(1) %[[SB_I8AS]], i64 24
 ; CHECK: store i64 0, ptr addrspace(1) %[[V28]]
-; CHECK: %[[V30:[0-9]+]] = getelementptr inbounds i8, ptr addrspace(1) %[[SB_I8AS]], i64 32
+; CHECK: %[[V30:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr addrspace(1) %[[SB_I8AS]], i64 32
 ; CHECK: store i64 0, ptr addrspace(1) %[[V30]]
-; CHECK: %[[V32:[0-9]+]] = getelementptr inbounds i8, ptr addrspace(1) %[[SB_I8AS]], i64 40
+; CHECK: %[[V32:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr addrspace(1) %[[SB_I8AS]], i64 40
 ; CHECK: store i64 0, ptr addrspace(1) %[[V32]]
-; CHECK: %[[V33:[0-9]+]] = getelementptr inbounds i8, ptr addrspace(1) %[[SB_I8AS]], i64 48
+; CHECK: %[[V33:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr addrspace(1) %[[SB_I8AS]], i64 48
 ; CHECK: store i64 0, ptr addrspace(1) %[[V33]]
-; CHECK: %[[V35T:[0-9]+]] = getelementptr inbounds i8, ptr addrspace(1) %[[SB_I8AS]], i64 56
-; CHECK-EQ14: %[[V35T:[0-9]+]] = getelementptr inbounds %struct.S2, %struct.S2* %sb, i64 0, i32 3, i64 8
+; CHECK: %[[V35T:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr addrspace(1) %[[SB_I8AS]], i64 56
+; CHECK-EQ14: %[[V35T:[0-9]+]] = getelementptr inbounds {{(nuw )?}}%struct.S2, %struct.S2* %sb, i64 0, i32 3, i64 8
 ; CHECK-EQ14: %[[V35:[0-9]+]] = bitcast i8* %[[V35T]] to i64*
 ; CHECK-EQ14: %[[SB_I8AS18:.+]] = addrspacecast i64* %[[V35]] to i64 addrspace(1)*
 ; CHECK: store i64 0, ptr addrspace(1) %[[V35T]]
-; CHECK: %[[V36:[0-9]+]] = getelementptr inbounds i8, ptr addrspace(1) %[[SB_I8AS]], i64 64
+; CHECK: %[[V36:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr addrspace(1) %[[SB_I8AS]], i64 64
 ; CHECK: store i64 0, ptr addrspace(1) %[[V36]]
-; CHECK: %[[V38:[0-9]+]] = getelementptr inbounds i8, ptr addrspace(1) %[[SB_I8AS]], i64 72
+; CHECK: %[[V38:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr addrspace(1) %[[SB_I8AS]], i64 72
 ; CHECK: store i64 0, ptr addrspace(1) %[[V38]]
 
 
@@ -167,32 +167,32 @@ declare i64 @__mux_get_local_id(i32)
 ; CHECK:end:                                              ; preds = %middle, %entry
 ; CHECK: %[[SB_I8AS42:.+]] = load i64, ptr addrspace(1) %[[SB_I8AS]]
 ; CHECK: store i64 %[[SB_I8AS42]], ptr %result2
-; CHECK: %[[V42:[0-9]+]] = getelementptr inbounds i8, ptr %result2, i64 8
+; CHECK: %[[V42:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %result2, i64 8
 ; CHECK: %[[SB_I8AS44:.+]] = load i64, ptr addrspace(1) %[[V24]]
 ; CHECK: store i64 %[[SB_I8AS44]], ptr %[[V42]]
-; CHECK: %[[V43:[0-9]+]] = getelementptr inbounds i8, ptr %result2, i64 16
+; CHECK: %[[V43:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %result2, i64 16
 ; CHECK: %[[SB_I8AS46:.+]] = load i64, ptr addrspace(1) %[[V26]]
 ; CHECK: store i64 %[[SB_I8AS46]], ptr %[[V43]]
-; CHECK: %[[V44:[0-9]+]] = getelementptr inbounds i8, ptr %result2, i64 24
+; CHECK: %[[V44:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %result2, i64 24
 ; CHECK: %[[SB_I8AS48:.+]] = load i64, ptr addrspace(1) %[[V28]]
 ; CHECK: store i64 %[[SB_I8AS48]], ptr %[[V44]]
-; CHECK: %[[V45:[0-9]+]] = getelementptr inbounds i8, ptr %result2, i64 32
+; CHECK: %[[V45:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %result2, i64 32
 ; CHECK: %[[SB_I8AS50:.+]] = load i64, ptr addrspace(1) %[[V30]]
 ; CHECK: store i64 %[[SB_I8AS50]], ptr %[[V45]]
-; CHECK: %[[V46:[0-9]+]] = getelementptr inbounds i8, ptr %result2, i64 40
+; CHECK: %[[V46:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %result2, i64 40
 ; CHECK: %[[SB_I8AS52:.+]] = load i64, ptr addrspace(1) %[[V32]]
 ; CHECK: store i64 %[[SB_I8AS52]], ptr %[[V46]]
-; CHECK: %[[V47:[0-9]+]] = getelementptr inbounds i8, ptr %result2, i64 48
+; CHECK: %[[V47:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %result2, i64 48
 ; CHECK: %[[SB_I8AS54:.+]] = load i64, ptr addrspace(1) %[[V33]]
 ; CHECK: store i64 %[[SB_I8AS54]], ptr %[[V47]]
-; CHECK: %[[V48:[0-9]+]] = getelementptr inbounds i8, ptr %result2, i64 56
-; CHECK-EQ14: %[[V48:[0-9]+]] = getelementptr inbounds %struct.S2, %struct.S2* %result2, i64 0, i32 3, i64 8
+; CHECK: %[[V48:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %result2, i64 56
+; CHECK-EQ14: %[[V48:[0-9]+]] = getelementptr inbounds {{(nuw )?}}%struct.S2, %struct.S2* %result2, i64 0, i32 3, i64 8
 ; CHECK: %[[SB_I8AS56:.+]] = load i64, ptr addrspace(1) %[[V35T]]
 ; CHECK: store i64 %[[SB_I8AS56]], ptr %[[V48]]
-; CHECK: %[[V49:[0-9]+]] = getelementptr inbounds i8, ptr %result2, i64 64
+; CHECK: %[[V49:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %result2, i64 64
 ; CHECK: %[[SB_I8AS58:.+]] = load i64, ptr addrspace(1) %[[V36]]
 ; CHECK: store i64 %[[SB_I8AS58]], ptr %[[V49]]
-; CHECK: %[[V50:[0-9]+]] = getelementptr inbounds i8, ptr %result2, i64 72
+; CHECK: %[[V50:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %result2, i64 72
 ; CHECK: %[[SB_I8AS60:.+]] = load i64, ptr addrspace(1) %[[V38]]
 ; CHECK: store i64 %[[SB_I8AS60]], ptr %[[V50]]
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_duplication.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_duplication.ll
index 5ac166cedf570..7f766f04b74ca 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_duplication.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_duplication.ll
@@ -26,7 +26,7 @@ target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 ; combination of instcombine and GVN).
 ; CHECK: spir_kernel void @__vecz_v{{[0-9]+}}_gep_duplication
 ; CHECK: entry:
-; CHECK: getelementptr inbounds {{\[2 x i32]|i8}}, ptr %myStruct, {{i64 0, i64 1|i64 4}}
+; CHECK: getelementptr inbounds {{(nuw )?}}{{\[2 x i32]|i8}}, ptr %myStruct, {{i64 0, i64 1|i64 4}}
 ; CHECK-NOT: getelementptr {{.*}}%myStruct
 define spir_kernel void @gep_duplication(ptr addrspace(1) align 4 %out) {
 entry:

From a9bb9655dad377865043f1b5a7c1c260e51b4aff Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Fri, 10 Jan 2025 15:23:58 +0000
Subject: [PATCH 132/182] [compiler] Handle skipped vectorized functions.

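In corner cases, we may end up with a vectorized function that is
unconditionally skipped because the trip count can be determined at
compilation time to be 0. The code that was already there to handle this
was not quite right: when only the tail barrier struct is used, its
values were not picked up, and when the main loop has no iterations, the
sub-group and scan induction variables were not carried forward.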
---
 .../source/work_item_loops_pass.cpp           | 34 ++++++++++++-------
 1 file changed, 21 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
index 074b1eb95bde9..4584a745e41e8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
@@ -607,20 +607,26 @@ struct ScheduleGenerator {
           idsTail[1] = tailGroupCall->getOperand(3);
           idsTail[2] = tailGroupCall->getOperand(4);
           getUniformValues(tailUniformBlock, *barrierTail, idsTail);
-        }
 
-        // If both barrier structs had to be used, we need to merge the result.
-        if (mainUniformBlock && tailUniformBlock) {
-          block = BasicBlock::Create(context, "ca_merge_uniform_load", func);
-          BranchInst::Create(block, tailUniformBlock);
-          BranchInst::Create(block, mainUniformBlock);
-
-          for (size_t i = 0; i != 3; ++i) {
-            auto *mergePhi = PHINode::Create(idsMain[i]->getType(), 2,
-                                             "uniform_merge", block);
-            mergePhi->addIncoming(idsMain[i], mainUniformBlock);
-            mergePhi->addIncoming(idsTail[i], tailUniformBlock);
-            idsMain[i] = mergePhi;
+          if (mainUniformBlock) {
+            // If both barrier structs had to be used, we need to merge the
+            // result.
+            block = BasicBlock::Create(context, "ca_merge_uniform_load", func);
+            BranchInst::Create(block, tailUniformBlock);
+            BranchInst::Create(block, mainUniformBlock);
+
+            for (size_t i = 0; i != 3; ++i) {
+              auto *mergePhi = PHINode::Create(idsMain[i]->getType(), 2,
+                                               "uniform_merge", block);
+              mergePhi->addIncoming(idsMain[i], mainUniformBlock);
+              mergePhi->addIncoming(idsTail[i], tailUniformBlock);
+              idsMain[i] = mergePhi;
+            }
+          } else {
+            // Otherwise we can use the tail.
+            for (size_t i = 0; i != 3; ++i) {
+              idsMain[i] = idsTail[i];
+            }
           }
         }
 
@@ -1017,6 +1023,8 @@ struct ScheduleGenerator {
                       // No main iterations at all!
                       mainPreheaderBB = nullptr;
                       mainExitBB = block;
+                      nextSubgroupIV = ivs1[0];
+                      nextScanIV = ivs1[1];
                     }
                   } else {
                     mainPreheaderBB = BasicBlock::Create(

From acfd588d23bbcdbe00edb7031175a816269fe642 Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Wed, 15 Jan 2025 11:18:20 +0000
Subject: [PATCH 133/182] [compiler] Avoid out of bounds access.

If !isScan, ivs1 is not guaranteed to hold at least two items and we
could fail trying to retrieve ivs1[1] even if the result would be
unused.
---
 .../compiler_pipeline/source/work_item_loops_pass.cpp         | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
index 4584a745e41e8..74546cff23aad 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
@@ -1024,7 +1024,9 @@ struct ScheduleGenerator {
                       mainPreheaderBB = nullptr;
                       mainExitBB = block;
                       nextSubgroupIV = ivs1[0];
-                      nextScanIV = ivs1[1];
+                      if (isScan) {
+                        nextScanIV = ivs1[1];
+                      }
                     }
                   } else {
                     mainPreheaderBB = BasicBlock::Create(

From 9e7b4794776f558fc78413a834b039abf80dd3f4 Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Fri, 17 Jan 2025 12:26:48 +0000
Subject: [PATCH 134/182] [mux] Remove degenerate subgroup support.

Older versions of oneAPI Construction Kit used an implementation of
subgroups where the subgroup size was always equal to the workgroup
size. This implementation is no longer used by any targets, but the code
to support it was still in place. This commit removes it.
---
 .../include/compiler/utils/attributes.h              | 10 ----------
 .../compiler_pipeline/source/attributes.cpp          | 12 ------------
 .../source/work_item_loops_pass.cpp                  |  5 +----
 3 files changed, 1 insertion(+), 26 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/attributes.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/attributes.h
index 851847a725d69..cc19a11db7f9e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/attributes.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/attributes.h
@@ -162,16 +162,6 @@ void setBarrierSchedule(llvm::CallInst &CI, BarrierSchedule Sched);
 /// @return the execution schedule for this barrier
 BarrierSchedule getBarrierSchedule(const llvm::CallInst &CI);
 
-/// @brief Marks a kernel's subgroups as degenerate
-///
-/// @param[in] F Function in which to encode the information.
-void setHasDegenerateSubgroups(llvm::Function &F);
-
-/// @brief Returns whether the kernel has degenerate subgroups.
-///
-/// @param[in] F Function to check.
-bool hasDegenerateSubgroups(const llvm::Function &F);
-
 /// @brief Marks a function as not explicitly using subgroups
 ///
 /// May be set even with unresolved external functions, assuming those don't
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/attributes.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/attributes.cpp
index 77ecd0513b4b0..e04e5a0aef614 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/attributes.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/attributes.cpp
@@ -186,18 +186,6 @@ BarrierSchedule getBarrierSchedule(const CallInst &CI) {
   return BarrierSchedule::Unordered;
 }
 
-static constexpr const char *MuxDegenerateSubgroupsAttrName =
-    "mux-degenerate-subgroups";
-
-void setHasDegenerateSubgroups(Function &F) {
-  F.addFnAttr(MuxDegenerateSubgroupsAttrName);
-}
-
-bool hasDegenerateSubgroups(const Function &F) {
-  const Attribute Attr = F.getFnAttribute(MuxDegenerateSubgroupsAttrName);
-  return Attr.isValid();
-}
-
 static constexpr const char *MuxNoSubgroupsAttrName = "mux-no-subgroups";
 
 void setHasNoExplicitSubgroups(Function &F) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
index 74546cff23aad..0fa90f90e8b58 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
@@ -1913,8 +1913,6 @@ PreservedAnalyses compiler::utils::WorkItemLoopsPass::run(
     // don't want to create another wrapper where the scalar tail is the
     // 'main', unless that tail is useful as a fallback sub-group kernel. A
     // fallback sub-group kernel is one for which:
-    // * The 'main' is not a degenerate sub-group kernel. These are always safe
-    // to run so the fallback is unnecessary.
     // * The 'main' has a required sub-group size that isn't the scalar size.
     // * The 'main' and 'tail' kernels both make use of sub-group builtins. If
     // neither do, there's no need for the fallback.
@@ -1922,8 +1920,7 @@ PreservedAnalyses compiler::utils::WorkItemLoopsPass::run(
     // cleanly divides the known local work-group size.
     if (P.SkippedTailF || (P.TailInfo && P.TailInfo->vf.isScalar())) {
       const auto *TailF = P.SkippedTailF ? P.SkippedTailF : P.TailF;
-      if (hasDegenerateSubgroups(*P.MainF) ||
-          getReqdSubgroupSize(*P.MainF).value_or(1) != 1 ||
+      if (getReqdSubgroupSize(*P.MainF).value_or(1) != 1 ||
           (!GSGI.usesSubgroups(*P.MainF) && !GSGI.usesSubgroups(*TailF))) {
         RedundantMains.insert(TailF);
       } else if (auto wgs = parseRequiredWGSMetadata(*P.MainF)) {

From da04975e987b09120ef6ec645e8938e8d24d7cc7 Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Sat, 18 Jan 2025 01:08:35 +0000
Subject: [PATCH 135/182] [compiler] Do not mix kernels with different
 sub-group sizes.

Per section 3.2.1 ("Mapping work-items onto an NDRange") of the OpenCL
3.0 API specification, all sub-groups within a work-group will be the
same size, apart from the sub-group with the maximum index, which may be
smaller if the size of the work-group is not evenly divisible by the
size of the sub-groups. We were not meeting this requirement: in cases
where we would not or could not generate a predicated vectorized kernel,
we would execute the scalar kernel in a loop for any remaining work
items, possibly resulting in multiple sub-groups that are smaller than
the maximum sub-group size.

To avoid this situation, we need to avoid mixing vector and scalar
kernels if those kernels use different sub-group sizes. If we can handle
all items with vector kernels, possibly with predication, continue to do
so. If the vector and scalar kernels do not depend on the sub-group
size, also continue to handle this as before. If the vector and scalar
kernels do depend on the sub-group size, and the vector kernel cannot
handle all work items, we need to switch to the scalar kernel for all
work items.

This includes a small optimization where if we know the kernel does not
use sub-group information, we avoid setting sub-group IDs.

This includes one change to createLoop which permits nullptr entries in
the list of IVs. No PHI is created for such an entry; they are useful
since PHIs must be referred to by index in the callback function. This
allows indices to be constant even when the caller has multiple optional
PHIs.

This also includes one bugfix to ControlFlowConversionPass to fix a
crash seen now, where we use the result of createMasked{Load,Store}
before checking whether it succeeded.

This also includes one improvement to CompileKernelToBin.cmake. If the
executed command fails, it will now be printed in a format that can be
copied and pasted.
---
 .../source/pass_functions.cpp                 |   4 +
 .../source/work_item_loops_pass.cpp           | 321 ++++++++++--------
 .../control_flow_conversion_pass.cpp          |   3 +-
 3 files changed, 183 insertions(+), 145 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp
index 1287e1009df0c..d13a1751507f1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp
@@ -519,6 +519,9 @@ llvm::BasicBlock *createLoop(llvm::BasicBlock *entry, llvm::BasicBlock *exit,
 
   // Set up all of our user PHIs
   for (unsigned i = 0, e = currIVs.size(); i != e; i++) {
+    // For convenience to callers, permit nullptr and skip over it.
+    if (!currIVs[i]) continue;
+
     auto *const phi = loopIR.CreatePHI(currIVs[i]->getType(), 2);
     llvm::cast<llvm::PHINode>(phi)->addIncoming(currIVs[i],
                                                 entryIR.GetInsertBlock());
@@ -542,6 +545,7 @@ llvm::BasicBlock *createLoop(llvm::BasicBlock *entry, llvm::BasicBlock *exit,
 
   // Update all of our PHIs
   for (unsigned i = 0, e = currIVs.size(); i != e; i++) {
+    if (!currIVs[i]) continue;
     llvm::cast<llvm::PHINode>(currIVs[i])->addIncoming(nextIVs[i], latch);
   }
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
index 0fa90f90e8b58..6ea8d484fc16d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
@@ -203,8 +203,8 @@ struct ScheduleGenerator {
   AllocaInst *nextID = nullptr;
   Value *mainLoopLimit = nullptr;
   Value *peel = nullptr;
+  bool noExplicitSubgroups = false;
   bool emitTail = true;
-  bool isVectorPredicated = false;
   bool wrapperHasMain = false;
   bool wrapperHasTail = false;
 
@@ -726,8 +726,10 @@ struct ScheduleGenerator {
         mainPreheaderBB->moveAfter(block);
         mainExitBB->moveAfter(mainPreheaderBB);
 
-        subgroupMergePhi = PHINode::Create(i32Ty, 2, "", mainExitBB);
-        subgroupMergePhi->addIncoming(i32Zero, block);
+        if (!noExplicitSubgroups) {
+          subgroupMergePhi = PHINode::Create(i32Ty, 2, "", mainExitBB);
+          subgroupMergePhi->addIncoming(i32Zero, block);
+        }
 
         auto *const needMain =
             CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_NE, zero,
@@ -744,7 +746,9 @@ struct ScheduleGenerator {
       wrapperHasMain = true;
       // Subgroup induction variables
       compiler::utils::CreateLoopOpts outer_opts;
-      outer_opts.IVs = {i32Zero};
+      if (!noExplicitSubgroups) {
+        outer_opts.IVs = {i32Zero};
+      }
 
       // looping through num groups in the third (outermost) dimension
       mainExitBB = compiler::utils::createLoop(
@@ -788,32 +792,41 @@ struct ScheduleGenerator {
                           MutableArrayRef<Value *> ivsNext0) -> BasicBlock * {
                         IRBuilder<> ir(block);
 
-                        // set our subgroup id
-                        ir.CreateCall(set_subgroup_id, {ivs0[0]})
-                            ->setCallingConv(set_subgroup_id->getCallingConv());
+                        if (!noExplicitSubgroups) {
+                          // set our subgroup id
+                          ir.CreateCall(set_subgroup_id, {ivs0[0]})
+                              ->setCallingConv(
+                                  set_subgroup_id->getCallingConv());
+                        }
 
                         createWorkItemLoopBody(barrierMain, ir, block,
                                                barrierID, dim_0, dim_1, dim_2,
                                                accum, VF);
 
-                        nextSubgroupIV =
-                            ir.CreateAdd(ivs0[0], ConstantInt::get(i32Ty, 1));
-                        ivsNext0[0] = nextSubgroupIV;
+                        if (!noExplicitSubgroups) {
+                          nextSubgroupIV =
+                              ir.CreateAdd(ivs0[0], ConstantInt::get(i32Ty, 1));
+                          ivsNext0[0] = nextSubgroupIV;
+                        }
 
                         return block;
                       });
 
-                  // Don't forget to update the subgroup IV phi.
-                  ivsNext1[0] = nextSubgroupIV;
+                  if (!noExplicitSubgroups) {
+                    // Don't forget to update the subgroup IV phi.
+                    ivsNext1[0] = nextSubgroupIV;
+                  }
 
                   return exit0;
                 });
 
-            // Don't forget to update the subgroup IV phi.
-            ivsNext2[0] = nextSubgroupIV;
+            if (!noExplicitSubgroups) {
+              // Don't forget to update the subgroup IV phi.
+              ivsNext2[0] = nextSubgroupIV;
 
-            if (subgroupMergePhi) {
-              subgroupMergePhi->addIncoming(nextSubgroupIV, exit1);
+              if (subgroupMergePhi) {
+                subgroupMergePhi->addIncoming(nextSubgroupIV, exit1);
+              }
             }
 
             return exit1;
@@ -861,7 +874,9 @@ struct ScheduleGenerator {
       wrapperHasTail = true;
       // Subgroup induction variables
       compiler::utils::CreateLoopOpts outer_opts;
-      outer_opts.IVs = {subgroupMergePhi ? subgroupMergePhi : nextSubgroupIV};
+      if (!noExplicitSubgroups) {
+        outer_opts.IVs = {subgroupMergePhi ? subgroupMergePhi : nextSubgroupIV};
+      }
 
       // looping through num groups in the third (outermost) dimension
       tailExitBB = compiler::utils::createLoop(
@@ -899,7 +914,7 @@ struct ScheduleGenerator {
                           MutableArrayRef<Value *> ivsNext0) -> BasicBlock * {
                         IRBuilder<> ir(block);
 
-                        if (set_subgroup_id) {
+                        if (!noExplicitSubgroups) {
                           // set our subgroup id
                           ir.CreateCall(set_subgroup_id, {ivs0[0]})
                               ->setCallingConv(
@@ -910,21 +925,27 @@ struct ScheduleGenerator {
                             *barrierTail, ir, block, barrierID, dim_0, dim_1,
                             dim_2, accum, /*VF*/ nullptr, mainLoopLimit);
 
-                        nextSubgroupIV =
-                            ir.CreateAdd(ivs0[0], ConstantInt::get(i32Ty, 1));
-                        ivsNext0[0] = nextSubgroupIV;
+                        if (!noExplicitSubgroups) {
+                          nextSubgroupIV =
+                              ir.CreateAdd(ivs0[0], ConstantInt::get(i32Ty, 1));
+                          ivsNext0[0] = nextSubgroupIV;
+                        }
 
                         return block;
                       });
 
-                  // Don't forget to update the subgroup IV phi.
-                  ivsNext1[0] = nextSubgroupIV;
+                  if (!noExplicitSubgroups) {
+                    // Don't forget to update the subgroup IV phi.
+                    ivsNext1[0] = nextSubgroupIV;
+                  }
 
                   return exit0;
                 });
 
-            // Don't forget to update the subgroup IV phi.
-            ivsNext2[0] = nextSubgroupIV;
+            if (!noExplicitSubgroups) {
+              // Don't forget to update the subgroup IV phi.
+              ivsNext2[0] = nextSubgroupIV;
+            }
 
             return exit1;
           });
@@ -960,11 +981,11 @@ struct ScheduleGenerator {
 
     // The subgroup induction variable, set to the value of the subgroup ID at
     // the end of the last loop (i.e. beginning of the next loop)
-    Value *nextSubgroupIV = i32Zero;
+    Value *nextSubgroupIV = noExplicitSubgroups ? nullptr : i32Zero;
 
     // The work-group scan induction variable, set to the current scan value at
     // the end of the last loop (i.e. beginning of the next loop)
-    Value *nextScanIV = accum;
+    Value *nextScanIV = isScan ? accum : nullptr;
 
     // We need to ensure any subgroup IV is defined on the path in which
     // the vector loop is skipped.
@@ -973,12 +994,8 @@ struct ScheduleGenerator {
     PHINode *scanMergePhi = nullptr;
 
     compiler::utils::CreateLoopOpts outer_opts;
-    outer_opts.IVs.push_back(i32Zero);
-    outer_opts.loopIVNames.push_back("sg.z");
-    if (isScan) {
-      outer_opts.IVs.push_back(nextScanIV);
-      outer_opts.loopIVNames.push_back("scan.z");
-    }
+    outer_opts.IVs = {nextSubgroupIV, nextScanIV};
+    outer_opts.loopIVNames = {"sg.z", "scan.z"};
 
     // looping through num groups in the third (outermost) dimension
     return compiler::utils::createLoop(
@@ -993,10 +1010,7 @@ struct ScheduleGenerator {
 
           compiler::utils::CreateLoopOpts middle_opts;
           middle_opts.IVs = ivs2.vec();
-          middle_opts.loopIVNames.push_back("sg.y");
-          if (isScan) {
-            middle_opts.loopIVNames.push_back("scan.y");
-          }
+          middle_opts.loopIVNames = {"sg.y", "scan.y"};
 
           // looping through num groups in the second dimension
           BasicBlock *exit1 = compiler::utils::createLoop(
@@ -1023,7 +1037,9 @@ struct ScheduleGenerator {
                       // No main iterations at all!
                       mainPreheaderBB = nullptr;
                       mainExitBB = block;
-                      nextSubgroupIV = ivs1[0];
+                      if (!noExplicitSubgroups) {
+                        nextSubgroupIV = ivs1[0];
+                      }
                       if (isScan) {
                         nextScanIV = ivs1[1];
                       }
@@ -1037,9 +1053,11 @@ struct ScheduleGenerator {
                     mainPreheaderBB->moveAfter(block);
                     mainExitBB->moveAfter(mainPreheaderBB);
 
-                    subgroupMergePhi =
-                        PHINode::Create(i32Ty, 2, "sg.merge", mainExitBB);
-                    subgroupMergePhi->addIncoming(ivs1[0], block);
+                    if (!noExplicitSubgroups) {
+                      subgroupMergePhi =
+                          PHINode::Create(i32Ty, 2, "sg.merge", mainExitBB);
+                      subgroupMergePhi->addIncoming(ivs1[0], block);
+                    }
 
                     if (isScan) {
                       scanMergePhi = PHINode::Create(accum->getType(), 2,
@@ -1072,10 +1090,7 @@ struct ScheduleGenerator {
                   compiler::utils::CreateLoopOpts inner_vf_opts;
                   inner_vf_opts.indexInc = VF;
                   inner_vf_opts.IVs = ivs1.vec();
-                  inner_vf_opts.loopIVNames.push_back("sg.x.main");
-                  if (isScan) {
-                    inner_vf_opts.loopIVNames.push_back("scan.y.main");
-                  }
+                  inner_vf_opts.loopIVNames = {"sg.x.main", "scan.x.main"};
 
                   mainExitBB = compiler::utils::createLoop(
                       mainPreheaderBB, mainExitBB, zero, mainLoopLimit,
@@ -1085,7 +1100,7 @@ struct ScheduleGenerator {
                           MutableArrayRef<Value *> ivsNext0) -> BasicBlock * {
                         IRBuilder<> ir(block);
 
-                        if (set_subgroup_id) {
+                        if (!noExplicitSubgroups) {
                           // set our subgroup id
                           ir.CreateCall(set_subgroup_id, {ivs0[0]})
                               ->setCallingConv(
@@ -1112,10 +1127,12 @@ struct ScheduleGenerator {
                                                barrierID, dim_0, dim_1, dim_2,
                                                accum, VF);
 
-                        nextSubgroupIV =
-                            ir.CreateAdd(ivs0[0], ConstantInt::get(i32Ty, 1),
-                                         "sg.x.main.inc");
-                        ivsNext0[0] = nextSubgroupIV;
+                        if (!noExplicitSubgroups) {
+                          nextSubgroupIV =
+                              ir.CreateAdd(ivs0[0], ConstantInt::get(i32Ty, 1),
+                                           "sg.x.main.inc");
+                          ivsNext0[0] = nextSubgroupIV;
+                        }
 
                         // Move the exit after the loop block, as it reads more
                         // logically.
@@ -1129,10 +1146,12 @@ struct ScheduleGenerator {
 
                   if (subgroupMergePhi) {
                     subgroupMergePhi->addIncoming(nextSubgroupIV, mainLoopBB);
+                    nextSubgroupIV = subgroupMergePhi;
                   }
 
                   if (scanMergePhi) {
                     scanMergePhi->addIncoming(nextScanIV, mainLoopBB);
+                    nextScanIV = scanMergePhi;
                   }
                 }
                 assert(mainExitBB && "didn't create a loop exit block!");
@@ -1178,17 +1197,13 @@ struct ScheduleGenerator {
                   assert(barrierTail);
                   wrapperHasTail = true;
                   // Subgroup induction variables
-                  SmallVector<Value *, 2> subgroupIVs0 = {
-                      subgroupMergePhi ? subgroupMergePhi : nextSubgroupIV};
-                  if (isScan) {
-                    subgroupIVs0.push_back(scanMergePhi ? scanMergePhi
-                                                        : nextScanIV);
-                  }
+                  SmallVector<Value *, 2> subgroupIVs0 = {nextSubgroupIV,
+                                                          nextScanIV};
 
                   BasicBlock *tailLoopBB = nullptr;
                   if (barrierTail->getVFInfo().IsVectorPredicated) {
                     IRBuilder<> ir(tailPreheaderBB);
-                    if (set_subgroup_id) {
+                    if (!noExplicitSubgroups) {
                       // set our subgroup id
                       ir.CreateCall(set_subgroup_id, {subgroupIVs0[0]})
                           ->setCallingConv(set_subgroup_id->getCallingConv());
@@ -1215,9 +1230,12 @@ struct ScheduleGenerator {
                                            barrierID, zero, dim_1, dim_2, accum,
                                            /*VF*/ nullptr, mainLoopLimit);
 
-                    nextSubgroupIV = ir.CreateAdd(subgroupIVs0[0],
-                                                  ConstantInt::get(i32Ty, 1),
-                                                  "sg.x.tail.inc");
+                    if (!noExplicitSubgroups) {
+                      nextSubgroupIV = ir.CreateAdd(subgroupIVs0[0],
+                                                    ConstantInt::get(i32Ty, 1),
+                                                    "sg.x.tail.inc");
+                    }
+
                     assert(tailExitBB);
                     ir.CreateBr(tailExitBB);
                     tailLoopBB = tailPreheaderBB;
@@ -1226,10 +1244,8 @@ struct ScheduleGenerator {
                     inner_scalar_opts.disableVectorize = true;
                     inner_scalar_opts.IVs.assign(subgroupIVs0.begin(),
                                                  subgroupIVs0.end());
-                    inner_scalar_opts.loopIVNames.push_back("sg.x.tail");
-                    if (isScan) {
-                      inner_scalar_opts.loopIVNames.push_back("scan.x.tail");
-                    }
+                    inner_scalar_opts.loopIVNames = {"sg.x.tail",
+                                                     "scan.x.tail"};
 
                     tailExitBB = compiler::utils::createLoop(
                         tailPreheaderBB, tailExitBB, zero, peel,
@@ -1239,7 +1255,7 @@ struct ScheduleGenerator {
                             MutableArrayRef<Value *> ivsNext0) -> BasicBlock * {
                           IRBuilder<> ir(block);
 
-                          if (set_subgroup_id) {
+                          if (!noExplicitSubgroups) {
                             // set our subgroup id
                             ir.CreateCall(set_subgroup_id, {ivs0[0]})
                                 ->setCallingConv(
@@ -1269,10 +1285,12 @@ struct ScheduleGenerator {
                               *barrierTail, ir, block, barrierID, dim_0, dim_1,
                               dim_2, accum, /*VF*/ nullptr, mainLoopLimit);
 
-                          nextSubgroupIV =
-                              ir.CreateAdd(ivs0[0], ConstantInt::get(i32Ty, 1),
-                                           "sg.x.tail.inc");
-                          ivsNext0[0] = nextSubgroupIV;
+                          if (!noExplicitSubgroups) {
+                            nextSubgroupIV = ir.CreateAdd(
+                                ivs0[0], ConstantInt::get(i32Ty, 1),
+                                "sg.x.tail.inc");
+                            ivsNext0[0] = nextSubgroupIV;
+                          }
 
                           tailLoopBB = block;
                           // Move the exit after the loop block, as it reads
@@ -1309,21 +1327,29 @@ struct ScheduleGenerator {
                         ->addIncoming(scanMergePhi, mainExitBB);
                   }
                 }
-                // Don't forget to update the subgroup IV phi.
-                ivsNext1[0] = nextSubgroupIV;
+
+                if (!noExplicitSubgroups) {
+                  // Don't forget to update the subgroup IV phi.
+                  ivsNext1[0] = nextSubgroupIV;
+                }
+
                 if (isScan) {
                   // ... or the scan IV phi.
                   ivsNext1[1] = nextScanIV;
                 }
+
                 return tailExitBB;
               });
 
-          // Don't forget to update the subgroup IV phi.
-          ivsNext2[0] = nextSubgroupIV;
+          if (!noExplicitSubgroups) {
+            // Don't forget to update the subgroup IV phi.
+            ivsNext2[0] = nextSubgroupIV;
+          }
           if (isScan) {
             // ... or the scan IV phi.
             ivsNext2[1] = nextScanIV;
           }
+
           return exit1;
         });
   }
@@ -1478,50 +1504,96 @@ Function *compiler::utils::WorkItemLoopsPass::makeWrapperFunction(
   // happening.
   // We want to insert a call to __mux__set_max_sub_group_size after these
   // assumptions, to keep track of the last one we've inserted.
-  Instruction *setMaxSubgroupSizeInsertPt = nullptr;
   for (auto i = 0; i < 3; i++) {
     auto *const nonZero = entryIR.CreateICmpNE(
         localSizeDim[i], ConstantInt::get(localSizeDim[i]->getType(), 0));
-    setMaxSubgroupSizeInsertPt = entryIR.CreateAssumption(nonZero);
+    entryIR.CreateAssumption(nonZero);
   }
 
-  const bool isVectorPredicated = barrierMain.getVFInfo().IsVectorPredicated;
+  // There are four cases:
+  //
+  // 1. If !emitTail: in this case, only the main function will be called. The
+  // main function may be a scalar function, may be a predicated vector
+  // function, or may be an unpredicated vector function where the local size is
+  // known to be a multiple of the vectorization factor.
+  //
+  // 2. Otherwise, if tailInfo->IsVectorPredicated: in this case, the main
+  // function will be unpredicated and will be called for any multiples of vf,
+  // and one tail call will handle any remainder. vf of the main function and
+  // the tail function are the same.
+  //
+  // 3. Otherwise, if hasNoExplicitSubgroups(refF): in this case, the main
+  // function will be unpredicated and will be called for any multiples of vf,
+  // and one tail loop will handle any remainder. vf of the main function is
+  // used.
+  //
+  // 4. Otherwise: if local_size_x is a multiple of the main function's vf, the
+  // main function will handle the full loop and the main function's vf is used,
+  // else the tail function will handle the full loop and the tail function's vf
+  // is used.
+  //
+  // Unless hasNoExplicitSubgroups(refF), the subgroups are calculated as
+  //
+  //    get_max_sub_group_size() = min(vf, local_size_x)
+  //    get_num_sub_groups() = ((local_size_x + vector_width - 1) / vf)
+  //      * local_size_y * local_size_z
+  //
+  // If hasNoExplicitSubgroups(refF) (even for cases 1 and 2), the subgroups are
+  // not calculated.
+
+  const bool noExplicitSubgroups = hasNoExplicitSubgroups(refF);
 
   Value *mainLoopLimit = localSizeDim[workItemDim0];
   Value *peel = nullptr;
+
+  Value *effectiveVF = VF;
+
   if (emitTail) {
-    peel = entryIR.CreateSRem(mainLoopLimit, VF, "peel");
+    auto *const rem = entryIR.CreateSRem(mainLoopLimit, VF, "rem");
+    if (tailInfo->IsVectorPredicated || noExplicitSubgroups) {
+      peel = rem;
+    } else {
+      // We must have no more than one iteration with a subgroup size below the
+      // maximum subgroup size. To meet this requirement, if the tail is scalar
+      // and the vector size does not divide the workgroup size, do not use the
+      // vectorized kernel at all.
+      auto *const remcond = entryIR.CreateICmpNE(
+          rem, Constant::getNullValue(rem->getType()), "remcond");
+      peel = entryIR.CreateSelect(
+          remcond, mainLoopLimit,
+          Constant::getNullValue(mainLoopLimit->getType()), "peel");
+      effectiveVF = entryIR.CreateSelect(
+          remcond, materializeVF(entryIR, barrierTail->getVFInfo().vf), VF);
+    }
     mainLoopLimit = entryIR.CreateSub(mainLoopLimit, peel, "mainLoopLimit");
   }
 
-  // Set the number of subgroups in this kernel
-  {
+  // Set the subgroup maximum size and number of subgroups in this kernel
+  // wrapper.
+  if (!noExplicitSubgroups) {
+    auto setMaxSubgroupSizeFn =
+        BI.getOrDeclareMuxBuiltin(eMuxBuiltinSetMaxSubGroupSize, M);
+    assert(setMaxSubgroupSizeFn && "Missing __mux_set_max_sub_group_size");
     auto setNumSubgroupsFn =
         BI.getOrDeclareMuxBuiltin(eMuxBuiltinSetNumSubGroups, M);
     assert(setNumSubgroupsFn && "Missing __mux_set_num_sub_groups");
-    // First, compute Z * Y
-    auto *const numSubgroupsZY = entryIR.CreateMul(
-        localSizeDim[workItemDim2], localSizeDim[workItemDim1], "sg.zy");
-    // Now multiply by the number of subgroups in the X dimension.
-    auto *numSubgroupsX = entryIR.CreateUDiv(mainLoopLimit, VF, "sg.main.x");
-    // Add on any tail iterations here.
-    if (peel) {
-      numSubgroupsX = entryIR.CreateAdd(numSubgroupsX, peel, "sg.x");
-    } else if (isVectorPredicated) {
-      // Vector predication will use an extra subgroup to mop up any remainder.
-      auto *const leftover = entryIR.CreateSRem(mainLoopLimit, VF, "peel");
-      auto *hasLeftover = entryIR.CreateICmp(
-          CmpInst::ICMP_NE, leftover, ConstantInt::get(leftover->getType(), 0),
-          "sg.has.vp");
-      hasLeftover = entryIR.CreateZExt(hasLeftover, numSubgroupsX->getType());
-      numSubgroupsX = entryIR.CreateAdd(numSubgroupsX, hasLeftover, "sg.x");
-    }
-    auto *numSubgroups =
-        entryIR.CreateMul(numSubgroupsZY, numSubgroupsX, "sg.zyx");
-    if (numSubgroups->getType() != i32Ty) {
-      numSubgroups = entryIR.CreateTrunc(numSubgroups, i32Ty);
-    }
-    entryIR.CreateCall(setNumSubgroupsFn, {numSubgroups});
+    auto *const localSizeInVecDim = localSizeDim[workItemDim0];
+    auto *const localSizeInNonVecDim = entryIR.CreateMul(
+        localSizeDim[workItemDim1], localSizeDim[workItemDim2], "wg.yz");
+    auto *maxSubgroupSize = entryIR.CreateBinaryIntrinsic(
+        Intrinsic::umin, localSizeInVecDim, effectiveVF, {}, "sg.x");
+    entryIR.CreateCall(setMaxSubgroupSizeFn,
+                       {entryIR.CreateTrunc(maxSubgroupSize, i32Ty)});
+    auto *const numSubgroupsInVecDim = entryIR.CreateUDiv(
+        entryIR.CreateAdd(
+            localSizeInVecDim,
+            entryIR.CreateSub(effectiveVF,
+                              ConstantInt::get(effectiveVF->getType(), 1))),
+        effectiveVF, "sgs.x");
+    auto *const numSubgroups =
+        entryIR.CreateMul(numSubgroupsInVecDim, localSizeInNonVecDim, "sgs");
+    entryIR.CreateCall(setNumSubgroupsFn,
+                       {entryIR.CreateTrunc(numSubgroups, i32Ty)});
   }
 
   if (barrierMain.hasLiveVars()) {
@@ -1530,7 +1602,7 @@ Function *compiler::utils::WorkItemLoopsPass::makeWrapperFunction(
     // This catches cases where we need two loop iterations, e.g., VF=4 and
     // size=7, where rounding down would give one.
     Value *numerator = mainLoopLimit;
-    if (isVectorPredicated) {
+    if (mainInfo.IsVectorPredicated) {
       Value *const vf_minus_1 =
           entryIR.CreateSub(VF, ConstantInt::get(VF->getType(), 1));
       numerator = entryIR.CreateAdd(mainLoopLimit, vf_minus_1);
@@ -1546,7 +1618,7 @@ Function *compiler::utils::WorkItemLoopsPass::makeWrapperFunction(
   // barriers, even when the main kernel does not.
   if (emitTail && barrierTail->hasLiveVars()) {
     Value *size0 = peel;
-    if (barrierTail->getVFInfo().IsVectorPredicated) {
+    if (tailInfo->IsVectorPredicated) {
       // If the tail is predicated, it will only have a single (vectorized) item
       // along the X axis, or none.
       auto *const hasLeftover = entryIR.CreateICmp(
@@ -1584,8 +1656,8 @@ Function *compiler::utils::WorkItemLoopsPass::makeWrapperFunction(
   schedule.wrapperDbgLoc = wrapperDbgLoc;
   schedule.nextID = nextID;
   schedule.mainLoopLimit = mainLoopLimit;
+  schedule.noExplicitSubgroups = noExplicitSubgroups;
   schedule.emitTail = emitTail;
-  schedule.isVectorPredicated = isVectorPredicated;
   schedule.peel = peel;
 
   // Make call instruction for first new kernel. It follows wrapper function's
@@ -1726,45 +1798,6 @@ Function *compiler::utils::WorkItemLoopsPass::makeWrapperFunction(
   bbs[kBarrier_EndID]->moveAfter(&new_wrapper->back());
   bbs[kBarrier_EndID]->setName("kernel.exit");
 
-  // Set the subgroup maximum size in this kernel wrapper.
-  // There are three cases:
-  //
-  // 1. With no vectorization:
-  //    get_max_sub_group_size() = mux sub-group size
-  //
-  // 2. With predicated vectorization:
-  //    get_max_sub_group_size() = min(vector_width,
-  //    local_size_in_vectorization_dimension)
-  //
-  // 3. Without predicated vectorization:
-  //    get_max_sub_group_size() = local_size_in_vectorization_dimension
-  //    < vector_width ? mux sub-group size : vector_width
-  {
-    // Reset the insertion point back to the wrapper entry block, after VF was
-    // materialized.
-    entryIR.SetInsertPoint(setMaxSubgroupSizeInsertPt);
-    auto setMaxSubgroupSizeFn =
-        BI.getOrDeclareMuxBuiltin(eMuxBuiltinSetMaxSubGroupSize, M);
-    assert(setMaxSubgroupSizeFn && "Missing __mux_set_max_sub_group_size");
-    // Assume no vectorization to begin with i.e. get_max_sub_group_size() = mux
-    // sub-group size.
-    Value *maxSubgroupSize = entryIR.getInt32(getMuxSubgroupSize(refF));
-    if (schedule.wrapperHasMain) {
-      auto *localSizeInVecDim = localSizeDim[workItemDim0];
-      auto *cmp = entryIR.CreateICmpULT(localSizeInVecDim, VF);
-      if (isVectorPredicated) {
-        maxSubgroupSize = entryIR.CreateSelect(cmp, localSizeInVecDim, VF);
-      } else {
-        maxSubgroupSize = entryIR.CreateSelect(
-            cmp, ConstantInt::get(VF->getType(), getMuxSubgroupSize(refF)), VF);
-      }
-      if (maxSubgroupSize->getType() != i32Ty) {
-        maxSubgroupSize = entryIR.CreateTrunc(maxSubgroupSize, i32Ty);
-      }
-    }
-    entryIR.CreateCall(setMaxSubgroupSizeFn, {maxSubgroupSize});
-  }
-
   // Remap any constant expression which take a reference to the old function
   // FIXME: What about the main function?
   for (auto *user : make_early_inc_range(refF.users())) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
index 67c256ffb4cec..2b5c3aa9e3441 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
@@ -1302,9 +1302,10 @@ bool ControlFlowConversionState::Impl::tryApplyMaskToMemOp(
           Ctx, memOp.getDataOperand(), memOp.getPointerOperand(), wideMask,
           /*VL*/ nullptr, memOp.getAlignment(), I->getName());
     }
+    VECZ_FAIL_IF(!newVal);
+
     newVal->insertBefore(I->getIterator());
 
-    VECZ_FAIL_IF(!newVal);
     if (!I->getType()->isVoidTy()) {
       I->replaceAllUsesWith(newVal);
     }

From 94c6f9499c845d3776120e60f5e3bb8163c45061 Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Wed, 22 Jan 2025 15:53:38 +0000
Subject: [PATCH 136/182] [vecz] Handle missing gather/scatter functions.

If we encounter types we do not know how to mangle, we cannot generate
gather or scatter functions. Just use scalar loads and stores for this.
---
 .../vecz/source/transform/packetizer.cpp      |  2 +
 .../vecz/test/lit/llvm/packetize_i48.ll       | 50 +++++++++++++++++++
 2 files changed, 52 insertions(+)
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_i48.ll

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
index 2299c8a22da7a..2e68e53306284 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -2642,12 +2642,14 @@ ValuePacket Packetizer::Impl::packetizeMemOp(MemOp &op) {
         auto *gather =
             createGather(Ctx, packetVecTy, ptrPacket[i], maskPacket[i], EVL,
                          op.getAlignment(), name);
+        PACK_FAIL_IF(!gather);
         gather->insertBefore(op.getInstr()->getIterator());
         results.push_back(gather);
       } else {
         auto *scatter =
             createScatter(Ctx, dataPacket[i], ptrPacket[i], maskPacket[i], EVL,
                           op.getAlignment(), name);
+        PACK_FAIL_IF(!scatter);
         scatter->insertBefore(op.getInstr()->getIterator());
         results.push_back(scatter);
       }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_i48.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_i48.ll
new file mode 100644
index 0000000000000..d86793e433c67
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_i48.ll
@@ -0,0 +1,50 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k test -vecz-simd-width=4 -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare i64 @__mux_get_local_id(i32)
+
+define spir_kernel void @test(ptr %0, ptr %1) {
+entry:
+  %lid = tail call i64 @__mux_get_local_id(i32 0)
+  %ptr.0 = getelementptr i32, ptr %0, i64 %lid
+  %ptr.1 = getelementptr i32, ptr %1, i64 %lid
+  %val = load i48, ptr %ptr.0
+  store i48 %val, ptr %ptr.1
+  ret void
+}
+
+; CHECK-LABEL: define spir_kernel void @test
+; CHECK: load i48
+; CHECK-NOT: load i48
+; CHECK: store i48
+; CHECK-NOT: store i48
+
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_test
+; CHECK: load i48
+; CHECK: load i48
+; CHECK: load i48
+; CHECK: load i48
+; CHECK-NOT: load i48
+; CHECK: store i48
+; CHECK: store i48
+; CHECK: store i48
+; CHECK: store i48
+; CHECK-NOT: store i48

From 15fdf9fdef241b32508ea74d52bfd0347c1fc95b Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Fri, 24 Jan 2025 13:18:00 +0000
Subject: [PATCH 137/182] Update for LLVM 20.

* dib.createObjectPointerType now takes an extra argument indicating
  whether we have an implicit or an explicit 'this'. Pass that along.
* PointerUnion::get has been deprecated in favor of llvm::cast. Since
  llvm::cast works on LLVM 19 too, just use it unconditionally.
---
 .../compiler_pipeline/source/work_item_loops_pass.cpp  | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
index 6ea8d484fc16d..e1cebe3463b0e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
@@ -292,9 +292,8 @@ struct ScheduleGenerator {
       // Create intrinsic
 #if LLVM_VERSION_GREATER_EQUAL(19, 0)
       if (!module.IsNewDbgInfoFormat) {
-        auto *const DII = DIB.insertDeclare(barrier.getDebugAddr(), new_var,
-                                            expr, wrapperDbgLoc, block)
-                              .get<Instruction *>();
+        auto *const DII = cast<Instruction *>(DIB.insertDeclare(
+            barrier.getDebugAddr(), new_var, expr, wrapperDbgLoc, block));
 
         // Bit of a HACK to produce the same debug output as the Mem2Reg
         // pass used to do.
@@ -302,9 +301,8 @@ struct ScheduleGenerator {
         ConvertDebugDeclareToDebugValue(DVIntrinsic, SI, DIB);
       } else {
         auto *const DVR = static_cast<DbgVariableRecord *>(
-            DIB.insertDeclare(barrier.getDebugAddr(), new_var, expr,
-                              wrapperDbgLoc, block)
-                .get<DbgRecord *>());
+            cast<DbgRecord *>(DIB.insertDeclare(barrier.getDebugAddr(), new_var,
+                                                expr, wrapperDbgLoc, block)));
 
         // This is nasty, but LLVM errors out on trailing debug info, we need a
         // subsequent instruction even if we delete it immediately afterwards.

From e57031f46bef7f592a345e2933696ca65d832fe9 Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Fri, 24 Jan 2025 14:51:47 +0000
Subject: [PATCH 138/182] [vecz] Handle vectors of size 1.

Vectors of size 1 are valid, but were internally getting mishandled by
the packetization treating them as scalars. Check whether the vecz
source instruction was using vectors and if so, keep it using vectors.
---
 .../transform/packetization_helpers.cpp       |  2 +-
 .../vecz/test/lit/llvm/vector_size_1.ll       | 38 +++++++++++++++++++
 2 files changed, 39 insertions(+), 1 deletion(-)
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_size_1.ll

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
index d515a87acd7c2..10a98067fee77 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
@@ -486,7 +486,7 @@ PacketRange Packetizer::Result::getAsPacket(unsigned width) const {
   if (auto *const vecTy = dyn_cast<FixedVectorType>(vec->getType())) {
     assert(isa<FixedVectorType>(vecTy) && "Must be a fixed vector type here!");
     const unsigned scalarWidth = vecTy->getNumElements() / width;
-    if (scalarWidth > 1) {
+    if (scalarWidth > 1 || scalar->getType()->isVectorTy()) {
       auto *const undef = UndefValue::get(vec->getType());
 
       // Build shuffle mask to perform the subvector extracts.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_size_1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_size_1.ll
new file mode 100644
index 0000000000000..3b6d84de22076
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_size_1.ll
@@ -0,0 +1,38 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k test -vecz-simd-width=4 -S < %s | FileCheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test(ptr %src, ptr %dst) {
+entry:
+  %lid = tail call i32 @__mux_get_sub_group_local_id()
+  %lid.i64 = zext i32 %lid to i64
+  %src.i = getelementptr i64, ptr %src, i64 %lid.i64
+  %val = load <1 x i64>, ptr %src.i, align 8
+  %vec = shufflevector <1 x i64> %val, <1 x i64> zeroinitializer, <8 x i32> zeroinitializer
+  %dst.i = getelementptr <8 x i64>, ptr %dst, i64 %lid.i64
+  store <8 x i64> %vec, ptr %dst.i, align 16
+  ret void
+}
+
+; CHECK-LABEL: define spir_kernel void @test
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_test
+
+declare i32 @__mux_get_sub_group_local_id()

From 40b912c7e906368694959c481457e1256a2b1ab7 Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Wed, 29 Jan 2025 10:29:32 +0000
Subject: [PATCH 139/182] LLVM 20: Update for getFirstNonPHIOrDbg.

LLVM 20 updates getFirstNonPHIOrDbg() to return an iterator rather than
an Instruction *. We can use &* to reliably get an Instruction * across
LLVM versions.

The variable name insert_point suggests that we should instead change
the type of the variable to be an iterator, but despite the name, it is
not used merely as an insert point. This should be revisited in the
future but may result in changes in behavior that should probably be
kept separate from any compatibility fixes.
---
 .../compiler_pipeline/source/barrier_regions.cpp              | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
index 07c8b97082a5c..11847ededd12e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
@@ -1185,7 +1185,7 @@ Function *compiler::utils::Barrier::GenerateNewKernel(BarrierRegion &region) {
   }
 
   BasicBlock *new_kernel_entry_block = &(new_kernel->getEntryBlock());
-  Instruction *insert_point = new_kernel_entry_block->getFirstNonPHIOrDbg();
+  Instruction *insert_point = &*new_kernel_entry_block->getFirstNonPHIOrDbg();
   auto *const cloned_barrier_call =
       region.barrier_inst ? insert_point : nullptr;
 
@@ -1290,7 +1290,7 @@ Function *compiler::utils::Barrier::GenerateNewKernel(BarrierRegion &region) {
   }
 
   // Iterate instruction from insert point at entry basic block.
-  insert_point = new_kernel_entry_block->getFirstNonPHIOrDbg();
+  insert_point = &*new_kernel_entry_block->getFirstNonPHIOrDbg();
   const RemapFlags remapFlags =
       RF_IgnoreMissingLocals | llvm::RF_ReuseAndMutateDistinctMDs;
   BasicBlock::iterator b_iter = insert_point->getIterator();

From a5ca431e79be2011766bbecd53a8ee8e6db3bdde Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Thu, 13 Feb 2025 14:38:52 +0000
Subject: [PATCH 140/182] LLVM 21: Update for Attribute::NoCapture removal.

LLVM 21 removes the nocapture attribute in favor of captures(none). Make
sure we handle that.

In tests where nocapture was not relevant to what was being tested and
where it can be removed, remove it. In other tests, change the CHECK
lines to permit both nocapture and captures(none).
---
 .../vecz/test/lit/llvm/VectorPredication/boscc_reduction.ll   | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/boscc_reduction.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/boscc_reduction.ll
index d23ac4380018b..9ce6828f9a284 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/boscc_reduction.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/boscc_reduction.ll
@@ -22,7 +22,7 @@ target triple = "spir64-unknown-unknown"
 
 declare i64 @__mux_get_global_id(i32)
 
-define spir_kernel void @foo(float addrspace(1)* nocapture readonly %a, i32 addrspace(1)* nocapture %out) {
+define spir_kernel void @foo(float addrspace(1)* readonly %a, i32 addrspace(1)* %out) {
 entry:
   %call = tail call i64 @__mux_get_global_id(i32 0) #2
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 %call
@@ -41,6 +41,6 @@ if.end:                                           ; preds = %if.then, %entry
   ret void
 }
 
-; CHECK: define spir_kernel void @__vecz_nxv2_vp_foo(ptr addrspace(1) nocapture readonly %a, ptr addrspace(1) nocapture %out)
+; CHECK: define spir_kernel void @__vecz_nxv2_vp_foo(ptr addrspace(1) readonly %a, ptr addrspace(1) %out)
 ; CHECK:  [[CMP:%.*]] = fcmp oeq <vscale x 2 x float> %{{.*}}, zeroinitializer
 ; CHECK:  %{{.*}} = call i1 @llvm.vp.reduce.or.nxv2i1(i1 false, <vscale x 2 x i1> [[CMP]], {{.*}}, i32 {{.*}})

From 04d65a4f01ad2c7ef2cd0a98eee2028d36090548 Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Tue, 18 Feb 2025 10:11:49 +0000
Subject: [PATCH 141/182] Update for LLVM 20, 21.

More functions have had their overloads taking Instruction * deprecated
in favour of using iterators to mark insertion points. Make the
suggested changes.

For DIBuilder, use a new multi_llvm::DIBuilder class to remain
compatible with LLVM 18 and 19.
---
 .../include/multi_llvm/dibuilder.h            | 108 ++++++++++++++++++
 .../source/barrier_regions.cpp                |   4 +-
 .../source/pass_functions.cpp                 |   6 +-
 .../source/transform/instantiation_pass.cpp   |   2 +-
 .../source/transform/pre_linearize_pass.cpp   |   2 +-
 .../vecz/source/transform/scalarizer.cpp      |  16 +--
 .../vecz/source/vectorization_helpers.cpp     |  12 +-
 7 files changed, 131 insertions(+), 19 deletions(-)
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/dibuilder.h

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/dibuilder.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/dibuilder.h
new file mode 100644
index 0000000000000..b7276c25d77c8
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/dibuilder.h
@@ -0,0 +1,108 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef MULTI_LLVM_DIBUILDER_H_INCLUDED
+#define MULTI_LLVM_DIBUILDER_H_INCLUDED
+
+#include <llvm/IR/DIBuilder.h>
+#include <multi_llvm/llvm_version.h>
+
+#include <type_traits>
+
+namespace multi_llvm {
+// TODO In order to enable the use of OCK in DPC++ which currently uses the
+// older DIBuilder interface, we do not yet condition this on LLVM version, we
+// dynamically detect which version of DIBuilder we have. This should be updated
+// after DPC++'s next pulldown to drop the use of DIBuilderWrapperNeeded and
+// base it entirely on LLVM major version.
+#if LLVM_VERSION_GREATER_EQUAL(20, 0) && 0
+using DIBuilder = llvm::DIBuilder;
+#else
+template <typename DIBuilder>
+struct DIBuilderWrapper : DIBuilder {
+  using DIBuilder::DIBuilder;
+
+#if LLVM_VERSION_GREATER_EQUAL(19, 0)
+  llvm::BasicBlock *getBasicBlock(llvm::InsertPosition InsertPt) {
+    return InsertPt.getBasicBlock();
+  }
+#else
+  llvm::BasicBlock *getBasicBlock(llvm::BasicBlock::iterator InsertPt) {
+    // Cannot handle sentinels.
+    return InsertPt->getParent();
+  }
+#endif
+
+  auto insertDeclare(llvm::Value *Storage, llvm::DILocalVariable *VarInfo,
+                     llvm::DIExpression *Expr, const llvm::DILocation *DL,
+                     llvm::BasicBlock *InsertAtEnd) {
+    return DIBuilder::insertDeclare(Storage, VarInfo, Expr, DL, InsertAtEnd);
+  }
+
+  auto insertDeclare(llvm::Value *Storage, llvm::DILocalVariable *VarInfo,
+                     llvm::DIExpression *Expr, const llvm::DILocation *DL,
+                     llvm::BasicBlock::iterator InsertPt) {
+    auto *InsertBB = getBasicBlock(InsertPt);
+    if (InsertPt == InsertBB->end()) {
+      return DIBuilder::insertDeclare(Storage, VarInfo, Expr, DL, InsertBB);
+    } else {
+      return DIBuilder::insertDeclare(Storage, VarInfo, Expr, DL, &*InsertPt);
+    }
+  }
+
+  auto insertDbgValueIntrinsic(llvm::Value *Val, llvm::DILocalVariable *VarInfo,
+                               llvm::DIExpression *Expr,
+                               const llvm::DILocation *DL,
+                               llvm::BasicBlock *InsertAtEnd) {
+    return DIBuilder::insertDbgValueIntrinsic(Val, VarInfo, Expr, DL,
+                                              InsertAtEnd);
+  }
+
+  auto insertDbgValueIntrinsic(llvm::Value *Val, llvm::DILocalVariable *VarInfo,
+                               llvm::DIExpression *Expr,
+                               const llvm::DILocation *DL,
+                               llvm::BasicBlock::iterator InsertPt) {
+    auto *InsertBB = getBasicBlock(InsertPt);
+    if (InsertPt == InsertBB->end()) {
+      return DIBuilder::insertDbgValueIntrinsic(Val, VarInfo, Expr, DL,
+                                                InsertBB);
+    } else {
+      return DIBuilder::insertDbgValueIntrinsic(Val, VarInfo, Expr, DL,
+                                                &*InsertPt);
+    }
+  }
+};
+
+template <typename DIBuilder, typename = void>
+static constexpr bool DIBuilderWrapperNeeded = true;
+
+template <typename DIBuilder>
+static constexpr bool DIBuilderWrapperNeeded<
+    DIBuilder, std::void_t<decltype(std::declval<DIBuilder &>().insertLabel(
+                   std::declval<llvm::DILabel *>(),
+                   std::declval<const llvm::DILocation *>(),
+                   std::declval<llvm::BasicBlock::iterator>()))>> = false;
+
+template <typename DIBuilder>
+using DIBuilderMaybeWrapped =
+    std::conditional_t<DIBuilderWrapperNeeded<DIBuilder>,
+                       DIBuilderWrapper<DIBuilder>, DIBuilder>;
+
+using DIBuilder = DIBuilderMaybeWrapped<llvm::DIBuilder>;
+#endif
+}  // namespace multi_llvm
+
+#endif  // MULTI_LLVM_DIBUILDER_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
index 11847ededd12e..8d6c78af553f8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
@@ -1151,9 +1151,9 @@ Function *compiler::utils::Barrier::GenerateNewKernel(BarrierRegion &region) {
         // multiple return instructions in a kernel, if it does then clone
         // the instruction first.
         if (nullptr == entry_call->getParent()) {
-          entry_call->insertBefore(new_ret);
+          entry_call->insertBefore(new_ret->getIterator());
         } else {
-          entry_call->clone()->insertBefore(new_ret);
+          entry_call->clone()->insertBefore(new_ret->getIterator());
         }
       }
     } else if (ReturnInst *ret =
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp
index d13a1751507f1..392ee922235dc 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp
@@ -208,7 +208,7 @@ void replaceConstantExpressionWithInstruction(llvm::Constant *const constant) {
             llvm::dyn_cast<llvm::ConstantExpr>(constant)) {
       newInst = constantExpr->getAsInstruction();
       // insert the instruction at the beginning of the entry block
-      newInst->insertBefore(useFunc->getEntryBlock().getFirstNonPHI());
+      newInst->insertBefore(useFunc->getEntryBlock().getFirstNonPHIIt());
     } else if (llvm::ConstantVector *constantVec =
                    llvm::dyn_cast<llvm::ConstantVector>(constant)) {
       // If it is a ConstantVector then only handle the case where it is
@@ -226,7 +226,7 @@ void replaceConstantExpressionWithInstruction(llvm::Constant *const constant) {
       llvm::Type *i32Ty = llvm::Type::getInt32Ty(constant->getContext());
       auto insert = llvm::InsertElementInst::Create(
           undef, splatVal, llvm::ConstantInt::get(i32Ty, 0));
-      insert->insertBefore(useFunc->getEntryBlock().getFirstNonPHI());
+      insert->insertBefore(useFunc->getEntryBlock().getFirstNonPHIIt());
       llvm::Value *zeros = llvm::ConstantAggregateZero::get(
           llvm::FixedVectorType::get(i32Ty, numEls));
       newInst = new llvm::ShuffleVectorInst(insert, undef, zeros);
@@ -242,7 +242,7 @@ void replaceConstantExpressionWithInstruction(llvm::Constant *const constant) {
         if (insertedIns) {
           insertNext->insertAfter(insertedIns);
         } else {
-          insertNext->insertBefore(useFunc->getEntryBlock().getFirstNonPHI());
+          insertNext->insertBefore(useFunc->getEntryBlock().getFirstNonPHIIt());
         }
         insertedIns = insertNext;
       }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/instantiation_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/instantiation_pass.cpp
index c6f69afa0abf2..c8d5cae91bc33 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/instantiation_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/instantiation_pass.cpp
@@ -282,7 +282,7 @@ PacketRange InstantiationPass::instantiateByCloning(Instruction *I) {
       continue;
     }
     Instruction *Clone = I->clone();
-    Clone->insertBefore(I);
+    Clone->insertBefore(I->getIterator());
     P[i] = Clone;
     Clones.push_back(Clone);
   }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/pre_linearize_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/pre_linearize_pass.cpp
index c76152a57f3b7..e76564902b9d4 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/pre_linearize_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/pre_linearize_pass.cpp
@@ -138,7 +138,7 @@ bool hoistInstructions(BasicBlock &BB, BranchInst &Branch, bool exceptions) {
   bool modified = false;
   while (!BB.front().isTerminator()) {
     auto &I = BB.front();
-    I.moveBefore(&Branch);
+    I.moveBefore(*Branch.getParent(), Branch.getIterator());
     modified = true;
 
     if (!exceptions) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
index b812d5b371f1e..4b05e3b52d03d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
@@ -20,7 +20,6 @@
 #include <llvm/ADT/SmallPtrSet.h>
 #include <llvm/ADT/Statistic.h>
 #include <llvm/Analysis/InstructionSimplify.h>
-#include <llvm/IR/DIBuilder.h>
 #include <llvm/IR/DebugInfoMetadata.h>
 #include <llvm/IR/IRBuilder.h>
 #include <llvm/IR/Instructions.h>
@@ -28,6 +27,7 @@
 #include <llvm/IR/Intrinsics.h>
 #include <llvm/Support/Debug.h>
 #include <llvm/Support/raw_ostream.h>
+#include <multi_llvm/dibuilder.h>
 #include <multi_llvm/multi_llvm.h>
 #include <multi_llvm/vector_type_helper.h>
 
@@ -682,7 +682,7 @@ void Scalarizer::scalarizeDI(Instruction *Original, const SimdPacket *Packet,
   // instructions and is used to avoid duplicate LLVM dbg.value's.
   SmallPtrSet<Value *, 4> VectorElements;
 
-  DIBuilder DIB(*Original->getModule(), false);
+  multi_llvm::DIBuilder DIB(*Original->getModule(), false);
 
   auto CreateAndInsertDIExpr = [&](auto InsertDIExpr) {
     const auto bitSize = Original->getType()->getScalarSizeInBits();
@@ -726,7 +726,8 @@ void Scalarizer::scalarizeDI(Instruction *Original, const SimdPacket *Packet,
 
     // Create new DbgVariableRecord across enabled SIMD lanes
     CreateAndInsertDIExpr([&](Value *LaneVal, DIExpression *DIExpr) {
-      DIB.insertDbgValueIntrinsic(LaneVal, DILocal, DIExpr, DILoc, Original);
+      DIB.insertDbgValueIntrinsic(LaneVal, DILocal, DIExpr, DILoc,
+                                  Original->getIterator());
     });
   }
 #endif
@@ -753,10 +754,11 @@ void Scalarizer::scalarizeDI(Instruction *Original, const SimdPacket *Packet,
     }
 
     // Create new llvm.dbg.value() intrinsic across enabled SIMD lanes
-    CreateAndInsertDIExpr([&](Value *const LaneVal,
-                              DIExpression *const DIExpr) {
-      DIB.insertDbgValueIntrinsic(LaneVal, DILocal, DIExpr, DILoc, Original);
-    });
+    CreateAndInsertDIExpr(
+        [&](Value *const LaneVal, DIExpression *const DIExpr) {
+          DIB.insertDbgValueIntrinsic(LaneVal, DILocal, DIExpr, DILoc,
+                                      Original->getIterator());
+        });
   }
 }
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_helpers.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_helpers.cpp
index 7308a828fcc29..9268d67116627 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_helpers.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_helpers.cpp
@@ -19,11 +19,11 @@
 #include <compiler/utils/attributes.h>
 #include <compiler/utils/metadata.h>
 #include <llvm/IR/Attributes.h>
-#include <llvm/IR/DIBuilder.h>
 #include <llvm/IR/IntrinsicInst.h>
 #include <llvm/Support/Debug.h>
 #include <llvm/Support/TypeSize.h>
 #include <llvm/Transforms/Utils/Cloning.h>
+#include <multi_llvm/dibuilder.h>
 
 #include <optional>
 
@@ -252,7 +252,7 @@ Function *cloneFunctionToVector(const VectorizationUnit &VU) {
   }
 
   for (auto *Placeholder : Placeholders) {
-    Placeholder->insertBefore(&*InsertPt);
+    Placeholder->insertBefore(InsertPt);
   }
 
   return VectorizedFn;
@@ -274,7 +274,7 @@ void cloneDebugInfo(const VectorizationUnit &VU) {
   }
 
   // Create a DISubprogram entry for the vectorized kernel
-  DIBuilder DIB(*VU.scalarFunction()->getParent(), false);
+  multi_llvm::DIBuilder DIB(*VU.scalarFunction()->getParent(), false);
   DICompileUnit *CU =
       DIB.createCompileUnit(dwarf::DW_LANG_OpenCL, ScalarDI->getFile(), "",
                             ScalarDI->isOptimized(), "", 0);
@@ -359,7 +359,8 @@ void cloneDebugInfo(const VectorizationUnit &VU) {
 
           // New llvm.dbg.value() with correct scope
           DIB.insertDbgValueIntrinsic(DVI->getValue(), VectorLocal,
-                                      DVI->getExpression(), VectorLoc, DVI);
+                                      DVI->getExpression(), VectorLoc,
+                                      DVI->getIterator());
         } else if (DbgDeclareInst *const DDI = dyn_cast<DbgDeclareInst>(DII)) {
           // Find DILocalVariable the intrinsic references
           const DILocalVariable *const ScalarLocal = DDI->getVariable();
@@ -379,7 +380,8 @@ void cloneDebugInfo(const VectorizationUnit &VU) {
 
           // New llvm.dbg.declare() with correct scope
           DIB.insertDeclare(DDI->getAddress(), VectorLocal,
-                            DDI->getExpression(), VectorLoc, DDI);
+                            DDI->getExpression(), VectorLoc,
+                            DDI->getIterator());
         } else {
           continue;  // No other DbgInfoIntrinsic subclasses
         }

From 99b1f03940503db2ba5deeea2d0006a817abbe01 Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Fri, 14 Mar 2025 13:34:48 +0000
Subject: [PATCH 142/182] Update for LLVM 21.

LLVM 21 has made changes to have modules store a triple, rather than a
triple string, and several APIs were adjusted accordingly. Update OCK to
handle the new APIs.
---
 .../vecz/test/lit/llvm/packetization_debug_info.ll       | 4 ++--
 .../compiler_passes/vecz/tools/source/veczc.cpp          | 9 +++++++++
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_debug_info.ll
index 688d5be8fd10c..963c8f2da6ac4 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_debug_info.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_debug_info.ll
@@ -56,7 +56,7 @@ entry:
   call void @llvm.dbg.declare(metadata i64* %tid, metadata !14, metadata !29), !dbg !31
   %call = call i64 @__mux_get_global_id(i32 0) #3, !dbg !31
   store i64 %call, i64* %tid, align 8, !dbg !31
-; CHECK-GE19: #dbg_value(i32 undef, [[DI_A:![0-9]+]], !DIExpression(),
+; CHECK-GE19: #dbg_value(i32 {{undef|poison}}, [[DI_A:![0-9]+]], !DIExpression(),
 ; CHECK-LT19: call void @llvm.dbg.value(metadata i32 undef, metadata [[DI_A:![0-9]+]], metadata !DIExpression())
 ; CHECK-SAME: [[A_LOC:![0-9]+]]
   call void @llvm.dbg.declare(metadata i32* %a, metadata !19, metadata !29), !dbg !32
@@ -65,7 +65,7 @@ entry:
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %1, i64 %0, !dbg !32
   %2 = load i32, i32 addrspace(1)* %arrayidx, align 4, !dbg !32
   store i32 %2, i32* %a, align 4, !dbg !32
-; CHECK-GE19: #dbg_value(i32 undef, [[DI_B:![0-9]+]], !DIExpression(),
+; CHECK-GE19: #dbg_value(i32 {{undef|poison}}, [[DI_B:![0-9]+]], !DIExpression(),
 ; CHECK-LT19: call void @llvm.dbg.value(metadata i32 undef, metadata [[DI_B:![0-9]+]], metadata !DIExpression())
 ; CHECK-SAME: [[B_LOC:![0-9]+]]
   call void @llvm.dbg.declare(metadata i32* %b, metadata !20, metadata !29), !dbg !33
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp
index 9ea491d9c1e66..197c4e9bfc928 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp
@@ -141,9 +141,14 @@ static llvm::TargetMachine *initLLVMTarget(llvm::StringRef triple_string,
   }
   llvm::PassRegistry &registry = *llvm::PassRegistry::getPassRegistry();
   llvm::initializeAlwaysInlinerLegacyPassPass(registry);
+#if LLVM_VERSION_GREATER_EQUAL(21, 0)
+  return target->createTargetMachine(triple, cpu_model, target_features, opts,
+                                     llvm::Reloc::Model::Static);
+#else
   return target->createTargetMachine(triple.getTriple(), cpu_model,
                                      target_features, opts,
                                      llvm::Reloc::Model::Static);
+#endif
 }
 
 static vecz::VeczPassOptions getDefaultPassOptions() {
@@ -362,7 +367,11 @@ int main(const int argc, const char *const argv[]) {
                         : nullptr);
   assert(!UserTriple.size() || tm);
   if (tm) {
+#if LLVM_VERSION_GREATER_EQUAL(21, 0)
+    module->setTargetTriple(tm->getTargetTriple());
+#else
     module->setTargetTriple(tm->getTargetTriple().getTriple());
+#endif
     module->setDataLayout(tm->createDataLayout());
   }
 

From d9efef4391318982071ef50377355e795100e6b9 Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Mon, 24 Mar 2025 11:54:59 +0000
Subject: [PATCH 143/182] Address clang & clang-tidy warnings.

When building as part of LLVM and/or with Clang, additional warnings are
raised that do not show up in our regular builds. This commit addresses
them. For the most part, these are NFC, but there are a few exceptions.

* hal_binary_encoder's constructor was failing to initialize device.
* A few code paths which are meant to be unreachable are handled
  differently to fail more reliably.
* One accidentally volatile store is changed to non-volatile.

Most of the other changes are in these categories:

* switch statements on enumeration types now either do not handle all
  enumerators and have a default case, or handle all enumerators and
  have no default case. For exhaustive switches, the code that used to
  be in the default case is placed after the switch statement instead,
  by having each case label end in a return, possibly enabled through
  an immediately invoked lambda expression.
* Parameters and variables which are not considered in scope of the
  .clang-tidy due to appearing in generated sources and/or in macro
  expansions from non-project headers are renamed in accordance with the
  clang-tidy naming rules, because the suppression of the rule does not
  affect these.
* Unused variables are removed, unused named parameters are anonymized.
* Virtual classes without a virtual destructor are prevented from being
  destroyed through a base class.
* Virtual methods which override a base class method are annotated with
  the override keyword, and without the virtual keyword.
* Virtual classes that are never derived from are made non-virtual
  instead.
---
 .../compiler_pipeline/source/attributes.cpp   |   3 +-
 .../source/barrier_regions.cpp                |  24 ++--
 .../compiler_pipeline/source/builtin_info.cpp |   3 +-
 .../source/cl_builtin_info.cpp                |   2 +-
 .../source/mux_builtin_info.cpp               | 118 +++++++++---------
 .../source/work_item_loops_pass.cpp           |   5 +-
 .../vecz/source/reachability.cpp              |  18 +--
 .../control_flow_conversion_pass.cpp          |  12 +-
 .../vecz/source/transform/scalarizer.cpp      |   2 +-
 .../vecz/source/vectorization_context.cpp     |  19 ++-
 10 files changed, 104 insertions(+), 102 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/attributes.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/attributes.cpp
index e04e5a0aef614..584a719cc13b0 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/attributes.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/attributes.cpp
@@ -152,9 +152,8 @@ std::optional<uint32_t> getDMAReqdSizeBytes(const Function &F) {
 static constexpr const char *BarrierScheduleAttrName = "mux-barrier-schedule";
 
 void setBarrierSchedule(CallInst &CI, BarrierSchedule Sched) {
-  StringRef Val;
+  StringRef Val = "unknown";
   switch (Sched) {
-    default:
     case BarrierSchedule::Unordered:
       Val = "unordered";
       break;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
index 8d6c78af553f8..33aeff7f1dc5d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
@@ -1044,11 +1044,15 @@ Function *compiler::utils::Barrier::GenerateNewKernel(BarrierRegion &region) {
   BasicBlock *entry_point = region.entry;
   LLVMContext &context = module_.getContext();
 
-  LLVM_DEBUG(dbgs() << "\n"; unsigned i = 0; for (auto *d : region.blocks) {
-    dbgs() << "entry block: " << entry_point->getName() << "\n";
-    dbgs() << "region visited path [" << i++ << "] = " << d->getName()
-           << "\n\n";
-    dbgs() << *d << "\n\n";
+  LLVM_DEBUG({
+    dbgs() << "\n";
+    unsigned I = 0;
+    for (auto *D : region.blocks) {
+      dbgs() << "entry block: " << entry_point->getName() << "\n";
+      dbgs() << "region visited path [" << I++ << "] = " << D->getName()
+             << "\n\n";
+      dbgs() << *D << "\n\n";
+    }
   });
 
   SmallVector<Type *, 8> new_func_params;
@@ -1475,14 +1479,14 @@ void compiler::utils::Barrier::SeperateKernelWithBarrier() {
   barrier_md->addOperand(num_barriers__md);
 
   LLVM_DEBUG({
-    for (const auto &kid : kernel_id_map_) {
-      dbgs() << "1. kernel_id[" << kid.first << "] = " << kid.second->getName()
+    for (const auto &Kid : kernel_id_map_) {
+      dbgs() << "1. kernel_id[" << Kid.first << "] = " << Kid.second->getName()
              << "\n";
     }
 
-    for (unsigned i = kBarrier_FirstID;
-         i < kernel_id_map_.size() + kBarrier_FirstID; i++) {
-      dbgs() << "2. kernel_id[" << i << "] = " << kernel_id_map_[i]->getName()
+    for (unsigned I = kBarrier_FirstID;
+         I < kernel_id_map_.size() + kBarrier_FirstID; I++) {
+      dbgs() << "2. kernel_id[" << I << "] = " << kernel_id_map_[I]->getName()
              << "\n";
     }
     dbgs() << "\n\n" << module_ << "\n\n";
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/builtin_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/builtin_info.cpp
index 217d3ea306ba9..f6c4076dd830a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/builtin_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/builtin_info.cpp
@@ -1147,8 +1147,6 @@ BuiltinID BuiltinInfo::getMuxGroupCollective(const GroupCollective &Group) {
 #define SIMPLE_SCOPE_SWITCH(OP)                     \
   do {                                              \
     switch (Group.Scope) {                          \
-      default:                                      \
-        llvm_unreachable("Impossible scope kind");  \
       case GroupCollective::ScopeKind::SubGroup:    \
         return eMuxBuiltinSubgroup##OP;             \
       case GroupCollective::ScopeKind::WorkGroup:   \
@@ -1156,6 +1154,7 @@ BuiltinID BuiltinInfo::getMuxGroupCollective(const GroupCollective &Group) {
       case GroupCollective::ScopeKind::VectorGroup: \
         return eMuxBuiltinVecgroup##OP;             \
     }                                               \
+    llvm_unreachable("Impossible scope kind");      \
   } while (0)
 
 #define COMPLEX_SCOPE_SWITCH(OP, SUFFIX)               \
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp
index 9490b236ac603..c78252cd0f9e2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp
@@ -2376,7 +2376,7 @@ Value *CLBuiltinInfo::emitBuiltinInlineVStoreHalf(Function *F, StringRef Mode,
   DataPtr = B.CreateGEP(U16Ty, DataPtr, Offset, "vstore_base");
 
   // Store the ushort.
-  return B.CreateStore(Data, DataPtr, "vstore_half");
+  return B.CreateStore(Data, DataPtr);
 }
 
 /// @brief Emit the body of a relational builtin function.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mux_builtin_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mux_builtin_info.cpp
index a01c67fc005f6..155d1380e0242 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mux_builtin_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mux_builtin_info.cpp
@@ -165,65 +165,67 @@ static Function *defineSubGroupGroupOpBuiltin(Function &F,
 
   IRBuilder<> B(BasicBlock::Create(F.getContext(), "entry", &F));
 
-  switch (GroupOp.Op) {
-    default:
-      llvm_unreachable("Unhandled group operation");
-    case GroupCollective::OpKind::Any:
-    case GroupCollective::OpKind::All:
-    case GroupCollective::OpKind::Broadcast:
-    case GroupCollective::OpKind::Reduction:
-    case GroupCollective::OpKind::ScanInclusive:
-      // In the trivial size=1 case, all of these operations just return the
-      // argument back again
-      B.CreateRet(Arg);
-      break;
-    case GroupCollective::OpKind::ScanExclusive: {
-      // In the trivial size=1 case, exclusive scans return the identity.
-      assert(!OverloadInfo.empty());
-      auto *const IdentityVal =
-          getIdentityVal(GroupOp.Recurrence, OverloadInfo[0]);
-      assert(IdentityVal && "Unable to deduce identity val");
-      B.CreateRet(IdentityVal);
-      break;
-    }
-    case GroupCollective::OpKind::Shuffle:
-    case GroupCollective::OpKind::ShuffleXor:
-      // In the trivial size=1 case, all of these operations just return the
-      // argument back again. Any computed shuffle index other than the only
-      // one in the sub-group would be out of bounds anyway.
-      B.CreateRet(Arg);
-      break;
-    case GroupCollective::OpKind::ShuffleUp: {
-      auto *const Prev = F.getArg(0);
-      auto *const Curr = F.getArg(1);
-      auto *const Delta = F.getArg(2);
-      // In the trivial size=1 case, negative delta is the desired index (since
-      // we're subtracting it from zero). If it's greater than zero and less
-      // than the size, we return 'current', else if it's less than zero and
-      // greater than or equal to the negative size, we return 'prev'. So if
-      // 'delta' is zero, return 'current', else return 'prev'. Anything else
-      // is out of bounds so we can simplify things here.
-      auto *const EqZero = B.CreateICmpEQ(Delta, B.getInt32(0), "eqzero");
-      auto *const Sel = B.CreateSelect(EqZero, Curr, Prev, "sel");
-      B.CreateRet(Sel);
-      break;
-    }
-    case GroupCollective::OpKind::ShuffleDown: {
-      auto *const Curr = F.getArg(0);
-      auto *const Next = F.getArg(1);
-      auto *const Delta = F.getArg(2);
-      // In the trivial size=1 case, the delta is the desired index (since
-      // we're adding it to zero). If it's less than the size, we return
-      // 'current', else if it's greater or equal to the size but less than
-      // twice the size, we return 'next'. So if 'delta' is zero, return
-      // 'current', else return 'next'. Anything else is out of bounds so we
-      // can simplify things here.
-      auto *const EqZero = B.CreateICmpEQ(Delta, B.getInt32(0), "eqzero");
-      auto *const Sel = B.CreateSelect(EqZero, Curr, Next, "sel");
-      B.CreateRet(Sel);
-      break;
+  [&] {
+    switch (GroupOp.Op) {
+      case GroupCollective::OpKind::Any:
+      case GroupCollective::OpKind::All:
+      case GroupCollective::OpKind::Broadcast:
+      case GroupCollective::OpKind::Reduction:
+      case GroupCollective::OpKind::ScanInclusive:
+        // In the trivial size=1 case, all of these operations just return the
+        // argument back again
+        B.CreateRet(Arg);
+        return;
+      case GroupCollective::OpKind::ScanExclusive: {
+        // In the trivial size=1 case, exclusive scans return the identity.
+        assert(!OverloadInfo.empty());
+        auto *const IdentityVal =
+            getIdentityVal(GroupOp.Recurrence, OverloadInfo[0]);
+        assert(IdentityVal && "Unable to deduce identity val");
+        B.CreateRet(IdentityVal);
+        return;
+      }
+      case GroupCollective::OpKind::Shuffle:
+      case GroupCollective::OpKind::ShuffleXor:
+        // In the trivial size=1 case, all of these operations just return the
+        // argument back again. Any computed shuffle index other than the only
+        // one in the sub-group would be out of bounds anyway.
+        B.CreateRet(Arg);
+        return;
+      case GroupCollective::OpKind::ShuffleUp: {
+        auto *const Prev = F.getArg(0);
+        auto *const Curr = F.getArg(1);
+        auto *const Delta = F.getArg(2);
+        // In the trivial size=1 case, negative delta is the desired index
+        // (since we're subtracting it from zero). If it's greater than zero and
+        // less than the size, we return 'current', else if it's less than zero
+        // and greater than or equal to the negative size, we return 'prev'. So
+        // if 'delta' is zero, return 'current', else return 'prev'. Anything
+        // else is out of bounds so we can simplify things here.
+        auto *const EqZero = B.CreateICmpEQ(Delta, B.getInt32(0), "eqzero");
+        auto *const Sel = B.CreateSelect(EqZero, Curr, Prev, "sel");
+        B.CreateRet(Sel);
+        return;
+      }
+      case GroupCollective::OpKind::ShuffleDown: {
+        auto *const Curr = F.getArg(0);
+        auto *const Next = F.getArg(1);
+        auto *const Delta = F.getArg(2);
+        // In the trivial size=1 case, the delta is the desired index (since
+        // we're adding it to zero). If it's less than the size, we return
+        // 'current', else if it's greater or equal to the size but less than
+        // twice the size, we return 'next'. So if 'delta' is zero, return
+        // 'current', else return 'next'. Anything else is out of bounds so we
+        // can simplify things here.
+        auto *const EqZero = B.CreateICmpEQ(Delta, B.getInt32(0), "eqzero");
+        auto *const Sel = B.CreateSelect(EqZero, Curr, Next, "sel");
+        B.CreateRet(Sel);
+        return;
+      }
     }
-  }
+
+    llvm_unreachable("Unhandled group operation");
+  }();
 
   return &F;
 }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
index e1cebe3463b0e..bb97c8959d5ad 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
@@ -1721,9 +1721,6 @@ Function *compiler::utils::WorkItemLoopsPass::makeWrapperFunction(
 
       auto *const exitBlock = [&]() {
         switch (barrierMain.getSchedule(i)) {
-          default:
-            assert(!"Unexpected barrier schedule enum");
-            LLVM_FALLTHROUGH;
           case BarrierSchedule::Unordered:
           case BarrierSchedule::ScalarTail:
             if (tailInfo && tailInfo->IsVectorPredicated) {
@@ -1737,6 +1734,8 @@ Function *compiler::utils::WorkItemLoopsPass::makeWrapperFunction(
           case BarrierSchedule::Linear:
             return schedule.makeLinearWorkItemLoops(block, i);
         }
+
+        llvm_unreachable("Unexpected barrier schedule enum");
       }();
 
       // the last basic block in our function!
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/reachability.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/reachability.cpp
index 2fbff40ef2e10..7703c8260b822 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/reachability.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/reachability.cpp
@@ -177,23 +177,23 @@ void Reachability::recalculate(Function &F) {
   }
 
   LLVM_DEBUG({
-    size_t i = 0;
+    size_t I = 0;
     for (auto &BB : F) {
-      auto &node = graph[i];
+      auto &Node = graph[I];
       dbgs() << BB.getName() << ":\n";
-      dbgs() << "[ " << node.X << ", " << node.Y << " ] : ";
-      dbgs() << "( " << node.dom << ", " << node.postDom << " ) : ";
-      for (const size_t s : node.successors) {
-        if (graph[s].X <= graph[i].X) {
+      dbgs() << "[ " << Node.X << ", " << Node.Y << " ] : ";
+      dbgs() << "( " << Node.dom << ", " << Node.postDom << " ) : ";
+      for (const size_t S : Node.successors) {
+        if (graph[S].X <= graph[I].X) {
           dbgs() << "!x!";
         }
-        if (graph[s].Y <= graph[i].Y) {
+        if (graph[S].Y <= graph[I].Y) {
           dbgs() << "!y!";
         }
-        dbgs() << s << "; ";
+        dbgs() << S << "; ";
       }
       dbgs() << "\n\n";
-      ++i;
+      ++I;
     }
   });
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
index 2b5c3aa9e3441..6791053403a0a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
@@ -2540,10 +2540,10 @@ bool ControlFlowConversionState::Impl::computeNewTargets(Linearization &lin) {
           dbgs() << " (empty)\n";
         } else {
           dbgs() << "\n";
-          for (const auto &pair : deferrals) {
-            for (BasicBlock *deferred : pair.second) {
-              LLVM_DEBUG(dbgs() << "\t(" << pair.first->getName() << ", "
-                                << deferred->getName() << ")\n");
+          for (const auto &Pair : deferrals) {
+            for (BasicBlock *Deferred : Pair.second) {
+              LLVM_DEBUG(dbgs() << "\t(" << Pair.first->getName() << ", "
+                                << Deferred->getName() << ")\n");
             }
           }
         }
@@ -2934,8 +2934,8 @@ bool ControlFlowConversionState::Impl::blendInstructions() {
         dbgs() << "\t\t\tWorklist: [";
         if (!queue.empty()) {
           dbgs() << DR->getBlockTag(*queue.begin()).BB->getName();
-          for (auto it = std::next(queue.begin()); it != queue.end(); ++it) {
-            dbgs() << ", " << DR->getBlockTag(*it).BB->getName();
+          for (auto It = std::next(queue.begin()); It != queue.end(); ++It) {
+            dbgs() << ", " << DR->getBlockTag(*It).BB->getName();
           }
           dbgs() << "]\n";
         }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
index 4b05e3b52d03d..f90948271c008 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
@@ -1143,10 +1143,10 @@ SimdPacket *Scalarizer::scalarizeBitCast(BitCastInst *BC, PacketMask PM) {
         }
         Lane = Lane ? B.CreateOr(Lane, SrcPart) : SrcPart;
       }
+      assert(Lane && "No bits found for lane");
       if (DstEleTy != DstEleIntTy) {
         Lane = B.CreateBitCast(Lane, DstEleTy);
       }
-      assert(Lane && "No bits found for lane");
       P->set(i, Lane);
     }
     return P;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
index c1a735bb35dbd..1fbccf2e6fbda 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
@@ -607,31 +607,30 @@ Function *VectorizationContext::getOrCreateMaskedAtomicFunction(
   // Mangle ordering
   auto mangleOrdering = [&O](AtomicOrdering Ordering) {
     switch (Ordering) {
-      default:
-        O << static_cast<unsigned>(Ordering);
-        break;
       case AtomicOrdering::Acquire:
         O << "acquire";
-        break;
+        return;
       case AtomicOrdering::AcquireRelease:
         O << "acqrel";
-        break;
+        return;
       case AtomicOrdering::Monotonic:
         O << "monotonic";
-        break;
+        return;
       case AtomicOrdering::NotAtomic:
         O << "notatomic";
-        break;
+        return;
       case AtomicOrdering::Release:
         O << "release";
-        break;
+        return;
       case AtomicOrdering::SequentiallyConsistent:
         O << "seqcst";
-        break;
+        return;
       case AtomicOrdering::Unordered:
         O << "unordered";
-        break;
+        return;
     }
+
+    O << static_cast<unsigned>(Ordering);
   };
 
   mangleOrdering(I.Ordering);

From 9555a7372098b36f12e8384c943279ee2386ef52 Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Tue, 25 Mar 2025 18:24:58 +0000
Subject: [PATCH 144/182] Address more clang & clang-tidy warnings.

* Add more const where we can.
* Use contains() rather than count() to check for existence.
* Stop looking up options through llvm::cl::getRegisteredOptions() when
  we can use their corresponding variables instead.
* Stop creating unused .blend PHI nodes.
* Enforce W^X in our loader.
* Use remove_if() rather than stable_partition() to remove released
  semaphores.
* Ensure driver::CompilerInfo is always initialized.
---
 .../include/compiler/utils/pass_machinery.h   |  3 +
 .../source/barrier_regions.cpp                | 20 +++----
 .../source/pass_machinery.cpp                 | 14 +++--
 .../source/prepare_barriers_pass.cpp          |  4 +-
 .../source/analysis/divergence_analysis.cpp   |  2 +-
 .../source/analysis/liveness_analysis.cpp     |  4 +-
 .../vecz/source/control_flow_boscc.cpp        | 18 +++---
 .../vecz/source/ir_cleanup.cpp                |  2 +-
 .../source/transform/basic_mem2reg_pass.cpp   |  2 +-
 .../control_flow_conversion_pass.cpp          | 22 +++----
 .../interleaved_group_combine_pass.cpp        |  2 +-
 .../transform/packetization_helpers.cpp       |  2 +-
 .../source/transform/pre_linearize_pass.cpp   |  4 +-
 .../source/transform/scalarization_pass.cpp   |  2 +-
 .../vecz/source/transform/scalarizer.cpp      | 60 +++++++++----------
 .../transform/simplify_infinite_loop_pass.cpp | 28 ---------
 .../vecz/source/vecz_pass_builder.cpp         |  4 +-
 17 files changed, 85 insertions(+), 108 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/pass_machinery.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/pass_machinery.h
index 0113dcbebbf5b..997071ed1ef2d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/pass_machinery.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/pass_machinery.h
@@ -34,11 +34,14 @@ class TargetMachine;
 
 namespace compiler {
 namespace utils {
+extern bool VerifyEachIsEnabled;
 
 /// @brief Mirror's LLVM's DebugLogging options in its `opt` tool. Clang has
 /// a boolean on/off version.
 enum class DebugLogging { None, Normal, Verbose, Quiet };
 
+extern DebugLogging DebugPasses;
+
 /// @brief A class that manages the lifetime and initialization of all
 /// components required to set up a new-style LLVM pass manager.
 class PassMachinery {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
index 33aeff7f1dc5d..7c806722bcc4a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
@@ -646,7 +646,7 @@ Function *compiler::utils::Barrier::GenerateFakeKernel(
 
   for (auto *bb : region.blocks) {
     BasicBlock *new_bb = BasicBlock::Create(context, "", new_kernel);
-    if (region.barrier_blocks.count(bb)) {
+    if (region.barrier_blocks.contains(bb)) {
       ReturnInst::Create(context, nullptr, new_bb);
     } else {
       bb->getTerminator()->clone()->insertInto(new_bb, new_bb->end());
@@ -679,7 +679,7 @@ void compiler::utils::Barrier::GatherBarrierRegionBlocks(
   size_t index = 0;
   while (index < region.blocks.size()) {
     BasicBlock *BB = region.blocks[index++];
-    if (barrier_successor_set_.count(BB)) {
+    if (barrier_successor_set_.contains(BB)) {
       region.barrier_blocks.insert(BB);
     } else {
       for (BasicBlock *succ : successors(BB)) {
@@ -716,14 +716,14 @@ void compiler::utils::Barrier::GatherBarrierRegionUses(
       if (PHINode *pn = dyn_cast<PHINode>(&I)) {
         for (unsigned i = 0, e = pn->getNumIncomingValues(); i != e; i++) {
           Value *val = pn->getIncomingValue(i);
-          if (CheckValidUse(val) && !ignore.count(val)) {
+          if (CheckValidUse(val) && !ignore.contains(val)) {
             if (auto *inst = dyn_cast<Instruction>(val)) {
               BasicBlock *incoming = pn->getIncomingBlock(i);
               BasicBlock *parent = inst->getParent();
               // If the incoming edge comes from outside the region, it is
               // going to get removed anyway, so disregard it
-              if (bbmap.count(incoming)) {
-                if (!bbmap.count(parent)) {
+              if (bbmap.contains(incoming)) {
+                if (!bbmap.contains(parent)) {
                   region.uses_ext.insert(val);
                 } else if (!DT.dominates(bbmap[parent], bbmap[incoming])) {
                   region.uses_int.insert(val);
@@ -734,10 +734,10 @@ void compiler::utils::Barrier::GatherBarrierRegionUses(
         }
       } else {
         for (Value *val : I.operands()) {
-          if (CheckValidUse(val) && !ignore.count(val)) {
+          if (CheckValidUse(val) && !ignore.contains(val)) {
             if (auto *inst = dyn_cast<Instruction>(val)) {
               BasicBlock *parent = inst->getParent();
-              if (!bbmap.count(parent)) {
+              if (!bbmap.contains(parent)) {
                 region.uses_ext.insert(val);
               } else if (!DT.dominates(bbmap[parent], BBclone)) {
                 region.uses_int.insert(val);
@@ -847,7 +847,7 @@ void compiler::utils::Barrier::TidyLiveVariables() {
       removals.push_back(v);
     } else if (auto *cast = dyn_cast<CastInst>(v)) {
       Value *op = cast->getOperand(0);
-      if (whole_live_variables_set_.count(op)) {
+      if (whole_live_variables_set_.contains(op)) {
         removals.push_back(v);
       }
     }
@@ -1126,7 +1126,7 @@ Function *compiler::utils::Barrier::GenerateNewKernel(BarrierRegion &region) {
     vmap[block] = cloned_bb;
 
     // Remove last terminator from clone block with barrier.
-    if (region.barrier_blocks.count(block)) {
+    if (region.barrier_blocks.contains(block)) {
       cloned_bb->getTerminator()->eraseFromParent();
 
       // Return the next barrier's id.
@@ -1450,7 +1450,7 @@ BasicBlock *compiler::utils::Barrier::CloneBasicBlock(
     new_inst->insertInto(new_bb, new_bb->end());
 
     // Record live variables' defs which are in current kernel.
-    if (whole_live_variables_set_.count(&i)) {
+    if (whole_live_variables_set_.contains(&i)) {
       live_defs_info.insert(&i);
     }
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_machinery.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_machinery.cpp
index bcb564de9c784..355cc9bc17a03 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_machinery.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_machinery.cpp
@@ -23,7 +23,6 @@ using namespace llvm;
 
 namespace compiler {
 namespace utils {
-
 // Note that Clang has three on/off options for debugging pass managers:
 // `-fdebug-pass-manager`, `-fdebug-pass-structure`, and
 // `-fdebug-pass-arguments``.
@@ -49,9 +48,10 @@ namespace utils {
 // While clang also pushes `mdebug-pass` onto LLVM, it only works for the
 // legacy pass manager, and so we choose to only support and model the
 // `debug-pass-manager` form.
-static cl::opt<DebugLogging> DebugPM(
-    "debug-pass-manager", cl::Hidden, cl::ValueOptional,
-    cl::desc("Print pass management debugging information"),
+DebugLogging DebugPasses;
+static cl::opt<DebugLogging, true> DebugPM(
+    "debug-pass-manager", cl::location(DebugPasses), cl::Hidden,
+    cl::ValueOptional, cl::desc("Print pass management debugging information"),
     cl::init(DebugLogging::None),
     cl::values(
         clEnumValN(DebugLogging::Normal, "", ""),
@@ -61,8 +61,10 @@ static cl::opt<DebugLogging> DebugPM(
             DebugLogging::Verbose, "verbose",
             "Print extra information about adaptors and pass managers")));
 
-static cl::opt<bool> VerifyEach("verify-each",
-                                cl::desc("Verify after each transform"));
+bool VerifyEachIsEnabled;
+static cl::opt<bool, true> VerifyEach("verify-each",
+                                      cl::location(VerifyEachIsEnabled),
+                                      cl::desc("Verify after each transform"));
 
 PassMachinery::PassMachinery(LLVMContext &Ctx, TargetMachine *TM,
                              bool VerifyEach, DebugLogging debugLogLevel)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/prepare_barriers_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/prepare_barriers_pass.cpp
index 15b138e8a1af3..cb49bce92fdf2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/prepare_barriers_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/prepare_barriers_pass.cpp
@@ -54,7 +54,7 @@ PreservedAnalyses compiler::utils::PrepareBarriersPass::run(
 
         // If it's one of our kernels don't inline it, and definitely don't
         // delete it either. No need to inline already dead functions, either!
-        if (!Callee->isDefTriviallyDead() && Kernels.count(Callee) == 0) {
+        if (!Callee->isDefTriviallyDead() && !Kernels.contains(Callee)) {
           FuncsWithBarriers.insert(Callee);
         }
       }
@@ -91,7 +91,7 @@ PreservedAnalyses compiler::utils::PrepareBarriersPass::run(
 
         // The function we inlined into now contains a barrier, so add it
         // to the set.
-        if (!InfoF->isDefTriviallyDead() && Kernels.count(InfoF) == 0) {
+        if (!InfoF->isDefTriviallyDead() && !Kernels.contains(InfoF)) {
           FuncsWithBarriers.insert(InfoF);
         }
       } else {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/divergence_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/divergence_analysis.cpp
index 8fe467d75e3fd..4a9e3a2594a0a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/divergence_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/divergence_analysis.cpp
@@ -365,7 +365,7 @@ void DivergenceResult::markDivLoopDivBlocks(BasicBlock &BB, Loop &L,
   L.getExitBlocks(exits);
   const auto &divergentExits = escapePoints(BB, L);
   for (BasicBlock *E : exits) {
-    if (divergentExits.count(E)) {
+    if (divergentExits.contains(E)) {
       markDivergent(*E);
     }
     // All loop exits of a divergent loop need their PHIs marked varying.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/liveness_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/liveness_analysis.cpp
index d7ef1aecfa07e..9ec9917acfb19 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/liveness_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/liveness_analysis.cpp
@@ -231,8 +231,8 @@ void LivenessResult::Impl::calculateMaxRegistersInBlock(const BasicBlock *BB) {
     // Operands are live so they use a register. Increment registerCount if not
     // in live out or already counted.
     for (const auto *operand : inst.operand_values()) {
-      if (definesVariable(*operand) && !liveOut.count(operand) &&
-          !seenButNotInLiveOut.count(operand)) {
+      if (definesVariable(*operand) && !liveOut.contains(operand) &&
+          !seenButNotInLiveOut.contains(operand)) {
         registersUsed++;
         seenButNotInLiveOut.insert(operand);
       }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_boscc.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_boscc.cpp
index 04a36e8755999..44fe2dae7a73f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_boscc.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_boscc.cpp
@@ -174,7 +174,7 @@ bool ControlFlowConversionState::BOSCCGadget::duplicateUniformRegions() {
           duplicatedLoops.push_back(loop);
         }
 
-        if (!duplicatedLoopSet.count(loop)) {
+        if (!duplicatedLoopSet.contains(loop)) {
           newBTag.loop = LTag;
           loop->addBasicBlockToLoop(newB, *LI);
         }
@@ -193,7 +193,7 @@ bool ControlFlowConversionState::BOSCCGadget::duplicateUniformRegions() {
   // Since we added all loops by their headers in DCBI order, inner loops will
   // always follow outer loops, so there is no need to sort them.
   for (Loop *L : duplicatedLoops) {
-    if (!LMap.count(L) && !noDuplicateLoops.count(L)) {
+    if (!LMap.contains(L) && !noDuplicateLoops.contains(L)) {
       VECZ_FAIL_IF(!duplicateUniformLoops(L));
     }
   }
@@ -526,7 +526,7 @@ bool ControlFlowConversionState::BOSCCGadget::connectBOSCCRegions() {
   // then the loop now has 2 preheaders. We thus need to blend them into one
   // single preheader.
   for (auto *const LTag : DR->getLoopOrdering()) {
-    if (!LTag->isLoopDivergent() && !LMap.count(LTag->loop)) {
+    if (!LTag->isLoopDivergent() && !LMap.contains(LTag->loop)) {
       BasicBlock *predicatedPreheader = LTag->preheader;
       if (BasicBlock *uniformPreheader = getBlock(predicatedPreheader)) {
         BasicBlock *header = LTag->header;
@@ -577,7 +577,7 @@ bool ControlFlowConversionState::BOSCCGadget::connectBOSCCRegions() {
     const Loop *L = pair.first;
 
     if (Loop *parentL = L->getParentLoop()) {
-      if (LMap.count(parentL)) {
+      if (LMap.contains(parentL)) {
         continue;
       }
     }
@@ -783,7 +783,7 @@ bool ControlFlowConversionState::BOSCCGadget::connectUniformRegion(
   BasicBlock *connectionPoint = target;
 
   const auto *const LTag = DR->getTag(predicatedB).loop;
-  const bool needsStore = LTag && LMap.count(LTag->loop);
+  const bool needsStore = LTag && LMap.contains(LTag->loop);
   if (needsStore) {
     // 'store' is a block that will contain all the uniform versions of the
     // live in instructions of the predicated target.
@@ -818,7 +818,7 @@ bool ControlFlowConversionState::BOSCCGadget::connectUniformRegion(
 
     // 'store' belongs in the first outer loop non duplicated.
     Loop *parentLoop = LTag->loop->getParentLoop();
-    while (parentLoop && LMap.count(parentLoop)) {
+    while (parentLoop && LMap.contains(parentLoop)) {
       parentLoop = parentLoop->getParentLoop();
     }
     if (parentLoop) {
@@ -867,7 +867,7 @@ bool ControlFlowConversionState::BOSCCGadget::blendConnectionPoint(
       // because of 'CP'. These blocks are all the blocks that have more than
       // one predecessor, that belong to the same region as 'CP', and that
       // succeed it.
-      if (!region.blendPoints.count(CP)) {
+      if (!region.blendPoints.contains(CP)) {
         // The first blend point impacted by 'CP' is 'CP' itself.
         region.blendPoints.insert({CP, {CP}});
 
@@ -968,7 +968,7 @@ bool ControlFlowConversionState::BOSCCGadget::blendFinalize() {
 
   for (const auto &tag : DR->getBlockOrdering()) {
     BasicBlock *blendPoint = tag.BB;
-    if (blendBlocks.count(blendPoint) == 0) {
+    if (!blendBlocks.contains(blendPoint)) {
       continue;
     }
 
@@ -986,7 +986,7 @@ bool ControlFlowConversionState::BOSCCGadget::blendFinalize() {
     }
 
     for (auto *liveInVal : liveness->getBlockInfo(blendPoint).LiveIn) {
-      if (blendedValues.count(liveInVal)) {
+      if (blendedValues.contains(liveInVal)) {
         continue;
       }
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/ir_cleanup.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/ir_cleanup.cpp
index 497cf0e8985de..d862839961420 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/ir_cleanup.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/ir_cleanup.cpp
@@ -62,7 +62,7 @@ bool AreUsersDead(Instruction *I,
     }
 
     // I is held by a non-dead user.
-    if (!DeadList.count(UserI) && !WorkList.count(UserI)) {
+    if (!DeadList.contains(UserI) && !WorkList.contains(UserI)) {
       return false;
     }
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp
index 89f9a0fecf5e9..5d9d78e36d7b3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp
@@ -162,7 +162,7 @@ bool BasicMem2RegPass::canPromoteAlloca(AllocaInst *Alloca) const {
 
   // Stores must precede other users.
   for (Instruction &I : EntryBB) {
-    if (NonStoreUsers.count(&I)) {
+    if (NonStoreUsers.contains(&I)) {
       return false;
     } else if (&I == TheStore) {
       break;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
index 6791053403a0a..94d5527e61c47 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
@@ -2161,7 +2161,7 @@ bool ControlFlowConversionState::Impl::generateDivergentLoopResultUpdates(
   // the outer loop otherwise.
   auto *const ParentL = LTag.loop->getParentLoop();
   auto *const ParentLT = ParentL ? &DR->getTag(ParentL) : nullptr;
-  if (!ParentLT || !ParentLT->loopResultPrevs.count(LLV)) {
+  if (!ParentLT || !ParentLT->loopResultPrevs.contains(LLV)) {
     PHI->addIncoming(getDefaultValue(PHI->getType()), LTag.preheader);
   } else {
     BasicBlock *LLVDef = cast<Instruction>(LLV)->getParent();
@@ -2343,7 +2343,7 @@ void addDeferral(BasicBlock *newSrc, BasicBlock *deferred,
   auto newSrcIt = deferrals.find(newSrc);
   if (newSrcIt != deferrals.end()) {
     // If the deferral edge already exists, there is no need to add it again.
-    if (newSrcIt->second.count(deferred)) {
+    if (newSrcIt->second.contains(deferred)) {
       LLVM_DEBUG(dbgs() << "\t\tDeferral (" << newSrc->getName() << ", "
                         << deferred->getName() << ") already exists\n");
       return;
@@ -2354,7 +2354,7 @@ void addDeferral(BasicBlock *newSrc, BasicBlock *deferred,
     // If the deferral edge already exists the other way around, we don't want
     // to add it the opposite way, in risk of creating an infinite loop in the
     // CFG.
-    if (deferredIt->second.count(newSrc)) {
+    if (deferredIt->second.contains(newSrc)) {
       LLVM_DEBUG(dbgs() << "\t\tOpposite deferral (" << deferred->getName()
                         << ", " << newSrc->getName() << ") already exists\n");
       return;
@@ -2438,7 +2438,7 @@ bool ControlFlowConversionState::Impl::computeNewTargets(Linearization &lin) {
       for (BasicBlock *succ : successors(BB)) {
         size_t nextIndex = ~size_t(0);
         for (BasicBlock *deferred : availableTargets) {
-          if (targeted.count(deferred)) {
+          if (targeted.contains(deferred)) {
             continue;
           }
 
@@ -2449,7 +2449,7 @@ bool ControlFlowConversionState::Impl::computeNewTargets(Linearization &lin) {
         }
 
         const size_t succIndex = DR->getTagIndex(succ);
-        if (!targeted.count(succ)) {
+        if (!targeted.contains(succ)) {
           // If we have not found a target or there is a better one.
           if (nextIndex == ~size_t(0) || nextIndex > succIndex) {
             nextIndex = succIndex;
@@ -2688,7 +2688,7 @@ bool ControlFlowConversionState::Impl::generateSelects() {
         BasicBlock *cur = B;
         while (cur->hasNPredecessors(1) && !incomings.empty()) {
           cur = cur->getSinglePredecessor();
-          if (incomings.count(cur)) {
+          if (incomings.contains(cur)) {
             break;
           }
         }
@@ -2698,7 +2698,7 @@ bool ControlFlowConversionState::Impl::generateSelects() {
         //   selects),
         // - if the last block of the chain is not an incoming block, and
         // - if the last block of the chain is a convergence block.
-        if (!DR->isBlend(*B) && !incomings.count(cur) &&
+        if (!DR->isBlend(*B) && !incomings.contains(cur) &&
             cur->hasNPredecessorsOrMore(2) && PHI->getNumIncomingValues() > 1) {
           // All PHI nodes have the same incoming blocks so we update the exit
           // masks of the incoming blocks of the first PHI node here.
@@ -2845,7 +2845,7 @@ bool ControlFlowConversionState::Impl::updatePHIsIncomings() {
       const bool isEntryMask = PHI == maskInfo.entryMask;
       for (unsigned idx = 0; idx < PHI->getNumIncomingValues(); ++idx) {
         BasicBlock *incoming = PHI->getIncomingBlock(idx);
-        if (preds.count(incoming)) {
+        if (preds.contains(incoming)) {
           continue;
         }
         // If the incoming block is no longer a predecessor, transform it into
@@ -2887,7 +2887,7 @@ bool ControlFlowConversionState::Impl::updatePHIsIncomings() {
 
       // And add any new incoming blocks that do not replace any previous.
       for (BasicBlock *pred : preds) {
-        if (!incomings.count(pred)) {
+        if (!incomings.contains(pred)) {
           PHI->addIncoming(getDefaultValue(PHI->getType()), pred);
         }
       }
@@ -3045,7 +3045,7 @@ bool ControlFlowConversionState::Impl::blendInstructions() {
                       << ":\n");
     for (Instruction &I : *dst) {
       // Don't try to blend a blend value.
-      if (blends.count(&I)) {
+      if (blends.contains(&I)) {
         continue;
       }
 
@@ -3113,7 +3113,7 @@ bool ControlFlowConversionState::Impl::blendInstructions() {
             }
             // If 'opDef' is a loop live value, set an entry point in the loop
             // header.
-            if (srcLoop->loopLiveValues.count(opDef)) {
+            if (srcLoop->loopLiveValues.contains(opDef)) {
               LLVM_DEBUG(dbgs()
                          << "\t\t\tFound persisted value of the operand: "
                          << srcLoop->loopResultPrevs[opDef] << "\n");
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/interleaved_group_combine_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/interleaved_group_combine_pass.cpp
index 9614bd6f4806d..79513f649ae2c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/interleaved_group_combine_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/interleaved_group_combine_pass.cpp
@@ -223,7 +223,7 @@ bool InterleavedGroupCombinePass::InterleavedGroupInfo::canDeinterleaveMask(
   // If it finds any of the mask's own operands as group members or in
   // between group members, the mask cannot be (trivially) moved.
   while (IA) {
-    if (Ops.count(IA)) {
+    if (Ops.contains(IA)) {
       // We found something the mask depends on, so we can't de-interleave...
       return false;
     } else if (IA == Data.front()) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
index 10a98067fee77..d7a6be208004f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
@@ -539,7 +539,7 @@ void Packetizer::Result::getPacketValues(unsigned width,
 
 PacketRange Packetizer::Result::createPacket(unsigned width) const {
   assert(info && "Can't create a packet on a fail state");
-  assert(info->packets.count(width) == 0 &&
+  assert(!info->packets.contains(width) &&
          "Shouldn't create the same packet twice");
 
   const auto start = packetizer.packetData.size();
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/pre_linearize_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/pre_linearize_pass.cpp
index e76564902b9d4..597b91f09979f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/pre_linearize_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/pre_linearize_pass.cpp
@@ -228,7 +228,7 @@ PreservedAnalyses PreLinearizePass::run(Function &F,
       VU.choices().isEnabled(VectorizationChoices::eDivisionExceptions);
 
   InstructionCost boscc_cost;
-  UniformValueResult *UVR = nullptr;
+  const UniformValueResult *UVR = nullptr;
   if (VU.choices().linearizeBOSCC()) {
     boscc_cost = calculateBoolReductionCost(F.getContext(), F.getParent(), TTI,
                                             VU.width());
@@ -261,7 +261,7 @@ PreservedAnalyses PreLinearizePass::run(Function &F,
       SmallVector<BasicBlock *, 2> hoistable;
       SmallPtrSet<BasicBlock *, 2> new_succs;
       for (auto *succ : successors(BB)) {
-        if (targets.count(succ) == 0) {
+        if (!targets.contains(succ)) {
           if (single_succs[succ] == nullptr || pred_size(succ) != 1 ||
               LI.getLoopFor(succ) != block_loop || !isTrivialBlock(*succ)) {
             simple = false;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarization_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarization_pass.cpp
index 285d4ab334e81..c48b94b56109f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarization_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarization_pass.cpp
@@ -258,7 +258,7 @@ PreservedAnalyses ScalarizationPass::run(llvm::Function &F,
         }
 
         if (I.getType()->isVectorTy() && UVR.isVarying(&I) &&
-            tracer.visited.count(&I) == 0) {
+            !tracer.visited.contains(&I)) {
           SR.setNeedsScalarization(&I);
           NeedsScalarization = true;
         }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
index f90948271c008..baf050562fee1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
@@ -170,7 +170,7 @@ bool Scalarizer::scalarizeAll() {
   // Beware of instructions not being processed strictly in dominance order.
   DenseSet<Instruction *> ScalarLeaves;
   for (Value *V : ToScalarize) {
-    if (Failures.count(V)) {
+    if (Failures.contains(V)) {
       continue;
     }
 
@@ -178,7 +178,7 @@ bool Scalarizer::scalarizeAll() {
     // its operands fixing up to use the scalarized versions.
     for (auto *U : V->users()) {
       if (auto *I = dyn_cast<Instruction>(U)) {
-        if (ScalarizeSet.count(I) == 0) {
+        if (!ScalarizeSet.contains(I)) {
           ScalarLeaves.insert(I);
         }
       }
@@ -245,7 +245,7 @@ Value *Scalarizer::scalarizeOperands(Instruction *I) {
   // No special-case handling, so just gather any scalarized operands
   for (unsigned i = 0, n = I->getNumOperands(); i != n; ++i) {
     auto *Op = I->getOperand(i);
-    if (ScalarizeSet.count(Op)) {
+    if (ScalarizeSet.contains(Op)) {
       I->setOperand(i, getGather(Op));
     }
   }
@@ -296,7 +296,7 @@ Value *Scalarizer::scalarizeOperandsPrintf(CallInst *CI) {
       VECZ_ERROR_IF(SimdWidth > MAX_SIMD_WIDTH, "The SIMD width is too large");
       PacketMask PM;
       PM.enableAll(SimdWidth);
-      SimdPacket *OpPacket = scalarize(Op.get(), PM);
+      const SimdPacket *OpPacket = scalarize(Op.get(), PM);
       VECZ_STAT_FAIL_IF(!OpPacket, VeczScalarizeFailCall);
       for (unsigned i = 0; i < OpPacket->size(); ++i) {
         Value *Lane = OpPacket->at(i);
@@ -411,7 +411,7 @@ Value *Scalarizer::scalarizeReduceIntrinsic(IntrinsicInst *Intrin) {
   IRBuilder<> B(Intrin);
   PM.enableAll(SimdWidth);
 
-  SimdPacket *Packet = scalarize(Vec, PM);
+  const SimdPacket *Packet = scalarize(Vec, PM);
   VECZ_STAT_FAIL_IF(!Packet, VeczScalarizeFailReduceIntrinsic);
 
   Type *const VecEleTy = VecTy->getElementType();
@@ -481,7 +481,7 @@ Value *Scalarizer::scalarizeOperandsBitCast(BitCastInst *BC) {
   const unsigned SimdWidth = VecSrcTy->getNumElements();
   PacketMask PM;
   PM.enableAll(SimdWidth);
-  SimdPacket *SrcPacket = scalarize(BC->getOperand(0), PM);
+  const SimdPacket *SrcPacket = scalarize(BC->getOperand(0), PM);
   VECZ_FAIL_IF(!SrcPacket);
 
   Type *DstTy = BC->getDestTy();
@@ -532,7 +532,7 @@ SimdPacket *Scalarizer::scalarize(Value *V, PacketMask PM) {
   }
 
   // This value hasn't been scheduled for scalarization, so extract instead
-  if (!V->getType()->isVoidTy() && ScalarizeSet.count(V) == 0) {
+  if (!V->getType()->isVoidTy() && !ScalarizeSet.contains(V)) {
     return extractLanes(V, PM);
   }
 
@@ -691,7 +691,7 @@ void Scalarizer::scalarizeDI(Instruction *Original, const SimdPacket *Packet,
       if (LaneVal && !isa<UndefValue>(LaneVal)) {
         // Check if the LaneVal SIMD Value is already processed
         // and a Debug Value Intrinsic has been created for it.
-        if (VectorElements.find(LaneVal) != VectorElements.end()) {
+        if (VectorElements.contains(LaneVal)) {
           continue;
         }
         // DWARF bit piece expressions are used to describe part of an
@@ -870,7 +870,7 @@ SimdPacket *Scalarizer::scalarizeStore(StoreInst *Store, PacketMask PM) {
   Value *VectorData = Store->getValueOperand();
 
   // Emit scalarized data values.
-  SimdPacket *DataPacket = scalarize(VectorData, PM);
+  const SimdPacket *DataPacket = scalarize(VectorData, PM);
   VECZ_FAIL_IF(!DataPacket);
 
   // Absorb redundant bitcasts
@@ -951,10 +951,10 @@ SimdPacket *Scalarizer::scalarizeBinaryOp(BinaryOperator *BinOp,
   auto *VecDataTy = dyn_cast<FixedVectorType>(LHS->getType());
   VECZ_FAIL_IF(!VecDataTy);
   const unsigned SimdWidth = VecDataTy->getNumElements();
-  SimdPacket *LHSPacket = scalarize(LHS, PM);
+  const SimdPacket *LHSPacket = scalarize(LHS, PM);
   VECZ_FAIL_IF(!LHSPacket);
   Value *RHS = BinOp->getOperand(1);
-  SimdPacket *RHSPacket = scalarize(RHS, PM);
+  const SimdPacket *RHSPacket = scalarize(RHS, PM);
   VECZ_FAIL_IF(!RHSPacket);
   SimdPacket *P = getPacket(BinOp, SimdWidth);
   for (unsigned i = 0; i < SimdWidth; i++) {
@@ -979,7 +979,7 @@ SimdPacket *Scalarizer::scalarizeFreeze(FreezeInst *FreezeI, PacketMask PM) {
   auto *VecDataTy = dyn_cast<FixedVectorType>(Src->getType());
   VECZ_FAIL_IF(!VecDataTy);
   const unsigned SimdWidth = VecDataTy->getNumElements();
-  SimdPacket *SrcPacket = scalarize(Src, PM);
+  const SimdPacket *SrcPacket = scalarize(Src, PM);
   VECZ_FAIL_IF(!SrcPacket);
 
   // Create scalarized freeze.
@@ -1000,7 +1000,7 @@ SimdPacket *Scalarizer::scalarizeUnaryOp(UnaryOperator *UnOp, PacketMask PM) {
   auto *VecDataTy = dyn_cast<FixedVectorType>(Src->getType());
   VECZ_FAIL_IF(!VecDataTy);
   const unsigned SimdWidth = VecDataTy->getNumElements();
-  SimdPacket *SrcPacket = scalarize(Src, PM);
+  const SimdPacket *SrcPacket = scalarize(Src, PM);
   VECZ_FAIL_IF(!SrcPacket);
   SimdPacket *P = getPacket(UnOp, SimdWidth);
   for (unsigned i = 0; i < SimdWidth; i++) {
@@ -1047,7 +1047,7 @@ SimdPacket *Scalarizer::scalarizeCast(CastInst *CastI, PacketMask PM) {
   auto *VecDstTy = dyn_cast<FixedVectorType>(CastI->getType());
   VECZ_STAT_FAIL_IF(!VecDstTy || (VecDstTy->getNumElements() != SimdWidth),
                     VeczScalarizeFailCast);
-  SimdPacket *SrcPacket = scalarize(Src, PM);
+  const SimdPacket *SrcPacket = scalarize(Src, PM);
   VECZ_FAIL_IF(!SrcPacket);
 
   // Create scalarized casts.
@@ -1113,7 +1113,7 @@ SimdPacket *Scalarizer::scalarizeBitCast(BitCastInst *BC, PacketMask PM) {
              j * SrcEleSize < (i + 1) * DstEleSize; ++j) {
           SPM.enable(j);
         }
-        SimdPacket *SrcPacket = scalarize(Src, SPM);
+        const SimdPacket *SrcPacket = scalarize(Src, SPM);
         VECZ_FAIL_IF(!SrcPacket);
         assert(SrcPacket == &S &&
                "Scalarization of Src should update existing packet");
@@ -1154,7 +1154,7 @@ SimdPacket *Scalarizer::scalarizeBitCast(BitCastInst *BC, PacketMask PM) {
 
   // Handle same width vector -> vector casts, quite a more straighforward
   // affair.
-  SimdPacket *SrcPacket = scalarize(Src, PM);
+  const SimdPacket *SrcPacket = scalarize(Src, PM);
   VECZ_FAIL_IF(!SrcPacket);
   Type *DstEleTy = VecDstTy->getElementType();
   SimdPacket *P = getPacket(BC, SimdWidth);
@@ -1174,10 +1174,10 @@ SimdPacket *Scalarizer::scalarizeICmp(ICmpInst *ICmp, PacketMask PM) {
   auto *VecDataTy = dyn_cast<FixedVectorType>(ICmp->getType());
   VECZ_FAIL_IF(!VecDataTy);
   const unsigned SimdWidth = VecDataTy->getNumElements();
-  SimdPacket *LHSPacket = scalarize(LHS, PM);
+  const SimdPacket *LHSPacket = scalarize(LHS, PM);
   VECZ_FAIL_IF(!LHSPacket);
   Value *RHS = ICmp->getOperand(1);
-  SimdPacket *RHSPacket = scalarize(RHS, PM);
+  const SimdPacket *RHSPacket = scalarize(RHS, PM);
   VECZ_FAIL_IF(!RHSPacket);
   SimdPacket *P = getPacket(ICmp, SimdWidth);
   for (unsigned i = 0; i < SimdWidth; i++) {
@@ -1197,10 +1197,10 @@ SimdPacket *Scalarizer::scalarizeFCmp(FCmpInst *FCmp, PacketMask PM) {
   auto *VecDataTy = dyn_cast<FixedVectorType>(FCmp->getType());
   VECZ_FAIL_IF(!VecDataTy);
   const unsigned SimdWidth = VecDataTy->getNumElements();
-  SimdPacket *LHSPacket = scalarize(LHS, PM);
+  const SimdPacket *LHSPacket = scalarize(LHS, PM);
   VECZ_FAIL_IF(!LHSPacket);
   Value *RHS = FCmp->getOperand(1);
-  SimdPacket *RHSPacket = scalarize(RHS, PM);
+  const SimdPacket *RHSPacket = scalarize(RHS, PM);
   VECZ_FAIL_IF(!RHSPacket);
   SimdPacket *P = getPacket(FCmp, SimdWidth);
   for (unsigned i = 0; i < SimdWidth; i++) {
@@ -1217,7 +1217,7 @@ SimdPacket *Scalarizer::scalarizeFCmp(FCmpInst *FCmp, PacketMask PM) {
 SimdPacket *Scalarizer::scalarizeSelect(SelectInst *Select, PacketMask PM) {
   IRBuilder<> B(Select);
   Value *Cond = Select->getCondition();
-  SimdPacket *CondPacket = nullptr;
+  const SimdPacket *CondPacket = nullptr;
   if (Cond->getType()->isVectorTy()) {
     CondPacket = scalarize(Cond, PM);
     VECZ_FAIL_IF(!CondPacket);
@@ -1226,10 +1226,10 @@ SimdPacket *Scalarizer::scalarizeSelect(SelectInst *Select, PacketMask PM) {
   auto *VecDataTy = dyn_cast<FixedVectorType>(Select->getType());
   VECZ_FAIL_IF(!VecDataTy);
   const unsigned SimdWidth = VecDataTy->getNumElements();
-  SimdPacket *TruePacket = scalarize(TrueVal, PM);
+  const SimdPacket *TruePacket = scalarize(TrueVal, PM);
   VECZ_FAIL_IF(!TruePacket);
   Value *FalseVal = Select->getFalseValue();
-  SimdPacket *FalsePacket = scalarize(FalseVal, PM);
+  const SimdPacket *FalsePacket = scalarize(FalseVal, PM);
   VECZ_FAIL_IF(!FalsePacket);
   SimdPacket *P = getPacket(Select, SimdWidth);
   for (unsigned i = 0; i < SimdWidth; i++) {
@@ -1257,14 +1257,14 @@ SimdPacket *Scalarizer::scalarizeMaskedMemOp(CallInst *CI, PacketMask PM,
   // Scalarize mask
   Value *MaskOperand = MaskedOp.getMaskOperand();
   VECZ_FAIL_IF(!MaskOperand);
-  SimdPacket *MaskPacket = scalarize(MaskedOp.getMaskOperand(), PM);
+  const SimdPacket *MaskPacket = scalarize(MaskedOp.getMaskOperand(), PM);
   VECZ_FAIL_IF(!MaskPacket);
 
   Value *VecPtr = MaskedOp.getPointerOperand();
   VECZ_FAIL_IF(!VecPtr);
 
   // Scalarize data packet if this is a store
-  SimdPacket *DataPacket = nullptr;
+  const SimdPacket *DataPacket = nullptr;
   if (MaskedOp.isStore()) {
     DataPacket = scalarize(MaskedOp.getDataOperand(), PM);
     VECZ_FAIL_IF(!DataPacket);
@@ -1415,7 +1415,7 @@ SimdPacket *Scalarizer::scalarizeCall(CallInst *CI, PacketMask PM) {
     }
     SmallVector<Value *, 4> Ops;
     for (unsigned i = 0; i < NumArgs; i++) {
-      SimdPacket *OpPacket = OpPackets[i];
+      const SimdPacket *OpPacket = OpPackets[i];
       if (OpPacket) {
         Ops.push_back(OpPacket->at(j));
       } else {
@@ -1479,12 +1479,12 @@ SimdPacket *Scalarizer::scalarizeShuffleVector(ShuffleVectorInst *Shuffle,
   }
 
   // Scalarize each vector operand as needed.
-  SimdPacket *LHSPacket = nullptr;
+  const SimdPacket *LHSPacket = nullptr;
   if (LHSMask.Value != 0) {
     LHSPacket = scalarize(LHS, LHSMask);
     VECZ_FAIL_IF(!LHSPacket);
   }
-  SimdPacket *RHSPacket = nullptr;
+  const SimdPacket *RHSPacket = nullptr;
   if (RHSMask.Value != 0) {
     RHSPacket = scalarize(RHS, RHSMask);
     VECZ_FAIL_IF(!RHSPacket);
@@ -1537,7 +1537,7 @@ SimdPacket *Scalarizer::scalarizeInsertElement(InsertElementInst *Insert,
   if (CIndex) {
     OpMask.disable(IndexInt);
   }
-  SimdPacket *VecP = scalarize(Vec, OpMask);
+  const SimdPacket *VecP = scalarize(Vec, OpMask);
   VECZ_FAIL_IF(!VecP);
 
   // For each lane, we need to select either the original vector element (from
@@ -1571,7 +1571,7 @@ SimdPacket *Scalarizer::scalarizeGEP(GetElementPtrInst *GEP, PacketMask PM) {
   const unsigned simdWidth = vecDataTy->getNumElements();
 
   Value *const ptr = GEP->getPointerOperand();
-  SimdPacket *ptrPacket = nullptr;
+  const SimdPacket *ptrPacket = nullptr;
   if (ptr->getType()->isVectorTy()) {
     ptrPacket = scalarize(ptr, PM);
     VECZ_FAIL_IF(!ptrPacket);
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/simplify_infinite_loop_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/simplify_infinite_loop_pass.cpp
index f104f7b6e85ee..2e23b6157b024 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/simplify_infinite_loop_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/simplify_infinite_loop_pass.cpp
@@ -74,22 +74,6 @@ PreservedAnalyses vecz::SimplifyInfiniteLoopPass::run(
     assert(AR.DT.verify() &&
            "SimplifyInfiniteLoopPass: Dominator Tree failed verification");
 
-    std::unordered_set<Instruction *> toBlend;
-    // Find all instructions used in `target` that may be defined after the
-    // infinite loop, for which adding the edge from the infinite loop to the
-    // return block may break the SSA form.
-    for (Instruction &I : *target) {
-      if (!isa<PHINode>(&I)) {
-        for (Value *op : I.operands()) {
-          if (Instruction *opI = dyn_cast<Instruction>(op)) {
-            if (opI->getParent() != target) {
-              toBlend.insert(opI);
-            }
-          }
-        }
-      }
-    }
-
     // Update the phi nodes in the return block because we added a new
     // predecessor to it.
     for (Instruction &I : *target) {
@@ -97,18 +81,6 @@ PreservedAnalyses vecz::SimplifyInfiniteLoopPass::run(
         PHI->addIncoming(UndefValue::get(PHI->getType()), virtualExit);
       }
     }
-    // Add new phi nodes for instructions computed in `toBlend`.
-    for (Instruction *I : toBlend) {
-      PHINode *PHI = PHINode::Create(I->getType(), 2, I->getName() + ".blend");
-      PHI->insertBefore(target->begin());
-      for (BasicBlock *pred : predecessors(target)) {
-        if (pred != virtualExit) {
-          PHI->addIncoming(I, pred);
-        } else {
-          PHI->addIncoming(UndefValue::get(PHI->getType()), pred);
-        }
-      }
-    }
 
     modified = true;
   } else if (loopExitBlocks.size() == 1) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vecz_pass_builder.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vecz_pass_builder.cpp
index 883dd9330d0c1..61b56aa70eb98 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vecz_pass_builder.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vecz_pass_builder.cpp
@@ -124,10 +124,10 @@ void VeczPassMachinery::addClassToPassNames() {
   // Register a callback which skips all passes once we've failed to vectorize
   // a function.
   PIC.registerShouldRunOptionalPassCallback([&](StringRef, llvm::Any IR) {
-    const Function **FPtr = any_cast<const Function *>(&IR);
+    const Function *const *FPtr = any_cast<const Function *>(&IR);
     const Function *F = FPtr ? *FPtr : nullptr;
     if (!F) {
-      if (const auto **L = any_cast<const Loop *>(&IR)) {
+      if (const auto *const *L = any_cast<const Loop *>(&IR)) {
         F = (*L)->getHeader()->getParent();
       } else {
         // Always run module passes

From 56538f37848d6391704a124fd82fc940ac453be5 Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Tue, 1 Apr 2025 16:51:54 +0100
Subject: [PATCH 145/182] [vecz] Fix packetization of PHIs of literal structs.

We packetize literal structs elementwise, but were failing to account
for that in the handling of PHIs. Add a test that covers this.
---
 .../include/transform/packetization_helpers.h | 31 ++++++++++++++
 .../transform/packetization_helpers.cpp       | 30 --------------
 .../vecz/source/transform/packetizer.cpp      | 14 ++-----
 .../test/lit/llvm/packetize_phi_struct.ll     | 41 +++++++++++++++++++
 4 files changed, 75 insertions(+), 41 deletions(-)
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_phi_struct.ll

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_helpers.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_helpers.h
index da3a9ad2f1c20..991d30ee992ab 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_helpers.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_helpers.h
@@ -30,6 +30,8 @@
 
 #include <memory>
 
+#include "debugging.h"
+
 namespace llvm {
 class Value;
 class ShuffleVectorInst;
@@ -225,6 +227,35 @@ struct PacketInfo {
   }
 };
 
+/// @brief Returns `ty` widened by `factor`: non-vector types become vectors
+/// of `factor` elements, vectors get `factor` times as many elements, and
+/// literal structs are widened element by element.
+inline llvm::Type *getWideType(llvm::Type *ty, llvm::ElementCount factor) {
+  if (!ty->isVectorTy()) {
+    auto *structTy = llvm::dyn_cast<llvm::StructType>(ty);
+    if (structTy && structTy->isLiteral()) {
+      llvm::SmallVector<llvm::Type *, 4> wideElts(structTy->elements());
+      for (llvm::Type *&elt : wideElts) {
+        elt = getWideType(elt, factor);
+      }
+      return llvm::StructType::get(ty->getContext(), wideElts);
+    }
+    if (structTy) {
+      VECZ_ERROR("Can't create wide type for structure type");
+    }
+    return llvm::VectorType::get(ty, factor);
+  }
+  const bool isScalable = llvm::isa<llvm::ScalableVectorType>(ty);
+  assert((!factor.isScalable() || !isScalable) &&
+         "Can't widen a scalable vector by a scalable amount");
+  auto *vecTy = llvm::cast<llvm::VectorType>(ty);
+  const unsigned elts = vecTy->getElementCount().getKnownMinValue();
+  // A fixed factor applied to a scalable vector must itself become scalable.
+  if (isScalable && !factor.isScalable()) {
+    factor = llvm::ElementCount::getScalable(factor.getKnownMinValue());
+  }
+  return llvm::VectorType::get(vecTy->getElementType(), factor * elts);
+}
 }  // namespace vecz
 
 #endif  // VECZ_TRANSFORM_PACKETIZATION_HELPERS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
index d7a6be208004f..0342a23a7fa89 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
@@ -45,36 +45,6 @@ using namespace llvm;
 using namespace vecz;
 
 namespace {
-inline Type *getWideType(Type *ty, ElementCount factor) {
-  if (!ty->isVectorTy()) {
-    // The wide type of a struct literal is the wide type of each of its
-    // elements.
-    if (auto *structTy = dyn_cast<StructType>(ty);
-        structTy && structTy->isLiteral()) {
-      SmallVector<Type *, 4> wideElts(structTy->elements());
-      for (unsigned i = 0, e = wideElts.size(); i != e; i++) {
-        wideElts[i] = getWideType(wideElts[i], factor);
-      }
-      return StructType::get(ty->getContext(), wideElts);
-    } else if (structTy) {
-      VECZ_ERROR("Can't create wide type for structure type");
-    }
-    return VectorType::get(ty, factor);
-  }
-  const bool isScalable = isa<ScalableVectorType>(ty);
-  assert((!factor.isScalable() || !isScalable) &&
-         "Can't widen a scalable vector by a scalable amount");
-  auto *vecTy = cast<llvm::VectorType>(ty);
-  const unsigned elts = vecTy->getElementCount().getKnownMinValue();
-  // If we're widening a scalable type then set the fixed factor to scalable
-  // here.
-  if (isScalable && !factor.isScalable()) {
-    factor = ElementCount::getScalable(factor.getKnownMinValue());
-  }
-  ty = vecTy->getElementType();
-  return VectorType::get(ty, factor * elts);
-}
-
 Value *scalableBroadcastHelper(Value *subvec, ElementCount factor,
                                const vecz::TargetInfo &TI, IRBuilder<> &B,
                                bool URem);
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
index 2e68e53306284..3fca1cb8609dc 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -96,16 +96,6 @@ Type *getPaddedType(Type *Ty) {
   }
   return Ty;
 }
-
-Type *getWideType(Type *Ty, ElementCount Factor) {
-  unsigned Elts = 1;
-  if (Ty->isVectorTy()) {
-    auto *VecTy = cast<FixedVectorType>(Ty);
-    Elts = VecTy->getNumElements();
-    Ty = VecTy->getElementType();
-  }
-  return VectorType::get(Ty, Factor * Elts);
-}
 }  // namespace
 
 using ValuePacket = SmallVector<Value *, 16>;
@@ -1998,7 +1988,9 @@ ValuePacket Packetizer::Impl::packetizePHI(PHINode *Phi) {
 
   auto *wideTy = ty;
   unsigned packetWidth = 0;
-  if (ty->isVectorTy() || VectorType::isValidElementType(ty)) {
+  if (auto structTy = dyn_cast<StructType>(ty);
+      ty->isVectorTy() || VectorType::isValidElementType(ty) ||
+      (structTy && structTy->isLiteral())) {
     packetWidth = getPacketWidthForType(ty);
     wideTy =
         getWideType(Phi->getType(), SimdWidth.divideCoefficientBy(packetWidth));
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_phi_struct.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_phi_struct.ll
new file mode 100644
index 0000000000000..a18a022d0c607
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_phi_struct.ll
@@ -0,0 +1,41 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -vecz-passes=packetizer -vecz-simd-width=4 -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_foo()
+define spir_kernel void @foo() {
+; CHECK-LABEL: entry:
+entry:
+  ; CHECK: %0 = call { <4 x i64>, <4 x i1> } @__vecz_b_v4_masked_cmpxchg_align8_monotonic_monotonic_1_Dv4_u3ptrDv4_mDv4_mDv4_b(
+  %0 = cmpxchg ptr null, i64 0, i64 0 monotonic monotonic, align 8
+  ; CHECK: br label %bb.1
+  br label %bb.1
+
+; CHECK-LABEL: bb.1:
+bb.1:
+  ; CHECK: %1 = phi { <4 x i64>, <4 x i1> } [ %0, %bb.1 ], [ %0, %entry ]
+  %1 = phi { i64, i1 } [ %0, %bb.1 ], [ %0, %entry ]
+  ; CHECK: %2 = extractvalue { <4 x i64>, <4 x i1> } %1, 0
+  %2 = extractvalue { i64, i1 } %1, 0
+  ; CHECK: %3 = call { <4 x i64>, <4 x i1> } @__vecz_b_v4_masked_cmpxchg_align8_monotonic_monotonic_1_Dv4_u3ptrDv4_mDv4_mDv4_b(
+  %3 = cmpxchg ptr null, i64 0, i64 %2 monotonic monotonic, align 8
+  ; CHECK: br label %bb.1
+  br label %bb.1
+}

From 79c9b402bee119fee79629cbec61aecd05b875b7 Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Fri, 4 Apr 2025 15:29:47 +0100
Subject: [PATCH 146/182] [LLVM 21] Take address space into account for
 legality.

---
 .../multi_llvm/target_transform_info.h        | 98 +++++++++++++++++++
 .../vecz/include/vecz/vecz_target_info.h      | 23 +++--
 .../vecz/source/vector_target_info.cpp        | 58 ++++++-----
 3 files changed, 146 insertions(+), 33 deletions(-)
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/target_transform_info.h

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/target_transform_info.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/target_transform_info.h
new file mode 100644
index 0000000000000..06e0522759524
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/target_transform_info.h
@@ -0,0 +1,98 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef MULTI_LLVM_TARGET_TRANSFORM_INFO_H_INCLUDED
+#define MULTI_LLVM_TARGET_TRANSFORM_INFO_H_INCLUDED
+
+#include <llvm/Analysis/TargetTransformInfo.h>
+#include <multi_llvm/llvm_version.h>
+
+namespace multi_llvm {
+
+namespace detail {
+
+// The two overloads below serve TargetTransformInfo interfaces whose masked
+// load/store legality queries take no address space: the trailing `unsigned`
+// is accepted and ignored. Their decltype return types remove them from
+// overload resolution when the two-argument query does not compile.
+template <typename TargetTransformInfo>
+auto isLegalMaskedLoadImpl(const TargetTransformInfo &TTI, llvm::Type *Ty,
+                           llvm::Align Alignment, unsigned)
+    -> decltype(TTI.isLegalMaskedLoad(Ty, Alignment)) {
+  return TTI.isLegalMaskedLoad(Ty, Alignment);
+}
+
+template <typename TargetTransformInfo>
+auto isLegalMaskedStoreImpl(const TargetTransformInfo &TTI, llvm::Type *Ty,
+                            llvm::Align Alignment, unsigned)
+    -> decltype(TTI.isLegalMaskedStore(Ty, Alignment)) {
+  return TTI.isLegalMaskedStore(Ty, Alignment);
+}
+
+#if LLVM_VERSION_GREATER_EQUAL(21, 0)
+// TODO: Make this depend only on LLVM version once we do not have to remain
+// compatible with slightly older LLVM 21 snapshots.
+
+// The two overloads below forward the address space; they are only viable
+// when the three-argument query compiles.
+template <typename TargetTransformInfo>
+auto isLegalMaskedLoadImpl(const TargetTransformInfo &TTI, llvm::Type *Ty,
+                           llvm::Align Alignment, unsigned AddrSpace)
+    -> decltype(TTI.isLegalMaskedLoad(Ty, Alignment, AddrSpace)) {
+  return TTI.isLegalMaskedLoad(Ty, Alignment, AddrSpace);
+}
+
+template <typename TargetTransformInfo>
+auto isLegalMaskedStoreImpl(const TargetTransformInfo &TTI, llvm::Type *Ty,
+                            llvm::Align Alignment, unsigned AddrSpace)
+    -> decltype(TTI.isLegalMaskedStore(Ty, Alignment, AddrSpace)) {
+  return TTI.isLegalMaskedStore(Ty, Alignment, AddrSpace);
+}
+#endif
+
+}  // namespace detail
+
+/// @brief Returns whether a masked load is legal for the target.
+///
+/// Defined inline because it lives in a header; the address space reaches
+/// TargetTransformInfo only when its interface accepts one.
+/// @param[in] TTI Target transform info to query.
+/// @param[in] Ty Type of the value to load.
+/// @param[in] Alignment Alignment of the operation.
+/// @param[in] AddrSpace Address space of the operation.
+inline bool isLegalMaskedLoad(const llvm::TargetTransformInfo &TTI,
+                              llvm::Type *Ty, llvm::Align Alignment,
+                              unsigned AddrSpace) {
+  return detail::isLegalMaskedLoadImpl(TTI, Ty, Alignment, AddrSpace);
+}
+
+/// @brief Returns whether a masked store is legal for the target.
+///
+/// Defined inline because it lives in a header; the address space reaches
+/// TargetTransformInfo only when its interface accepts one.
+/// @param[in] TTI Target transform info to query.
+/// @param[in] Ty Type of the value to store.
+/// @param[in] Alignment Alignment of the operation.
+/// @param[in] AddrSpace Address space of the operation.
+inline bool isLegalMaskedStore(const llvm::TargetTransformInfo &TTI,
+                               llvm::Type *Ty, llvm::Align Alignment,
+                               unsigned AddrSpace) {
+  return detail::isLegalMaskedStoreImpl(TTI, Ty, Alignment, AddrSpace);
+}
+
+}  // namespace multi_llvm
+
+#endif  // MULTI_LLVM_TARGET_TRANSFORM_INFO_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_target_info.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_target_info.h
index d922eb6e9bd7a..fb5b1aa69cdf6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_target_info.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_target_info.h
@@ -570,8 +570,10 @@ class TargetInfo {
   /// @param[in] F The function in which the instruction will be created.
   /// @param[in] Ty Type of the vector to load.
   /// @param[in] Alignment Alignment of the operation.
+  /// @param[in] AddrSpace Address space of the operation.
   virtual VPMemOpLegality isVPLoadLegal(const llvm::Function *F, llvm::Type *Ty,
-                                        unsigned Alignment) const;
+                                        unsigned Alignment,
+                                        unsigned AddrSpace) const;
 
   /// @return A VPMemOpLegality enum stating whether we can create a vp.store or
   /// a masked.store intrinsic.
@@ -579,9 +581,10 @@ class TargetInfo {
   /// @param[in] F The function in which the instruction will be created.
   /// @param[in] Ty Type of the vector to store.
   /// @param[in] Alignment Alignment of the operation.
+  /// @param[in] AddrSpace Address space of the operation.
   virtual VPMemOpLegality isVPStoreLegal(const llvm::Function *F,
-                                         llvm::Type *Ty,
-                                         unsigned Alignment) const;
+                                         llvm::Type *Ty, unsigned Alignment,
+                                         unsigned AddrSpace) const;
 
   /// @return A VPMemOpLegality enum stating whether we can create a vp.gather
   /// or a masked.gather intrinsic.
@@ -589,9 +592,10 @@ class TargetInfo {
   /// @param[in] F The function in which the instruction will be created.
   /// @param[in] Ty Type of the vector to gather.
   /// @param[in] Alignment Alignment of the operation.
+  /// @param[in] AddrSpace Address space of the operation.
   virtual VPMemOpLegality isVPGatherLegal(const llvm::Function *F,
-                                          llvm::Type *Ty,
-                                          unsigned Alignment) const;
+                                          llvm::Type *Ty, unsigned Alignment,
+                                          unsigned AddrSpace) const;
 
   /// @return A VPMemOpLegality enum stating whether we can create a vp.scatter
   /// or a masked.scatter intrinsic.
@@ -599,9 +603,10 @@ class TargetInfo {
   /// @param[in] F The function in which the instruction will be created.
   /// @param[in] Ty Type of the vector to scatter.
   /// @param[in] Alignment Alignment of the operation.
+  /// @param[in] AddrSpace Address space of the operation.
   virtual VPMemOpLegality isVPScatterLegal(const llvm::Function *F,
-                                           llvm::Type *Ty,
-                                           unsigned Alignment) const;
+                                           llvm::Type *Ty, unsigned Alignment,
+                                           unsigned AddrSpace) const;
 
   /// @brief Function to check whether a given type is valid as the element type
   /// of a scalable vector used in a VP intrinsic.
@@ -619,9 +624,9 @@ class TargetInfo {
   VPMemOpLegality checkMemOpLegality(
       const llvm::Function *F,
       llvm::function_ref<bool(const llvm::TargetTransformInfo &, llvm::Type *,
-                              unsigned)>
+                              unsigned, unsigned)>
           Checker,
-      llvm::Type *Ty, unsigned Alignment) const;
+      llvm::Type *Ty, unsigned Alignment, unsigned AddrSpace) const;
 
   /// @brief Create a broadcast of a vector.
   ///
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
index 4cf4132384e3a..0813d9757593d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
@@ -21,6 +21,7 @@
 #include <llvm/Target/TargetMachine.h>
 #include <llvm/TargetParser/Triple.h>
 #include <multi_llvm/intrinsic.h>
+#include <multi_llvm/target_transform_info.h>
 #include <multi_llvm/vector_type_helper.h>
 
 #include "debugging.h"
@@ -47,22 +48,22 @@ Value *applyEVLToMask(IRBuilder<> &B, Value *EVL, Value *Mask) {
 }
 
 bool isLegalMaskedLoad(const TargetTransformInfo &TTI, Type *Ty,
-                       unsigned Alignment) {
-  return TTI.isLegalMaskedLoad(Ty, Align(Alignment));
+                       unsigned Alignment, unsigned AddrSpace) {
+  return multi_llvm::isLegalMaskedLoad(TTI, Ty, Align(Alignment), AddrSpace);
 }
 
 bool isLegalMaskedStore(const TargetTransformInfo &TTI, Type *Ty,
-                        unsigned Alignment) {
-  return TTI.isLegalMaskedStore(Ty, Align(Alignment));
+                        unsigned Alignment, unsigned AddrSpace) {
+  return multi_llvm::isLegalMaskedStore(TTI, Ty, Align(Alignment), AddrSpace);
 }
 
 bool isLegalMaskedGather(const TargetTransformInfo &TTI, Type *Ty,
-                         unsigned Alignment) {
+                         unsigned Alignment, unsigned) {
   return TTI.isLegalMaskedGather(Ty, Align(Alignment));
 }
 
 bool isLegalMaskedScatter(const TargetTransformInfo &TTI, Type *Ty,
-                          unsigned Alignment) {
+                          unsigned Alignment, unsigned) {
   return TTI.isLegalMaskedScatter(Ty, Align(Alignment));
 }
 }  // namespace
@@ -91,7 +92,8 @@ Value *TargetInfo::createLoad(IRBuilder<> &B, Type *Ty, Value *Ptr,
   if (CIntStride && CIntStride->getSExtValue() == 1) {
     if (EVL) {
       const Function *F = B.GetInsertBlock()->getParent();
-      const auto Legality = isVPLoadLegal(F, Ty, Alignment);
+      const auto Legality =
+          isVPLoadLegal(F, Ty, Alignment, PtrTy->getAddressSpace());
       if (!Legality.isVPLegal()) {
         emitVeczRemarkMissed(F,
                              "Could not create a VP load as the target "
@@ -157,7 +159,8 @@ Value *TargetInfo::createStore(IRBuilder<> &B, Value *Data, Value *Ptr,
   if (CIntStride && CIntStride->getSExtValue() == 1) {
     if (EVL) {
       const Function *F = B.GetInsertBlock()->getParent();
-      const auto Legality = isVPStoreLegal(F, VecTy, Alignment);
+      const auto Legality =
+          isVPStoreLegal(F, VecTy, Alignment, PtrTy->getAddressSpace());
       if (!Legality.isVPLegal()) {
         emitVeczRemarkMissed(F,
                              "Could not create a VP store as the target "
@@ -228,7 +231,8 @@ Value *TargetInfo::createMaskedLoad(IRBuilder<> &B, Type *Ty, Value *Ptr,
   // Use LLVM intrinsics for masked vector loads.
   if (Ty->isVectorTy()) {
     const Function *F = B.GetInsertBlock()->getParent();
-    const auto Legality = isVPLoadLegal(F, Ty, Alignment);
+    const auto Legality =
+        isVPLoadLegal(F, Ty, Alignment, PtrTy->getAddressSpace());
     if (EVL && Legality.isVPLegal()) {
       const SmallVector<llvm::Value *, 2> Args = {Ptr, Mask, EVL};
       const SmallVector<llvm::Type *, 2> Tys = {Ty, PtrTy};
@@ -333,7 +337,8 @@ Value *TargetInfo::createMaskedStore(IRBuilder<> &B, Value *Data, Value *Ptr,
   // Use LLVM intrinsics for masked vector Stores.
   if (DataTy->isVectorTy()) {
     const Function *F = B.GetInsertBlock()->getParent();
-    const auto Legality = isVPStoreLegal(F, DataTy, Alignment);
+    const auto Legality =
+        isVPStoreLegal(F, DataTy, Alignment, PtrTy->getAddressSpace());
     if (EVL && Legality.isVPLegal()) {
       const SmallVector<llvm::Value *, 3> Args = {Data, Ptr, Mask, EVL};
       const SmallVector<llvm::Type *, 2> Tys = {Data->getType(), PtrTy};
@@ -492,7 +497,8 @@ Value *TargetInfo::createMaskedGatherLoad(IRBuilder<> &B, Type *Ty, Value *Ptr,
   Constant *DefaultEleData = UndefValue::get(EleTy);
 
   if (Ty->isVectorTy()) {
-    const auto Legality = isVPGatherLegal(F, Ty, Alignment);
+    const auto Legality =
+        isVPGatherLegal(F, Ty, Alignment, PtrTy->getAddressSpace());
     if (EVL && Legality.isVPLegal()) {
       const SmallVector<llvm::Value *, 2> Args = {Ptr, Mask, EVL};
       const SmallVector<llvm::Type *, 2> Tys = {Ty, VecPtrTy};
@@ -590,7 +596,10 @@ Value *TargetInfo::createMaskedScatterStore(IRBuilder<> &B, Value *Data,
   if (DataTy->isVectorTy()) {
     auto *VecPtrTy = dyn_cast<VectorType>(Ptr->getType());
     VECZ_FAIL_IF(!VecPtrTy);
-    const auto Legality = isVPScatterLegal(F, DataTy, Alignment);
+    auto *PtrTy = dyn_cast<PointerType>(VecPtrTy->getElementType());
+    VECZ_FAIL_IF(!PtrTy);
+    const auto Legality =
+        isVPScatterLegal(F, DataTy, Alignment, PtrTy->getAddressSpace());
     if (EVL && Legality.isVPLegal()) {
       const SmallVector<llvm::Value *, 3> Args = {Data, Ptr, Mask, EVL};
       const SmallVector<llvm::Type *, 2> Tys = {Data->getType(), VecPtrTy};
@@ -896,18 +905,19 @@ bool TargetInfo::isVPVectorLegal(const Function &F, Type *Ty) const {
 
 TargetInfo::VPMemOpLegality TargetInfo::checkMemOpLegality(
     const Function *F,
-    function_ref<bool(const llvm::TargetTransformInfo &, Type *, unsigned)>
+    function_ref<bool(const llvm::TargetTransformInfo &, Type *, unsigned,
+                      unsigned)>
         Checker,
-    Type *Ty, unsigned Alignment) const {
+    Type *Ty, unsigned Alignment, unsigned AddrSpace) const {
   assert(Ty->isVectorTy() && "Expected a vector type");
   const bool isMaskLegal =
       !(isa<ScalableVectorType>(Ty) && TM_) ||
-      Checker(TM_->getTargetTransformInfo(*F), Ty, Alignment);
+      Checker(TM_->getTargetTransformInfo(*F), Ty, Alignment, AddrSpace);
   // Assuming a pointer bit width of 64
   bool isVPLegal = isMaskLegal && isVPVectorLegal(*F, Ty);
   if (isVPLegal) {
     const unsigned PtrBitWidth =
-        TM_ ? TM_->createDataLayout().getPointerSizeInBits(/*AS=*/0) : 64;
+        TM_ ? TM_->createDataLayout().getPointerSizeInBits(AddrSpace) : 64;
     auto &Ctx = Ty->getContext();
     auto *const IntTy = IntegerType::get(Ctx, PtrBitWidth);
     auto *const IntVecTy =
@@ -918,23 +928,23 @@ TargetInfo::VPMemOpLegality TargetInfo::checkMemOpLegality(
 }
 
 TargetInfo::VPMemOpLegality TargetInfo::isVPLoadLegal(
-    const Function *F, Type *Ty, unsigned Alignment) const {
-  return checkMemOpLegality(F, isLegalMaskedLoad, Ty, Alignment);
+    const Function *F, Type *Ty, unsigned Alignment, unsigned AddrSpace) const {
+  return checkMemOpLegality(F, isLegalMaskedLoad, Ty, Alignment, AddrSpace);
 }
 
 TargetInfo::VPMemOpLegality TargetInfo::isVPStoreLegal(
-    const Function *F, Type *Ty, unsigned Alignment) const {
-  return checkMemOpLegality(F, isLegalMaskedStore, Ty, Alignment);
+    const Function *F, Type *Ty, unsigned Alignment, unsigned AddrSpace) const {
+  return checkMemOpLegality(F, isLegalMaskedStore, Ty, Alignment, AddrSpace);
 }
 
 TargetInfo::VPMemOpLegality TargetInfo::isVPGatherLegal(
-    const Function *F, Type *Ty, unsigned Alignment) const {
-  return checkMemOpLegality(F, isLegalMaskedGather, Ty, Alignment);
+    const Function *F, Type *Ty, unsigned Alignment, unsigned AddrSpace) const {
+  return checkMemOpLegality(F, isLegalMaskedGather, Ty, Alignment, AddrSpace);
 }
 
 TargetInfo::VPMemOpLegality TargetInfo::isVPScatterLegal(
-    const Function *F, Type *Ty, unsigned Alignment) const {
-  return checkMemOpLegality(F, isLegalMaskedScatter, Ty, Alignment);
+    const Function *F, Type *Ty, unsigned Alignment, unsigned AddrSpace) const {
+  return checkMemOpLegality(F, isLegalMaskedScatter, Ty, Alignment, AddrSpace);
 }
 
 bool TargetInfo::isLegalVPElementType(Type *) const { return true; }

From a52b7d9be20fbf34f5abe835089a10a1913c9eba Mon Sep 17 00:00:00 2001
From: Colin Davidson <colin.davidson@codeplay.com>
Date: Tue, 8 Apr 2025 15:21:18 +0100
Subject: [PATCH 147/182] Remove all conditional LLVM code for LLVM 18

---
 .../include/compiler/utils/barrier_regions.h  |  4 ----
 .../include/multi_llvm/dibuilder.h            |  7 -------
 .../source/barrier_regions.cpp                |  2 --
 .../source/work_item_loops_pass.cpp           | 12 -----------
 .../analysis/uniform_value_analysis.cpp       |  4 ----
 .../transform/packetization_helpers.cpp       |  2 --
 .../vecz/source/transform/scalarizer.cpp      |  2 --
 .../lit/llvm/inlined_function_debug_info.ll   |  6 ++----
 .../lit/llvm/insert_element_debug_info.ll     |  3 +--
 .../test/lit/llvm/packetization_debug_info.ll | 18 ++++++----------
 .../vecz/test/lit/llvm/phi_node_debug_info.ll |  3 +--
 .../test/lit/llvm/scalarization_debug_info.ll | 21 +++++++------------
 12 files changed, 17 insertions(+), 67 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/barrier_regions.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/barrier_regions.h
index 0553b475e11e3..fe30907d8d2a4 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/barrier_regions.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/barrier_regions.h
@@ -155,13 +155,11 @@ class Barrier {
     return debug_intrinsics_;
   }
 
-#if LLVM_VERSION_GREATER_EQUAL(19, 0)
   using debug_variable_records_t =
       llvm::SmallVector<std::pair<llvm::DbgVariableRecord *, unsigned>, 4>;
   const debug_variable_records_t &getDebugDbgVariableRecords() const {
     return debug_variable_records_;
   }
-#endif
 
   /// @brief gets the original function
   llvm::Function &getFunc() { return func_; }
@@ -265,11 +263,9 @@ class Barrier {
   debug_stub_map_t barrier_stub_call_map_;
   /// @brief List of debug intrinsics and byte offsets into live variable struct
   debug_intrinsics_t debug_intrinsics_;
-#if LLVM_VERSION_GREATER_EQUAL(19, 0)
   /// @brief List of debug DbgVariableRecords and byte offsets into live
   /// variable struct
   debug_variable_records_t debug_variable_records_;
-#endif
 
   size_t size_t_bytes;
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/dibuilder.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/dibuilder.h
index b7276c25d77c8..0f0a06a723dbd 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/dibuilder.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/dibuilder.h
@@ -35,16 +35,9 @@ template <typename DIBuilder>
 struct DIBuilderWrapper : DIBuilder {
   using DIBuilder::DIBuilder;
 
-#if LLVM_VERSION_GREATER_EQUAL(19, 0)
   llvm::BasicBlock *getBasicBlock(llvm::InsertPosition InsertPt) {
     return InsertPt.getBasicBlock();
   }
-#else
-  llvm::BasicBlock *getBasicBlock(llvm::BasicBlock::iterator InsertPt) {
-    // Cannot handle sentinels.
-    return InsertPt->getParent();
-  }
-#endif
 
   auto insertDeclare(llvm::Value *Storage, llvm::DILocalVariable *VarInfo,
                      llvm::DIExpression *Expr, const llvm::DILocation *DL,
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
index 7c806722bcc4a..99de911fd8d71 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
@@ -975,12 +975,10 @@ void compiler::utils::Barrier::MakeLiveVariableMemType() {
         debug_intrinsics_.push_back(std::make_pair(dbgDeclare, offset));
       }
     }
-#if LLVM_VERSION_GREATER_EQUAL(19, 0)
     const auto DVRDeclares = findDVRDeclares(member.value);
     for (auto *const DVRDeclare : DVRDeclares) {
       debug_variable_records_.push_back(std::make_pair(DVRDeclare, offset));
     }
-#endif
     offset += member.size;
     live_variable_index_map_[std::make_pair(member.value, member.member_idx)] =
         field_tys.size();
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
index bb97c8959d5ad..05acfb64c801b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
@@ -290,7 +290,6 @@ struct ScheduleGenerator {
           /*AlwaysPreserve=*/false, DINode::FlagZero,
           old_var->getAlignInBits());
       // Create intrinsic
-#if LLVM_VERSION_GREATER_EQUAL(19, 0)
       if (!module.IsNewDbgInfoFormat) {
         auto *const DII = cast<Instruction *>(DIB.insertDeclare(
             barrier.getDebugAddr(), new_var, expr, wrapperDbgLoc, block));
@@ -314,26 +313,15 @@ struct ScheduleGenerator {
 
         DummyInst->eraseFromParent();
       }
-#else
-      auto *const DII = DIB.insertDeclare(barrier.getDebugAddr(), new_var, expr,
-                                          wrapperDbgLoc, block);
-
-      // Bit of a HACK to produce the same debug output as the Mem2Reg
-      // pass used to do.
-      auto *const DVIntrinsic = cast<DbgVariableIntrinsic>(DII);
-      ConvertDebugDeclareToDebugValue(DVIntrinsic, SI, DIB);
-#endif
     };
     for (auto debug_pair : barrier.getDebugIntrinsics()) {
       RecreateDebugIntrinsic(debug_pair.first->getVariable(),
                              debug_pair.second);
     }
-#if LLVM_VERSION_GREATER_EQUAL(19, 0)
     for (auto debug_pair : barrier.getDebugDbgVariableRecords()) {
       RecreateDebugIntrinsic(debug_pair.first->getVariable(),
                              debug_pair.second);
     }
-#endif
   }
 
   void createWorkItemLoopBody(
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp
index 558d225250cec..d4f5032b15982 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp
@@ -387,11 +387,7 @@ void UniformValueResult::markVaryingValues(Value *V, Value *From) {
   } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(VIns)) {
     // We need to clear the flags because the initial address may be out of
     // bounds but masked out.
-#if LLVM_VERSION_GREATER_EQUAL(19, 0)
     GEP->setNoWrapFlags(GEPNoWrapFlags::none());
-#else
-    GEP->setIsInBounds(false);
-#endif
 
     // Same as with the stores
     AllocaInst *Alloca = findAllocaFromPointer(GEP->getPointerOperand());
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
index 0342a23a7fa89..beda48d076d62 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
@@ -77,12 +77,10 @@ IRBuilder<> buildAfter(Value *V, Function &F, bool IsPhi) {
       ++Next;
     } while (!IsPhi && (Next != End) &&
              (isa<PHINode>(Next) || isa<AllocaInst>(Next)));
-#if LLVM_VERSION_GREATER_EQUAL(19, 0)
     // If there is debug info between this instruction and the next, insert
     // before the debug info. This is required for PHIs and makes sense for
     // other instructions too.
     Next.setHeadBit(true);
-#endif
     return {I->getParent(), Next};
   }
   // Else find the first point in the function after any allocas.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
index baf050562fee1..8e74840269f0f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
@@ -709,7 +709,6 @@ void Scalarizer::scalarizeDI(Instruction *Original, const SimdPacket *Packet,
     }
   };
 
-#if LLVM_VERSION_GREATER_EQUAL(19, 0)
   for (DbgVariableRecord *const DVR : LAM->getAllDbgVariableRecordUsers()) {
     DILocalVariable *DILocal = nullptr;
     DebugLoc DILoc;
@@ -730,7 +729,6 @@ void Scalarizer::scalarizeDI(Instruction *Original, const SimdPacket *Packet,
                                   Original->getIterator());
     });
   }
-#endif
 
   auto *const MDV = MetadataAsValue::getIfExists(Original->getContext(), LAM);
   if (!MDV) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/inlined_function_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/inlined_function_debug_info.ll
index a96bb1e109f23..98fdad1edadbe 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/inlined_function_debug_info.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/inlined_function_debug_info.ll
@@ -131,10 +131,8 @@ attributes #4 = { nobuiltin }
 
 ; CHECK: %[[LOAD1:[0-9]+]] = load i32, ptr addrspace(1) %{{.*}}, align 4
 ; CHECK: %[[LOAD2:[0-9]+]] = load i32, ptr addrspace(1) %{{.*}}, align 4
-; CHECK-GE19: #dbg_value(i32 %[[LOAD1]], !{{[0-9]+}}, !DIExpression(), [[DI_LOC1:![0-9]+]]
-; CHECK-LT19: call void @llvm.dbg.value(metadata i32 %[[LOAD1]], metadata !{{[0-9]+}}, metadata !DIExpression()), !dbg [[DI_LOC1:![0-9]+]]
-; CHECK-GE19: #dbg_value(i32 %[[LOAD2]], !{{[0-9]+}}, !DIExpression(), [[DI_LOC1]]
-; CHECK-LT19: call void @llvm.dbg.value(metadata i32 %[[LOAD2]], metadata !{{[0-9]+}}, metadata !DIExpression()), !dbg [[DI_LOC1]]
+; CHECK: #dbg_value(i32 %[[LOAD1]], !{{[0-9]+}}, !DIExpression(), [[DI_LOC1:![0-9]+]]
+; CHECK: #dbg_value(i32 %[[LOAD2]], !{{[0-9]+}}, !DIExpression(), [[DI_LOC1]]
 ; CHECK: %{{.*}} = mul nsw i32 %[[LOAD1]], %[[LOAD2]], !dbg [[DI_LOC2:![0-9]+]]
 
 ; CHECK: [[HELPER_SUBPROGRAM:![0-9]+]] = distinct !DISubprogram(name: "k_one",
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insert_element_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insert_element_debug_info.ll
index 2327a0207e8a2..f88e542a6a86a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insert_element_debug_info.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insert_element_debug_info.ll
@@ -50,8 +50,7 @@ entry:
 ; FIXME: This llvm.dbg.value marks a 'kill location' and denotes the
 ; termination of the previous value assigned to %tmp - we could probably do
 ; better here by manifesting a vectorized value?
-; CHECK-GE19: #dbg_value(i32 {{(poison|undef)}}, [[VAR:![0-9]+]],
-; CHECK-LT19: call void @llvm.dbg.value(metadata i32 {{(poison|undef)}}, metadata [[VAR:![0-9]+]],
+; CHECK: #dbg_value(i32 {{(poison|undef)}}, [[VAR:![0-9]+]],
 ; CHECK-SAME:   !DIExpression({{.*}}),
 ; CHECK-SAME:   !{{[0-9]+}}
   %1 = load i32, i32* %tid, align 4, !dbg !32
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_debug_info.ll
index 963c8f2da6ac4..4432b71a7d440 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_debug_info.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_debug_info.ll
@@ -36,28 +36,23 @@ entry:
   %a = alloca i32, align 4
   %b = alloca i32, align 4
   store i32 addrspace(1)* %in1, i32 addrspace(1)** %in1.addr, align 8
-; CHECK-GE19: #dbg_value(ptr addrspace(1) %in1, [[DI_IN1:![0-9]+]], [[EXPR:!DIExpression()]]
-; CHECK-LT19: call void @llvm.dbg.value(metadata ptr addrspace(1) %in1, metadata [[DI_IN1:![0-9]+]], metadata [[EXPR:!DIExpression()]]
+; CHECK: #dbg_value(ptr addrspace(1) %in1, [[DI_IN1:![0-9]+]], [[EXPR:!DIExpression()]]
 ; CHECK-SAME: [[PARAM_LOC:![0-9]+]]
   call void @llvm.dbg.declare(metadata i32 addrspace(1)** %in1.addr, metadata !11, metadata !29), !dbg !30
   store i32 addrspace(1)* %in2, i32 addrspace(1)** %in2.addr, align 8
-; CHECK-GE19: #dbg_value(ptr addrspace(1) %in2, [[DI_IN2:![0-9]+]], [[EXPR]]
-; CHECK-LT19: call void @llvm.dbg.value(metadata ptr addrspace(1) %in2, metadata [[DI_IN2:![0-9]+]], metadata [[EXPR]]
+; CHECK: #dbg_value(ptr addrspace(1) %in2, [[DI_IN2:![0-9]+]], [[EXPR]]
 ; CHECK-SAME: [[PARAM_LOC]]
   call void @llvm.dbg.declare(metadata i32 addrspace(1)** %in2.addr, metadata !12, metadata !29), !dbg !30
   store i32 addrspace(1)* %out, i32 addrspace(1)** %out.addr, align 8
-; CHECK-GE19: #dbg_value(ptr addrspace(1) %out, [[DI_OUT:![0-9]+]], [[EXPR]]
-; CHECK-LT19: call void @llvm.dbg.value(metadata ptr addrspace(1) %out, metadata [[DI_OUT:![0-9]+]], metadata [[EXPR]]
+; CHECK: #dbg_value(ptr addrspace(1) %out, [[DI_OUT:![0-9]+]], [[EXPR]]
 ; CHECK-SAME: [[PARAM_LOC]]
   call void @llvm.dbg.declare(metadata i32 addrspace(1)** %out.addr, metadata !13, metadata !29), !dbg !30
-; CHECK-GE19: #dbg_value(i64 %call, [[DI_TID:![0-9]+]], [[EXPR]]
-; CHECK-LT19: call void @llvm.dbg.value(metadata i64 %call, metadata [[DI_TID:![0-9]+]], metadata [[EXPR]]
+; CHECK: #dbg_value(i64 %call, [[DI_TID:![0-9]+]], [[EXPR]]
 ; CHECK-SAME: [[TID_LOC:![0-9]+]]
   call void @llvm.dbg.declare(metadata i64* %tid, metadata !14, metadata !29), !dbg !31
   %call = call i64 @__mux_get_global_id(i32 0) #3, !dbg !31
   store i64 %call, i64* %tid, align 8, !dbg !31
-; CHECK-GE19: #dbg_value(i32 {{undef|poison}}, [[DI_A:![0-9]+]], !DIExpression(),
-; CHECK-LT19: call void @llvm.dbg.value(metadata i32 undef, metadata [[DI_A:![0-9]+]], metadata !DIExpression())
+; CHECK: #dbg_value(i32 {{undef|poison}}, [[DI_A:![0-9]+]], !DIExpression(),
 ; CHECK-SAME: [[A_LOC:![0-9]+]]
   call void @llvm.dbg.declare(metadata i32* %a, metadata !19, metadata !29), !dbg !32
   %0 = load i64, i64* %tid, align 8, !dbg !32
@@ -65,8 +60,7 @@ entry:
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %1, i64 %0, !dbg !32
   %2 = load i32, i32 addrspace(1)* %arrayidx, align 4, !dbg !32
   store i32 %2, i32* %a, align 4, !dbg !32
-; CHECK-GE19: #dbg_value(i32 {{undef|poison}}, [[DI_B:![0-9]+]], !DIExpression(),
-; CHECK-LT19: call void @llvm.dbg.value(metadata i32 undef, metadata [[DI_B:![0-9]+]], metadata !DIExpression())
+; CHECK: #dbg_value(i32 {{undef|poison}}, [[DI_B:![0-9]+]], !DIExpression(),
 ; CHECK-SAME: [[B_LOC:![0-9]+]]
   call void @llvm.dbg.declare(metadata i32* %b, metadata !20, metadata !29), !dbg !33
   %3 = load i64, i64* %tid, align 8, !dbg !33
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_node_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_node_debug_info.ll
index 5f1ddd384e5ba..206b5b9e3eec3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_node_debug_info.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_node_debug_info.ll
@@ -48,8 +48,7 @@ entry:
 
 ; CHECK: for.cond:
 ; CHECK: %[[PHI1:.+]] = phi {{i[0-9]+}} [ %{{.+}}, %entry ], [ %{{.+}}, %for.cond ]
-; CHECK-GE19: #dbg_value(i64 %[[PHI1]], !{{[0-9]+}},
-; CHECK-LT19: call void @llvm.dbg.value(metadata i64 %[[PHI1]], metadata !{{[0-9]+}},
+; CHECK: #dbg_value(i64 %[[PHI1]], !{{[0-9]+}},
 ; CHECK-SAME: !DIExpression({{.*}}),
 ; CHECK-SAME: !{{[0-9]+}}
 ; Check we haven't inserted a llvm.dbg.value intrinsic before the last of the PHIs.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_debug_info.ll
index 6c75e223b8cd4..1848a31b1d4c7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_debug_info.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_debug_info.ll
@@ -132,32 +132,25 @@ attributes #3 = { nobuiltin }
 ; CHECK: @__vecz_v[[WIDTH:[0-9]+]]_mul2({{.*}} !dbg [[VECZ_SUBPROG:![0-9]+]]
 
 ; Check that intrinsics for user variable locations are still present
-; CHECK-LT19: call void @llvm.dbg.value(metadata {{.*}} %in1, metadata [[DI_IN1:![0-9]+]], metadata [[EXPR:!DIExpression()]]
-; CHECK-GE19: #dbg_value({{.*}} %in1, [[DI_IN1:![0-9]+]], [[EXPR:!DIExpression()]]
+; CHECK: #dbg_value({{.*}} %in1, [[DI_IN1:![0-9]+]], [[EXPR:!DIExpression()]]
 ; CHECK-SAME: [[PARAM_LOC:![0-9]+]]
 
-; CHECK-LT19: call void @llvm.dbg.value(metadata {{.*}} %in2, metadata [[DI_IN2:![0-9]+]], metadata [[EXPR]]
-; CHECK-GE19: #dbg_value({{.*}} %in2, [[DI_IN2:![0-9]+]], [[EXPR]]
+; CHECK: #dbg_value({{.*}} %in2, [[DI_IN2:![0-9]+]], [[EXPR]]
 ; CHECK-SAME: [[PARAM_LOC]]
 
-; CHECK-LT19: call void @llvm.dbg.value(metadata {{.*}} %out, metadata [[DI_OUT:![0-9]+]], metadata [[EXPR]]
-; CHECK-GE19: #dbg_value({{.*}} %out, [[DI_OUT:![0-9]+]], [[EXPR]]
+; CHECK: #dbg_value({{.*}} %out, [[DI_OUT:![0-9]+]], [[EXPR]]
 ; CHECK-SAME: [[PARAM_LOC]]
 
-; CHECK-LT19: call void @llvm.dbg.value(metadata i64 %call, metadata [[DI_TID:![0-9]+]], metadata [[EXPR]]
-; CHECK-GE19: #dbg_value(i64 %call, [[DI_TID:![0-9]+]], [[EXPR]]
+; CHECK: #dbg_value(i64 %call, [[DI_TID:![0-9]+]], [[EXPR]]
 ; CHECK-SAME: [[TID_LOC:![0-9]+]]
 
-; CHECK-LT19: call void @llvm.dbg.declare(metadata ptr %a, metadata [[DI_A:![0-9]+]], metadata [[EXPR]]
-; CHECK-GE19: #dbg_declare(ptr %a, [[DI_A:![0-9]+]], [[EXPR]]
+; CHECK: #dbg_declare(ptr %a, [[DI_A:![0-9]+]], [[EXPR]]
 ; CHECK-SAME: [[A_LOC:![0-9]+]]
 
-; CHECK-LT19: call void @llvm.dbg.declare(metadata ptr %b, metadata [[DI_B:![0-9]+]], metadata [[EXPR]]
-; CHECK-GE19: #dbg_declare(ptr %b, [[DI_B:![0-9]+]], [[EXPR]]
+; CHECK: #dbg_declare(ptr %b, [[DI_B:![0-9]+]], [[EXPR]]
 ; CHECK-SAME: [[B_LOC:![0-9]+]]
 
-; CHECK-LT19: call void @llvm.dbg.declare(metadata ptr %tmp, metadata [[DI_TMP:![0-9]+]], metadata [[EXPR]]
-; CHECK-GE19: #dbg_declare(ptr %tmp, [[DI_TMP:![0-9]+]], [[EXPR]]
+; CHECK: #dbg_declare(ptr %tmp, [[DI_TMP:![0-9]+]], [[EXPR]]
 ; CHECK-SAME: [[TMP_LOC:![0-9]+]]
 
 ; Debug info metadata entries

From e35e1993a3bb04089f8502d44fbd3fe764fdce18 Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Wed, 9 Apr 2025 16:34:03 +0100
Subject: [PATCH 148/182] Do not vectorize NoInline kernels.

A recent LLVM 21 change prevents us from being able to vectorize
NoInline kernels, but it was probably never a good idea to vectorize
them anyway: it is not clear whether the vectorization we perform is
something that a user who writes `__attribute__((noinline))` would want.
Therefore, explicitly reject vectorization of NoInline functions and
restore affected tests to their original version that permitted them to
not be vectorized.
---
 .../vectorizable_function_analysis.cpp        | 29 ++++++++++---------
 .../analysis/vectorizable_function_analysis.h |  4 ---
 2 files changed, 15 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorizable_function_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorizable_function_analysis.cpp
index 96ee366e8fe86..7afc0e48dd6fc 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorizable_function_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorizable_function_analysis.cpp
@@ -47,7 +47,7 @@ namespace {
 /// @param[in] I Instruction to check for vectorizability.
 /// @param[in] Ctx VectorizationContext for BuiltinInfo.
 ///
-/// @return true if I can be vectorized.
+/// @return true if I can be vectorized, false otherwise.
 bool canVectorize(const Instruction &I, const VectorizationContext &Ctx) {
   // Certain instructions just cannot appear.
   switch (I.getOpcode()) {
@@ -98,18 +98,26 @@ bool canVectorize(const Instruction &I, const VectorizationContext &Ctx) {
 /// @param[in] F Function to check for vectorizability.
 /// @param[in] Ctx VectorizationContext for BuiltinInfo.
 ///
-/// @return the Instruction that prevents the function from vectorizing, or
-/// nullptr if the function can be vectorized.
-const Value *canVectorize(const Function &F, const VectorizationContext &Ctx) {
+/// @return true if F can be vectorized, false otherwise.
+bool canVectorize(const Function &F, const VectorizationContext &Ctx) {
+  // Do not vectorize functions with the OptNone attribute. Also do not
+  // vectorize functions with the NoInline attribute, since conceptually, the
+  // vectorized kernel calls the original kernel in a loop, and then that gets
+  // inlined and optimized.
+  if (F.hasFnAttribute(Attribute::OptimizeNone) ||
+      F.hasFnAttribute(Attribute::NoInline)) {
+    return false;
+  }
+
   // Look for things that are not (yet?) supported.
   for (const BasicBlock &BB : F) {
     for (const Instruction &I : BB) {
       if (!canVectorize(I, Ctx)) {
-        return &I;
+        return false;
       }
     }
   }
-  return nullptr;
+  return true;
 }
 
 }  // namespace
@@ -119,13 +127,6 @@ VectorizableFunctionAnalysis::Result VectorizableFunctionAnalysis::run(
   Result res;
   auto &Ctx = AM.getResult<VectorizationContextAnalysis>(F).getContext();
 
-  // Do not vectorize functions with the OptNone attribute
-  if (F.hasFnAttribute(Attribute::OptimizeNone)) {
-    res.canVectorize = false;
-    return res;
-  }
-
-  res.failedAt = canVectorize(F, Ctx);
-  res.canVectorize = !res.failedAt;
+  res.canVectorize = canVectorize(F, Ctx);
   return res;
 }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorizable_function_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorizable_function_analysis.h
index 230e5aa883919..3a6eebb423b0c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorizable_function_analysis.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorizable_function_analysis.h
@@ -44,10 +44,6 @@ class VectorizableFunctionAnalysis
     /// @brief Whether the function can be vectorized.
     bool canVectorize = false;
 
-    /// @brief If the function can not be vectorized, the value (if any) that
-    /// is the cause of the problem.
-    const llvm::Value *failedAt = nullptr;
-
     /// @brief Handle invalidation events from the new pass manager.
     ///
     /// @return false, as this analysis can never be invalidated.

From a337576a007d90922894c5e05ea0937c90c4834a Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Tue, 15 Apr 2025 18:27:24 +0100
Subject: [PATCH 149/182] [LLVM 21] Avoid calling spir_kernel functions.

LLVM 21 no longer permits calls to spir_kernel functions, not even if
they would be patched out by later passes.
---
 .../compiler_pipeline/source/pass_functions.cpp           | 5 +++++
 .../source/replace_local_module_scope_variables_pass.cpp  | 5 +++++
 .../vecz/test/lit/llvm/multiple_kernels_inlining.ll       | 8 ++++----
 3 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp
index 392ee922235dc..558fd4ef40cb1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp
@@ -615,6 +615,11 @@ static llvm::Function *createKernelWrapperFunctionImpl(
   // copy the calling convention from the old function
   NewFunction.setCallingConv(F.getCallingConv());
 
+  // and remove spir_kernel from the old function
+  if (F.getCallingConv() == llvm::CallingConv::SPIR_KERNEL) {
+    F.setCallingConv(llvm::CallingConv::SPIR_FUNC);
+  }
+
   // copy the metadata into the new kernel ignoring any debug info.
   copyFunctionMetadata(F, NewFunction);
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp
index e6b2de85afc4e..d514f1e5be183 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp
@@ -640,6 +640,11 @@ PreservedAnalyses compiler::utils::ReplaceLocalModuleScopeVariablesPass::run(
     // copy the calling convention too
     newFunc->setCallingConv(kernelFunc->getCallingConv());
 
+    // and clear spir_kernel from the original function
+    if (kernelFunc->getCallingConv() == llvm::CallingConv::SPIR_KERNEL) {
+      kernelFunc->setCallingConv(llvm::CallingConv::SPIR_FUNC);
+    }
+
     // we don't use exceptions
     newFunc->addFnAttr(Attribute::NoUnwind);
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_kernels_inlining.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_kernels_inlining.ll
index 37b77bf8d00f5..1ce80a13e77b7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_kernels_inlining.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_kernels_inlining.ll
@@ -19,7 +19,7 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-define spir_kernel void @foo1(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+define void @foo1(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %call = call i64 @__mux_get_global_id(i32 0)
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
@@ -31,15 +31,15 @@ entry:
 
 declare i64 @__mux_get_global_id(i32)
 
-define spir_kernel void @foo2(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+define void @foo2(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  call spir_kernel void @foo1(i32 addrspace(1)* %in, i32 addrspace(1)* %out)
+  call void @foo1(i32 addrspace(1)* %in, i32 addrspace(1)* %out)
   ret void
 }
 
 define spir_kernel void @foo3(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
-  call spir_kernel void @foo2(i32 addrspace(1)* %in, i32 addrspace(1)* %out)
+  call void @foo2(i32 addrspace(1)* %in, i32 addrspace(1)* %out)
   ret void
 }
 

From 1168a1892fd854a0996cba40ec2e3c0bee2d3b4d Mon Sep 17 00:00:00 2001
From: Colin Davidson <colin.davidson@codeplay.com>
Date: Fri, 18 Apr 2025 16:01:13 +0100
Subject: [PATCH 150/182] Fix up Intrinsic::getAttributes to work across LLVM
 versions

Support multiple Intrinsic::getAttributes() versions by using templates
to allow support for both. This can be simplified in the future once
DPC++ catches up with LLVM in this aspect.
---
 .../include/multi_llvm/intrinsic.h            | 26 +++++++++++++++++++
 .../compiler_pipeline/source/builtin_info.cpp |  3 ++-
 2 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/intrinsic.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/intrinsic.h
index 3c1f1560ceda9..0ca8be0f867fb 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/intrinsic.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/intrinsic.h
@@ -17,9 +17,11 @@
 #ifndef MULTI_LLVM_MULTI_INTRINSIC_H_INCLUDED
 #define MULTI_LLVM_MULTI_INTRINSIC_H_INCLUDED
 
+#include <llvm/IR/Intrinsics.h>
 #include <multi_llvm/llvm_version.h>
 
 namespace multi_llvm {
+
 static inline auto GetOrInsertIntrinsicDeclaration(
     llvm::Module *M, llvm::Intrinsic::ID id,
     llvm::ArrayRef<llvm::Type *> Tys = {}) {
@@ -30,6 +32,30 @@ static inline auto GetOrInsertIntrinsicDeclaration(
 #endif
 }
 
+// Drop getAttributes workaround when LLVM 20 is minimum version
+// This can also be simplified once DPC++ catches up with getAttributes
+// with FunctionType as the last argument.
+namespace detail {
+template <typename... T>
+auto getAttributes(T... args)
+    -> decltype(llvm::Intrinsic::getAttributes(args...)) {
+  return llvm::Intrinsic::getAttributes(args...);
+}
+template <typename... T>
+auto getAttributes(T... args, llvm::FunctionType *)
+    -> decltype(llvm::Intrinsic::getAttributes(args...)) {
+  return llvm::Intrinsic::getAttributes(args...);
+}
+}  // namespace detail
+
+namespace Intrinsic {
+static inline auto getAttributes(llvm::LLVMContext &C, llvm::Intrinsic::ID ID,
+                                 llvm::FunctionType *FT) {
+  return detail::getAttributes<llvm::LLVMContext &, llvm::Intrinsic::ID>(C, ID,
+                                                                         FT);
+}
+}  // namespace Intrinsic
+
 }  // namespace multi_llvm
 
 #endif  // MULTI_LLVM_MULTI_INTRINSIC_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/builtin_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/builtin_info.cpp
index f6c4076dd830a..75f1c307cc99a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/builtin_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/builtin_info.cpp
@@ -336,7 +336,8 @@ Builtin BuiltinInfo::analyzeBuiltin(const Function &F) const {
     int32_t Properties = eBuiltinPropertyNone;
 
     const Intrinsic::ID IntrID = (Intrinsic::ID)F.getIntrinsicID();
-    const AttributeList AS = Intrinsic::getAttributes(F.getContext(), IntrID);
+    const AttributeList AS = multi_llvm::Intrinsic::getAttributes(
+        F.getContext(), IntrID, F.getFunctionType());
     const bool NoSideEffect = F.onlyReadsMemory();
     bool SafeIntrinsic = false;
     switch (IntrID) {

From 0423986f9e859dd39ce390703357e53fbf13023e Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Thu, 24 Apr 2025 00:30:17 +0100
Subject: [PATCH 151/182] Update to address PointerType deprecation.

LLVM 21 deprecates the functions to get PointerTypes with specific
elements, as they were left over from the transition to opaque pointers
and have not been needed ever since.

* Change PointerType::get(EltTy, AS) to PointerType::get(Ctx, AS).
* Change PointerType::get(EltTy, PtrTy->getAddressSpace()) to PtrTy.
* Use IRBuilder instead of the above when one is available.
* Remove code to emit and handle bitcasts between pointer types.
* Remove long-dead NDEBUG_WI_LOOPS debug option which emits debug print
  instructions directly in kernels.

This is not strictly NFC, but mostly NFC: there are no changes to the
handling of any IR that is generated by current versions of LLVM.
---
 .../source/barrier_regions.cpp                |  13 +--
 .../source/cl_builtin_info.cpp                |  23 ++--
 .../compiler_pipeline/source/mangling.cpp     |   8 +-
 .../source/mux_builtin_info.cpp               |  12 +--
 ...lace_local_module_scope_variables_pass.cpp |   2 +-
 .../source/work_item_loops_pass.cpp           |  71 ------------
 .../vecz/source/memory_operations.cpp         |   6 +-
 .../vecz/source/transform/packetizer.cpp      |  17 +--
 .../vecz/source/transform/scalarizer.cpp      | 102 +++++-------------
 .../transform/squash_small_vectors_pass.cpp   |  12 +--
 .../vecz/source/vector_target_info.cpp        |  35 ++----
 .../test/lit/llvm/no_redundant_bitcasts.ll    |  93 ----------------
 12 files changed, 52 insertions(+), 342 deletions(-)
 delete mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_redundant_bitcasts.ll

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
index 99de911fd8d71..62f4afa86fb26 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
@@ -312,10 +312,6 @@ Value *compiler::utils::Barrier::LiveValuesHelper::getGEP(const Value *live,
   }
 
   Value *gep;
-  Type *data_ty = live->getType();
-  if (auto *AI = dyn_cast<AllocaInst>(live)) {
-    data_ty = AI->getAllocatedType();
-  }
 
   if (auto field_it = barrier.live_variable_index_map_.find(key);
       field_it != barrier.live_variable_index_map_.end()) {
@@ -356,13 +352,6 @@ Value *compiler::utils::Barrier::LiveValuesHelper::getGEP(const Value *live,
     gep = gepBuilder.CreateInBoundsGEP(
         barrier.live_var_mem_ty_, barrier_struct, live_variable_info_idxs,
         Twine("live_gep_scalable_") + live->getName());
-
-    // Cast the pointer to the scalable vector type
-    gep = gepBuilder.CreatePointerCast(
-        gep,
-        PointerType::get(
-            data_ty,
-            cast<PointerType>(barrier_struct->getType())->getAddressSpace()));
   } else {
     // Fall back and see if this live variable is actually a decomposed
     // structure type.
@@ -1071,7 +1060,7 @@ Function *compiler::utils::Barrier::GenerateNewKernel(BarrierRegion &region) {
   const bool hasBarrierStruct = !whole_live_variables_set_.empty() &&
                                 region.schedule != BarrierSchedule::Once;
   if (hasBarrierStruct) {
-    PointerType *pty = PointerType::get(live_var_mem_ty_, 0);
+    PointerType *pty = PointerType::get(context, /*AddressSpace=*/0);
     new_func_params.push_back(pty);
   }
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp
index c78252cd0f9e2..bf156687fd88e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp
@@ -1295,8 +1295,7 @@ Function *CLBuiltinInfo::getVectorEquivalent(const Builtin &B, unsigned Width,
         if (!FixedVectorType::isValidElementType(PtrRetPointeeTy)) {
           return nullptr;
         }
-        Type *NewEleTy = FixedVectorType::get(PtrRetPointeeTy, Width);
-        Type *NewType = PointerType::get(NewEleTy, OldPtrTy->getAddressSpace());
+        Type *NewType = OldPtrTy;
         TypeQualifiers NewQuals;
         TypeQualifiers EleQuals = OldQuals;
         NewQuals.push_back(EleQuals.pop_front());  // Pointer qualifier
@@ -1413,9 +1412,7 @@ Function *CLBuiltinInfo::getScalarEquivalent(const Builtin &B, Module *M) {
         [[maybe_unused]] auto *OldPointeeTy = BuiltinPointeeTypes[i];
         assert(OldPointeeTy && OldPointeeTy == PtrRetPointeeTy &&
                "Demangling inconsistency");
-        auto *OldVecTy = cast<FixedVectorType>(PtrRetPointeeTy);
-        Type *NewTy = PointerType::get(OldVecTy->getElementType(),
-                                       OldPtrTy->getAddressSpace());
+        Type *NewTy = OldPtrTy;
         TypeQualifiers NewQuals = OldQuals;
         const TypeQualifier PtrQual = NewQuals.pop_front();
         const TypeQualifier VecQual = NewQuals.pop_front();
@@ -2190,8 +2187,7 @@ Value *CLBuiltinInfo::emitBuiltinInlineVLoad(Function *F, unsigned Width,
       Data = B.CreateInsertElement(Data, Lane, Index, "vload_insert");
     }
   } else {
-    Value *VecBase = B.CreateBitCast(GEPBase, PtrTy, "vload_ptr");
-    auto *Load = B.CreateLoad(DataTy, VecBase, false, "vload");
+    auto *Load = B.CreateLoad(DataTy, GEPBase, false, "vload");
 
     const unsigned Align = DataTy->getScalarSizeInBits() / 8;
     Load->setAlignment(MaybeAlign(Align).valueOrOne());
@@ -2250,8 +2246,7 @@ Value *CLBuiltinInfo::emitBuiltinInlineVStore(Function *F, unsigned Width,
       Store = B.CreateStore(Lane, GEP, false);
     }
   } else {
-    Value *VecBase = B.CreateBitCast(GEPBase, PtrTy, "vstore_ptr");
-    Store = B.CreateStore(Data, VecBase, false);
+    Store = B.CreateStore(Data, GEPBase, false);
 
     const unsigned Align = VecDataTy->getScalarSizeInBits() / 8;
     Store->setAlignment(MaybeAlign(Align).valueOrOne());
@@ -2279,12 +2274,10 @@ Value *CLBuiltinInfo::emitBuiltinInlineVLoadHalf(Function *F, IRBuilder<> &B,
     return nullptr;
   }
   Type *U16Ty = B.getInt16Ty();
-  Type *U16PtrTy = PointerType::get(U16Ty, PtrTy->getAddressSpace());
-  Value *DataPtr = B.CreateBitCast(Ptr, U16PtrTy);
 
   // Emit the base pointer.
   Value *Offset = Args[0];
-  DataPtr = B.CreateGEP(U16Ty, DataPtr, Offset, "vload_base");
+  Value *DataPtr = B.CreateGEP(U16Ty, Ptr, Offset, "vload_base");
 
   // Load a ushort.
   Value *Data = B.CreateLoad(B.getInt16Ty(), DataPtr, "vload_half");
@@ -2368,12 +2361,10 @@ Value *CLBuiltinInfo::emitBuiltinInlineVStoreHalf(Function *F, StringRef Mode,
     return nullptr;
   }
   auto U16Ty = B.getInt16Ty();
-  Type *U16PtrTy = PointerType::get(U16Ty, PtrTy->getAddressSpace());
-  Value *DataPtr = B.CreateBitCast(Ptr, U16PtrTy);
 
   // Emit the base pointer.
   Value *Offset = Args[1];
-  DataPtr = B.CreateGEP(U16Ty, DataPtr, Offset, "vstore_base");
+  Value *DataPtr = B.CreateGEP(U16Ty, Ptr, Offset, "vstore_base");
 
   // Store the ushort.
   return B.CreateStore(Data, DataPtr);
@@ -2644,7 +2635,7 @@ Value *CLBuiltinInfo::emitBuiltinInlinePrintf(BuiltinID, IRBuilder<> &B,
   // Declare printf if needed.
   Function *Printf = M.getFunction("printf");
   if (!Printf) {
-    PointerType *PtrTy = PointerType::getUnqual(B.getInt8Ty());
+    PointerType *PtrTy = B.getPtrTy(/*AddressSpace=*/0);
     FunctionType *PrintfTy = FunctionType::get(B.getInt32Ty(), {PtrTy}, true);
     Printf =
         Function::Create(PrintfTy, GlobalValue::ExternalLinkage, "printf", &M);
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mangling.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mangling.cpp
index e2f41d3b39017..3597b72eeca10 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mangling.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mangling.cpp
@@ -573,7 +573,7 @@ bool NameMangler::demangleType(Lexer &L, Type *&Ty, Type **PointerEltTy,
       return false;
     }
     Quals.push_back(QualsAS->Qual);
-    return PointerType::get(nullptr, QualsAS->AS);
+    return llvm::PointerType::get(*Context, QualsAS->AS);
   }
 
   // Match scalable vector types.
@@ -620,11 +620,7 @@ bool NameMangler::demangleType(Lexer &L, Type *&Ty, Type **PointerEltTy,
     if (PointerEltTy) {
       *PointerEltTy = ElementType;
     }
-    if (ElementType->isVoidTy()) {
-      Ty = llvm::PointerType::get(Type::getInt8Ty(*Context), QualsAS->AS);
-    } else {
-      Ty = llvm::PointerType::get(ElementType, QualsAS->AS);
-    }
+    Ty = llvm::PointerType::get(*Context, QualsAS->AS);
     return true;
   }
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mux_builtin_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mux_builtin_info.cpp
index 155d1380e0242..706a197a75a97 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mux_builtin_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mux_builtin_info.cpp
@@ -897,13 +897,7 @@ Type *BIMuxInfoConcept::getRemappedTargetExtTy(Type *Ty, Module &M) {
   // space to the same structure type (i.e., regardless of image dimensions,
   // etc.)
   if (TgtExtTy->getName() == "spirv.Image") {
-    return PointerType::getUnqual([&Ctx]() {
-      const char *MuxImageTyName = "MuxImage";
-      if (auto *STy = StructType::getTypeByName(Ctx, MuxImageTyName)) {
-        return STy;
-      }
-      return StructType::create(Ctx, MuxImageTyName);
-    }());
+    return PointerType::getUnqual(Ctx);
   }
 
   return nullptr;
@@ -1210,7 +1204,7 @@ BIMuxInfoConcept::getMuxSchedulingParameters(Module &M) {
     auto *const WIInfoS = getWorkItemInfoStructTy(M);
     WIInfo.ID = SchedParamIndices::WI;
     WIInfo.ParamPointeeTy = WIInfoS;
-    WIInfo.ParamTy = PointerType::get(WIInfoS, /*AddressSpace=*/0);
+    WIInfo.ParamTy = PointerType::get(Ctx, /*AddressSpace=*/0);
     WIInfo.ParamName = "wi-info";
     WIInfo.ParamDebugName = WIInfoS->getStructName().str();
     WIInfo.PassedExternally = false;
@@ -1226,7 +1220,7 @@ BIMuxInfoConcept::getMuxSchedulingParameters(Module &M) {
     auto *const WGInfoS = getWorkGroupInfoStructTy(M);
     WGInfo.ID = SchedParamIndices::WG;
     WGInfo.ParamPointeeTy = WGInfoS;
-    WGInfo.ParamTy = PointerType::get(WGInfoS, /*AddressSpace=*/0);
+    WGInfo.ParamTy = PointerType::get(Ctx, /*AddressSpace=*/0);
     WGInfo.ParamName = "wg-info";
     WGInfo.ParamDebugName = WGInfoS->getStructName().str();
     WGInfo.PassedExternally = true;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp
index d514f1e5be183..1be6cdaec9226 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp
@@ -355,7 +355,7 @@ PreservedAnalyses compiler::utils::ReplaceLocalModuleScopeVariablesPass::run(
   // change all our functions to take a pointer to the new structTy we created
   const AttributeSet defaultAttrs;
   addParamToAllRequiredFunctions(
-      M, PointerType::get(structTy, /*AddressSpace=*/0), defaultAttrs);
+      M, PointerType::get(M.getContext(), /*AddressSpace=*/0), defaultAttrs);
 
   // Check if we have debug info, if so we need to fix it up to turn global
   // variable entries into local variable ones.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
index 05acfb64c801b..2462d18fe7aa1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
@@ -36,7 +36,6 @@
 
 using namespace llvm;
 
-#define NDEBUG_WI_LOOPS
 #define DEBUG_TYPE "work-item-loops"
 
 namespace compiler {
@@ -99,65 +98,6 @@ class BarrierWithLiveVars : public Barrier {
 }  // namespace compiler
 
 namespace {
-#ifndef NDEBUG_WI_LOOPS
-/// @brief Generate IR level printf function call Debug function only.
-///
-/// @param[in] format Format string string.
-/// @param[in] module Current module.
-/// @param[in] v Value for printing.
-/// @param[in] bb Basic block insertion point for @p v.
-///
-/// @return Return instruction to be checked.
-Instruction *IRPrintf(const std::string format, Module &module, Value *v,
-                      BasicBlock *bb) {
-  LLVMContext &context = module.getContext();
-  PointerType *ptr_type = PointerType::getUnqual(IntegerType::get(context, 8));
-
-  SmallVector<Type *, 16> args;
-  args.push_back(ptr_type);
-  FunctionType *printf_type =
-      FunctionType::get(IntegerType::get(context, 32), args, true);
-
-  bool isDeclared = true;
-  Function *func_printf = module.getFunction("printf");
-  if (!func_printf) {
-    func_printf = Function::Create(printf_type, GlobalValue::ExternalLinkage,
-                                   "printf", &module);
-    isDeclared = false;
-  }
-
-  ArrayType *array_type =
-      ArrayType::get(IntegerType::get(context, 8), format.size() + 1);
-  GlobalVariable *str;
-  if (isDeclared) {
-    str = new GlobalVariable(
-        module, array_type, true, GlobalValue::PrivateLinkage, 0, ".str",
-        nullptr, GlobalValue::ThreadLocalMode::NotThreadLocal, 2, false);
-  } else {
-    str = new GlobalVariable(
-        module, array_type, true, GlobalValue::PrivateLinkage, 0, ".str",
-        nullptr, GlobalValue::ThreadLocalMode::NotThreadLocal, 0, false);
-  }
-  str->setAlignment(MaybeAlign(1));
-
-  Constant *const_array = ConstantDataArray::getString(context, format, true);
-  SmallVector<Constant *, 16> indices;
-  ConstantInt *cst_8 = ConstantInt::get(context, APInt(64, StringRef("0"), 10));
-  indices.push_back(cst_8);
-  indices.push_back(cst_8);
-  Constant *cst_ptr = ConstantExpr::getGetElementPtr(nullptr, str, indices);
-
-  str->setInitializer(const_array);
-
-  SmallVector<Value *, 8> call_params;
-  call_params.push_back(cst_ptr);
-  call_params.push_back(v);
-
-  CallInst *call = CallInst::Create(func_printf, call_params, "", bb);
-
-  return call;
-}
-#endif  // NDEBUG_WI_LOOPS
 
 Value *materializeVF(IRBuilder<> &builder,
                      compiler::utils::VectorizationFactor vf) {
@@ -235,13 +175,6 @@ struct ScheduleGenerator {
       Value *const live_var_mem_idxs[] = {byteOffset};
       live_var_ptr =
           ir.CreateInBoundsGEP(ir.getInt8Ty(), mem_space, live_var_mem_idxs);
-
-      // cast to the live mem type
-      live_var_ptr = ir.CreatePointerCast(
-          live_var_ptr,
-          PointerType::get(
-              barrier.getLiveVarsType(),
-              cast<PointerType>(live_var_ptr->getType())->getAddressSpace()));
     }
 
     return live_var_ptr;
@@ -380,10 +313,6 @@ struct ScheduleGenerator {
     ci->setCallingConv(subkernel.getCallingConv());
     ci->setAttributes(compiler::utils::getCopiedFunctionAttrs(subkernel));
 
-#ifndef NDEBUG_WI_LOOPS
-    IRPrintf(std::string("return.kernel.body=%d\x0A"), module, ci, block);
-#endif  // NDEBUG_WI_LOOPS
-
     // And update the location of where we need to go to next (if we need to)
     const auto &successors = barrier.getSuccessorIds(i);
     if (successors.size() > 1) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/memory_operations.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/memory_operations.cpp
index 7a1087504d6e5..aa000931e98a7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/memory_operations.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/memory_operations.cpp
@@ -113,8 +113,7 @@ static CallInst *createMaskedMemOp(VectorizationContext &Ctx, Value *Data,
   VECZ_FAIL_IF(!Ptr || !Ptr->getType()->isPointerTy());
   VECZ_FAIL_IF(!Mask);
   assert(!Data || Data->getType() == DataTy);
-  auto *PtrTy =
-      PointerType::get(DataTy, Ptr->getType()->getPointerAddressSpace());
+  auto *PtrTy = cast<PointerType>(Ptr->getType());
   Function *F =
       getOrCreateMaskedMemOpFn(Ctx, DataTy, PtrTy, Alignment,
                                /*IsLoad*/ Data == nullptr, EVL != nullptr);
@@ -242,8 +241,7 @@ static CallInst *createInterleavedMemOp(VectorizationContext &Ctx, Value *Data,
   VECZ_FAIL_IF(!DataTy);
   VECZ_FAIL_IF(!Ptr || !Ptr->getType()->isPointerTy());
   assert(!Data || Data->getType() == DataTy);
-  auto *PtrTy = PointerType::get(DataTy->getScalarType(),
-                                 Ptr->getType()->getPointerAddressSpace());
+  auto *PtrTy = cast<PointerType>(Ptr->getType());
   Type *MaskTy = Mask ? Mask->getType() : nullptr;
   Function *F = getOrCreateInterleavedMemOpFn(
       Ctx, DataTy, PtrTy, Stride, MaskTy, Alignment,
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
index 3fca1cb8609dc..7437a8b649bb8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -2539,8 +2539,7 @@ ValuePacket Packetizer::Impl::packetizeMemOp(MemOp &op) {
     PACK_FAIL_IF(ptrPacket.empty());
 
     auto *const scalarTy = dataTy->getScalarType();
-    auto *const scalarPtrTy =
-        cast<PointerType>(ptr->getType()->getScalarType());
+    auto *const ptrTy = cast<PointerType>(ptr->getType()->getScalarType());
 
     // When scattering/gathering with a vector type, we can cast it to a
     // vector of pointers to the scalar type and widen it into a vector
@@ -2559,9 +2558,7 @@ ValuePacket Packetizer::Impl::packetizeMemOp(MemOp &op) {
       const bool success =
           createSubSplats(Ctx.targetInfo(), B, ptrPacket, scalarWidth);
       PACK_FAIL_IF(!success);
-      auto *const newPtrTy = llvm::VectorType::get(
-          PointerType::get(scalarTy, scalarPtrTy->getPointerAddressSpace()),
-          wideEC);
+      auto *const newPtrTy = llvm::VectorType::get(ptrTy, wideEC);
       // Bitcast the above sub-splat to purely scalar pointers
       vecPtr = B.CreateBitCast(vecPtr, newPtrTy);
       // Create an index sequence to start the offseting process
@@ -2590,9 +2587,7 @@ ValuePacket Packetizer::Impl::packetizeMemOp(MemOp &op) {
         }
       }
 
-      auto *const newPtrTy = FixedVectorType::get(
-          PointerType::get(scalarTy, scalarPtrTy->getPointerAddressSpace()),
-          simdWidth);
+      auto *const newPtrTy = FixedVectorType::get(ptrTy, simdWidth);
 
       auto *const idxVector = ConstantVector::get(indices);
       auto *const undef = UndefValue::get(newPtrTy);
@@ -3337,11 +3332,7 @@ Value *Packetizer::Impl::vectorizeCall(CallInst *CI) {
       B.SetInsertPoint(&*EntryBB.getFirstInsertionPt());
       Type *AllocaTy = getWideType(PtrEleTy, SimdWidth);
       PointerRetAlloca = B.CreateAlloca(AllocaTy, nullptr, "ptr_ret_temp");
-      Value *NewOp = PointerRetAlloca;
-      if (PtrTy->getAddressSpace() != 0) {
-        Type *NewOpTy = PointerType::get(AllocaTy, PtrTy->getAddressSpace());
-        NewOp = B.CreateAddrSpaceCast(NewOp, NewOpTy);
-      }
+      Value *NewOp = B.CreateAddrSpaceCast(PointerRetAlloca, PtrTy);
       PointerRetAddr = ScalarOp;
       PointerRetStride = ConstantStride;
       Ops.push_back(NewOp);
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
index 8e74840269f0f..7246d0cd81fec 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
@@ -773,37 +773,18 @@ SimdPacket *Scalarizer::assignScalar(SimdPacket *P, Value *V) {
 }
 
 SimdPacket *Scalarizer::scalarizeLoad(LoadInst *Load, PacketMask PM) {
-  Value *VecPtr = Load->getPointerOperand();
-  PointerType *VecPtrTy = cast<PointerType>(VecPtr->getType());
+  Value *PtrBase = Load->getPointerOperand();
   auto *VecDataTy = dyn_cast<FixedVectorType>(Load->getType());
   VECZ_FAIL_IF(!VecDataTy);
   const unsigned SimdWidth = VecDataTy->getNumElements();
 
   Type *ScalarEleTy = VecDataTy->getElementType();
-  PointerType *ScalarPtrTy =
-      PointerType::get(ScalarEleTy, VecPtrTy->getAddressSpace());
 
   // Absorb redundant bitcasts
-  Value *ScalarPtrBase = nullptr;
-  if (auto *BitCast = dyn_cast<BitCastInst>(VecPtr)) {
-    // Note that we assume the bitcast isn't used by anything else other than
-    // loads or stores. Other uses of the bitcast are possible in principle,
-    // which cases could be purposely constructed but it is considered unlikely
-    // to occur naturally. If it happens, the DeleteInstructions pass will not
-    // actually delete it so no harm is done in any case.
-    IC.deleteInstructionLater(BitCast);
-    VecPtr = BitCast->getOperand(0);
-    if (BitCast->getSrcTy() == ScalarPtrTy) {
-      ScalarPtrBase = VecPtr;
-    }
-  }
-  GetElementPtrInst *VecPtrGEP = dyn_cast<GetElementPtrInst>(VecPtr);
-  const bool InBounds = (VecPtrGEP && VecPtrGEP->isInBounds());
+  GetElementPtrInst *PtrGEP = dyn_cast<GetElementPtrInst>(PtrBase);
+  const bool InBounds = (PtrGEP && PtrGEP->isInBounds());
 
   IRBuilder<> B(Load);
-  if (!ScalarPtrBase) {
-    ScalarPtrBase = B.CreateBitCast(VecPtr, ScalarPtrTy);
-  }
 
   SimdPacket PtrPacket;
   SimdPacket *P = getPacket(Load, SimdWidth);
@@ -825,11 +806,10 @@ SimdPacket *Scalarizer::scalarizeLoad(LoadInst *Load, PacketMask PM) {
       }
     }
 
-    Value *ScalarPtr =
-        InBounds
-            ? B.CreateInBoundsGEP(ScalarEleTy, ScalarPtrBase, B.getInt32(i))
-            : B.CreateGEP(ScalarEleTy, ScalarPtrBase, B.getInt32(i));
-    PtrPacket.set(i, ScalarPtr);
+    Value *Ptr = InBounds
+                     ? B.CreateInBoundsGEP(ScalarEleTy, PtrBase, B.getInt32(i))
+                     : B.CreateGEP(ScalarEleTy, PtrBase, B.getInt32(i));
+    PtrPacket.set(i, Ptr);
   }
 
   // The individual elements may need laxer alignment requirements than the
@@ -855,39 +835,23 @@ SimdPacket *Scalarizer::scalarizeLoad(LoadInst *Load, PacketMask PM) {
 }
 
 SimdPacket *Scalarizer::scalarizeStore(StoreInst *Store, PacketMask PM) {
-  Value *VecPtr = Store->getPointerOperand();
-  assert(VecPtr && "Could not get pointer operand from Store");
-  PointerType *VecPtrTy = cast<PointerType>(VecPtr->getType());
+  Value *PtrBase = Store->getPointerOperand();
+  assert(PtrBase && "Could not get pointer operand from Store");
   auto *VecDataTy =
       dyn_cast<FixedVectorType>(Store->getValueOperand()->getType());
   VECZ_FAIL_IF(!VecDataTy);
   const unsigned SimdWidth = VecDataTy->getNumElements();
   Type *ScalarEleTy = VecDataTy->getElementType();
-  PointerType *ScalarPtrTy =
-      PointerType::get(ScalarEleTy, VecPtrTy->getAddressSpace());
   Value *VectorData = Store->getValueOperand();
 
   // Emit scalarized data values.
   const SimdPacket *DataPacket = scalarize(VectorData, PM);
   VECZ_FAIL_IF(!DataPacket);
 
-  // Absorb redundant bitcasts
-  Value *ScalarPtrBase = nullptr;
-  if (auto *BitCast = dyn_cast<BitCastInst>(VecPtr)) {
-    // See comment at equivalent part of Scalarizer::scalarizeLoad()
-    IC.deleteInstructionLater(BitCast);
-    VecPtr = BitCast->getOperand(0);
-    if (BitCast->getSrcTy() == ScalarPtrTy) {
-      ScalarPtrBase = VecPtr;
-    }
-  }
-  GetElementPtrInst *VecPtrGEP = dyn_cast<GetElementPtrInst>(VecPtr);
-  const bool InBounds = (VecPtrGEP && VecPtrGEP->isInBounds());
+  GetElementPtrInst *PtrGEP = dyn_cast<GetElementPtrInst>(PtrBase);
+  const bool InBounds = (PtrGEP && PtrGEP->isInBounds());
 
   IRBuilder<> B(Store);
-  if (!ScalarPtrBase) {
-    ScalarPtrBase = B.CreateBitCast(VecPtr, ScalarPtrTy);
-  }
 
   SimdPacket PtrPacket;
   SimdPacket *P = getPacket(Store, SimdWidth);
@@ -909,11 +873,10 @@ SimdPacket *Scalarizer::scalarizeStore(StoreInst *Store, PacketMask PM) {
       }
     }
 
-    Value *ScalarPtr =
-        InBounds
-            ? B.CreateInBoundsGEP(ScalarEleTy, ScalarPtrBase, B.getInt32(i))
-            : B.CreateGEP(ScalarEleTy, ScalarPtrBase, B.getInt32(i));
-    PtrPacket.set(i, ScalarPtr);
+    Value *Ptr = InBounds
+                     ? B.CreateInBoundsGEP(ScalarEleTy, PtrBase, B.getInt32(i))
+                     : B.CreateGEP(ScalarEleTy, PtrBase, B.getInt32(i));
+    PtrPacket.set(i, Ptr);
   }
 
   // See comment at equivalent part of scalarizeLoad()
@@ -1258,8 +1221,8 @@ SimdPacket *Scalarizer::scalarizeMaskedMemOp(CallInst *CI, PacketMask PM,
   const SimdPacket *MaskPacket = scalarize(MaskedOp.getMaskOperand(), PM);
   VECZ_FAIL_IF(!MaskPacket);
 
-  Value *VecPtr = MaskedOp.getPointerOperand();
-  VECZ_FAIL_IF(!VecPtr);
+  Value *PtrBase = MaskedOp.getPointerOperand();
+  VECZ_FAIL_IF(!PtrBase);
 
   // Scalarize data packet if this is a store
   const SimdPacket *DataPacket = nullptr;
@@ -1268,27 +1231,12 @@ SimdPacket *Scalarizer::scalarizeMaskedMemOp(CallInst *CI, PacketMask PM,
     VECZ_FAIL_IF(!DataPacket);
   }
 
-  PointerType *VecPtrTy = cast<PointerType>(VecPtr->getType());
   Type *ScalarEleTy = VecDataTy->getElementType();
-  PointerType *ScalarPtrTy =
-      PointerType::get(ScalarEleTy, VecPtrTy->getAddressSpace());
 
-  // Absorb redundant bitcasts
-  Value *ScalarPtrBase = nullptr;
-  if (auto *BitCast = dyn_cast<BitCastInst>(VecPtr)) {
-    IC.deleteInstructionLater(BitCast);
-    VecPtr = BitCast->getOperand(0);
-    if (BitCast->getSrcTy() == ScalarPtrTy) {
-      ScalarPtrBase = VecPtr;
-    }
-  }
-  GetElementPtrInst *VecPtrGEP = dyn_cast<GetElementPtrInst>(VecPtr);
-  const bool InBounds = (VecPtrGEP && VecPtrGEP->isInBounds());
+  GetElementPtrInst *PtrGEP = dyn_cast<GetElementPtrInst>(PtrBase);
+  const bool InBounds = (PtrGEP && PtrGEP->isInBounds());
 
   IRBuilder<> B(CI);
-  if (!ScalarPtrBase) {
-    ScalarPtrBase = B.CreateBitCast(VecPtr, ScalarPtrTy);
-  }
 
   SimdPacket PtrPacket;
   SimdPacket *P = getPacket(CI, SimdWidth);
@@ -1300,11 +1248,10 @@ SimdPacket *Scalarizer::scalarizeMaskedMemOp(CallInst *CI, PacketMask PM,
       continue;
     }
 
-    Value *ScalarPtr =
-        InBounds
-            ? B.CreateInBoundsGEP(ScalarEleTy, ScalarPtrBase, B.getInt32(i))
-            : B.CreateGEP(ScalarEleTy, ScalarPtrBase, B.getInt32(i));
-    PtrPacket.set(i, ScalarPtr);
+    Value *Ptr = InBounds
+                     ? B.CreateInBoundsGEP(ScalarEleTy, PtrBase, B.getInt32(i))
+                     : B.CreateGEP(ScalarEleTy, PtrBase, B.getInt32(i));
+    PtrPacket.set(i, Ptr);
   }
 
   const unsigned Alignment = MaskedOp.getAlignment();
@@ -1385,8 +1332,7 @@ SimdPacket *Scalarizer::scalarizeCall(CallInst *CI, PacketMask PM) {
         auto *OldVecTy = cast<FixedVectorType>(PtrRetPointeeTy);
         VECZ_STAT_FAIL_IF(OldVecTy->getNumElements() != SimdWidth,
                           VeczScalarizeFailBuiltin);
-        Type *NewTy = PointerType::get(OldVecTy->getElementType(),
-                                       OldPtrTy->getAddressSpace());
+        Type *NewTy = OldPtrTy;
         Value *ScalarAddrBase = B.CreateBitCast(OrigOp, NewTy);
         SimdPacket *OpPacket = getPacket(ScalarAddrBase, SimdWidth);
         for (unsigned j = 0; j < SimdWidth; j++) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/squash_small_vectors_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/squash_small_vectors_pass.cpp
index 40067523ecc03..d00bef685e6a8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/squash_small_vectors_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/squash_small_vectors_pass.cpp
@@ -108,12 +108,8 @@ PreservedAnalyses SquashSmallVectorsPass::run(Function &F,
 
           IRBuilder<> B(load);
           const auto name = load->getName();
-          auto *const newPtrTy =
-              PointerType::get(intTy, ptr->getType()->getPointerAddressSpace());
-          auto *const ptrCast = B.CreatePointerCast(
-              ptr, newPtrTy, Twine(ptr->getName(), ".squashptr"));
           auto *newLoad = cast<LoadInst>(
-              B.CreateLoad(intTy, ptrCast, Twine(name, ".squashed")));
+              B.CreateLoad(intTy, ptr, Twine(name, ".squashed")));
           newLoad->setAlignment(align);
           newLoad->copyMetadata(*load);
 
@@ -151,12 +147,8 @@ PreservedAnalyses SquashSmallVectorsPass::run(Function &F,
           }
 
           IRBuilder<> B(store);
-          auto *const newPtrTy =
-              PointerType::get(intTy, ptr->getType()->getPointerAddressSpace());
-          auto *const newPtr = B.CreatePointerCast(
-              ptr, newPtrTy, Twine(ptr->getName(), ".squashptr"));
           auto *const newData = getSquashed(data, intTy, B);
-          auto *newStore = cast<StoreInst>(B.CreateStore(newData, newPtr));
+          auto *newStore = cast<StoreInst>(B.CreateStore(newData, ptr));
           newStore->setAlignment(align);
           newStore->copyMetadata(*store);
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
index 0813d9757593d..ea6b305c082c5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
@@ -693,11 +693,6 @@ Value *TargetInfo::createScalableExtractElement(IRBuilder<> &B,
   // Store the packetized vector to the allocation
   B.CreateStore(src, alloc);
 
-  // Re-interpret the allocation as a pointer to the element type
-  auto *const eltptrTy = PointerType::get(eltTy, /*AddressSpace=*/0);
-  auto *const bcastalloc =
-      B.CreatePointerBitCastOrAddrSpaceCast(alloc, eltptrTy, "bcast.alloc");
-
   const unsigned fixedVecElts =
       multi_llvm::getVectorNumElements(origSrc->getType());
 
@@ -711,8 +706,7 @@ Value *TargetInfo::createScalableExtractElement(IRBuilder<> &B,
     // Index into the allocation, coming back with the starting offset from
     // which to begin our loads. This is either a scalar pointer, or a vector of
     // pointers.
-    auto *const gep =
-        B.CreateInBoundsGEP(eltTy, bcastalloc, index, "vec.alloc");
+    auto *const gep = B.CreateInBoundsGEP(eltTy, alloc, index, "vec.alloc");
 
     load = ::createInterleavedLoad(Ctx, narrowTy, gep, stride, /*Mask*/ nullptr,
                                    /*EVL*/ nullptr, alignment.value());
@@ -728,8 +722,7 @@ Value *TargetInfo::createScalableExtractElement(IRBuilder<> &B,
 
     // Index into the allocation, coming back with the starting offset from
     // which to begin our striding load.
-    auto *const gep =
-        B.CreateInBoundsGEP(eltTy, bcastalloc, index, "vec.alloc");
+    auto *const gep = B.CreateInBoundsGEP(eltTy, alloc, index, "vec.alloc");
 
     load = ::createGather(Ctx, narrowTy, gep, /*Mask*/ nullptr, /*EVL*/ nullptr,
                           alignment.value());
@@ -778,15 +771,11 @@ Value *TargetInfo::createScalableBroadcast(IRBuilder<> &B, Value *vector,
 
   auto *const eltTy = cast<llvm::VectorType>(ty)->getElementType();
 
-  auto *const eltptrTy = PointerType::get(eltTy, /*AddressSpace=*/0);
-  auto *const bcastalloc =
-      B.CreatePointerBitCastOrAddrSpaceCast(alloc, eltptrTy, "bcast.alloc");
   auto *const stepsRem = TargetInfo::createBroadcastIndexVector(
       B,
       ScalableVectorType::get(B.getInt32Ty(), cast<ScalableVectorType>(wideTy)),
       factor, URem, "idx1");
-  auto *const gep =
-      B.CreateInBoundsGEP(eltTy, bcastalloc, stepsRem, "vec.alloc");
+  auto *const gep = B.CreateInBoundsGEP(eltTy, alloc, stepsRem, "vec.alloc");
   auto *const boolTrue = ConstantInt::getTrue(B.getContext());
   auto *const mask = B.CreateVectorSplat(wideEltCount, boolTrue, "truemask");
   // Set the alignment to that of vector element type.
@@ -840,11 +829,6 @@ Value *TargetInfo::createScalableInsertElement(IRBuilder<> &B,
   // Store the wide vector to the allocation
   B.CreateStore(into, alloc);
 
-  // Re-interpret the allocation as a pointer to the element type
-  auto *const eltptrTy = PointerType::get(scalarTy, /*AddressSpace=*/0);
-  auto *const bcastalloc =
-      B.CreatePointerBitCastOrAddrSpaceCast(alloc, eltptrTy, "bcast.alloc");
-
   const unsigned fixedVecElts =
       multi_llvm::getVectorNumElements(insert->getOperand(0)->getType());
 
@@ -861,8 +845,7 @@ Value *TargetInfo::createScalableInsertElement(IRBuilder<> &B,
     // Index into the allocation, coming back with the starting offset from
     // which to begin our loads. This is either a scalar pointer, or a vector of
     // pointers.
-    auto *const gep =
-        B.CreateInBoundsGEP(scalarTy, bcastalloc, index, "vec.alloc");
+    auto *const gep = B.CreateInBoundsGEP(scalarTy, alloc, index, "vec.alloc");
 
     store = ::createInterleavedStore(Ctx, elt, gep, stride, /*Mask*/ nullptr,
                                      /*EVL*/ nullptr, alignment.value());
@@ -884,8 +867,7 @@ Value *TargetInfo::createScalableInsertElement(IRBuilder<> &B,
 
     // Index into the allocation, coming back with the starting offset from
     // which to begin our striding load.
-    auto *const gep =
-        B.CreateInBoundsGEP(scalarTy, bcastalloc, index, "vec.alloc");
+    auto *const gep = B.CreateInBoundsGEP(scalarTy, alloc, index, "vec.alloc");
 
     store = ::createScatter(Ctx, elt, gep, /*Mask*/ nullptr,
                             /*EVL*/ nullptr, alignment.value());
@@ -981,13 +963,8 @@ llvm::Value *TargetInfo::createVectorShuffle(llvm::IRBuilder<> &B,
 
   auto *const eltTy = srcTy->getElementType();
 
-  // Re-interpret the allocation as a pointer to the element type
-  auto *const eltptrTy = PointerType::get(eltTy, /*AddressSpace=*/0);
-  auto *const bcastalloc =
-      B.CreatePointerBitCastOrAddrSpaceCast(alloc, eltptrTy, "bcast.alloc");
-
   // Index into the allocation.
-  auto *const gep = B.CreateInBoundsGEP(eltTy, bcastalloc, mask, "vec.alloc");
+  auto *const gep = B.CreateInBoundsGEP(eltTy, alloc, mask, "vec.alloc");
 
   const auto eltCount = maskTy->getElementCount();
   auto *const dstTy = VectorType::get(eltTy, eltCount);
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_redundant_bitcasts.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_redundant_bitcasts.ll
deleted file mode 100644
index eead4848591b3..0000000000000
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_redundant_bitcasts.ll
+++ /dev/null
@@ -1,93 +0,0 @@
-; Copyright (C) Codeplay Software Limited
-;
-; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
-; Exceptions; you may not use this file except in compliance with the License.
-; You may obtain a copy of the License at
-;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
-;
-; Unless required by applicable law or agreed to in writing, software
-; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-; License for the specific language governing permissions and limitations
-; under the License.
-;
-; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-; RUN: veczc -k memop_loop_dep -vecz-passes=scalarize -vecz-choices=FullScalarization -S < %s | FileCheck %s
-
-; ModuleID = 'kernel.opencl'
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-s128"
-target triple = "spir64-unknown-unknown"
-
-declare i64 @__mux_get_global_id(i32)
-
-define spir_kernel void @memop_loop_dep(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i32 %i, i32 %e) {
-entry:
-  %call = call i64 @__mux_get_global_id(i32 0)
-  %cmp1 = icmp slt i32 %i, %e
-  br i1 %cmp1, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.inc, %for.body.lr.ph
-  %i.addr.02 = phi i32 [ %i, %for.body.lr.ph ], [ %inc, %for.inc ]
-  %0 = shl i64 %call, 2
-  %vload_base = getelementptr i32, i32 addrspace(1)* %in, i64 %0
-  %vload_ptr = bitcast i32 addrspace(1)* %vload_base to <4 x i32> addrspace(1)*
-  %vload = load <4 x i32>, <4 x i32> addrspace(1)* %vload_ptr, align 16
-  %1 = shl i64 %call, 2
-  %vstore_base = getelementptr i32, i32 addrspace(1)* %out, i64 %1
-  %vstore_ptr = bitcast i32 addrspace(1)* %vstore_base to <4 x i32> addrspace(1)*
-  store <4 x i32> %vload, <4 x i32> addrspace(1)* %vstore_ptr, align 16
-  %2 = extractelement <4 x i32> %vload, i64 0
-  %tobool = icmp ne i32 %2, 0
-  %tobool2 = icmp eq i64 %call, 0
-  %or.cond = and i1 %tobool2, %tobool
-  br i1 %or.cond, label %while.cond.preheader, label %for.inc
-
-while.cond.preheader:                             ; preds = %for.body
-  br label %while.cond
-
-while.cond:                                       ; preds = %while.cond, %while.cond.preheader
-  %tobool3 = icmp eq i64 %call, 0
-  br i1 %tobool3, label %for.inc.loopexit, label %while.cond
-
-for.inc.loopexit:                                 ; preds = %while.cond
-  br label %for.inc
-
-for.inc:                                          ; preds = %for.inc.loopexit, %for.body
-  %inc = add nsw i32 %i.addr.02, 1
-  %exitcond = icmp ne i32 %inc, %e
-  br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge
-
-for.cond.for.end_crit_edge:                       ; preds = %for.inc
-  br label %for.end
-
-for.end:                                          ; preds = %for.cond.for.end_crit_edge, %entry
-  ret void
-}
-
-; CA-1431 when we scalarize the vector load, the pointer bitcast back to the
-; scalar type is not needed, since the original pointer was the same scalar
-; type and can be used directly.
-
-; CHECK: define spir_kernel void @__vecz_v4_memop_loop_dep
-
-; Make sure Scalarization doesn't create any redundant bitcasts
-; CHECK-NOT: bitcast
-; CHECK: getelementptr i32, ptr addrspace(1) %{{.+}}, i32 0
-; CHECK-NOT: bitcast
-; CHECK: load i32
-; CHECK-NOT: bitcast
-
-; Make sure there is no duplicate GEP that gets the 0-indexed element from the vector
-; CHECK-NOT: getelementptr i32, ptr addrspace(1) %{{.+}}, i32 0
-; CHECK-NOT: bitcast
-; CHECK: load i32
-; CHECK-NOT: bitcast
-; CHECK: load i32
-; CHECK-NOT: bitcast
-; CHECK: load i32
-; CHECK-NOT: bitcast

From 3020f6787b1855a5baa582021314714f9601a83c Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Wed, 23 Apr 2025 22:15:25 +0100
Subject: [PATCH 152/182] [vecz] Track barriers by ID.

We were tracking barriers by their index, which can (rarely) break when
optimizations on vectorized kernels cause barriers to be optimized away
before the work item loops pass runs. All barriers have a unique ID at
this point already, which we can use to track them instead.
---
 .../include/compiler/utils/barrier_regions.h  | 24 ++++++----
 .../source/barrier_regions.cpp                | 45 ++++++++-----------
 .../source/work_item_loops_pass.cpp           | 29 ++++++------
 3 files changed, 49 insertions(+), 49 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/barrier_regions.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/barrier_regions.h
index fe30907d8d2a4..c40d1743b0bfc 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/barrier_regions.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/barrier_regions.h
@@ -31,6 +31,8 @@
 #include <llvm/Transforms/Utils/ValueMapper.h>
 #include <multi_llvm/llvm_version.h>
 
+#include <map>
+
 #include "pass_functions.h"
 
 namespace llvm {
@@ -82,10 +84,11 @@ struct BarrierRegion {
   BarrierSchedule schedule = BarrierSchedule::Unordered;
 };
 
-using BarrierGraph = llvm::SmallVector<BarrierRegion, 8>;
-
 class Barrier {
  public:
+  /// @brief Type for ids of new kernel functions
+  using kernel_id_map_t = std::map<unsigned, llvm::Function *>;
+
   Barrier(llvm::Module &m, llvm::Function &f, bool IsDebug)
       : live_var_mem_ty_(nullptr),
         size_t_bytes(compiler::utils::getSizeTypeBytes(m)),
@@ -106,6 +109,9 @@ class Barrier {
   /// @brief returns the maximum alignment of the barrier struct
   unsigned getLiveVarMaxAlignment() const { return max_live_var_alignment; }
 
+  /// @brief gets the split subkernels
+  const kernel_id_map_t &getSubkernels() const { return kernel_id_map_; }
+
   /// @brief gets the split subkernel for the given barrier id
   llvm::Function *getSubkernel(unsigned id) const {
     return kernel_id_map_.find(id)->second;
@@ -116,7 +122,7 @@ class Barrier {
 
   llvm::CallInst *getBarrierCall(unsigned id) const {
     return llvm::dyn_cast_or_null<llvm::CallInst>(
-        barrier_graph[id - kBarrier_FirstID].barrier_inst);
+        barrier_region_id_map_.find(id)->second.barrier_inst);
   }
 
   /// @brief gets the size of the fixed sized part of the barrier struct
@@ -135,12 +141,12 @@ class Barrier {
 
   /// @brief gets the barrier IDs of the successors of the given barrier region
   const llvm::SmallVectorImpl<unsigned> &getSuccessorIds(unsigned id) const {
-    return barrier_graph[id - kBarrier_FirstID].successor_ids;
+    return barrier_region_id_map_.find(id)->second.successor_ids;
   }
 
   /// @brief gets the barrier IDs of the successors of the given barrier region
   BarrierSchedule getSchedule(unsigned id) const {
-    return barrier_graph[id - kBarrier_FirstID].schedule;
+    return barrier_region_id_map_.find(id)->second.schedule;
   }
 
   /// @brief replaces a subkernel with a given function
@@ -223,8 +229,8 @@ class Barrier {
   using live_variable_scalables_map_t = live_variable_index_map_t;
   /// @brief Type for ids of barriers
   using barrier_id_map_t = llvm::DenseMap<llvm::BasicBlock *, unsigned>;
-  /// @brief Type for ids of new kernel functions
-  using kernel_id_map_t = llvm::DenseMap<unsigned, llvm::Function *>;
+  /// @brief Type for ids of barrier regions
+  using barrier_region_id_map_t = std::map<unsigned, BarrierRegion>;
   /// @brief Type for map from ids to fence instructions
   using fence_id_map_t = llvm::DenseMap<unsigned, llvm::FenceInst *>;
   /// @brief Type between block and instruction for barrier.
@@ -245,6 +251,8 @@ class Barrier {
   live_variable_scalables_map_t live_variable_scalables_map_;
   /// @brief Keep ids of barriers.
   barrier_id_map_t barrier_id_map_;
+  /// @brief Look up a barrier region by its id.
+  barrier_region_id_map_t barrier_region_id_map_;
   /// @brief Keep ids of barriers.
   kernel_id_map_t kernel_id_map_;
   /// @brief Keep struct types for live variables' memory layout.
@@ -269,8 +277,6 @@ class Barrier {
 
   size_t size_t_bytes;
 
-  BarrierGraph barrier_graph;
-
   llvm::Module &module_;
   llvm::Function &func_;
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
index 62f4afa86fb26..1d9256e5aa9b3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
@@ -437,11 +437,12 @@ void compiler::utils::Barrier::Run(llvm::ModuleAnalysisManager &mam) {
   bi_ = &mam.getResult<BuiltinInfoAnalysis>(module_);
   FindBarriers();
 
+  kernel_id_map_[kBarrier_EndID] = nullptr;
+
   if (barriers_.empty()) {
     // If there are no barriers, we can use the original function as the
     // single barrier region.
-    barrier_graph.emplace_back();
-    auto &node = barrier_graph.back();
+    auto &node = barrier_region_id_map_[kBarrier_FirstID];
     node.entry = &func_.getEntryBlock();
     node.id = kBarrier_FirstID;
     node.successor_ids.push_back(kBarrier_EndID);
@@ -513,11 +514,9 @@ void compiler::utils::Barrier::FindBarriers() {
         if (callee != nullptr) {
           const auto B = bi_->analyzeBuiltin(*callee);
           if (BuiltinInfo::isMuxBuiltinWithWGBarrierID(B.ID)) {
-            unsigned id = ~0u;
             auto *const id_param = call_inst->getOperand(0);
-            if (auto *const id_param_c = dyn_cast<ConstantInt>(id_param)) {
-              id = id_param_c->getZExtValue();
-            }
+            auto *const id_param_c = cast<ConstantInt>(id_param);
+            const auto id = id_param_c->getZExtValue();
             orderedBarriers.emplace_back(id, call_inst);
           }
         }
@@ -548,13 +547,15 @@ void compiler::utils::Barrier::SplitBlockwithBarrier() {
     exit_stub = MakeStubFunction("__barrier_exit", module_, stub_cc);
   }
 
-  barrier_graph.emplace_back();
-  auto &node = barrier_graph.back();
+  auto &node = barrier_region_id_map_[kBarrier_FirstID];
   node.entry = &func_.getEntryBlock();
   node.id = kBarrier_FirstID;
 
-  unsigned barrier_id = kBarrier_StartNewID;
   for (CallInst *split_point : barriers_) {
+    // The barrier's ID operand, reused as the argument to the stub calls.
+    auto *id = cast<ConstantInt>(split_point->getOperand(0));
+    const auto barrier_id = kBarrier_StartNewID + id->getZExtValue();
+
     if (is_debug_) {
       assert(entry_stub != nullptr);  // Guaranteed as is_debug_ is const.
       assert(exit_stub != nullptr);   // Guaranteed as is_debug_ is const.
@@ -564,10 +565,6 @@ void compiler::utils::Barrier::SplitBlockwithBarrier() {
       // them at a point where live variables have already been loaded. This
       // info won't be available till later.
 
-      // ID identifying which barrier invoked stub used as argument to call.
-      // This number monotonically increases from 0 for each barrier.
-      auto id = ConstantInt::get(Type::getInt32Ty(module_.getContext()),
-                                 barrier_id - kBarrier_StartNewID);
       // Call invoking entry stub
       auto entry_caller = CallInst::Create(entry_stub, id);
       entry_caller->setDebugLoc(split_point->getDebugLoc());
@@ -583,10 +580,9 @@ void compiler::utils::Barrier::SplitBlockwithBarrier() {
           std::make_pair(entry_caller, exit_caller);
     }
 
-    barrier_graph.emplace_back();
-    auto &node = barrier_graph.back();
+    auto &node = barrier_region_id_map_[barrier_id];
     node.barrier_inst = split_point;
-    node.id = barrier_id++;
+    node.id = barrier_id;
     node.schedule = getBarrierSchedule(*split_point);
 
     // Our scan implementation requires a linear work-item ordering, to loop
@@ -603,7 +599,7 @@ void compiler::utils::Barrier::SplitBlockwithBarrier() {
   // We have to gather the basic block data after splitting, because we
   // might not be processing barriers in program order, and things can get
   // awfully confused.
-  for (auto &node : barrier_graph) {
+  for (auto &[i, node] : barrier_region_id_map_) {
     if (node.barrier_inst) {
       auto *const bb = node.barrier_inst->getParent();
       barrier_id_map_[bb] = node.id;
@@ -770,7 +766,7 @@ void compiler::utils::Barrier::FindLiveVariables() {
     }
   }
 
-  for (auto &region : barrier_graph) {
+  for (auto &[i, region] : barrier_region_id_map_) {
     GatherBarrierRegionBlocks(region);
     GatherBarrierRegionUses(region, func_args);
     whole_live_variables_set_.set_union(region.uses_int);
@@ -1150,9 +1146,9 @@ Function *compiler::utils::Barrier::GenerateNewKernel(BarrierRegion &region) {
     } else if (ReturnInst *ret =
                    dyn_cast<ReturnInst>(cloned_bb->getTerminator())) {
       // Change return instruction with end barrier number.
-      ConstantInt *cst_zero =
+      ConstantInt *cst_endid =
           ConstantInt::get(Type::getInt32Ty(context), kBarrier_EndID);
-      ReturnInst *new_ret = ReturnInst::Create(context, cst_zero);
+      ReturnInst *new_ret = ReturnInst::Create(context, cst_endid);
       new_ret->insertBefore(ret->getIterator());
       ret->replaceAllUsesWith(new_ret);
       ret->eraseFromParent();
@@ -1450,7 +1446,7 @@ BasicBlock *compiler::utils::Barrier::CloneBasicBlock(
 void compiler::utils::Barrier::SeperateKernelWithBarrier() {
   if (barriers_.empty()) return;
 
-  for (auto &region : barrier_graph) {
+  for (auto &[i, region] : barrier_region_id_map_) {
     kernel_id_map_[region.id] = GenerateNewKernel(region);
   }
 
@@ -1467,15 +1463,12 @@ void compiler::utils::Barrier::SeperateKernelWithBarrier() {
 
   LLVM_DEBUG({
     for (const auto &Kid : kernel_id_map_) {
-      dbgs() << "1. kernel_id[" << Kid.first << "] = " << Kid.second->getName()
+      // Run() maps kBarrier_EndID to nullptr; skip entries with no function.
+      if (!Kid.second) continue;
+      dbgs() << "kernel_id[" << Kid.first << "] = " << Kid.second->getName()
              << "\n";
     }
 
-    for (unsigned I = kBarrier_FirstID;
-         I < kernel_id_map_.size() + kBarrier_FirstID; I++) {
-      dbgs() << "2. kernel_id[" << I << "] = " << kernel_id_map_[I]->getName()
-             << "\n";
-    }
     dbgs() << "\n\n" << module_ << "\n\n";
   });
 }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
index 2462d18fe7aa1..1c0fbe6a9d92f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
@@ -1552,13 +1552,14 @@ Function *compiler::utils::WorkItemLoopsPass::makeWrapperFunction(
   AllocaInst *nextID =
       entryIR.CreateAlloca(index_type, nullptr, "next_barrier_id");
 
-  SmallVector<BasicBlock *, 8> bbs;
-  const unsigned num_blocks = barrierMain.getNumSubkernels();
-  assert(!emitTail || barrierTail->getNumSubkernels() == num_blocks);
-
-  for (unsigned i = kBarrier_EndID; i <= num_blocks; i++) {
-    BasicBlock *bb = BasicBlock::Create(context, "sw.bb", new_wrapper);
-    bbs.push_back(bb);
+  std::map<unsigned, BasicBlock *> bbs;
+  // Only create blocks for the sub-kernels of the main kernel. The vectorized
+  // kernel has been further optimized and may have removed unreachable
+  // barriers that are still present in the scalar kernel. Such barriers must
+  // also be unreachable in the scalar kernel, even if not yet detected there.
+
+  for (auto &[i, subkernel] : barrierMain.getSubkernels()) {
+    bbs[i] = BasicBlock::Create(context, "sw.bb", new_wrapper);
   }
 
   ScheduleGenerator schedule(M, barrierMain, barrierTail, BI);
@@ -1584,7 +1585,9 @@ Function *compiler::utils::WorkItemLoopsPass::makeWrapperFunction(
   // Branch directly into the first basic block.
   entryIR.CreateBr(bbs[kBarrier_FirstID]);
 
-  for (unsigned i = kBarrier_EndID; i <= num_blocks; i++) {
+  for (auto &[i_, subkernel_] : barrierMain.getSubkernels()) {
+    auto i = i_;
+
     // Keep it linear
     BasicBlock *const block = bbs[i];
     block->moveAfter(&new_wrapper->back());
@@ -1663,7 +1666,7 @@ Function *compiler::utils::WorkItemLoopsPass::makeWrapperFunction(
 
       if (num_succ == 1) {
         // If there is only one successor, we can branch directly to it
-        exitIR.CreateBr(bbs[successors.front()]);
+        exitIR.CreateBr(bbs.find(successors.front())->second);
       } else if (num_succ == 2) {
         // If there are exactly two successors, we can use a conditional branch
         auto *const bb_id = ConstantInt::get(index_type, successors[0]);
@@ -1673,8 +1676,8 @@ Function *compiler::utils::WorkItemLoopsPass::makeWrapperFunction(
         auto *const cmp_id =
             CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, ld_next_id,
                             bb_id, "", br_block);
-        BranchInst::Create(bbs[successors[0]], bbs[successors[1]], cmp_id,
-                           br_block);
+        BranchInst::Create(bbs.find(successors[0])->second,
+                           bbs.find(successors[1])->second, cmp_id, br_block);
 
         exitIR.CreateBr(br_block);
       } else if (num_succ == 0) {
@@ -1700,9 +1703,9 @@ Function *compiler::utils::WorkItemLoopsPass::makeWrapperFunction(
         LoadInst *const ld_next_id =
             new LoadInst(index_type, nextID, "", switch_body);
         SwitchInst *const sw = SwitchInst::Create(
-            ld_next_id, bbs[successors[0]], num_succ, switch_body);
+            ld_next_id, bbs.find(successors[0])->second, num_succ, switch_body);
         for (const auto i : successors) {
-          sw->addCase(ConstantInt::get(index_type, i), bbs[i]);
+          sw->addCase(ConstantInt::get(index_type, i), bbs.find(i)->second);
         }
         exitIR.CreateBr(switch_body);
       }

From 4fe6c8efcf6683a36da368eec9df6e874f916ca1 Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Thu, 24 Apr 2025 21:36:43 +0100
Subject: [PATCH 153/182] [builtins] Improve robustness.

* Replace eBuiltinInvalid with std::optional to force callers to figure
  out what to do when a function is not a built-in.
* Do not assert that CallInst::getCalledFunction() returns a function.
  The called operand is not required to be a function, and even if it is
  a function, getCalledFunction() can return nullptr on signature
  mismatches.
* Add a test that we no longer crash on vectorizing a kernel containing
  calls to printf() when a conflicting definition of printf() has been
  provided. This cannot happen in OpenCL, but can happen in NativeCPU.
---
 .../include/compiler/utils/builtin_info.h     |  26 ++-
 .../include/compiler/utils/cl_builtin_info.h  |   6 +-
 .../source/barrier_regions.cpp                |  33 ++--
 .../compiler_pipeline/source/builtin_info.cpp | 150 ++++++++++--------
 .../source/cl_builtin_info.cpp                |  76 +++++----
 .../source/define_mux_builtins_pass.cpp       |  11 +-
 .../optimal_builtin_replacement_pass.cpp      |  19 ++-
 .../source/prepare_barriers_pass.cpp          |  12 +-
 .../source/sub_group_analysis.cpp             |   7 +-
 .../source/work_item_loops_pass.cpp           |   6 +-
 .../analysis/instantiation_analysis.cpp       |   3 +-
 .../source/analysis/simd_width_analysis.cpp   |  11 +-
 .../analysis/uniform_value_analysis.cpp       |  26 +--
 .../vectorizable_function_analysis.cpp        |   2 +-
 .../vecz/source/offset_info.cpp               |  53 ++++---
 .../control_flow_conversion_pass.cpp          |  59 +++----
 .../inline_post_vectorization_pass.cpp        |   4 +-
 .../source/transform/instantiation_pass.cpp   |   5 +-
 .../vecz/source/transform/packetizer.cpp      | 103 ++++++++----
 .../vecz/source/transform/scalarizer.cpp      |  12 +-
 .../vecz/source/vectorization_context.cpp     |  17 +-
 .../vecz/source/vectorization_heuristics.cpp  |  14 +-
 .../vecz/test/lit/llvm/vector_printf_def.ll   |  43 +++++
 23 files changed, 423 insertions(+), 275 deletions(-)
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_def.ll

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/builtin_info.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/builtin_info.h
index e96b99073463c..5147db4ffb7de 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/builtin_info.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/builtin_info.h
@@ -40,7 +40,6 @@ using BuiltinID = int32_t;
 
 enum BaseBuiltinID {
   eBuiltinUnknown,
-  eBuiltinInvalid,
 
   // Mux builtins
   eMuxBuiltinIsFTZ,
@@ -237,9 +236,6 @@ struct Builtin {
   /// overloadable mux builtins)
   std::vector<llvm::Type *> mux_overload_info = {};
 
-  /// @brief returns whether the builtin is valid
-  bool isValid() const { return ID != eBuiltinInvalid; }
-
   /// @brief returns whether the builtin is unknown
   bool isUnknown() const { return ID == eBuiltinUnknown; }
 };
@@ -362,13 +358,13 @@ class BuiltinInfo {
   /// @brief Determine general properties for the given builtin function.
   /// @param[in] F Function to analyze.
   /// @return Analyzed properties for the builtin.
-  Builtin analyzeBuiltin(const llvm::Function &F) const;
+  std::optional<Builtin> analyzeBuiltin(const llvm::Function &F) const;
 
   /// @brief Determine general properties for the given builtin function.
   /// @param[in] CI Call instruction to analyze.
   /// @return Analyzed properties for the builtin call.
-  BuiltinCall analyzeBuiltinCall(const llvm::CallInst &CI,
-                                 unsigned SimdDimIdx) const;
+  std::optional<BuiltinCall> analyzeBuiltinCall(const llvm::CallInst &CI,
+                                                unsigned SimdDimIdx) const;
 
   /// @brief Try to find a builtin function that is a vector equivalent of the
   /// given function with the given vector width, if it exists.
@@ -418,11 +414,11 @@ class BuiltinInfo {
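-  /// @return An identifier for the builtin, or the invalid builtin if there
-  /// is none. This builtin should have a signature of `<void type | integer
+  /// @return An identifier for the builtin, or std::nullopt if there is
+  /// none. This builtin should have a signature of `<void type | integer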
   /// type> <builtin name>(<char*>, ...)`.
-  BuiltinID getPrintfBuiltin() const;
+  std::optional<BuiltinID> getPrintfBuiltin() const;
 
   /// @brief Returns true if the given ID is a ComputeMux builtin ID.
   static bool isMuxBuiltinID(BuiltinID ID) {
-    return ID > eBuiltinInvalid && ID < eFirstTargetBuiltin;
+    return ID > eBuiltinUnknown && ID < eFirstTargetBuiltin;
   }
 
   /// @brief Returns true if the given ID is an overloadable ComputeMux builtin
@@ -451,7 +447,8 @@ class BuiltinInfo {
 
   /// @brief Returns the mux builtin ID matching the group collective, or
-  /// eBuiltinInvalid.
+  /// std::nullopt if there is no matching builtin.
-  static BuiltinID getMuxGroupCollective(const GroupCollective &Group);
+  static std::optional<BuiltinID> getMuxGroupCollective(
+      const GroupCollective &Group);
 
   /// @brief Returns true if the mux builtin has a barrier ID as its first
   /// operand.
@@ -649,8 +646,8 @@ class BuiltinInfo {
   /// @param[in] F The function to identify.
   /// @return Valid builtin ID if the name was identified, as well as any types
   /// required to overload the builtin ID.
-  std::pair<BuiltinID, std::vector<llvm::Type *>> identifyMuxBuiltin(
-      const llvm::Function &F) const;
+  std::optional<std::pair<BuiltinID, std::vector<llvm::Type *>>>
+  identifyMuxBuiltin(const llvm::Function &F) const;
 
   /// @brief Determine whether the given builtin function returns uniform values
   /// or not. An optional call instruction can be passed for more accuracy.
@@ -792,7 +789,8 @@ class BILangInfoConcept {
   /// @see BuiltinInfo::getBuiltinsModule
   virtual llvm::Module *getBuiltinsModule() { return nullptr; }
   /// @see BuiltinInfo::analyzeBuiltin
-  virtual Builtin analyzeBuiltin(const llvm::Function &F) const = 0;
+  virtual std::optional<Builtin> analyzeBuiltin(
+      const llvm::Function &F) const = 0;
   /// @see BuiltinInfo::isBuiltinUniform
   virtual BuiltinUniformity isBuiltinUniform(const Builtin &B,
                                              const llvm::CallInst *,
@@ -820,7 +818,7 @@ class BILangInfoConcept {
     return nullptr;
   }
   /// @see BuiltinInfo::getPrintfBuiltin
-  virtual BuiltinID getPrintfBuiltin() const = 0;
+  virtual std::optional<BuiltinID> getPrintfBuiltin() const = 0;
 };
 
 /// @brief Caches and returns the BuiltinInfo for a Module.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/cl_builtin_info.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/cl_builtin_info.h
index 7c80403f7d35c..9dda278e03b0d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/cl_builtin_info.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/cl_builtin_info.h
@@ -91,7 +91,7 @@ class CLBuiltinInfo : public BILangInfoConcept {
                                      unsigned SimdDimIdx) const override;
 
   /// @see BuiltinInfo::analyzeBuiltin
-  Builtin analyzeBuiltin(const llvm::Function &F) const override;
+  std::optional<Builtin> analyzeBuiltin(const llvm::Function &F) const override;
   /// @see BuiltinInfo::getVectorEquivalent
   llvm::Function *getVectorEquivalent(const Builtin &B, unsigned Width,
                                       llvm::Module *M = nullptr) override;
@@ -106,10 +106,10 @@ class CLBuiltinInfo : public BILangInfoConcept {
   llvm::Instruction *lowerBuiltinToMuxBuiltin(llvm::CallInst &,
                                               BIMuxInfoConcept &) override;
   /// @see BuiltinInfo::getPrintfBuiltin
-  BuiltinID getPrintfBuiltin() const override;
+  std::optional<BuiltinID> getPrintfBuiltin() const override;
 
  private:
-  BuiltinID identifyBuiltin(const llvm::Function &) const;
+  std::optional<BuiltinID> identifyBuiltin(const llvm::Function &) const;
 
   llvm::Function *materializeBuiltin(
       llvm::StringRef BuiltinName, llvm::Module *DestM = nullptr,
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
index 1d9256e5aa9b3..0160470bb4cd7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
@@ -59,11 +59,13 @@ std::optional<compiler::utils::GroupCollective> getWorkGroupCollectiveCall(
     return std::nullopt;
   }
 
-  Function *callee = ci->getCalledFunction();
-  assert(callee && "could not get called function");
-  auto info = bi.isMuxGroupCollective(bi.analyzeBuiltin(*callee).ID);
-  if (info && info->isWorkGroupScope()) {
-    return info;
+  if (Function *callee = ci->getCalledFunction()) {
+    if (const auto b = bi.analyzeBuiltin(*callee)) {
+      const auto info = bi.isMuxGroupCollective(b->ID);
+      if (info && info->isWorkGroupScope()) {
+        return info;
+      }
+    }
   }
   return std::nullopt;
 }
@@ -164,15 +166,17 @@ inline bool CheckValidUse(Value *v) {
 
 bool IsRematerializableBuiltinCall(Value *v, compiler::utils::BuiltinInfo &bi) {
   if (auto *call = dyn_cast<CallInst>(v)) {
-    assert(call->getCalledFunction() && "Could not get called function");
-    const auto B = bi.analyzeBuiltin(*call->getCalledFunction());
-    if (B.properties & compiler::utils::eBuiltinPropertyRematerializable) {
-      for (auto &op : call->operands()) {
-        if (isa<Instruction>(op.get())) {
-          return false;
+    if (auto *F = call->getCalledFunction()) {
+      if (const auto B = bi.analyzeBuiltin(*F)) {
+        if (B->properties & compiler::utils::eBuiltinPropertyRematerializable) {
+          for (auto &op : call->operands()) {
+            if (isa<Instruction>(op.get())) {
+              return false;
+            }
+          }
+          return true;
         }
       }
-      return true;
     }
   }
   return false;
@@ -510,10 +514,9 @@ void compiler::utils::Barrier::FindBarriers() {
     for (Instruction &bi : b) {
       // Check call instructions for barrier.
       if (CallInst *call_inst = dyn_cast<CallInst>(&bi)) {
-        Function *callee = call_inst->getCalledFunction();
-        if (callee != nullptr) {
+        if (Function *callee = call_inst->getCalledFunction()) {
           const auto B = bi_->analyzeBuiltin(*callee);
-          if (BuiltinInfo::isMuxBuiltinWithWGBarrierID(B.ID)) {
+          if (B && BuiltinInfo::isMuxBuiltinWithWGBarrierID(B->ID)) {
             auto *const id_param = call_inst->getOperand(0);
             auto *const id_param_c = cast<ConstantInt>(id_param);
             const auto id = id_param_c->getZExtValue();
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/builtin_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/builtin_info.cpp
index 75f1c307cc99a..2fa7ae9b9b926 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/builtin_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/builtin_info.cpp
@@ -45,11 +45,11 @@ Module *BuiltinInfo::getBuiltinsModule() {
   return nullptr;
 }
 
-std::pair<BuiltinID, std::vector<Type *>> BuiltinInfo::identifyMuxBuiltin(
-    const Function &F) const {
+std::optional<std::pair<BuiltinID, std::vector<Type *>>>
+BuiltinInfo::identifyMuxBuiltin(const Function &F) const {
   StringRef Name = F.getName();
   auto ID =
-      StringSwitch<BuiltinID>(Name)
+      StringSwitch<std::optional<BuiltinID>>(Name)
           .Case(MuxBuiltins::isftz, eMuxBuiltinIsFTZ)
           .Case(MuxBuiltins::usefast, eMuxBuiltinUseFast)
           .Case(MuxBuiltins::isembeddedprofile, eMuxBuiltinIsEmbeddedProfile)
@@ -87,11 +87,11 @@ std::pair<BuiltinID, std::vector<Type *>> BuiltinInfo::identifyMuxBuiltin(
           .Case(MuxBuiltins::work_group_barrier, eMuxBuiltinWorkGroupBarrier)
           .Case(MuxBuiltins::sub_group_barrier, eMuxBuiltinSubGroupBarrier)
           .Case(MuxBuiltins::mem_barrier, eMuxBuiltinMemBarrier)
-          .Default(eBuiltinInvalid);
-  if (ID != eBuiltinInvalid) {
-    switch (ID) {
+          .Default(std::nullopt);
+  if (ID) {
+    switch (*ID) {
       default:
-        return {ID, {}};
+        return {{*ID, {}}};
       case eMuxBuiltinDMARead1D:
       case eMuxBuiltinDMARead2D:
       case eMuxBuiltinDMARead3D:
@@ -104,7 +104,7 @@ std::pair<BuiltinID, std::vector<Type *>> BuiltinInfo::identifyMuxBuiltin(
         // builtins' name (i.e., it's not mangled) as it's required to be
         // consistent at any single snapshot of the module, though it may
         // change through time.
-        return {ID, {F.getReturnType()}};
+        return {{*ID, {F.getReturnType()}}};
     }
   }
 
@@ -115,7 +115,7 @@ std::pair<BuiltinID, std::vector<Type *>> BuiltinInfo::identifyMuxBuiltin(
   const bool IsVecgroupOp = Name.consume_front("__mux_vec_group_");
   if (!IsSubgroupOp && !IsVecgroupOp &&
       !Name.consume_front("__mux_work_group_")) {
-    return {eBuiltinInvalid, {}};
+    return std::nullopt;
   }
 
 #define SCOPED_GROUP_OP(OP)                 \
@@ -135,22 +135,22 @@ std::pair<BuiltinID, std::vector<Type *>> BuiltinInfo::identifyMuxBuiltin(
     ID = SCOPED_GROUP_OP(Broadcast);
   } else if (Name.consume_front("shuffle_up")) {
     if (!IsSubgroupOp) {
-      return {eBuiltinInvalid, {}};
+      return std::nullopt;
     }
     ID = eMuxBuiltinSubgroupShuffleUp;
   } else if (Name.consume_front("shuffle_down")) {
     if (!IsSubgroupOp) {
-      return {eBuiltinInvalid, {}};
+      return std::nullopt;
     }
     ID = eMuxBuiltinSubgroupShuffleDown;
   } else if (Name.consume_front("shuffle_xor")) {
     if (!IsSubgroupOp) {
-      return {eBuiltinInvalid, {}};
+      return std::nullopt;
     }
     ID = eMuxBuiltinSubgroupShuffleXor;
   } else if (Name.consume_front("shuffle")) {
     if (!IsSubgroupOp) {
-      return {eBuiltinInvalid, {}};
+      return std::nullopt;
     }
     ID = eMuxBuiltinSubgroupShuffle;
   } else if (Name.consume_front("reduce_")) {
@@ -166,7 +166,7 @@ std::pair<BuiltinID, std::vector<Type *>> BuiltinInfo::identifyMuxBuiltin(
       Name = Name.drop_front(RealGroup.size());
     }
 
-    ID = StringSwitch<BuiltinID>(Group)
+    ID = StringSwitch<std::optional<BuiltinID>>(Group)
              .Case("add", SCOPED_GROUP_OP(ReduceAdd))
              .Case("fadd", SCOPED_GROUP_OP(ReduceFAdd))
              .Case("mul", SCOPED_GROUP_OP(ReduceMul))
@@ -183,11 +183,11 @@ std::pair<BuiltinID, std::vector<Type *>> BuiltinInfo::identifyMuxBuiltin(
              .Case("logical_and", SCOPED_GROUP_OP(ReduceLogicalAnd))
              .Case("logical_or", SCOPED_GROUP_OP(ReduceLogicalOr))
              .Case("logical_xor", SCOPED_GROUP_OP(ReduceLogicalXor))
-             .Default(eBuiltinInvalid);
+             .Default(std::nullopt);
   } else if (Name.consume_front("scan_")) {
     const bool IsInclusive = Name.consume_front("inclusive_");
     if (!IsInclusive && !Name.consume_front("exclusive_")) {
-      return {eBuiltinInvalid, {}};
+      return std::nullopt;
     }
 
     auto NextIdx = Name.find_first_of('_');
@@ -201,7 +201,7 @@ std::pair<BuiltinID, std::vector<Type *>> BuiltinInfo::identifyMuxBuiltin(
       Name = Name.drop_front(RealGroup.size());
     }
 
-    ID = StringSwitch<BuiltinID>(Group)
+    ID = StringSwitch<std::optional<BuiltinID>>(Group)
              .Case("add", IsInclusive ? SCOPED_GROUP_OP(ScanAddInclusive)
                                       : SCOPED_GROUP_OP(ScanAddExclusive))
              .Case("fadd", IsInclusive ? SCOPED_GROUP_OP(ScanFAddInclusive)
@@ -237,39 +237,41 @@ std::pair<BuiltinID, std::vector<Type *>> BuiltinInfo::identifyMuxBuiltin(
              .Case("logical_xor",
                    IsInclusive ? SCOPED_GROUP_OP(ScanLogicalXorInclusive)
                                : SCOPED_GROUP_OP(ScanLogicalXorExclusive))
-             .Default(eBuiltinInvalid);
+             .Default(std::nullopt);
+  }
+  if (!ID) {
+    return std::nullopt;
   }
 
   std::vector<Type *> OverloadInfo;
-  if (ID != eBuiltinInvalid) {
-    // Consume the rest of this group Op function name. If we can't identify a
-    // series of mangled type names, this builtin is invalid.
-    unsigned NumMangledArgs = 0;
-    // Work-group builtins have an unmangled 'barrier ID' parameter first, which
-    // we want to skip.
-    const unsigned Offset = ID >= eFirstMuxWorkgroupCollectiveBuiltin &&
-                            ID <= eLastMuxWorkgroupCollectiveBuiltin;
-    while (!Name.empty()) {
-      if (!Name.consume_front("_")) {
-        return {eBuiltinInvalid, {}};
-      }
-      auto [Ty, NewName] = getDemangledTypeFromStr(Name, F.getContext());
-      Name = NewName;
 
-      auto ParamIdx = Offset + NumMangledArgs;
-      if (ParamIdx >= F.arg_size() || Ty != F.getArg(ParamIdx)->getType()) {
-        return {eBuiltinInvalid, {}};
-      }
-
-      ++NumMangledArgs;
-      OverloadInfo.push_back(Ty);
+  // Consume the rest of this group Op function name. If we can't identify a
+  // series of mangled type names, this builtin is invalid.
+  unsigned NumMangledArgs = 0;
+  // Work-group builtins have an unmangled 'barrier ID' parameter first, which
+  // we want to skip.
+  const unsigned Offset = ID >= eFirstMuxWorkgroupCollectiveBuiltin &&
+                          ID <= eLastMuxWorkgroupCollectiveBuiltin;
+  while (!Name.empty()) {
+    if (!Name.consume_front("_")) {
+      return std::nullopt;
     }
-    if (NumMangledArgs != NumExpectedMangledArgs) {
-      return {eBuiltinInvalid, {}};
+    auto [Ty, NewName] = getDemangledTypeFromStr(Name, F.getContext());
+    Name = NewName;
+
+    auto ParamIdx = Offset + NumMangledArgs;
+    if (ParamIdx >= F.arg_size() || Ty != F.getArg(ParamIdx)->getType()) {
+      return std::nullopt;
     }
+
+    ++NumMangledArgs;
+    OverloadInfo.push_back(Ty);
+  }
+  if (NumMangledArgs != NumExpectedMangledArgs) {
+    return std::nullopt;
   }
 
-  return {ID, OverloadInfo};
+  return {{*ID, OverloadInfo}};
 #undef SCOPED_GROUP_OP
 }
 
@@ -330,7 +332,7 @@ BuiltinUniformity BuiltinInfo::isBuiltinUniform(const Builtin &B,
   return eBuiltinUniformityUnknown;
 }
 
-Builtin BuiltinInfo::analyzeBuiltin(const Function &F) const {
+std::optional<Builtin> BuiltinInfo::analyzeBuiltin(const Function &F) const {
   // Handle LLVM intrinsics.
   if (F.isIntrinsic()) {
     int32_t Properties = eBuiltinPropertyNone;
@@ -413,16 +415,17 @@ Builtin BuiltinInfo::analyzeBuiltin(const Function &F) const {
     return Builtin{F, eBuiltinUnknown, (BuiltinProperties)Properties};
   }
 
-  auto [ID, OverloadInfo] = identifyMuxBuiltin(F);
-
-  if (ID == eBuiltinInvalid) {
+  auto MB = identifyMuxBuiltin(F);
+  if (!MB) {
     // It's not a Mux builtin, so defer to the language implementation
     if (LangImpl) {
       return LangImpl->analyzeBuiltin(F);
     }
-    return Builtin{F, ID, eBuiltinPropertyNone};
+    return std::nullopt;
   }
 
+  auto [ID, OverloadInfo] = *MB;
+
   // Check that all overloadable builtins have returned some overloading
   // information, for API consistency.
   assert((!isOverloadableMuxBuiltinID(ID) || !OverloadInfo.empty()) &&
@@ -488,13 +491,15 @@ Builtin BuiltinInfo::analyzeBuiltin(const Function &F) const {
   return Builtin{F, ID, (BuiltinProperties)Properties, OverloadInfo};
 }
 
-BuiltinCall BuiltinInfo::analyzeBuiltinCall(const CallInst &CI,
-                                            unsigned SimdDimIdx) const {
-  auto *const callee = CI.getCalledFunction();
-  assert(callee && "Call instruction with no callee");
-  const auto B = analyzeBuiltin(*callee);
-  const auto U = isBuiltinUniform(B, &CI, SimdDimIdx);
-  return BuiltinCall{B, CI, U};
+std::optional<BuiltinCall> BuiltinInfo::analyzeBuiltinCall(
+    const CallInst &CI, unsigned SimdDimIdx) const {
+  if (auto *const callee = dyn_cast<Function>(CI.getCalledOperand())) {
+    if (const auto B = analyzeBuiltin(*callee)) {
+      const auto U = isBuiltinUniform(*B, &CI, SimdDimIdx);
+      return BuiltinCall{*B, CI, U};
+    }
+  }
+  return std::nullopt;
 }
 
 Function *BuiltinInfo::getVectorEquivalent(const Builtin &B, unsigned Width,
@@ -570,8 +575,9 @@ std::optional<llvm::ConstantRange> BuiltinInfo::getBuiltinRange(
   }
 
   // First, check mux builtins
-  if (auto [ID, _] = identifyMuxBuiltin(*F); isMuxBuiltinID(ID)) {
-    return MuxImpl->getBuiltinRange(CI, ID, MaxLocalSizes, MaxGlobalSizes);
+  if (auto MB = identifyMuxBuiltin(*F); MB && isMuxBuiltinID(MB->first)) {
+    return MuxImpl->getBuiltinRange(CI, MB->first, MaxLocalSizes,
+                                    MaxGlobalSizes);
   }
 
   // Next, ask the language builtin info
@@ -590,11 +596,11 @@ Instruction *BuiltinInfo::lowerBuiltinToMuxBuiltin(CallInst &CI) {
   return nullptr;
 }
 
-BuiltinID BuiltinInfo::getPrintfBuiltin() const {
+std::optional<BuiltinID> BuiltinInfo::getPrintfBuiltin() const {
   if (LangImpl) {
     return LangImpl->getPrintfBuiltin();
   }
-  return eBuiltinInvalid;
+  return std::nullopt;
 }
 
 bool BuiltinInfo::requiresSchedulingParameters(BuiltinID ID) {
@@ -1144,7 +1150,8 @@ std::optional<GroupCollective> BuiltinInfo::isMuxGroupCollective(BuiltinID ID) {
 #undef CASE_GROUP_OP_ALL_SCOPES
 }
 
-BuiltinID BuiltinInfo::getMuxGroupCollective(const GroupCollective &Group) {
+std::optional<BuiltinID> BuiltinInfo::getMuxGroupCollective(
+    const GroupCollective &Group) {
 #define SIMPLE_SCOPE_SWITCH(OP)                     \
   do {                                              \
     switch (Group.Scope) {                          \
@@ -1219,19 +1226,26 @@ BuiltinID BuiltinInfo::getMuxGroupCollective(const GroupCollective &Group) {
       COMPLEX_SCOPE_SWITCH(Scan, Inclusive);
       break;
     case GroupCollective::OpKind::Shuffle:
-      return Group.isSubGroupScope() ? eMuxBuiltinSubgroupShuffle
-                                     : eBuiltinInvalid;
     case GroupCollective::OpKind::ShuffleUp:
-      return Group.isSubGroupScope() ? eMuxBuiltinSubgroupShuffleUp
-                                     : eBuiltinInvalid;
     case GroupCollective::OpKind::ShuffleDown:
-      return Group.isSubGroupScope() ? eMuxBuiltinSubgroupShuffleDown
-                                     : eBuiltinInvalid;
     case GroupCollective::OpKind::ShuffleXor:
-      return Group.isSubGroupScope() ? eMuxBuiltinSubgroupShuffleXor
-                                     : eBuiltinInvalid;
+      if (!Group.isSubGroupScope()) {
+        break;
+      }
+      switch (Group.Op) {
+        default:
+          llvm_unreachable("Unhandled op");
+        case GroupCollective::OpKind::Shuffle:
+          return eMuxBuiltinSubgroupShuffle;
+        case GroupCollective::OpKind::ShuffleUp:
+          return eMuxBuiltinSubgroupShuffleUp;
+        case GroupCollective::OpKind::ShuffleDown:
+          return eMuxBuiltinSubgroupShuffleDown;
+        case GroupCollective::OpKind::ShuffleXor:
+          return eMuxBuiltinSubgroupShuffleXor;
+      }
   }
-  return eBuiltinInvalid;
+  return std::nullopt;
 #undef COMPLEX_SCOPE_SWITCH
 #undef SCOPE_SWITCH
 }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp
index bf156687fd88e..69042752fd7a7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp
@@ -798,7 +798,6 @@ static constexpr CLBuiltinEntry Builtins[] = {
     {eCLBuiltinCodeplayUnpackNormalize, "codeplay_unpack_normalize"},
     {eCLBuiltinCodeplayUnpackHalf2, "codeplay_unpack_half2"},
 
-    {eBuiltinInvalid, nullptr},
     {eBuiltinUnknown, nullptr}};
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -839,7 +838,9 @@ Function *CLBuiltinInfo::declareBuiltin(Module *M, BuiltinID ID, Type *RetTy,
   return Builtin;
 }
 
-BuiltinID CLBuiltinInfo::getPrintfBuiltin() const { return eCLBuiltinPrintf; }
+std::optional<BuiltinID> CLBuiltinInfo::getPrintfBuiltin() const {
+  return eCLBuiltinPrintf;
+}
 
 Module *CLBuiltinInfo::getBuiltinsModule() {
   if (!Loader) {
@@ -870,13 +871,14 @@ Function *CLBuiltinInfo::materializeBuiltin(StringRef BuiltinName,
   return Loader->materializeBuiltin(BuiltinName, DestM, Flags);
 }
 
-BuiltinID CLBuiltinInfo::identifyBuiltin(const Function &F) const {
+std::optional<BuiltinID> CLBuiltinInfo::identifyBuiltin(
+    const Function &F) const {
   NameMangler Mangler(nullptr);
   const StringRef Name = F.getName();
   const CLBuiltinEntry *entry = Builtins;
   const auto Version = getOpenCLVersion(*F.getParent());
   const StringRef DemangledName = Mangler.demangleName(Name);
-  while (entry->ID != eBuiltinInvalid) {
+  while (entry->ID != eBuiltinUnknown) {
     if (Version >= entry->MinVer && DemangledName == entry->OpenCLFnName) {
       return entry->ID;
     }
@@ -885,7 +887,7 @@ BuiltinID CLBuiltinInfo::identifyBuiltin(const Function &F) const {
 
   if (DemangledName == Name) {
     // The function name is not mangled and so it can not be an OpenCL builtin.
-    return eBuiltinInvalid;
+    return std::nullopt;
   }
 
   Lexer L(Mangler.demangleName(Name));
@@ -940,7 +942,7 @@ BuiltinID CLBuiltinInfo::identifyBuiltin(const Function &F) const {
 
 llvm::StringRef CLBuiltinInfo::getBuiltinName(BuiltinID ID) const {
   const CLBuiltinEntry *entry = Builtins;
-  while (entry->ID != eBuiltinInvalid) {
+  while (entry->ID != eBuiltinUnknown) {
     if (ID == entry->ID) {
       return entry->OpenCLFnName;
     }
@@ -954,21 +956,27 @@ BuiltinUniformity CLBuiltinInfo::isBuiltinUniform(const Builtin &,
                                                   unsigned) const {
   // Assume that builtins with side effects are varying.
   if (Function *Callee = CI->getCalledFunction()) {
-    const auto Props = analyzeBuiltin(*Callee).properties;
-    if (Props & eBuiltinPropertySideEffects) {
-      return eBuiltinUniformityNever;
+    if (auto B = analyzeBuiltin(*Callee)) {
+      const auto Props = B->properties;
+      if (Props & eBuiltinPropertySideEffects) {
+        return eBuiltinUniformityNever;
+      }
     }
   }
 
   return eBuiltinUniformityLikeInputs;
 }
 
-Builtin CLBuiltinInfo::analyzeBuiltin(const Function &Callee) const {
-  const BuiltinID ID = identifyBuiltin(Callee);
+std::optional<Builtin> CLBuiltinInfo::analyzeBuiltin(
+    const Function &Callee) const {
+  const auto ID = identifyBuiltin(Callee);
+  if (!ID) {
+    return std::nullopt;
+  }
 
   bool IsConvergent = false;
   unsigned Properties = eBuiltinPropertyNone;
-  switch (ID) {
+  switch (*ID) {
     default:
       // Assume convergence on unknown builtins.
       IsConvergent = true;
@@ -1252,7 +1260,7 @@ Builtin CLBuiltinInfo::analyzeBuiltin(const Function &Callee) const {
     Properties |= eBuiltinPropertyKnownNonConvergent;
   }
 
-  return Builtin{Callee, ID, (BuiltinProperties)Properties};
+  return Builtin{Callee, *ID, (BuiltinProperties)Properties};
 }
 
 Function *CLBuiltinInfo::getVectorEquivalent(const Builtin &B, unsigned Width,
@@ -1512,12 +1520,12 @@ Value *CLBuiltinInfo::emitBuiltinInline(Function *F, IRBuilder<> &B,
   }
 
   // Handle 'common' builtins.
-  const BuiltinID BuiltinID = identifyBuiltin(*F);
-  if (BuiltinID != eBuiltinInvalid && BuiltinID != eBuiltinUnknown) {
+  const auto BuiltinID = identifyBuiltin(*F);
+  if (BuiltinID && *BuiltinID != eBuiltinUnknown) {
     // Note we have to handle these specially since we need to deduce whether
     // the source operand is signed or not. It is not possible to do this based
     // solely on the BuiltinID.
-    switch (BuiltinID) {
+    switch (*BuiltinID) {
         // 6.2 Explicit Conversions
       case eCLBuiltinConvertChar:
       case eCLBuiltinConvertShort:
@@ -1527,7 +1535,7 @@ Value *CLBuiltinInfo::emitBuiltinInline(Function *F, IRBuilder<> &B,
       case eCLBuiltinConvertUShort:
       case eCLBuiltinConvertUInt:
       case eCLBuiltinConvertULong:
-        return emitBuiltinInlineConvert(F, BuiltinID, B, Args);
+        return emitBuiltinInlineConvert(F, *BuiltinID, B, Args);
         // 6.12.3 Integer Functions
       case eCLBuiltinAddSat:
       case eCLBuiltinSubSat: {
@@ -1590,7 +1598,7 @@ Value *CLBuiltinInfo::emitBuiltinInline(Function *F, IRBuilder<> &B,
       default:
         break;
     }
-    return emitBuiltinInline(BuiltinID, B, Args);
+    return emitBuiltinInline(*BuiltinID, B, Args);
   }
 
   return nullptr;
@@ -2315,26 +2323,26 @@ Value *CLBuiltinInfo::emitBuiltinInlineVStoreHalf(Function *F, StringRef Mode,
   }
 
   // Declare the conversion builtin.
-  BuiltinID ConvID;
+  std::optional<BuiltinID> ConvID;
 
   if (Data->getType() == B.getFloatTy()) {
-    ConvID = StringSwitch<BuiltinID>(Mode)
+    ConvID = StringSwitch<std::optional<BuiltinID>>(Mode)
                  .Case("", eCLBuiltinConvertFloatToHalf)
                  .Case("_rte", eCLBuiltinConvertFloatToHalfRte)
                  .Case("_rtz", eCLBuiltinConvertFloatToHalfRtz)
                  .Case("_rtp", eCLBuiltinConvertFloatToHalfRtp)
                  .Case("_rtn", eCLBuiltinConvertFloatToHalfRtn)
-                 .Default(eBuiltinInvalid);
+                 .Default(std::nullopt);
   } else {
-    ConvID = StringSwitch<BuiltinID>(Mode)
+    ConvID = StringSwitch<std::optional<BuiltinID>>(Mode)
                  .Case("", eCLBuiltinConvertDoubleToHalf)
                  .Case("_rte", eCLBuiltinConvertDoubleToHalfRte)
                  .Case("_rtz", eCLBuiltinConvertDoubleToHalfRtz)
                  .Case("_rtp", eCLBuiltinConvertDoubleToHalfRtp)
                  .Case("_rtn", eCLBuiltinConvertDoubleToHalfRtn)
-                 .Default(eBuiltinInvalid);
+                 .Default(std::nullopt);
   }
-  if (ConvID == eBuiltinInvalid) {
+  if (!ConvID) {
     return nullptr;
   }
   Module *M = F->getParent();
@@ -2343,7 +2351,7 @@ Value *CLBuiltinInfo::emitBuiltinInlineVStoreHalf(Function *F, StringRef Mode,
   // However, if the double extension is enabled, it is also possible to use
   // double instead. This means that we might have to convert either a float or
   // a double to a half.
-  Function *FloatToHalfFn = declareBuiltin(M, ConvID, B.getInt16Ty(),
+  Function *FloatToHalfFn = declareBuiltin(M, *ConvID, B.getInt16Ty(),
                                            {Data->getType()}, {eTypeQualNone});
   if (!FloatToHalfFn) {
     return nullptr;
@@ -2787,11 +2795,16 @@ Instruction *CLBuiltinInfo::lowerBuiltinToMuxBuiltin(
     CallInst &CI, BIMuxInfoConcept &BIMuxImpl) {
   auto &M = *CI.getModule();
   auto *const F = CI.getCalledFunction();
-  assert(F && "No calling function?");
+  if (!F) {
+    return nullptr;
+  }
   const auto ID = identifyBuiltin(*F);
+  if (!ID) {
+    return nullptr;
+  }
 
   // Handle straightforward 1:1 mappings.
-  if (auto MuxID = get1To1BuiltinLowering(ID)) {
+  if (auto MuxID = get1To1BuiltinLowering(*ID)) {
     auto *const MuxBuiltinFn = BIMuxImpl.getOrDeclareMuxBuiltin(*MuxID, M);
     assert(MuxBuiltinFn && "Could not get/declare mux builtin");
     const SmallVector<Value *> Args(CI.args());
@@ -2811,11 +2824,12 @@ Instruction *CLBuiltinInfo::lowerBuiltinToMuxBuiltin(
   unsigned DefaultMemOrder =
       BIMuxInfoConcept::MemSemanticsSequentiallyConsistent;
 
-  switch (ID) {
+  switch (*ID) {
     default:
       // Sub-group and work-group builtins need lowering to their mux
       // equivalents.
-      if (auto *const NewI = lowerGroupBuiltinToMuxBuiltin(CI, ID, BIMuxImpl)) {
+      if (auto *const NewI =
+              lowerGroupBuiltinToMuxBuiltin(CI, *ID, BIMuxImpl)) {
         return NewI;
       }
       return nullptr;
@@ -2889,7 +2903,7 @@ Instruction *CLBuiltinInfo::lowerBuiltinToMuxBuiltin(
     case eCLBuiltinAsyncWorkGroupStridedCopy:
     case eCLBuiltinAsyncWorkGroupCopy2D2D:
     case eCLBuiltinAsyncWorkGroupCopy3D3D:
-      return lowerAsyncBuiltinToMuxBuiltin(CI, ID, BIMuxImpl);
+      return lowerAsyncBuiltinToMuxBuiltin(CI, *ID, BIMuxImpl);
     case eCLBuiltinWaitGroupEvents: {
       auto *const MuxWait =
           BIMuxImpl.getOrDeclareMuxBuiltin(eMuxBuiltinDMAWait, M);
@@ -2927,7 +2941,7 @@ Instruction *CLBuiltinInfo::lowerGroupBuiltinToMuxBuiltin(
   // integer variant and do a checking step afterwards where we refine the
   // builtin ID.
   bool RecheckOpType = false;
-  BaseBuiltinID MuxBuiltinID = eBuiltinInvalid;
+  BaseBuiltinID MuxBuiltinID;
   switch (ID) {
     default:
       return nullptr;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/define_mux_builtins_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/define_mux_builtins_pass.cpp
index 6acdfc09d02e4..5229fa6064f25 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/define_mux_builtins_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/define_mux_builtins_pass.cpp
@@ -28,8 +28,12 @@ PreservedAnalyses compiler::utils::DefineMuxBuiltinsPass::run(
   auto &BI = AM.getResult<BuiltinInfoAnalysis>(M);
 
   auto functionNeedsDefining = [&BI](Function &F) {
-    return F.isDeclaration() && !F.isIntrinsic() &&
-           BI.isMuxBuiltinID(BI.analyzeBuiltin(F).ID);
+    if (F.isDeclaration() && !F.isIntrinsic()) {
+      if (auto B = BI.analyzeBuiltin(F)) {
+        return BI.isMuxBuiltinID(B->ID);
+      }
+    }
+    return false;
   };
 
   // Define all mux builtins
@@ -43,7 +47,8 @@ PreservedAnalyses compiler::utils::DefineMuxBuiltinsPass::run(
     // will be appended to the module's function list and so will be
     // encountered by later iterations.
     auto Builtin = BI.analyzeBuiltin(F);
-    if (BI.defineMuxBuiltin(Builtin.ID, M, Builtin.mux_overload_info)) {
+    assert(Builtin && "Failed to analyze builtin");
+    if (BI.defineMuxBuiltin(Builtin->ID, M, Builtin->mux_overload_info)) {
       Changed = true;
     }
   }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/optimal_builtin_replacement_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/optimal_builtin_replacement_pass.cpp
index bf6dfc786e353..f548d652f4510 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/optimal_builtin_replacement_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/optimal_builtin_replacement_pass.cpp
@@ -232,12 +232,13 @@ PreservedAnalyses OptimalBuiltinReplacementPass::run(LazyCallGraph::SCC &C,
 
   const auto &MAMProxy = AM.getResult<ModuleAnalysisManagerCGSCCProxy>(C, CG);
   if (auto *BI = MAMProxy.getCachedResult<BuiltinInfoAnalysis>(M)) {
-    replacements.emplace_back(
-        [BI](CallBase &CB, StringRef, const SmallVectorImpl<Type *> &,
-             const SmallVectorImpl<TypeQualifiers> &) -> Value * {
-          Function *Callee = CB.getCalledFunction();
-          const auto Props = BI->analyzeBuiltin(*Callee).properties;
-          if (Props & eBuiltinPropertyCanEmitInline) {
+    replacements.emplace_back([BI](CallBase &CB, StringRef,
+                                   const SmallVectorImpl<Type *> &,
+                                   const SmallVectorImpl<TypeQualifiers> &)
+                                  -> Value * {
+      if (Function *Callee = CB.getCalledFunction()) {
+        if (const auto Builtin = BI->analyzeBuiltin(*Callee)) {
+          if (Builtin->properties & eBuiltinPropertyCanEmitInline) {
             IRBuilder<> B(&CB);
             const SmallVector<Value *, 4> Args(CB.args());
             if (Value *Impl = BI->emitBuiltinInline(Callee, B, Args)) {
@@ -248,8 +249,10 @@ PreservedAnalyses OptimalBuiltinReplacementPass::run(LazyCallGraph::SCC &C,
               return Impl;
             }
           }
-          return nullptr;
-        });
+        }
+      }
+      return nullptr;
+    });
   }
 
   if (adjustReplacements) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/prepare_barriers_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/prepare_barriers_pass.cpp
index cb49bce92fdf2..0c6067b3f6871 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/prepare_barriers_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/prepare_barriers_pass.cpp
@@ -44,7 +44,7 @@ PreservedAnalyses compiler::utils::PrepareBarriersPass::run(
   for (Function &F : M) {
     const auto B = BI.analyzeBuiltin(F);
     // If the function does not have a barrier id.
-    if (!BI.isMuxBuiltinWithBarrierID(B.ID)) {
+    if (!B || !BI.isMuxBuiltinWithBarrierID(B->ID)) {
       continue;
     }
 
@@ -114,10 +114,12 @@ PreservedAnalyses compiler::utils::PrepareBarriersPass::run(
       for (Instruction &I : BB) {
         // Check call instructions for barrier.
         if (auto *const CI = dyn_cast<CallInst>(&I)) {
-          Function *Callee = CI->getCalledFunction();
-          if (Callee &&
-              BI.isMuxBuiltinWithBarrierID(BI.analyzeBuiltin(*Callee).ID)) {
-            CI->setOperand(0, ConstantInt::get(I32Ty, ID++));
+          if (Function *Callee = CI->getCalledFunction()) {
+            if (auto B = BI.analyzeBuiltin(*Callee)) {
+              if (BI.isMuxBuiltinWithBarrierID(B->ID)) {
+                CI->setOperand(0, ConstantInt::get(I32Ty, ID++));
+              }
+            }
           }
         }
       }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/sub_group_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/sub_group_analysis.cpp
index bbbf353517ff3..c2bf37e7fcff8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/sub_group_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/sub_group_analysis.cpp
@@ -107,8 +107,11 @@ std::optional<Builtin> GlobalSubgroupInfo::isMuxSubgroupBuiltin(
     return std::nullopt;
   }
   auto SGBuiltin = BI.analyzeBuiltin(*F);
+  if (!SGBuiltin) {
+    return std::nullopt;
+  }
 
-  switch (SGBuiltin.ID) {
+  switch (SGBuiltin->ID) {
     default:
       break;
     case eMuxBuiltinSubGroupBarrier:
@@ -120,7 +123,7 @@ std::optional<Builtin> GlobalSubgroupInfo::isMuxSubgroupBuiltin(
       return SGBuiltin;
   }
 
-  if (auto GroupOp = BI.isMuxGroupCollective(SGBuiltin.ID);
+  if (auto GroupOp = BI.isMuxGroupCollective(SGBuiltin->ID);
       GroupOp && GroupOp->isSubGroupScope()) {
     return SGBuiltin;
   }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
index 1c0fbe6a9d92f..bd3dfd8818a75 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
@@ -423,7 +423,8 @@ struct ScheduleGenerator {
     }
 
     auto Builtin = BI.analyzeBuiltin(*BarrierCall->getCalledFunction());
-    return BI.isMuxGroupCollective(Builtin.ID);
+    assert(Builtin && "Barrier call must be a known builtin");
+    return BI.isMuxGroupCollective(Builtin->ID);
   }
 
   std::tuple<BasicBlock *, Value *,
@@ -1600,7 +1601,8 @@ Function *compiler::utils::WorkItemLoopsPass::makeWrapperFunction(
       if (auto *const CI = barrierMain.getBarrierCall(i)) {
         auto *const callee = CI->getCalledFunction();
         const auto builtin = BI.analyzeBuiltin(*callee);
-        if (builtin.ID == compiler::utils::eMuxBuiltinWorkGroupBarrier) {
+        if (builtin &&
+            builtin->ID == compiler::utils::eMuxBuiltinWorkGroupBarrier) {
           IRBuilder<> B(block);
           auto *MemBarrier =
               BI.getOrDeclareMuxBuiltin(eMuxBuiltinMemBarrier, M);
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/instantiation_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/instantiation_analysis.cpp
index 0dcd6bf69f743..54f64218d0b16 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/instantiation_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/instantiation_analysis.cpp
@@ -63,7 +63,8 @@ bool analyzeCall(const VectorizationContext &Ctx, CallInst *CI) {
     return true;
   }
 
-  const auto Props = Ctx.builtins().analyzeBuiltin(*Callee).properties;
+  auto B = Ctx.builtins().analyzeBuiltin(*Callee);
+  const auto Props = B ? B->properties : 0;
 
   // Intrinsics without side-effects can be safely instantiated.
   if (Callee->isIntrinsic() &&
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/simd_width_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/simd_width_analysis.cpp
index 86627dc4a2ed2..8fc023539213b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/simd_width_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/simd_width_analysis.cpp
@@ -96,11 +96,12 @@ unsigned SimdWidthAnalysis::avoidSpillImpl(Function &F,
   auto ShouldConsider = [&](const Value *V) -> bool {
     // Filter out work item builtin calls such as get_local_id()
     if (auto *const CI = dyn_cast<CallInst>(V)) {
-      const Function *Callee = CI->getCalledFunction();
-      if (Callee &&
-          VU.context().builtins().analyzeBuiltin(*Callee).properties ==
-              compiler::utils::eBuiltinPropertyWorkItem) {
-        return false;
+      if (const Function *Callee = CI->getCalledFunction()) {
+        if (auto B = VU.context().builtins().analyzeBuiltin(*Callee)) {
+          if (B->properties == compiler::utils::eBuiltinPropertyWorkItem) {
+            return false;
+          }
+        }
       }
     }
     return true;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp
index d4f5032b15982..85cb97be03335 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp
@@ -171,7 +171,11 @@ static bool isGroupBroadcastOrReduction(
   if (!Callee) {
     return false;
   }
-  auto Info = BI.isMuxGroupCollective(BI.analyzeBuiltin(*Callee).ID);
+  auto B = BI.analyzeBuiltin(*Callee);
+  if (!B) {
+    return false;
+  }
+  auto Info = BI.isMuxGroupCollective(B->ID);
   return Info && (Info->isSubGroupScope() || Info->isWorkGroupScope()) &&
          (Info->isAnyAll() || Info->isReduction() || Info->isBroadcast());
 }
@@ -206,7 +210,7 @@ void UniformValueResult::findVectorLeaves(
           if (!Callee->isIntrinsic() && CI->use_empty()) {
             // Try to identify the called function
             const auto Builtin = BI.analyzeBuiltin(*Callee);
-            if (!Builtin.isValid()) {
+            if (!Builtin) {
               Leaves.push_back(CI);
             }
           }
@@ -268,7 +272,10 @@ void UniformValueResult::findVectorRoots(std::vector<Value *> &Roots) const {
         continue;
       }
       const auto Builtin = BI.analyzeBuiltinCall(*CI, dimension);
-      const auto Uniformity = Builtin.uniformity;
+      if (!Builtin) {
+        continue;
+      }
+      const auto Uniformity = Builtin->uniformity;
       if (Uniformity == compiler::utils::eBuiltinUniformityInstanceID ||
           Uniformity == compiler::utils::eBuiltinUniformityMaybeInstanceID) {
         // Calls to `get_global_id`/`get_local_id` are roots.
@@ -329,12 +336,12 @@ void UniformValueResult::markVaryingValues(Value *V, Value *From) {
     Function *Callee = CI->getCalledFunction();
     if (Callee) {
       const compiler::utils::BuiltinInfo &BI = Ctx.builtins();
-      const auto Builtin = BI.analyzeBuiltinCall(*CI, dimension);
-      const auto Uniformity = Builtin.uniformity;
-      if (Uniformity == compiler::utils::eBuiltinUniformityAlways) {
-        return;
+      if (const auto Builtin = BI.analyzeBuiltinCall(*CI, dimension)) {
+        const auto Uniformity = Builtin->uniformity;
+        if (Uniformity == compiler::utils::eBuiltinUniformityAlways) {
+          return;
+        }
       }
-
       if (auto Op = MemOp::get(CI)) {
         // The mask cannot affect the MemOp value, even though we may still
         // need to packetize the mask..
@@ -509,7 +516,8 @@ UniformValueResult UniformValueAnalysis::run(
       if (CallInst *CI = dyn_cast<CallInst>(&I)) {
         if (Function *Callee = CI->getCalledFunction()) {
           const auto Builtin = BI.analyzeBuiltin(*Callee);
-          if (Builtin.properties & compiler::utils::eBuiltinPropertyAtomic) {
+          if (Builtin &&
+              Builtin->properties & compiler::utils::eBuiltinPropertyAtomic) {
             Res.markVaryingValues(&I);
             continue;
           }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorizable_function_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorizable_function_analysis.cpp
index 7afc0e48dd6fc..d112bcd90af9a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorizable_function_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorizable_function_analysis.cpp
@@ -74,7 +74,7 @@ bool canVectorize(const Instruction &I, const VectorizationContext &Ctx) {
       // correspond to user functions.
       const compiler::utils::BuiltinInfo &BI = Ctx.builtins();
       const auto Builtin = BI.analyzeBuiltin(*Callee);
-      if (!Builtin.isValid()) {
+      if (!Builtin) {
         // If it is a user function missing a definition, we cannot safely
         // instantiate it. For example, what if it contains calls to
         // get_global_id internally?
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp
index 245bad2aa03d7..9f08ebfa24969 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp
@@ -319,32 +319,33 @@ OffsetInfo &OffsetInfo::analyze(Value *Offset, StrideAnalysisResult &SAR) {
   // Analyse function calls.
   if (CallInst *CI = dyn_cast<CallInst>(Offset)) {
     const auto &BI = SAR.UVR.Ctx.builtins();
-    const auto Builtin = BI.analyzeBuiltinCall(*CI, SAR.UVR.dimension);
-    switch (Builtin.uniformity) {
-      default:
-      case compiler::utils::eBuiltinUniformityMaybeInstanceID:
-      case compiler::utils::eBuiltinUniformityNever:
-        return setMayDiverge();
-      case compiler::utils::eBuiltinUniformityLikeInputs:
-        break;
-      case compiler::utils::eBuiltinUniformityAlways:
-        return setKind(eOffsetUniformVariable);
-      case compiler::utils::eBuiltinUniformityInstanceID:
-        if (Builtin.properties & compiler::utils::eBuiltinPropertyLocalID) {
-          // If the local size is unknown (represented by zero), the resulting
-          // mask will be ~0ULL (all ones). Potentially, it is possible to use
-          // the CL_DEVICE_MAX_WORK_ITEM_SIZES property as an upper bound in
-          // this case.
-          uint64_t LocalBitMask = SAR.UVR.VU.getLocalSize() - 1;
-          LocalBitMask |= LocalBitMask >> 32;
-          LocalBitMask |= LocalBitMask >> 16;
-          LocalBitMask |= LocalBitMask >> 8;
-          LocalBitMask |= LocalBitMask >> 4;
-          LocalBitMask |= LocalBitMask >> 2;
-          LocalBitMask |= LocalBitMask >> 1;
-          BitMask = LocalBitMask;
-        }
-        return setStride(1);
+    if (const auto Builtin = BI.analyzeBuiltinCall(*CI, SAR.UVR.dimension)) {
+      switch (Builtin->uniformity) {
+        default:
+        case compiler::utils::eBuiltinUniformityMaybeInstanceID:
+        case compiler::utils::eBuiltinUniformityNever:
+          return setMayDiverge();
+        case compiler::utils::eBuiltinUniformityLikeInputs:
+          break;
+        case compiler::utils::eBuiltinUniformityAlways:
+          return setKind(eOffsetUniformVariable);
+        case compiler::utils::eBuiltinUniformityInstanceID:
+          if (Builtin->properties & compiler::utils::eBuiltinPropertyLocalID) {
+            // If the local size is unknown (represented by zero), the resulting
+            // mask will be ~0ULL (all ones). Potentially, it is possible to use
+            // the CL_DEVICE_MAX_WORK_ITEM_SIZES property as an upper bound in
+            // this case.
+            uint64_t LocalBitMask = SAR.UVR.VU.getLocalSize() - 1;
+            LocalBitMask |= LocalBitMask >> 32;
+            LocalBitMask |= LocalBitMask >> 16;
+            LocalBitMask |= LocalBitMask >> 8;
+            LocalBitMask |= LocalBitMask >> 4;
+            LocalBitMask |= LocalBitMask >> 2;
+            LocalBitMask |= LocalBitMask >> 1;
+            BitMask = LocalBitMask;
+          }
+          return setStride(1);
+      }
     }
   }
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
index 94d5527e61c47..b22e429bb1c53 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
@@ -1344,40 +1344,43 @@ bool ControlFlowConversionState::Impl::applyMaskToCall(CallInst *CI,
     return true;
   }
 
-  // Builtins without side effects do not need to be masked.
-  const auto builtin = Ctx.builtins().analyzeBuiltin(*callee);
-  const auto props = builtin.properties;
-  if (props & compiler::utils::eBuiltinPropertyNoSideEffects) {
-    LLVM_DEBUG(dbgs() << "vecz-cf: Called function is an pure builtin\n");
-    return true;
-  }
-  if (props & compiler::utils::eBuiltinPropertyWorkItem) {
-    LLVM_DEBUG(dbgs() << "vecz-cf: Called function is a workitem ID builtin\n");
-    return true;
-  }
-  if (props & compiler::utils::eBuiltinPropertyExecutionFlow) {
-    LLVM_DEBUG(
-        dbgs() << "vecz-cf: Called function is an execution flow builtin\n");
-    // Masking this kind of builtin (a barrier) is not valid.
-    return false;
-  }
   // Functions without side-effects do not need to be masked.
   if (callee->onlyReadsMemory() || callee->doesNotAccessMemory()) {
     LLVM_DEBUG(
         dbgs() << "vecz-cf: Called function does not have any side-effects\n");
     return true;
   }
-  // We don't want to mask work-group collective builtins, because they are
-  // barriers (see above). This should actually be a rare situation, as these
-  // builtins are required to be uniform/convergent and so either all
-  // work-items or no work-items should hit them. Most of the time, this
-  // situation relies on the vectorizer failing to trace the branch flow and
-  // failing to realize the conditions are in fact uniform.
-  if (auto info = Ctx.builtins().isMuxGroupCollective(builtin.ID);
-      info && info->isWorkGroupScope()) {
-    LLVM_DEBUG(
-        dbgs() << "vecz-cf: Called function is a work-group collective\n");
-    return true;
+
+  // Builtins without side effects do not need to be masked.
+  if (const auto builtin = Ctx.builtins().analyzeBuiltin(*callee)) {
+    const auto props = builtin->properties;
+    if (props & compiler::utils::eBuiltinPropertyNoSideEffects) {
+      LLVM_DEBUG(dbgs() << "vecz-cf: Called function is an pure builtin\n");
+      return true;
+    }
+    if (props & compiler::utils::eBuiltinPropertyWorkItem) {
+      LLVM_DEBUG(
+          dbgs() << "vecz-cf: Called function is a workitem ID builtin\n");
+      return true;
+    }
+    if (props & compiler::utils::eBuiltinPropertyExecutionFlow) {
+      LLVM_DEBUG(
+          dbgs() << "vecz-cf: Called function is an execution flow builtin\n");
+      // Masking this kind of builtin (a barrier) is not valid.
+      return false;
+    }
+    // We don't want to mask work-group collective builtins, because they are
+    // barriers (see above). This should actually be a rare situation, as these
+    // builtins are required to be uniform/convergent and so either all
+    // work-items or no work-items should hit them. Most of the time, this
+    // situation relies on the vectorizer failing to trace the branch flow and
+    // failing to realize the conditions are in fact uniform.
+    if (auto info = Ctx.builtins().isMuxGroupCollective(builtin->ID);
+        info && info->isWorkGroupScope()) {
+      LLVM_DEBUG(
+          dbgs() << "vecz-cf: Called function is a work-group collective\n");
+      return true;
+    }
   }
 
   // Create the new function and replace the old one with it
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/inline_post_vectorization_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/inline_post_vectorization_pass.cpp
index f83b16b03f689..f7e07793b73cb 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/inline_post_vectorization_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/inline_post_vectorization_pass.cpp
@@ -62,8 +62,8 @@ Value *processCallSite(CallInst *CI, bool &NeedLLVMInline,
   // Emit builtins inline when they have no vector/scalar equivalent.
   IRBuilder<> B(CI);
   const auto Builtin = BI.analyzeBuiltin(*Callee);
-  if (Builtin.properties &
-      compiler::utils::eBuiltinPropertyInlinePostVectorization) {
+  if (Builtin && Builtin->properties &
+                     compiler::utils::eBuiltinPropertyInlinePostVectorization) {
     const SmallVector<Value *, 4> Args(CI->args());
     if (Value *Impl = BI.emitBuiltinInline(Callee, B, Args)) {
       VECZ_ERROR_IF(
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/instantiation_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/instantiation_pass.cpp
index c8d5cae91bc33..832c76cd8f8d5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/instantiation_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/instantiation_pass.cpp
@@ -140,8 +140,9 @@ PacketRange InstantiationPass::instantiateCall(CallInst *CI) {
   // Handle special call instructions that return a lane ID.
   const compiler::utils::BuiltinInfo &BI = Ctx.builtins();
   const auto Builtin = BI.analyzeBuiltinCall(*CI, packetizer.dimension());
-  if (Builtin.properties & compiler::utils::eBuiltinPropertyWorkItem) {
-    const auto Uniformity = Builtin.uniformity;
+  if (Builtin &&
+      Builtin->properties & compiler::utils::eBuiltinPropertyWorkItem) {
+    const auto Uniformity = Builtin->uniformity;
     if (Uniformity == compiler::utils::eBuiltinUniformityNever) {
       // can't handle these (global/local linear ID probably)
       VECZ_FAIL();
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
index 7437a8b649bb8..0b599bc64b019 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -761,9 +761,15 @@ bool Packetizer::Impl::packetize() {
         continue;
       }
 
-      if (auto *const Callee = CI->getCalledFunction();
-          Callee && Ctx.builtins().analyzeBuiltin(*Callee).ID ==
-                        compiler::utils::eMuxBuiltinGetSubGroupSize) {
+      auto *const Callee = CI->getCalledFunction();
+      if (!Callee) {
+        continue;
+      }
+      auto B = Ctx.builtins().analyzeBuiltin(*Callee);
+      if (!B) {
+        continue;
+      }
+      if (B->ID == compiler::utils::eMuxBuiltinGetSubGroupSize) {
         auto *const replacement = [this](CallInst *CI) -> Value * {
           // The vectorized sub-group size is the mux sub-group reduction sum
           // of all of the vectorized sub-group sizes:
@@ -1199,14 +1205,20 @@ Packetizer::Result Packetizer::Impl::packetizeInstruction(Instruction *Ins) {
 
 Value *Packetizer::Impl::packetizeGroupReduction(Instruction *I) {
   auto *const CI = dyn_cast<CallInst>(I);
-  if (!CI || !CI->getCalledFunction()) {
+  if (!CI) {
     return nullptr;
   }
   const compiler::utils::BuiltinInfo &BI = Ctx.builtins();
   Function *callee = CI->getCalledFunction();
+  if (!callee) {
+    return nullptr;
+  }
 
   const auto Builtin = BI.analyzeBuiltin(*callee);
-  const auto Info = BI.isMuxGroupCollective(Builtin.ID);
+  if (!Builtin) {
+    return nullptr;
+  }
+  const auto Info = BI.isMuxGroupCollective(Builtin->ID);
 
   if (!Info || (!Info->isSubGroupScope() && !Info->isWorkGroupScope()) ||
       (!Info->isAnyAll() && !Info->isReduction())) {
@@ -1271,15 +1283,21 @@ Value *Packetizer::Impl::packetizeGroupReduction(Instruction *I) {
 
 Value *Packetizer::Impl::packetizeGroupBroadcast(Instruction *I) {
   auto *const CI = dyn_cast<CallInst>(I);
-  if (!CI || !CI->getCalledFunction()) {
+  if (!CI) {
     return nullptr;
   }
   const compiler::utils::BuiltinInfo &BI = Ctx.builtins();
   Function *callee = CI->getCalledFunction();
+  if (!callee) {
+    return nullptr;
+  }
   const auto Builtin = BI.analyzeBuiltin(*callee);
+  if (!Builtin) {
+    return nullptr;
+  }
 
   bool isWorkGroup = false;
-  if (auto Info = BI.isMuxGroupCollective(Builtin.ID)) {
+  if (auto Info = BI.isMuxGroupCollective(Builtin->ID)) {
     if (!Info->isBroadcast() ||
         (!Info->isSubGroupScope() && !Info->isWorkGroupScope())) {
       return nullptr;
@@ -1372,14 +1390,21 @@ Value *Packetizer::Impl::packetizeGroupBroadcast(Instruction *I) {
 std::optional<compiler::utils::GroupCollective>
 Packetizer::Impl::isSubgroupShuffleLike(Instruction *I) {
   auto *const CI = dyn_cast<CallInst>(I);
-  if (!CI || !CI->getCalledFunction()) {
+  if (!CI) {
     return std::nullopt;
   }
   const compiler::utils::BuiltinInfo &BI = Ctx.builtins();
   Function *callee = CI->getCalledFunction();
+  if (!callee) {
+    return std::nullopt;
+  }
 
   const auto Builtin = BI.analyzeBuiltin(*callee);
-  const auto Info = BI.isMuxGroupCollective(Builtin.ID);
+  if (!Builtin) {
+    return std::nullopt;
+  }
+
+  const auto Info = BI.isMuxGroupCollective(Builtin->ID);
 
   if (Info && Info->isSubGroupScope() && Info->isShuffleLike()) {
     return Info;
@@ -1560,10 +1585,13 @@ Packetizer::Result Packetizer::Impl::packetizeSubgroupShuffleXor(
       B.CreateCall(SubgroupLocalIDFn, {}, "sg.local.id");
   const auto Builtin =
       Ctx.builtins().analyzeBuiltinCall(*SubgroupLocalID, Dimension);
+  if (!Builtin) {
+    return Packetizer::Result(*this);
+  }
 
   // Vectorize the sub-group local ID
   auto *const VecSubgroupLocalID =
-      vectorizeWorkGroupCall(SubgroupLocalID, Builtin);
+      vectorizeWorkGroupCall(SubgroupLocalID, *Builtin);
   if (!VecSubgroupLocalID) {
     return Packetizer::Result(*this);
   }
@@ -1602,10 +1630,10 @@ Packetizer::Result Packetizer::Impl::packetizeSubgroupShuffleXor(
   RegularShuffle.Op = compiler::utils::GroupCollective::OpKind::Shuffle;
 
   auto RegularShuffleID = Ctx.builtins().getMuxGroupCollective(RegularShuffle);
-  assert(RegularShuffleID != compiler::utils::eBuiltinInvalid);
+  assert(RegularShuffleID);
 
   auto *const RegularShuffleFn = Ctx.builtins().getOrDeclareMuxBuiltin(
-      RegularShuffleID, *F.getParent(), {CI->getType()});
+      *RegularShuffleID, *F.getParent(), {CI->getType()});
   assert(RegularShuffleFn);
 
   auto *const VecData = PackData.getAsValue();
@@ -1740,10 +1768,13 @@ Packetizer::Result Packetizer::Impl::packetizeSubgroupShuffleUpDown(
       B.CreateCall(SubgroupLocalIDFn, {}, "sg.local.id");
   const auto Builtin =
       Ctx.builtins().analyzeBuiltinCall(*SubgroupLocalID, Dimension);
+  if (!Builtin) {
+    return Packetizer::Result(*this);
+  }
 
   // Vectorize the sub-group local ID
   auto *const VecSubgroupLocalID =
-      vectorizeWorkGroupCall(SubgroupLocalID, Builtin);
+      vectorizeWorkGroupCall(SubgroupLocalID, *Builtin);
   if (!VecSubgroupLocalID) {
     return Packetizer::Result(*this);
   }
@@ -1856,8 +1887,9 @@ Packetizer::Result Packetizer::Impl::packetizeSubgroupShuffleUpDown(
       B.CreateSub(DeltaLHS, DeltaRHS, "mux.sg.local.id.deltas");
 
   auto ShuffleID = Ctx.builtins().getMuxGroupCollective(ShuffleUpDown);
+  assert(ShuffleID);
   auto *const ShuffleFn = Ctx.builtins().getOrDeclareMuxBuiltin(
-      ShuffleID, *F.getParent(), {LHSPackVal->getType()});
+      *ShuffleID, *F.getParent(), {LHSPackVal->getType()});
   assert(ShuffleFn);
 
   SmallVector<Value *, 16> Results(VF);
@@ -2052,8 +2084,9 @@ ValuePacket Packetizer::Impl::packetizeCall(CallInst *CI) {
       return results;
     }
 
-    const auto Props = Ctx.builtins().analyzeBuiltin(*Callee).properties;
-    if (!(Props & compiler::utils::eBuiltinPropertyVectorEquivalent)) {
+    auto Builtin = Ctx.builtins().analyzeBuiltin(*Callee);
+    if (!Builtin || !(Builtin->properties &
+                      compiler::utils::eBuiltinPropertyVectorEquivalent)) {
       return results;
     }
 
@@ -2137,17 +2170,19 @@ ValuePacket Packetizer::Impl::packetizeCall(CallInst *CI) {
   const auto Builtin = Ctx.builtins().analyzeBuiltin(*Callee);
 
   // Handle scans, which defer to internal builtins.
-  if (auto Info = Ctx.builtins().isMuxGroupCollective(Builtin.ID)) {
-    if (Info->isScan()) {
-      return packetizeGroupScan(CI, *Info);
+  if (Builtin) {
+    if (auto Info = Ctx.builtins().isMuxGroupCollective(Builtin->ID)) {
+      if (Info->isScan()) {
+        return packetizeGroupScan(CI, *Info);
+      }
     }
-  }
 
-  // Handle external builtins.
-  const auto Props = Builtin.properties;
-  if (Props & compiler::utils::eBuiltinPropertyExecutionFlow ||
-      Props & compiler::utils::eBuiltinPropertyWorkItem) {
-    return results;
+    // Handle external builtins.
+    const auto Props = Builtin->properties;
+    if (Props & compiler::utils::eBuiltinPropertyExecutionFlow ||
+        Props & compiler::utils::eBuiltinPropertyWorkItem) {
+      return results;
+    }
   }
 
   auto *const ty = CI->getType();
@@ -2386,10 +2421,10 @@ ValuePacket Packetizer::Impl::packetizeGroupScan(
   ExclScan.Op = compiler::utils::GroupCollective::OpKind::ScanExclusive;
 
   auto ExclScanID = Ctx.builtins().getMuxGroupCollective(ExclScan);
-  assert(ExclScanID != compiler::utils::eBuiltinInvalid);
+  assert(ExclScanID);
 
   auto *const ExclScanFn = Ctx.builtins().getOrDeclareMuxBuiltin(
-      ExclScanID, *F.getParent(), {CI->getType()});
+      *ExclScanID, *F.getParent(), {CI->getType()});
   assert(ExclScanFn);
 
   SmallVector<Value *, 2> ExclScanOps = {Reduction};
@@ -3272,13 +3307,13 @@ Value *Packetizer::Impl::vectorizeCall(CallInst *CI) {
 
   // Handle external builtins.
   const compiler::utils::BuiltinInfo &BI = Ctx.builtins();
-  const auto Builtin = BI.analyzeBuiltinCall(*CI, Dimension);
-
-  if (Builtin.properties & compiler::utils::eBuiltinPropertyExecutionFlow) {
-    return nullptr;
-  }
-  if (Builtin.properties & compiler::utils::eBuiltinPropertyWorkItem) {
-    return vectorizeWorkGroupCall(CI, Builtin);
+  if (const auto Builtin = BI.analyzeBuiltinCall(*CI, Dimension)) {
+    if (Builtin->properties & compiler::utils::eBuiltinPropertyExecutionFlow) {
+      return nullptr;
+    }
+    if (Builtin->properties & compiler::utils::eBuiltinPropertyWorkItem) {
+      return vectorizeWorkGroupCall(CI, *Builtin);
+    }
   }
 
   // Try to find a unit for this builtin.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
index 7246d0cd81fec..a70b822601043 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
@@ -228,9 +228,10 @@ Value *Scalarizer::scalarizeOperands(Instruction *I) {
     if (!Callee->isIntrinsic()) {
       // Check if this is indeed a printf call
       const compiler::utils::BuiltinInfo &BI = Ctx.builtins();
-      const auto ID = BI.analyzeBuiltin(*Callee).ID;
-      if (ID == BI.getPrintfBuiltin()) {
-        return scalarizeOperandsPrintf(CI);
+      if (auto B = BI.analyzeBuiltin(*Callee)) {
+        if (B->ID == BI.getPrintfBuiltin()) {
+          return scalarizeOperandsPrintf(CI);
+        }
       }
     }
 
@@ -1306,11 +1307,12 @@ SimdPacket *Scalarizer::scalarizeCall(CallInst *CI, PacketMask PM) {
   }
 
   const auto Builtin = BI.analyzeBuiltin(*Callee);
-  Function *ScalarEquiv = BI.getScalarEquivalent(Builtin, F.getParent());
+  VECZ_FAIL_IF(!Builtin);
+  Function *ScalarEquiv = BI.getScalarEquivalent(*Builtin, F.getParent());
   VECZ_STAT_FAIL_IF(!ScalarEquiv, VeczScalarizeFailBuiltin);
 
   IRBuilder<> B(CI);
-  const auto Props = Builtin.properties;
+  const auto Props = Builtin->properties;
   // Ignore the mask if present
   const unsigned NumArgs = VectorCallMask ? CI->arg_size() - 1 : CI->arg_size();
   SmallVector<SimdPacket *, 4> OpPackets(NumArgs);
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
index 1fbccf2e6fbda..137c88e39f01f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
@@ -135,15 +135,20 @@ VectorizationResult &VectorizationContext::getOrCreateBuiltin(
     }
   }
 
+  auto &result = VectorizedBuiltins[&F][SimdWidth];
+
   const auto Builtin = BI.analyzeBuiltin(F);
+  if (!Builtin) {
+    ++VeczContextFailBuiltin;
+    return result;
+  }
 
   // Try to find a vector equivalent for the builtin.
   Function *const VectorCallee =
       isInternalBuiltin(&F)
           ? getInternalVectorEquivalent(&F, SimdWidth)
-          : BI.getVectorEquivalent(Builtin, SimdWidth, &Module);
+          : BI.getVectorEquivalent(*Builtin, SimdWidth, &Module);
 
-  auto &result = VectorizedBuiltins[&F][SimdWidth];
   if (!VectorCallee) {
     ++VeczContextFailBuiltin;
     return result;
@@ -152,7 +157,7 @@ VectorizationResult &VectorizationContext::getOrCreateBuiltin(
   result.func = VectorCallee;
 
   // Gather information about the function's arguments.
-  const auto Props = Builtin.properties;
+  const auto Props = Builtin->properties;
   unsigned i = 0;
   for (const Argument &Arg : F.args()) {
     Type *pointerRetPointeeTy = nullptr;
@@ -182,8 +187,10 @@ VectorizationResult VectorizationContext::getVectorizedFunction(
 
   auto simdWidth = factor.getFixedValue();
   if (auto *vecTy = dyn_cast<FixedVectorType>(callee.getReturnType())) {
-    const auto Builtin = BI.analyzeBuiltin(callee);
-    Function *scalarEquiv = builtins().getScalarEquivalent(Builtin, &Module);
+    Function *scalarEquiv = nullptr;
+    if (const auto Builtin = BI.analyzeBuiltin(callee)) {
+      scalarEquiv = builtins().getScalarEquivalent(*Builtin, &Module);
+    }
     if (!scalarEquiv) {
       ++VeczContextFailScalarizeCall;
       return VectorizationResult();
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_heuristics.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_heuristics.cpp
index 5859fbfff697d..4a5a270eca2fe 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_heuristics.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_heuristics.cpp
@@ -209,10 +209,12 @@ const Value *Heuristics::shouldVectorizeVisitCmpOperand(
   if (const CallInst *CI = dyn_cast<const CallInst>(Val)) {
     // We only care if the CallInst does involve a call to a work-item builtin.
     const compiler::utils::BuiltinInfo &BI = Ctx.builtins();
-    const auto Uniformity = BI.analyzeBuiltinCall(*CI, SimdDimIdx).uniformity;
-    if (Uniformity == compiler::utils::eBuiltinUniformityInstanceID ||
-        Uniformity == compiler::utils::eBuiltinUniformityMaybeInstanceID) {
-      return (Cache[Val] = CI);
+    if (auto B = BI.analyzeBuiltinCall(*CI, SimdDimIdx)) {
+      const auto Uniformity = B->uniformity;
+      if (Uniformity == compiler::utils::eBuiltinUniformityInstanceID ||
+          Uniformity == compiler::utils::eBuiltinUniformityMaybeInstanceID) {
+        return (Cache[Val] = CI);
+      }
     }
   }
 
@@ -296,8 +298,8 @@ bool Heuristics::shouldVectorize() {
         const compiler::utils::BuiltinInfo &BI = Ctx.builtins();
         if (Function *Callee = CI->getCalledFunction()) {
           const auto builtin = BI.analyzeBuiltin(*Callee);
-          if (!(builtin.properties &
-                compiler::utils::eBuiltinPropertyWorkItem)) {
+          if (!builtin || !(builtin->properties &
+                            compiler::utils::eBuiltinPropertyWorkItem)) {
             weight++;
           }
         }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_def.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_def.ll
new file mode 100644
index 0000000000000..db2b534f0069b
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_def.ll
@@ -0,0 +1,43 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k test -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+@.str = private unnamed_addr addrspace(2) constant [4 x i8] c"%p\0A\00", align 1
+
+define spir_kernel void @test() {
+entry:
+  %gid = call spir_func i64 @__mux_get_global_id(i32 0)
+  %printf = call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @.str, i64 %gid)
+  ret void
+}
+
+declare spir_func i64 @__mux_get_global_id(i32)
+
+define spir_func i32 @printf(ptr, ...) {
+  ret i32 0
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_test(
+; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @.str, i64
+; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @.str, i64
+; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @.str, i64
+; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @.str, i64
+; CHECK: ret void

From 4869bb45846226761ba44de34f518df8c15a38a7 Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Fri, 25 Apr 2025 10:50:13 +0100
Subject: [PATCH 154/182] [NFC] Fix comments.

getPtrTy()'s parameter is called AddrSpace, not AddressSpace. Adjust
comments accordingly.
---
 .../compiler_pipeline/source/cl_builtin_info.cpp                | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp
index 69042752fd7a7..c74805251fbba 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp
@@ -2643,7 +2643,7 @@ Value *CLBuiltinInfo::emitBuiltinInlinePrintf(BuiltinID, IRBuilder<> &B,
   // Declare printf if needed.
   Function *Printf = M.getFunction("printf");
   if (!Printf) {
-    PointerType *PtrTy = B.getPtrTy(/*AddressSpace=*/0);
+    PointerType *PtrTy = B.getPtrTy(/*AddrSpace=*/0);
     FunctionType *PrintfTy = FunctionType::get(B.getInt32Ty(), {PtrTy}, true);
     Printf =
         Function::Create(PrintfTy, GlobalValue::ExternalLinkage, "printf", &M);

From 61d55e803e4eada5ca6b0fea856a5a219cf45e68 Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Wed, 30 Apr 2025 09:07:00 +0100
Subject: [PATCH 155/182] [LLVM 21] Update for CreateTargetInfo API change.

CreateTargetInfo no longer receives a shared_ptr, it receives a
reference instead.
---
 .../include/multi_llvm/targetinfo.h           | 58 +++++++++++++++++++
 1 file changed, 58 insertions(+)
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/targetinfo.h

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/targetinfo.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/targetinfo.h
new file mode 100644
index 0000000000000..3b936883b4b7d
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/targetinfo.h
@@ -0,0 +1,58 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef MULTI_LLVM_TARGET_TARGETINFO_H_INCLUDED
+#define MULTI_LLVM_TARGET_TARGETINFO_H_INCLUDED
+
+#include <clang/Basic/TargetInfo.h>
+#include <multi_llvm/llvm_version.h>
+
+namespace multi_llvm {
+
+namespace detail {
+
+#if LLVM_VERSION_GREATER_EQUAL(21, 0)
+
+template <typename TargetInfo = clang::TargetInfo>
+auto createTargetInfo(clang::DiagnosticsEngine &Diags,
+                      clang::TargetOptions &Opts)
+    -> decltype(TargetInfo::CreateTargetInfo(Diags, Opts)) {
+  return TargetInfo::CreateTargetInfo(Diags, Opts);
+}
+
+#endif
+
+template <typename TargetInfo = clang::TargetInfo>
+auto createTargetInfo(clang::DiagnosticsEngine &Diags,
+                      clang::TargetOptions &Opts)
+    -> decltype(TargetInfo::CreateTargetInfo(
+        Diags, std::make_shared<clang::TargetOptions>(Opts))) {
+  return TargetInfo::CreateTargetInfo(
+      Diags, std::make_shared<clang::TargetOptions>(Opts));
+}
+
+}  // namespace detail
+
+struct TargetInfo {
+  static clang::TargetInfo *CreateTargetInfo(clang::DiagnosticsEngine &Diags,
+                                             clang::TargetOptions &Opts) {
+    return multi_llvm::detail::createTargetInfo(Diags, Opts);
+  }
+};
+
+}  // namespace multi_llvm
+
+#endif  // MULTI_LLVM_TARGET_TARGETINFO_H_INCLUDED

From 6099cc0f63fe81b95cd27c4ab22a341967971cc7 Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Fri, 2 May 2025 01:23:04 +0100
Subject: [PATCH 156/182] [LLVM 21] Add FMaximum, FMinimum.

LLVM 21 adds FMaximum and FMinimum to AtomicRMWInst::BinOp. Add support
for them.
---
 .../include/multi_llvm/instructions.h         |  60 +++++++++
 .../vecz/source/vectorization_context.cpp     | 123 ++++++++++--------
 2 files changed, 131 insertions(+), 52 deletions(-)
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.h

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.h
new file mode 100644
index 0000000000000..7b9b93ba17b7e
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.h
@@ -0,0 +1,60 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef MULTI_LLVM_MULTI_INSTRUCTIONS_H_INCLUDED
+#define MULTI_LLVM_MULTI_INSTRUCTIONS_H_INCLUDED
+
+#include <llvm/IR/Instructions.h>
+#include <multi_llvm/llvm_version.h>
+
+namespace multi_llvm {
+
+namespace detail {
+
+template <typename Base = llvm::AtomicRMWInst, typename = void>
+struct AtomicRMWInst : Base {};
+
+#if LLVM_VERSION_LESS(20, 0)
+template <typename Base>
+struct AtomicRMWInst<
+    Base, std::enable_if_t<Base::LAST_BINOP - Base::FIRST_BINOP == 16>>
+    : llvm::AtomicRMWInst {
+  static constexpr BinOp USubCond = static_cast<BinOp>(BAD_BINOP + 1);
+  static constexpr BinOp USubSat = static_cast<BinOp>(BAD_BINOP + 2);
+  static constexpr BinOp FMaximum = static_cast<BinOp>(BAD_BINOP + 3);
+  static constexpr BinOp FMinimum = static_cast<BinOp>(BAD_BINOP + 4);
+};
+#endif
+
+// #if LLVM_VERSION_LESS(21, 0)
+// This is enabled for now on LLVM 21 as well to allow building against older
+// LLVM 21 snapshots.
+template <typename Base>
+struct AtomicRMWInst<
+    Base, std::enable_if_t<Base::LAST_BINOP - Base::FIRST_BINOP == 18>>
+    : llvm::AtomicRMWInst {
+  static constexpr BinOp FMaximum = static_cast<BinOp>(BAD_BINOP + 1);
+  static constexpr BinOp FMinimum = static_cast<BinOp>(BAD_BINOP + 2);
+};
+// #endif
+
+}  // namespace detail
+
+struct AtomicRMWInst : detail::AtomicRMWInst<> {};
+
+}  // namespace multi_llvm
+
+#endif  // MULTI_LLVM_MULTI_INSTRUCTIONS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
index 137c88e39f01f..a6029bb830bd3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
@@ -28,6 +28,7 @@
 #include <llvm/IR/Instructions.h>
 #include <llvm/Support/AtomicOrdering.h>
 #include <llvm/Target/TargetMachine.h>
+#include <multi_llvm/instructions.h>
 #include <multi_llvm/multi_llvm.h>
 #include <multi_llvm/vector_type_helper.h>
 
@@ -415,53 +416,53 @@ VectorizationContext::isMaskedAtomicFunction(const Function &F) const {
   }
   AtomicInfo.IsVolatile = FnName.consume_front("volatile_");
 
-  if (IsCmpXchg) {
-    AtomicInfo.BinOp = AtomicRMWInst::BinOp::BAD_BINOP;
-  } else {
-    if (FnName.consume_front("xchg")) {
-      AtomicInfo.BinOp = AtomicRMWInst::BinOp::Xchg;
-    } else if (FnName.consume_front("add")) {
-      AtomicInfo.BinOp = AtomicRMWInst::BinOp::Add;
-    } else if (FnName.consume_front("sub")) {
-      AtomicInfo.BinOp = AtomicRMWInst::BinOp::Sub;
-    } else if (FnName.consume_front("and")) {
-      AtomicInfo.BinOp = AtomicRMWInst::BinOp::And;
-    } else if (FnName.consume_front("nand")) {
-      AtomicInfo.BinOp = AtomicRMWInst::BinOp::Nand;
-    } else if (FnName.consume_front("or")) {
-      AtomicInfo.BinOp = AtomicRMWInst::BinOp::Or;
-    } else if (FnName.consume_front("xor")) {
-      AtomicInfo.BinOp = AtomicRMWInst::BinOp::Xor;
-    } else if (FnName.consume_front("max")) {
-      AtomicInfo.BinOp = AtomicRMWInst::BinOp::Max;
-    } else if (FnName.consume_front("min")) {
-      AtomicInfo.BinOp = AtomicRMWInst::BinOp::Min;
-    } else if (FnName.consume_front("umax")) {
-      AtomicInfo.BinOp = AtomicRMWInst::BinOp::UMax;
-    } else if (FnName.consume_front("umin")) {
-      AtomicInfo.BinOp = AtomicRMWInst::BinOp::UMin;
-    } else if (FnName.consume_front("fadd")) {
-      AtomicInfo.BinOp = AtomicRMWInst::BinOp::FAdd;
-    } else if (FnName.consume_front("fsub")) {
-      AtomicInfo.BinOp = AtomicRMWInst::BinOp::FSub;
-    } else if (FnName.consume_front("fmax")) {
-      AtomicInfo.BinOp = AtomicRMWInst::BinOp::FMax;
-    } else if (FnName.consume_front("fmin")) {
-      AtomicInfo.BinOp = AtomicRMWInst::BinOp::FMin;
-    } else if (FnName.consume_front("uincwrap")) {
-      AtomicInfo.BinOp = AtomicRMWInst::BinOp::UIncWrap;
-    } else if (FnName.consume_front("udecwrap")) {
-      AtomicInfo.BinOp = AtomicRMWInst::BinOp::UDecWrap;
-#if LLVM_VERSION_GREATER_EQUAL(20, 0)
-    } else if (FnName.consume_front("usubcond")) {
-      AtomicInfo.BinOp = AtomicRMWInst::BinOp::USubCond;
-    } else if (FnName.consume_front("usubsat")) {
-      AtomicInfo.BinOp = AtomicRMWInst::BinOp::USubSat;
-#endif
-    } else {
-      return std::nullopt;
+  AtomicInfo.BinOp = AtomicRMWInst::BinOp::BAD_BINOP;
+
+  if (!IsCmpXchg) {
+    if (FnName.consume_front("xchg_")) {
+      AtomicInfo.BinOp = multi_llvm::AtomicRMWInst::Xchg;
+    } else if (FnName.consume_front("add_")) {
+      AtomicInfo.BinOp = multi_llvm::AtomicRMWInst::Add;
+    } else if (FnName.consume_front("sub_")) {
+      AtomicInfo.BinOp = multi_llvm::AtomicRMWInst::Sub;
+    } else if (FnName.consume_front("and_")) {
+      AtomicInfo.BinOp = multi_llvm::AtomicRMWInst::And;
+    } else if (FnName.consume_front("nand_")) {
+      AtomicInfo.BinOp = multi_llvm::AtomicRMWInst::Nand;
+    } else if (FnName.consume_front("or_")) {
+      AtomicInfo.BinOp = multi_llvm::AtomicRMWInst::Or;
+    } else if (FnName.consume_front("xor_")) {
+      AtomicInfo.BinOp = multi_llvm::AtomicRMWInst::Xor;
+    } else if (FnName.consume_front("max_")) {
+      AtomicInfo.BinOp = multi_llvm::AtomicRMWInst::Max;
+    } else if (FnName.consume_front("min_")) {
+      AtomicInfo.BinOp = multi_llvm::AtomicRMWInst::Min;
+    } else if (FnName.consume_front("umax_")) {
+      AtomicInfo.BinOp = multi_llvm::AtomicRMWInst::UMax;
+    } else if (FnName.consume_front("umin_")) {
+      AtomicInfo.BinOp = multi_llvm::AtomicRMWInst::UMin;
+    } else if (FnName.consume_front("fadd_")) {
+      AtomicInfo.BinOp = multi_llvm::AtomicRMWInst::FAdd;
+    } else if (FnName.consume_front("fsub_")) {
+      AtomicInfo.BinOp = multi_llvm::AtomicRMWInst::FSub;
+    } else if (FnName.consume_front("fmax_")) {
+      AtomicInfo.BinOp = multi_llvm::AtomicRMWInst::FMax;
+    } else if (FnName.consume_front("fmin_")) {
+      AtomicInfo.BinOp = multi_llvm::AtomicRMWInst::FMin;
+    } else if (FnName.consume_front("fmaximum_")) {
+      AtomicInfo.BinOp = multi_llvm::AtomicRMWInst::FMaximum;
+    } else if (FnName.consume_front("fminimum_")) {
+      AtomicInfo.BinOp = multi_llvm::AtomicRMWInst::FMinimum;
+    } else if (FnName.consume_front("uincwrap_")) {
+      AtomicInfo.BinOp = multi_llvm::AtomicRMWInst::UIncWrap;
+    } else if (FnName.consume_front("udecwrap_")) {
+      AtomicInfo.BinOp = multi_llvm::AtomicRMWInst::UDecWrap;
+    } else if (FnName.consume_front("usubcond_")) {
+      AtomicInfo.BinOp = multi_llvm::AtomicRMWInst::USubCond;
+    } else if (FnName.consume_front("usubsat_")) {
+      AtomicInfo.BinOp = multi_llvm::AtomicRMWInst::USubSat;
     }
-    if (!FnName.consume_front("_")) {
+    if (AtomicInfo.BinOp >= multi_llvm::AtomicRMWInst::BAD_BINOP) {
       return std::nullopt;
     }
   }
@@ -574,11 +575,21 @@ Function *VectorizationContext::getOrCreateMaskedAtomicFunction(
   }
 
   if (!isCmpXchg) {
-#define BINOP_CASE(BINOP, STR) \
-  case AtomicRMWInst::BINOP:   \
-    O << (STR);                \
+#define BINOP_CASE(BINOP, STR)           \
+  case multi_llvm::AtomicRMWInst::BINOP: \
+    O << (STR);                          \
     break
 
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wswitch"
+#endif
+
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4063)
+#endif
+
     switch (I.BinOp) {
       BINOP_CASE(Xchg, "xchg");
       BINOP_CASE(Add, "add");
@@ -595,16 +606,24 @@ Function *VectorizationContext::getOrCreateMaskedAtomicFunction(
       BINOP_CASE(FSub, "fsub");
       BINOP_CASE(FMax, "fmax");
       BINOP_CASE(FMin, "fmin");
+      BINOP_CASE(FMaximum, "fmaximum");
+      BINOP_CASE(FMinimum, "fminumum");
       BINOP_CASE(UIncWrap, "uincwrap");
       BINOP_CASE(UDecWrap, "udecwrap");
-#if LLVM_VERSION_GREATER_EQUAL(20, 0)
       BINOP_CASE(USubCond, "usubcond");
       BINOP_CASE(USubSat, "usubsat");
-#endif
-      case llvm::AtomicRMWInst::BAD_BINOP:
+      case multi_llvm::AtomicRMWInst::BAD_BINOP:
         return nullptr;
     }
 
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
 #undef BINOP_CASE
     O << "_";
   }

From c5ce5ce6457395bb14f024a0596c2584d6942b2e Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Fri, 2 May 2025 13:43:25 +0100
Subject: [PATCH 157/182] [NFC] Address LLVM 21's BinOp additions in a better
 way.

The earlier change to restore compatibility with LLVM 21 required
suppressing compiler warnings and made it so that we would not be
alerted when we need to account for future BinOp additions.

This change moves the complexity of dealing with the different LLVM
versions from vectorization_context.cpp into multi_llvm, and handles it
in a way that does not require suppressing compiler warnings.
---
 .../include/multi_llvm/instructions.h         | 51 +++++-----
 .../include/multi_llvm/instructions.inc       | 84 ++++++++++++++++
 .../vecz/source/vectorization_context.cpp     | 99 +------------------
 3 files changed, 112 insertions(+), 122 deletions(-)
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.inc

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.h
index 7b9b93ba17b7e..a8506921affd8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.h
@@ -24,36 +24,33 @@ namespace multi_llvm {
 
 namespace detail {
 
-template <typename Base = llvm::AtomicRMWInst, typename = void>
-struct AtomicRMWInst : Base {};
-
-#if LLVM_VERSION_LESS(20, 0)
-template <typename Base>
-struct AtomicRMWInst<
-    Base, std::enable_if_t<Base::LAST_BINOP - Base::FIRST_BINOP == 16>>
-    : llvm::AtomicRMWInst {
-  static constexpr BinOp USubCond = static_cast<BinOp>(BAD_BINOP + 1);
-  static constexpr BinOp USubSat = static_cast<BinOp>(BAD_BINOP + 2);
-  static constexpr BinOp FMaximum = static_cast<BinOp>(BAD_BINOP + 3);
-  static constexpr BinOp FMinimum = static_cast<BinOp>(BAD_BINOP + 4);
-};
-#endif
-
-// #if LLVM_VERSION_LESS(21, 0)
-// This is enabled for now on LLVM 21 as well to allow building against older
-// LLVM 21 snapshots.
-template <typename Base>
-struct AtomicRMWInst<
-    Base, std::enable_if_t<Base::LAST_BINOP - Base::FIRST_BINOP == 18>>
-    : llvm::AtomicRMWInst {
-  static constexpr BinOp FMaximum = static_cast<BinOp>(BAD_BINOP + 1);
-  static constexpr BinOp FMinimum = static_cast<BinOp>(BAD_BINOP + 2);
-};
-// #endif
+template <typename T = llvm::AtomicRMWInst::BinOp, typename = void>
+struct BinOpHelper;
+
+// TODO Make this entirely version-based once we no longer have to account for
+// older LLVM 21 snapshots that use the LLVM 20 definition of
+// llvm::AtomicRMWInst::BinOp.
+#define LLVM 21
+#include <multi_llvm/instructions.inc>
+#undef LLVM
+#define LLVM 20
+#include <multi_llvm/instructions.inc>
+#undef LLVM
+#define LLVM 19
+#include <multi_llvm/instructions.inc>
+#undef LLVM
 
 }  // namespace detail
 
-struct AtomicRMWInst : detail::AtomicRMWInst<> {};
+static std::optional<llvm::AtomicRMWInst::BinOp> consume_binop_with_underscore(
+    llvm::StringRef &String) {
+  return multi_llvm::detail::BinOpHelper<>::consume_front_with_underscore(
+      String);
+}
+
+static llvm::StringRef to_string(llvm::AtomicRMWInst::BinOp BinOp) {
+  return multi_llvm::detail::BinOpHelper<>::to_string(BinOp);
+}
 
 }  // namespace multi_llvm
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.inc b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.inc
new file mode 100644
index 0000000000000..12221a798d931
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.inc
@@ -0,0 +1,84 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#if LLVM == 19
+template <typename T>
+struct BinOpHelper<T, std::enable_if_t<T::LAST_BINOP - T::FIRST_BINOP == 16>>
+#define BINOP_LLVM20(OP, STR)
+#define BINOP_LLVM21(OP, STR)
+#elif LLVM == 20
+template <typename T>
+struct BinOpHelper<T, std::enable_if_t<T::LAST_BINOP - T::FIRST_BINOP == 18>>
+#define BINOP_LLVM20(OP, STR) BINOP(OP, STR)
+#define BINOP_LLVM21(OP, STR)
+#elif LLVM == 21
+template <typename T, typename>
+struct BinOpHelper
+#define BINOP_LLVM20(OP, STR) BINOP(OP, STR)
+#define BINOP_LLVM21(OP, STR) BINOP(OP, STR)
+#endif
+{
+#define BINOPS()                     \
+  BINOP(Xchg, "xchg")                \
+  BINOP(Add, "add")                  \
+  BINOP(Sub, "sub")                  \
+  BINOP(And, "and")                  \
+  BINOP(Nand, "nand")                \
+  BINOP(Or, "or")                    \
+  BINOP(Xor, "xor")                  \
+  BINOP(Max, "max")                  \
+  BINOP(Min, "min")                  \
+  BINOP(UMax, "umax")                \
+  BINOP(UMin, "umin")                \
+  BINOP(FAdd, "fadd")                \
+  BINOP(FSub, "fsub")                \
+  BINOP(FMax, "fmax")                \
+  BINOP(FMin, "fmin")                \
+  BINOP_LLVM21(FMaximum, "fmaximum") \
+  BINOP_LLVM21(FMinimum, "fminimum") \
+  BINOP(UIncWrap, "uincwrap")        \
+  BINOP(UDecWrap, "udecwrap")        \
+  BINOP_LLVM20(USubCond, "usubcond") \
+  BINOP_LLVM20(USubSat, "usubsat")
+
+  static std::optional<T> consume_front_with_underscore(
+      llvm::StringRef &String) {
+#define BINOP(BINOP, STR)              \
+  if (String.consume_front(STR "_")) { \
+    return T::BINOP;                   \
+  }
+    BINOPS()
+#undef BINOP
+    return std::nullopt;
+  }
+
+  static llvm::StringRef to_string(T BinOp) {
+    switch (BinOp) {
+#define BINOP(BINOP, STR) \
+  case T::BINOP:          \
+    return STR;
+      BINOPS()
+#undef BINOP
+      case T::BAD_BINOP:
+        break;
+    }
+    llvm_unreachable("Unexpected BinOp");
+  }
+
+#undef BINOPS
+#undef BINOP_LLVM20
+#undef BINOP_LLVM21
+};
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
index a6029bb830bd3..7dd0b5422c30d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
@@ -419,50 +419,9 @@ VectorizationContext::isMaskedAtomicFunction(const Function &F) const {
   AtomicInfo.BinOp = AtomicRMWInst::BinOp::BAD_BINOP;
 
   if (!IsCmpXchg) {
-    if (FnName.consume_front("xchg_")) {
-      AtomicInfo.BinOp = multi_llvm::AtomicRMWInst::Xchg;
-    } else if (FnName.consume_front("add_")) {
-      AtomicInfo.BinOp = multi_llvm::AtomicRMWInst::Add;
-    } else if (FnName.consume_front("sub_")) {
-      AtomicInfo.BinOp = multi_llvm::AtomicRMWInst::Sub;
-    } else if (FnName.consume_front("and_")) {
-      AtomicInfo.BinOp = multi_llvm::AtomicRMWInst::And;
-    } else if (FnName.consume_front("nand_")) {
-      AtomicInfo.BinOp = multi_llvm::AtomicRMWInst::Nand;
-    } else if (FnName.consume_front("or_")) {
-      AtomicInfo.BinOp = multi_llvm::AtomicRMWInst::Or;
-    } else if (FnName.consume_front("xor_")) {
-      AtomicInfo.BinOp = multi_llvm::AtomicRMWInst::Xor;
-    } else if (FnName.consume_front("max_")) {
-      AtomicInfo.BinOp = multi_llvm::AtomicRMWInst::Max;
-    } else if (FnName.consume_front("min_")) {
-      AtomicInfo.BinOp = multi_llvm::AtomicRMWInst::Min;
-    } else if (FnName.consume_front("umax_")) {
-      AtomicInfo.BinOp = multi_llvm::AtomicRMWInst::UMax;
-    } else if (FnName.consume_front("umin_")) {
-      AtomicInfo.BinOp = multi_llvm::AtomicRMWInst::UMin;
-    } else if (FnName.consume_front("fadd_")) {
-      AtomicInfo.BinOp = multi_llvm::AtomicRMWInst::FAdd;
-    } else if (FnName.consume_front("fsub_")) {
-      AtomicInfo.BinOp = multi_llvm::AtomicRMWInst::FSub;
-    } else if (FnName.consume_front("fmax_")) {
-      AtomicInfo.BinOp = multi_llvm::AtomicRMWInst::FMax;
-    } else if (FnName.consume_front("fmin_")) {
-      AtomicInfo.BinOp = multi_llvm::AtomicRMWInst::FMin;
-    } else if (FnName.consume_front("fmaximum_")) {
-      AtomicInfo.BinOp = multi_llvm::AtomicRMWInst::FMaximum;
-    } else if (FnName.consume_front("fminimum_")) {
-      AtomicInfo.BinOp = multi_llvm::AtomicRMWInst::FMinimum;
-    } else if (FnName.consume_front("uincwrap_")) {
-      AtomicInfo.BinOp = multi_llvm::AtomicRMWInst::UIncWrap;
-    } else if (FnName.consume_front("udecwrap_")) {
-      AtomicInfo.BinOp = multi_llvm::AtomicRMWInst::UDecWrap;
-    } else if (FnName.consume_front("usubcond_")) {
-      AtomicInfo.BinOp = multi_llvm::AtomicRMWInst::USubCond;
-    } else if (FnName.consume_front("usubsat_")) {
-      AtomicInfo.BinOp = multi_llvm::AtomicRMWInst::USubSat;
-    }
-    if (AtomicInfo.BinOp >= multi_llvm::AtomicRMWInst::BAD_BINOP) {
+    if (auto BinOp = multi_llvm::consume_binop_with_underscore(FnName)) {
+      AtomicInfo.BinOp = *BinOp;
+    } else {
       return std::nullopt;
     }
   }
@@ -575,57 +534,7 @@ Function *VectorizationContext::getOrCreateMaskedAtomicFunction(
   }
 
   if (!isCmpXchg) {
-#define BINOP_CASE(BINOP, STR)           \
-  case multi_llvm::AtomicRMWInst::BINOP: \
-    O << (STR);                          \
-    break
-
-#ifdef __GNUC__
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wswitch"
-#endif
-
-#ifdef _MSC_VER
-#pragma warning(push)
-#pragma warning(disable : 4063)
-#endif
-
-    switch (I.BinOp) {
-      BINOP_CASE(Xchg, "xchg");
-      BINOP_CASE(Add, "add");
-      BINOP_CASE(Sub, "sub");
-      BINOP_CASE(And, "and");
-      BINOP_CASE(Nand, "nand");
-      BINOP_CASE(Or, "or");
-      BINOP_CASE(Xor, "xor");
-      BINOP_CASE(Max, "max");
-      BINOP_CASE(Min, "min");
-      BINOP_CASE(UMax, "umax");
-      BINOP_CASE(UMin, "umin");
-      BINOP_CASE(FAdd, "fadd");
-      BINOP_CASE(FSub, "fsub");
-      BINOP_CASE(FMax, "fmax");
-      BINOP_CASE(FMin, "fmin");
-      BINOP_CASE(FMaximum, "fmaximum");
-      BINOP_CASE(FMinimum, "fminumum");
-      BINOP_CASE(UIncWrap, "uincwrap");
-      BINOP_CASE(UDecWrap, "udecwrap");
-      BINOP_CASE(USubCond, "usubcond");
-      BINOP_CASE(USubSat, "usubsat");
-      case multi_llvm::AtomicRMWInst::BAD_BINOP:
-        return nullptr;
-    }
-
-#ifdef __GNUC__
-#pragma GCC diagnostic pop
-#endif
-
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
-
-#undef BINOP_CASE
-    O << "_";
+    O << multi_llvm::to_string(I.BinOp) << "_";
   }
 
   O << "align" << I.Align.value() << "_";

From f2e2aa419534c30e7cd0227245359700d94c4b12 Mon Sep 17 00:00:00 2001
From: Colin Davidson <colin.davidson@codeplay.com>
Date: Fri, 17 Jan 2025 12:25:05 +0000
Subject: [PATCH 158/182] Removal of Vulkan API support

This removes Vulkan API support from the construction kit, including
all testing and documentation - see
https://github.com/uxlfoundation/oneapi-construction-kit/blob/rfcs/rfc-0003.md
for more information.

Note that some code which is only being used by Vulkan remains after this
commit. This will be reviewed in a later commit.

Note that the next release should increase the version to 5.0.0 after
this breaking change.
---
 .../compiler_pipeline/source/pass_functions.cpp                | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp
index 558fd4ef40cb1..0252f527ee063 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp
@@ -60,8 +60,7 @@ uint64_t computeApproximatePrivateMemoryUsage(const llvm::Function &fn) {
     if (alloca_inst.isArrayAllocation()) {
       auto *arr_size_val = alloca_inst.getArraySize();
       auto *const_int = llvm::dyn_cast<llvm::ConstantInt>(arr_size_val);
-      assert(const_int != nullptr &&
-             "OpenCL or Vulkan Array Allocation of dynamic size");
+      assert(const_int != nullptr && "Array Allocation of dynamic size");
       const uint64_t arr_size = const_int->getUniqueInteger().getLimitedValue();
       bytes += arr_size * alloc_size;
 

From 71d18ce4ee52d6a5ea3292fc0a57b8da02ae0a53 Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Wed, 21 May 2025 17:01:14 +0100
Subject: [PATCH 159/182] [NFC] Remove Abacus extras.

abacus_extra contained functions that were only used for Vulkan. Since
we removed the Vulkan API, this became dead code.
---
 .../source/cl_builtin_info.cpp                | 36 -------------------
 1 file changed, 36 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp
index c74805251fbba..27e8fe7541f1e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp
@@ -406,21 +406,6 @@ enum CLBuiltinID : compiler::utils::BuiltinID {
   /// @brief OpenCL builtin 'sub_group_scan_exclusive_logical_xor'.
   eCLBuiltinSubgroupScanLogicalXorExclusive,
 
-  // GLSL builtin functions
-  eCLBuiltinCodeplayFindLSB,
-  eCLBuiltinCodeplayFindMSB,
-  eCLBuiltinCodeplayBitReverse,
-  eCLBuiltinCodeplayFaceForward,
-  eCLBuiltinCodeplayReflect,
-  eCLBuiltinCodeplayRefract,
-  eCLBuiltinCodeplayPackNormalizeChar4,
-  eCLBuiltinCodeplayPackNormalizeUchar4,
-  eCLBuiltinCodeplayPackNormalizeShort2,
-  eCLBuiltinCodeplayPackNormalizeUshort2,
-  eCLBuiltinCodeplayPackHalf2,
-  eCLBuiltinCodeplayUnpackNormalize,
-  eCLBuiltinCodeplayUnpackHalf2,
-
   // 6.12.7 Vector Data Load and Store Functions
   eCLBuiltinVLoad,
   eCLBuiltinVLoadHalf,
@@ -783,21 +768,6 @@ static constexpr CLBuiltinEntry Builtins[] = {
     {eCLBuiltinSubgroupScanLogicalXorExclusive,
      "sub_group_scan_exclusive_logical_xor", OpenCLC30},
 
-    // GLSL builtin functions
-    {eCLBuiltinCodeplayFaceForward, "codeplay_face_forward"},
-    {eCLBuiltinCodeplayReflect, "codeplay_reflect"},
-    {eCLBuiltinCodeplayRefract, "codeplay_refract"},
-    {eCLBuiltinCodeplayFindLSB, "codeplay_pack_find_lsb"},
-    {eCLBuiltinCodeplayFindMSB, "codeplay_pack_find_msb"},
-    {eCLBuiltinCodeplayBitReverse, "codeplay_pack_bit_reverse"},
-    {eCLBuiltinCodeplayPackNormalizeChar4, "codeplay_pack_normalize_char4"},
-    {eCLBuiltinCodeplayPackNormalizeUchar4, "codeplay_pack_normalize_uchar4"},
-    {eCLBuiltinCodeplayPackNormalizeShort2, "codeplay_pack_normalize_short2"},
-    {eCLBuiltinCodeplayPackNormalizeUshort2, "codeplay_pack_normalize_ushort2"},
-    {eCLBuiltinCodeplayPackHalf2, "codeplay_pack_half2"},
-    {eCLBuiltinCodeplayUnpackNormalize, "codeplay_unpack_normalize"},
-    {eCLBuiltinCodeplayUnpackHalf2, "codeplay_unpack_half2"},
-
     {eBuiltinUnknown, nullptr}};
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -1134,12 +1104,6 @@ std::optional<Builtin> CLBuiltinInfo::analyzeBuiltin(
     case eCLBuiltinSubSat:
       Properties |= eBuiltinPropertyCanEmitInline;
       break;
-    case eCLBuiltinCodeplayFaceForward:
-    case eCLBuiltinCodeplayReflect:
-    case eCLBuiltinCodeplayRefract:
-      Properties |= eBuiltinPropertyReduction;
-      Properties |= eBuiltinPropertyNoVectorEquivalent;
-      break;
     case eCLBuiltinConvertChar:
     case eCLBuiltinConvertShort:
     case eCLBuiltinConvertInt:

From 1f16d4d79cf4b32bd89353e425695630c135009b Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Wed, 4 Jun 2025 12:43:12 +0100
Subject: [PATCH 160/182] [LLVM 21] Avoid computeKnownBits.

LLVM 21 changes the API of computeKnownBits. To avoid conditional code,
use llvm::WithCache which has existed since LLVM 18 and allows us to use
the same code across all LLVM versions we support.
---
 .../compiler_passes/vecz/source/offset_info.cpp            | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp
index 9f08ebfa24969..c0ce76424d150 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp
@@ -18,6 +18,7 @@
 
 #include <compiler/utils/builtin_info.h>
 #include <llvm/Analysis/ValueTracking.h>
+#include <llvm/Analysis/WithCache.h>
 #include <llvm/IR/Constants.h>
 #include <llvm/IR/Instructions.h>
 #include <llvm/IR/Module.h>
@@ -177,8 +178,10 @@ OffsetInfo &OffsetInfo::analyze(Value *Offset, StrideAnalysisResult &SAR) {
 
   // If we have a uniform value here we don't need to analyse any further.
   if (!SAR.UVR.isVarying(Ins)) {
-    const auto &KB =
-        computeKnownBits(Ins, SAR.F.getParent()->getDataLayout(), 0, &SAR.AC);
+    SimplifyQuery SQ(SAR.F.getParent()->getDataLayout());
+    SQ.AC = &SAR.AC;
+    const WithCache<Instruction *> InsWithCache(Ins);
+    const auto &KB = InsWithCache.getKnownBits(SQ);
     const auto bitWidth = OffsetTy->getIntegerBitWidth();
 
     // We are interested in the bits that are not known to be zero.

From 76d0c25d68e8060c8b8ad1c8b1d38a72c606a8be Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Wed, 11 Jun 2025 13:49:53 +0100
Subject: [PATCH 161/182] [LLVM 21] Remove VectorizationFactor, use
 CreateElementCount.

LLVM 21 changes the IRBuilderBase::CreateVScale method to just do what
its name implies, to create a call to the @llvm.vscale intrinsic
function. In all instances where we were using it, we could have used
IRBuilder::CreateElementCount instead. To allow this to be used more
easily, this change removes compiler::utils::VectorizationFactor and
uses llvm::ElementCount instead, which does the same thing.
---
 .../include/compiler/utils/metadata.h         |  5 +--
 .../compiler/utils/work_item_loops_pass.h     |  1 -
 .../compiler_pipeline/source/metadata.cpp     |  7 +--
 .../source/work_item_loops_pass.cpp           | 43 +++++++++----------
 .../compiler_passes/vecz/include/vecz/pass.h  | 12 +++---
 .../compiler_passes/vecz/source/pass.cpp      | 11 ++---
 .../vecz/source/transform/packetizer.cpp      | 24 +++--------
 .../vecz/source/vector_target_info_riscv.cpp  |  6 +--
 .../vecz/source/vectorization_context.cpp     |  9 +---
 .../vecz/source/vectorizer.cpp                |  6 +--
 .../vecz/tools/source/veczc.cpp               | 23 ++++------
 11 files changed, 56 insertions(+), 91 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/metadata.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/metadata.h
index 493b9df6ee04a..9fc1337564cc2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/metadata.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/metadata.h
@@ -19,11 +19,10 @@
 
 #include <llvm/ADT/SmallVector.h>
 #include <llvm/IR/Metadata.h>
+#include <llvm/Support/TypeSize.h>
 
 #include <optional>
 
-#include "vectorization_factor.h"
-
 namespace llvm {
 class Function;
 class Module;
@@ -54,7 +53,7 @@ uint32_t getOpenCLVersion(const llvm::Module &m);
 /// @brief Describes the state of vectorization on a function/loop.
 struct VectorizationInfo {
   /// @brief The VectorizationFactor. A scalar value if unvectorized.
-  VectorizationFactor vf;
+  llvm::ElementCount vf;
   /// @brief The dimension along which vectorization took place.
   unsigned simdDimIdx;
   /// @brief Whether or not the function/loop was vector-predicated.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/work_item_loops_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/work_item_loops_pass.h
index b990d31ab0d99..ff2d50ed0170e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/work_item_loops_pass.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/work_item_loops_pass.h
@@ -23,7 +23,6 @@
 
 #include <compiler/utils/barrier_regions.h>
 #include <compiler/utils/metadata.h>
-#include <compiler/utils/vectorization_factor.h>
 #include <llvm/ADT/StringRef.h>
 #include <llvm/IR/IRBuilder.h>
 #include <llvm/IR/PassManager.h>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/metadata.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/metadata.cpp
index 179bf2480266e..2daae4607f9ff 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/metadata.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/metadata.cpp
@@ -48,7 +48,8 @@ static MDTuple *encodeVectorizationInfo(const VectorizationInfo &info,
 
   return MDTuple::get(
       Ctx,
-      {ConstantAsMetadata::get(ConstantInt::get(i32Ty, info.vf.getKnownMin())),
+      {ConstantAsMetadata::get(
+           ConstantInt::get(i32Ty, info.vf.getKnownMinValue())),
        ConstantAsMetadata::get(ConstantInt::get(i32Ty, info.vf.isScalable())),
        ConstantAsMetadata::get(ConstantInt::get(i32Ty, info.simdDimIdx)),
        ConstantAsMetadata::get(
@@ -66,8 +67,8 @@ static std::optional<VectorizationInfo> extractVectorizationInfo(MDTuple *md) {
 
   VectorizationInfo info;
 
-  info.vf.setKnownMin(widthMD->getZExtValue());
-  info.vf.setIsScalable(isScalableMD->equalsInt(1));
+  info.vf = llvm::ElementCount::get(widthMD->getZExtValue(),
+                                    isScalableMD->equalsInt(1));
   info.simdDimIdx = simdDimIdxMD->getZExtValue();
   info.IsVectorPredicated = isVPMD->equalsInt(1);
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
index bd3dfd8818a75..4578a27686028 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
@@ -21,7 +21,6 @@
 #include <compiler/utils/metadata.h>
 #include <compiler/utils/pass_functions.h>
 #include <compiler/utils/sub_group_analysis.h>
-#include <compiler/utils/vectorization_factor.h>
 #include <compiler/utils/work_item_loops_pass.h>
 #include <llvm/IR/DIBuilder.h>
 #include <llvm/IR/IRBuilder.h>
@@ -99,14 +98,6 @@ class BarrierWithLiveVars : public Barrier {
 
 namespace {
 
-Value *materializeVF(IRBuilder<> &builder,
-                     compiler::utils::VectorizationFactor vf) {
-  auto &m = *builder.GetInsertBlock()->getModule();
-  Constant *multiple =
-      ConstantInt::get(compiler::utils::getSizeType(m), vf.getKnownMin());
-  return !vf.isScalable() ? multiple : builder.CreateVScale(multiple);
-}
-
 struct ScheduleGenerator {
   ScheduleGenerator(Module &m,
                     const compiler::utils::BarrierWithLiveVars &barrierMain,
@@ -550,7 +541,8 @@ struct ScheduleGenerator {
         auto *const op = groupCall->getOperand(1);
 
         // Compute the address of the value in the main barrier struct
-        auto *const VF = materializeVF(ir, barrierMain.getVFInfo().vf);
+        auto *const VF = ir.CreateElementCount(
+            compiler::utils::getSizeType(module), barrierMain.getVFInfo().vf);
         auto *const liveVars = createLiveVarsPtr(barrierMain, ir, idsMain[0],
                                                  idsMain[1], idsMain[2], VF);
         compiler::utils::Barrier::LiveValuesHelper live_values(barrierMain,
@@ -695,7 +687,9 @@ struct ScheduleGenerator {
                   // preheader
                   IRBuilder<> irph(mainPreheaderBB,
                                    mainPreheaderBB->getFirstInsertionPt());
-                  auto *VF = materializeVF(irph, barrierMain.getVFInfo().vf);
+                  auto *VF = irph.CreateElementCount(
+                      compiler::utils::getSizeType(module),
+                      barrierMain.getVFInfo().vf);
 
                   compiler::utils::CreateLoopOpts inner_opts;
                   inner_opts.indexInc = VF;
@@ -1001,7 +995,9 @@ struct ScheduleGenerator {
                   // preheader
                   IRBuilder<> irph(mainPreheaderBB,
                                    mainPreheaderBB->getFirstInsertionPt());
-                  auto *VF = materializeVF(irph, barrierMain.getVFInfo().vf);
+                  auto *VF = irph.CreateElementCount(
+                      compiler::utils::getSizeType(module),
+                      barrierMain.getVFInfo().vf);
 
                   compiler::utils::CreateLoopOpts inner_vf_opts;
                   inner_vf_opts.indexInc = VF;
@@ -1305,7 +1301,7 @@ void setUpLiveVarsAlloca(compiler::utils::BarrierWithLiveVars &barrier,
     const auto fixedSize = barrier.getLiveVarMemSizeFixed();
     // We ensure that the VFs are the same between the main and tail.
     auto *const vscale =
-        B.CreateVScale(ConstantInt::get(size_ty, scalablesSize));
+        B.CreateElementCount(size_ty, ElementCount::getScalable(scalablesSize));
     auto *const structSize =
         B.CreateAdd(vscale, ConstantInt::get(size_ty, fixedSize));
     auto *const buffer_size = B.CreateMul(structSize, live_var_size);
@@ -1368,7 +1364,8 @@ Function *compiler::utils::WorkItemLoopsPass::makeWrapperFunction(
 
   auto sizeTyBytes = getSizeTypeBytes(M);
 
-  auto *VF = materializeVF(entryIR, barrierMain.getVFInfo().vf);
+  auto *VF = entryIR.CreateElementCount(compiler::utils::getSizeType(M),
+                                        barrierMain.getVFInfo().vf);
   Value *localSizeDim[3];
 
   if (auto wgs = parseRequiredWGSMetadata(refF)) {
@@ -1478,8 +1475,11 @@ Function *compiler::utils::WorkItemLoopsPass::makeWrapperFunction(
       peel = entryIR.CreateSelect(
           remcond, mainLoopLimit,
           Constant::getNullValue(mainLoopLimit->getType()), "peel");
-      effectiveVF = entryIR.CreateSelect(
-          remcond, materializeVF(entryIR, barrierTail->getVFInfo().vf), VF);
+      effectiveVF =
+          entryIR.CreateSelect(remcond,
+                               entryIR.CreateElementCount(
+                                   VF->getType(), barrierTail->getVFInfo().vf),
+                               VF);
     }
     mainLoopLimit = entryIR.CreateSub(mainLoopLimit, peel, "mainLoopLimit");
   }
@@ -1748,9 +1748,8 @@ Function *compiler::utils::WorkItemLoopsPass::makeWrapperFunction(
         // compile time but we should never actually execute such a kernel -
         // we already assume the local sizes are never zero, see elsewhere in
         // this pass) then encode a token info metadata of 1.
-        mainInfo =
-            VectorizationInfo{VectorizationFactor::getScalar(), workItemDim0,
-                              /*isVectorPredicated*/ false};
+        mainInfo = VectorizationInfo{ElementCount::getFixed(1), workItemDim0,
+                                     /*isVectorPredicated*/ false};
       }
     }
     tailInfo = std::nullopt;
@@ -1799,7 +1798,7 @@ PreservedAnalyses compiler::utils::WorkItemLoopsPass::run(
 
     const auto WorkItemDim0 = 0;
 
-    const VectorizationInfo scalarTailInfo{VectorizationFactor::getScalar(),
+    const VectorizationInfo scalarTailInfo{ElementCount::getFixed(1),
                                            WorkItemDim0,
                                            /*IsVectorPredicated*/ false};
 
@@ -1843,7 +1842,7 @@ PreservedAnalyses compiler::utils::WorkItemLoopsPass::run(
     //   local work-group size
     if (!TailFunc || MainInfo.IsVectorPredicated || ForceNoTail ||
         (LocalSizeInVecDim && !MainInfo.vf.isScalable() &&
-         *LocalSizeInVecDim % MainInfo.vf.getKnownMin() == 0)) {
+         *LocalSizeInVecDim % MainInfo.vf.getKnownMinValue() == 0)) {
       MainTailPairs.push_back({BaseName, &F, MainInfo, /*TailF*/ nullptr,
                                /*TailInfo*/ std::nullopt,
                                /*SkippedTailF*/ TailFunc});
@@ -1878,7 +1877,7 @@ PreservedAnalyses compiler::utils::WorkItemLoopsPass::run(
       } else if (auto wgs = parseRequiredWGSMetadata(*P.MainF)) {
         const uint64_t local_size_x = wgs.value()[0];
         if (!P.MainInfo.IsVectorPredicated &&
-            !(local_size_x % P.MainInfo.vf.getKnownMin())) {
+            !(local_size_x % P.MainInfo.vf.getKnownMinValue())) {
           RedundantMains.insert(TailF);
         }
       }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/pass.h
index af6742390abfc..75c0045705a58 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/pass.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/pass.h
@@ -21,9 +21,9 @@
 #ifndef VECZ_PASS_H
 #define VECZ_PASS_H
 
-#include <compiler/utils/vectorization_factor.h>
 #include <llvm/IR/Function.h>
 #include <llvm/IR/PassManager.h>
+#include <llvm/Support/TypeSize.h>
 
 #include <cstdint>
 #include <optional>
@@ -48,23 +48,21 @@ namespace vecz {
 /// @{
 
 struct VeczPassOptions {
-  VeczPassOptions() : vecz_auto(false), vec_dim_idx(0), local_size(0) {}
-
   /// @brief boolean choices such as double support, partial scalarization
   vecz::VectorizationChoices choices;
 
   /// @brief vectorization factor, including known min and scalable flag
-  compiler::utils::VectorizationFactor factor;
+  llvm::ElementCount factor = llvm::ElementCount::getFixed(1);
 
   /// @brief automatically work out factor
-  bool vecz_auto;
+  bool vecz_auto = false;
 
   /// @brief Index of vectorization dimension to use (0 => x, 1 => y, 2 => z).
-  uint32_t vec_dim_idx;
+  uint32_t vec_dim_idx = 0;
 
   /// @brief local_size Value specifying the local size for the function (0 is
   /// unknown)
-  uint64_t local_size;
+  uint64_t local_size = 0;
 };
 
 /// @brief Returns the vectorization options that would vectorize the provided
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp
index 0dc17a436e275..80cd9b9967110 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp
@@ -21,7 +21,6 @@
 #include <compiler/utils/device_info.h>
 #include <compiler/utils/metadata.h>
 #include <compiler/utils/sub_group_analysis.h>
-#include <compiler/utils/vectorization_factor.h>
 #include <llvm/IR/DebugInfoMetadata.h>
 #include <llvm/IR/LLVMContext.h>
 #include <llvm/IR/Module.h>
@@ -64,8 +63,7 @@ static cl::opt<std::string> VeczPassPipeline(
         "available before a certain pass, add 'require<foo-analysis>'."));
 
 namespace vecz {
-using FnVectorizationResult =
-    std::pair<Function *, compiler::utils::VectorizationFactor>;
+using FnVectorizationResult = std::pair<Function *, llvm::ElementCount>;
 AnalysisKey VeczPassOptionsAnalysis::Key;
 
 PreservedAnalyses RunVeczPass::run(Module &M, ModuleAnalysisManager &MAM) {
@@ -207,7 +205,7 @@ PreservedAnalyses VeczPassOptionsPrinterPass::run(Module &M,
       if (O.factor.isScalable()) {
         OS << "vscale x ";
       }
-      OS << O.factor.getKnownMin();
+      OS << O.factor.getKnownMinValue();
 
       if (O.vecz_auto) {
         OS << ", (auto)";
@@ -263,7 +261,7 @@ std::optional<VeczPassOptions> getReqdSubgroupSizeOpts(Function &F) {
     }
     // Else we must vectorize such that we multiply the existing mux sub-group
     // size up to the required one.
-    vecz_opts.factor = compiler::utils::VectorizationFactor::getFixedWidth(
+    vecz_opts.factor = ElementCount::getFixed(
         *reqd_sg_size / compiler::utils::getMuxSubgroupSize(F));
     vecz_opts.choices.enable(vecz::VectorizationChoices::eDivisionExceptions);
     return vecz_opts;
@@ -357,8 +355,7 @@ std::optional<VeczPassOptions> getAutoSubgroupSizeOpts(
     return std::nullopt;
   }
 
-  vecz_opts.factor =
-      compiler::utils::VectorizationFactor::getFixedWidth(*best_width);
+  vecz_opts.factor = ElementCount::getFixed(*best_width);
 
   return vecz_opts;
 }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
index 0b599bc64b019..5e25705209cb9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -507,10 +507,8 @@ bool Packetizer::Impl::packetize() {
             B, VL, /*WidestType*/ 32, VU.width())) {
       VL = RVVVL;
     } else {
-      auto *const Scaling =
-          ConstantInt::get(VL->getType(), VU.width().getKnownMinValue());
       auto *const VectorLength =
-          VU.width().isScalable() ? B.CreateVScale(Scaling) : Scaling;
+          B.CreateElementCount(VL->getType(), VU.width());
       VL = B.CreateIntrinsic(Intrinsic::umin, {VL->getType()},
                              {VL, VectorLength});
 
@@ -784,12 +782,7 @@ bool Packetizer::Impl::packetize() {
           if (VL) {
             VecgroupSize = VL;
           } else {
-            auto *const VFVal = B.getInt32(SimdWidth.getKnownMinValue());
-            if (!SimdWidth.isScalable()) {
-              VecgroupSize = VFVal;
-            } else {
-              VecgroupSize = B.CreateVScale(VFVal);
-            }
+            VecgroupSize = B.CreateElementCount(I32Ty, SimdWidth);
           }
           assert(VecgroupSize && "Could not determine vector group size");
 
@@ -1326,12 +1319,7 @@ Value *Packetizer::Impl::packetizeGroupBroadcast(Instruction *I) {
   auto *idx = CI->getArgOperand(argIdx + 1);
   // We need to sanitize the input index so that it stays within the range of
   // one vectorized group.
-  auto *const minVal =
-      ConstantInt::get(idx->getType(), SimdWidth.getKnownMinValue());
-  Value *idxFactor = minVal;
-  if (SimdWidth.isScalable()) {
-    idxFactor = B.CreateVScale(minVal);
-  }
+  Value *idxFactor = B.CreateElementCount(idx->getType(), SimdWidth);
   auto *const vecIdx = B.CreateURem(idx, idxFactor);
 
   Value *val = nullptr;
@@ -3429,10 +3417,8 @@ Value *Packetizer::Impl::vectorizeWorkGroupCall(
   // |-----------------|-----------------|
   // |  0   1   2   3  |  4   5   6   7  |
   if (Builtin.ID == compiler::utils::eMuxBuiltinGetSubGroupLocalId) {
-    auto SimdWithAsVal = B.getInt32(SimdWidth.getKnownMinValue());
-    IDToSplat = B.CreateMul(IDToSplat, !SimdWidth.isScalable()
-                                           ? SimdWithAsVal
-                                           : B.CreateVScale(SimdWithAsVal));
+    IDToSplat = B.CreateMul(
+        IDToSplat, B.CreateElementCount(IDToSplat->getType(), SimdWidth));
   }
 
   // Broadcast the builtin's return value.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp
index e1a42789c796f..99f97a5d3f280 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp
@@ -233,10 +233,8 @@ llvm::Value *getIntrinsicVL(llvm::IRBuilderBase &B, llvm::Value *VL,
   }
 
   // Else create a 'default' VL which covers the entire scalable vector.
-  return B.CreateVScale(
-      B.getIntN(XLenTyWidth,
-                cast<VectorType>(wideTy)->getElementCount().getKnownMinValue()),
-      N);
+  return B.CreateElementCount(XLen,
+                              cast<VectorType>(wideTy)->getElementCount());
 }
 
 /// @brief Returns a pair with the `vrgather` intrinsic variation to use and the
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
index 7dd0b5422c30d..2079d53431942 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
@@ -926,10 +926,8 @@ bool VectorizationContext::emitSubgroupScanBody(Function &F, bool IsInclusive,
   Value *Width = nullptr;
   if (IsVP) {
     Width = VL;
-  } else if (EC.isScalable()) {
-    Width = B.CreateVScale(ConstantInt::get(IVTy, EC.getKnownMinValue()));
   } else {
-    Width = ConstantInt::get(IVTy, EC.getFixedValue());
+    Width = B.CreateElementCount(IVTy, EC);
   }
 
   B.CreateBr(Loop);
@@ -1048,14 +1046,11 @@ bool VectorizationContext::emitMaskedAtomicBody(
   const bool IsVector = ValArg->getType()->isVectorTy();
 
   Value *const IdxStart = B.getInt32(0);
-  ConstantInt *const KnownMin = B.getInt32(MA.VF.getKnownMinValue());
   Value *IdxEnd;
   if (MA.IsVectorPredicated) {
     IdxEnd = F.getArg(3 + IsCmpXchg);
-  } else if (MA.VF.isScalable()) {
-    IdxEnd = B.CreateVScale(KnownMin);
   } else {
-    IdxEnd = KnownMin;
+    IdxEnd = B.CreateElementCount(B.getInt32Ty(), MA.VF);
   }
 
   Value *RetVal = nullptr;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorizer.cpp
index 41feec41bd3c9..080f0a7828d2e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorizer.cpp
@@ -248,8 +248,7 @@ VectorizationUnit *vecz::createVectorizationUnit(VectorizationContext &Ctx,
   const unsigned SimdDimIdx = Opts.vec_dim_idx;
   const unsigned LocalSize = Opts.local_size;
   const bool Auto = Opts.vecz_auto;
-  auto VF =
-      ElementCount::get(Opts.factor.getKnownMin(), Opts.factor.isScalable());
+  auto VF = Opts.factor;
 
   if (!Kernel || VF.isScalar()) {
     ++VeczBail;
@@ -345,8 +344,7 @@ bool vecz::createVectorizedFunctionMetadata(VectorizationUnit &vu) {
   const auto dim = vu.dimension();
 
   // emit output metadata based on vectorization result
-  auto finalVF = compiler::utils::VectorizationFactor(vf.getKnownMinValue(),
-                                                      vf.isScalable());
+  auto finalVF = vf;
 
   const compiler::utils::VectorizationInfo info{
       finalVF, dim, vu.choices().vectorPredication()};
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp
index 197c4e9bfc928..ffeb4810af918 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp
@@ -20,7 +20,6 @@
 #include <compiler/utils/optimal_builtin_replacement_pass.h>
 #include <compiler/utils/pass_machinery.h>
 #include <compiler/utils/sub_group_analysis.h>
-#include <compiler/utils/vectorization_factor.h>
 #include <llvm/ADT/Statistic.h>
 #include <llvm/ADT/StringSwitch.h>
 #include <llvm/Analysis/AliasAnalysis.h>
@@ -178,16 +177,9 @@ static vecz::VeczPassOptions getDefaultPassOptions() {
   }
 
   const auto factor = SIMDWidth ? SIMDWidth : 4;
-  auto VF = compiler::utils::VectorizationFactor::getFixedWidth(factor);
-  if (VeczSimdWidth) {
-    VF.setKnownMin(VeczSimdWidth);
-  }
+  auto VF = llvm::ElementCount::get(VeczSimdWidth ? VeczSimdWidth : factor,
+                                    VeczScalable == llvm::cl::BOU_TRUE);
 
-  if (VeczScalable == llvm::cl::BOU_TRUE) {
-    VF.setIsScalable(true);
-  } else if (VeczScalable == llvm::cl::BOU_FALSE) {
-    VF.setIsScalable(false);
-  }
   vecz::VeczPassOptions passOpts;
   passOpts.choices = Choices;
   passOpts.factor = VF;
@@ -231,7 +223,7 @@ static bool parsePassOptionsSwitch(
       if (vals.consume_front("a")) {
         opt.vecz_auto = true;
       } else if (!vals.consumeInteger(10, vf)) {
-        opt.factor = compiler::utils::VectorizationFactor::getFixedWidth(vf);
+        opt.factor = llvm::ElementCount::getFixed(vf);
       }
       if (vals.consume_front(".")) {
         unsigned dim;
@@ -251,7 +243,10 @@ static bool parsePassOptionsSwitch(
         opt.local_size = simd_width;
       }
       // <scalable_spec> ::= 's'
-      opt.factor.setIsScalable(vals.consume_front("s"));
+      if (vals.consume_front("s")) {
+        opt.factor =
+            llvm::ElementCount::getScalable(opt.factor.getKnownMinValue());
+      }
       // <predicated_spec> ::= 'p'
       if (vals.consume_front("p")) {
         opt.choices.enableVectorPredication();
@@ -438,8 +433,8 @@ int main(const int argc, const char *const argv[]) {
         bool found = false;
         for (auto &result : results) {
           // FIXME this probably not the best way to do this
-          found |=
-              result.second.vf.getKnownMin() >= expected.factor.getKnownMin();
+          found |= result.second.vf.getKnownMinValue() >=
+                   expected.factor.getKnownMinValue();
         }
         if (!found) {
           llvm::errs() << "Error: Failed to vectorize function '" << f.getName()

From dedf80aca99cd5b4f9fe12e69a4ac0212650c808 Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Thu, 12 Jun 2025 16:38:02 +0100
Subject: [PATCH 162/182] [NFC] Remove code for old debug info format.

We always use the new debug info format now.
---
 .../source/work_item_loops_pass.cpp           | 36 +++++++++----------
 1 file changed, 16 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
index 4578a27686028..0c5bc4dc4ded9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
@@ -213,30 +213,26 @@ struct ScheduleGenerator {
           old_var->getFile(), old_var->getLine(), old_var->getType(),
           /*AlwaysPreserve=*/false, DINode::FlagZero,
           old_var->getAlignInBits());
+
       // Create intrinsic
-      if (!module.IsNewDbgInfoFormat) {
-        auto *const DII = cast<Instruction *>(DIB.insertDeclare(
-            barrier.getDebugAddr(), new_var, expr, wrapperDbgLoc, block));
-
-        // Bit of a HACK to produce the same debug output as the Mem2Reg
-        // pass used to do.
-        auto *const DVIntrinsic = cast<DbgVariableIntrinsic>(DII);
-        ConvertDebugDeclareToDebugValue(DVIntrinsic, SI, DIB);
-      } else {
-        auto *const DVR = static_cast<DbgVariableRecord *>(
-            cast<DbgRecord *>(DIB.insertDeclare(barrier.getDebugAddr(), new_var,
-                                                expr, wrapperDbgLoc, block)));
 
-        // This is nasty, but LLVM errors out on trailing debug info, we need a
-        // subsequent instruction even if we delete it immediately afterwards.
-        auto *DummyInst = new UnreachableInst(module.getContext(), block);
+#if LLVM_VERSION_LESS(21, 0)
+      assert(module.IsNewDbgInfoFormat &&
+             "Modules should be using the new debug info format");
+#endif
+      auto *const DVR =
+          static_cast<DbgVariableRecord *>(cast<DbgRecord *>(DIB.insertDeclare(
+              barrier.getDebugAddr(), new_var, expr, wrapperDbgLoc, block)));
 
-        // Bit of a HACK to produce the same debug output as the Mem2Reg
-        // pass used to do.
-        ConvertDebugDeclareToDebugValue(DVR, SI, DIB);
+      // This is nasty, but LLVM errors out on trailing debug info, we need a
+      // subsequent instruction even if we delete it immediately afterwards.
+      auto *DummyInst = new UnreachableInst(module.getContext(), block);
 
-        DummyInst->eraseFromParent();
-      }
+      // Bit of a HACK to produce the same debug output as the Mem2Reg
+      // pass used to do.
+      ConvertDebugDeclareToDebugValue(DVR, SI, DIB);
+
+      DummyInst->eraseFromParent();
     };
     for (auto debug_pair : barrier.getDebugIntrinsics()) {
       RecreateDebugIntrinsic(debug_pair.first->getVariable(),

From 75f885bd08c59c64df4c1cb88f10a87ffba2d353 Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Fri, 20 Jun 2025 15:44:48 +0100
Subject: [PATCH 163/182] [LLVM 21] Allow nuw in more tests.

LLVM 21 adds nuw flags for vscale multiplications. Allow these in tests.
---
 .../test/lit/llvm/ScalableVectors/define_subgroup_scans.ll    | 4 ++--
 .../vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll   | 4 ++--
 .../test/lit/llvm/VectorPredication/compute_vector_length.ll  | 2 +-
 .../vecz/test/lit/llvm/VectorPredication/load_add_store.ll    | 4 ++--
 .../vecz/test/lit/llvm/VectorPredication/udiv.ll              | 2 +-
 5 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans.ll
index 6a8a686d0903f..26887bced392d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans.ll
@@ -44,7 +44,7 @@ declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_add_u5nxv4j(<vscal
 ; CHECK:   %[[SHUFFLE_ALLOC:.+]] = alloca <vscale x 4 x i32>
 ; CHECK:   %[[STEP:.+]] = call <vscale x 4 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv4i32()
 ; CHECK:   %[[SCALE:.+]] = call i32 @llvm.vscale.i32()
-; CHECK:   %[[SIZE:.+]] = mul i32 %[[SCALE]], 4
+; CHECK:   %[[SIZE:.+]] = mul {{(nuw )?}}i32 %[[SCALE]], 4
 ; CHECK:   br label %loop
 ; CHECK: loop:
 ; CHECK:   %[[IV:.+]] = phi i32 [ 1, %entry ], [ %[[N2:.+]], %loop ]
@@ -79,7 +79,7 @@ declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_add_u5nxv4j(<vscal
 ; CHECK:   %[[SHUFFLE_ALLOC:.+]] = alloca <vscale x 4 x i32>
 ; CHECK:   %[[STEP:.+]] = call <vscale x 4 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv4i32()
 ; CHECK:   %[[SCALE:.+]] = call i32 @llvm.vscale.i32()
-; CHECK:   %[[SIZE:.+]] = mul i32 %[[SCALE]], 4
+; CHECK:   %[[SIZE:.+]] = mul {{(nuw )?}}i32 %[[SCALE]], 4
 ; CHECK:   br label %loop
 ; CHECK: loop:
 ; CHECK:   %[[IV:.+]] = phi i32 [ 1, %entry ], [ %[[N2:.+]], %loop ]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll
index 19093a2f13153..386844b89a495 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll
@@ -34,7 +34,7 @@ define spir_kernel void @get_sub_group_size(i32 addrspace(1)* %in, i32 addrspace
   ret void
 ; CHECK-LABEL: define spir_kernel void @__vecz_nxv4_get_sub_group_size(
 ; CHECK: [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK: [[W:%.*]] = shl i32 [[VSCALE]], 2
+; CHECK: [[W:%.*]] = shl {{(nuw )?}}i32 [[VSCALE]], 2
 ; CHECK: [[RED:%.*]] = call i32 @__mux_sub_group_reduce_add_i32(i32 [[W]])
 ; CHECK: store i32 [[RED]], ptr addrspace(1) {{.*}}
 }
@@ -47,7 +47,7 @@ define spir_kernel void @get_sub_group_local_id(i32 addrspace(1)* %in, i32 addrs
 ; CHECK-LABEL: define spir_kernel void @__vecz_nxv4_get_sub_group_local_id(
 ; CHECK: %call = tail call spir_func i32 @__mux_get_sub_group_local_id()
 ; CHECK: [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK: [[SHL:%.*]] = shl i32 %1, 2
+; CHECK: [[SHL:%.*]] = shl {{(nuw )?}}i32 [[VSCALE]], 2
 ; CHECK: [[MUL:%.*]] = mul i32 %call, [[SHL]]
 ; CHECK: [[SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[MUL]], i64 0
 ; CHECK: [[SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/compute_vector_length.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/compute_vector_length.ll
index 633fac20e4050..042114787f31d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/compute_vector_length.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/compute_vector_length.ll
@@ -49,7 +49,7 @@ define spir_kernel void @get_sub_group_size(i32 addrspace(1)* %in, i32 addrspace
 ; CHECK-S4: [[SZ:%.*]] = call i64 @__mux_get_local_size(i32 0)
 ; CHECK-S4: [[WL:%.*]] = sub {{.*}} i64 [[SZ]], [[ID]]
 ; CHECK-S4: [[VF0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-S4: [[VF1:%.*]] = shl i64 [[VF0]], 2
+; CHECK-S4: [[VF1:%.*]] = shl {{(nuw )?}}i64 [[VF0]], 2
 ; CHECK-S4: [[VL0:%.*]] = call i64 @llvm.umin.i64(i64 [[WL]], i64 [[VF1]])
 ; CHECK-S4: [[VL1:%.*]] = trunc {{(nuw )?(nsw )?}}i64 [[VL0]] to i32
 ; CHECK-S4: [[RED:%.*]] = call i32 @__mux_sub_group_reduce_add_i32(i32 [[VL1]])
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/load_add_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/load_add_store.ll
index 7e8f0770dc215..6d25a6a5a924b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/load_add_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/load_add_store.ll
@@ -55,7 +55,7 @@ entry:
 ; CHECK_1S: [[LSIZE:%.*]] = call i64 @__mux_get_local_size(i32 0)
 ; CHECK_1S: [[WREM:%.*]] = sub nuw nsw i64 [[LSIZE]], [[LID]]
 ; CHECK_1S: [[T0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK_1S: [[T1:%.*]] = shl i64 [[T0]], 2
+; CHECK_1S: [[T1:%.*]] = shl {{(nuw )?}}i64 [[T0]], 2
 ; CHECK_1S: [[T2:%.*]] = call i64 @llvm.umin.i64(i64 [[WREM]], i64 [[T1]])
 ; CHECK_1S: [[VL:%.*]] = trunc {{(nuw )?(nsw )?}}i64 [[T2]] to i32
 ; CHECK_1S: [[LHS:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr {{%.*}}, [[TRUEMASK:<vscale x 4 x i1> (shufflevector \(<vscale x 4 x i1> insertelement \(<vscale x 4 x i1> (undef|poison), i1 true, (i32|i64) 0\), <vscale x 4 x i1> (undef|poison), <vscale x 4 x i32> zeroinitializer\)|splat \(i1 true\))]], i32 [[VL]])
@@ -94,7 +94,7 @@ entry:
 ; CHECK_V4_1S: [[LSIZE:%.*]] = call i64 @__mux_get_local_size(i32 0)
 ; CHECK_V4_1S: [[WREM:%.*]] = sub nuw nsw i64 [[LSIZE]], [[LID]]
 ; CHECK_V4_1S: [[T0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK_V4_1S: [[T1:%.*]] = shl i64 [[T0]], 2
+; CHECK_V4_1S: [[T1:%.*]] = shl {{(nuw )?}}i64 [[T0]], 2
 ; CHECK_V4_1S: [[T2:%.*]] = call i64 @llvm.umin.i64(i64 [[WREM]], i64 [[T1]])
 ; CHECK_V4_1S: [[VL:%.*]] = trunc {{(nuw )?(nsw )?}}i64 [[T2]] to i32
 ; Each WI performs 4 elements, so multiply the VL by 4
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/udiv.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/udiv.ll
index bf082b4530bc8..7a080a850ca8c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/udiv.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/udiv.ll
@@ -41,7 +41,7 @@ entry:
 ; CHECK: [[LSIZE:%.*]] = call i64 @__mux_get_local_size(i32 0)
 ; CHECK: [[WREM:%.*]] = sub nuw nsw i64 [[LSIZE]], [[LID]]
 ; CHECK: [[T0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK: [[T1:%.*]] = shl i64 [[T0]], 1
+; CHECK: [[T1:%.*]] = shl {{(nuw )?}}i64 [[T0]], 1
 ; CHECK: [[T2:%.*]] = call i64 @llvm.umin.i64(i64 [[WREM]], i64 [[T1]])
 ; CHECK: [[VL:%.*]] = trunc i64 [[T2]] to i32
 ; CHECK: [[LHS:%.*]] = call <vscale x 2 x i32> @llvm.vp.load.nxv2i32.p0(ptr {{%.*}}, [[TRUEMASK:<vscale x 2 x i1> (shufflevector \(<vscale x 2 x i1> insertelement \(<vscale x 2 x i1> (undef|poison), i1 true, (i32|i64) 0\), <vscale x 2 x i1> (undef|poison), <vscale x 2 x i32> zeroinitializer\)|splat \(i1 true\))]], i32 [[VL]])

From 56b5011acfe164609df6114d8c3a079198e45b96 Mon Sep 17 00:00:00 2001
From: Colin Davidson <colin.davidson@codeplay.com>
Date: Wed, 16 Jul 2025 08:07:16 +0100
Subject: [PATCH 164/182] Replace getNextNonDebugInstruction() with
 getNextNode()

llvm PR #144383 removed getNextNonDebugInstruction() and replaced its usage with getNextNode().
Since this replacement seems to have no ill effects, we make the same
change here.
---
 .../compiler_pipeline/source/barrier_regions.cpp            | 6 +++---
 .../compiler_passes/vecz/source/transform/packetizer.cpp    | 6 +++---
 .../vecz/source/transform/remove_intptr_pass.cpp            | 2 +-
 .../compiler_passes/vecz/source/transform/scalarizer.cpp    | 6 +++---
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
index 0160470bb4cd7..b81098a0f62e9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
@@ -1188,7 +1188,7 @@ Function *compiler::utils::Barrier::GenerateNewKernel(BarrierRegion &region) {
   // The entry kernel might have allocas in it that don't get removed,
   // so better make sure to insert after them.
   while (isa<AllocaInst>(insert_point)) {
-    insert_point = insert_point->getNextNonDebugInstruction();
+    insert_point = insert_point->getNextNode();
   }
 
   // It puts all the GEPs at the start of the kernel, but only once
@@ -1258,9 +1258,9 @@ Function *compiler::utils::Barrier::GenerateNewKernel(BarrierRegion &region) {
       // Place the new store immediately after the definition, but if it's a
       // PHI node we have to make sure to put it after any other PHI nodes.
       Instruction *inst = cast<Instruction>(vmap[live_var]);
-      Instruction *insert_point = inst->getNextNonDebugInstruction();
+      Instruction *insert_point = inst->getNextNode();
       while (isa<PHINode>(insert_point)) {
-        insert_point = insert_point->getNextNonDebugInstruction();
+        insert_point = insert_point->getNextNode();
       }
       IRBuilder<> B(insert_point);
       if (!isStructWithScalables(live_var->getType())) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
index 5e25705209cb9..8b75aa9fe02a1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -612,12 +612,12 @@ bool Packetizer::Impl::packetize() {
   for (auto &I : EntryBB) {
     auto *const alloca = dyn_cast<AllocaInst>(&I);
     if (!alloca) {
-      insertPt = I.getNextNonDebugInstruction();
+      insertPt = I.getNextNode();
       continue;
     }
 
     while (isa<AllocaInst>(insertPt)) {
-      insertPt = insertPt->getNextNonDebugInstruction();
+      insertPt = insertPt->getNextNode();
     }
 
     // It's possible for some uses of the alloca to be packetized and others
@@ -3484,7 +3484,7 @@ Value *Packetizer::Impl::vectorizeAlloca(AllocaInst *alloca) {
   // Put the GEP after all allocas.
   Instruction *insertPt = alloca;
   while (isa<AllocaInst>(*insertPt)) {
-    insertPt = insertPt->getNextNonDebugInstruction();
+    insertPt = insertPt->getNextNode();
   }
   B.SetInsertPoint(insertPt);
   deleteInstructionLater(alloca);
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/remove_intptr_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/remove_intptr_pass.cpp
index 9d66e14e73eef..d9438c49a2ec8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/remove_intptr_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/remove_intptr_pass.cpp
@@ -68,7 +68,7 @@ PreservedAnalyses RemoveIntPtrPass::run(Function &F,
 
         Instruction *insert = phi;
         while (isa<PHINode>(insert)) {
-          insert = insert->getNextNonDebugInstruction();
+          insert = insert->getNextNode();
         }
 
         // Populate the replacement PHI node
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
index a70b822601043..84e3fac49d47d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
@@ -109,7 +109,7 @@ Value *Scalarizer::getGather(Value *V) {
 
   // Have to build after any PHI nodes.
   while (isa<PHINode>(insert)) {
-    insert = insert->getNextNonDebugInstruction();
+    insert = insert->getNextNode();
   }
   IRBuilder<> B(insert);
 
@@ -641,9 +641,9 @@ SimdPacket *Scalarizer::extractLanes(llvm::Value *V, PacketMask PM) {
     }
     insert = &*insertAfter;
   } else if (auto *Inst = dyn_cast<Instruction>(V)) {
-    insert = Inst->getNextNonDebugInstruction();
+    insert = Inst->getNextNode();
     while (isa<PHINode>(insert)) {
-      insert = insert->getNextNonDebugInstruction();
+      insert = insert->getNextNode();
     }
   } else {
     return nullptr;

From d93bf5e163609f06b04a7c614f0700212e146671 Mon Sep 17 00:00:00 2001
From: Colin Davidson <colin.davidson@codeplay.com>
Date: Thu, 17 Jul 2025 09:32:13 +0100
Subject: [PATCH 165/182] [LLVM TIP] Fix use of ConvertDebugDeclareToDebugValue

The function ConvertDebugDeclareToDebugValue() has changed to no longer
accept DbgVariableIntrinsic, and the cases where it was used in
conjunction with findDbgDeclares() have been removed in LLVM PR #149037.
We no longer generate old-style debug info, so we just remove the code
altogether.
---
 .../vecz/source/transform/basic_mem2reg_pass.cpp             | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp
index 5d9d78e36d7b3..fa29f71b8894e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp
@@ -181,11 +181,6 @@ bool BasicMem2RegPass::promoteAlloca(AllocaInst *Alloca) const {
     if (StoreInst *Store = dyn_cast<StoreInst>(U)) {
       StoredValue = Store->getValueOperand();
       ToDelete.push_back(Store);
-      DIBuilder DIB(*Alloca->getModule(), /*AllowUnresolved*/ false);
-      auto DbgIntrinsics = findDbgDeclares(Alloca);
-      for (auto oldDII : DbgIntrinsics) {
-        ConvertDebugDeclareToDebugValue(oldDII, Store, DIB);
-      }
       break;
     }
   }

From f2e46fef7d2656e9db293d207532081da6b0bb18 Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Sat, 19 Jul 2025 19:05:22 +0100
Subject: [PATCH 166/182] [NFC] Remove more old debug info handling.

As more old debug info functions and types are being removed in LLVM
22, prepare for this by removing our handling of them: we already know
that, regardless of LLVM version, we are no longer using them.
---
 .../include/compiler/utils/barrier_regions.h  |  12 --
 .../source/barrier_regions.cpp                |  12 --
 .../source/pass_functions.cpp                 |  16 ---
 .../source/work_item_loops_pass.cpp           |   4 -
 .../vecz/source/transform/scalarizer.cpp      |  64 +++--------
 .../vecz/source/vectorization_helpers.cpp     | 104 +-----------------
 6 files changed, 20 insertions(+), 192 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/barrier_regions.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/barrier_regions.h
index c40d1743b0bfc..3f1b89d03c1ae 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/barrier_regions.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/barrier_regions.h
@@ -38,7 +38,6 @@
 namespace llvm {
 class BasicBlock;
 class CallInst;
-class DbgDeclareInst;
 class FenceInst;
 class Function;
 class Instruction;
@@ -152,15 +151,6 @@ class Barrier {
   /// @brief replaces a subkernel with a given function
   void replaceSubkernel(llvm::Function *from, llvm::Function *to);
 
-  /// @brief Type containing list of debug intrinsics and the source variable
-  /// byte offset in the live variables struct.
-  // TODO CA-1115 llvm.dbg.declare is being deprecated
-  using debug_intrinsics_t =
-      llvm::SmallVector<std::pair<llvm::DbgDeclareInst *, unsigned>, 4>;
-  const debug_intrinsics_t &getDebugIntrinsics() const {
-    return debug_intrinsics_;
-  }
-
   using debug_variable_records_t =
       llvm::SmallVector<std::pair<llvm::DbgVariableRecord *, unsigned>, 4>;
   const debug_variable_records_t &getDebugDbgVariableRecords() const {
@@ -269,8 +259,6 @@ class Barrier {
   barrier_block_block_set_t barrier_successor_set_;
   /// @brief Map between barrier ids and call instructions invoking stubs
   debug_stub_map_t barrier_stub_call_map_;
-  /// @brief List of debug intrinsics and byte offsets into live variable struct
-  debug_intrinsics_t debug_intrinsics_;
   /// @brief List of debug DbgVariableRecords and byte offsets into live
   /// variable struct
   debug_variable_records_t debug_variable_records_;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
index b81098a0f62e9..a14efe7c4157a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
@@ -957,12 +957,6 @@ void compiler::utils::Barrier::MakeLiveVariableMemType() {
 
     // Check if the alloca has a debug info source variable attached. If
     // so record this and the matching byte offset into the struct.
-    auto DbgIntrinsics = findDbgDeclares(member.value);
-    for (auto DII : DbgIntrinsics) {
-      if (auto dbgDeclare = dyn_cast<DbgDeclareInst>(DII)) {
-        debug_intrinsics_.push_back(std::make_pair(dbgDeclare, offset));
-      }
-    }
     const auto DVRDeclares = findDVRDeclares(member.value);
     for (auto *const DVRDeclare : DVRDeclares) {
       debug_variable_records_.push_back(std::make_pair(DVRDeclare, offset));
@@ -1425,12 +1419,6 @@ BasicBlock *compiler::utils::Barrier::CloneBasicBlock(
 
   // Loop over all instructions, and copy them over.
   for (Instruction &i : *bb) {
-    // Don't clone over debug intrinsics since we're going to create them
-    // manually later.
-    if (isa<DbgDeclareInst>(&i)) {
-      continue;
-    }
-
     Instruction *new_inst = i.clone();
     if (i.hasName()) new_inst->setName(i.getName() + name_suffix);
     new_inst->insertInto(new_bb, new_bb->end());
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp
index 0252f527ee063..95791fee0340a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp
@@ -125,22 +125,6 @@ bool funcContainsDebugMetadata(const llvm::Function &func,
         vmap.MD()[loc].reset(loc);
         foundDI = true;
       }
-
-      if (auto DebugIntrinsic = llvm::dyn_cast<llvm::DbgInfoIntrinsic>(&Inst)) {
-        llvm::DILocalVariable *DIVar = nullptr;
-        if (auto DbgVarIntrinsic =
-                llvm::dyn_cast<llvm::DbgVariableIntrinsic>(DebugIntrinsic)) {
-          DIVar = DbgVarIntrinsic->getVariable();
-        } else {
-          continue;  // TODO CA-1115 - we don't handle DbgLabelInsts yet
-        }
-        if (DIVar) {
-          vmap.MD()[DIVar].reset(DIVar);
-          auto varLoc = DIVar->getScope();
-          vmap.MD()[varLoc].reset(varLoc);
-          foundDI = true;
-        }
-      }
     }
   }
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
index 0c5bc4dc4ded9..dc08a1faa4d75 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
@@ -234,10 +234,6 @@ struct ScheduleGenerator {
 
       DummyInst->eraseFromParent();
     };
-    for (auto debug_pair : barrier.getDebugIntrinsics()) {
-      RecreateDebugIntrinsic(debug_pair.first->getVariable(),
-                             debug_pair.second);
-    }
     for (auto debug_pair : barrier.getDebugDbgVariableRecords()) {
       RecreateDebugIntrinsic(debug_pair.first->getVariable(),
                              debug_pair.second);
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
index 84e3fac49d47d..ba1a80885a808 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
@@ -685,7 +685,21 @@ void Scalarizer::scalarizeDI(Instruction *Original, const SimdPacket *Packet,
 
   multi_llvm::DIBuilder DIB(*Original->getModule(), false);
 
-  auto CreateAndInsertDIExpr = [&](auto InsertDIExpr) {
+  for (DbgVariableRecord *const DVR : LAM->getAllDbgVariableRecordUsers()) {
+    DILocalVariable *DILocal = nullptr;
+    DebugLoc DILoc;
+
+    switch (DVR->getType()) {
+      case DbgVariableRecord::LocationType::Value:
+      case DbgVariableRecord::LocationType::Declare:
+        DILocal = DVR->getVariable();
+        DILoc = DVR->getDebugLoc();
+        break;
+      default:
+        continue;
+    }
+
+    // Create new DbgVariableRecord across enabled SIMD lanes
     const auto bitSize = Original->getType()->getScalarSizeInBits();
     for (unsigned lane = 0; lane < Width; ++lane) {
       Value *LaneVal = Packet->at(lane);
@@ -703,62 +717,18 @@ void Scalarizer::scalarizeDI(Instruction *Original, const SimdPacket *Packet,
             DIExpression::createFragmentExpression(DIB.createExpression(),
                                                    lane * bitSize, bitSize);
         if (DIExpr) {
-          InsertDIExpr(LaneVal, *DIExpr);
+          DIB.insertDbgValueIntrinsic(LaneVal, DILocal, *DIExpr, DILoc,
+                                      Original->getIterator());
           VectorElements.insert(LaneVal);
         }
       }
     }
-  };
-
-  for (DbgVariableRecord *const DVR : LAM->getAllDbgVariableRecordUsers()) {
-    DILocalVariable *DILocal = nullptr;
-    DebugLoc DILoc;
-
-    switch (DVR->getType()) {
-      case DbgVariableRecord::LocationType::Value:
-      case DbgVariableRecord::LocationType::Declare:
-        DILocal = DVR->getVariable();
-        DILoc = DVR->getDebugLoc();
-        break;
-      default:
-        continue;
-    }
-
-    // Create new DbgVariableRecord across enabled SIMD lanes
-    CreateAndInsertDIExpr([&](Value *LaneVal, DIExpression *DIExpr) {
-      DIB.insertDbgValueIntrinsic(LaneVal, DILocal, DIExpr, DILoc,
-                                  Original->getIterator());
-    });
   }
 
   auto *const MDV = MetadataAsValue::getIfExists(Original->getContext(), LAM);
   if (!MDV) {
     return;
   }
-
-  for (User *U : MDV->users()) {
-    DILocalVariable *DILocal = nullptr;
-    DebugLoc DILoc;
-
-    // These methods aren't virtual in DbgInfoIntrinsic for some reason
-    // TODO CA-1115 - Support llvm.dbg.addr() intrinsic
-    if (DbgValueInst *const DVI = dyn_cast<DbgValueInst>(U)) {
-      DILocal = DVI->getVariable();
-      DILoc = DVI->getDebugLoc();
-    } else if (DbgDeclareInst *const DDI = dyn_cast<DbgDeclareInst>(U)) {
-      DILocal = DDI->getVariable();
-      DILoc = DDI->getDebugLoc();
-    } else {
-      continue;
-    }
-
-    // Create new llvm.dbg.value() intrinsic across enabled SIMD lanes
-    CreateAndInsertDIExpr(
-        [&](Value *const LaneVal, DIExpression *const DIExpr) {
-          DIB.insertDbgValueIntrinsic(LaneVal, DILocal, DIExpr, DILoc,
-                                      Original->getIterator());
-        });
-  }
 }
 
 SimdPacket *Scalarizer::assignScalar(SimdPacket *P, Value *V) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_helpers.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_helpers.cpp
index 9268d67116627..041f20abe74c7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_helpers.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_helpers.cpp
@@ -296,102 +296,9 @@ void cloneDebugInfo(const VectorizationUnit &VU) {
   // Changing the scope to point to the new vectorized function, rather
   // than the scalar function.
 
-  std::vector<Instruction *> DIIntrinsicsToDelete;
-  std::vector<Metadata *> VectorizedLocals;
-
   for (auto &BBItr : *VU.vectorizedFunction()) {
     for (auto &InstItr : BBItr) {
-      // Instruction is a llvm.dbg.value() or llvm.dbg.declare() intrinsic
-      // TODO CA-1115 - Support llvm.dbg.addr() intrinsic
-      if (DbgInfoIntrinsic *const DII = dyn_cast<DbgInfoIntrinsic>(&InstItr)) {
-        // Delete this intrinsic later
-        DIIntrinsicsToDelete.push_back(DII);
-
-        // Generate a new DebugLoc pointing to vectorized function
-        const DebugLoc &ScalarLoc = DII->getDebugLoc();
-
-        // If location is inlined, we need to change the function it's inlined
-        // into to our vectorized kernel, keeping the base location the same.
-        DebugLoc VectorLoc;
-        const DILocation *InlinedLoc = ScalarLoc.getInlinedAt();
-        DISubprogram *OriginalFunc = VectorDI;
-
-        if (InlinedLoc) {
-          OriginalFunc = ScalarLoc->getScope()->getSubprogram();
-          if (InlinedLoc->getInlinedAt()) {
-            // We don't support nested inlined locations currently, abandon
-            // creating dbg intrinsic as otherwise it will fail in validation.
-            continue;
-          }
-
-          const DebugLoc InlinedAtLoc = getDILocation(
-              InlinedLoc->getLine(), InlinedLoc->getColumn(), VectorDI);
-          VectorLoc = getDILocation(ScalarLoc.getLine(), ScalarLoc.getCol(),
-                                    ScalarLoc.getScope(), InlinedAtLoc);
-        } else {
-          VectorLoc =
-              getDILocation(ScalarLoc.getLine(), ScalarLoc.getCol(), VectorDI);
-        }
-
-        // New DILocalVariable in the scope of vectorized function
-        DILocalVariable *VectorLocal = nullptr;
-        if (DbgValueInst *const DVI = dyn_cast<DbgValueInst>(DII)) {
-          if (!DVI->getValue()) {
-            // Debug value has been optimized out
-            continue;
-          }
-
-          // Find DILocalVariable the intrinsic references
-          const DILocalVariable *const ScalarLocal = DVI->getVariable();
-
-          // Create a copy of DILocalVariable but in vectorized function scope
-          if (ScalarLocal->getArg() == 0) {
-            VectorLocal = DIB.createAutoVariable(
-                OriginalFunc, ScalarLocal->getName(), ScalarLocal->getFile(),
-                ScalarLocal->getLine(),
-                dyn_cast<DIType>(ScalarLocal->getType()));
-          } else {
-            VectorLocal = DIB.createParameterVariable(
-                OriginalFunc, ScalarLocal->getName(), ScalarLocal->getArg(),
-                ScalarLocal->getFile(), ScalarLocal->getLine(),
-                dyn_cast<DIType>(ScalarLocal->getType()));
-          }
-
-          // New llvm.dbg.value() with correct scope
-          DIB.insertDbgValueIntrinsic(DVI->getValue(), VectorLocal,
-                                      DVI->getExpression(), VectorLoc,
-                                      DVI->getIterator());
-        } else if (DbgDeclareInst *const DDI = dyn_cast<DbgDeclareInst>(DII)) {
-          // Find DILocalVariable the intrinsic references
-          const DILocalVariable *const ScalarLocal = DDI->getVariable();
-
-          // Create a copy of DILocalVariable but in vectorized function scope
-          if (ScalarLocal->getArg() == 0) {
-            VectorLocal = DIB.createAutoVariable(
-                OriginalFunc, ScalarLocal->getName(), ScalarLocal->getFile(),
-                ScalarLocal->getLine(),
-                dyn_cast<DIType>(ScalarLocal->getType()));
-          } else {
-            VectorLocal = DIB.createParameterVariable(
-                OriginalFunc, ScalarLocal->getName(), ScalarLocal->getArg(),
-                ScalarLocal->getFile(), ScalarLocal->getLine(),
-                dyn_cast<DIType>(ScalarLocal->getType()));
-          }
-
-          // New llvm.dbg.declare() with correct scope
-          DIB.insertDeclare(DDI->getAddress(), VectorLocal,
-                            DDI->getExpression(), VectorLoc,
-                            DDI->getIterator());
-        } else {
-          continue;  // No other DbgInfoIntrinsic subclasses
-        }
-
-        if (VectorizedLocals.end() == std::find(VectorizedLocals.begin(),
-                                                VectorizedLocals.end(),
-                                                VectorLocal)) {
-          VectorizedLocals.push_back(VectorLocal);
-        }
-      } else if (InstItr.getDebugLoc()) {
+      if (InstItr.getDebugLoc()) {
         // Update debug info line numbers to have vectorized kernel scope,
         // taking care to preserve inlined locations.
         const DebugLoc &ScalarLoc = InstItr.getDebugLoc();
@@ -413,19 +320,14 @@ void cloneDebugInfo(const VectorizationUnit &VU) {
     }
   }
 
-  // Delete intrinsics we have replaced
-  for (auto Instr : DIIntrinsicsToDelete) {
-    Instr->eraseFromParent();
-  }
-
   // Replace temporary MDNode with list of vectorized DILocals we have created
   // In LLVM 7.0 the variables attribute of DISubprogram was changed to
   // retainedNodes
   auto *VectorizedKernelVariables = VectorDI->getRetainedNodes().get();
   assert(VectorizedKernelVariables && "Could not get retained nodes");
   if (VectorizedKernelVariables->isTemporary()) {
-    auto NewLocals = MDTuple::getTemporary(
-        VectorizedKernelVariables->getContext(), VectorizedLocals);
+    auto NewLocals =
+        MDTuple::getTemporary(VectorizedKernelVariables->getContext(), {});
     VectorizedKernelVariables->replaceAllUsesWith(NewLocals.get());
   }
 

From 5cddabcdcb949f0016796a9705646bbcaff2db62 Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Mon, 21 Jul 2025 12:02:24 +0100
Subject: [PATCH 167/182] Fix check for GEPs with scalar indices

When a GEP has a scalar index, indexPackets holds a null pointer. We
were mistakenly not checking for that; instead, we were checking for it
to hold a SimdPacket that holds a null pointer, which cannot happen.
This PR corrects the check.
---
 .../vecz/source/transform/scalarizer.cpp      |  2 +-
 .../vecz/test/lit/llvm/scalarize-gep.ll       | 72 +++++++++++++++++++
 2 files changed, 73 insertions(+), 1 deletion(-)
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-gep.ll

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
index ba1a80885a808..e70356989c4f9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
@@ -1519,7 +1519,7 @@ SimdPacket *Scalarizer::scalarizeGEP(GetElementPtrInst *GEP, PacketMask PM) {
     SmallVector<Value *, 4> scalarIndices;
     unsigned indexN = 1U;
     for (auto *idx : indexPackets) {
-      if (idx->at(i)) {
+      if (idx) {
         scalarIndices.push_back(idx->at(i));
       } else {
         scalarIndices.push_back(GEP->getOperand(indexN));
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-gep.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-gep.ll
new file mode 100644
index 0000000000000..8b56e4d5aad5c
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-gep.ll
@@ -0,0 +1,72 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k gep -vecz-simd-width=4 -vecz-passes=scalarize -vecz-choices=FullScalarization -S < %s | FileCheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define dso_local spir_kernel void @gep(ptr addrspace(1) %data, ptr addrspace(1) %out) {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 noundef 0)
+  %ptrdata = getelementptr inbounds <2 x ptr>, ptr addrspace(1) %data, i64 %call
+  %ptrdatavec = load <2 x ptr addrspace(1)>, ptr addrspace(1) %ptrdata
+  %ptrdatavec.gep = getelementptr inbounds i32, <2 x ptr addrspace(1)> %ptrdatavec, i64 1
+  %vec1 = call <2 x i32> @llvm.masked.gather.v2i32.v2p1(<2 x ptr addrspace(1)> %ptrdatavec, i32 16, <2 x i1> zeroinitializer, <2 x ptr addrspace(1)> zeroinitializer)
+  %vec2 = call <2 x i32> @llvm.masked.gather.v2i32.v2p1(<2 x ptr addrspace(1)> %ptrdatavec.gep, i32 16, <2 x i1> zeroinitializer, <2 x ptr addrspace(1)> zeroinitializer)
+  %vec.add = add <2 x i32> %vec1, %vec2
+  %ptrout = getelementptr inbounds <2 x i32>, ptr addrspace(1) %out, i64 %call
+  store <2 x i32> %vec.add, ptr addrspace(1) %ptrout
+  ret void
+}
+
+declare i64 @__mux_get_global_id(i32 noundef)
+
+declare <2 x i32> @llvm.masked.gather.v2i32.v2p1(<2 x ptr addrspace(1)>, i32, <2 x i1>, <2 x ptr addrspace(1)>)
+
+; The full scalarization has not completely removed the vectors: the gather
+; operation should have been replaced by non-vector loads. Still, check that at
+; least we do not crash.
+
+; CHECK: void @__vecz_v4_gep({{.*}})
+; CHECK: entry:
+; CHECK:   %call = tail call i64 @__mux_get_global_id(i32 noundef 0)
+; CHECK:   %ptrdata = getelementptr <2 x ptr>, ptr addrspace(1) %data, i64 %call
+; CHECK:   %0 = getelementptr ptr addrspace(1), ptr addrspace(1) %ptrdata, i32 0
+; CHECK:   %1 = getelementptr ptr addrspace(1), ptr addrspace(1) %ptrdata, i32 1
+; CHECK:   %ptrdatavec1 = load ptr addrspace(1), ptr addrspace(1) %0, align 1
+; CHECK:   %ptrdatavec2 = load ptr addrspace(1), ptr addrspace(1) %1, align 1
+; CHECK:   %2 = insertelement <2 x ptr addrspace(1)> undef, ptr addrspace(1) %ptrdatavec1, i32 0
+; CHECK:   %3 = insertelement <2 x ptr addrspace(1)> %2, ptr addrspace(1) %ptrdatavec2, i32 1
+; CHECK:   %ptrdatavec.gep3 = getelementptr i32, ptr addrspace(1) %ptrdatavec1, i64 1
+; CHECK:   %ptrdatavec.gep4 = getelementptr i32, ptr addrspace(1) %ptrdatavec2, i64 1
+; CHECK:   %4 = insertelement <2 x ptr addrspace(1)> undef, ptr addrspace(1) %ptrdatavec.gep3, i32 0
+; CHECK:   %5 = insertelement <2 x ptr addrspace(1)> %4, ptr addrspace(1) %ptrdatavec.gep4, i32 1
+; CHECK:   %vec1 = call <2 x i32> @llvm.masked.gather.v2i32.v2p1(<2 x ptr addrspace(1)> %3, i32 16, <2 x i1> zeroinitializer, <2 x ptr addrspace(1)> zeroinitializer)
+; CHECK:   %6 = extractelement <2 x i32> %vec1, i32 0
+; CHECK:   %7 = extractelement <2 x i32> %vec1, i32 1
+; CHECK:   %vec2 = call <2 x i32> @llvm.masked.gather.v2i32.v2p1(<2 x ptr addrspace(1)> %5, i32 16, <2 x i1> zeroinitializer, <2 x ptr addrspace(1)> zeroinitializer)
+; CHECK:   %8 = extractelement <2 x i32> %vec2, i32 0
+; CHECK:   %9 = extractelement <2 x i32> %vec2, i32 1
+; CHECK:   %vec.add5 = add i32 %6, %8
+; CHECK:   %vec.add6 = add i32 %7, %9
+; CHECK:   %ptrout = getelementptr <2 x i32>, ptr addrspace(1) %out, i64 %call
+; CHECK:   %10 = getelementptr i32, ptr addrspace(1) %ptrout, i32 0
+; CHECK:   %11 = getelementptr i32, ptr addrspace(1) %ptrout, i32 1
+; CHECK:   store i32 %vec.add5, ptr addrspace(1) %10, align 4
+; CHECK:   store i32 %vec.add6, ptr addrspace(1) %11, align 4
+; CHECK:   ret void

From 94a47191c478c9ed3af20291870706f79aaa6fa2 Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Mon, 21 Jul 2025 10:58:15 +0100
Subject: [PATCH 168/182] [NFC] Remove some old LLVM support from tests.

Our minimum supported LLVM version at the moment is LLVM 19, but we
still had some tests that handled LLVM 15 and LLVM 16. This PR removes
that handling.
---
 .../compiler_passes/vecz/test/lit/llvm/undef_ub.ll         | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/undef_ub.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/undef_ub.ll
index a3db9d0350186..a05dce1dc1481 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/undef_ub.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/undef_ub.ll
@@ -14,8 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %pp-llvm-ver -o %t < %s --llvm-ver %LLVMVER
-; RUN: veczc -k test -w 4 -S < %s | FileCheck %t
+; RUN: veczc -k test -w 4 -S < %s | FileCheck %s
 
 ; ModuleID = 'Unknown buffer'
 source_filename = "Unknown buffer"
@@ -41,6 +40,4 @@ entry:
 ; The "undefs" in the above IR should "optimize" to a trap call and an unreachable
 ; terminator instruction.
 ; CHECK: define spir_kernel void @__vecz_v4_test
-; Before LLVM 17 there's no such trap: the UB is just that the function returns early.
-; CHECK-LT17: ret void
-; CHECK-GE17: unreachable
+; CHECK: unreachable

From 91e710485bcd27599fe863bfb2ab2cd4183045d8 Mon Sep 17 00:00:00 2001
From: Colin Davidson <colin.davidson@codeplay.com>
Date: Tue, 19 Aug 2025 13:44:50 +0100
Subject: [PATCH 169/182] Remove CA references under compiler_pipeline and vecz

There are multiple references which are not publicly visible. Remove
the ones under compiler_pipeline and vecz as part of this process.
---
 .../include/compiler/utils/device_info.h                 | 2 +-
 .../compiler_pipeline/source/builtin_info.cpp            | 2 +-
 .../compiler_pipeline/source/cl_builtin_info.cpp         | 6 +++---
 .../compiler_pipeline/source/mux_builtin_info.cpp        | 1 -
 .../source/replace_local_module_scope_variables_pass.cpp | 1 -
 .../source/unique_opaque_structs_pass.cpp                | 2 +-
 .../compiler_pipeline/source/work_item_loops_pass.cpp    | 2 +-
 .../compiler_passes/vecz/source/control_flow_boscc.cpp   | 9 ++++-----
 .../compiler_passes/vecz/source/pass.cpp                 | 3 ++-
 .../vecz/source/transform/basic_mem2reg_pass.cpp         | 5 ++---
 .../source/transform/control_flow_conversion_pass.cpp    | 2 +-
 .../compiler_passes/vecz/source/transform/packetizer.cpp | 7 +++----
 .../vecz/source/transform/ternary_transform_pass.cpp     | 2 +-
 .../compiler_passes/vecz/source/vector_target_info.cpp   | 2 +-
 .../vecz/source/vector_target_info_arm.cpp               | 4 ++--
 .../vecz/source/vector_target_info_riscv.cpp             | 2 +-
 .../vecz/source/vectorization_context.cpp                | 2 +-
 .../vecz/test/lit/llvm/basic_vecz_mem2reg.ll             | 4 ++--
 .../vecz/test/lit/llvm/builtin_pointer_return.ll         | 3 +--
 .../test/lit/llvm/partial_linearization_exit_masks.ll    | 2 +-
 .../vecz/test/lit/llvm/ternary_transform.ll              | 2 +-
 21 files changed, 30 insertions(+), 35 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/device_info.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/device_info.h
index 177ae0c99b4df..36ff58c4d0e7b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/device_info.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/device_info.h
@@ -35,7 +35,7 @@ namespace utils {
 ///
 /// NOTE: Must be kept in sync with mux_floating_point_capabilities_e in
 /// mux/include/mux/mux.h! This should probably be placed in an intermediary
-/// mux/compiler library and shared as part of CA-4236.
+/// mux/compiler library and shared.
 enum device_floating_point_capabilities_e {
   /// @brief Denormals supported.
   device_floating_point_capabilities_denorm = 0x1,
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/builtin_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/builtin_info.cpp
index 2fa7ae9b9b926..ac09df2b062a2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/builtin_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/builtin_info.cpp
@@ -314,7 +314,7 @@ BuiltinUniformity BuiltinInfo::isBuiltinUniform(const Builtin &B,
     case eMuxBuiltinGetLocalLinearId:
     case eMuxBuiltinGetGlobalLinearId:
       // TODO: This is fine for vectorizing in the x-axis, but currently we do
-      // not support vectorizing along y or z (see CA-2843).
+      // not support vectorizing along y or z.
       return SimdDimIdx ? eBuiltinUniformityNever
                         : eBuiltinUniformityInstanceID;
   }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp
index 27e8fe7541f1e..7029af0c0cd32 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp
@@ -1533,7 +1533,7 @@ Value *CLBuiltinInfo::emitBuiltinInline(Function *F, IRBuilder<> &B,
         NameMangler Mangler(&F->getContext());
         const auto name = Mangler.demangleName(F->getName());
         if (name == "vload_half") {
-          // TODO CA-4691 handle "vload_halfn"
+          // TODO handle "vload_halfn"
           return emitBuiltinInlineVLoadHalf(F, B, Args);
         }
       } break;
@@ -1551,7 +1551,7 @@ Value *CLBuiltinInfo::emitBuiltinInline(Function *F, IRBuilder<> &B,
         NameMangler Mangler(&F->getContext());
         Lexer L(Mangler.demangleName(F->getName()));
         if (L.Consume("vstore_half")) {
-          // TODO CA-4691 handle "vstore_halfn"
+          // TODO handle "vstore_halfn"
           return emitBuiltinInlineVStoreHalf(F, L.TextLeft(), B, Args);
         }
       } break;
@@ -1969,7 +1969,7 @@ Value *CLBuiltinInfo::emitBuiltinInlineAsLLVMBinaryIntrinsic(
   const Triple TT(B.GetInsertBlock()->getModule()->getTargetTriple());
   if (TT.getArch() == Triple::arm || TT.getArch() == Triple::aarch64) {
     // fmin and fmax fail CTS on arm targets.
-    // This is a HACK and should be removed when CA-3595 is resolved.
+    // This is a HACK and should be removed when the CTS failure is resolved.
     return nullptr;
   }
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mux_builtin_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mux_builtin_info.cpp
index 706a197a75a97..f9e206aaf44e5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mux_builtin_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mux_builtin_info.cpp
@@ -531,7 +531,6 @@ Function *BIMuxInfoConcept::defineMemBarrier(Function &F, unsigned,
   // our set of default set of targets can't make use of anything but a
   // single-threaded fence. We're also ignoring the kind of memory being
   // controlled by the barrier.
-  // See CA-2997 and CA-3042 for related discussions.
   auto &M = *F.getParent();
   setDefaultBuiltinAttributes(F);
   F.setLinkage(GlobalValue::InternalLinkage);
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp
index 1be6cdaec9226..afd407a657299 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp
@@ -696,7 +696,6 @@ PreservedAnalyses compiler::utils::ReplaceLocalModuleScopeVariablesPass::run(
 
       // We can't guarantee a subprogram for all functions.
       // FIXME: Should we be able to? Do we need to clone subprograms somehow?
-      // See CA-4241.
       if (!DISubprogram) {
         continue;
       }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/unique_opaque_structs_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/unique_opaque_structs_pass.cpp
index 0f630076d6884..0d04940827e2f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/unique_opaque_structs_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/unique_opaque_structs_pass.cpp
@@ -74,7 +74,7 @@ static bool shouldClone(compiler::utils::StructTypeRemapper &StructTypeRemapper,
     }
   }
 
-  // TODO: Check globals (see CA-3833).
+  // TODO: Check globals.
 
   // If an instruction makes use of a type but
   // isn't of that type e.g. a cast it will necessarily get caught by
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
index dc08a1faa4d75..6686558f20fab 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
@@ -1921,7 +1921,7 @@ PreservedAnalyses compiler::utils::WorkItemLoopsPass::run(
       // FIXME: Also mark them as internal in case they contain symbols we
       // haven't resolved as part of the work-item loop wrapping process. We
       // rely on GlobalOptPass to remove such functions; this is the same root
-      // issue as CA-4126.
+      // issue as some mux builtins requiring DCE for correctness.
       F.setLinkage(GlobalValue::InternalLinkage);
     }
   }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_boscc.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_boscc.cpp
index 44fe2dae7a73f..9ec9d6d686dd4 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_boscc.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_boscc.cpp
@@ -293,7 +293,6 @@ bool ControlFlowConversionState::BOSCCGadget::createUniformRegions(
     // take several elements into account:
     // - The length of the duplicated code
     // - branch probability
-    // - TODO: CA-1221
     // size_t cost =
     //    std::accumulate(Region->predicatedBlocks.begin(),
     //    Region->predicatedBlocks.end(), 0,
@@ -352,11 +351,11 @@ bool ControlFlowConversionState::BOSCCGadget::createUniformRegions(
   // doesn't seem to matter, as long as we can fully identify the predicated
   // subset of the SESE region, so we are really working with Multiple-Entry,
   // Single-Exit regions here. This was the cause of the BOSCC Back Door bug
-  // that was encountered previously (CA-2711), where the entry block of a
+  // that was encountered previously, where the entry block of a
   // supposed SESE region did not actually dominate everything in the region,
   // which in this case was caused by an additional non-divergent code path
   // (the "back door" entry point), but it is equally possible for two
-  // divergence-causing branches to enter a predicated region (CA-3194).
+  // divergence-causing branches to enter a predicated region.
   //
   // a)    A*      b)    A       c)    A       d)    A      .
   //      / \           / \           / \           / \     .
@@ -383,7 +382,7 @@ bool ControlFlowConversionState::BOSCCGadget::createUniformRegions(
   // immediate post-dominator of B, the first-encountered divergence causing
   // block. Therefore the two overlapping regions have different exit blocks.
   //
-  // Another situation can arise (CA-3851) where the SESE region can contain
+  // Another situation can arise where the SESE region can contain
   // two completely unconnected predicated subregions. Although the DCBI is
   // SESE compact, a SESE region can still contain other, nested SESE regions.
   // Since an entry point into the predicated subregion is not necessarily the
@@ -648,7 +647,7 @@ bool ControlFlowConversionState::BOSCCGadget::connectBOSCCRegions() {
   VECZ_FAIL_IF(!computeBlockOrdering());
 
   // NOTE doing the Liveness Analysis here is potentially dangerous, since we
-  // have yet to fully restore SSA form (CA-3703).
+  // have yet to fully restore SSA form.
   liveness = &AM.getResult<LivenessAnalysis>(F);
   RC->recalculate(F);
   VECZ_FAIL_IF(!blendFinalize());
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp
index 80cd9b9967110..a904e557847ed 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp
@@ -112,7 +112,8 @@ PreservedAnalyses RunVeczPass::run(Module &M, ModuleAnalysisManager &MAM) {
   } else {
     if (auto Err = Mach.getPB().parsePassPipeline(PM, VeczPassPipeline)) {
       // NOTE this is a command line user error print, not a debug print.
-      // We may want to hoist this out of Vecz once CA-4134 is resolved.
+      // We may want to hoist this out of Vecz once RunVeczPass has been
+      // replaced with a pass builder.
       errs() << "vecz pipeline: " << toString(std::move(Err)) << "\n";
       return PreservedAnalyses::all();
     }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp
index fa29f71b8894e..72b8488942d4e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp
@@ -119,7 +119,7 @@ bool BasicMem2RegPass::canPromoteAlloca(AllocaInst *Alloca) const {
       //   %v = load i16, ptr %a
       // We can only promote the alloca if we can bitcast between the two
       // underlying types as well.
-      // We could probably zero-extend or trunc if we had to? See CA-4382.
+      // We could probably zero-extend or trunc if we had to?
       const unsigned DstPointeeBits = U->getType()->getPrimitiveSizeInBits();
       if (!DstPointeeBits || SrcPointeeBits != DstPointeeBits) {
         return false;
@@ -213,8 +213,7 @@ bool BasicMem2RegPass::promoteAlloca(AllocaInst *Alloca) const {
       //   %a = alloca i32
       //   store i16, ptr %a
       //   %v = load i32, ptr %a
-      // Note: we could do other things if the type sizes didn't match. See
-      // CA-4382.
+      // Note: we could do other things if the type sizes didn't match.
       if (Load->getType()->getPrimitiveSizeInBits() !=
           NewValue->getType()->getPrimitiveSizeInBits()) {
         return false;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
index b22e429bb1c53..40f05dea2a145 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
@@ -1333,7 +1333,7 @@ bool ControlFlowConversionState::Impl::applyMaskToCall(CallInst *CI,
   if (!callee) {
     callee = dyn_cast<Function>(CI->getCalledOperand()->stripPointerCasts());
   }
-  VECZ_FAIL_IF(!callee);  // TODO: CA-1505: Support indirect function calls.
+  VECZ_FAIL_IF(!callee);  // TODO: Support indirect function calls.
   // Check to see if this is a function that we know we won't be able to
   // handle in any other way.
   VECZ_FAIL_IF(callee->cannotDuplicate());
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
index 8b75aa9fe02a1..a329a4c02b273 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -536,7 +536,7 @@ bool Packetizer::Impl::packetize() {
               B, VectorType::get(B.getInt32Ty(), SimdWidth), "index.vec");
         }
 
-        // CA-3943 this implementation looks unlikely to be correct, but for
+        // This implementation looks unlikely to be correct, but for
         // now we just maintain the original behaviour, until we have a better
         // idea of what is going on or whether any of this is still needed.
         // This case will never be encountered during kernel vectorization.
@@ -1239,7 +1239,7 @@ Value *Packetizer::Impl::packetizeGroupReduction(Instruction *I) {
 
   // Reduce the packet values in-place.
   // TODO: can we add 'reassoc' to the floating-point reductions to absolve
-  // them of ordering? See CA-3969.
+  // them of ordering?
   op.getPacketValues(packetWidth, opPackets);
 
   assert((!VL || packetWidth) &&
@@ -2088,8 +2088,7 @@ ValuePacket Packetizer::Impl::packetizeCall(CallInst *CI) {
     // Some llvm intrinsic functions like abs have argument that are constants
     // and define as llvm_i1_ty. This means that thoses operand can't
     // be packetized. To solve that temporary, we use this vector so every
-    // cases can set independently what operand must be skipped
-    // CA-3696
+    // cases can set independently what operand must be skipped.
     SmallVector<bool, maxOperands> operandsToSkip(maxOperands, false);
     switch (IntrID) {
       case Intrinsic::abs:
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/ternary_transform_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/ternary_transform_pass.cpp
index d6707529dfca5..d32406c693441 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/ternary_transform_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/ternary_transform_pass.cpp
@@ -107,7 +107,7 @@ bool shouldTransform(SelectInst *Select, const StrideAnalysisResult &SAR) {
     // scalar loads and stores. Performing this transform on vectors was
     // historically banned due to internal limitations, but these days we
     // *should* be able to. It's just that we don't know whether it's
-    // beneficial: see CA-4337.
+    // beneficial.
     for (User *U : GEP->users()) {
       if (auto *const LI = dyn_cast<LoadInst>(U)) {
         if (LI->getType()->isVectorTy()) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
index ea6b305c082c5..7712f5a682d37 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
@@ -1300,7 +1300,7 @@ unsigned TargetInfo::getVectorWidthForType(const llvm::TargetTransformInfo &TTI,
   }
 
   // The floor of 8 prevents poor double precision performance.
-  // Not sure why (CA-3461 related?)
+  // Not sure why.
   return std::max(MaxVecRegBitWidth / BitWidth, 8u);
 }
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_arm.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_arm.cpp
index ea5c7284e2556..d74c8b2d08c31 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_arm.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_arm.cpp
@@ -171,7 +171,7 @@ bool TargetInfoArm::optimizeInterleavedGroup(IRBuilder<> &B,
   VECZ_FAIL_IF(HasMask);
   VECZ_FAIL_IF(stride < 0);
 
-  // TODO CA-3100 fetch information on SubTargetInfo
+  // TODO fetch information on SubTargetInfo
   // load instructions seems to be easily split in the backend whereas stores
   // generate a backend error because of invalid data type on vector operands.
   // Vector operands are enabled in the backend only when SubTargetInfo ensures
@@ -337,7 +337,7 @@ bool TargetInfoAArch64::optimizeInterleavedGroup(
   VECZ_FAIL_IF(HasMask);
   VECZ_FAIL_IF(stride < 0);
 
-  // TODO CA-3100 fetch information on SubTargetInfo
+  // TODO fetch information on SubTargetInfo
   // load instructions seems to be easily split in the backend whereas stores
   // generate a backend error because of invalid data type on vector operands.
   // Vector operands are enabled in the backend only when SubTargetInfo ensures
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp
index 99f97a5d3f280..99e6cc7837982 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp
@@ -101,7 +101,7 @@ bool TargetInfoRISCV::isVPVectorLegal(const llvm::Function &F,
 
 // Should be target-dependent. Take RISCV legal types for now.
 // FIXME: LLVM 14 adds better support for legalization of vp intrinsics, but
-// not RISCV ones like vrgather_vv. See CA-4071.
+// not RISCV ones like vrgather_vv.
 bool TargetInfoRISCV::isVectorTypeLegal(Type *Ty) const {
   assert(Ty->isVectorTy() && "Expecting a vector type.");
   (void)Ty;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
index 2079d53431942..d721828133ef9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
@@ -233,7 +233,7 @@ Function *VectorizationContext::getOrCreateMaskedFunction(CallInst *CI) {
   if (!F) {
     F = dyn_cast<Function>(CI->getCalledOperand()->stripPointerCasts());
   }
-  VECZ_FAIL_IF(!F);  // TODO: CA-1505: Support indirect function calls.
+  VECZ_FAIL_IF(!F);  // TODO: Support indirect function calls.
   LLVMContext &ctx = F->getContext();
 
   // We will handle printf statements, but handling every possible vararg
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/basic_vecz_mem2reg.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/basic_vecz_mem2reg.ll
index fcfbbee2bd38a..89df1272376cf 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/basic_vecz_mem2reg.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/basic_vecz_mem2reg.ll
@@ -59,14 +59,14 @@ declare i64 @__mux_get_global_id(i32)
 ; CHECK:  %4 = bitcast i32 %3 to <2 x i16>
 
 ; Note: we can't optimize this as the allocated type size and loaded type sizes
-; don't match. Maybe we could trunc %3 from i32 to i16? See CA-4382.
+; don't match. Maybe we could trunc %3 from i32 to i16?
 
 ; CHECK: define spir_kernel void @__vecz_v4_load_type_size_mismatch_no_bitcast(ptr addrspace(1) %p)
 ; CHECK:  %data = alloca i32, align 4
 ; CHECK:  %4 = load i16, ptr %data, align 2
 
 ; Note: we can't optimize this as the allocated type size and loaded type sizes
-; don't match. Maybe we could trunc %3 from i32 to i16? See CA-4382.
+; don't match. Maybe we could trunc %3 from i32 to i16?
 
 ; CHECK: define spir_kernel void @__vecz_v4_store_type_size_mismatch_no_bitcast(ptr addrspace(1) %p)
 ; CHECK:  %data = alloca i32, align 4
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_pointer_return.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_pointer_return.ll
index 74f64b5b77c12..3d3826d128333 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_pointer_return.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_pointer_return.ll
@@ -26,8 +26,7 @@ declare spir_func <2 x float> @_Z5fractDv2_fPS_(<2 x float>, <2 x float>*)
 declare spir_func <4 x float> @_Z5fractDv4_fPS_(<4 x float>, <4 x float>*)
 declare spir_func <8 x float> @_Z5fractDv8_fPS_(<8 x float>, <8 x float>*)
 
-; FIXME: Both of these are instantiating when we have vector equivalents: see
-; CA-4046.
+; FIXME: Both of these are instantiating when we have vector equivalents.
 
 define spir_kernel void @fract_v1(float* %xptr, float* %outptr, float* %ioutptr) {
   %iouta = alloca float
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization_exit_masks.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization_exit_masks.ll
index fad2ee91be6cb..111a4771b76f9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization_exit_masks.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization_exit_masks.ll
@@ -18,7 +18,7 @@
 
 ; This test ensures that VECZ does not crash during control flow conversion due
 ; to a missing exit mask. As such, we need only verify that the return code from
-; veczc is 0, and FileCheck is not required. See CA-3117 for details.
+; veczc is 0, and FileCheck is not required.
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform.ll
index 8121eec00c7c7..1d12758ff31f6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform.ll
@@ -107,7 +107,7 @@ declare i64 @__mux_get_global_id(i32)
 ; CHECK: %c2 = select i1 %cond, ptr %c0, ptr %c1
 ; CHECK: store i64 %b, ptr %c2, align 4
 
-; Note: we don't perform this transform on vector accesses - see CA-4337.
+; Note: we don't perform this transform on vector accesses.
 ; CHECK: define spir_kernel void @__vecz_v4_test_vector_scalar_cond(i64 %a, <2 x i32> %b, ptr %c)
 ; CHECK:   %gid = call i64 @__mux_get_global_id(i32 0)
 ; CHECK:   %cond = icmp eq i64 %a, %gid

From b7087e554ff8c7b080a0eaad54104e4569c1e708 Mon Sep 17 00:00:00 2001
From: Colin Davidson <colin.davidson@codeplay.com>
Date: Tue, 19 Aug 2025 12:28:57 +0100
Subject: [PATCH 170/182] Changed copyright license to point to the correct
 license file path

---
 .../compiler_pipeline/include/compiler/utils/address_spaces.h   | 2 +-
 .../compiler_pipeline/include/compiler/utils/attributes.h       | 2 +-
 .../compiler_pipeline/include/compiler/utils/barrier_regions.h  | 2 +-
 .../compiler_pipeline/include/compiler/utils/builtin_info.h     | 2 +-
 .../compiler_pipeline/include/compiler/utils/cl_builtin_info.h  | 2 +-
 .../include/compiler/utils/define_mux_builtins_pass.h           | 2 +-
 .../compiler_pipeline/include/compiler/utils/device_info.h      | 2 +-
 .../compiler_pipeline/include/compiler/utils/dma.h              | 2 +-
 .../include/compiler/utils/encode_kernel_metadata_pass.h        | 2 +-
 .../include/compiler/utils/group_collective_helpers.h           | 2 +-
 .../compiler_pipeline/include/compiler/utils/mangling.h         | 2 +-
 .../compiler_pipeline/include/compiler/utils/metadata.h         | 2 +-
 .../include/compiler/utils/optimal_builtin_replacement_pass.h   | 2 +-
 .../compiler_pipeline/include/compiler/utils/pass_functions.h   | 2 +-
 .../compiler_pipeline/include/compiler/utils/pass_machinery.h   | 2 +-
 .../include/compiler/utils/prepare_barriers_pass.h              | 2 +-
 .../compiler/utils/replace_local_module_scope_variables_pass.h  | 2 +-
 .../compiler_pipeline/include/compiler/utils/scheduling.h       | 2 +-
 .../include/compiler/utils/sub_group_analysis.h                 | 2 +-
 .../include/compiler/utils/target_extension_types.h             | 2 +-
 .../include/compiler/utils/unique_opaque_structs_pass.h         | 2 +-
 .../include/compiler/utils/work_item_loops_pass.h               | 2 +-
 .../compiler_pipeline/include/multi_llvm/dibuilder.h            | 2 +-
 .../compiler_pipeline/include/multi_llvm/instructions.h         | 2 +-
 .../compiler_pipeline/include/multi_llvm/instructions.inc       | 2 +-
 .../compiler_pipeline/include/multi_llvm/intrinsic.h            | 2 +-
 .../compiler_pipeline/include/multi_llvm/llvm_version.h         | 2 +-
 .../compiler_pipeline/include/multi_llvm/loop_utils.h           | 2 +-
 .../compiler_pipeline/include/multi_llvm/multi_llvm.h           | 2 +-
 .../include/multi_llvm/target_transform_info.h                  | 2 +-
 .../compiler_pipeline/include/multi_llvm/targetinfo.h           | 2 +-
 .../compiler_pipeline/include/multi_llvm/vector_type_helper.h   | 2 +-
 .../compiler_passes/compiler_pipeline/source/attributes.cpp     | 2 +-
 .../compiler_pipeline/source/barrier_regions.cpp                | 2 +-
 .../compiler_passes/compiler_pipeline/source/builtin_info.cpp   | 2 +-
 .../compiler_pipeline/source/cl_builtin_info.cpp                | 2 +-
 .../compiler_pipeline/source/define_mux_builtins_pass.cpp       | 2 +-
 .../compiler_passes/compiler_pipeline/source/dma.cpp            | 2 +-
 .../compiler_pipeline/source/encode_kernel_metadata_pass.cpp    | 2 +-
 .../compiler_pipeline/source/group_collective_helpers.cpp       | 2 +-
 .../compiler_passes/compiler_pipeline/source/mangling.cpp       | 2 +-
 .../compiler_passes/compiler_pipeline/source/metadata.cpp       | 2 +-
 .../compiler_pipeline/source/mux_builtin_info.cpp               | 2 +-
 .../source/optimal_builtin_replacement_pass.cpp                 | 2 +-
 .../compiler_passes/compiler_pipeline/source/pass_functions.cpp | 2 +-
 .../compiler_passes/compiler_pipeline/source/pass_machinery.cpp | 2 +-
 .../compiler_pipeline/source/prepare_barriers_pass.cpp          | 2 +-
 .../source/replace_local_module_scope_variables_pass.cpp        | 2 +-
 .../compiler_passes/compiler_pipeline/source/scheduling.cpp     | 2 +-
 .../compiler_pipeline/source/sub_group_analysis.cpp             | 2 +-
 .../compiler_pipeline/source/target_extension_types.cpp         | 2 +-
 .../compiler_pipeline/source/unique_opaque_structs_pass.cpp     | 2 +-
 .../compiler_pipeline/source/work_item_loops_pass.cpp           | 2 +-
 .../SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/pass.h | 2 +-
 .../compiler_passes/vecz/include/vecz/vecz_choices.h            | 2 +-
 .../compiler_passes/vecz/include/vecz/vecz_target_info.h        | 2 +-
 .../vecz/source/analysis/control_flow_analysis.cpp              | 2 +-
 .../vecz/source/analysis/divergence_analysis.cpp                | 2 +-
 .../vecz/source/analysis/instantiation_analysis.cpp             | 2 +-
 .../compiler_passes/vecz/source/analysis/liveness_analysis.cpp  | 2 +-
 .../vecz/source/analysis/packetization_analysis.cpp             | 2 +-
 .../vecz/source/analysis/simd_width_analysis.cpp                | 2 +-
 .../compiler_passes/vecz/source/analysis/stride_analysis.cpp    | 2 +-
 .../vecz/source/analysis/uniform_value_analysis.cpp             | 2 +-
 .../vecz/source/analysis/vectorizable_function_analysis.cpp     | 2 +-
 .../vecz/source/analysis/vectorization_unit_analysis.cpp        | 2 +-
 .../compiler_passes/vecz/source/control_flow_boscc.cpp          | 2 +-
 .../compiler_passes/vecz/source/control_flow_roscc.cpp          | 2 +-
 .../compiler_passes/vecz/source/debugging.cpp                   | 2 +-
 .../vecz/source/include/analysis/control_flow_analysis.h        | 2 +-
 .../vecz/source/include/analysis/divergence_analysis.h          | 2 +-
 .../vecz/source/include/analysis/instantiation_analysis.h       | 2 +-
 .../vecz/source/include/analysis/liveness_analysis.h            | 2 +-
 .../vecz/source/include/analysis/packetization_analysis.h       | 2 +-
 .../vecz/source/include/analysis/simd_width_analysis.h          | 2 +-
 .../vecz/source/include/analysis/stride_analysis.h              | 2 +-
 .../vecz/source/include/analysis/uniform_value_analysis.h       | 2 +-
 .../source/include/analysis/vectorizable_function_analysis.h    | 2 +-
 .../vecz/source/include/analysis/vectorization_unit_analysis.h  | 2 +-
 .../compiler_passes/vecz/source/include/control_flow_boscc.h    | 2 +-
 .../compiler_passes/vecz/source/include/control_flow_roscc.h    | 2 +-
 .../compiler_passes/vecz/source/include/debugging.h             | 2 +-
 .../compiler_passes/vecz/source/include/ir_cleanup.h            | 2 +-
 .../compiler_passes/vecz/source/include/llvm_helpers.h          | 2 +-
 .../compiler_passes/vecz/source/include/memory_operations.h     | 2 +-
 .../compiler_passes/vecz/source/include/offset_info.h           | 2 +-
 .../compiler_passes/vecz/source/include/reachability.h          | 2 +-
 .../compiler_passes/vecz/source/include/simd_packet.h           | 2 +-
 .../vecz/source/include/transform/common_gep_elimination_pass.h | 2 +-
 .../source/include/transform/control_flow_conversion_pass.h     | 2 +-
 .../source/include/transform/inline_post_vectorization_pass.h   | 2 +-
 .../vecz/source/include/transform/instantiation_pass.h          | 2 +-
 .../source/include/transform/interleaved_group_combine_pass.h   | 2 +-
 .../vecz/source/include/transform/packetization_helpers.h       | 2 +-
 .../vecz/source/include/transform/packetization_pass.h          | 2 +-
 .../compiler_passes/vecz/source/include/transform/packetizer.h  | 2 +-
 .../compiler_passes/vecz/source/include/transform/passes.h      | 2 +-
 .../vecz/source/include/transform/printf_scalarizer.h           | 2 +-
 .../vecz/source/include/transform/scalarization_pass.h          | 2 +-
 .../compiler_passes/vecz/source/include/transform/scalarizer.h  | 2 +-
 .../vecz/source/include/transform/ternary_transform_pass.h      | 2 +-
 .../compiler_passes/vecz/source/include/vectorization_context.h | 2 +-
 .../compiler_passes/vecz/source/include/vectorization_helpers.h | 2 +-
 .../vecz/source/include/vectorization_heuristics.h              | 2 +-
 .../compiler_passes/vecz/source/include/vectorization_unit.h    | 2 +-
 .../compiler_passes/vecz/source/include/vectorizer.h            | 2 +-
 .../compiler_passes/vecz/source/include/vecz_pass_builder.h     | 2 +-
 .../compiler_passes/vecz/source/ir_cleanup.cpp                  | 2 +-
 .../compiler_passes/vecz/source/llvm_helpers.cpp                | 2 +-
 .../compiler_passes/vecz/source/memory_operations.cpp           | 2 +-
 .../compiler_passes/vecz/source/offset_info.cpp                 | 2 +-
 .../lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp | 2 +-
 .../SYCLNativeCPUUtils/compiler_passes/vecz/source/passes.def   | 2 +-
 .../compiler_passes/vecz/source/reachability.cpp                | 2 +-
 .../compiler_passes/vecz/source/simd_packet.cpp                 | 2 +-
 .../vecz/source/transform/basic_mem2reg_pass.cpp                | 2 +-
 .../vecz/source/transform/builtin_inlining_pass.cpp             | 2 +-
 .../vecz/source/transform/common_gep_elimination_pass.cpp       | 2 +-
 .../vecz/source/transform/control_flow_conversion_pass.cpp      | 2 +-
 .../vecz/source/transform/inline_post_vectorization_pass.cpp    | 2 +-
 .../vecz/source/transform/instantiation_pass.cpp                | 2 +-
 .../vecz/source/transform/interleaved_group_combine_pass.cpp    | 2 +-
 .../vecz/source/transform/loop_rotate_custom_pass.cpp           | 2 +-
 .../vecz/source/transform/packetization_helpers.cpp             | 2 +-
 .../vecz/source/transform/packetization_pass.cpp                | 2 +-
 .../compiler_passes/vecz/source/transform/packetizer.cpp        | 2 +-
 .../compiler_passes/vecz/source/transform/passes.cpp            | 2 +-
 .../vecz/source/transform/pre_linearize_pass.cpp                | 2 +-
 .../compiler_passes/vecz/source/transform/printf_scalarizer.cpp | 2 +-
 .../vecz/source/transform/remove_intptr_pass.cpp                | 2 +-
 .../vecz/source/transform/scalarization_pass.cpp                | 2 +-
 .../compiler_passes/vecz/source/transform/scalarizer.cpp        | 2 +-
 .../vecz/source/transform/simplify_infinite_loop_pass.cpp       | 2 +-
 .../vecz/source/transform/squash_small_vectors_pass.cpp         | 2 +-
 .../vecz/source/transform/ternary_transform_pass.cpp            | 2 +-
 .../vecz/source/transform/uniform_reassociation_pass.cpp        | 2 +-
 .../compiler_passes/vecz/source/vector_target_info.cpp          | 2 +-
 .../compiler_passes/vecz/source/vector_target_info_arm.cpp      | 2 +-
 .../compiler_passes/vecz/source/vector_target_info_riscv.cpp    | 2 +-
 .../compiler_passes/vecz/source/vectorization_choices.cpp       | 2 +-
 .../compiler_passes/vecz/source/vectorization_context.cpp       | 2 +-
 .../compiler_passes/vecz/source/vectorization_helpers.cpp       | 2 +-
 .../compiler_passes/vecz/source/vectorization_heuristics.cpp    | 2 +-
 .../compiler_passes/vecz/source/vectorization_unit.cpp          | 2 +-
 .../compiler_passes/vecz/source/vectorizer.cpp                  | 2 +-
 .../compiler_passes/vecz/source/vecz_pass_builder.cpp           | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/AArch64/lit.local.cfg    | 2 +-
 .../vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_1.ll       | 2 +-
 .../vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_2.ll       | 2 +-
 .../vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_3.ll       | 2 +-
 .../vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_4.ll       | 2 +-
 .../vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_5.ll       | 2 +-
 .../vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_6.ll       | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/Boscc/boscc_killer.ll    | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge.ll     | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge2.ll    | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge3.ll    | 2 +-
 .../vecz/test/lit/llvm/Boscc/duplicate_preheader.ll             | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops1.ll   | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops2.ll   | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops3.ll   | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops4.ll   | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops5.ll   | 2 +-
 .../vecz/test/lit/llvm/Boscc/partial_linearization0.ll          | 2 +-
 .../vecz/test/lit/llvm/Boscc/partial_linearization1.ll          | 2 +-
 .../vecz/test/lit/llvm/Boscc/partial_linearization10.ll         | 2 +-
 .../vecz/test/lit/llvm/Boscc/partial_linearization11.ll         | 2 +-
 .../vecz/test/lit/llvm/Boscc/partial_linearization12.ll         | 2 +-
 .../vecz/test/lit/llvm/Boscc/partial_linearization13.ll         | 2 +-
 .../vecz/test/lit/llvm/Boscc/partial_linearization14.ll         | 2 +-
 .../vecz/test/lit/llvm/Boscc/partial_linearization15.ll         | 2 +-
 .../vecz/test/lit/llvm/Boscc/partial_linearization16.ll         | 2 +-
 .../vecz/test/lit/llvm/Boscc/partial_linearization17.ll         | 2 +-
 .../vecz/test/lit/llvm/Boscc/partial_linearization18.ll         | 2 +-
 .../vecz/test/lit/llvm/Boscc/partial_linearization19.ll         | 2 +-
 .../vecz/test/lit/llvm/Boscc/partial_linearization2.ll          | 2 +-
 .../vecz/test/lit/llvm/Boscc/partial_linearization20.ll         | 2 +-
 .../vecz/test/lit/llvm/Boscc/partial_linearization21.ll         | 2 +-
 .../vecz/test/lit/llvm/Boscc/partial_linearization22.ll         | 2 +-
 .../vecz/test/lit/llvm/Boscc/partial_linearization3.ll          | 2 +-
 .../vecz/test/lit/llvm/Boscc/partial_linearization4.ll          | 2 +-
 .../vecz/test/lit/llvm/Boscc/partial_linearization5.ll          | 2 +-
 .../vecz/test/lit/llvm/Boscc/partial_linearization6.ll          | 2 +-
 .../vecz/test/lit/llvm/Boscc/partial_linearization7.ll          | 2 +-
 .../vecz/test/lit/llvm/Boscc/partial_linearization8.ll          | 2 +-
 .../vecz/test/lit/llvm/Boscc/partial_linearization9.ll          | 2 +-
 .../vecz/test/lit/llvm/Boscc/scalable_linearization.ll          | 2 +-
 .../lit/llvm/PartialScalarization/define_interleaved_store.ll   | 2 +-
 .../PartialScalarization/define_interleaved_store_as_masked.ll  | 2 +-
 .../test/lit/llvm/PartialScalarization/vector_phi_uniform.ll    | 2 +-
 .../test/lit/llvm/PartialScalarization/vector_phi_varying.ll    | 2 +-
 .../vecz/test/lit/llvm/RISCV/broadcast_vector.ll                | 2 +-
 .../vecz/test/lit/llvm/RISCV/define_subgroup_scans.ll           | 2 +-
 .../vecz/test/lit/llvm/RISCV/define_subgroup_scans_vp.ll        | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/RISCV/extract_element.ll | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/RISCV/insert_element.ll  | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/RISCV/lit.local.cfg      | 2 +-
 .../vecz/test/lit/llvm/RISCV/packetize_shuffle.ll               | 2 +-
 .../vecz/test/lit/llvm/RISCV/packetize_shuffle_bool.ll          | 2 +-
 .../vecz/test/lit/llvm/RISCV/packetize_shuffle_concat.ll        | 2 +-
 .../vecz/test/lit/llvm/RISCV/packetize_shuffle_narrow.ll        | 2 +-
 .../vecz/test/lit/llvm/RISCV/packetize_shuffle_wider.ll         | 2 +-
 .../vecz/test/lit/llvm/RISCV/select_scalar_vector.ll            | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/RISCV/vp_memops.ll       | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/RISCV/vp_vsetvli.ll      | 2 +-
 .../vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll      | 2 +-
 .../vecz/test/lit/llvm/ScalableVectors/builtins.ll              | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/ScalableVectors/cast.ll  | 2 +-
 .../vecz/test/lit/llvm/ScalableVectors/cmpxchg.ll               | 2 +-
 .../test/lit/llvm/ScalableVectors/define_interleaved_store.ll   | 2 +-
 .../llvm/ScalableVectors/define_interleaved_store_as_masked.ll  | 2 +-
 .../vecz/test/lit/llvm/ScalableVectors/define_masked_load.ll    | 2 +-
 .../lit/llvm/ScalableVectors/define_masked_scatter_gather.ll    | 2 +-
 .../vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans.ll | 2 +-
 .../test/lit/llvm/ScalableVectors/define_subgroup_scans_vp.ll   | 2 +-
 .../vecz/test/lit/llvm/ScalableVectors/extract_element.ll       | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/ScalableVectors/fadd.ll  | 2 +-
 .../vecz/test/lit/llvm/ScalableVectors/fail_builtins.ll         | 2 +-
 .../vecz/test/lit/llvm/ScalableVectors/insert_element.ll        | 2 +-
 .../vecz/test/lit/llvm/ScalableVectors/interleaved_load.ll      | 2 +-
 .../vecz/test/lit/llvm/ScalableVectors/intrinsics.ll            | 2 +-
 .../vecz/test/lit/llvm/ScalableVectors/lit.local.cfg            | 2 +-
 .../vecz/test/lit/llvm/ScalableVectors/load_add_store.ll        | 2 +-
 .../vecz/test/lit/llvm/ScalableVectors/load_binops_store.ll     | 2 +-
 .../vecz/test/lit/llvm/ScalableVectors/metadata.ll              | 2 +-
 .../test/lit/llvm/ScalableVectors/packetize_mask_varying.ll     | 2 +-
 .../vecz/test/lit/llvm/ScalableVectors/scalable_auto.ll         | 2 +-
 .../vecz/test/lit/llvm/ScalableVectors/select.ll                | 2 +-
 .../vecz/test/lit/llvm/ScalableVectors/select_scalar_vector.ll  | 2 +-
 .../vecz/test/lit/llvm/ScalableVectors/shuffle.ll               | 2 +-
 .../vecz/test/lit/llvm/ScalableVectors/store_literal_struct.ll  | 2 +-
 .../vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll     | 2 +-
 .../vecz/test/lit/llvm/ScalableVectors/subgroup_scans.ll        | 2 +-
 .../subgroup_scans_spv_khr_uniform_group_instructions.ll        | 2 +-
 .../subgroup_scans_spv_khr_uniform_group_instructions_vp.ll     | 2 +-
 .../vecz/test/lit/llvm/ScalableVectors/subgroup_scans_vp.ll     | 2 +-
 .../vecz/test/lit/llvm/ScalableVectors/vectors.ll               | 2 +-
 .../vecz/test/lit/llvm/ScalableVectors/verification_fail_phi.ll | 2 +-
 .../vecz/test/lit/llvm/ScalableVectors/widen_vload.ll           | 2 +-
 .../vecz/test/lit/llvm/ScalableVectors/workitem_funcs.ll        | 2 +-
 .../vecz/test/lit/llvm/VectorPredication/boscc_reduction.ll     | 2 +-
 .../vecz/test/lit/llvm/VectorPredication/choice.ll              | 2 +-
 .../test/lit/llvm/VectorPredication/compute_vector_length.ll    | 2 +-
 .../lit/llvm/VectorPredication/define_interleaved_load_store.ll | 2 +-
 .../test/lit/llvm/VectorPredication/define_masked_load_store.ll | 2 +-
 .../lit/llvm/VectorPredication/define_masked_scatter_gather.ll  | 2 +-
 .../test/lit/llvm/VectorPredication/define_subgroup_scans.ll    | 2 +-
 .../vecz/test/lit/llvm/VectorPredication/load_add_store.ll      | 2 +-
 .../vecz/test/lit/llvm/VectorPredication/masked_atomics.ll      | 2 +-
 .../test/lit/llvm/VectorPredication/packetize_mask_varying.ll   | 2 +-
 .../vecz/test/lit/llvm/VectorPredication/scatter_gather.ll      | 2 +-
 .../vecz/test/lit/llvm/VectorPredication/subgroup_reductions.ll | 2 +-
 .../subgroup_reductions_spv_khr_uniform_group_instructions.ll   | 2 +-
 .../vecz/test/lit/llvm/VectorPredication/subgroup_scans.ll      | 2 +-
 .../subgroup_scans_spv_khr_uniform_group_instructions.ll        | 2 +-
 .../vecz/test/lit/llvm/VectorPredication/udiv.ll                | 2 +-
 .../test/lit/llvm/VectorWidening/define_interleaved_load.ll     | 2 +-
 .../llvm/VectorWidening/define_interleaved_load_as_masked.ll    | 2 +-
 .../test/lit/llvm/VectorWidening/delete_packetized_memop.ll     | 2 +-
 .../lit/llvm/VectorWidening/extractelement_constant_index.ll    | 2 +-
 .../lit/llvm/VectorWidening/extractelement_runtime_index.ll     | 2 +-
 .../lit/llvm/VectorWidening/extractelement_runtime_index2.ll    | 2 +-
 .../lit/llvm/VectorWidening/extractelement_runtime_index3.ll    | 2 +-
 .../lit/llvm/VectorWidening/insertelement_constant_index.ll     | 2 +-
 .../insertelement_constant_index_constant_value.ll              | 2 +-
 .../test/lit/llvm/VectorWidening/insertelement_runtime_index.ll | 2 +-
 .../vecz/test/lit/llvm/VectorWidening/interleaved_safety.ll     | 2 +-
 .../lit/llvm/VectorWidening/onearg_relationals_isfiniteDv4_d.ll | 2 +-
 .../lit/llvm/VectorWidening/onearg_relationals_isfiniteDv4_f.ll | 2 +-
 .../lit/llvm/VectorWidening/onearg_relationals_isinfDv4_d.ll    | 2 +-
 .../lit/llvm/VectorWidening/onearg_relationals_isinfDv4_f.ll    | 2 +-
 .../lit/llvm/VectorWidening/onearg_relationals_isnanDv4_d.ll    | 2 +-
 .../lit/llvm/VectorWidening/onearg_relationals_isnanDv4_f.ll    | 2 +-
 .../lit/llvm/VectorWidening/onearg_relationals_isnormalDv4_d.ll | 2 +-
 .../lit/llvm/VectorWidening/onearg_relationals_isnormalDv4_f.ll | 2 +-
 .../vecz/test/lit/llvm/VectorWidening/scalar_vector_user.ll     | 2 +-
 .../vecz/test/lit/llvm/VectorWidening/vector_copy.ll            | 2 +-
 .../vecz/test/lit/llvm/VectorWidening/vector_phi_varying.ll     | 2 +-
 .../vecz/test/lit/llvm/VectorWidening/widen_abs.ll              | 2 +-
 .../vecz/test/lit/llvm/VectorWidening/widen_binops.ll           | 2 +-
 .../vecz/test/lit/llvm/VectorWidening/widen_copysign.ll         | 2 +-
 .../vecz/test/lit/llvm/VectorWidening/widen_fma.ll              | 2 +-
 .../test/lit/llvm/VectorWidening/widen_fmin_vector_scalar.ll    | 2 +-
 .../vecz/test/lit/llvm/VectorWidening/widen_fmuladd.ll          | 2 +-
 .../vecz/test/lit/llvm/VectorWidening/widen_fmuladd2.ll         | 2 +-
 .../vecz/test/lit/llvm/VectorWidening/widen_fmuladd_phi.ll      | 2 +-
 .../vecz/test/lit/llvm/VectorWidening/widen_fshl.ll             | 2 +-
 .../vecz/test/lit/llvm/VectorWidening/widen_fshr.ll             | 2 +-
 .../vecz/test/lit/llvm/VectorWidening/widen_shufflevector.ll    | 2 +-
 .../vecz/test/lit/llvm/VectorWidening/widen_sqrt.ll             | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/alloca_alias.ll          | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/arm_neon_store.ll        | 2 +-
 .../vecz/test/lit/llvm/async_workgroup_copy_uniform.ll          | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/atomic_cmpxchg.ll        | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/atomicrmw.ll             | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/atomicrmw_uniform.ll     | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/basic_mem2reg.ll         | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/basic_vecz_mem2reg.ll    | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/bitcast_function.ll      | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/branch_splitting_and.ll  | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/branch_splitting_or.ll   | 2 +-
 .../vecz/test/lit/llvm/builtin_inlining_addsat.ll               | 2 +-
 .../vecz/test/lit/llvm/builtin_inlining_clamp.ll                | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/builtin_inlining_fmax.ll | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/builtin_inlining_fmin.ll | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/builtin_inlining_mem.ll  | 2 +-
 .../vecz/test/lit/llvm/builtin_inlining_memcpy.ll               | 2 +-
 .../vecz/test/lit/llvm/builtin_inlining_negative.ll             | 2 +-
 .../vecz/test/lit/llvm/builtin_inlining_positive.ll             | 2 +-
 .../vecz/test/lit/llvm/builtin_pointer_return.ll                | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/call_instantiation.ll    | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/cmpxchg.ll               | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/constant_address.ll      | 2 +-
 .../vecz/test/lit/llvm/constant_address_with_uniform.ll         | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/contiguous_allocas.ll    | 2 +-
 .../vecz/test/lit/llvm/control_flow_conversion_nested_loops.ll  | 2 +-
 .../vecz/test/lit/llvm/control_flow_conversion_order_y.ll       | 2 +-
 .../vecz/test/lit/llvm/control_flow_conversion_order_z.ll       | 2 +-
 .../vecz/test/lit/llvm/control_flow_conversion_ptrs.ll          | 2 +-
 .../vecz/test/lit/llvm/control_flow_conversion_uniform_if.ll    | 2 +-
 .../vecz/test/lit/llvm/control_flow_conversion_uniform_loop.ll  | 2 +-
 .../vecz/test/lit/llvm/control_flow_conversion_varying_if.ll    | 2 +-
 .../vecz/test/lit/llvm/control_flow_conversion_varying_loop.ll  | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/convert3.ll              | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/convert4.ll              | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/convert_contiguity.ll    | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/define_gather_load.ll    | 2 +-
 .../vecz/test/lit/llvm/define_gather_load_as_masked.ll          | 2 +-
 .../vecz/test/lit/llvm/define_interleaved_load.ll               | 2 +-
 .../vecz/test/lit/llvm/define_interleaved_load_as_masked.ll     | 2 +-
 .../vecz/test/lit/llvm/define_interleaved_store.ll              | 2 +-
 .../vecz/test/lit/llvm/define_interleaved_store_as_masked.ll    | 2 +-
 .../vecz/test/lit/llvm/define_internal_builtins.ll              | 2 +-
 .../vecz/test/lit/llvm/define_masked_gather_load.ll             | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/define_masked_load.ll    | 2 +-
 .../vecz/test/lit/llvm/define_masked_scatter_store.ll           | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/define_masked_store.ll   | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/define_scatter_store.ll  | 2 +-
 .../vecz/test/lit/llvm/define_scatter_store_as_masked.ll        | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/define_subgroup_scans.ll | 2 +-
 .../vecz/test/lit/llvm/delete_packetized_memop.ll               | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/device-sg-size-auto.ll   | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/divergent_loop_bug.ll    | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/diverging_loop.ll        | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/diverging_nested_loop.ll | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/early-cse-mul-swap.ll    | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/emit_memintrinsics.ll    | 2 +-
 .../vecz/test/lit/llvm/emit_no_unaligned_memintrinsics.ll       | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/expect_assume.ll         | 2 +-
 .../vecz/test/lit/llvm/extractelement_constant_index.ll         | 2 +-
 .../vecz/test/lit/llvm/extractelement_runtime_index.ll          | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/gep_duplication.ll       | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/gep_elim_opaque.ll       | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/indirect_call.ll         | 2 +-
 .../vecz/test/lit/llvm/inlined_function_debug_info.ll           | 2 +-
 .../vecz/test/lit/llvm/insert_element_debug_info.ll             | 2 +-
 .../vecz/test/lit/llvm/insertelement_constant_index.ll          | 2 +-
 .../vecz/test/lit/llvm/insertelement_runtime_index.ll           | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/instantiate_constants.ll | 2 +-
 .../vecz/test/lit/llvm/interleaved_defuse_instantiated.ll       | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/interleaved_load16.ll    | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/interleaved_load_ooo.ll  | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/interleaved_safety.ll    | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/intrinsics-scalarize.ll  | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/intrinsics.ll            | 2 +-
 .../vecz/test/lit/llvm/invalid_cached_assumption_regression.ll  | 2 +-
 .../vecz/test/lit/llvm/invalid_cached_vu_regression.ll          | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/irreducible_loop.ll      | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/load_add_store.ll        | 2 +-
 .../vecz/test/lit/llvm/loop_call_instantiation.ll               | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/masked_atomics.ll        | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/masked_atomics_scalar.ll | 2 +-
 .../vecz/test/lit/llvm/masked_calls_max_builtin.ll              | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/masked_cmpxchg.ll        | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/masked_cmpxchg_scalar.ll | 2 +-
 .../vecz/test/lit/llvm/masked_group_collective.ll               | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/masked_interleaved.ll    | 2 +-
 .../vecz/test/lit/llvm/masked_interleaved_as_scatter.ll         | 2 +-
 .../vecz/test/lit/llvm/masked_interleaved_group.ll              | 2 +-
 .../vecz/test/lit/llvm/masked_interleaved_group2.ll             | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/masked_store.ll          | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/masking_exit_blocks.ll   | 2 +-
 .../vecz/test/lit/llvm/mem2reg_alloca_pointer1.ll               | 2 +-
 .../vecz/test/lit/llvm/mem2reg_alloca_pointer2.ll               | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/memop_stride.ll          | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/memop_stride10.ll        | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/memop_stride11.ll        | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/memop_stride12.ll        | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/memop_stride13.ll        | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/memop_stride14.ll        | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/memop_stride15.ll        | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/memop_stride16.ll        | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/memop_stride17.ll        | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/memop_stride18.ll        | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/memop_stride2.ll         | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/memop_stride3.ll         | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/memop_stride4.ll         | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/memop_stride5.ll         | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/memop_stride6.ll         | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/memop_stride7.ll         | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/memop_stride8.ll         | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/memop_stride9.ll         | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/multiple_exit_blocks.ll  | 2 +-
 .../vecz/test/lit/llvm/multiple_kernels_inlining.ll             | 2 +-
 .../vecz/test/lit/llvm/multiple_vectorization_flags.ll          | 2 +-
 .../vecz/test/lit/llvm/multiple_vectorizations.ll               | 2 +-
 .../vecz/test/lit/llvm/multiple_vectorizations_nested.ll        | 2 +-
 .../vecz/test/lit/llvm/multiple_vectorizations_vp.ll            | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/no_instantiate_memop.ll  | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/no_over_scalarization.ll | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/no_vecz1.ll              | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/no_vecz2.ll              | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/offset_info_analysis.ll  | 2 +-
 .../vecz/test/lit/llvm/onearg_relationals_isfiniteDv4_d.ll      | 2 +-
 .../vecz/test/lit/llvm/onearg_relationals_isfiniteDv4_f.ll      | 2 +-
 .../vecz/test/lit/llvm/onearg_relationals_isfinited.ll          | 2 +-
 .../vecz/test/lit/llvm/onearg_relationals_isfinitef.ll          | 2 +-
 .../vecz/test/lit/llvm/onearg_relationals_isinfDv4_d.ll         | 2 +-
 .../vecz/test/lit/llvm/onearg_relationals_isinfDv4_f.ll         | 2 +-
 .../vecz/test/lit/llvm/onearg_relationals_isinfd.ll             | 2 +-
 .../vecz/test/lit/llvm/onearg_relationals_isinff.ll             | 2 +-
 .../vecz/test/lit/llvm/onearg_relationals_isnanDv4_d.ll         | 2 +-
 .../vecz/test/lit/llvm/onearg_relationals_isnanDv4_f.ll         | 2 +-
 .../vecz/test/lit/llvm/onearg_relationals_isnand.ll             | 2 +-
 .../vecz/test/lit/llvm/onearg_relationals_isnanf.ll             | 2 +-
 .../vecz/test/lit/llvm/onearg_relationals_isnormalDv4_d.ll      | 2 +-
 .../vecz/test/lit/llvm/onearg_relationals_isnormalDv4_f.ll      | 2 +-
 .../vecz/test/lit/llvm/onearg_relationals_isnormald.ll          | 2 +-
 .../vecz/test/lit/llvm/onearg_relationals_isnormalf.ll          | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/opencl_metadata1.ll      | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/opencl_metadata2.ll      | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/overaligned_allocas.ll   | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/packetization_branch.ll  | 2 +-
 .../vecz/test/lit/llvm/packetization_debug_info.ll              | 2 +-
 .../vecz/test/lit/llvm/packetization_nonvarying.ll              | 2 +-
 .../vecz/test/lit/llvm/packetization_uniform_branch.ll          | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/packetize_i48.ll         | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/packetize_phi_struct.ll  | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/packetize_struct_gep.ll  | 2 +-
 .../vecz/test/lit/llvm/packetize_uniform_conditional.ll         | 2 +-
 .../vecz/test/lit/llvm/packetize_uniform_default_conditional.ll | 2 +-
 .../vecz/test/lit/llvm/packetize_uniform_default_noreduce.ll    | 2 +-
 .../vecz/test/lit/llvm/packetize_uniform_default_noreduce2.ll   | 2 +-
 .../vecz/test/lit/llvm/packetize_uniform_default_reduce.ll      | 2 +-
 .../vecz/test/lit/llvm/packetize_uniform_loops_conditional.ll   | 2 +-
 .../vecz/test/lit/llvm/packetize_uniform_loops_noreduce.ll      | 2 +-
 .../vecz/test/lit/llvm/packetize_uniform_loops_noreduce2.ll     | 2 +-
 .../vecz/test/lit/llvm/packetize_uniform_loops_reduce.ll        | 2 +-
 .../vecz/test/lit/llvm/packetize_uniform_noreduce.ll            | 2 +-
 .../vecz/test/lit/llvm/packetize_uniform_noreduce2.ll           | 2 +-
 .../vecz/test/lit/llvm/packetize_uniform_reduce.ll              | 2 +-
 .../vecz/test/lit/llvm/partial_linearization0.ll                | 2 +-
 .../vecz/test/lit/llvm/partial_linearization1.ll                | 2 +-
 .../vecz/test/lit/llvm/partial_linearization10.ll               | 2 +-
 .../vecz/test/lit/llvm/partial_linearization11.ll               | 2 +-
 .../vecz/test/lit/llvm/partial_linearization12.ll               | 2 +-
 .../vecz/test/lit/llvm/partial_linearization13.ll               | 2 +-
 .../vecz/test/lit/llvm/partial_linearization14.ll               | 2 +-
 .../vecz/test/lit/llvm/partial_linearization15.ll               | 2 +-
 .../vecz/test/lit/llvm/partial_linearization16.ll               | 2 +-
 .../vecz/test/lit/llvm/partial_linearization17.ll               | 2 +-
 .../vecz/test/lit/llvm/partial_linearization18.ll               | 2 +-
 .../vecz/test/lit/llvm/partial_linearization19.ll               | 2 +-
 .../vecz/test/lit/llvm/partial_linearization2.ll                | 2 +-
 .../vecz/test/lit/llvm/partial_linearization20.ll               | 2 +-
 .../vecz/test/lit/llvm/partial_linearization21.ll               | 2 +-
 .../vecz/test/lit/llvm/partial_linearization22-llvm18.ll        | 2 +-
 .../vecz/test/lit/llvm/partial_linearization22.ll               | 2 +-
 .../vecz/test/lit/llvm/partial_linearization23.ll               | 2 +-
 .../vecz/test/lit/llvm/partial_linearization3.ll                | 2 +-
 .../vecz/test/lit/llvm/partial_linearization4.ll                | 2 +-
 .../vecz/test/lit/llvm/partial_linearization5.ll                | 2 +-
 .../vecz/test/lit/llvm/partial_linearization6.ll                | 2 +-
 .../vecz/test/lit/llvm/partial_linearization7.ll                | 2 +-
 .../vecz/test/lit/llvm/partial_linearization8.ll                | 2 +-
 .../vecz/test/lit/llvm/partial_linearization9.ll                | 2 +-
 .../vecz/test/lit/llvm/partial_linearization_exit_masks.ll      | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/pass_pipeline.ll         | 2 +-
 .../vecz/test/lit/llvm/pass_pipeline_printafter.ll              | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/phi_interleaved.ll       | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/phi_node_debug_info.ll   | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/phi_scatter_gather.ll    | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/phi_scatter_gather_2.ll  | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/predicate_with_switch.ll | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/preserve-fast-math.ll    | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/printf_float.ll          | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/regression_by_all.ll     | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/remove_intptr.ll         | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/remove_intptr_2.ll       | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/remove_intptr_phi.ll     | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/reqd-sg-size-auto.ll     | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/roscc_simplify.ll        | 2 +-
 .../vecz/test/lit/llvm/scalar_load_store_in_varying_branch.ll   | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/scalar_splat.ll          | 2 +-
 .../lit/llvm/scalar_splat_after_load_store_in_varying_branch.ll | 2 +-
 .../vecz/test/lit/llvm/scalar_splat_after_varying_branch.ll     | 2 +-
 .../vecz/test/lit/llvm/scalar_splat_in_varying_branch.ll        | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/scalar_vector_user.ll    | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/scalarization_calls.ll   | 2 +-
 .../vecz/test/lit/llvm/scalarization_calls_uniform.ll           | 2 +-
 .../vecz/test/lit/llvm/scalarization_debug_info.ll              | 2 +-
 .../vecz/test/lit/llvm/scalarization_instructions.ll            | 2 +-
 .../vecz/test/lit/llvm/scalarization_instructions_uniform.ll    | 2 +-
 .../vecz/test/lit/llvm/scalarization_masked_load_store.ll       | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/scalarize-bitcast.ll     | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/scalarize-gather.ll      | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/scalarize-gep.ll         | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/scalarize-splat.ll       | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/scalarize_mixed_gep.ll   | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/scan_fact.ll             | 2 +-
 .../vecz/test/lit/llvm/secretly_scalar_load_store.ll            | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/select-no-crash.ll       | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/shuffled_load_1.ll       | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/shuffled_load_2.ll       | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/shuffled_load_3.ll       | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/shuffled_load_4.ll       | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/shuffled_load_5.ll       | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/shuffled_load_6.ll       | 2 +-
 .../vecz/test/lit/llvm/simplify-masked-memops.ll                | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/squash_extract_sext.ll   | 2 +-
 .../vecz/test/lit/llvm/squash_extract_sext_bigendian.ll         | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/squash_extract_zext.ll   | 2 +-
 .../vecz/test/lit/llvm/squash_extract_zext_bigendian.ll         | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/squash_float2_gather.ll  | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/stride_aligned.ll        | 2 +-
 .../vecz/test/lit/llvm/stride_aligned_scalarized.ll             | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/stride_analysis.ll       | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/stride_misaligned.ll     | 2 +-
 .../vecz/test/lit/llvm/stride_misaligned_scalarized.ll          | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/struct_phi.ll            | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/struct_select.ll         | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/subgroup_broadcast.ll    | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll     | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/subgroup_reductions.ll   | 2 +-
 .../subgroup_reductions_spv_khr_uniform_group_instructions.ll   | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/subgroup_scans.ll        | 2 +-
 .../llvm/subgroup_scans_spv_khr_uniform_group_instructions.ll   | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/subgroup_shuffle.ll      | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_down.ll | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_up.ll   | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_xor.ll  | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/ternary_transform.ll     | 2 +-
 .../vecz/test/lit/llvm/ternary_transform_different_strides.ll   | 2 +-
 .../vecz/test/lit/llvm/ternary_transform_divergent_gep.ll       | 2 +-
 .../vecz/test/lit/llvm/ternary_transform_divergent_source.ll    | 2 +-
 .../vecz/test/lit/llvm/ternary_transform_negative.ll            | 2 +-
 .../vecz/test/lit/llvm/ternary_transform_positive.ll            | 2 +-
 .../lit/llvm/ternary_transform_uniform_cond_diff_strides.ll     | 2 +-
 .../vecz/test/lit/llvm/ternary_transform_uniform_condition.ll   | 2 +-
 .../lit/llvm/ternary_transform_uniform_condition_packetized.ll  | 2 +-
 .../vecz/test/lit/llvm/ternary_transform_uniform_source.ll      | 2 +-
 .../vecz/test/lit/llvm/ternary_transform_uniform_sources.ll     | 2 +-
 .../vecz/test/lit/llvm/too_large_simdwidth_packetization.ll     | 2 +-
 .../vecz/test/lit/llvm/too_large_simdwidth_scalarization.ll     | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/undef_debug_info.ll      | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/undef_ub.ll              | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/uniform_address_base.ll  | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/uniform_address_index.ll | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/uniform_loop.ll          | 2 +-
 .../vecz/test/lit/llvm/uniform_loop_contiguous_phi1.ll          | 2 +-
 .../vecz/test/lit/llvm/uniform_loop_contiguous_phi2.ll          | 2 +-
 .../vecz/test/lit/llvm/uniform_loop_contiguous_phi3.ll          | 2 +-
 .../vecz/test/lit/llvm/uniform_loop_contiguous_phi4.ll          | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/uniform_loop_metadata.ll | 2 +-
 .../vecz/test/lit/llvm/uniform_reassociation1.ll                | 2 +-
 .../vecz/test/lit/llvm/uniform_reassociation2.ll                | 2 +-
 .../vecz/test/lit/llvm/uniform_reassociation3.ll                | 2 +-
 .../vecz/test/lit/llvm/unmangled_builtin_call.ll                | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/user_calls.ll            | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/varying_load1.ll         | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/varying_load2.ll         | 2 +-
 .../vecz/test/lit/llvm/vector_intrinsics_scalarization.ll       | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/vector_phi_uniform.ll    | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/vector_phi_varying.ll    | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/vector_printf.ll         | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/vector_printf32.ll       | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/vector_printf64.ll       | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/vector_printf_def.ll     | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/vector_printf_floats.ll  | 2 +-
 .../test/lit/llvm/vector_printf_floats_no_double_support.ll     | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/vector_size_1.ll         | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/vecz_blend_div_loop.ll   | 2 +-
 .../vecz/test/lit/llvm/vecz_scalar_gather_load.ll               | 2 +-
 .../vecz/test/lit/llvm/vecz_scalar_interleaved_load.ll          | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/workgroup_scans.ll       | 2 +-
 .../compiler_passes/vecz/test/lit/llvm/workitem_builtins.ll     | 2 +-
 .../compiler_passes/vecz/tools/source/veczc.cpp                 | 2 +-
 587 files changed, 587 insertions(+), 587 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/address_spaces.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/address_spaces.h
index 09216f9c02032..42097cdcb900e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/address_spaces.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/address_spaces.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/attributes.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/attributes.h
index cc19a11db7f9e..177eaa0a432d8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/attributes.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/attributes.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/barrier_regions.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/barrier_regions.h
index 3f1b89d03c1ae..9bae40595d480 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/barrier_regions.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/barrier_regions.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/builtin_info.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/builtin_info.h
index 5147db4ffb7de..8b77ed7ee38da 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/builtin_info.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/builtin_info.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/cl_builtin_info.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/cl_builtin_info.h
index 9dda278e03b0d..5e6f3fe26e9b6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/cl_builtin_info.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/cl_builtin_info.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/define_mux_builtins_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/define_mux_builtins_pass.h
index e1e74ec666a8a..525c125a886f5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/define_mux_builtins_pass.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/define_mux_builtins_pass.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/device_info.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/device_info.h
index 36ff58c4d0e7b..6ec701f758159 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/device_info.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/device_info.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/dma.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/dma.h
index 2dfb8121b891a..a5c13add7e21d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/dma.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/dma.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/encode_kernel_metadata_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/encode_kernel_metadata_pass.h
index c01cb00528d85..d3557ddf3034f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/encode_kernel_metadata_pass.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/encode_kernel_metadata_pass.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/group_collective_helpers.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/group_collective_helpers.h
index 2ef9ed9907720..c565c3c93870f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/group_collective_helpers.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/group_collective_helpers.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/mangling.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/mangling.h
index 087acd2518549..abbd1abca093e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/mangling.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/mangling.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/metadata.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/metadata.h
index 9fc1337564cc2..6950169a68eb2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/metadata.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/metadata.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/optimal_builtin_replacement_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/optimal_builtin_replacement_pass.h
index c28241bb71351..ec32ecec950c3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/optimal_builtin_replacement_pass.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/optimal_builtin_replacement_pass.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/pass_functions.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/pass_functions.h
index cb0b457cc60f4..3d33531c350da 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/pass_functions.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/pass_functions.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/pass_machinery.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/pass_machinery.h
index 997071ed1ef2d..9d1e8516867cd 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/pass_machinery.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/pass_machinery.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/prepare_barriers_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/prepare_barriers_pass.h
index 1df7c2f1c2c25..743846da1c109 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/prepare_barriers_pass.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/prepare_barriers_pass.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/replace_local_module_scope_variables_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/replace_local_module_scope_variables_pass.h
index 03808913c4711..9c94da90a7da6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/replace_local_module_scope_variables_pass.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/replace_local_module_scope_variables_pass.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/scheduling.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/scheduling.h
index ba1fb4f44e1da..e5742b324f96a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/scheduling.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/scheduling.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/sub_group_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/sub_group_analysis.h
index 822bf9f7a2f47..726538f5beef2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/sub_group_analysis.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/sub_group_analysis.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/target_extension_types.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/target_extension_types.h
index c825e6b9cb124..e8c2c226590d9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/target_extension_types.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/target_extension_types.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/unique_opaque_structs_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/unique_opaque_structs_pass.h
index 370b58702816b..e56e847f1da12 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/unique_opaque_structs_pass.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/unique_opaque_structs_pass.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/work_item_loops_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/work_item_loops_pass.h
index ff2d50ed0170e..ddf7e65b3e91b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/work_item_loops_pass.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/work_item_loops_pass.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/dibuilder.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/dibuilder.h
index 0f0a06a723dbd..487315a8077a2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/dibuilder.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/dibuilder.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.h
index a8506921affd8..19ec81efc57ef 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.inc b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.inc
index 12221a798d931..80ae42c23fe4f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.inc
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.inc
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/intrinsic.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/intrinsic.h
index 0ca8be0f867fb..f8e5800fa901c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/intrinsic.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/intrinsic.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/llvm_version.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/llvm_version.h
index f1c9f3bdee1c3..55cc6fca85f8f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/llvm_version.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/llvm_version.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/loop_utils.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/loop_utils.h
index fecfbec7cabc2..2d6e8ba84b242 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/loop_utils.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/loop_utils.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
index e238e6465ac21..f80557fcf4810 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/target_transform_info.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/target_transform_info.h
index 06e0522759524..eaab65e1eeb1f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/target_transform_info.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/target_transform_info.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/targetinfo.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/targetinfo.h
index 3b936883b4b7d..bae892aaa2194 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/targetinfo.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/targetinfo.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/vector_type_helper.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/vector_type_helper.h
index f6fb52dabf054..269e0d28c8b22 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/vector_type_helper.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/vector_type_helper.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/attributes.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/attributes.cpp
index 584a719cc13b0..6a92014ea6f48 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/attributes.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/attributes.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
index a14efe7c4157a..494ca6d0727ea 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/builtin_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/builtin_info.cpp
index ac09df2b062a2..3242031d61c6f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/builtin_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/builtin_info.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp
index 7029af0c0cd32..be330e550855e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/define_mux_builtins_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/define_mux_builtins_pass.cpp
index 5229fa6064f25..61f8d0a83d7b1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/define_mux_builtins_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/define_mux_builtins_pass.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/dma.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/dma.cpp
index 4b1b656aeae46..310fa182fc1d3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/dma.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/dma.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/encode_kernel_metadata_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/encode_kernel_metadata_pass.cpp
index 2bdc18a595d2a..3e065a052f8f0 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/encode_kernel_metadata_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/encode_kernel_metadata_pass.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/group_collective_helpers.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/group_collective_helpers.cpp
index e808e0494f716..04b15ff2cf79e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/group_collective_helpers.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/group_collective_helpers.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mangling.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mangling.cpp
index 3597b72eeca10..d911e18b51977 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mangling.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mangling.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/metadata.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/metadata.cpp
index 2daae4607f9ff..85a9ef5dd2929 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/metadata.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/metadata.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mux_builtin_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mux_builtin_info.cpp
index f9e206aaf44e5..788d493310eb6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mux_builtin_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mux_builtin_info.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/optimal_builtin_replacement_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/optimal_builtin_replacement_pass.cpp
index f548d652f4510..832389282760d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/optimal_builtin_replacement_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/optimal_builtin_replacement_pass.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp
index 95791fee0340a..842d870738dd6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_machinery.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_machinery.cpp
index 355cc9bc17a03..7f6e262177bbd 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_machinery.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_machinery.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/prepare_barriers_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/prepare_barriers_pass.cpp
index 0c6067b3f6871..c610e76ee5c67 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/prepare_barriers_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/prepare_barriers_pass.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp
index afd407a657299..510db56fd498a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/scheduling.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/scheduling.cpp
index 4301ce93cabfc..9bc40bf4282f7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/scheduling.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/scheduling.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/sub_group_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/sub_group_analysis.cpp
index c2bf37e7fcff8..1a122433268bd 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/sub_group_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/sub_group_analysis.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/target_extension_types.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/target_extension_types.cpp
index 54a8097653edf..37c739e22f4f6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/target_extension_types.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/target_extension_types.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/unique_opaque_structs_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/unique_opaque_structs_pass.cpp
index 0d04940827e2f..c7a75ae3dd4bc 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/unique_opaque_structs_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/unique_opaque_structs_pass.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
index 6686558f20fab..71663cb7f4314 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/pass.h
index 75c0045705a58..ef5a8ad656fa0 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/pass.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/pass.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_choices.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_choices.h
index e714e66fee19a..a06adcaac8b88 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_choices.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_choices.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_target_info.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_target_info.h
index fb5b1aa69cdf6..97a91c9266c9d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_target_info.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_target_info.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/control_flow_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/control_flow_analysis.cpp
index 6ccf6d5f2e99b..cdeb01e71d77c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/control_flow_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/control_flow_analysis.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/divergence_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/divergence_analysis.cpp
index 4a9e3a2594a0a..64d99ee71e8c7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/divergence_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/divergence_analysis.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/instantiation_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/instantiation_analysis.cpp
index 54f64218d0b16..e6076fcb634d7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/instantiation_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/instantiation_analysis.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/liveness_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/liveness_analysis.cpp
index 9ec9917acfb19..2a41508892371 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/liveness_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/liveness_analysis.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/packetization_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/packetization_analysis.cpp
index f6f4d9a920574..88eb38c4fe860 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/packetization_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/packetization_analysis.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/simd_width_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/simd_width_analysis.cpp
index 8fc023539213b..f92ba7358ed51 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/simd_width_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/simd_width_analysis.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/stride_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/stride_analysis.cpp
index 745a3bd6f8381..b98a149c97b12 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/stride_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/stride_analysis.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp
index 85cb97be03335..f9f2e84a59958 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorizable_function_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorizable_function_analysis.cpp
index d112bcd90af9a..7365e339c1a86 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorizable_function_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorizable_function_analysis.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorization_unit_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorization_unit_analysis.cpp
index 715d72e4daec0..f53afec4161fe 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorization_unit_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorization_unit_analysis.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_boscc.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_boscc.cpp
index 9ec9d6d686dd4..ea693925d67a3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_boscc.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_boscc.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_roscc.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_roscc.cpp
index f59cc6209b361..d0174fb3ba4bc 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_roscc.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_roscc.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/debugging.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/debugging.cpp
index 0a175f60a6464..1ddb912ca5a55 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/debugging.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/debugging.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/control_flow_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/control_flow_analysis.h
index 3bd7e78538dc5..f6409f00bb7a8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/control_flow_analysis.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/control_flow_analysis.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/divergence_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/divergence_analysis.h
index 4350da506c06a..8027f90d742fe 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/divergence_analysis.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/divergence_analysis.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/instantiation_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/instantiation_analysis.h
index 2a93187cdd979..6859f39690193 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/instantiation_analysis.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/instantiation_analysis.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/liveness_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/liveness_analysis.h
index 0d6ed87f25a31..d9d8d7fb264de 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/liveness_analysis.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/liveness_analysis.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/packetization_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/packetization_analysis.h
index 5fc33d3857223..9321ad2ed7267 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/packetization_analysis.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/packetization_analysis.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/simd_width_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/simd_width_analysis.h
index 43ea34eb9ed96..55f79f9866c8f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/simd_width_analysis.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/simd_width_analysis.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/stride_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/stride_analysis.h
index ec30ae43729b2..7f5a1cbaf7293 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/stride_analysis.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/stride_analysis.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/uniform_value_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/uniform_value_analysis.h
index 1e9071de6b137..b0a083ad69b5e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/uniform_value_analysis.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/uniform_value_analysis.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorizable_function_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorizable_function_analysis.h
index 3a6eebb423b0c..e82d297c4d5ec 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorizable_function_analysis.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorizable_function_analysis.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorization_unit_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorization_unit_analysis.h
index fda2d27c328c0..a2f60888635a9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorization_unit_analysis.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorization_unit_analysis.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/control_flow_boscc.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/control_flow_boscc.h
index 51bd562be6f00..b1eecfe2854e4 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/control_flow_boscc.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/control_flow_boscc.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/control_flow_roscc.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/control_flow_roscc.h
index 6e15810ef16fd..3ed4dd469f797 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/control_flow_roscc.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/control_flow_roscc.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/debugging.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/debugging.h
index a5ae45cdcd83b..6faa2f2b52f15 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/debugging.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/debugging.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/ir_cleanup.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/ir_cleanup.h
index 3c90865c7f5f2..bf971807ee2bd 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/ir_cleanup.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/ir_cleanup.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/llvm_helpers.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/llvm_helpers.h
index 10a0f0adf7d94..434981ddb9abc 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/llvm_helpers.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/llvm_helpers.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/memory_operations.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/memory_operations.h
index 36d23b9d0958b..a28100507c7e5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/memory_operations.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/memory_operations.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/offset_info.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/offset_info.h
index c6d55e351145e..1715fbd8b5aeb 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/offset_info.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/offset_info.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/reachability.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/reachability.h
index 402913d912925..a8b87da22aada 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/reachability.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/reachability.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/simd_packet.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/simd_packet.h
index 220b74e8424c3..07329e4384b18 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/simd_packet.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/simd_packet.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/common_gep_elimination_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/common_gep_elimination_pass.h
index be3ab1ac66520..b76cccdf4d998 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/common_gep_elimination_pass.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/common_gep_elimination_pass.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/control_flow_conversion_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/control_flow_conversion_pass.h
index 71a5807fb1ba2..7b6dd46175da6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/control_flow_conversion_pass.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/control_flow_conversion_pass.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/inline_post_vectorization_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/inline_post_vectorization_pass.h
index 18c9466c55c73..1a4b7cb74e109 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/inline_post_vectorization_pass.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/inline_post_vectorization_pass.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/instantiation_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/instantiation_pass.h
index 9d010d167a49c..d2a400ed261b8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/instantiation_pass.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/instantiation_pass.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/interleaved_group_combine_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/interleaved_group_combine_pass.h
index f73f17c20197d..88efc00d560bc 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/interleaved_group_combine_pass.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/interleaved_group_combine_pass.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_helpers.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_helpers.h
index 991d30ee992ab..24c81f90084b8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_helpers.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_helpers.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_pass.h
index f817258a97ef9..ccc52a26912d7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_pass.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_pass.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetizer.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetizer.h
index 2c8b76306d3cd..4f030ee15fa8f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetizer.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetizer.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/passes.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/passes.h
index 1af52bb29a086..1f62cc200c967 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/passes.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/passes.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/printf_scalarizer.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/printf_scalarizer.h
index af4cd7ed67b0b..f10dbf27048de 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/printf_scalarizer.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/printf_scalarizer.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/scalarization_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/scalarization_pass.h
index e0d1156d72c06..71425d2da4a7b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/scalarization_pass.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/scalarization_pass.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/scalarizer.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/scalarizer.h
index 757a7b7fbb926..bcd25451fabb9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/scalarizer.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/scalarizer.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/ternary_transform_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/ternary_transform_pass.h
index 54baef8617a34..8453a47a5ae03 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/ternary_transform_pass.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/ternary_transform_pass.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_context.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_context.h
index 756e9db6bf3a3..96a73a5962cfb 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_context.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_context.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_helpers.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_helpers.h
index c4b004892ffd9..978b02a6f202c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_helpers.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_helpers.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_heuristics.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_heuristics.h
index f79a058b16acc..129f6af29b362 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_heuristics.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_heuristics.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_unit.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_unit.h
index 812fc1de48966..d6ff8aa20eca3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_unit.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_unit.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorizer.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorizer.h
index d3d42aecbd066..1e3771957e96e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorizer.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorizer.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vecz_pass_builder.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vecz_pass_builder.h
index 9dc3d3559cdcb..e707974569cde 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vecz_pass_builder.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vecz_pass_builder.h
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/ir_cleanup.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/ir_cleanup.cpp
index d862839961420..72a421ef10545 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/ir_cleanup.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/ir_cleanup.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/llvm_helpers.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/llvm_helpers.cpp
index 57013bebc77ba..d3b2271d549c1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/llvm_helpers.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/llvm_helpers.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/memory_operations.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/memory_operations.cpp
index aa000931e98a7..d5aa4b62f3f8d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/memory_operations.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/memory_operations.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp
index c0ce76424d150..32cabec59b297 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp
index a904e557847ed..cd5df6d5011e9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/passes.def b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/passes.def
index 9b418f773d355..0cba927e215da 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/passes.def
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/passes.def
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/reachability.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/reachability.cpp
index 7703c8260b822..4744a3fab1897 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/reachability.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/reachability.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/simd_packet.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/simd_packet.cpp
index 0f31e329086d3..6f0c952bf64c4 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/simd_packet.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/simd_packet.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp
index 72b8488942d4e..e8c6c086828a8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/builtin_inlining_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/builtin_inlining_pass.cpp
index 39ff78796a3bc..9e865838c021e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/builtin_inlining_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/builtin_inlining_pass.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/common_gep_elimination_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/common_gep_elimination_pass.cpp
index af3286a6bb2a1..7a6e7d00fb05e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/common_gep_elimination_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/common_gep_elimination_pass.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
index 40f05dea2a145..f33f20a0e878a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/inline_post_vectorization_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/inline_post_vectorization_pass.cpp
index f7e07793b73cb..24896a5af6134 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/inline_post_vectorization_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/inline_post_vectorization_pass.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/instantiation_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/instantiation_pass.cpp
index 832c76cd8f8d5..1f509ba022787 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/instantiation_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/instantiation_pass.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/interleaved_group_combine_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/interleaved_group_combine_pass.cpp
index 79513f649ae2c..31ce6983d3eab 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/interleaved_group_combine_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/interleaved_group_combine_pass.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/loop_rotate_custom_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/loop_rotate_custom_pass.cpp
index aee00d8c43233..949977f889a03 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/loop_rotate_custom_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/loop_rotate_custom_pass.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
index beda48d076d62..cab8b13f1236b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_pass.cpp
index 3462bb05ebb7b..efe1cefbc54b4 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_pass.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
index a329a4c02b273..9f8e9ed1a43e1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/passes.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/passes.cpp
index 13897a143e591..2b8edb6f41cbc 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/passes.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/passes.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/pre_linearize_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/pre_linearize_pass.cpp
index 597b91f09979f..05913dbe3cca1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/pre_linearize_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/pre_linearize_pass.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/printf_scalarizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/printf_scalarizer.cpp
index aa8ffedce3b9e..734a1e1fb93d0 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/printf_scalarizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/printf_scalarizer.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/remove_intptr_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/remove_intptr_pass.cpp
index d9438c49a2ec8..419f41649c58d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/remove_intptr_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/remove_intptr_pass.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarization_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarization_pass.cpp
index c48b94b56109f..09502d2756243 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarization_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarization_pass.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
index e70356989c4f9..7b147b238ea26 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/simplify_infinite_loop_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/simplify_infinite_loop_pass.cpp
index 2e23b6157b024..96fe0429af36f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/simplify_infinite_loop_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/simplify_infinite_loop_pass.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/squash_small_vectors_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/squash_small_vectors_pass.cpp
index d00bef685e6a8..4b09013f07756 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/squash_small_vectors_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/squash_small_vectors_pass.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/ternary_transform_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/ternary_transform_pass.cpp
index d32406c693441..a794e4abe524c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/ternary_transform_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/ternary_transform_pass.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/uniform_reassociation_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/uniform_reassociation_pass.cpp
index 18fbd5848c5c3..7d0faee34c9cf 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/uniform_reassociation_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/uniform_reassociation_pass.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
index 7712f5a682d37..f1c02a2d134df 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_arm.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_arm.cpp
index d74c8b2d08c31..892c81483cb84 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_arm.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_arm.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp
index 99e6cc7837982..34a0e0f5b5834 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_choices.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_choices.cpp
index 1703ff0f490d9..cb9feaea7ddbf 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_choices.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_choices.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
index d721828133ef9..a15c0d0dabca5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_helpers.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_helpers.cpp
index 041f20abe74c7..38e7752532d45 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_helpers.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_helpers.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_heuristics.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_heuristics.cpp
index 4a5a270eca2fe..7084bb6a4211d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_heuristics.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_heuristics.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_unit.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_unit.cpp
index 1a25d7deaf658..22fdd0cee503d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_unit.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_unit.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorizer.cpp
index 080f0a7828d2e..f5576eddc0de4 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorizer.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vecz_pass_builder.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vecz_pass_builder.cpp
index 61b56aa70eb98..6fd9fdcb83bbf 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vecz_pass_builder.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vecz_pass_builder.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/lit.local.cfg b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/lit.local.cfg
index 90dd000ea6e29..13f31884ad10f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/lit.local.cfg
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/lit.local.cfg
@@ -4,7 +4,7 @@
 # Exceptions; you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+#     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_1.ll
index 16a72cc565aff..4a73b10725a00 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_1.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_1.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_2.ll
index fb45e248d8a4f..fc0cc97549baf 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_2.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_3.ll
index 1b402d7e66254..f000efae816a6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_3.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_3.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_4.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_4.ll
index 88abe95158263..82c8454716a5f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_4.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_4.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_5.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_5.ll
index 2750a23db71ff..cd0d380e50e54 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_5.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_5.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_6.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_6.ll
index c09fcd7f7dbfb..b6327e55775cd 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_6.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_6.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_killer.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_killer.ll
index 51191f7a20b95..0132338fac6af 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_killer.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_killer.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge.ll
index d9d0aa467205c..e886b1ada4903 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge2.ll
index 88e6392293a9c..c0cc77227d2c9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge2.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge3.ll
index 80aa461758366..a4d4b1f888f7c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge3.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge3.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/duplicate_preheader.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/duplicate_preheader.ll
index caac5a4913b05..64c5d6e7cbc28 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/duplicate_preheader.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/duplicate_preheader.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops1.ll
index d3fcb5d70f7b6..33ea2580691fb 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops1.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops1.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops2.ll
index 0b105a2bd0304..cca12985f5031 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops2.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops3.ll
index ab7f0a99dce07..e39f0e1361850 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops3.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops3.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops4.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops4.ll
index 0861376c97c47..a967bec0aed5d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops4.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops4.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops5.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops5.ll
index 7c7df2bcaf60e..3f8e7f2b3a395 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops5.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops5.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization0.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization0.ll
index b6d4689590781..96261d872a3df 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization0.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization0.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization1.ll
index aa6a0574e072a..acc9bee5af397 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization1.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization1.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization10.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization10.ll
index 339bdf357c1ab..1a07de7f75123 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization10.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization10.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization11.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization11.ll
index 180455e75554d..4b423f2d3f079 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization11.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization11.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization12.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization12.ll
index 914dc5a28a347..270774ef0c142 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization12.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization12.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization13.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization13.ll
index 7212b71ed9e23..4d54162e00c90 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization13.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization13.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization14.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization14.ll
index da33027e89833..1a3e5764611b1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization14.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization14.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization15.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization15.ll
index 2dc5add4f2f75..b1626a8d0c7cc 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization15.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization15.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization16.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization16.ll
index 63a13af815eae..e9567cc00d194 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization16.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization16.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization17.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization17.ll
index 6e4485b743385..2c25911eeba63 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization17.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization17.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization18.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization18.ll
index 0c281548e2b54..f9868c86a2d0b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization18.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization18.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization19.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization19.ll
index dc06f7bb8372f..37ca06b926eca 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization19.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization19.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization2.ll
index f0a36128da223..401dfd4781787 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization2.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization20.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization20.ll
index acfb739e81564..9e7184f5507ce 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization20.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization20.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization21.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization21.ll
index c08463fce5e5c..a91c3e08f752f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization21.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization21.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization22.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization22.ll
index c5fe9d0b5efb2..716511b063592 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization22.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization22.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization3.ll
index 289c00aba9f32..acd9dcba0bb7e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization3.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization3.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization4.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization4.ll
index d82b4174fa513..5c6f686043c6f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization4.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization4.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization5.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization5.ll
index f50d14347636e..f7536ca9ad196 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization5.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization5.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization6.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization6.ll
index bfdbe5321f762..f1b5f3582dd7a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization6.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization6.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization7.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization7.ll
index e5326572eb93d..ab42eddff1897 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization7.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization7.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization8.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization8.ll
index e106fc964a61b..1245dc2ca0c0e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization8.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization8.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization9.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization9.ll
index 8eabade1279f6..43bb9c44eb492 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization9.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization9.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/scalable_linearization.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/scalable_linearization.ll
index 7aa669f2f6b8b..83976b565214c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/scalable_linearization.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/scalable_linearization.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store.ll
index 93fb9fa339d13..637866abbc7ed 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store_as_masked.ll
index 96c5af4ff09ab..e6eaf8579bad4 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store_as_masked.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store_as_masked.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_uniform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_uniform.ll
index 9f7e53bff1ac7..fee72ee014ac9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_uniform.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_uniform.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_varying.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_varying.ll
index 5de908b0f3bd9..692a8cc7ecc5a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_varying.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_varying.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/broadcast_vector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/broadcast_vector.ll
index 387eee059943f..8cba4e91c50bd 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/broadcast_vector.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/broadcast_vector.ll
@@ -5,7 +5,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans.ll
index 8ccca62c6958b..775a370dc8f9b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans_vp.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans_vp.ll
index 8df7fa9db66b8..55c4486376589 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans_vp.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans_vp.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/extract_element.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/extract_element.ll
index 97c2c61f539c5..292130f4298d1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/extract_element.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/extract_element.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/insert_element.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/insert_element.ll
index b8d714e5aa0ac..c4b19b54dc990 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/insert_element.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/insert_element.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/lit.local.cfg b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/lit.local.cfg
index 6b200207cf85a..1b8b665128138 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/lit.local.cfg
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/lit.local.cfg
@@ -4,7 +4,7 @@
 # Exceptions; you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+#     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle.ll
index 0de0d65443bf0..5f5fc6aed8d59 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_bool.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_bool.ll
index 85d9a06556440..8ebecbb9ae5e2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_bool.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_bool.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_concat.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_concat.ll
index 86d8e27d53d21..a720241586957 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_concat.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_concat.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_narrow.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_narrow.ll
index b890c711027dc..8b54db3622b3d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_narrow.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_narrow.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_wider.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_wider.ll
index 41e4e99f8f060..4b1094391d3fb 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_wider.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_wider.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/select_scalar_vector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/select_scalar_vector.ll
index 8628498656dd6..113064d801114 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/select_scalar_vector.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/select_scalar_vector.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_memops.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_memops.ll
index f24052cb60f26..239be0b82fc01 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_memops.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_memops.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_vsetvli.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_vsetvli.ll
index 64b179504e2f2..7710d97c909fd 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_vsetvli.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_vsetvli.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll
index 5e0520a55f42b..c41760af170a7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/builtins.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/builtins.ll
index 0bfd6536581a6..f58b2bd62f539 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/builtins.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/builtins.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/cast.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/cast.ll
index 66d1abccd24bc..484415bb395db 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/cast.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/cast.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/cmpxchg.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/cmpxchg.ll
index bfa7f69334400..5d29c785dab6a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/cmpxchg.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/cmpxchg.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store.ll
index 497e9a54c4e7a..0a5a36cc138dc 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store_as_masked.ll
index 9d8a468504b3a..f6e841d4fc987 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store_as_masked.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store_as_masked.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_load.ll
index 98b7a2580137d..39de104569875 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_load.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_load.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_scatter_gather.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_scatter_gather.ll
index bef3ee020dc97..4d81ce68d3a9b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_scatter_gather.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_scatter_gather.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans.ll
index 26887bced392d..2723beb889593 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans_vp.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans_vp.ll
index f99394619e9c6..5631b3d101122 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans_vp.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans_vp.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/extract_element.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/extract_element.ll
index 77ea2d3f7bea2..be6fc339be0ef 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/extract_element.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/extract_element.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/fadd.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/fadd.ll
index 2c1c76a4c47b1..023a617b6e2bc 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/fadd.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/fadd.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/fail_builtins.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/fail_builtins.ll
index 78538bb0832e2..9528dd86c8a77 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/fail_builtins.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/fail_builtins.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/insert_element.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/insert_element.ll
index d2ed9cef94e72..0b6a835147d0c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/insert_element.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/insert_element.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/interleaved_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/interleaved_load.ll
index 708edd894d717..cfb29eaffb7d8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/interleaved_load.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/interleaved_load.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/intrinsics.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/intrinsics.ll
index 12632b5696d2b..701861f189541 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/intrinsics.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/intrinsics.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/lit.local.cfg b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/lit.local.cfg
index 335a35215ed45..1ea9ac8a10a70 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/lit.local.cfg
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/lit.local.cfg
@@ -4,7 +4,7 @@
 # Exceptions; you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+#     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/load_add_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/load_add_store.ll
index eb9d20e6486ce..d44cbf1bf4a12 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/load_add_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/load_add_store.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/load_binops_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/load_binops_store.ll
index d1bc2db6f979b..a3026450fd767 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/load_binops_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/load_binops_store.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/metadata.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/metadata.ll
index d59356adddd4b..e97fd6da75ffe 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/metadata.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/metadata.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/packetize_mask_varying.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/packetize_mask_varying.ll
index b391a57c27ba3..1d8faa1badc8c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/packetize_mask_varying.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/packetize_mask_varying.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/scalable_auto.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/scalable_auto.ll
index af4773e3fb058..c6e25c5f327e1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/scalable_auto.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/scalable_auto.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select.ll
index 9a693646c7ad2..e555e306e55e3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select_scalar_vector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select_scalar_vector.ll
index 0d58887f98584..62be9049501d1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select_scalar_vector.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select_scalar_vector.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/shuffle.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/shuffle.ll
index 110fc935b9e5c..b514eeb2c60b0 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/shuffle.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/shuffle.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/store_literal_struct.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/store_literal_struct.ll
index ad8599ba51f50..28f4e99f7fb28 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/store_literal_struct.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/store_literal_struct.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll
index 386844b89a495..ac112bfe44bba 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans.ll
index 0161057de1534..612a67f496406 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_spv_khr_uniform_group_instructions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_spv_khr_uniform_group_instructions.ll
index f51a13bfd957c..06d079f2128ac 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_spv_khr_uniform_group_instructions.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_spv_khr_uniform_group_instructions.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_spv_khr_uniform_group_instructions_vp.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_spv_khr_uniform_group_instructions_vp.ll
index 455df1ab0be8a..7ad386dedb3e4 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_spv_khr_uniform_group_instructions_vp.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_spv_khr_uniform_group_instructions_vp.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_vp.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_vp.ll
index 009ed6abda0f8..14bee4967bfbc 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_vp.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_vp.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/vectors.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/vectors.ll
index 9ce170d82a36c..f8ed17cf10c67 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/vectors.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/vectors.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/verification_fail_phi.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/verification_fail_phi.ll
index 15c13446b3762..67564ff601810 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/verification_fail_phi.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/verification_fail_phi.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/widen_vload.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/widen_vload.ll
index f223a95ddec14..1ce02ded94b4f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/widen_vload.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/widen_vload.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/workitem_funcs.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/workitem_funcs.ll
index b28940d1204a5..88e5c930a0be7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/workitem_funcs.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/workitem_funcs.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/boscc_reduction.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/boscc_reduction.ll
index 9ce6828f9a284..8a161cb1c2194 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/boscc_reduction.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/boscc_reduction.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/choice.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/choice.ll
index 6a26896527739..bdef00fb38803 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/choice.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/choice.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/compute_vector_length.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/compute_vector_length.ll
index 042114787f31d..b1cd005666d45 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/compute_vector_length.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/compute_vector_length.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_interleaved_load_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_interleaved_load_store.ll
index 4ea882d804124..dd38362fc144c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_interleaved_load_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_interleaved_load_store.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_load_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_load_store.ll
index 549f8cb8e79f9..3ae0852b693f5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_load_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_load_store.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_scatter_gather.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_scatter_gather.ll
index f05b6106c5032..c4f7194953b68 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_scatter_gather.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_scatter_gather.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_subgroup_scans.ll
index 66801a1ca8c60..e043a43db80d6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_subgroup_scans.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_subgroup_scans.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/load_add_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/load_add_store.ll
index 6d25a6a5a924b..a4b8c1cb54c8b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/load_add_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/load_add_store.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/masked_atomics.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/masked_atomics.ll
index 35a478caaaee7..03492705536f4 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/masked_atomics.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/masked_atomics.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/packetize_mask_varying.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/packetize_mask_varying.ll
index 91d9b9c3f22ec..0eecebc5e2a0f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/packetize_mask_varying.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/packetize_mask_varying.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/scatter_gather.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/scatter_gather.ll
index 15d66ea84ae28..9b1d3f3500bad 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/scatter_gather.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/scatter_gather.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions.ll
index 9e35021ec6536..c5f015913aead 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions_spv_khr_uniform_group_instructions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions_spv_khr_uniform_group_instructions.ll
index 054f7d91cafa6..c632bbefc304d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions_spv_khr_uniform_group_instructions.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions_spv_khr_uniform_group_instructions.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_scans.ll
index 1d186c09d93cc..a2da3addcbccf 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_scans.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_scans.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_scans_spv_khr_uniform_group_instructions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_scans_spv_khr_uniform_group_instructions.ll
index 145faeec0e6f1..3ec97bda6fb12 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_scans_spv_khr_uniform_group_instructions.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_scans_spv_khr_uniform_group_instructions.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/udiv.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/udiv.ll
index 7a080a850ca8c..8178c59d943e1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/udiv.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/udiv.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load.ll
index d86cefcb3adb5..0202575883035 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load_as_masked.ll
index d86cefcb3adb5..0202575883035 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load_as_masked.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load_as_masked.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/delete_packetized_memop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/delete_packetized_memop.ll
index 280e4a912cf0f..c2c7b68912910 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/delete_packetized_memop.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/delete_packetized_memop.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_constant_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_constant_index.ll
index 5a2d34b7553d4..d7fe492093e37 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_constant_index.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_constant_index.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index.ll
index 33286c6dd1ec0..8a9fccf525fca 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index2.ll
index 3b48683b42ccd..178751cb6f23a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index2.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index3.ll
index 703a012103674..39c3f0fe006e0 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index3.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index3.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_constant_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_constant_index.ll
index 097d862c9735e..a467db71f7dad 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_constant_index.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_constant_index.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_constant_index_constant_value.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_constant_index_constant_value.ll
index 86cefa3d69d25..9c023a64e57ed 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_constant_index_constant_value.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_constant_index_constant_value.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_runtime_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_runtime_index.ll
index 89ea36ef5057a..05ccf997a7d0a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_runtime_index.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_runtime_index.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/interleaved_safety.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/interleaved_safety.ll
index 7811de134bb56..2498cb54e5209 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/interleaved_safety.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/interleaved_safety.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isfiniteDv4_d.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isfiniteDv4_d.ll
index 05d077c8ae6ec..3aae73704e7c9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isfiniteDv4_d.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isfiniteDv4_d.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isfiniteDv4_f.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isfiniteDv4_f.ll
index cdf064834d133..08a97d76842e7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isfiniteDv4_f.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isfiniteDv4_f.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isinfDv4_d.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isinfDv4_d.ll
index f9f8f4b3f260d..1431fa1c19573 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isinfDv4_d.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isinfDv4_d.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isinfDv4_f.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isinfDv4_f.ll
index adf50dd8880bf..83054e694801a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isinfDv4_f.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isinfDv4_f.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnanDv4_d.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnanDv4_d.ll
index fb0bbc392bdbd..945ac791355c0 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnanDv4_d.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnanDv4_d.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnanDv4_f.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnanDv4_f.ll
index b5c53224e4c94..86139d572338b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnanDv4_f.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnanDv4_f.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnormalDv4_d.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnormalDv4_d.ll
index bf59e8d5b115f..05117b1b691dc 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnormalDv4_d.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnormalDv4_d.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnormalDv4_f.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnormalDv4_f.ll
index 08668c3d1df13..d33853b4e8d32 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnormalDv4_f.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnormalDv4_f.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/scalar_vector_user.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/scalar_vector_user.ll
index ef4065e605744..1a8350794513d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/scalar_vector_user.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/scalar_vector_user.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/vector_copy.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/vector_copy.ll
index ce2aa29230092..6fab62e9ca4a8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/vector_copy.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/vector_copy.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/vector_phi_varying.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/vector_phi_varying.ll
index dff334cc859f3..6a9ad584fbcde 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/vector_phi_varying.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/vector_phi_varying.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_abs.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_abs.ll
index 99119d074b41a..95226cb3c6e9d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_abs.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_abs.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_binops.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_binops.ll
index 66dc8c706779a..96fbeb3ad959c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_binops.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_binops.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_copysign.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_copysign.ll
index 4eccec152d9e2..5b82688d52770 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_copysign.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_copysign.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fma.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fma.ll
index 14d681721b6a2..7b11bc9e63808 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fma.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fma.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmin_vector_scalar.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmin_vector_scalar.ll
index 567c014a5e68a..b433fb0fff646 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmin_vector_scalar.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmin_vector_scalar.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd.ll
index 1f60ef3fd04d2..2760239937542 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd2.ll
index d5263bb5c2bfd..a37c5e7cd7014 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd2.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd_phi.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd_phi.ll
index 739ef93b8e334..daf71de9b2446 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd_phi.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd_phi.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshl.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshl.ll
index bba75141b6eca..1974c22c15a81 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshl.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshl.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshr.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshr.ll
index 0f0cc9e699349..6b6f41e066ae1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshr.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshr.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_shufflevector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_shufflevector.ll
index 2bb278d1d0cdf..38ea8eb57c60e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_shufflevector.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_shufflevector.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_sqrt.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_sqrt.ll
index e68c4e6c2e757..15ce1517417b2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_sqrt.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_sqrt.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/alloca_alias.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/alloca_alias.ll
index b90360dc75b6e..cbdb41925babf 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/alloca_alias.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/alloca_alias.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/arm_neon_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/arm_neon_store.ll
index 5ce84466aefd6..3d39ca518818f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/arm_neon_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/arm_neon_store.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/async_workgroup_copy_uniform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/async_workgroup_copy_uniform.ll
index 02e39a996c33d..4a3f38ba7ad0c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/async_workgroup_copy_uniform.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/async_workgroup_copy_uniform.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomic_cmpxchg.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomic_cmpxchg.ll
index 2fbadb0e80f9d..786c7236e1585 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomic_cmpxchg.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomic_cmpxchg.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomicrmw.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomicrmw.ll
index de5f90bbf3f9e..c403cf419d301 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomicrmw.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomicrmw.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomicrmw_uniform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomicrmw_uniform.ll
index d20f8408fb3d0..e87ff74f7a6e3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomicrmw_uniform.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomicrmw_uniform.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/basic_mem2reg.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/basic_mem2reg.ll
index e881f871544c8..08fc176beee7f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/basic_mem2reg.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/basic_mem2reg.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/basic_vecz_mem2reg.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/basic_vecz_mem2reg.ll
index 89df1272376cf..71035cb07e9e8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/basic_vecz_mem2reg.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/basic_vecz_mem2reg.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/bitcast_function.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/bitcast_function.ll
index 68ecafdf70027..4a2c09ca9ff69 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/bitcast_function.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/bitcast_function.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/branch_splitting_and.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/branch_splitting_and.ll
index aebcc3952b715..890f63e748592 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/branch_splitting_and.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/branch_splitting_and.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/branch_splitting_or.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/branch_splitting_or.ll
index f89467ce4b86a..37d1ff7cebffa 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/branch_splitting_or.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/branch_splitting_or.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_addsat.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_addsat.ll
index dec5a3d4632a4..141543d69b0fd 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_addsat.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_addsat.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_clamp.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_clamp.ll
index f83fc0e1015f5..1bcc968885303 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_clamp.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_clamp.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_fmax.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_fmax.ll
index a3a5773be00bd..e99d01d477e1f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_fmax.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_fmax.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_fmin.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_fmin.ll
index b1e0c7fc88366..65b7e5697a68b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_fmin.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_fmin.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_mem.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_mem.ll
index ee1cd2ef038c7..86591570fbcab 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_mem.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_mem.ll
@@ -5,7 +5,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_memcpy.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_memcpy.ll
index 08a65fc010dd4..7ad572a7cebed 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_memcpy.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_memcpy.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_negative.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_negative.ll
index 3f8b5bd0d5ec1..0a1c85af00cda 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_negative.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_negative.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_positive.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_positive.ll
index c2a393c1a9fe9..379428725bb39 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_positive.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_positive.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_pointer_return.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_pointer_return.ll
index 3d3826d128333..d6bc1e0d2c71d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_pointer_return.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_pointer_return.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation.ll
index 90d3eb156f5c2..6ee06a5479108 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/cmpxchg.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/cmpxchg.ll
index 3871ad80a0efe..2df00a15e33cf 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/cmpxchg.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/cmpxchg.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address.ll
index 76354dce7a39f..0894b60d9fc7a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address_with_uniform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address_with_uniform.ll
index 02d303d887a2d..e2d1ef91aec8e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address_with_uniform.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address_with_uniform.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/contiguous_allocas.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/contiguous_allocas.ll
index 6110f78ad36e3..bcc6bfd84b57a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/contiguous_allocas.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/contiguous_allocas.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_nested_loops.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_nested_loops.ll
index c276a9763abb4..f156402249999 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_nested_loops.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_nested_loops.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_order_y.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_order_y.ll
index 8e7adf00fa033..e3c8c8f136f05 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_order_y.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_order_y.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_order_z.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_order_z.ll
index 0352df0afd216..d8a82cce6d422 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_order_z.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_order_z.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_ptrs.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_ptrs.ll
index 853fb9229ce48..99c0a220d0727 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_ptrs.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_ptrs.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_uniform_if.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_uniform_if.ll
index 4447d91905941..b8a23afb5a39c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_uniform_if.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_uniform_if.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_uniform_loop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_uniform_loop.ll
index f3081873d808c..508d105fa78f7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_uniform_loop.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_uniform_loop.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_varying_if.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_varying_if.ll
index 817e443ddbfc2..c4a2b075b4664 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_varying_if.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_varying_if.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_varying_loop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_varying_loop.ll
index 09f1f73e0c8e3..7c2afc456be6d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_varying_loop.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_varying_loop.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert3.ll
index 216f1e5ca00cf..07d638f131350 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert3.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert3.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert4.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert4.ll
index 34e5d1449e10c..422e2be0e3237 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert4.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert4.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert_contiguity.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert_contiguity.ll
index 79952d45c1464..f4f363b7e5c17 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert_contiguity.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert_contiguity.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load.ll
index 1f1a971c98b85..48bfa3ad25429 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load_as_masked.ll
index 84c2c2cadcd1a..e21a9ae64f0d7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load_as_masked.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load_as_masked.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load.ll
index ce30f09a99424..ce62413ee0f78 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load_as_masked.ll
index d095cfe7d104d..66c9e8a218134 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load_as_masked.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load_as_masked.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store.ll
index 265b4c9586159..1c5752a1da9c3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store_as_masked.ll
index b9440e1f9fd18..394bc03896118 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store_as_masked.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store_as_masked.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_internal_builtins.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_internal_builtins.ll
index 09058bc2f13f7..20733e5eeee21 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_internal_builtins.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_internal_builtins.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_gather_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_gather_load.ll
index ea009d8ab6739..e23e36465a672 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_gather_load.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_gather_load.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_load.ll
index 28767529ffa54..bd17447cb7889 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_load.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_load.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_scatter_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_scatter_store.ll
index beab4dbd5c4e6..bc33844fafee2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_scatter_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_scatter_store.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_store.ll
index e700c04555d97..21412fc239186 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_store.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store.ll
index 39cbd38d2cb5c..1f736694807fa 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store_as_masked.ll
index 281afd0867c02..326b7cf69d6a0 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store_as_masked.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store_as_masked.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_subgroup_scans.ll
index ca90efb1ecce7..860e7e226d59f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_subgroup_scans.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_subgroup_scans.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/delete_packetized_memop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/delete_packetized_memop.ll
index b3465aa688f19..1ce3ddc2368c6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/delete_packetized_memop.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/delete_packetized_memop.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/device-sg-size-auto.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/device-sg-size-auto.ll
index 48f17235bccf7..8bfa6cd569ea9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/device-sg-size-auto.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/device-sg-size-auto.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/divergent_loop_bug.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/divergent_loop_bug.ll
index 9eacfe58ca85c..36d32ee735e21 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/divergent_loop_bug.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/divergent_loop_bug.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_loop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_loop.ll
index 157e28cb1261c..f300fff8801f7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_loop.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_loop.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_nested_loop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_nested_loop.ll
index 5abfe81e27bf4..2f720c7a49ec0 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_nested_loop.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_nested_loop.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/early-cse-mul-swap.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/early-cse-mul-swap.ll
index 988d7926097e4..a20dc32f71b38 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/early-cse-mul-swap.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/early-cse-mul-swap.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_memintrinsics.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_memintrinsics.ll
index 2aa8fefac7a28..bc4270c9e2a8c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_memintrinsics.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_memintrinsics.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_no_unaligned_memintrinsics.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_no_unaligned_memintrinsics.ll
index 2f8e542423592..cf228937ec2bc 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_no_unaligned_memintrinsics.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_no_unaligned_memintrinsics.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/expect_assume.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/expect_assume.ll
index f114912fc0bbf..604fa9c86da29 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/expect_assume.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/expect_assume.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/extractelement_constant_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/extractelement_constant_index.ll
index f017218847a76..a9b0dbaad5388 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/extractelement_constant_index.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/extractelement_constant_index.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/extractelement_runtime_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/extractelement_runtime_index.ll
index 61927cde9dd53..4512408948dc3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/extractelement_runtime_index.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/extractelement_runtime_index.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_duplication.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_duplication.ll
index 7f766f04b74ca..55d15033ecfc2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_duplication.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_duplication.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_elim_opaque.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_elim_opaque.ll
index 1e738db395c2e..26608325b841f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_elim_opaque.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_elim_opaque.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/indirect_call.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/indirect_call.ll
index 2b2e85e7c4514..b45e215814d49 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/indirect_call.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/indirect_call.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/inlined_function_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/inlined_function_debug_info.ll
index 98fdad1edadbe..c14a5a421f95e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/inlined_function_debug_info.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/inlined_function_debug_info.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insert_element_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insert_element_debug_info.ll
index f88e542a6a86a..227f7f5280d06 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insert_element_debug_info.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insert_element_debug_info.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insertelement_constant_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insertelement_constant_index.ll
index 29f09991d0e7c..0ecccdb14e767 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insertelement_constant_index.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insertelement_constant_index.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insertelement_runtime_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insertelement_runtime_index.ll
index b6d927747a650..146f7d15f0d0d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insertelement_runtime_index.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insertelement_runtime_index.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/instantiate_constants.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/instantiate_constants.ll
index 3131dc3b75f60..16367f8197290 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/instantiate_constants.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/instantiate_constants.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_defuse_instantiated.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_defuse_instantiated.ll
index 72ca3181302aa..5f95b1edde16f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_defuse_instantiated.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_defuse_instantiated.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_load16.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_load16.ll
index d0b6cafc01a5f..ec254f12ab85f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_load16.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_load16.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_load_ooo.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_load_ooo.ll
index 6cc46cc26b748..d2fda25173763 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_load_ooo.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_load_ooo.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_safety.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_safety.ll
index a2a3fc4023ce6..17768daa08af3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_safety.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_safety.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics-scalarize.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics-scalarize.ll
index b85ec08e1b9bf..0b94abd180c31 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics-scalarize.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics-scalarize.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics.ll
index ec79e3578faa6..d74607eea657e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/invalid_cached_assumption_regression.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/invalid_cached_assumption_regression.ll
index 5ccbd9e0f6a25..5f68305bfc205 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/invalid_cached_assumption_regression.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/invalid_cached_assumption_regression.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/invalid_cached_vu_regression.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/invalid_cached_vu_regression.ll
index 9af73ae1d1d82..6b42e5fe4ca62 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/invalid_cached_vu_regression.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/invalid_cached_vu_regression.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/irreducible_loop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/irreducible_loop.ll
index e6d958c7e826d..e81e139e52dad 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/irreducible_loop.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/irreducible_loop.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/load_add_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/load_add_store.ll
index 436540a5163e9..4ffad2c31b104 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/load_add_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/load_add_store.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/loop_call_instantiation.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/loop_call_instantiation.ll
index 78c181ed64348..5f661497b794b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/loop_call_instantiation.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/loop_call_instantiation.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_atomics.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_atomics.ll
index 452dcae3f22c3..deef39666e8a1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_atomics.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_atomics.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_atomics_scalar.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_atomics_scalar.ll
index ffe4fd78b8419..5c061dadd28fc 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_atomics_scalar.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_atomics_scalar.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_calls_max_builtin.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_calls_max_builtin.ll
index c08d3b682f972..65811dcc45ff2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_calls_max_builtin.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_calls_max_builtin.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_cmpxchg.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_cmpxchg.ll
index 60d213f6879ba..bc6d2bf2b7ab7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_cmpxchg.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_cmpxchg.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_cmpxchg_scalar.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_cmpxchg_scalar.ll
index 148776ddda6dc..6340be83b9f66 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_cmpxchg_scalar.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_cmpxchg_scalar.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_group_collective.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_group_collective.ll
index 74c4ada6bb1c2..464c6b89db6d9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_group_collective.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_group_collective.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved.ll
index 43dcc6217ce77..f094ca8e2616a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_as_scatter.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_as_scatter.ll
index 11d14417f9ecf..1d822b93be424 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_as_scatter.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_as_scatter.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group.ll
index c61afc692155a..0e2d567fd426c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group2.ll
index 0890b70d7c6b9..5b7492f8c1761 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group2.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_store.ll
index cd1652f0d9910..5be3ef46596f0 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_store.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masking_exit_blocks.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masking_exit_blocks.ll
index 5b462f57ad479..19fb3bda34b03 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masking_exit_blocks.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masking_exit_blocks.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/mem2reg_alloca_pointer1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/mem2reg_alloca_pointer1.ll
index 4cee814b2d641..ac1d5fc674484 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/mem2reg_alloca_pointer1.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/mem2reg_alloca_pointer1.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/mem2reg_alloca_pointer2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/mem2reg_alloca_pointer2.ll
index bb6caaa83a627..060ca2bc249fd 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/mem2reg_alloca_pointer2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/mem2reg_alloca_pointer2.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride.ll
index 07f9f01299ba7..f3875519a10e8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride10.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride10.ll
index 4e87a4b86ed52..aa872c84a60b3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride10.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride10.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride11.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride11.ll
index 37804e47e003c..54d11670a365e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride11.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride11.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride12.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride12.ll
index d3caf8e11a9af..a9ed4f24f16ab 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride12.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride12.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride13.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride13.ll
index 4e87a4b86ed52..aa872c84a60b3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride13.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride13.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride14.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride14.ll
index d74a7821cee0c..ba49af776ff08 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride14.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride14.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride15.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride15.ll
index efc2d1b86ada0..0281dad79916b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride15.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride15.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride16.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride16.ll
index 89ddca506ba2c..d99f4a812a6ed 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride16.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride16.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride17.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride17.ll
index 84d223297a7f1..767d17bb96b86 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride17.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride17.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride18.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride18.ll
index 8876efb92b760..a2cf76be8fab5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride18.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride18.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride2.ll
index 5b155cea4399b..b6c74f6bfed51 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride2.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride3.ll
index 8b4de50d0fe52..ab533910ee8b2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride3.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride3.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride4.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride4.ll
index 5deaed20f4b2e..d1eb22ce6c643 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride4.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride4.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride5.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride5.ll
index 80a27d7a77a19..92dd028dc1ee0 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride5.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride5.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride6.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride6.ll
index 6f91e244d1c0c..1a0b92bfb652f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride6.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride6.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride7.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride7.ll
index 3d58276697b1b..4dc7b34841204 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride7.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride7.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride8.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride8.ll
index cb833c98b3bc8..549b3a30626dc 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride8.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride8.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride9.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride9.ll
index c3e4fa3945a59..744df39852de9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride9.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride9.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_exit_blocks.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_exit_blocks.ll
index 1f2db39dc3510..65031454b1470 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_exit_blocks.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_exit_blocks.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_kernels_inlining.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_kernels_inlining.ll
index 1ce80a13e77b7..dfb67303ad8ed 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_kernels_inlining.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_kernels_inlining.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorization_flags.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorization_flags.ll
index c6fdc014c284c..82088d13746cc 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorization_flags.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorization_flags.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations.ll
index ca48f1362b82c..a30fafd7a5b56 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations_nested.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations_nested.ll
index e735d164bdbeb..3aa408292b16e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations_nested.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations_nested.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations_vp.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations_vp.ll
index 341beb52832af..ed574554f9426 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations_vp.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations_vp.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_instantiate_memop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_instantiate_memop.ll
index 98206b26898d9..3656854643217 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_instantiate_memop.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_instantiate_memop.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_over_scalarization.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_over_scalarization.ll
index 172ba57d8d040..4ebcd9ec22693 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_over_scalarization.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_over_scalarization.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_vecz1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_vecz1.ll
index 082e4f15f7165..d61a641d5251e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_vecz1.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_vecz1.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_vecz2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_vecz2.ll
index 248c2e879b9e1..709ae760784a9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_vecz2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_vecz2.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/offset_info_analysis.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/offset_info_analysis.ll
index 3128c97e23b68..b455570f66c49 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/offset_info_analysis.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/offset_info_analysis.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfiniteDv4_d.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfiniteDv4_d.ll
index afc72a8fd7df9..fc1d4b9eac4b0 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfiniteDv4_d.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfiniteDv4_d.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfiniteDv4_f.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfiniteDv4_f.ll
index e4cdc728b8ad8..1cff6e5415803 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfiniteDv4_f.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfiniteDv4_f.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfinited.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfinited.ll
index 2a2a81848fe54..0ce5bfdfdd701 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfinited.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfinited.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfinitef.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfinitef.ll
index 951af5b9babd7..168bf625a4c37 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfinitef.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfinitef.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfDv4_d.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfDv4_d.ll
index 9081069d4d41a..c11210f1097ea 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfDv4_d.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfDv4_d.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfDv4_f.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfDv4_f.ll
index 5bff531a0c4df..67b641587a6af 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfDv4_f.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfDv4_f.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfd.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfd.ll
index 3468b00dfc5e9..56129f29e5ddd 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfd.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfd.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinff.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinff.ll
index b522285e862c1..ef9cadee9528c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinff.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinff.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanDv4_d.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanDv4_d.ll
index 065a63c9161af..75862737a2c86 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanDv4_d.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanDv4_d.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanDv4_f.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanDv4_f.ll
index e36282ee7bf4e..0d2c7e0073757 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanDv4_f.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanDv4_f.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnand.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnand.ll
index efcc8fd8ddff8..3b885da041f3f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnand.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnand.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanf.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanf.ll
index 51763b4ab8286..1a5b038b5489d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanf.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanf.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalDv4_d.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalDv4_d.ll
index 59a470ef3233e..6dee2711d597c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalDv4_d.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalDv4_d.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalDv4_f.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalDv4_f.ll
index 7928f829155c9..6ffb049b982e0 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalDv4_f.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalDv4_f.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormald.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormald.ll
index 98a7d4bede10d..880bb8d621d10 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormald.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormald.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalf.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalf.ll
index cd05f11142c55..0e0c0a7574e83 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalf.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalf.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/opencl_metadata1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/opencl_metadata1.ll
index 2cec2611d77f7..eaf7917c6dfa0 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/opencl_metadata1.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/opencl_metadata1.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/opencl_metadata2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/opencl_metadata2.ll
index 22595b3dfabee..0438341148fdc 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/opencl_metadata2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/opencl_metadata2.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/overaligned_allocas.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/overaligned_allocas.ll
index 2e67d46209a81..ae11c9692391e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/overaligned_allocas.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/overaligned_allocas.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_branch.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_branch.ll
index f6628bcf3f6e2..7631fc7818e40 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_branch.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_branch.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_debug_info.ll
index 4432b71a7d440..eaaba4463f9e0 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_debug_info.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_debug_info.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_nonvarying.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_nonvarying.ll
index d4f9e932c60b9..9750f6bae94d0 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_nonvarying.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_nonvarying.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_uniform_branch.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_uniform_branch.ll
index 1d343b32bef17..faf4e1cb43485 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_uniform_branch.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_uniform_branch.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_i48.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_i48.ll
index d86793e433c67..7b27991e6740f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_i48.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_i48.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_phi_struct.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_phi_struct.ll
index a18a022d0c607..4723d16da1af5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_phi_struct.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_phi_struct.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_struct_gep.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_struct_gep.ll
index 5046107c4ed33..c23396643f9d1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_struct_gep.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_struct_gep.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_conditional.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_conditional.ll
index 13bbe9556fb3a..019d77387fe7c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_conditional.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_conditional.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_conditional.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_conditional.ll
index 1d6962386952e..91cac389bbd0c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_conditional.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_conditional.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_noreduce.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_noreduce.ll
index 85466be8a1527..982b2352ced3a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_noreduce.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_noreduce.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_noreduce2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_noreduce2.ll
index 57ebb217c5ae6..2e5a7b31a1665 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_noreduce2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_noreduce2.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_reduce.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_reduce.ll
index d6399fa4ec372..a77b4e08bc1b7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_reduce.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_reduce.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_conditional.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_conditional.ll
index abb7b8efe8723..e23a4e52ab81e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_conditional.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_conditional.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_noreduce.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_noreduce.ll
index 8d25758ef4869..fd65118718f99 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_noreduce.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_noreduce.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_noreduce2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_noreduce2.ll
index 57101e9421d75..9e0a24b6879e6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_noreduce2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_noreduce2.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_reduce.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_reduce.ll
index 02e0405b267b3..b4b050210fb0c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_reduce.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_reduce.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_noreduce.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_noreduce.ll
index fc19214884173..93634442feb66 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_noreduce.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_noreduce.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_noreduce2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_noreduce2.ll
index 4b4ca63435695..716ee2540db66 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_noreduce2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_noreduce2.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_reduce.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_reduce.ll
index 0d71d96d7011e..61dd05df92f1a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_reduce.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_reduce.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization0.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization0.ll
index e4a19066e79bb..873ea7a983eae 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization0.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization0.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization1.ll
index a68b3d9eda689..7e5becea883fa 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization1.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization1.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization10.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization10.ll
index 3318c9d1e797b..17d186cc11900 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization10.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization10.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization11.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization11.ll
index 71d05eb3f69d3..7721a7577a09a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization11.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization11.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization12.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization12.ll
index 30b5c603d288b..be2f0f909e0c3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization12.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization12.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization13.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization13.ll
index fbef9b7593967..098fd2a80f5ea 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization13.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization13.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization14.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization14.ll
index c7fc571b91837..165092cd8c1ba 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization14.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization14.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization15.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization15.ll
index bdbf0d5c88b6b..96155f725946f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization15.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization15.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization16.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization16.ll
index 7bfeb0054f12b..48295e243c7ab 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization16.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization16.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization17.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization17.ll
index fe9c347315149..0ed3fe5c32596 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization17.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization17.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization18.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization18.ll
index 4e7d3dd3f6f64..903ba12b02fd9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization18.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization18.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization19.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization19.ll
index 3dd1c4adb4953..6810eb855c5f4 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization19.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization19.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization2.ll
index edce8b7c2e5a8..9e59e6bf7092b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization2.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization20.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization20.ll
index bb155865e445d..56369b161964e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization20.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization20.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization21.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization21.ll
index c6b2608c7604f..bc11225496785 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization21.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization21.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization22-llvm18.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization22-llvm18.ll
index 36cdfa9b7bdb4..26ca097f327c7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization22-llvm18.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization22-llvm18.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization22.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization22.ll
index 291dafd8e1456..2b78be90a7e76 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization22.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization22.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization23.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization23.ll
index 4dfe9cf837e7b..58a1f2548f38e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization23.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization23.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization3.ll
index 143b073882487..ffabf74a42b22 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization3.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization3.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization4.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization4.ll
index f223cea33546b..a9158f7ff59c7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization4.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization4.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization5.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization5.ll
index 31585396866e9..a65b8bad7dd25 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization5.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization5.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization6.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization6.ll
index 73dffa88a178c..5425139b5d888 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization6.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization6.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization7.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization7.ll
index 79279bc37c768..1c59a75ab15d8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization7.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization7.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization8.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization8.ll
index 78a21ccea682a..b5c22f6b5c588 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization8.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization8.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization9.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization9.ll
index 5ff8cfff49728..12ff83e3ac98d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization9.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization9.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization_exit_masks.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization_exit_masks.ll
index 111a4771b76f9..2f8b137532493 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization_exit_masks.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization_exit_masks.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline.ll
index a3dbc7703c5fc..08f72b45bf6de 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline_printafter.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline_printafter.ll
index d861f5f930442..2a0c3ad8f0b77 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline_printafter.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline_printafter.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_interleaved.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_interleaved.ll
index 954e6fe8aaaef..4b289100a3ffb 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_interleaved.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_interleaved.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_node_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_node_debug_info.ll
index 206b5b9e3eec3..8e6c5a0954238 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_node_debug_info.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_node_debug_info.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_scatter_gather.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_scatter_gather.ll
index bb6f070318c66..0885f8a058592 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_scatter_gather.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_scatter_gather.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_scatter_gather_2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_scatter_gather_2.ll
index 68b766a9b93aa..e13dc4ed88a66 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_scatter_gather_2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_scatter_gather_2.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/predicate_with_switch.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/predicate_with_switch.ll
index a946ab0eaf5f7..a0c94959a7fa2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/predicate_with_switch.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/predicate_with_switch.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/preserve-fast-math.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/preserve-fast-math.ll
index 9b280eca43289..34c892ca5dea6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/preserve-fast-math.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/preserve-fast-math.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/printf_float.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/printf_float.ll
index 44bd970d07cad..695e6d0a39696 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/printf_float.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/printf_float.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/regression_by_all.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/regression_by_all.ll
index 39be7c37e5468..533f710b34a01 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/regression_by_all.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/regression_by_all.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr.ll
index b04924ee08cc5..cc64e2641a2b0 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr_2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr_2.ll
index 2ca52052cabe6..8dd706f51977b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr_2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr_2.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr_phi.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr_phi.ll
index c62058b3c90b9..64234b9019781 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr_phi.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr_phi.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/reqd-sg-size-auto.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/reqd-sg-size-auto.ll
index b9d95d33360e1..7f4a881552699 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/reqd-sg-size-auto.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/reqd-sg-size-auto.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/roscc_simplify.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/roscc_simplify.ll
index ebf66549adb38..dcf78d89930d2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/roscc_simplify.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/roscc_simplify.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_load_store_in_varying_branch.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_load_store_in_varying_branch.ll
index a850d6f99ac6d..1c9a90a942684 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_load_store_in_varying_branch.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_load_store_in_varying_branch.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat.ll
index 39792aee4f089..fb2b8e8076f5f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_after_load_store_in_varying_branch.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_after_load_store_in_varying_branch.ll
index 89ddc9091c3a6..c563b79b6917e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_after_load_store_in_varying_branch.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_after_load_store_in_varying_branch.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_after_varying_branch.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_after_varying_branch.ll
index be8fd26dae033..62ea24d8e2c5e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_after_varying_branch.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_after_varying_branch.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_in_varying_branch.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_in_varying_branch.ll
index 43bf13e839152..0f67b19351679 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_in_varying_branch.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_in_varying_branch.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_vector_user.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_vector_user.ll
index 869b3a73e7840..4249ff6ce6435 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_vector_user.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_vector_user.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_calls.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_calls.ll
index 4bce1b7985b98..97ccb3494c1ac 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_calls.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_calls.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_calls_uniform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_calls_uniform.ll
index ceba451f571cf..f016562ea54ef 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_calls_uniform.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_calls_uniform.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_debug_info.ll
index 1848a31b1d4c7..85189b3651eb5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_debug_info.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_debug_info.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_instructions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_instructions.ll
index f286d5c81b408..3e4db8b32697c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_instructions.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_instructions.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_instructions_uniform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_instructions_uniform.ll
index 09f7f00ff9603..74bc119bb130c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_instructions_uniform.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_instructions_uniform.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_masked_load_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_masked_load_store.ll
index 7380496abd278..8de16c7ccfe09 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_masked_load_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_masked_load_store.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-bitcast.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-bitcast.ll
index 89556f91437a0..443104d84af75 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-bitcast.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-bitcast.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-gather.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-gather.ll
index b57d0df435528..7dd47ae4ffcc9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-gather.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-gather.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-gep.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-gep.ll
index 8b56e4d5aad5c..e50bc7584796d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-gep.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-gep.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-splat.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-splat.ll
index be26c56824cd3..492f39758391e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-splat.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-splat.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize_mixed_gep.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize_mixed_gep.ll
index d995fd652fd03..d7bbd4a2d9ed8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize_mixed_gep.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize_mixed_gep.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scan_fact.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scan_fact.ll
index 33e9cf9970c6f..39a73d0f013ba 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scan_fact.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scan_fact.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/secretly_scalar_load_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/secretly_scalar_load_store.ll
index 09135be13353a..e82e58b6ac662 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/secretly_scalar_load_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/secretly_scalar_load_store.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/select-no-crash.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/select-no-crash.ll
index 4f68469ff25ee..0b4377802877b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/select-no-crash.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/select-no-crash.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_1.ll
index fe988f643200f..2728251ca02b3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_1.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_1.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_2.ll
index 1593bdffa7087..b1082899dce4d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_2.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_3.ll
index 048713d1c16e5..373c37fb20114 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_3.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_3.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_4.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_4.ll
index 1cd2d4bc4574e..240d52a220cda 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_4.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_4.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_5.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_5.ll
index e7d298b2b07a0..23533d2130155 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_5.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_5.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_6.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_6.ll
index 6cd500dbafa72..12b0cca975cd0 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_6.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_6.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/simplify-masked-memops.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/simplify-masked-memops.ll
index ef84853e03bba..b28b347ade826 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/simplify-masked-memops.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/simplify-masked-memops.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext.ll
index a8eb1b595f9ce..dc7c2fed68520 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext_bigendian.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext_bigendian.ll
index b292e06626c50..c329b342b5835 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext_bigendian.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext_bigendian.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext.ll
index 4c4fd2a2b92d5..94e72dc92e09f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext_bigendian.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext_bigendian.ll
index a2ef40270c4ed..e336a961b2576 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext_bigendian.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext_bigendian.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_float2_gather.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_float2_gather.ll
index 01c0d0d79fff2..5615f7107d892 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_float2_gather.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_float2_gather.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned.ll
index 614e3d52c3202..5c62c31b2a6ad 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned_scalarized.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned_scalarized.ll
index 7d0d5ca6b77aa..82a81725f7320 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned_scalarized.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned_scalarized.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_analysis.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_analysis.ll
index 3bf5a299acc07..95dfb9f4ef732 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_analysis.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_analysis.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned.ll
index 58ec273625333..3dac89baae9d5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned_scalarized.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned_scalarized.ll
index 24c8521652f68..e9273ceaf80f2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned_scalarized.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned_scalarized.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_phi.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_phi.ll
index 801a622d56e54..5fe0c0296a512 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_phi.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_phi.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_select.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_select.ll
index d9e4308e07701..a4b88856af96a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_select.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_select.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_broadcast.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_broadcast.ll
index d2fc09ce1d187..bf1f2b19b178b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_broadcast.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_broadcast.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll
index 6460d40e1acae..d202a1f67fcbc 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions.ll
index 0bcb01081a3fb..3bc98c18d60a5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions_spv_khr_uniform_group_instructions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions_spv_khr_uniform_group_instructions.ll
index 8f5e65d11968a..4719739ded72b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions_spv_khr_uniform_group_instructions.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions_spv_khr_uniform_group_instructions.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_scans.ll
index 8722e6f13edcf..ad98dbfe5f788 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_scans.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_scans.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_scans_spv_khr_uniform_group_instructions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_scans_spv_khr_uniform_group_instructions.ll
index aa3de91133f61..691b7aba7100f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_scans_spv_khr_uniform_group_instructions.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_scans_spv_khr_uniform_group_instructions.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle.ll
index eae0ed336e8e1..cd5cd3b8f9bec 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_down.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_down.ll
index e2cc382506e6e..7eaf85a414023 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_down.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_down.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_up.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_up.ll
index 3c6650d26ac38..99f08c8efa9cc 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_up.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_up.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_xor.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_xor.ll
index 5f3f1815805ce..dec28cd3cdd5e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_xor.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_xor.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform.ll
index 1d12758ff31f6..2a8464528d01d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_different_strides.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_different_strides.ll
index 1fab059100023..69756d0886cc3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_different_strides.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_different_strides.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_divergent_gep.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_divergent_gep.ll
index d92bbe80429be..7636e5411a171 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_divergent_gep.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_divergent_gep.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_divergent_source.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_divergent_source.ll
index e26fcd8d807a2..02573c3ce0b59 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_divergent_source.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_divergent_source.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_negative.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_negative.ll
index 299dcf1978a90..fe73640be0612 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_negative.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_negative.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_positive.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_positive.ll
index 70e34a3cbfb30..6eff9b6ad58e4 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_positive.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_positive.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_cond_diff_strides.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_cond_diff_strides.ll
index 2252c075b39e2..8e88963b75871 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_cond_diff_strides.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_cond_diff_strides.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_condition.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_condition.ll
index 3bc49c589a3d5..3cee1ff3eb4b4 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_condition.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_condition.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_condition_packetized.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_condition_packetized.ll
index 36bf8e240fc54..1f2b59b23456d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_condition_packetized.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_condition_packetized.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_source.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_source.ll
index 704a1a82d28b8..a9d1a37b305b8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_source.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_source.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_sources.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_sources.ll
index 9a95a5b85d92e..b577f149f82e3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_sources.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_sources.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_packetization.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_packetization.ll
index 65b3015cf0289..8b5d83c3b6835 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_packetization.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_packetization.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_scalarization.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_scalarization.ll
index 2b51497d7158c..0f667a71134e5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_scalarization.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_scalarization.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/undef_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/undef_debug_info.ll
index 523a70fde3913..966bebfc59fe6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/undef_debug_info.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/undef_debug_info.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/undef_ub.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/undef_ub.ll
index a05dce1dc1481..c996ab108cda9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/undef_ub.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/undef_ub.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_base.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_base.ll
index a5d9c7b811555..d7b37641357b8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_base.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_base.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_index.ll
index a5d9c7b811555..d7b37641357b8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_index.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_index.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop.ll
index ca7a8c7a4cbe5..86e3d6145c4c3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi1.ll
index 1d34290dc3bb0..f6e12b7d83615 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi1.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi1.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi2.ll
index d970cc72f8b55..bc6dc059cb554 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi2.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi3.ll
index c1a92b1660c68..4baf7d5791f7b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi3.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi3.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi4.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi4.ll
index a7515f3c71f73..33033bd0d9518 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi4.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi4.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_metadata.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_metadata.ll
index 9b7640aa94c88..ac8cb69ee5fc5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_metadata.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_metadata.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation1.ll
index fa60304b44fbe..10f571c41fa94 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation1.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation1.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation2.ll
index 0db9a2703400e..fcbf4cf948fec 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation2.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation3.ll
index 2873b8759b971..e8a291afbfb7f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation3.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation3.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/unmangled_builtin_call.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/unmangled_builtin_call.ll
index a0973a053e579..e698f17df7339 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/unmangled_builtin_call.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/unmangled_builtin_call.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/user_calls.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/user_calls.ll
index 6871734fa38b2..ccc581108605a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/user_calls.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/user_calls.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/varying_load1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/varying_load1.ll
index 23bd57fb36b6b..2f68a9297f6b5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/varying_load1.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/varying_load1.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/varying_load2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/varying_load2.ll
index 252af7d45ca4e..5a90f9cdf0b55 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/varying_load2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/varying_load2.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_intrinsics_scalarization.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_intrinsics_scalarization.ll
index 11392ed374520..7755913a779a8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_intrinsics_scalarization.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_intrinsics_scalarization.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_uniform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_uniform.ll
index 9f7e53bff1ac7..fee72ee014ac9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_uniform.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_uniform.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_varying.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_varying.ll
index e5c8fe4263bc3..a1a0fed20654e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_varying.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_varying.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf.ll
index 082d251e544d1..5582091b8ccd5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf32.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf32.ll
index ad71e33a0eaa0..1e5257625ac75 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf32.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf32.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf64.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf64.ll
index 39ca271f5539d..d63db033b2971 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf64.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf64.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_def.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_def.ll
index db2b534f0069b..a426c804c1fe1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_def.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_def.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_floats.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_floats.ll
index 25b91286ace66..82b5926f3d280 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_floats.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_floats.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_floats_no_double_support.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_floats_no_double_support.ll
index 18249480262c7..d2010a9e95b6b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_floats_no_double_support.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_floats_no_double_support.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_size_1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_size_1.ll
index 3b6d84de22076..0a121a27a795d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_size_1.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_size_1.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_blend_div_loop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_blend_div_loop.ll
index 8a3f1de861de8..813dcfe9cc94a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_blend_div_loop.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_blend_div_loop.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_scalar_gather_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_scalar_gather_load.ll
index 71ab928440cb9..da33e218bbff8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_scalar_gather_load.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_scalar_gather_load.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_scalar_interleaved_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_scalar_interleaved_load.ll
index 73b4b679fa85b..a7f8ba693664d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_scalar_interleaved_load.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_scalar_interleaved_load.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/workgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/workgroup_scans.ll
index 0fbba9c59df96..2496b1b1d675d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/workgroup_scans.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/workgroup_scans.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/workitem_builtins.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/workitem_builtins.ll
index 856180d6c002c..3461a335d6845 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/workitem_builtins.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/workitem_builtins.ll
@@ -4,7 +4,7 @@
 ; Exceptions; you may not use this file except in compliance with the License.
 ; You may obtain a copy of the License at
 ;
-;     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 ;
 ; Unless required by applicable law or agreed to in writing, software
 ; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp
index ffeb4810af918..2b722b490582e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp
@@ -4,7 +4,7 @@
 // Exceptions; you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//     https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT

From 1213b0cbb225a5b5ca34108c94e97b07e21cc5be Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Tue, 26 Aug 2025 12:37:25 +0100
Subject: [PATCH 171/182] [LLVM 22] A few more lifetime fixes.

* Obtain lifetime_(start|end) pointer argument by getting last argument.
* Do not update size argument when there is no size argument.
* Update tests.
---
 .../vecz/source/transform/packetizer.cpp      | 32 +++++++++++++------
 .../vecz/test/lit/llvm/divergent_loop_bug.ll  |  4 +--
 2 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
index 9f8e9ed1a43e1..0cd291649ad7f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -2050,22 +2050,34 @@ ValuePacket Packetizer::Impl::packetizeCall(CallInst *CI) {
     auto IntrID = Intrinsic::ID(Callee->getIntrinsicID());
     if (IntrID == llvm::Intrinsic::lifetime_end ||
         IntrID == llvm::Intrinsic::lifetime_start) {
-      auto *ptr = CI->getOperand(1);
+      auto *ptr = CI->getArgOperand(CI->arg_size() - 1);
       if (auto *const bcast = dyn_cast<BitCastInst>(ptr)) {
         ptr = bcast->getOperand(0);
       }
 
       if (auto *const alloca = dyn_cast<AllocaInst>(ptr)) {
         if (!needsInstantiation(Ctx, *alloca)) {
-          // If it's an alloca we can widen, we can just change the size
-          const llvm::TypeSize allocSize =
-              Ctx.dataLayout()->getTypeAllocSize(alloca->getAllocatedType());
-          const auto lifeSize =
-              allocSize.isScalable() || SimdWidth.isScalable()
-                  ? -1
-                  : allocSize.getKnownMinValue() * SimdWidth.getKnownMinValue();
-          CI->setOperand(
-              0, ConstantInt::get(CI->getOperand(0)->getType(), lifeSize));
+#if LLVM_VERSION_GREATER_EQUAL(23, 0)
+          const bool HaveSizeArg = false;
+#elif LLVM_VERSION_GREATER_EQUAL(22, 0)
+          // TODO Remove runtime check when we no longer need to worry about
+          // older LLVM 22 snapshots.
+          const bool HaveSizeArg = CI->arg_size() == 2;
+#else
+          const bool HaveSizeArg = true;
+#endif
+          if (HaveSizeArg) {
+            // If it's an alloca we can widen, we can just change the size
+            const llvm::TypeSize allocSize =
+                Ctx.dataLayout()->getTypeAllocSize(alloca->getAllocatedType());
+            const auto lifeSize =
+                allocSize.isScalable() || SimdWidth.isScalable()
+                    ? -1
+                    : allocSize.getKnownMinValue() *
+                          SimdWidth.getKnownMinValue();
+            CI->setOperand(
+                0, ConstantInt::get(CI->getOperand(0)->getType(), lifeSize));
+          }
           results.push_back(CI);
         }
       }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/divergent_loop_bug.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/divergent_loop_bug.ll
index 36d32ee735e21..8160714150664 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/divergent_loop_bug.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/divergent_loop_bug.ll
@@ -48,7 +48,7 @@ entry.if.end17_crit_edge:                          ; preds = %entry
 ; %or.cond branch.
 ; CHECK: if.then:
 ; CHECK: call void @__vecz_b_masked_store4_fu3ptrb(float 0.000000e+00, ptr %cosa, i1 [[CMP_NOT_NOT]])
-; CHECK: %1 = call spir_func float @__vecz_b_masked__Z6sincosfPf(float 0.000000e+00, ptr nonnull %cosa, i1 [[CMP_NOT_NOT]]) #9
+; CHECK: %1 = call spir_func float @__vecz_b_masked__Z6sincosfPf(float 0.000000e+00, ptr nonnull %cosa, i1 [[CMP_NOT_NOT]])
 ; CHECK: %2 = call float @__vecz_b_masked_load4_fu3ptrb(ptr %cosa, i1 [[CMP_NOT_NOT]])
 ; CHECK: %mul7 = fmul float %2, -2.950000e+01
 ; CHECK: %cmp11 = fcmp uge float %mul7, 0.000000e+00
@@ -113,7 +113,7 @@ entry.if.end17_crit_edge:                          ; preds = %entry
 ; %or.cond branch.
 ; CHECK: if.then:
 ; CHECK: call void @__vecz_b_masked_store4_fu3ptrb(float 0.000000e+00, ptr %cosa, i1 [[CMP_NOT_NOT]])
-; CHECK: %1 = call spir_func float @__vecz_b_masked__Z6sincosfPf(float 0.000000e+00, ptr nonnull %cosa, i1 [[CMP_NOT_NOT]]) #9
+; CHECK: %1 = call spir_func float @__vecz_b_masked__Z6sincosfPf(float 0.000000e+00, ptr nonnull %cosa, i1 [[CMP_NOT_NOT]])
 ; CHECK: %2 = call float @__vecz_b_masked_load4_fu3ptrb(ptr %cosa, i1 [[CMP_NOT_NOT]])
 ; CHECK: %mul7 = fmul float %2, -2.950000e+01
 ; CHECK: %cmp11 = fcmp uge float %mul7, 0.000000e+00

From e1663a0f8c96b6d68cb1b47564c1f5190ca820ab Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Wed, 27 Aug 2025 10:32:49 +0100
Subject: [PATCH 172/182] Use poison rather than undef.

We were generating undef in many places because in older versions of
LLVM, poison did not yet exist. As we no longer support those older
versions of LLVM, and current LLVM is deprecating undef, switch over to
poison.
---
 .../source/cl_builtin_info.cpp                | 10 ++++-----
 .../source/pass_functions.cpp                 |  4 ++--
 ...lace_local_module_scope_variables_pass.cpp |  2 +-
 .../vecz/source/ir_cleanup.cpp                |  6 +++---
 .../vecz/source/llvm_helpers.cpp              |  4 ++--
 .../transform/packetization_helpers.cpp       | 19 +++++++++--------
 .../vecz/source/transform/packetizer.cpp      | 21 ++++++++++---------
 .../vecz/source/transform/passes.cpp          |  2 +-
 .../vecz/source/transform/scalarizer.cpp      |  6 +++---
 .../transform/simplify_infinite_loop_pass.cpp |  2 +-
 .../vecz/source/vector_target_info.cpp        | 21 ++++++++++---------
 .../vecz/source/vector_target_info_riscv.cpp  | 18 ++++++++--------
 .../vecz/source/vectorization_context.cpp     |  2 +-
 .../vecz/test/lit/llvm/Boscc/boscc_merge3.ll  |  6 +++---
 .../vector_phi_uniform.ll                     |  4 ++--
 .../test/lit/llvm/RISCV/broadcast_vector.ll   | 12 +++++------
 .../lit/llvm/RISCV/define_subgroup_scans.ll   |  8 +++----
 .../llvm/RISCV/define_subgroup_scans_vp.ll    | 10 ++++-----
 .../test/lit/llvm/RISCV/extract_element.ll    |  8 +++----
 .../test/lit/llvm/RISCV/insert_element.ll     |  2 +-
 .../test/lit/llvm/RISCV/packetize_shuffle.ll  |  4 ++--
 .../lit/llvm/RISCV/packetize_shuffle_bool.ll  |  4 ++--
 .../llvm/RISCV/packetize_shuffle_concat.ll    |  4 ++--
 .../llvm/RISCV/packetize_shuffle_narrow.ll    |  4 ++--
 .../lit/llvm/RISCV/packetize_shuffle_wider.ll |  4 ++--
 .../lit/llvm/RISCV/select_scalar_vector.ll    |  2 +-
 .../llvm/ScalableVectors/broadcast_vector.ll  |  2 +-
 .../define_masked_scatter_gather.ll           |  2 +-
 .../ScalableVectors/define_subgroup_scans.ll  |  6 +++---
 .../define_subgroup_scans_vp.ll               |  6 +++---
 .../ScalableVectors/packetize_mask_varying.ll |  2 +-
 .../test/lit/llvm/ScalableVectors/shuffle.ll  |  8 +++----
 .../define_subgroup_scans.ll                  |  6 +++---
 .../packetize_mask_varying.ll                 |  2 +-
 .../extractelement_constant_index.ll          |  2 +-
 .../extractelement_runtime_index.ll           |  2 +-
 .../extractelement_runtime_index2.ll          |  2 +-
 .../extractelement_runtime_index3.ll          |  2 +-
 .../insertelement_constant_index.ll           |  2 +-
 .../llvm/VectorWidening/scalar_vector_user.ll |  2 +-
 .../llvm/VectorWidening/vector_phi_varying.ll |  8 +++----
 .../lit/llvm/VectorWidening/widen_fmuladd2.ll | 16 +++++++-------
 .../vecz/test/lit/llvm/alloca_alias.ll        |  2 +-
 .../lit/llvm/define_gather_load_as_masked.ll  |  2 +-
 .../test/lit/llvm/define_interleaved_load.ll  |  2 +-
 .../llvm/define_interleaved_load_as_masked.ll |  2 +-
 .../lit/llvm/define_masked_gather_load.ll     |  2 +-
 .../test/lit/llvm/define_subgroup_scans.ll    | 10 ++++-----
 .../lit/llvm/insert_element_debug_info.ll     |  2 +-
 .../vecz/test/lit/llvm/scalar_vector_user.ll  |  2 +-
 .../llvm/scalarization_masked_load_store.ll   |  4 ++--
 .../vecz/test/lit/llvm/scalarize-gep.ll       |  4 ++--
 .../vecz/test/lit/llvm/stride_aligned.ll      |  2 +-
 .../lit/llvm/stride_aligned_scalarized.ll     |  2 +-
 .../vecz/test/lit/llvm/stride_misaligned.ll   |  2 +-
 .../lit/llvm/stride_misaligned_scalarized.ll  |  2 +-
 .../vecz/test/lit/llvm/subgroup_shuffle.ll    | 10 ++++-----
 .../test/lit/llvm/subgroup_shuffle_down.ll    |  8 +++----
 .../vecz/test/lit/llvm/subgroup_shuffle_up.ll |  8 +++----
 .../test/lit/llvm/subgroup_shuffle_xor.ll     | 16 +++++++-------
 .../vecz/test/lit/llvm/vector_phi_uniform.ll  |  4 ++--
 .../vecz/test/lit/llvm/vector_phi_varying.ll  |  8 +++----
 .../lit/llvm/vecz_scalar_interleaved_load.ll  |  2 +-
 63 files changed, 179 insertions(+), 176 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp
index be330e550855e..a60dc0deb09ec 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp
@@ -1691,7 +1691,7 @@ Value *CLBuiltinInfo::emitBuiltinInlineCross(IRBuilder<> &B,
     Src1Lanes.push_back(B.CreateExtractElement(Src1, B.getInt32(i)));
   }
 
-  Value *Result = UndefValue::get(RetTy);
+  Value *Result = PoisonValue::get(RetTy);
   for (unsigned i = 0; i < 3; i++) {
     const int Idx0 = SrcIndices[(i * 2) + 0];
     const int Idx1 = SrcIndices[(i * 2) + 1];
@@ -2035,11 +2035,11 @@ Value *CLBuiltinInfo::emitBuiltinInlineAs(Function *F, llvm::IRBuilder<> &B,
       if (i < SrcVecTy->getNumElements()) {
         Indices.push_back(B.getInt32(i));
       } else {
-        Indices.push_back(UndefValue::get(B.getInt32Ty()));
+        Indices.push_back(PoisonValue::get(B.getInt32Ty()));
       }
     }
     Value *Mask = ConstantVector::get(Indices);
-    Src = B.CreateShuffleVector(Src, UndefValue::get(SrcVecTy), Mask);
+    Src = B.CreateShuffleVector(Src, PoisonValue::get(SrcVecTy), Mask);
   }
 
   // Common case: as_* is a simple bitcast.
@@ -2139,7 +2139,7 @@ Value *CLBuiltinInfo::emitBuiltinInlineVLoad(Function *F, unsigned Width,
     return nullptr;
   }
   auto *DataTy = FixedVectorType::get(EltTy, Width);
-  Value *Data = UndefValue::get(DataTy);
+  Value *Data = PoisonValue::get(DataTy);
 
   // Emit the base pointer.
   Value *Offset = Args[0];
@@ -2596,7 +2596,7 @@ Value *CLBuiltinInfo::emitBuiltinInlineShuffle(BuiltinID BuiltinID,
       MaskedMask, FixedVectorType::get(B.getInt32Ty(), MaskWidth), false);
 
   // Create the shufflevector instruction.
-  Value *Arg1 = (isShuffle2 ? Args[1] : UndefValue::get(ShuffleTy));
+  Value *Arg1 = (isShuffle2 ? Args[1] : PoisonValue::get(ShuffleTy));
   return B.CreateShuffleVector(Args[0], Arg1, MaskedMask, "shuffle");
 }
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp
index 842d870738dd6..ce84591aaf7af 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp
@@ -204,7 +204,7 @@ void replaceConstantExpressionWithInstruction(llvm::Constant *const constant) {
       // InsertElement to place it in a new vector and the second is a
       // ShuffleVector to duplicate the value across the vector.
       auto numEls = constantVec->getNumOperands();
-      llvm::Value *undef = llvm::UndefValue::get(
+      llvm::Value *undef = llvm::PoisonValue::get(
           llvm::FixedVectorType::get(splatVal->getType(), numEls));
       llvm::Type *i32Ty = llvm::Type::getInt32Ty(constant->getContext());
       auto insert = llvm::InsertElementInst::Create(
@@ -217,7 +217,7 @@ void replaceConstantExpressionWithInstruction(llvm::Constant *const constant) {
     } else if (llvm::ConstantArray *constantArr =
                    llvm::dyn_cast<llvm::ConstantArray>(constant)) {
       auto numEls = constantArr->getNumOperands();
-      llvm::Value *undef = llvm::UndefValue::get(constantArr->getType());
+      llvm::Value *undef = llvm::PoisonValue::get(constantArr->getType());
       llvm::Instruction *insertedIns = nullptr;
       for (unsigned int i = 0; i < numEls; i++) {
         auto *insertNext = llvm::InsertValueInst::Create(
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp
index 510db56fd498a..e8f6152b685f0 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp
@@ -484,7 +484,7 @@ PreservedAnalyses compiler::utils::ReplaceLocalModuleScopeVariablesPass::run(
           castedLocal->insertBefore(inst->getIterator());
 
           auto indexTy = Type::getInt32Ty(M.getContext());
-          Value *newCv = UndefValue::get(cv->getType());
+          Value *newCv = PoisonValue::get(cv->getType());
 
           // We can't simply 'setOperand' in a 'ConstantVector'. We have to
           // recreate it from scratch.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/ir_cleanup.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/ir_cleanup.cpp
index 72a421ef10545..f7b3317f3b681 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/ir_cleanup.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/ir_cleanup.cpp
@@ -97,7 +97,7 @@ void IRCleanup::deleteInstructions() {
       } else if (PHINode *Phi = dyn_cast<PHINode>(I)) {
         if (AreUsersDead(Phi, InstructionsToDelete, WorkList,
                          VisitedForCycles)) {
-          Phi->replaceAllUsesWith(UndefValue::get(Phi->getType()));
+          Phi->replaceAllUsesWith(PoisonValue::get(Phi->getType()));
           Phi->eraseFromParent();
           progress = true;
         } else {
@@ -117,7 +117,7 @@ void IRCleanup::deleteInstructions() {
         if (Op && Op->isLoad()) {
           // We need to replace loads with nops, as we need to have a value for
           // their users, which will be removed later on.
-          I->replaceAllUsesWith(UndefValue::get(Op->getDataType()));
+          I->replaceAllUsesWith(PoisonValue::get(Op->getDataType()));
           I->eraseFromParent();
         } else {
           WorkList.insert(I);
@@ -138,6 +138,6 @@ void IRCleanup::deleteInstructions() {
 }
 
 void IRCleanup::deleteInstructionNow(Instruction *I) {
-  I->replaceAllUsesWith(UndefValue::get(I->getType()));
+  I->replaceAllUsesWith(PoisonValue::get(I->getType()));
   I->eraseFromParent();
 }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/llvm_helpers.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/llvm_helpers.cpp
index d3b2271d549c1..a6252e834ad43 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/llvm_helpers.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/llvm_helpers.cpp
@@ -50,7 +50,7 @@ FixedVectorType *vecz::getVectorType(Value *V) {
 /// @param[in] T Type to get default value of.
 /// @param[in] V Default value to use for numeric type
 ///
-/// @return Default value, which will be undef for non-numeric types
+/// @return Default value, which will be poison for non-numeric types
 Value *vecz::getDefaultValue(Type *T, uint64_t V) {
   if (T->isIntegerTy()) {
     return ConstantInt::get(T, V);
@@ -60,7 +60,7 @@ Value *vecz::getDefaultValue(Type *T, uint64_t V) {
     return ConstantFP::get(T, V);
   }
 
-  return UndefValue::get(T);
+  return PoisonValue::get(T);
 }
 
 /// @brief Get the shuffle mask as sequence of integers.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
index cab8b13f1236b..c0e996593ebee 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
@@ -106,7 +106,7 @@ Value *createOptimalShuffle(IRBuilder<> &B, Value *srcA, Value *srcB,
   if (shuffleA && isa<UndefValue>(srcB)) {
     auto *const srcMask = getShuffleMask(shuffleA);
     auto *const newMask = ConstantExpr::getShuffleVector(
-        srcMask, UndefValue::get(srcMask->getType()), maskC);
+        srcMask, PoisonValue::get(srcMask->getType()), maskC);
 
     return B.CreateShuffleVector(shuffleA->getOperand(0),
                                  shuffleA->getOperand(1), newMask, name);
@@ -229,7 +229,7 @@ bool createSubSplats(const vecz::TargetInfo &TI, IRBuilder<> &B,
     }
   }
 
-  auto *undef = UndefValue::get(srcs.front()->getType());
+  auto *undef = PoisonValue::get(srcs.front()->getType());
   for (auto &src : srcs) {
     src = createOptimalShuffle(B, src, undef, mask);
   }
@@ -385,7 +385,8 @@ Value *Packetizer::Result::getAsValue() const {
   auto name = scalar->getName();
 
   if (FixedVectorType::isValidElementType(eleTy)) {
-    Value *gather = UndefValue::get(FixedVectorType::get(eleTy, packet.size()));
+    Value *gather =
+        PoisonValue::get(FixedVectorType::get(eleTy, packet.size()));
 
     IRBuilder<> B(buildAfter(packet.back(), packetizer.F));
     for (unsigned i = 0; i < packet.size(); i++) {
@@ -408,7 +409,7 @@ Value *Packetizer::Result::getAsValue() const {
     info->vector = B.CreateShuffleVector(parts[0], parts[1], mask,
                                          Twine(name, ".concatenate"));
   } else {
-    Value *gather = UndefValue::get(ArrayType::get(eleTy, packet.size()));
+    Value *gather = PoisonValue::get(ArrayType::get(eleTy, packet.size()));
 
     IRBuilder<> B(buildAfter(packet.back(), packetizer.F));
     for (unsigned i = 0; i < packet.size(); i++) {
@@ -455,7 +456,7 @@ PacketRange Packetizer::Result::getAsPacket(unsigned width) const {
     assert(isa<FixedVectorType>(vecTy) && "Must be a fixed vector type here!");
     const unsigned scalarWidth = vecTy->getNumElements() / width;
     if (scalarWidth > 1 || scalar->getType()->isVectorTy()) {
-      auto *const undef = UndefValue::get(vec->getType());
+      auto *const undef = PoisonValue::get(vec->getType());
 
       // Build shuffle mask to perform the subvector extracts.
       IRBuilder<> B(buildAfter(vec, packetizer.F));
@@ -535,7 +536,7 @@ PacketRange Packetizer::Result::widen(unsigned width) const {
   auto *it = parts.begin();
   IRBuilder<> B(buildAfter(parts.back(), packetizer.F));
   if (newWidth > 1) {
-    auto *const undef = UndefValue::get(vecTy);
+    auto *const undef = PoisonValue::get(vecTy);
 
     // Build shuffle mask to perform the subvector extracts.
     for (size_t i = 0, origIdx = 0; i < width; ++i) {
@@ -580,7 +581,7 @@ PacketRange Packetizer::Result::narrow(unsigned width) const {
     // Build vectors out of pairs of scalar values
     const auto name = scalar->getName();
     IRBuilder<> B(buildAfter(parts.back(), packetizer.F));
-    Value *undef = UndefValue::get(FixedVectorType::get(ty, 2));
+    Value *undef = PoisonValue::get(FixedVectorType::get(ty, 2));
     for (size_t i = 0, pairIdx = 0; i < width; ++i, pairIdx += 2) {
       Value *in = B.CreateInsertElement(undef, parts[pairIdx], B.getInt32(0),
                                         Twine(name, ".gather"));
@@ -684,7 +685,7 @@ const Packetizer::Result &Packetizer::Result::broadcast(unsigned width) const {
   if (isa<PoisonValue>(scalar)) {
     result = PoisonValue::get(getWideType(ty, factor));
   } else if (isa<UndefValue>(scalar)) {
-    result = UndefValue::get(getWideType(ty, factor));
+    result = PoisonValue::get(getWideType(ty, factor));
   } else if (ty->isVectorTy() && factor.isScalable()) {
     IRBuilder<> B(buildAfter(scalar, F));
     result = createScalableBroadcastOfFixedVector(TI, B, scalar, factor);
@@ -703,7 +704,7 @@ const Packetizer::Result &Packetizer::Result::broadcast(unsigned width) const {
     }
 
     IRBuilder<> B(buildAfter(scalar, packetizer.F));
-    result = createOptimalShuffle(B, scalar, UndefValue::get(ty), mask,
+    result = createOptimalShuffle(B, scalar, PoisonValue::get(ty), mask,
                                   Twine(scalar->getName(), ".broadcast"));
   } else if (auto *const C = dyn_cast<Constant>(scalar)) {
     result = ConstantVector::getSplat(factor, C);
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
index 0cd291649ad7f..ec7bae09a73ec 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -1491,7 +1491,7 @@ Value *Packetizer::Impl::packetizeSubgroupShuffle(Instruction *I) {
     // It's possible that for some targets and for some combinations of vector
     // width and vectorization factor, that going through memory would be
     // faster.
-    Value *ExtractedVec = UndefValue::get(DataVecTy);
+    Value *ExtractedVec = PoisonValue::get(DataVecTy);
     const unsigned DataNumElts = DataVecTy->getElementCount().getFixedValue();
     auto *const BaseIdx = B.CreateMul(VecIdx, B.getInt32(DataNumElts));
     for (unsigned i = 0; i < DataNumElts; i++) {
@@ -1625,7 +1625,7 @@ Packetizer::Result Packetizer::Impl::packetizeSubgroupShuffleXor(
   assert(RegularShuffleFn);
 
   auto *const VecData = PackData.getAsValue();
-  Value *CombinedShuffle = UndefValue::get(VecData->getType());
+  Value *CombinedShuffle = PoisonValue::get(VecData->getType());
 
   for (unsigned i = 0; i < VF; i++) {
     auto *Idx = B.getInt32(i);
@@ -1638,7 +1638,7 @@ Packetizer::Result Packetizer::Impl::packetizeSubgroupShuffleXor(
     if (auto *DataVecTy = dyn_cast<VectorType>(Data->getType()); !DataVecTy) {
       DataElt = B.CreateExtractElement(VecData, VecGroupIdx);
     } else {
-      DataElt = UndefValue::get(DataVecTy);
+      DataElt = PoisonValue::get(DataVecTy);
       auto VecWidth = DataVecTy->getElementCount().getFixedValue();
       // VecGroupIdx is the 'base' of the subvector, whose elements are stored
       // sequentially from that point.
@@ -1893,7 +1893,7 @@ Packetizer::Result Packetizer::Impl::packetizeSubgroupShuffleUpDown(
     } else {
       // For vector data types we need to extract consecutive elements starting
       // at the sub-vector whose index is Idx.
-      Elt = UndefValue::get(DataVecTy);
+      Elt = PoisonValue::get(DataVecTy);
       auto VecWidth = DataVecTy->getElementCount().getFixedValue();
       // Idx is the 'base' of the subvector, whose elements are stored
       // sequentially from that point.
@@ -2624,7 +2624,7 @@ ValuePacket Packetizer::Impl::packetizeMemOp(MemOp &op) {
       auto *const newPtrTy = FixedVectorType::get(ptrTy, simdWidth);
 
       auto *const idxVector = ConstantVector::get(indices);
-      auto *const undef = UndefValue::get(newPtrTy);
+      auto *const undef = PoisonValue::get(newPtrTy);
       for (auto &vecPtr : ptrPacket) {
         vecPtr = B.CreateBitCast(vecPtr, newPtrTy);
         vecPtr = B.CreateShuffleVector(vecPtr, undef, widenMask);
@@ -2792,7 +2792,7 @@ ValuePacket Packetizer::Impl::packetizeMemOp(MemOp &op) {
           }
         }
 
-        auto *const undef = UndefValue::get(maskPacket.front()->getType());
+        auto *const undef = PoisonValue::get(maskPacket.front()->getType());
         for (auto &vecMask : maskPacket) {
           vecMask = createOptimalShuffle(B, vecMask, undef, widenMask);
         }
@@ -3533,7 +3533,7 @@ Value *Packetizer::Impl::vectorizeExtractValue(ExtractValueInst *ExtractValue) {
   }
 
   Type *CompositeTy = getWideType(Extracts[0]->getType(), SimdWidth);
-  Value *Result = UndefValue::get(CompositeTy);
+  Value *Result = PoisonValue::get(CompositeTy);
   for (decltype(Width) i = 0; i < Width; i++) {
     Result = B.CreateInsertElement(Result, Extracts[i], B.getInt32(i));
   }
@@ -3655,7 +3655,7 @@ ValuePacket Packetizer::Impl::packetizeInsertElement(
         Mask.push_back(i / ScalarWidth);
       }
 
-      auto *Undef = UndefValue::get(Elts.front()->getType());
+      auto *Undef = PoisonValue::get(Elts.front()->getType());
       for (unsigned i = 0; i < packetWidth; ++i) {
         results.push_back(createOptimalShuffle(B, Elts[i], Undef, Mask, Name));
       }
@@ -3792,7 +3792,7 @@ ValuePacket Packetizer::Impl::packetizeExtractElement(
 
     auto resultWidth = Width / packetWidth;
     if (packetWidth == 1) {
-      srcVals.push_back(UndefValue::get(srcVals.front()->getType()));
+      srcVals.push_back(PoisonValue::get(srcVals.front()->getType()));
     } else {
       resultWidth *= 2;
     }
@@ -3813,7 +3813,8 @@ ValuePacket Packetizer::Impl::packetizeExtractElement(
     Value *Indices = packetizeIfVarying(Index);
     PACK_FAIL_IF(!Indices);
 
-    Result = UndefValue::get(getWideType(ExtractElement->getType(), SimdWidth));
+    Result =
+        PoisonValue::get(getWideType(ExtractElement->getType(), SimdWidth));
     if (Indices != Index) {
       Type *IdxTy = Index->getType();
       SmallVector<Constant *, 16> Offsets;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/passes.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/passes.cpp
index 2b8edb6f41cbc..d24d615dcf764 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/passes.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/passes.cpp
@@ -114,7 +114,7 @@ PreservedAnalyses SimplifyMaskedMemOpsPass::run(Function &F,
       if (CMask->isZeroValue()) {
         // A null mask means no lane executes the memory operation.
         if (BuiltinDesc->isLoad()) {
-          CI->replaceAllUsesWith(UndefValue::get(BuiltinDesc->getDataType()));
+          CI->replaceAllUsesWith(PoisonValue::get(BuiltinDesc->getDataType()));
         }
         ToDelete.push_back(CI);
       } else if (CMask->isAllOnesValue()) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
index 7b147b238ea26..5b79287c14959 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
@@ -129,7 +129,7 @@ Value *Scalarizer::getGather(Value *V) {
                B.CreateVectorSplat(ElementCount::getFixed(P->size()), splat);
   }
 
-  Value *Result = UndefValue::get(V->getType());
+  Value *Result = PoisonValue::get(V->getType());
   for (unsigned i = 0; i < P->size(); i++) {
     if (auto *At = P->at(i)) {
       if (!isa<UndefValue>(At)) {
@@ -452,7 +452,7 @@ Value *Scalarizer::scalarizeOperandsExtractElement(ExtractElementInst *Extr) {
     VECZ_FAIL_IF(!OrigVecPacket);
 
     IRBuilder<> B(Extr);
-    Value *Select = UndefValue::get(Extr->getType());
+    Value *Select = PoisonValue::get(Extr->getType());
     for (unsigned lane = 0; lane < VecWidth; lane++) {
       // Check if the the lane matches the extract index and select
       // the corresponding value
@@ -1415,7 +1415,7 @@ SimdPacket *Scalarizer::scalarizeShuffleVector(ShuffleVectorInst *Shuffle,
     Value *Extracted = nullptr;
     int MaskLane = Shuffle->getMaskValue(i);
     if (MaskLane < 0) {
-      Extracted = UndefValue::get(VecTy->getElementType());
+      Extracted = PoisonValue::get(VecTy->getElementType());
     } else if (MaskLane >= (int)SrcWidth) {
       MaskLane -= (int)SrcWidth;
       if (RHSPacket) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/simplify_infinite_loop_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/simplify_infinite_loop_pass.cpp
index 96fe0429af36f..f94e60a3645a1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/simplify_infinite_loop_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/simplify_infinite_loop_pass.cpp
@@ -78,7 +78,7 @@ PreservedAnalyses vecz::SimplifyInfiniteLoopPass::run(
     // predecessor to it.
     for (Instruction &I : *target) {
       if (auto *PHI = dyn_cast<PHINode>(&I)) {
-        PHI->addIncoming(UndefValue::get(PHI->getType()), virtualExit);
+        PHI->addIncoming(PoisonValue::get(PHI->getType()), virtualExit);
       }
     }
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
index f1c02a2d134df..c26d30fe90cb8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
@@ -132,7 +132,7 @@ Value *TargetInfo::createLoad(IRBuilder<> &B, Type *Ty, Value *Ptr,
   }
 
   // Create a vector out of these values.
-  Value *Result = UndefValue::get(Ty);
+  Value *Result = PoisonValue::get(Ty);
   for (unsigned i = 0; i < SimdWidth; i++) {
     Result = B.CreateInsertElement(Result, Values[i], B.getInt32(i));
   }
@@ -268,7 +268,7 @@ Value *TargetInfo::createMaskedLoad(IRBuilder<> &B, Type *Ty, Value *Ptr,
   }
   Exit = BasicBlock::Create(Ctx, "masked_load_exit", F);
 
-  Constant *const DefaultEleData = UndefValue::get(EleTy);
+  Constant *const DefaultEleData = PoisonValue::get(EleTy);
   SmallVector<Value *, 4> LoadedLanes;
   SmallVector<Value *, 4> LanePhis;
   for (unsigned i = 0; i < Width; i++) {
@@ -307,7 +307,7 @@ Value *TargetInfo::createMaskedLoad(IRBuilder<> &B, Type *Ty, Value *Ptr,
 
   Value *Result = nullptr;
   if (Width > 1) {
-    Result = UndefValue::get(Ty);
+    Result = PoisonValue::get(Ty);
     for (unsigned i = 0; i < Width; i++) {
       Result = B.CreateInsertElement(Result, LanePhis[i], B.getInt32(i));
     }
@@ -494,7 +494,7 @@ Value *TargetInfo::createMaskedGatherLoad(IRBuilder<> &B, Type *Ty, Value *Ptr,
   PointerType *PtrTy = dyn_cast<PointerType>(VecPtrTy->getElementType());
   VECZ_FAIL_IF(!PtrTy);
   Type *EleTy = Ty->getScalarType();
-  Constant *DefaultEleData = UndefValue::get(EleTy);
+  Constant *DefaultEleData = PoisonValue::get(EleTy);
 
   if (Ty->isVectorTy()) {
     const auto Legality =
@@ -511,7 +511,8 @@ Value *TargetInfo::createMaskedGatherLoad(IRBuilder<> &B, Type *Ty, Value *Ptr,
         Mask = applyEVLToMask(B, EVL, Mask);
         VECZ_FAIL_IF(!Mask);
         // Create the call to the function
-        Value *Args[] = {Ptr, B.getInt32(Alignment), Mask, UndefValue::get(Ty)};
+        Value *Args[] = {Ptr, B.getInt32(Alignment), Mask,
+                         PoisonValue::get(Ty)};
         CallInst *CI = B.CreateCall(MaskedGather, Args);
         if (CI) {
           CI->setCallingConv(MaskedGather->getCallingConv());
@@ -575,7 +576,7 @@ Value *TargetInfo::createMaskedGatherLoad(IRBuilder<> &B, Type *Ty, Value *Ptr,
   LastLanePhi->addIncoming(LoadedLanes[Width - 1], LoadBlocks[Width - 1]);
   LastLanePhi->addIncoming(DefaultEleData, TestBlocks[Width - 1]);
   LanePhis.push_back(LastLanePhi);
-  Value *Result = UndefValue::get(Ty);
+  Value *Result = PoisonValue::get(Ty);
   for (unsigned i = 0; i < Width; i++) {
     Result = B.CreateInsertElement(Result, LanePhis[i], B.getInt32(i));
   }
@@ -781,7 +782,7 @@ Value *TargetInfo::createScalableBroadcast(IRBuilder<> &B, Value *vector,
   // Set the alignment to that of vector element type.
   auto alignment = MaybeAlign(eltTy->getScalarSizeInBits() / 8).valueOrOne();
   return B.CreateMaskedGather(wideTy, gep, alignment, mask,
-                              UndefValue::get(wideTy));
+                              PoisonValue::get(wideTy));
 }
 
 Value *TargetInfo::createBroadcastIndexVector(IRBuilder<> &B, Type *ty,
@@ -943,7 +944,7 @@ llvm::Value *TargetInfo::createVectorShuffle(llvm::IRBuilder<> &B,
 
   if (isa<Constant>(mask)) {
     // Special case if the mask happens to be a constant.
-    return B.CreateShuffleVector(src, UndefValue::get(srcTy), mask);
+    return B.CreateShuffleVector(src, PoisonValue::get(srcTy), mask);
   }
 
   // The alloca must be inserted at the beginning of the function.
@@ -982,7 +983,7 @@ llvm::Value *TargetInfo::createVectorShuffle(llvm::IRBuilder<> &B,
   }
 
   return B.CreateMaskedGather(dstTy, gep, alignment, gatherMask,
-                              UndefValue::get(dstTy));
+                              PoisonValue::get(dstTy));
 }
 
 llvm::Value *TargetInfo::createVectorSlideUp(llvm::IRBuilder<> &B,
@@ -993,7 +994,7 @@ llvm::Value *TargetInfo::createVectorSlideUp(llvm::IRBuilder<> &B,
   assert(srcTy &&
          "TargetInfo::createVectorShuffle: source must have vector type");
 
-  auto *const undef = UndefValue::get(srcTy);
+  auto *const undef = PoisonValue::get(srcTy);
   const auto EC = srcTy->getElementCount();
   if (!EC.isScalable()) {
     // Special case for fixed-width vectors
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp
index 34a0e0f5b5834..66906b1cdac34 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp
@@ -357,8 +357,8 @@ llvm::Value *TargetInfoRISCV::createScalableExtractElement(
                                  indices, zero);
 
   SmallVector<Value *, 4> ops;
-  // Add the a pass-through operand - we set it to undef.
-  ops.push_back(UndefValue::get(srcTy));
+  // Add the pass-through operand - we set it to poison.
+  ops.push_back(PoisonValue::get(srcTy));
   ops.push_back(src);
   ops.push_back(indices);
   ops.push_back(avl);
@@ -423,8 +423,8 @@ llvm::Value *TargetInfoRISCV::createScalableBroadcast(llvm::IRBuilder<> &B,
   auto *const avl = getIntrinsicVL(B, VL, wideTy, getTargetMachine());
 
   SmallVector<Value *, 4> ops;
-  // Add the pass-through operand - we set it to undef.
-  ops.push_back(UndefValue::get(vs2->getType()));
+  // Add the pass-through operand - we set it to poison.
+  ops.push_back(PoisonValue::get(vs2->getType()));
   ops.push_back(vs2);
   ops.push_back(vs1);
   ops.push_back(avl);
@@ -574,7 +574,7 @@ llvm::Value *TargetInfoRISCV::createVectorShuffle(llvm::IRBuilder<> &B,
   auto *const srcTy = cast<VectorType>(src->getType());
   if (isa<Constant>(mask)) {
     // Special case if the mask happens to be a constant.
-    return B.CreateShuffleVector(src, UndefValue::get(srcTy), mask);
+    return B.CreateShuffleVector(src, PoisonValue::get(srcTy), mask);
   }
 
   if (isa<FixedVectorType>(srcTy)) {
@@ -645,8 +645,8 @@ llvm::Value *TargetInfoRISCV::createVectorShuffle(llvm::IRBuilder<> &B,
   auto *const avl = getIntrinsicVL(B, VL, gatherTy, getTargetMachine());
 
   SmallVector<Value *, 4> ops;
-  // Add the pass-through operand - we set it to undef.
-  ops.push_back(UndefValue::get(gatherTy));
+  // Add the pass-through operand - we set it to poison.
+  ops.push_back(PoisonValue::get(gatherTy));
   ops.push_back(src);
   ops.push_back(mask);
   ops.push_back(avl);
@@ -678,8 +678,8 @@ llvm::Value *TargetInfoRISCV::createVectorSlideUp(llvm::IRBuilder<> &B,
   auto *const avl = getIntrinsicVL(B, VL, srcTy, getTargetMachine());
 
   SmallVector<Value *, 4> ops;
-  // Add the pass-through operand - we set it to undef.
-  ops.push_back(UndefValue::get(srcTy));
+  // Add the pass-through operand - we set it to poison.
+  ops.push_back(PoisonValue::get(srcTy));
   ops.push_back(src);
   ops.push_back(insert);
   ops.push_back(avl);
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
index a15c0d0dabca5..39939e7385782 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
@@ -860,7 +860,7 @@ bool VectorizationContext::emitSubgroupScanBody(Function &F, bool IsInclusive,
   if (!EC.isScalable() && !IsVP) {
     auto *const NeutralVal = compiler::utils::getNeutralVal(OpKind, EltTy);
     const auto Width = EC.getFixedValue();
-    auto *const UndefVal = UndefValue::get(VecTy);
+    auto *const UndefVal = PoisonValue::get(VecTy);
 
     // Put the Neutral element in a vector so we can shuffle it in.
     auto *const NeutralVec =
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge3.ll
index a4d4b1f888f7c..c73edafd0548d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge3.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge3.ll
@@ -46,7 +46,7 @@ if.then2:                                     ; preds = %if.then1
 
 if.then3:                                     ; preds = %x51
   %load1 = load float, float addrspace(1)* %gep2, align 4
-  %ie_load1 = insertelement <4 x float> undef, float %load1, i32 0
+  %ie_load1 = insertelement <4 x float> poison, float %load1, i32 0
   br label %if.end2
 
 if.else3:                                    ; preds = %x51
@@ -60,8 +60,8 @@ if.then4:                                    ; preds = %x175
 
 if.end2:                                    ; preds = %x274, %x271, %if.then4, %x175, %x155, %x132
   %phi_gep2_load = phi <4 x float> [ %ie_load1, %if.then3 ], [ %vload2, %if.then4 ], [ %vload1, %if.else3 ]
-  %ie_m = insertelement <4 x float> undef, float %m, i32 0
-  %shuffle_ie_m = shufflevector <4 x float> %ie_m, <4 x float> undef, <4 x i32> zeroinitializer
+  %ie_m = insertelement <4 x float> poison, float %m, i32 0
+  %shuffle_ie_m = shufflevector <4 x float> %ie_m, <4 x float> poison, <4 x i32> zeroinitializer
   %fmul = fmul <4 x float> %shuffle_ie_m, %phi_gep2_load
   br label %if.end1
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_uniform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_uniform.ll
index fee72ee014ac9..7d9b0385dbb90 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_uniform.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_uniform.ll
@@ -31,8 +31,8 @@ for.cond:                                         ; preds = %entry, %for.body
   %storemerge = phi <4 x i32> [ %inc, %for.body ], [ zeroinitializer, %entry ]
   %call1 = call i64 @__mux_get_global_size(i32 0)
   %conv = trunc i64 %call1 to i32
-  %splat.splatinsert = insertelement <4 x i32> undef, i32 %conv, i32 0
-  %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %splat.splatinsert = insertelement <4 x i32> poison, i32 %conv, i32 0
+  %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
   %cmp2 = icmp slt <4 x i32> %storemerge, %splat.splat
   %0 = extractelement <4 x i1> %cmp2, i64 0
   br i1 %0, label %for.body, label %for.end
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/broadcast_vector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/broadcast_vector.ll
index 8cba4e91c50bd..d8d02153048e0 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/broadcast_vector.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/broadcast_vector.ll
@@ -124,7 +124,7 @@ entry:
 ; CHECK-NEXT:  [[VS1:%.*]] = and <vscale x 16 x i32> [[IDX0]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 3, {{i32|i64}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
 ; CHECK-NEXT:  [[XLEN:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:  [[TMP0:%.*]] = shl i64 [[XLEN]], 4
-; CHECK-NEXT:  [[TMP1:%.*]] = call <vscale x 16 x float> @llvm.riscv.vrgather.vv.nxv16f32.i64(<vscale x 16 x float> undef, <vscale x 16 x float> [[VS2]], <vscale x 16 x i32> [[VS1]], i64 [[TMP0]])
+; CHECK-NEXT:  [[TMP1:%.*]] = call <vscale x 16 x float> @llvm.riscv.vrgather.vv.nxv16f32.i64(<vscale x 16 x float> poison, <vscale x 16 x float> [[VS2]], <vscale x 16 x i32> [[VS1]], i64 [[TMP0]])
 ; CHECK-NEXT:  [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0)
 ; CHECK-NEXT:  [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]]
 ; CHECK-NEXT:  [[TMP3:%.*]] = load <vscale x 16 x float>, ptr addrspace(1) [[ARRAYIDX]], align 4
@@ -141,7 +141,7 @@ entry:
 ; CHECK-NEXT:    [[IDX1:%.*]] = and <vscale x 128 x i32> [[IDX0]], shufflevector (<vscale x 128 x i32> insertelement (<vscale x 128 x i32> {{(undef|poison)}}, i32 31, {{i32|i64}} 0), <vscale x 128 x i32> {{(undef|poison)}}, <vscale x 128 x i32> zeroinitializer)
 ; CHECK-NEXT:    [[TMP0:%.*]] = {{s|z}}ext{{( nneg)?}} <vscale x 128 x i32> [[IDX1]] to <vscale x 128 x i64>
 ; CHECK-NEXT:    [[VEC_ALLOC:%.*]] = getelementptr inbounds float, ptr [[FIXLEN_ALLOC]], <vscale x 128 x i64> [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 128 x float> @llvm.masked.gather.nxv128f32.nxv128p0(<vscale x 128 x ptr> [[VEC_ALLOC]], i32 4, <vscale x 128 x i1> shufflevector (<vscale x 128 x i1> insertelement (<vscale x 128 x i1> poison, i1 true, {{i32|i64}} 0), <vscale x 128 x i1> poison, <vscale x 128 x i32> zeroinitializer), <vscale x 128 x float> undef)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 128 x float> @llvm.masked.gather.nxv128f32.nxv128p0(<vscale x 128 x ptr> [[VEC_ALLOC]], i32 4, <vscale x 128 x i1> shufflevector (<vscale x 128 x i1> insertelement (<vscale x 128 x i1> poison, i1 true, {{i32|i64}} 0), <vscale x 128 x i1> poison, <vscale x 128 x i32> zeroinitializer), <vscale x 128 x float> poison)
 ; CHECK-NEXT:    [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0)
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds <32 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <vscale x 128 x float>, ptr addrspace(1) [[ARRAYIDX]], align 4
@@ -175,13 +175,13 @@ entry:
 ; CHECK-NEXT:  [[VS13:%.*]] = and <vscale x 16 x i32> [[IDX02]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 3, {{i32|i64}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
 ; CHECK-NEXT:  [[XLEN:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:  [[TMP0:%.*]] = shl i64 [[XLEN]], 4
-; CHECK-NEXT:  [[TMP1:%.*]] = call <vscale x 16 x float> @llvm.riscv.vrgather.vv.nxv16f32.i64(<vscale x 16 x float> undef, <vscale x 16 x float> [[VS21]], <vscale x 16 x i32> [[VS13]], i64 [[TMP0]])
+; CHECK-NEXT:  [[TMP1:%.*]] = call <vscale x 16 x float> @llvm.riscv.vrgather.vv.nxv16f32.i64(<vscale x 16 x float> poison, <vscale x 16 x float> [[VS21]], <vscale x 16 x i32> [[VS13]], i64 [[TMP0]])
 ; CHECK-NEXT:  [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0)
 ; CHECK-NEXT:  store <4 x i32> zeroinitializer, ptr [[EXISTINGALLOC]], align 16
 ; CHECK-NEXT:  store i32 1, ptr [[EXISTINGALLOC]], align 16
 ; CHECK-NEXT:  [[V:%.*]] = load <4 x i32>, ptr [[EXISTINGALLOC]], align 16
 ; CHECK-NEXT:  [[VS2:%.*]] = call <vscale x 16 x i32> @llvm.{{(experimental.)?}}vector.insert.nxv16i32.v4i32(<vscale x 16 x i32> poison, <4 x i32> [[V]], i64 0)
-; CHECK-NEXT:  [[TMP2:%.*]] = call <vscale x 16 x i32> @llvm.riscv.vrgather.vv.nxv16i32.i64(<vscale x 16 x i32> undef, <vscale x 16 x i32> [[VS2]], <vscale x 16 x i32> [[VS13]], i64 [[TMP0]])
+; CHECK-NEXT:  [[TMP2:%.*]] = call <vscale x 16 x i32> @llvm.riscv.vrgather.vv.nxv16i32.i64(<vscale x 16 x i32> poison, <vscale x 16 x i32> [[VS2]], <vscale x 16 x i32> [[VS13]], i64 [[TMP0]])
 ; CHECK-NEXT:  [[ARRAYIDX4:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[OUT2:%.*]], i64 [[CALL]]
 ; CHECK-NEXT:  store <vscale x 16 x i32> [[TMP2]], ptr addrspace(1) [[ARRAYIDX4]], align 16
 ; CHECK-NEXT:  [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]]
@@ -198,12 +198,12 @@ entry:
 ; CHECK-NEXT:    [[VS13:%.*]] = and <vscale x 16 x i32> [[IDX02]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 3, {{i32|i64}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
 ; CHECK-NEXT:    [[XLEN4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP0:%.*]] = shl i64 [[XLEN4]], 4
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 16 x float> @llvm.riscv.vrgather.vv.nxv16f32.i64(<vscale x 16 x float> undef, <vscale x 16 x float> [[VS21]], <vscale x 16 x i32> [[VS13]], i64 [[TMP0]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 16 x float> @llvm.riscv.vrgather.vv.nxv16f32.i64(<vscale x 16 x float> poison, <vscale x 16 x float> [[VS21]], <vscale x 16 x i32> [[VS13]], i64 [[TMP0]])
 ; CHECK-NEXT:    [[TMP2:%.*]] = sext <4 x i1> [[INPUT:%.*]] to <4 x i8>
 ; CHECK-NEXT:    [[VS2:%.*]] = call <vscale x 16 x i8> @llvm.{{(experimental.)?}}vector.insert.nxv16i8.v4i8(<vscale x 16 x i8> poison, <4 x i8> [[TMP2]], i64 0)
 ; CHECK-NEXT:    [[IDX0:%.*]] = call <vscale x 16 x i16> @llvm.experimental.stepvector.nxv16i16()
 ; CHECK-NEXT:    [[VS1:%.*]] = and <vscale x 16 x i16> [[IDX0]], shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> {{(undef|poison)}}, i16 3, {{i32|i64}} 0), <vscale x 16 x i16> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 16 x i8> @llvm.riscv.vrgatherei16.vv.nxv16i8.i64(<vscale x 16 x i8> undef, <vscale x 16 x i8> [[VS2]], <vscale x 16 x i16> [[VS1]], i64 [[TMP0]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 16 x i8> @llvm.riscv.vrgatherei16.vv.nxv16i8.i64(<vscale x 16 x i8> poison, <vscale x 16 x i8> [[VS2]], <vscale x 16 x i16> [[VS1]], i64 [[TMP0]])
 ; CHECK: [[TMP4:%.*]] = trunc <vscale x 16 x i8> [[TMP3]] to <vscale x 16 x i1>
 ; CHECK: [[TMP5:%.*]] = fcmp oeq <vscale x 16 x float>
 ; CHECK: [[TMP8:%.*]] = and <vscale x 16 x i1> [[TMP5]], [[TMP4]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans.ll
index 775a370dc8f9b..938100148aaca 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans.ll
@@ -56,7 +56,7 @@ declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_add_u5nxv4j(<vscal
 ;------- target-dependent dynamic shuffle code:
 ; CHECK:   %[[VLSCALE:.+]] = call i64 @llvm.vscale.i64()
 ; CHECK:   %[[VL:.+]] = mul i64 %[[VLSCALE]], 4
-; CHECK:   %[[SHUFFLE:.+]] = call <vscale x 4 x i32> @llvm.riscv.vrgather.vv.nxv4i32.i64({{(<vscale x 4 x i32> undef, )?}}<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %[[MASK]], i64 %[[VL]])
+; CHECK:   %[[SHUFFLE:.+]] = call <vscale x 4 x i32> @llvm.riscv.vrgather.vv.nxv4i32.i64({{(<vscale x 4 x i32> poison, )?}}<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %[[MASK]], i64 %[[VL]])
 
 ; CHECK:   %[[ACCUM:.+]] = add <vscale x 4 x i32> %[[VEC]], %{{.+}}
 ; CHECK:   %[[BIT:.+]] = and <vscale x 4 x i32> %[[MASKPHI]], %[[N_SPLAT]]
@@ -89,7 +89,7 @@ declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_add_u5nxv4j(<vscal
 ;------- target-dependent dynamic shuffle code:
 ; CHECK:   %[[VLSCALE:.+]] = call i64 @llvm.vscale.i64()
 ; CHECK:   %[[VL:.+]] = mul i64 %[[VLSCALE]], 4
-; CHECK:   %[[SHUFFLE:.+]] = call <vscale x 4 x i32> @llvm.riscv.vrgather.vv.nxv4i32.i64({{(<vscale x 4 x i32> undef, )?}}<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %[[MASK]], i64 %[[VL]])
+; CHECK:   %[[SHUFFLE:.+]] = call <vscale x 4 x i32> @llvm.riscv.vrgather.vv.nxv4i32.i64({{(<vscale x 4 x i32> poison, )?}}<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %[[MASK]], i64 %[[VL]])
 
 ; CHECK:   %[[ACCUM:.+]] = add <vscale x 4 x i32> %[[VEC]], %{{.+}}
 ; CHECK:   %[[BIT:.+]] = and <vscale x 4 x i32> %[[MASKPHI]], %[[N_SPLAT]]
@@ -105,7 +105,7 @@ declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_add_u5nxv4j(<vscal
 ;------- target-dependent slide-up code:
 ; CHECK:   %[[VLSCALE2:.+]] = call i64 @llvm.vscale.i64()
 ; CHECK:   %[[VL2:.+]] = mul i64 %[[VLSCALE2]], 4
-; CHECK:   %[[RESULT:.+]] = call <vscale x 4 x i32> @llvm.riscv.vslide1up.nxv4i32.i32.i64({{(<vscale x 4 x i32> undef, )?}}<vscale x 4 x i32> %[[SCAN]], i32 0, i64 %[[VL2]])
+; CHECK:   %[[RESULT:.+]] = call <vscale x 4 x i32> @llvm.riscv.vslide1up.nxv4i32.i32.i64({{(<vscale x 4 x i32> poison, )?}}<vscale x 4 x i32> %[[SCAN]], i32 0, i64 %[[VL2]])
 
 ; CHECK:   ret <vscale x 4 x i32> %[[RESULT]]
 ; CHECK: }
@@ -126,7 +126,7 @@ declare <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_add_u5nxv4f(<vsc
 ; CHECK:   %{{.+}} = fadd <vscale x 4 x float> %[[VEC]], %{{.+}}
 
 ; Make sure the floating point version of the slide1up intrinsic is created
-; CHECK:   call <vscale x 4 x float> @llvm.riscv.vfslide1up.nxv4f32.f32.i64({{(<vscale x 4 x float> undef, )?}}<vscale x 4 x float> %{{.+}}, float 0.000000e+00, i64 %{{.+}})
+; CHECK:   call <vscale x 4 x float> @llvm.riscv.vfslide1up.nxv4f32.f32.i64({{(<vscale x 4 x float> poison, )?}}<vscale x 4 x float> %{{.+}}, float 0.000000e+00, i64 %{{.+}})
 
 declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_u5nxv4j(<vscale x 4 x i32>)
 ; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_u5nxv4j(<vscale x 4 x i32>{{.*}})
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans_vp.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans_vp.ll
index 55c4486376589..e04749c9803e6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans_vp.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans_vp.ll
@@ -54,7 +54,7 @@ declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4jj(<v
 
 ;------- target-dependent dynamic shuffle code:
 ; CHECK:   %[[VL:.+]] = zext i32 %1 to i64
-; CHECK:   %[[SHUFFLE:.+]] = call <vscale x 4 x i32> @llvm.riscv.vrgather.vv.nxv4i32.i64({{(<vscale x 4 x i32> undef, )?}}<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %[[MASK]], i64 %[[VL]])
+; CHECK:   %[[SHUFFLE:.+]] = call <vscale x 4 x i32> @llvm.riscv.vrgather.vv.nxv4i32.i64({{(<vscale x 4 x i32> poison, )?}}<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %[[MASK]], i64 %[[VL]])
 
 ; CHECK:   %[[ACCUM:.+]] = add <vscale x 4 x i32> %[[VEC]], %[[SHUFFLE]]
 ; CHECK:   %[[BIT:.+]] = and <vscale x 4 x i32> %[[MASKPHI]], %[[N_SPLAT]]
@@ -84,7 +84,7 @@ declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_add_vp_u5nxv4jj(<v
 
 ;------- target-dependent dynamic shuffle code:
 ; CHECK:   %[[VL:.+]] = zext i32 %1 to i64
-; CHECK:   %[[SHUFFLE:.+]] = call <vscale x 4 x i32> @llvm.riscv.vrgather.vv.nxv4i32.i64({{(<vscale x 4 x i32> undef, )?}}<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %[[MASK]], i64 %[[VL]])
+; CHECK:   %[[SHUFFLE:.+]] = call <vscale x 4 x i32> @llvm.riscv.vrgather.vv.nxv4i32.i64({{(<vscale x 4 x i32> poison, )?}}<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %[[MASK]], i64 %[[VL]])
 
 ; CHECK:   %[[ACCUM:.+]] = add <vscale x 4 x i32> %[[VEC]], %{{.+}}
 ; CHECK:   %[[BIT:.+]] = and <vscale x 4 x i32> %[[MASKPHI]], %[[N_SPLAT]]
@@ -99,7 +99,7 @@ declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_add_vp_u5nxv4jj(<v
 
 ;------- target-dependent slide-up code:
 ; CHECK:   %[[VL2:.+]] = zext i32 %1 to i64
-; CHECK:   %[[RESULT:.+]] = call <vscale x 4 x i32> @llvm.riscv.vslide1up.nxv4i32.i32.i64({{(<vscale x 4 x i32> undef, )?}}<vscale x 4 x i32> %[[SCAN]], i32 0, i64 %[[VL2]])
+; CHECK:   %[[RESULT:.+]] = call <vscale x 4 x i32> @llvm.riscv.vslide1up.nxv4i32.i32.i64({{(<vscale x 4 x i32> poison, )?}}<vscale x 4 x i32> %[[SCAN]], i32 0, i64 %[[VL2]])
 
 ; CHECK:   ret <vscale x 4 x i32> %[[RESULT]]
 ; CHECK: }
@@ -120,7 +120,7 @@ declare <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_add_vp_u5nxv4fj(
 ; CHECK:   %{{.+}} = fadd <vscale x 4 x float> %[[VEC]], %{{.+}}
 
 ; Make sure the floating point version of the slide1up intrinsic is created
-; CHECK:   call <vscale x 4 x float> @llvm.riscv.vfslide1up.nxv4f32.f32.i64({{(<vscale x 4 x float> undef, )?}}<vscale x 4 x float> %{{.+}}, float 0.000000e+00, i64 %{{.+}})
+; CHECK:   call <vscale x 4 x float> @llvm.riscv.vfslide1up.nxv4f32.f32.i64({{(<vscale x 4 x float> poison, )?}}<vscale x 4 x float> %{{.+}}, float 0.000000e+00, i64 %{{.+}})
 
 declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_vp_u5nxv4jj(<vscale x 4 x i32>, i32)
 ; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_vp_u5nxv4jj(<vscale x 4 x i32>{{.*}}, i32{{.*}})
@@ -181,4 +181,4 @@ declare <vscale x 4 x double> @__vecz_b_sub_group_scan_exclusive_min_vp_u5nxv4dj
 ; CHECK: loop:
 ; CHECK:   %[[VEC:.+]] = phi <vscale x 4 x double> [ %0, %entry ],
 ; CHECK:   %{{.+}} = call <vscale x 4 x double> @llvm.minnum.nxv4f64(<vscale x 4 x double> %[[VEC]], <vscale x 4 x double> %{{.+}})
-; CHECK:   call <vscale x 4 x double> @llvm.riscv.vfslide1up.nxv4f64.f64.i64({{(<vscale x 4 x double> undef, )?}}<vscale x 4 x double> %{{.+}}, double 0x7FF0000000000000, i64 %{{.+}})
+; CHECK:   call <vscale x 4 x double> @llvm.riscv.vfslide1up.nxv4f64.f64.i64({{(<vscale x 4 x double> poison, )?}}<vscale x 4 x double> %{{.+}}, double 0x7FF0000000000000, i64 %{{.+}})
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/extract_element.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/extract_element.ll
index 292130f4298d1..63ff3edfca734 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/extract_element.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/extract_element.ll
@@ -114,7 +114,7 @@ entry:
 ; EE-NEXT:    [[IDXSCALE:%.*]] = shl <vscale x 4 x i32> [[IDX0]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> {{(undef|poison)}}, i32 2, {{(i32|i64)}} 0), <vscale x 4 x i32> {{(undef|poison)}}, <vscale x 4 x i32> zeroinitializer)
 ; EE-NEXT:    [[VS1:%.*]] = add <vscale x 4 x i32> [[IDXSCALE]], [[SPLAT]]
 ; EE-NEXT:    [[T3:%.*]] = call <vscale x 16 x i32> @llvm.{{(experimental.)?}}vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[VS1]], i64 0)
-; EE-NEXT:    [[T4:%.*]] = call <vscale x 16 x float> @llvm.riscv.vrgather.vv.nxv16f32.i64(<vscale x 16 x float> undef, <vscale x 16 x float> [[T1:%.*]], <vscale x 16 x i32> [[T3]], i64 [[TMP2]])
+; EE-NEXT:    [[T4:%.*]] = call <vscale x 16 x float> @llvm.riscv.vrgather.vv.nxv16f32.i64(<vscale x 16 x float> poison, <vscale x 16 x float> [[T1:%.*]], <vscale x 16 x i32> [[T3]], i64 [[TMP2]])
 ; EE-NEXT:    [[T5:%.*]] = call <vscale x 4 x float> @llvm.{{(experimental.)?}}vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[T4]], i64 0)
 
 ; Both the vector and index are uniform, so check we're not unnecessarily packetizing 
@@ -140,7 +140,7 @@ entry:
 ; EE-UNI-VEC-NEXT:    [[VS1:%.*]] = {{add|or}} <vscale x 4 x i32> [[IDXSCALE]], [[I1]]
 
 ; EE-UNI-VEC-NEXT:    [[T4:%.*]] = call <vscale x 16 x i32> @llvm.{{(experimental.)?}}vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[VS1]], i64 0)
-; EE-UNI-VEC-NEXT:    [[T5:%.*]] = call <vscale x 16 x float> @llvm.riscv.vrgather.vv.nxv16f32.i64(<vscale x 16 x float> undef, <vscale x 16 x float> [[T1:%.*]], <vscale x 16 x i32> [[T4]], i64 [[T3]])
+; EE-UNI-VEC-NEXT:    [[T5:%.*]] = call <vscale x 16 x float> @llvm.riscv.vrgather.vv.nxv16f32.i64(<vscale x 16 x float> poison, <vscale x 16 x float> [[T1:%.*]], <vscale x 16 x i32> [[T4]], i64 [[T3]])
 ; EE-UNI-VEC-NEXT:    [[T6:%.*]] = call <vscale x 4 x float> @llvm.{{(experimental.)?}}vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[T5]], i64 0)
 
 ; EE-INDICES-LABEL: @__vecz_nxv4_extract_element_varying_indices(
@@ -150,7 +150,7 @@ entry:
 ; EE-INDICES-NEXT:    [[IDXSCALE:%.*]] = shl <vscale x 4 x i32> [[IDX0]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> {{(undef|poison)}}, i32 2, {{(i32|i64)}} 0), <vscale x 4 x i32> {{(undef|poison)}}, <vscale x 4 x i32> zeroinitializer)
 ; EE-INDICES-NEXT:    [[VS1:%.*]] = {{add|or}} <vscale x 4 x i32> [[IDXSCALE]], [[I1:%.*]]
 ; EE-INDICES-NEXT:    [[T5:%.*]] = call <vscale x 16 x i32> @llvm.{{(experimental.)?}}vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[VS1]], i64 0)
-; EE-INDICES-NEXT:    [[T6:%.*]] = call <vscale x 16 x float> @llvm.riscv.vrgather.vv.nxv16f32.i64(<vscale x 16 x float> undef, <vscale x 16 x float> [[T3:%.*]], <vscale x 16 x i32> [[T5]], i64 [[T4]])
+; EE-INDICES-NEXT:    [[T6:%.*]] = call <vscale x 16 x float> @llvm.riscv.vrgather.vv.nxv16f32.i64(<vscale x 16 x float> poison, <vscale x 16 x float> [[T3:%.*]], <vscale x 16 x i32> [[T5]], i64 [[T4]])
 ; EE-INDICES-NEXT:    [[T7:%.*]] = call <vscale x 4 x float> @llvm.{{(experimental.)?}}vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[T6]], i64 0)
 
 ; Check we promote from i1 to i8 before doing our memops and use vrgatherei16.
@@ -164,6 +164,6 @@ entry:
 ; EE-BOOL-NEXT:  [[T11:%.*]] = shl <vscale x 4 x i16> [[T10]], shufflevector (<vscale x 4 x i16> insertelement (<vscale x 4 x i16> {{(undef|poison)}}, i16 2, {{(i32|i64)}} 0), <vscale x 4 x i16> {{(undef|poison)}}, <vscale x 4 x i32> zeroinitializer)
 ; EE-BOOL-NEXT:  [[VS1:%.*]] = {{add|or}} <vscale x 4 x i16> [[T11]], [[T9]]
 ; EE-BOOL-NEXT:  [[T12:%.*]] = call <vscale x 16 x i16> @llvm.{{(experimental.)?}}vector.insert.nxv16i16.nxv4i16(<vscale x 16 x i16> poison, <vscale x 4 x i16> [[VS1]], i64 0)
-; EE-BOOL-NEXT:  [[T13:%.*]] = call <vscale x 16 x i8> @llvm.riscv.vrgatherei16.vv.nxv16i8.i64(<vscale x 16 x i8> undef, <vscale x 16 x i8> [[T6]], <vscale x 16 x i16> [[T12]], i64 [[T7]])
+; EE-BOOL-NEXT:  [[T13:%.*]] = call <vscale x 16 x i8> @llvm.riscv.vrgatherei16.vv.nxv16i8.i64(<vscale x 16 x i8> poison, <vscale x 16 x i8> [[T6]], <vscale x 16 x i16> [[T12]], i64 [[T7]])
 ; EE-BOOL-NEXT:  [[T14:%.*]] = call <vscale x 4 x i8> @llvm.{{(experimental.)?}}vector.extract.nxv4i8.nxv16i8(<vscale x 16 x i8> [[T13]], i64 0)
 ; EE-BOOL-NEXT:  [[T15:%.*]] = trunc <vscale x 4 x i8> [[T14]] to <vscale x 4 x i1>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/insert_element.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/insert_element.ll
index c4b19b54dc990..2343a0d950e16 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/insert_element.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/insert_element.ll
@@ -122,7 +122,7 @@ entry:
 ; IE-INDICES-NEXT:    [[VS2:%.*]] = call <vscale x 16 x i32> @llvm.{{(experimental.)?}}vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> {{%.*}}, i64 0)
 ; IE-INDICES:         [[IDX0:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
 ; IE-INDICES-NEXT:    [[IDX1:%.*]] = lshr <vscale x 16 x i32> [[IDX0]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 2, {{(i32|i64)}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
-; IE-INDICES-NEXT:    [[TMP9:%.*]] = call <vscale x 16 x i32> @llvm.riscv.vrgather.vv.nxv16i32.i64(<vscale x 16 x i32> undef, <vscale x 16 x i32> [[VS2:%.*]], <vscale x 16 x i32> [[IDX1]], i64 [[TMP5]])
+; IE-INDICES-NEXT:    [[TMP9:%.*]] = call <vscale x 16 x i32> @llvm.riscv.vrgather.vv.nxv16i32.i64(<vscale x 16 x i32> poison, <vscale x 16 x i32> [[VS2:%.*]], <vscale x 16 x i32> [[IDX1]], i64 [[TMP5]])
 ; IE-INDICES-NEXT:    [[VS25:%.*]] = call <vscale x 16 x float> @llvm.{{(experimental.)?}}vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[FIDX2]], i64 0)
 ; IE-INDICES-NEXT:    [[INNER:%.*]] = and <vscale x 16 x i32> [[IDX0]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 3, {{i32|i64}} 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
 ; IE-INDICES-NEXT:    [[VM:%.*]] = icmp eq <vscale x 16 x i32> [[TMP9]], [[INNER]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle.ll
index 5f5fc6aed8d59..b614818e0c91b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle.ll
@@ -25,7 +25,7 @@ entry:
   %gid = call i64 @__mux_get_global_id(i32 0)
   %in.ptr = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in, i64 %gid
   %in.data = load <4 x i32>, <4 x i32> addrspace(1)* %in.ptr
-  %out.data = shufflevector <4 x i32> %in.data, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %out.data = shufflevector <4 x i32> %in.data, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
   %out.ptr = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %gid
   store <4 x i32> %out.data, <4 x i32> addrspace(1)* %out.ptr, align 32
   ret void
@@ -37,7 +37,7 @@ declare i64 @__mux_get_global_id(i32) #1
 ; CHECK: define spir_kernel void @__vecz_nxv4_f({{.*}}) {{.*}} {
 ; CHECK: entry:
 ; CHECK:  %[[DATA:.+]] = load <vscale x 16 x i32>, {{(<vscale x 16 x i32> addrspace\(1\)\*)|(ptr addrspace\(1\))}} %{{.*}}
-; CHECK:  %[[GATHER:.+]] = call <vscale x 16 x i32> @llvm.riscv.vrgather.vv.nxv16i32.i64(<vscale x 16 x i32> undef, <vscale x 16 x i32> %[[DATA]], <vscale x 16 x i32> %{{.+}}, i64 %{{.+}})
+; CHECK:  %[[GATHER:.+]] = call <vscale x 16 x i32> @llvm.riscv.vrgather.vv.nxv16i32.i64(<vscale x 16 x i32> poison, <vscale x 16 x i32> %[[DATA]], <vscale x 16 x i32> %{{.+}}, i64 %{{.+}})
 ; CHECK:  store <vscale x 16 x i32> %[[GATHER]]
 ; CHECK:  ret void
 ; CHECK: }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_bool.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_bool.ll
index 8ebecbb9ae5e2..e2a7fc87785db 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_bool.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_bool.ll
@@ -26,7 +26,7 @@ entry:
   %in.ptr = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in, i64 %gid
   %in.data = load <4 x i32>, <4 x i32> addrspace(1)* %in.ptr
   %in.bool = icmp ne <4 x i32> %in.data, zeroinitializer
-  %out.data = shufflevector <4 x i1> %in.bool, <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %out.data = shufflevector <4 x i1> %in.bool, <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
   %out.sext = sext <4 x i1> %out.data to <4 x i32>
   %out.ptr = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %gid
   store <4 x i32> %out.sext, <4 x i32> addrspace(1)* %out.ptr, align 32
@@ -42,7 +42,7 @@ declare i64 @__mux_get_global_id(i32) #1
 ; CHECK:  %[[DATA:.+]] = load <vscale x 16 x i32>, {{(<vscale x 16 x i32> addrspace\(1\)\*)|(ptr addrspace\(1\))}} %{{.*}}
 ; CHECK:  %[[DATA_i1:.+]] = icmp ne <vscale x 16 x i32> %[[DATA]], zeroinitializer
 ; CHECK:  %[[DATA_i8:.+]] = zext <vscale x 16 x i1> %[[DATA_i1]] to <vscale x 16 x i8>
-; CHECK:  %[[GATHER:.+]] = call <vscale x 16 x i8> @llvm.riscv.vrgatherei16.vv.nxv16i8.i64(<vscale x 16 x i8> undef, <vscale x 16 x i8> %[[DATA_i8]], <vscale x 16 x i16> %{{.+}}, i64 %{{.+}})
+; CHECK:  %[[GATHER:.+]] = call <vscale x 16 x i8> @llvm.riscv.vrgatherei16.vv.nxv16i8.i64(<vscale x 16 x i8> poison, <vscale x 16 x i8> %[[DATA_i8]], <vscale x 16 x i16> %{{.+}}, i64 %{{.+}})
 ; CHECK:  %[[GATHER_i1:.+]] = trunc <vscale x 16 x i8> %[[GATHER]] to <vscale x 16 x i1>
 ; CHECK:  %[[RESULT:.+]] = sext <vscale x 16 x i1> %[[GATHER_i1]] to <vscale x 16 x i32>
 ; CHECK:  store <vscale x 16 x i32> %[[RESULT]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_concat.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_concat.ll
index a720241586957..56fac919fb531 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_concat.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_concat.ll
@@ -41,9 +41,9 @@ declare i64 @__mux_get_global_id(i32) #1
 ; CHECK:  %[[DATA:.+]] = load <vscale x 8 x i32>, {{(<vscale x 8 x i32> addrspace\(1\)\*)|(ptr addrspace\(1\))}} %{{.*}}
 ; CHECK:  %[[DATB:.+]] = load <vscale x 8 x i32>, {{(<vscale x 8 x i32> addrspace\(1\)\*)|(ptr addrspace\(1\))}} %{{.*}}
 ; CHECK:  %[[WIDENA:.+]] = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv8i32(<vscale x 16 x i32> poison, <vscale x 8 x i32> %[[DATA]], i64 0)
-; CHECK:  %[[GATHERA:.+]] = call <vscale x 16 x i32> @llvm.riscv.vrgather.vv.nxv16i32.i64(<vscale x 16 x i32> undef, <vscale x 16 x i32> %[[WIDENA]], <vscale x 16 x i32> %{{.+}}, i64 %{{.+}})
+; CHECK:  %[[GATHERA:.+]] = call <vscale x 16 x i32> @llvm.riscv.vrgather.vv.nxv16i32.i64(<vscale x 16 x i32> poison, <vscale x 16 x i32> %[[WIDENA]], <vscale x 16 x i32> %{{.+}}, i64 %{{.+}})
 ; CHECK:  %[[WIDENB:.+]] = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv8i32(<vscale x 16 x i32> poison, <vscale x 8 x i32> %[[DATB]], i64 0)
-; CHECK:  %[[GATHERB:.+]] = call <vscale x 16 x i32> @llvm.riscv.vrgather.vv.nxv16i32.i64(<vscale x 16 x i32> undef, <vscale x 16 x i32> %[[WIDENB]], <vscale x 16 x i32> %{{.+}}, i64 %{{.+}})
+; CHECK:  %[[GATHERB:.+]] = call <vscale x 16 x i32> @llvm.riscv.vrgather.vv.nxv16i32.i64(<vscale x 16 x i32> poison, <vscale x 16 x i32> %[[WIDENB]], <vscale x 16 x i32> %{{.+}}, i64 %{{.+}})
 ; CHECK:  %[[SELECT:.+]] = select <vscale x 16 x i1> %{{.+}}, <vscale x 16 x i32> %[[GATHERB]], <vscale x 16 x i32> %[[GATHERA]]
 ; CHECK:  store <vscale x 16 x i32> %[[SELECT]]
 ; CHECK:  ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_narrow.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_narrow.ll
index 8b54db3622b3d..8a40287b77add 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_narrow.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_narrow.ll
@@ -25,7 +25,7 @@ entry:
   %gid = call i64 @__mux_get_global_id(i32 0)
   %in.ptr = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in, i64 %gid
   %in.data = load <4 x i32>, <4 x i32> addrspace(1)* %in.ptr
-  %out.data = shufflevector <4 x i32> %in.data, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
+  %out.data = shufflevector <4 x i32> %in.data, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
   %out.ptr = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i64 %gid
   store <2 x i32> %out.data, <2 x i32> addrspace(1)* %out.ptr, align 32
   ret void
@@ -37,7 +37,7 @@ declare i64 @__mux_get_global_id(i32) #1
 ; CHECK: define spir_kernel void @__vecz_nxv4_f({{.*}}) {{.*}} {
 ; CHECK: entry:
 ; CHECK:  %[[DATA:.+]] = load <vscale x 16 x i32>, {{(<vscale x 16 x i32> addrspace\(1\)\*)|(ptr addrspace\(1\))}} %{{.*}}
-; CHECK:  %[[GATHER:.+]] = call <vscale x 16 x i32> @llvm.riscv.vrgather.vv.nxv16i32.i64(<vscale x 16 x i32> undef, <vscale x 16 x i32> %[[DATA]], <vscale x 16 x i32> %{{.+}}, i64 %{{.+}})
+; CHECK:  %[[GATHER:.+]] = call <vscale x 16 x i32> @llvm.riscv.vrgather.vv.nxv16i32.i64(<vscale x 16 x i32> poison, <vscale x 16 x i32> %[[DATA]], <vscale x 16 x i32> %{{.+}}, i64 %{{.+}})
 ; CHECK:  %[[EXTRACT:.+]] = call <vscale x 8 x i32> @llvm.vector.extract.nxv8i32.nxv16i32(<vscale x 16 x i32> %[[GATHER]], i64 0)
 ; CHECK:  store <vscale x 8 x i32> %[[EXTRACT]]
 ; CHECK:  ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_wider.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_wider.ll
index 4b1094391d3fb..f0877a2590095 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_wider.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_wider.ll
@@ -25,7 +25,7 @@ entry:
   %gid = call i64 @__mux_get_global_id(i32 0)
   %in.ptr = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %in, i64 %gid
   %in.data = load <2 x i32>, <2 x i32> addrspace(1)* %in.ptr
-  %out.data = shufflevector <2 x i32> %in.data, <2 x i32> undef, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
+  %out.data = shufflevector <2 x i32> %in.data, <2 x i32> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
   %out.ptr = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %gid
   store <4 x i32> %out.data, <4 x i32> addrspace(1)* %out.ptr, align 32
   ret void
@@ -38,7 +38,7 @@ declare i64 @__mux_get_global_id(i32) #1
 ; CHECK: entry:
 ; CHECK:  %[[DATA:.+]] = load <vscale x 8 x i32>, {{(<vscale x 8 x i32> addrspace\(1\)\*)|(ptr addrspace\(1\))}} %{{.*}}
 ; CHECK:  %[[WIDEN:.+]] = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv8i32(<vscale x 16 x i32> poison, <vscale x 8 x i32> %[[DATA]], i64 0)
-; CHECK:  %[[GATHER:.+]] = call <vscale x 16 x i32> @llvm.riscv.vrgather.vv.nxv16i32.i64(<vscale x 16 x i32> undef, <vscale x 16 x i32> %[[WIDEN]], <vscale x 16 x i32> %{{.+}}, i64 %{{.+}})
+; CHECK:  %[[GATHER:.+]] = call <vscale x 16 x i32> @llvm.riscv.vrgather.vv.nxv16i32.i64(<vscale x 16 x i32> poison, <vscale x 16 x i32> %[[WIDEN]], <vscale x 16 x i32> %{{.+}}, i64 %{{.+}})
 ; CHECK:  store <vscale x 16 x i32> %[[GATHER]]
 ; CHECK:  ret void
 ; CHECK: }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/select_scalar_vector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/select_scalar_vector.ll
index 113064d801114..055c64a47c627 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/select_scalar_vector.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/select_scalar_vector.ll
@@ -44,7 +44,7 @@ entry:
 ; CHECK: [[sext:%.*]] = sext <vscale x 4 x i1> [[cmp1]] to <vscale x 4 x i8>
 ; CHECK: [[idx0:%.*]] = call <vscale x 8 x i16> @llvm.experimental.stepvector.nxv8i16()
 ; CHECK: [[idx1:%.*]] = lshr <vscale x 8 x i16> [[idx0]], shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> {{(undef|poison)}}, i16 1, {{i32|i64}} 0), <vscale x 8 x i16> {{(undef|poison)}}, <vscale x 8 x i32> zeroinitializer)
-; CHECK: [[gather:%.*]] = call <vscale x 8 x i8> @llvm.riscv.vrgatherei16.vv.nxv8i8.i64(<vscale x 8 x i8> undef, <vscale x 8 x i8> [[vs2:%.*]], <vscale x 8 x i16> [[vs1:%.*]], i64 [[xlen:%.*]])
+; CHECK: [[gather:%.*]] = call <vscale x 8 x i8> @llvm.riscv.vrgatherei16.vv.nxv8i8.i64(<vscale x 8 x i8> poison, <vscale x 8 x i8> [[vs2:%.*]], <vscale x 8 x i16> [[vs1:%.*]], i64 [[xlen:%.*]])
 ; CHECK: [[cmp:%.*]] = trunc <vscale x 8 x i8> [[gather]] to <vscale x 8 x i1>
 ; CHECK: [[sel:%.*]] = select <vscale x 8 x i1> [[cmp]], <vscale x 8 x i32> [[rhs]], <vscale x 8 x i32> shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> {{(undef|poison)}}, i32 4, {{i32|i64}} 0), <vscale x 8 x i32> {{(undef|poison)}}, <vscale x 8 x i32> zeroinitializer)
 ; CHECK: store <vscale x 8 x i32> [[sel]],
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll
index c41760af170a7..7de1df26153d4 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll
@@ -111,7 +111,7 @@ entry:
 ; CHECK-NEXT:    [[IDX1:%.*]] = and <vscale x 16 x i32> [[IDX0]], {{shufflevector \(<vscale x 16 x i32> insertelement \(<vscale x 16 x i32> (undef|poison), i32 3, (i32|i64) 0\), <vscale x 16 x i32> (undef|poison), <vscale x 16 x i32> zeroinitializer\)|splat \(i32 3\)}}
 ; CHECK-NEXT:    [[TMP0:%.*]] = {{s|z}}ext{{( nneg)?}} <vscale x 16 x i32> [[IDX1]] to <vscale x 16 x i64>
 ; CHECK-NEXT:    [[VEC_ALLOC:%.*]] = getelementptr inbounds float, ptr [[FIXLEN_ALLOC]], <vscale x 16 x i64> [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 16 x float> @llvm.masked.gather.nxv16f32.nxv16p0(<vscale x 16 x ptr> [[VEC_ALLOC]], i32 4, <vscale x 16 x i1> {{shufflevector \(<vscale x 16 x i1> insertelement \(<vscale x 16 x i1> poison, i1 true, (i32|i64) 0\), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(i1 true\)}}, <vscale x 16 x float> undef)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 16 x float> @llvm.masked.gather.nxv16f32.nxv16p0(<vscale x 16 x ptr> [[VEC_ALLOC]], i32 4, <vscale x 16 x i1> {{shufflevector \(<vscale x 16 x i1> insertelement \(<vscale x 16 x i1> poison, i1 true, (i32|i64) 0\), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(i1 true\)}}, <vscale x 16 x float> poison)
 ; CHECK-NEXT:    [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0)
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <vscale x 16 x float>, ptr addrspace(1) [[ARRAYIDX]], align 16
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_scatter_gather.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_scatter_gather.ll
index 4d81ce68d3a9b..642996e2d64c5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_scatter_gather.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_scatter_gather.ll
@@ -86,5 +86,5 @@ declare i64 @__mux_get_global_id(i32)
 ; Test if the masked gather load is defined correctly
 ; CHECK: define <vscale x 4 x i32> @__vecz_b_masked_gather_load4_u5nxv4ju14nxv4u3ptrU3AS1u5nxv4b(<vscale x 4 x ptr addrspace(1)>{{( %0)?}}, <vscale x 4 x i1>{{( %1)?}})
 ; CHECK: entry:
-; CHECK: %2 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p1(<vscale x 4 x ptr addrspace(1)> %0, i32{{( immarg)?}} 4, <vscale x 4 x i1> %1, <vscale x 4 x i32> undef)
+; CHECK: %2 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p1(<vscale x 4 x ptr addrspace(1)> %0, i32{{( immarg)?}} 4, <vscale x 4 x i1> %1, <vscale x 4 x i32> poison)
 ; CHECK: ret <vscale x 4 x i32> %2
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans.ll
index 2723beb889593..de83911c22126 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans.ll
@@ -58,7 +58,7 @@ declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_add_u5nxv4j(<vscal
 ; CHECK:   store <vscale x 4 x i32> %[[VEC]], {{(<vscale x 4 x i32>\*)|(ptr)}} %[[SHUFFLE_ALLOC]]
 ;------- there will be a bitcast here if pointers are typed
 ; CHECK:   %[[INDEX:.+]] = getelementptr inbounds i32, [[PTRTY:(i32\*)|ptr]] %{{.+}}, <vscale x 4 x i32> %[[MASK]]
-; CHECK:   %[[SHUFFLE:.+]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0{{(i32)?}}(<vscale x 4 x [[PTRTY]]> %[[INDEX]], i32 4, <vscale x 4 x i1> {{shufflevector \(<vscale x 4 x i1> insertelement \(<vscale x 4 x i1> poison, i1 true, (i32|i64) 0\), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer\)|splat \(i1 true\)}}, <vscale x 4 x i32> undef)
+; CHECK:   %[[SHUFFLE:.+]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0{{(i32)?}}(<vscale x 4 x [[PTRTY]]> %[[INDEX]], i32 4, <vscale x 4 x i1> {{shufflevector \(<vscale x 4 x i1> insertelement \(<vscale x 4 x i1> poison, i1 true, (i32|i64) 0\), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer\)|splat \(i1 true\)}}, <vscale x 4 x i32> poison)
 
 ; CHECK:   %[[ACCUM:.+]] = add <vscale x 4 x i32> %[[VEC]], %{{.+}}
 ; CHECK:   %[[BIT:.+]] = and <vscale x 4 x i32> %[[MASKPHI]], %[[N_SPLAT]]
@@ -93,7 +93,7 @@ declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_add_u5nxv4j(<vscal
 ; CHECK:   store <vscale x 4 x i32> %[[VEC]], {{(<vscale x 4 x i32>\*)|(ptr)}} %[[SHUFFLE_ALLOC]]
 ;------- there will be a bitcast here if pointers are typed
 ; CHECK:   %[[INDEX:.+]] = getelementptr inbounds i32, [[PTRTY:(i32\*)|ptr]] %{{.+}}, <vscale x 4 x i32> %[[MASK]]
-; CHECK:   %[[SHUFFLE:.+]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0{{(i32)?}}(<vscale x 4 x [[PTRTY]]> %[[INDEX]], i32 4, <vscale x 4 x i1> {{shufflevector \(<vscale x 4 x i1> insertelement \(<vscale x 4 x i1> poison, i1 true, (i32|i64) 0\), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer\)|splat \(i1 true\)}}, <vscale x 4 x i32> undef)
+; CHECK:   %[[SHUFFLE:.+]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0{{(i32)?}}(<vscale x 4 x [[PTRTY]]> %[[INDEX]], i32 4, <vscale x 4 x i1> {{shufflevector \(<vscale x 4 x i1> insertelement \(<vscale x 4 x i1> poison, i1 true, (i32|i64) 0\), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer\)|splat \(i1 true\)}}, <vscale x 4 x i32> poison)
 
 ; CHECK:   %[[ACCUM:.+]] = add <vscale x 4 x i32> %[[VEC]], %{{.+}}
 ; CHECK:   %[[BIT:.+]] = and <vscale x 4 x i32> %[[MASKPHI]], %[[N_SPLAT]]
@@ -107,7 +107,7 @@ declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_add_u5nxv4j(<vscal
 ; CHECK:   %[[SCAN:.+]] = phi <vscale x 4 x i32> [ %[[NEWVEC]], %loop ]
 
 ;------- target-dependent slide-up code:
-; CHECK:   %[[SLIDE:.+]] = call <vscale x 4 x i32> @llvm{{(\.experimental)?}}.vector.splice.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> %[[SCAN]], i32 -1)
+; CHECK:   %[[SLIDE:.+]] = call <vscale x 4 x i32> @llvm{{(\.experimental)?}}.vector.splice.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %[[SCAN]], i32 -1)
 ; CHECK:   %[[RESULT:.+]] = insertelement <vscale x 4 x i32> %[[SLIDE]], i32 0, {{i32|i64}} 0
 
 ; CHECK:   ret <vscale x 4 x i32> %[[RESULT]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans_vp.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans_vp.ll
index 5631b3d101122..3bff18980d2aa 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans_vp.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans_vp.ll
@@ -60,7 +60,7 @@ declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4jj(<v
 ; CHECK:   %[[VLINS:.+]] = insertelement <vscale x 4 x i32> poison, i32 %1, {{i32|i64}} 0
 ; CHECK:   %[[VLSPLAT:.+]] = shufflevector <vscale x 4 x i32> %[[VLINS]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK:   %[[VLMASK:.+]] = icmp ult <vscale x 4 x i32> %3, %[[VLSPLAT]]
-; CHECK:   %[[SHUFFLE:.+]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0{{(i32)?}}(<vscale x 4 x [[PTRTY]]> %[[INDEX]], i32 4, <vscale x 4 x i1> %[[VLMASK]], <vscale x 4 x i32> undef)
+; CHECK:   %[[SHUFFLE:.+]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0{{(i32)?}}(<vscale x 4 x [[PTRTY]]> %[[INDEX]], i32 4, <vscale x 4 x i1> %[[VLMASK]], <vscale x 4 x i32> poison)
 
 ; CHECK:   %[[ACCUM:.+]] = add <vscale x 4 x i32> %[[VEC]], %[[SHUFFLE]]
 ; CHECK:   %[[BIT:.+]] = and <vscale x 4 x i32> %[[MASKPHI]], %[[N_SPLAT]]
@@ -97,7 +97,7 @@ declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_add_vp_u5nxv4jj(<v
 ; CHECK:   %[[VLINS:.+]] = insertelement <vscale x 4 x i32> poison, i32 %1, {{i32|i64}} 0
 ; CHECK:   %[[VLSPLAT:.+]] = shufflevector <vscale x 4 x i32> %[[VLINS]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK:   %[[VLMASK:.+]] = icmp ult <vscale x 4 x i32> %3, %[[VLSPLAT]]
-; CHECK:   %[[SHUFFLE:.+]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0{{(i32)?}}(<vscale x 4 x [[PTRTY]]> %[[INDEX]], i32 4, <vscale x 4 x i1> %[[VLMASK]], <vscale x 4 x i32> undef)
+; CHECK:   %[[SHUFFLE:.+]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0{{(i32)?}}(<vscale x 4 x [[PTRTY]]> %[[INDEX]], i32 4, <vscale x 4 x i1> %[[VLMASK]], <vscale x 4 x i32> poison)
 
 ; CHECK:   %[[ACCUM:.+]] = add <vscale x 4 x i32> %[[VEC]], %{{.+}}
 ; CHECK:   %[[BIT:.+]] = and <vscale x 4 x i32> %[[MASKPHI]], %[[N_SPLAT]]
@@ -111,7 +111,7 @@ declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_add_vp_u5nxv4jj(<v
 ; CHECK:   %[[SCAN:.+]] = phi <vscale x 4 x i32> [ %[[NEWVEC]], %loop ]
 
 ;------- target-dependent slide-up code:
-; CHECK:   %[[SLIDE:.+]] = call <vscale x 4 x i32> @llvm{{(\.experimental)?}}.vector.splice.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> %[[SCAN]], i32 -1)
+; CHECK:   %[[SLIDE:.+]] = call <vscale x 4 x i32> @llvm{{(\.experimental)?}}.vector.splice.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %[[SCAN]], i32 -1)
 ; CHECK:   %[[RESULT:.+]] = insertelement <vscale x 4 x i32> %[[SLIDE]], i32 0, {{i32|i64}} 0
 
 ; CHECK:   ret <vscale x 4 x i32> %[[RESULT]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/packetize_mask_varying.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/packetize_mask_varying.ll
index 1d8faa1badc8c..b1889eafb1f20 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/packetize_mask_varying.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/packetize_mask_varying.ll
@@ -27,7 +27,7 @@ entry:
   %idx = call i64 @__mux_get_global_id(i32 0)
   %mod_idx = urem i64 %idx, 2
   %arrayidxa = getelementptr inbounds <4 x i32>, <4 x i32>* %aptr, i64 %idx
-  %ins = insertelement <4 x i1> undef, i1 true, i32 0
+  %ins = insertelement <4 x i1> poison, i1 true, i32 0
   %cmp = icmp slt i64 %idx, 64
   br i1 %cmp, label %if.then, label %if.end
 if.then:
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/shuffle.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/shuffle.ll
index b514eeb2c60b0..07e95ce18f74a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/shuffle.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/shuffle.ll
@@ -26,8 +26,8 @@ define spir_kernel void @do_shuffle_splat(i32* %aptr, <4 x i32>* %bptr, <4 x i32
   %arrayidxb = getelementptr inbounds <4 x i32>, <4 x i32>* %bptr, i64 %idx
   %a = load i32, i32* %arrayidxa, align 4
   %b = load <4 x i32>, <4 x i32>* %arrayidxb, align 16
-  %insert = insertelement <4 x i32> undef, i32 %a, i32 0
-  %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %insert = insertelement <4 x i32> poison, i32 %a, i32 0
+  %splat = shufflevector <4 x i32> %insert, <4 x i32> poison, <4 x i32> zeroinitializer
   %arrayidxz = getelementptr inbounds <4 x i32>, <4 x i32>* %zptr, i64 %idx
   store <4 x i32> %splat, <4 x i32>* %arrayidxz
   ret void
@@ -48,8 +48,8 @@ define spir_kernel void @do_shuffle_splat_uniform(i32 %a, <4 x i32>* %bptr, <4 x
   %idx = call i64 @__mux_get_global_id(i32 0)
   %arrayidxb = getelementptr inbounds <4 x i32>, <4 x i32>* %bptr, i64 %idx
   %b = load <4 x i32>, <4 x i32>* %arrayidxb, align 16
-  %insert = insertelement <4 x i32> undef, i32 %a, i32 0
-  %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %insert = insertelement <4 x i32> poison, i32 %a, i32 0
+  %splat = shufflevector <4 x i32> %insert, <4 x i32> poison, <4 x i32> zeroinitializer
   %arrayidxz = getelementptr inbounds <4 x i32>, <4 x i32>* %zptr, i64 %idx
   store <4 x i32> %splat, <4 x i32>* %arrayidxz
   ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_subgroup_scans.ll
index e043a43db80d6..45292969d3b35 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_subgroup_scans.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_subgroup_scans.ll
@@ -58,7 +58,7 @@ declare <4 x i32> @__vecz_b_sub_group_scan_inclusive_add_vp_Dv4_jj(<4 x i32>, i3
 ; CHECK:   %[[VLINS:.+]] = insertelement <4 x i32> poison, i32 %1, {{i32|i64}} 0
 ; CHECK:   %[[VLSPLAT:.+]] = shufflevector <4 x i32> %[[VLINS]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK:   %[[VLMASK:.+]] = icmp ult <4 x i32> <i32 0, i32 1, i32 2, i32 3>, %[[VLSPLAT]]
-; CHECK:   %[[SHUFFLE:.+]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0{{(i32)?}}(<4 x [[PTRTY]]> %[[INDEX]], i32 4, <4 x i1> %[[VLMASK]], <4 x i32> undef)
+; CHECK:   %[[SHUFFLE:.+]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0{{(i32)?}}(<4 x [[PTRTY]]> %[[INDEX]], i32 4, <4 x i1> %[[VLMASK]], <4 x i32> poison)
 
 ; CHECK:   %[[ACCUM:.+]] = add <4 x i32> %[[VEC]], %{{.+}}
 ; CHECK:   %[[BIT:.+]] = and <4 x i32> %[[MASKPHI]], %[[N_SPLAT]]
@@ -93,7 +93,7 @@ declare <4 x i32> @__vecz_b_sub_group_scan_exclusive_add_vp_Dv4_jj(<4 x i32>, i3
 ; CHECK:   %[[VLINS:.+]] = insertelement <4 x i32> poison, i32 %1, {{i32|i64}} 0
 ; CHECK:   %[[VLSPLAT:.+]] = shufflevector <4 x i32> %[[VLINS]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK:   %[[VLMASK:.+]] = icmp ult <4 x i32> <i32 0, i32 1, i32 2, i32 3>, %[[VLSPLAT]]
-; CHECK:   %[[SHUFFLE:.+]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0{{(i32)?}}(<4 x [[PTRTY]]> %[[INDEX]], i32 4, <4 x i1> %[[VLMASK]], <4 x i32> undef)
+; CHECK:   %[[SHUFFLE:.+]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0{{(i32)?}}(<4 x [[PTRTY]]> %[[INDEX]], i32 4, <4 x i1> %[[VLMASK]], <4 x i32> poison)
 
 ; CHECK:   %[[ACCUM:.+]] = add <4 x i32> %[[VEC]], %{{.+}}
 ; CHECK:   %[[BIT:.+]] = and <4 x i32> %[[MASKPHI]], %[[N_SPLAT]]
@@ -107,7 +107,7 @@ declare <4 x i32> @__vecz_b_sub_group_scan_exclusive_add_vp_Dv4_jj(<4 x i32>, i3
 ; CHECK:   %[[SCAN:.+]] = phi <4 x i32> [ %[[NEWVEC]], %loop ]
 
 ;------- target-dependent slide-up goes here
-; CHECK:  %[[SLIDE:.+]] = shufflevector <4 x i32> %[[SCAN]], <4 x i32> undef, <4 x i32> <i32 {{[0-9]+}}, i32 0, i32 1, i32 2>
+; CHECK:  %[[SLIDE:.+]] = shufflevector <4 x i32> %[[SCAN]], <4 x i32> poison, <4 x i32> <i32 {{[0-9]+}}, i32 0, i32 1, i32 2>
 ; CHECK:  %[[RESULT:.+]] = insertelement <4 x i32> %[[SLIDE]], i32 0, {{i32|i64}} 0
 
 ; CHECK:   ret <4 x i32> %[[RESULT]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/packetize_mask_varying.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/packetize_mask_varying.ll
index 0eecebc5e2a0f..5c1df71ed9475 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/packetize_mask_varying.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/packetize_mask_varying.ll
@@ -27,7 +27,7 @@ entry:
   %idx = call i64 @__mux_get_global_id(i32 0)
   %mod_idx = urem i64 %idx, 2
   %arrayidxa = getelementptr inbounds <4 x i32>, <4 x i32>* %aptr, i64 %idx
-  %ins = insertelement <4 x i1> undef, i1 true, i32 0
+  %ins = insertelement <4 x i1> poison, i1 true, i32 0
   %cmp = icmp slt i64 %idx, 64
   br i1 %cmp, label %if.then, label %if.end
 if.then:
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_constant_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_constant_index.ll
index d7fe492093e37..2d28ba251b055 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_constant_index.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_constant_index.ll
@@ -35,6 +35,6 @@ declare i64 @__mux_get_global_id(i32) #1
 
 ; CHECK: define spir_kernel void @__vecz_v4_extract_constant_index
 ; CHECK: %[[LD:.+]] = load <16 x i64>
-; CHECK: %[[EXT:.+]] = shufflevector <16 x i64> %[[LD]], <16 x i64> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK: %[[EXT:.+]] = shufflevector <16 x i64> %[[LD]], <16 x i64> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
 ; CHECK: store <4 x i64> %[[EXT]]
 ; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index.ll
index 8a9fccf525fca..7739407d482f8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index.ll
@@ -41,7 +41,7 @@ entry:
 
 ; Extract directly from the widened source and insert directly into result
 ; CHECK: %[[EXT0:.+]] = extractelement <16 x float> %[[LD]], i32 %x
-; CHECK: %[[INS0:.+]] = insertelement <4 x float> undef, float %[[EXT0]], i32 0
+; CHECK: %[[INS0:.+]] = insertelement <4 x float> poison, float %[[EXT0]], i32 0
 ; CHECK: %[[IDX1:.+]] = add i32 %x, 4
 ; CHECK: %[[EXT1:.+]] = extractelement <16 x float> %[[LD]], i32 %[[IDX1]]
 ; CHECK: %[[INS1:.+]] = insertelement <4 x float> %[[INS0]], float %[[EXT1]], i32 1
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index2.ll
index 178751cb6f23a..405cff01c2e34 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index2.ll
@@ -42,7 +42,7 @@ entry:
 ; Extract directly from the uniform source with vectorized indices and insert directly into result
 ; CHECK: %[[IND0:.+]] = extractelement <4 x i32> %[[LD]], i32 0
 ; CHECK: %[[EXT0:.+]] = extractelement <4 x i8> %x, i32 %[[IND0]]
-; CHECK: %[[INS0:.+]] = insertelement <4 x i8> undef, i8 %[[EXT0]], i32 0
+; CHECK: %[[INS0:.+]] = insertelement <4 x i8> poison, i8 %[[EXT0]], i32 0
 ; CHECK: %[[IND1:.+]] = extractelement <4 x i32> %[[LD]], i32 1
 ; CHECK: %[[EXT1:.+]] = extractelement <4 x i8> %x, i32 %[[IND1]]
 ; CHECK: %[[INS1:.+]] = insertelement <4 x i8> %[[INS0]], i8 %[[EXT1]], i32 1
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index3.ll
index 39c3f0fe006e0..70d1908c8a9ab 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index3.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index3.ll
@@ -48,7 +48,7 @@ entry:
 ; Extract directly from the widened source with vectorized indices and insert directly into result
 ; CHECK: %[[IND0:.+]] = extractelement <4 x i32> %[[ADD]], i32 0
 ; CHECK: %[[EXT0:.+]] = extractelement <16 x float> %[[SRC]], i32 %[[IND0]]
-; CHECK: %[[INS0:.+]] = insertelement <4 x float> undef, float %[[EXT0]], i32 0
+; CHECK: %[[INS0:.+]] = insertelement <4 x float> poison, float %[[EXT0]], i32 0
 ; CHECK: %[[IND1:.+]] = extractelement <4 x i32> %[[ADD]], i32 1
 ; CHECK: %[[EXT1:.+]] = extractelement <16 x float> %[[SRC]], i32 %[[IND1]]
 ; CHECK: %[[INS1:.+]] = insertelement <4 x float> %[[INS0]], float %[[EXT1]], i32 1
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_constant_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_constant_index.ll
index a467db71f7dad..2d767313e0ddc 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_constant_index.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_constant_index.ll
@@ -46,7 +46,7 @@ entry:
 ; CHECK-NOT: call <4 x i32> @__vecz_b_interleaved_load4_Dv4_ju3ptr
 
 ; Insert elements turned into shufflevectors
-; CHECK: %[[WIDE:.+]] = shufflevector <4 x i32> %[[ELTS]], <4 x i32> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
+; CHECK: %[[WIDE:.+]] = shufflevector <4 x i32> %[[ELTS]], <4 x i32> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
 ; CHECK: %[[INS:.+]] = shufflevector <16 x i32> %[[WIDE]], <16 x i32> %[[INTO]], <16 x i32> <i32 16, i32 17, i32 2, i32 19, i32 20, i32 21, i32 6, i32 23, i32 24, i32 25, i32 10, i32 27, i32 28, i32 29, i32 14, i32 31>
 
 ; No more shuffles..
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/scalar_vector_user.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/scalar_vector_user.ll
index 1a8350794513d..4c887d9c66be5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/scalar_vector_user.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/scalar_vector_user.ll
@@ -45,7 +45,7 @@ loop:                                              ; preds = %entry, %loop
   %i.inc = add nuw nsw i64 %i, 1
   %cmp = icmp slt i64 %i.inc, %n
   %inout.vload = tail call spir_func <4 x float> @_Z6vload4mPU3AS3Kf(i64 0, float addrspace(1)* %inout.address)
-  %inout.vec0 = shufflevector <4 x float> %inout.vload, <4 x float> undef, <4 x i32> zeroinitializer
+  %inout.vec0 = shufflevector <4 x float> %inout.vload, <4 x float> poison, <4 x i32> zeroinitializer
   %madv4 = tail call spir_func <4 x float> @_Z3madDv4_fS_S_(<4 x float> %inout.vload, <4 x float> %inout.vec0, <4 x float> %madv4.prev) #0
   br i1 %cmp, label %loop, label %end
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/vector_phi_varying.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/vector_phi_varying.ll
index 6a9ad584fbcde..e5054ae1201e7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/vector_phi_varying.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/vector_phi_varying.ll
@@ -25,8 +25,8 @@ define spir_kernel void @vector_loop(i32 addrspace(1)* %in, i32 addrspace(1)* %o
 entry:
   %call = call i64 @__mux_get_global_id(i32 0)
   %call.trunc = trunc i64 %call to i32
-  %call.splatinsert = insertelement <4 x i32> undef, i32 %call.trunc, i32 0
-  %call.splat = shufflevector <4 x i32> %call.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %call.splatinsert = insertelement <4 x i32> poison, i32 %call.trunc, i32 0
+  %call.splat = shufflevector <4 x i32> %call.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
   %cmp = icmp eq i64 %call, 0
   br i1 %cmp, label %for.end, label %for.cond
 
@@ -34,8 +34,8 @@ for.cond:                                         ; preds = %entry, %for.body
   %storemerge = phi <4 x i32> [ %inc, %for.body ], [ zeroinitializer, %entry ]
   %call1 = call i64 @__mux_get_global_size(i32 0)
   %conv = trunc i64 %call1 to i32
-  %splat.splatinsert = insertelement <4 x i32> undef, i32 %conv, i32 0
-  %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %splat.splatinsert = insertelement <4 x i32> poison, i32 %conv, i32 0
+  %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
   %cmp2 = icmp slt <4 x i32> %storemerge, %splat.splat
   %0 = extractelement <4 x i1> %cmp2, i64 0
   br i1 %0, label %for.body, label %for.end
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd2.ll
index a37c5e7cd7014..c092dbd97ca09 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd2.ll
@@ -71,14 +71,14 @@ declare <4x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>)
 ; CHECK: %[[FMA1:.+]] = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %[[SA1]], <16 x float> %[[SB1]], <16 x float> %[[SC1]])
 
 ; It splits the 2 x <16 x float> results into 8 <4 x float> values
-; CHECK: %[[RES0:.+]] = shufflevector <16 x float> %[[FMA0]], <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK: %[[RES1:.+]] = shufflevector <16 x float> %[[FMA0]], <16 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK: %[[RES2:.+]] = shufflevector <16 x float> %[[FMA0]], <16 x float> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; CHECK: %[[RES3:.+]] = shufflevector <16 x float> %[[FMA0]], <16 x float> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; CHECK: %[[RES4:.+]] = shufflevector <16 x float> %[[FMA1]], <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK: %[[RES5:.+]] = shufflevector <16 x float> %[[FMA1]], <16 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK: %[[RES6:.+]] = shufflevector <16 x float> %[[FMA1]], <16 x float> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; CHECK: %[[RES7:.+]] = shufflevector <16 x float> %[[FMA1]], <16 x float> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; CHECK: %[[RES0:.+]] = shufflevector <16 x float> %[[FMA0]], <16 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK: %[[RES1:.+]] = shufflevector <16 x float> %[[FMA0]], <16 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK: %[[RES2:.+]] = shufflevector <16 x float> %[[FMA0]], <16 x float> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK: %[[RES3:.+]] = shufflevector <16 x float> %[[FMA0]], <16 x float> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; CHECK: %[[RES4:.+]] = shufflevector <16 x float> %[[FMA1]], <16 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK: %[[RES5:.+]] = shufflevector <16 x float> %[[FMA1]], <16 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK: %[[RES6:.+]] = shufflevector <16 x float> %[[FMA1]], <16 x float> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK: %[[RES7:.+]] = shufflevector <16 x float> %[[FMA1]], <16 x float> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
 ; CHECK: store <4 x float> %[[RES0]], ptr %{{.+}}, align 16
 ; CHECK: store <4 x float> %[[RES1]], ptr %{{.+}}, align 16
 ; CHECK: store <4 x float> %[[RES2]], ptr %{{.+}}, align 16
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/alloca_alias.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/alloca_alias.ll
index cbdb41925babf..7a7d4428bdd4c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/alloca_alias.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/alloca_alias.ll
@@ -29,7 +29,7 @@ entry:
   call void @llvm.lifetime.start.p0i8(i64 32, i8* nonnull %0)
   %1 = trunc i64 %call to i32
   %conv = add nuw nsw i32 %1, 2
-  %2 = insertelement <4 x i32> undef, i32 %conv, i64 0
+  %2 = insertelement <4 x i32> poison, i32 %conv, i64 0
   %conv2 = add nuw nsw i32 %1, 3
   %3 = insertelement <4 x i32> %2, i32 %conv2, i64 1
   %4 = insertelement <4 x i32> %3, i32 %1, i64 2
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load_as_masked.ll
index e21a9ae64f0d7..bcbf179616d32 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load_as_masked.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load_as_masked.ll
@@ -39,7 +39,7 @@ declare i64 @__mux_get_global_id(i32)
 
 ; Test if the scatter store is defined correctly
 ; CHECK: define <4 x i64> @__vecz_b_gather_load4_Dv4_mDv4_u3ptr(<4 x ptr>{{( %0)?}}) [[ATTRS:#[0-9]+]] {
-; CHECK: call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> %0, i32{{( immarg)?}} 4, <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}, <4 x i64> undef)
+; CHECK: call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> %0, i32{{( immarg)?}} 4, <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}, <4 x i64> poison)
 ; CHECK: ret <4 x i64>
 
 ; CHECK: attributes [[ATTRS]] = { norecurse nounwind }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load.ll
index ce62413ee0f78..bc1ab94fcd018 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load.ll
@@ -58,5 +58,5 @@ declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double
 ; CHECK: %BroadcastAddr.splatinsert = insertelement <4 x ptr addrspace(1)> {{poison|undef}}, ptr addrspace(1) %0, {{i32|i64}} 0
 ; CHECK: %BroadcastAddr.splat = shufflevector <4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <4 x ptr addrspace(1)> {{poison|undef}}, <4 x i32> zeroinitializer
 ; CHECK: %[[TMP1:.*]] = getelementptr double, <4 x ptr addrspace(1)> %BroadcastAddr.splat, <4 x i64> <i64 0, i64 4, i64 8, i64 12>
-; CHECK: %[[TMP2:.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p1(<4 x ptr addrspace(1)> %[[TMP1]], i32{{( immarg)?}} 8, <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}, <4 x double> undef)
+; CHECK: %[[TMP2:.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p1(<4 x ptr addrspace(1)> %[[TMP1]], i32{{( immarg)?}} 8, <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}, <4 x double> poison)
 ; CHECK: ret <4 x double> %[[TMP2]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load_as_masked.ll
index 66c9e8a218134..e17494b5cf462 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load_as_masked.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load_as_masked.ll
@@ -75,5 +75,5 @@ attributes #3 = { nobuiltin nounwind }
 ; CHECK: %BroadcastAddr.splatinsert = insertelement <4 x ptr addrspace(1)> {{poison|undef}}, ptr addrspace(1) %0, {{i32|i64}} 0
 ; CHECK: %BroadcastAddr.splat = shufflevector <4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <4 x ptr addrspace(1)> {{poison|undef}}, <4 x i32> zeroinitializer
 ; CHECK: %1 = getelementptr double, <4 x ptr addrspace(1)> %BroadcastAddr.splat, <4 x i64> <i64 0, i64 4, i64 8, i64 12>
-; CHECK: %2 = call <4 x double> @llvm.masked.gather.v4f64.v4p1(<4 x ptr addrspace(1)> %1, i32{{( immarg)?}} 8, <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}, <4 x double> undef)
+; CHECK: %2 = call <4 x double> @llvm.masked.gather.v4f64.v4p1(<4 x ptr addrspace(1)> %1, i32{{( immarg)?}} 8, <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}, <4 x double> poison)
 ; CHECK: ret <4 x double> %2
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_gather_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_gather_load.ll
index e23e36465a672..394eb61e4aaff 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_gather_load.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_gather_load.ll
@@ -79,5 +79,5 @@ declare i64 @__mux_get_global_id(i32)
 ; Test if the masked gather load is defined correctly
 ; CHECK: define <4 x i32> @__vecz_b_masked_gather_load4_Dv4_jDv4_u3ptrU3AS1Dv4_b(<4 x ptr addrspace(1)>{{( %0)?}}, <4 x i1>{{( %1)?}})
 ; CHECK: entry:
-; CHECK: %2 = call <4 x i32> @llvm.masked.gather.v4i32.v4p1(<4 x ptr addrspace(1)> %0, i32{{( immarg)?}} 4, <4 x i1> %1, <4 x i32> undef)
+; CHECK: %2 = call <4 x i32> @llvm.masked.gather.v4i32.v4p1(<4 x ptr addrspace(1)> %0, i32{{( immarg)?}} 4, <4 x i1> %1, <4 x i32> poison)
 ; CHECK: ret <4 x i32> %2
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_subgroup_scans.ll
index 860e7e226d59f..2d31999d37d37 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_subgroup_scans.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_subgroup_scans.ll
@@ -29,9 +29,9 @@ define spir_kernel void @dummy(i32 addrspace(2)* %in, i32 addrspace(1)* %out) {
 declare <4 x i32> @__vecz_b_sub_group_scan_inclusive_add_Dv4_j(<4 x i32>)
 ; CHECK-LABEL: define <4 x i32> @__vecz_b_sub_group_scan_inclusive_add_Dv4_j(<4 x i32> %0) {
 ; CHECK: entry:
-; CHECK:  %[[SHUF1:.+]] = shufflevector <4 x i32> %0, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>, <4 x i32> <i32 4, i32 0, i32 4, i32 2>
+; CHECK:  %[[SHUF1:.+]] = shufflevector <4 x i32> %0, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>, <4 x i32> <i32 4, i32 0, i32 4, i32 2>
 ; CHECK:  %[[ADD1:.+]] = add <4 x i32> %0, %[[SHUF1]]
-; CHECK:  %[[SHUF2:.+]] = shufflevector <4 x i32> %[[ADD1]], <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>, <4 x i32> <i32 4, i32 4, i32 1, i32 1>
+; CHECK:  %[[SHUF2:.+]] = shufflevector <4 x i32> %[[ADD1]], <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>, <4 x i32> <i32 4, i32 4, i32 1, i32 1>
 ; CHECK:  %[[RESULT:.+]] = add <4 x i32> %[[ADD1]], %[[SHUF2]]
 ; CHECK:  ret <4 x i32> %[[RESULT]]
 ; CHECK: }
@@ -39,11 +39,11 @@ declare <4 x i32> @__vecz_b_sub_group_scan_inclusive_add_Dv4_j(<4 x i32>)
 declare <4 x i32> @__vecz_b_sub_group_scan_exclusive_add_Dv4_j(<4 x i32>)
 ; CHECK-LABEL: define <4 x i32> @__vecz_b_sub_group_scan_exclusive_add_Dv4_j(<4 x i32> %0) {
 ; CHECK: entry:
-; CHECK:  %[[SHUF1:.+]] = shufflevector <4 x i32> %0, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>, <4 x i32> <i32 4, i32 0, i32 4, i32 2>
+; CHECK:  %[[SHUF1:.+]] = shufflevector <4 x i32> %0, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>, <4 x i32> <i32 4, i32 0, i32 4, i32 2>
 ; CHECK:  %[[ADD1:.+]] = add <4 x i32> %0, %[[SHUF1]]
-; CHECK:  %[[SHUF2:.+]] = shufflevector <4 x i32> %[[ADD1]], <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>, <4 x i32> <i32 4, i32 4, i32 1, i32 1>
+; CHECK:  %[[SHUF2:.+]] = shufflevector <4 x i32> %[[ADD1]], <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>, <4 x i32> <i32 4, i32 4, i32 1, i32 1>
 ; CHECK:  %[[ADD2:.+]] = add <4 x i32> %[[ADD1]], %[[SHUF2]]
-; CHECK:  %[[ROTATE:.+]] = shufflevector <4 x i32> %[[ADD2]], <4 x i32> undef, <4 x i32> <i32 {{.+}}, i32 0, i32 1, i32 2>
+; CHECK:  %[[ROTATE:.+]] = shufflevector <4 x i32> %[[ADD2]], <4 x i32> poison, <4 x i32> <i32 {{.+}}, i32 0, i32 1, i32 2>
 ; CHECK:  %[[RESULT:.+]] = insertelement <4 x i32> %[[ROTATE]], i32 0, i64 0
 ; CHECK:  ret <4 x i32> %[[RESULT]]
 ; CHECK: }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insert_element_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insert_element_debug_info.ll
index 227f7f5280d06..5f82e99e6d583 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insert_element_debug_info.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insert_element_debug_info.ll
@@ -58,7 +58,7 @@ entry:
   %idx.ext = sext i32 %mul to i64, !dbg !32
   %add.ptr = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 %idx.ext, !dbg !32
   %call1 = call spir_func <3 x i32> @_Z6vload3mPKU3AS1i(i64 0, i32 addrspace(1)* %add.ptr) #3, !dbg !32
-  %extractVec = shufflevector <3 x i32> %call1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>, !dbg !32
+  %extractVec = shufflevector <3 x i32> %call1, <3 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>, !dbg !32
   %storetmp = bitcast <3 x i32>* %tmp to <4 x i32>*, !dbg !32
   store <4 x i32> %extractVec, <4 x i32>* %storetmp, align 16, !dbg !32
   %2 = load <3 x i32>, <3 x i32>* %tmp, align 16, !dbg !33
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_vector_user.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_vector_user.ll
index 4249ff6ce6435..e5fe580b0ac22 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_vector_user.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_vector_user.ll
@@ -45,7 +45,7 @@ loop:                                              ; preds = %entry, %loop
   %i.inc = add nuw nsw i64 %i, 1
   %cmp = icmp slt i64 %i.inc, %n
   %inout.vload = tail call spir_func <4 x float> @_Z6vload4mPU3AS3Kf(i64 0, float addrspace(1)* %inout.address)
-  %inout.vec0 = shufflevector <4 x float> %inout.vload, <4 x float> undef, <4 x i32> zeroinitializer
+  %inout.vec0 = shufflevector <4 x float> %inout.vload, <4 x float> poison, <4 x i32> zeroinitializer
   %madv4 = tail call spir_func <4 x float> @_Z3madDv4_fS_S_(<4 x float> %inout.vload, <4 x float> %inout.vec0, <4 x float> %madv4.prev) #0
   br i1 %cmp, label %loop, label %end
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_masked_load_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_masked_load_store.ll
index 8de16c7ccfe09..712271d2b12b3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_masked_load_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_masked_load_store.ll
@@ -27,8 +27,8 @@ declare void @__vecz_b_masked_store4_Dv2_fPDv2_fDv2_b(<2 x float>, <2 x float>*,
 define spir_kernel void @scalarize_masked_memops(<2 x float>* %pa, <2 x float>* %pz) {
 entry:
   %idx = call i64 @__mux_get_global_id(i32 0)
-  %head = insertelement <2 x i64> undef, i64 %idx, i64 0
-  %splat = shufflevector <2 x i64> %head, <2 x i64> undef, <2 x i32> zeroinitializer
+  %head = insertelement <2 x i64> poison, i64 %idx, i64 0
+  %splat = shufflevector <2 x i64> %head, <2 x i64> poison, <2 x i32> zeroinitializer
   %idxs = add <2 x i64> %splat, <i64 0, i64 1>
   %mask = icmp slt <2 x i64> %idxs, <i64 8, i64 8>
   %aptr = getelementptr <2 x float>, <2 x float>* %pa, i64 %idx
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-gep.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-gep.ll
index e50bc7584796d..b40ac87870871 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-gep.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-gep.ll
@@ -50,11 +50,11 @@ declare <2 x i32> @llvm.masked.gather.v2i32.v2p1(<2 x ptr addrspace(1)>, i32, <2
 ; CHECK:   %1 = getelementptr ptr addrspace(1), ptr addrspace(1) %ptrdata, i32 1
 ; CHECK:   %ptrdatavec1 = load ptr addrspace(1), ptr addrspace(1) %0, align 1
 ; CHECK:   %ptrdatavec2 = load ptr addrspace(1), ptr addrspace(1) %1, align 1
-; CHECK:   %2 = insertelement <2 x ptr addrspace(1)> undef, ptr addrspace(1) %ptrdatavec1, i32 0
+; CHECK:   %2 = insertelement <2 x ptr addrspace(1)> poison, ptr addrspace(1) %ptrdatavec1, i32 0
 ; CHECK:   %3 = insertelement <2 x ptr addrspace(1)> %2, ptr addrspace(1) %ptrdatavec2, i32 1
 ; CHECK:   %ptrdatavec.gep3 = getelementptr i32, ptr addrspace(1) %ptrdatavec1, i64 1
 ; CHECK:   %ptrdatavec.gep4 = getelementptr i32, ptr addrspace(1) %ptrdatavec2, i64 1
-; CHECK:   %4 = insertelement <2 x ptr addrspace(1)> undef, ptr addrspace(1) %ptrdatavec.gep3, i32 0
+; CHECK:   %4 = insertelement <2 x ptr addrspace(1)> poison, ptr addrspace(1) %ptrdatavec.gep3, i32 0
 ; CHECK:   %5 = insertelement <2 x ptr addrspace(1)> %4, ptr addrspace(1) %ptrdatavec.gep4, i32 1
 ; CHECK:   %vec1 = call <2 x i32> @llvm.masked.gather.v2i32.v2p1(<2 x ptr addrspace(1)> %3, i32 16, <2 x i1> zeroinitializer, <2 x ptr addrspace(1)> zeroinitializer)
 ; CHECK:   %6 = extractelement <2 x i32> %vec1, i32 0
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned.ll
index 5c62c31b2a6ad..73993e3c2883b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned.ll
@@ -52,7 +52,7 @@ entry:
   %reass.add = add nuw nsw i64 %mul7, %call1
   %reass.mul = mul nuw nsw i64 %reass.add, %call3
   %add8 = add nuw nsw i64 %reass.mul, %call
-  %vecinit = insertelement <4 x i64> undef, i64 %call3, i64 0
+  %vecinit = insertelement <4 x i64> poison, i64 %call3, i64 0
   %vecinit11 = insertelement <4 x i64> %vecinit, i64 %call5, i64 1
   %call12 = tail call i64 @__mux_get_global_size(i32 noundef 2)
   %vecinit13 = insertelement <4 x i64> %vecinit11, i64 %call12, i64 2
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned_scalarized.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned_scalarized.ll
index 82a81725f7320..dfba183808512 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned_scalarized.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned_scalarized.ll
@@ -32,7 +32,7 @@ entry:
   %reass.add = add nuw nsw i64 %mul7, %call1
   %reass.mul = mul nuw nsw i64 %reass.add, %call3
   %add8 = add nuw nsw i64 %reass.mul, %call
-  %vecinit = insertelement <4 x i64> undef, i64 %call3, i64 0
+  %vecinit = insertelement <4 x i64> poison, i64 %call3, i64 0
   %vecinit11 = insertelement <4 x i64> %vecinit, i64 %call5, i64 1
   %call12 = tail call i64 @__mux_get_global_size(i32 2)
   %vecinit13 = insertelement <4 x i64> %vecinit11, i64 %call12, i64 2
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned.ll
index 3dac89baae9d5..0b51e0f078b05 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned.ll
@@ -32,7 +32,7 @@ entry:
   %reass.add = add nuw nsw i64 %mul7, %call1
   %reass.mul = mul nuw nsw i64 %reass.add, %call3
   %add8 = add nuw nsw i64 %reass.mul, %call
-  %vecinit = insertelement <4 x i64> undef, i64 %call3, i64 0
+  %vecinit = insertelement <4 x i64> poison, i64 %call3, i64 0
   %vecinit11 = insertelement <4 x i64> %vecinit, i64 %call5, i64 1
   %call12 = tail call i64 @__mux_get_global_size(i32 2)
   %vecinit13 = insertelement <4 x i64> %vecinit11, i64 %call12, i64 2
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned_scalarized.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned_scalarized.ll
index e9273ceaf80f2..ffdb64718d8b8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned_scalarized.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned_scalarized.ll
@@ -32,7 +32,7 @@ entry:
   %reass.add = add nuw nsw i64 %mul7, %call1
   %reass.mul = mul nuw nsw i64 %reass.add, %call3
   %add8 = add nuw nsw i64 %reass.mul, %call
-  %vecinit = insertelement <4 x i64> undef, i64 %call3, i64 0
+  %vecinit = insertelement <4 x i64> poison, i64 %call3, i64 0
   %vecinit11 = insertelement <4 x i64> %vecinit, i64 %call5, i64 1
   %call12 = tail call i64 @__mux_get_global_size(i32 2)
   %vecinit13 = insertelement <4 x i64> %vecinit11, i64 %call12, i64 2
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle.ll
index cd5cd3b8f9bec..b5783f2c9e55c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle.ll
@@ -49,12 +49,12 @@ define spir_kernel void @kernel(ptr %in, ptr %out) {
 ; CHECK: [[BASE:%.*]] = mul i32 %2, 2
 ; CHECK: [[IDX0:%.*]] = add i32 [[BASE]], 0
 ; CHECK: [[ELT0:%.*]] = extractelement <8 x float> %1, i32 [[IDX0]]
-; CHECK: [[TVEC:%.*]] = insertelement <2 x float> undef, float [[ELT0]], i32 0
+; CHECK: [[TVEC:%.*]] = insertelement <2 x float> poison, float [[ELT0]], i32 0
 ; CHECK: [[IDX1:%.*]] = add i32 [[BASE]], 1
 ; CHECK: [[ELT1:%.*]] = extractelement <8 x float> %1, i32 [[IDX1]]
 ; CHECK: [[VEC:%.*]] = insertelement <2 x float> [[TVEC]], float [[ELT1]], i32 1
 ; CHECK: [[SHUFFLE:%.*]] = call <2 x float> @__mux_sub_group_shuffle_v2f32(<2 x float> [[VEC]], i32 [[MUXIDX]])
-; CHECK: [[SPLAT:%.*]] = shufflevector <2 x float> [[SHUFFLE]], <2 x float> undef,
+; CHECK: [[SPLAT:%.*]] = shufflevector <2 x float> [[SHUFFLE]], <2 x float> poison,
 ; CHECK-SAME:                          <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
 define spir_kernel void @kernel_vec_data(ptr %in, ptr %out) {
   %gid = tail call i64 @__mux_get_global_id(i32 0)
@@ -89,7 +89,7 @@ define spir_kernel void @kernel_const_idx(ptr %in, ptr %out) {
 ; at element index 2
 ; CHECK: [[VEC:%.*]] = call <2 x float> @llvm.vector.extract.v2f32.v8f32(<8 x float> {{%.*}}, i64 2)
 ; CHECK: [[SHUFFLE:%.*]] = call <2 x float> @__mux_sub_group_shuffle_v2f32(<2 x float> [[VEC]], i32 0)
-; CHECK: [[SPLAT:%.*]] = shufflevector <2 x float> [[SHUFFLE]], <2 x float> undef,
+; CHECK: [[SPLAT:%.*]] = shufflevector <2 x float> [[SHUFFLE]], <2 x float> poison,
 ; CHECK-SAME:                          <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
 ; CHECK: store <8 x float> [[SPLAT]]
 define spir_kernel void @kernel_vec_data_const_idx(ptr %in, ptr %out) {
@@ -137,7 +137,7 @@ define spir_kernel void @kernel_uniform_data_varying_idx(i64 %val, ptr %idxs, pt
 ; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_uniform_vec_data(<2 x float> %val, ptr %out)
 ; It doesn't matter what sub-group index we choose because the data is uniform.
 ; Just splat it.
-; CHECK: [[SPLAT:%.*]] = shufflevector <2 x float> %val, <2 x float> undef,
+; CHECK: [[SPLAT:%.*]] = shufflevector <2 x float> %val, <2 x float> poison,
 ; CHECK-SAME:                          <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
 ; CHECK: store <8 x float> [[SPLAT]]
 define spir_kernel void @kernel_uniform_vec_data(<2 x float> %val, ptr %out) {
@@ -153,7 +153,7 @@ define spir_kernel void @kernel_uniform_vec_data(<2 x float> %val, ptr %out) {
 ; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_uniform_vec_data_varying_idx(<2 x float> %val, ptr %idxs, ptr %out)
 ; It doesn't matter what sub-group index we choose because the data is uniform.
 ; Just splat it.
-; CHECK: [[SPLAT:%.*]] = shufflevector <2 x float> %val, <2 x float> undef,
+; CHECK: [[SPLAT:%.*]] = shufflevector <2 x float> %val, <2 x float> poison,
 ; CHECK-SAME:                          <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
 ; CHECK: store <8 x float> [[SPLAT]]
 define spir_kernel void @kernel_uniform_vec_data_varying_idx(<2 x float> %val, ptr %idxs, ptr %out) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_down.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_down.ll
index 7eaf85a414023..5a7c4b4e7f8fb 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_down.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_down.ll
@@ -77,7 +77,7 @@ define spir_kernel void @kernel(ptr %lhsptr, ptr %rhsptr, ptr %out) {
 ; CHECK: [[ELTBASE0:%.*]] = mul i32 [[SUBVECIDX0]], 4
 ; CHECK: [[VECIDX00:%.*]] = add i32 [[ELTBASE0]], 0
 ; CHECK: [[ELT00:%.*]] = extractelement <16 x i8> [[SHUFF0]], i32 [[VECIDX00]]
-; CHECK: [[VEC00:%.*]] = insertelement <4 x i8> undef, i8 [[ELT00]], i32 0
+; CHECK: [[VEC00:%.*]] = insertelement <4 x i8> poison, i8 [[ELT00]], i32 0
 ; CHECK: [[VECIDX01:%.*]] = add i32 [[ELTBASE0]], 1
 ; CHECK: [[ELT01:%.*]] = extractelement <16 x i8> [[SHUFF0]], i32 [[VECIDX01]]
 ; CHECK: [[VEC01:%.*]] = insertelement <4 x i8> [[VEC00]], i8 [[ELT01]], i32 1
@@ -95,7 +95,7 @@ define spir_kernel void @kernel(ptr %lhsptr, ptr %rhsptr, ptr %out) {
 ; CHECK: [[ELTBASE1:%.*]] = mul i32 [[SUBVECIDX1]], 4
 ; CHECK: [[VECIDX10:%.*]] = add i32 [[ELTBASE1]], 0
 ; CHECK: [[ELT10:%.*]] = extractelement <16 x i8> [[SHUFF1]], i32 [[VECIDX10]]
-; CHECK: [[VEC10:%.*]] = insertelement <4 x i8> undef, i8 [[ELT10]], i32 0
+; CHECK: [[VEC10:%.*]] = insertelement <4 x i8> poison, i8 [[ELT10]], i32 0
 ; CHECK: [[VECIDX11:%.*]] = add i32 [[ELTBASE1]], 1
 ; CHECK: [[ELT11:%.*]] = extractelement <16 x i8> [[SHUFF1]], i32 [[VECIDX11]]
 ; CHECK: [[VEC11:%.*]] = insertelement <4 x i8> [[VEC10]], i8 [[ELT11]], i32 1
@@ -113,7 +113,7 @@ define spir_kernel void @kernel(ptr %lhsptr, ptr %rhsptr, ptr %out) {
 ; CHECK: [[ELTBASE2:%.*]] = mul i32 [[SUBVECIDX2]], 4
 ; CHECK: [[VECIDX20:%.*]] = add i32 [[ELTBASE2]], 0
 ; CHECK: [[ELT20:%.*]] = extractelement <16 x i8> [[SHUFF2]], i32 [[VECIDX20]]
-; CHECK: [[VEC20:%.*]] = insertelement <4 x i8> undef, i8 [[ELT20]], i32 0
+; CHECK: [[VEC20:%.*]] = insertelement <4 x i8> poison, i8 [[ELT20]], i32 0
 ; CHECK: [[VECIDX21:%.*]] = add i32 [[ELTBASE2]], 1
 ; CHECK: [[ELT21:%.*]] = extractelement <16 x i8> [[SHUFF2]], i32 [[VECIDX21]]
 ; CHECK: [[VEC21:%.*]] = insertelement <4 x i8> [[VEC20]], i8 [[ELT21]], i32 1
@@ -131,7 +131,7 @@ define spir_kernel void @kernel(ptr %lhsptr, ptr %rhsptr, ptr %out) {
 ; CHECK: [[ELTBASE3:%.*]] = mul i32 [[SUBVECIDX3]], 4
 ; CHECK: [[VECIDX30:%.*]] = add i32 [[ELTBASE3]], 0
 ; CHECK: [[ELT30:%.*]] = extractelement <16 x i8> [[SHUFF3]], i32 [[VECIDX30]]
-; CHECK: [[VEC30:%.*]] = insertelement <4 x i8> undef, i8 [[ELT30]], i32 0
+; CHECK: [[VEC30:%.*]] = insertelement <4 x i8> poison, i8 [[ELT30]], i32 0
 ; CHECK: [[VECIDX31:%.*]] = add i32 [[ELTBASE3]], 1
 ; CHECK: [[ELT31:%.*]] = extractelement <16 x i8> [[SHUFF3]], i32 [[VECIDX31]]
 ; CHECK: [[VEC31:%.*]] = insertelement <4 x i8> [[VEC30]], i8 [[ELT31]], i32 1
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_up.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_up.ll
index 99f08c8efa9cc..779596da58a14 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_up.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_up.ll
@@ -101,7 +101,7 @@ define spir_kernel void @kernel(ptr %lhsptr, ptr %rhsptr, ptr %out) {
 ; CHECK: [[ELTBASE0:%.*]] = mul i32 [[SUBVECIDX0]], 4
 ; CHECK: [[VECIDX00:%.*]] = add i32 [[ELTBASE0]], 0
 ; CHECK: [[ELT00:%.*]] = extractelement <16 x i8> [[SHUFF0]], i32 [[VECIDX00]]
-; CHECK: [[VEC00:%.*]] = insertelement <4 x i8> undef, i8 [[ELT00]], i32 0
+; CHECK: [[VEC00:%.*]] = insertelement <4 x i8> poison, i8 [[ELT00]], i32 0
 ; CHECK: [[VECIDX01:%.*]] = add i32 [[ELTBASE0]], 1
 ; CHECK: [[ELT01:%.*]] = extractelement <16 x i8> [[SHUFF0]], i32 [[VECIDX01]]
 ; CHECK: [[VEC01:%.*]] = insertelement <4 x i8> [[VEC00]], i8 [[ELT01]], i32 1
@@ -119,7 +119,7 @@ define spir_kernel void @kernel(ptr %lhsptr, ptr %rhsptr, ptr %out) {
 ; CHECK: [[ELTBASE1:%.*]] = mul i32 [[SUBVECIDX1]], 4
 ; CHECK: [[VECIDX10:%.*]] = add i32 [[ELTBASE1]], 0
 ; CHECK: [[ELT10:%.*]] = extractelement <16 x i8> [[SHUFF1]], i32 [[VECIDX10]]
-; CHECK: [[VEC10:%.*]] = insertelement <4 x i8> undef, i8 [[ELT10]], i32 0
+; CHECK: [[VEC10:%.*]] = insertelement <4 x i8> poison, i8 [[ELT10]], i32 0
 ; CHECK: [[VECIDX11:%.*]] = add i32 [[ELTBASE1]], 1
 ; CHECK: [[ELT11:%.*]] = extractelement <16 x i8> [[SHUFF1]], i32 [[VECIDX11]]
 ; CHECK: [[VEC11:%.*]] = insertelement <4 x i8> [[VEC10]], i8 [[ELT11]], i32 1
@@ -137,7 +137,7 @@ define spir_kernel void @kernel(ptr %lhsptr, ptr %rhsptr, ptr %out) {
 ; CHECK: [[ELTBASE2:%.*]] = mul i32 [[SUBVECIDX2]], 4
 ; CHECK: [[VECIDX20:%.*]] = add i32 [[ELTBASE2]], 0
 ; CHECK: [[ELT20:%.*]] = extractelement <16 x i8> [[SHUFF2]], i32 [[VECIDX20]]
-; CHECK: [[VEC20:%.*]] = insertelement <4 x i8> undef, i8 [[ELT20]], i32 0
+; CHECK: [[VEC20:%.*]] = insertelement <4 x i8> poison, i8 [[ELT20]], i32 0
 ; CHECK: [[VECIDX21:%.*]] = add i32 [[ELTBASE2]], 1
 ; CHECK: [[ELT21:%.*]] = extractelement <16 x i8> [[SHUFF2]], i32 [[VECIDX21]]
 ; CHECK: [[VEC21:%.*]] = insertelement <4 x i8> [[VEC20]], i8 [[ELT21]], i32 1
@@ -155,7 +155,7 @@ define spir_kernel void @kernel(ptr %lhsptr, ptr %rhsptr, ptr %out) {
 ; CHECK: [[ELTBASE3:%.*]] = mul i32 [[SUBVECIDX3]], 4
 ; CHECK: [[VECIDX30:%.*]] = add i32 [[ELTBASE3]], 0
 ; CHECK: [[ELT30:%.*]] = extractelement <16 x i8> [[SHUFF3]], i32 [[VECIDX30]]
-; CHECK: [[VEC30:%.*]] = insertelement <4 x i8> undef, i8 [[ELT30]], i32 0
+; CHECK: [[VEC30:%.*]] = insertelement <4 x i8> poison, i8 [[ELT30]], i32 0
 ; CHECK: [[VECIDX31:%.*]] = add i32 [[ELTBASE3]], 1
 ; CHECK: [[ELT31:%.*]] = extractelement <16 x i8> [[SHUFF3]], i32 [[VECIDX31]]
 ; CHECK: [[VEC31:%.*]] = insertelement <4 x i8> [[VEC30]], i8 [[ELT31]], i32 1
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_xor.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_xor.ll
index dec28cd3cdd5e..c1aaca731d2cd 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_xor.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_xor.ll
@@ -37,7 +37,7 @@ target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"
 ; Shuffle across any hardware sub-group
 ; CHECK: [[SHUFF_ELT0:%.*]] = call half @__mux_sub_group_shuffle_f16(half [[ELT0]], i32 [[ID0]])
 ; Put that result into the final vector
-; CHECK: [[SHUFF_VEC0:%.*]] = insertelement <4 x half> undef, half [[SHUFF_ELT0]], i32 0
+; CHECK: [[SHUFF_VEC0:%.*]] = insertelement <4 x half> poison, half [[SHUFF_ELT0]], i32 0
 
 ; And so on for the other shuffle values
 ; CHECK: [[IDXELT1:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 1
@@ -78,7 +78,7 @@ define spir_kernel void @kernel_varying_data_const_value(ptr %in, ptr %out) {
 ; CHECK: [[ELT0:%.*]] = extractelement <4 x half> [[DATA:%.*]], i32 [[IDXELT0]]
 ; CHECK: [[ID0:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], i32 0
 ; CHECK: [[SHUFF_ELT0:%.*]] = call half @__mux_sub_group_shuffle_f16(half [[ELT0]], i32 [[ID0]])
-; CHECK: [[SHUFF_VEC0:%.*]] = insertelement <4 x half> undef, half [[SHUFF_ELT0]], i32 0
+; CHECK: [[SHUFF_VEC0:%.*]] = insertelement <4 x half> poison, half [[SHUFF_ELT0]], i32 0
 ; CHECK: [[IDXELT1:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 1
 ; CHECK: [[ELT1:%.*]] = extractelement <4 x half> [[DATA]], i32 [[IDXELT1]]
 ; CHECK: [[ID1:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], i32 1
@@ -126,7 +126,7 @@ define spir_kernel void @kernel_uniform_data_uniform_value(half %data, i32 %val,
 ; CHECK: [[ELT0:%.*]] = extractelement <4 x half> [[DATA:%.*]], i32 [[IDXELT0]]
 ; CHECK: [[ID0:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], i32 0
 ; CHECK: [[SHUFF_ELT0:%.*]] = call half @__mux_sub_group_shuffle_f16(half [[ELT0]], i32 [[ID0]])
-; CHECK: [[SHUFF_VEC0:%.*]] = insertelement <4 x half> undef, half [[SHUFF_ELT0]], i32 0
+; CHECK: [[SHUFF_VEC0:%.*]] = insertelement <4 x half> poison, half [[SHUFF_ELT0]], i32 0
 ; CHECK: [[IDXELT1:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 1
 ; CHECK: [[ELT1:%.*]] = extractelement <4 x half> [[DATA]], i32 [[IDXELT1]]
 ; CHECK: [[ID1:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], i32 1
@@ -164,20 +164,20 @@ define spir_kernel void @kernel_varying_data_varying_value(ptr %in, ptr %vals, p
 ; CHECK: [[MULIDXELT0:%.*]] = mul i32 [[IDXELT0]], 2
 ; CHECK: [[MADIDXELT00:%.*]] = add i32 [[MULIDXELT0]], 0
 ; CHECK: [[ELT00:%.*]] = extractelement <8 x float> [[DATA:%.*]], i32 [[MADIDXELT00]]
-; CHECK: [[DATAELT00:%.*]] = insertelement <2 x float> undef, float [[ELT00]], i32 0
+; CHECK: [[DATAELT00:%.*]] = insertelement <2 x float> poison, float [[ELT00]], i32 0
 ; CHECK: [[MADIDXELT01:%.*]] = add i32 [[MULIDXELT0]], 1
 ; CHECK: [[ELT01:%.*]] = extractelement <8 x float> [[DATA:%.*]], i32 [[MADIDXELT01]]
 ; CHECK: [[DATAELT01:%.*]] = insertelement <2 x float> [[DATAELT00]], float [[ELT01]], i32 1
 ; CHECK: [[ID0:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], i32 0
 ; CHECK: [[SHUFF_ELT0:%.*]] = call <2 x float> @__mux_sub_group_shuffle_v2f32(<2 x float> [[DATAELT01]], i32 [[ID0]])
 ; CHECK: [[SHUFF_RES0:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v2f32(
-; CHECK-SAME:                                      <8 x float> undef, <2 x float> [[SHUFF_ELT0]], i64 0)
+; CHECK-SAME:                                      <8 x float> poison, <2 x float> [[SHUFF_ELT0]], i64 0)
 
 ; CHECK: [[IDXELT1:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 1
 ; CHECK: [[MULIDXELT1:%.*]] = mul i32 [[IDXELT1]], 2
 ; CHECK: [[MADIDXELT10:%.*]] = add i32 [[MULIDXELT1]], 0
 ; CHECK: [[ELT10:%.*]] = extractelement <8 x float> [[DATA:%.*]], i32 [[MADIDXELT10]]
-; CHECK: [[DATAELT10:%.*]] = insertelement <2 x float> undef, float [[ELT10]], i32 0
+; CHECK: [[DATAELT10:%.*]] = insertelement <2 x float> poison, float [[ELT10]], i32 0
 ; CHECK: [[MADIDXELT11:%.*]] = add i32 [[MULIDXELT1]], 1
 ; CHECK: [[ELT11:%.*]] = extractelement <8 x float> [[DATA:%.*]], i32 [[MADIDXELT11]]
 ; CHECK: [[DATAELT11:%.*]] = insertelement <2 x float> [[DATAELT10]], float [[ELT11]], i32 1
@@ -190,7 +190,7 @@ define spir_kernel void @kernel_varying_data_varying_value(ptr %in, ptr %vals, p
 ; CHECK: [[MULIDXELT2:%.*]] = mul i32 [[IDXELT2]], 2
 ; CHECK: [[MADIDXELT20:%.*]] = add i32 [[MULIDXELT2]], 0
 ; CHECK: [[ELT20:%.*]] = extractelement <8 x float> [[DATA:%.*]], i32 [[MADIDXELT20]]
-; CHECK: [[DATAELT20:%.*]] = insertelement <2 x float> undef, float [[ELT20]], i32 0
+; CHECK: [[DATAELT20:%.*]] = insertelement <2 x float> poison, float [[ELT20]], i32 0
 ; CHECK: [[MADIDXELT21:%.*]] = add i32 [[MULIDXELT2]], 1
 ; CHECK: [[ELT21:%.*]] = extractelement <8 x float> [[DATA:%.*]], i32 [[MADIDXELT21]]
 ; CHECK: [[DATAELT21:%.*]] = insertelement <2 x float> [[DATAELT20]], float [[ELT21]], i32 1
@@ -203,7 +203,7 @@ define spir_kernel void @kernel_varying_data_varying_value(ptr %in, ptr %vals, p
 ; CHECK: [[MULIDXELT3:%.*]] = mul i32 [[IDXELT3]], 2
 ; CHECK: [[MADIDXELT30:%.*]] = add i32 [[MULIDXELT3]], 0
 ; CHECK: [[ELT30:%.*]] = extractelement <8 x float> [[DATA:%.*]], i32 [[MADIDXELT30]]
-; CHECK: [[DATAELT30:%.*]] = insertelement <2 x float> undef, float [[ELT30]], i32 0
+; CHECK: [[DATAELT30:%.*]] = insertelement <2 x float> poison, float [[ELT30]], i32 0
 ; CHECK: [[MADIDXELT31:%.*]] = add i32 [[MULIDXELT3]], 1
 ; CHECK: [[ELT31:%.*]] = extractelement <8 x float> [[DATA:%.*]], i32 [[MADIDXELT31]]
 ; CHECK: [[DATAELT31:%.*]] = insertelement <2 x float> [[DATAELT30]], float [[ELT31]], i32 1
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_uniform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_uniform.ll
index fee72ee014ac9..7d9b0385dbb90 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_uniform.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_uniform.ll
@@ -31,8 +31,8 @@ for.cond:                                         ; preds = %entry, %for.body
   %storemerge = phi <4 x i32> [ %inc, %for.body ], [ zeroinitializer, %entry ]
   %call1 = call i64 @__mux_get_global_size(i32 0)
   %conv = trunc i64 %call1 to i32
-  %splat.splatinsert = insertelement <4 x i32> undef, i32 %conv, i32 0
-  %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %splat.splatinsert = insertelement <4 x i32> poison, i32 %conv, i32 0
+  %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
   %cmp2 = icmp slt <4 x i32> %storemerge, %splat.splat
   %0 = extractelement <4 x i1> %cmp2, i64 0
   br i1 %0, label %for.body, label %for.end
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_varying.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_varying.ll
index a1a0fed20654e..998c283a2f46f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_varying.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_varying.ll
@@ -25,8 +25,8 @@ define spir_kernel void @vector_loop(i32 addrspace(1)* %in, i32 addrspace(1)* %o
 entry:
   %call = call i64 @__mux_get_global_id(i32 0)
   %call.trunc = trunc i64 %call to i32
-  %call.splatinsert = insertelement <4 x i32> undef, i32 %call.trunc, i32 0
-  %call.splat = shufflevector <4 x i32> %call.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %call.splatinsert = insertelement <4 x i32> poison, i32 %call.trunc, i32 0
+  %call.splat = shufflevector <4 x i32> %call.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
   %cmp = icmp eq i64 %call, 0
   br i1 %cmp, label %for.end, label %for.cond
 
@@ -34,8 +34,8 @@ for.cond:                                         ; preds = %entry, %for.body
   %storemerge = phi <4 x i32> [ %inc, %for.body ], [ zeroinitializer, %entry ]
   %call1 = call i64 @__mux_get_global_size(i32 0)
   %conv = trunc i64 %call1 to i32
-  %splat.splatinsert = insertelement <4 x i32> undef, i32 %conv, i32 0
-  %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %splat.splatinsert = insertelement <4 x i32> poison, i32 %conv, i32 0
+  %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
   %cmp2 = icmp slt <4 x i32> %storemerge, %splat.splat
   %0 = extractelement <4 x i1> %cmp2, i64 0
   br i1 %0, label %for.body, label %for.end
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_scalar_interleaved_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_scalar_interleaved_load.ll
index a7f8ba693664d..d1085569a5207 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_scalar_interleaved_load.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_scalar_interleaved_load.ll
@@ -41,7 +41,7 @@ if.then2:                                     ; preds = %if.then1
   %gep2 = getelementptr inbounds float, float addrspace(1)* %gep1, i64 %mul1
   %cmp3 = icmp slt i64 %gid1, %n
   %load1 = load float, float addrspace(1)* %gep2, align 4
-  %ie1 = insertelement <4 x float> undef, float %load1, i32 0
+  %ie1 = insertelement <4 x float> poison, float %load1, i32 0
   br i1 %cmp3, label %if.then3, label %if.else3
 
 if.then3:                                     ; preds = %if.then2

From 25068196293868d53c50f6604fc5d9303574c685 Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Wed, 27 Aug 2025 12:04:32 +0100
Subject: [PATCH 173/182] [NFC] Update to clang-format/clang-tidy 20.

---
 .../compiler_passes/vecz/include/vecz/vecz_choices.h            | 2 +-
 .../vecz/source/include/analysis/packetization_analysis.h       | 2 +-
 .../compiler_passes/vecz/source/include/control_flow_boscc.h    | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_choices.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_choices.h
index a06adcaac8b88..231cc228830ee 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_choices.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_choices.h
@@ -126,7 +126,7 @@ class VectorizationChoices {
   /// @brief Check if a choice is enabled or not
   /// @param C The choice to check for
   /// @return true if the choice is enabled, false otherwise
-  bool isEnabled(Choice C) const { return Enabled.count(C) > 0; }
+  bool isEnabled(Choice C) const { return Enabled.contains(C); }
 
   /// @brief Enable a choice
   /// @param C The choice to enable
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/packetization_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/packetization_analysis.h
index 9321ad2ed7267..f87fe45434a5f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/packetization_analysis.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/packetization_analysis.h
@@ -61,7 +61,7 @@ class PacketizationAnalysisResult {
   /// @param[in] V the value to query
   /// @return true if the value was marked for packetization, false otherwise.
   bool needsPacketization(const llvm::Value *V) const {
-    return toPacketize.count(V) != 0;
+    return toPacketize.contains(V);
   }
 
  private:
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/control_flow_boscc.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/control_flow_boscc.h
index b1eecfe2854e4..5cb6ca77eeefd 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/control_flow_boscc.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/control_flow_boscc.h
@@ -110,7 +110,7 @@ class ControlFlowConversionState::BOSCCGadget final {
     /// @param[in] B Block to look for in the region
     /// @return Whether the block belong to the region or not.
     bool contains(llvm::BasicBlock *B) const {
-      return predicatedBlocks.count(B);
+      return predicatedBlocks.contains(B);
     }
   };
   /// @brief List of all duplicated uniform regions.

From cfc296e2e3bd9d2720ad90b3ded015977a51cfcb Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Thu, 28 Aug 2025 19:30:39 +0100
Subject: [PATCH 174/182] [vecz] Avoid mishandling poison.

We were using 'and' to filter branch conditions by the entry mask,
trusting that whenever the entry mask was false, the result would
assuredly be false as well. This does not work when a branch condition
is computed as 'poison', as 'false and poison' evaluates to 'poison',
not 'false'.

We can generate 'select' instructions instead, which avoid this problem:
'select mask, cond, false' yields 'false' whenever the mask is false,
even if cond is 'poison'.
---
 .../control_flow_conversion_pass.cpp          | 50 ++++++++++++-------
 .../vecz/test/lit/llvm/Boscc/boscc_merge.ll   |  2 +-
 .../lit/llvm/Boscc/partial_linearization13.ll |  2 +-
 .../control_flow_conversion_nested_loops.ll   |  8 +--
 .../llvm/control_flow_conversion_order_y.ll   |  8 +--
 .../llvm/control_flow_conversion_order_z.ll   |  8 +--
 .../control_flow_conversion_varying_loop.ll   |  4 +-
 .../vecz/test/lit/llvm/divergent_loop_bug.ll  |  4 +-
 .../test/lit/llvm/partial_linearization13.ll  |  2 +-
 .../vecz/test/lit/llvm/struct_phi.ll          |  2 +-
 10 files changed, 51 insertions(+), 39 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
index f33f20a0e878a..8df91eeb9769a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
@@ -840,13 +840,17 @@ bool ControlFlowConversionState::Impl::createExitMasks(BasicBlock &BB,
 
     Value *cond = BI->getCondition();
     if (isVarying) {
-      maskInfo.exitMasks[trueBB] = B.CreateAnd(
-          maskInfo.entryMask, cond, trueBB->getName() + ".exit_mask");
+      Value *constantFalse = getDefaultValue(cond->getType());
+
+      maskInfo.exitMasks[trueBB] =
+          B.CreateSelect(maskInfo.entryMask, cond, constantFalse,
+                         trueBB->getName() + ".exit_mask");
 
       // For the false edge, we have to negate the condition.
-      Value *falseCond = B.CreateNot(cond, cond->getName() + ".not");
-      maskInfo.exitMasks[falseBB] = B.CreateAnd(
-          maskInfo.entryMask, falseCond, falseBB->getName() + ".exit_mask");
+      Value *negCond = B.CreateNot(cond, cond->getName() + ".not");
+      maskInfo.exitMasks[falseBB] =
+          B.CreateSelect(maskInfo.entryMask, negCond, constantFalse,
+                         falseBB->getName() + ".exit_mask");
 
       LLVM_DEBUG(dbgs() << BB.getName() << ": varying exit mask to "
                         << trueBB->getName() << ": "
@@ -883,9 +887,15 @@ bool ControlFlowConversionState::Impl::createExitMasks(BasicBlock &BB,
     // condition, so that if no case has its condition true, then we can choose
     // default.
     Value *caseConds = nullptr;
+    Value *constantFalse = nullptr;
     for (auto c : SI->cases()) {
       Value *caseCond = B.CreateICmpEQ(cond, c.getCaseValue());
-      caseConds = !caseConds ? caseCond : B.CreateOr(caseConds, caseCond);
+      if (!caseConds) {
+        caseConds = caseCond;
+        constantFalse = getDefaultValue(caseCond->getType());
+      } else {
+        caseConds = B.CreateOr(caseConds, caseCond);
+      }
       BasicBlock *caseBlock = c.getCaseSuccessor();
       if (isBOSCCEntry) {
         if (BasicBlock *caseBlockUniform = BOSCC->getBlock(caseBlock)) {
@@ -894,15 +904,16 @@ bool ControlFlowConversionState::Impl::createExitMasks(BasicBlock &BB,
       }
 
       if (isVarying) {
-        maskInfo.exitMasks[caseBlock] = B.CreateAnd(
-            maskInfo.entryMask, caseCond, caseBlock->getName() + ".exit_mask");
+        maskInfo.exitMasks[caseBlock] =
+            B.CreateSelect(maskInfo.entryMask, caseCond, constantFalse,
+                           caseBlock->getName() + ".exit_mask");
         LLVM_DEBUG(dbgs() << BB.getName() << ": varying exit mask to "
                           << caseBlock->getName() << ": "
                           << *maskInfo.exitMasks[caseBlock] << "\n");
       } else {
-        maskInfo.exitMasks[caseBlock] = B.CreateSelect(
-            caseCond, maskInfo.entryMask, getDefaultValue(caseCond->getType()),
-            caseBlock->getName() + ".exit_mask");
+        maskInfo.exitMasks[caseBlock] =
+            B.CreateSelect(maskInfo.entryMask, caseCond, constantFalse,
+                           caseBlock->getName() + ".exit_mask");
         LLVM_DEBUG(dbgs() << BB.getName() << ": uniform exit mask to "
                           << caseBlock->getName() << ": "
                           << *maskInfo.exitMasks[caseBlock] << "\n");
@@ -913,15 +924,16 @@ bool ControlFlowConversionState::Impl::createExitMasks(BasicBlock &BB,
 
     Value *negCond = B.CreateNot(caseConds, caseConds->getName() + ".not");
     if (isVarying) {
-      maskInfo.exitMasks[defaultDest] = B.CreateAnd(
-          negCond, maskInfo.entryMask, defaultDest->getName() + ".exit_mask");
+      maskInfo.exitMasks[defaultDest] =
+          B.CreateSelect(maskInfo.entryMask, negCond, constantFalse,
+                         defaultDest->getName() + ".exit_mask");
       LLVM_DEBUG(dbgs() << BB.getName() << ": varying exit mask to "
                         << defaultDest->getName() << ": "
                         << *maskInfo.exitMasks[defaultDest] << "\n");
     } else {
-      maskInfo.exitMasks[defaultDest] = B.CreateSelect(
-          negCond, maskInfo.entryMask, getDefaultValue(negCond->getType()),
-          defaultDest->getName() + ".exit_mask");
+      maskInfo.exitMasks[defaultDest] =
+          B.CreateSelect(maskInfo.entryMask, negCond, constantFalse,
+                         defaultDest->getName() + ".exit_mask");
       LLVM_DEBUG(dbgs() << BB.getName() << ": uniform exit mask to "
                         << defaultDest->getName() << ": "
                         << *maskInfo.exitMasks[defaultDest] << "\n");
@@ -1551,9 +1563,9 @@ bool ControlFlowConversionState::Impl::createBranchReductions() {
         if (auto *LTag = DR->getTag(&BB).loop;
             DR->isDivergent(BB) && (!LTag || LTag->isLoopDivergent())) {
           if (!isBranchCondTrulyUniform(cond, *UVR)) {
-            auto *newcond = BinaryOperator::Create(
-                Instruction::BinaryOps::And, cond, MaskInfos[&BB].entryMask,
-                cond->getName() + "_active");
+            auto *newcond = SelectInst::Create(MaskInfos[&BB].entryMask, cond,
+                                               getDefaultValue(cond->getType()),
+                                               cond->getName() + "_active");
             newcond->insertBefore(Branch->getIterator());
             cond = newcond;
           }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge.ll
index e886b1ada4903..014f19594e2b0 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge.ll
@@ -223,7 +223,7 @@ if.else6:                                             ; preds = %if.then6, %if.e
 ; CHECK: %[[CMP3:.+]] = icmp
 ; FIXME: We shouldn't need to mask this comparison, as it's truly uniform even
 ; on inactive lanes.
-; CHECK: %[[CMP3_ACTIVE:.+]] = and i1 %[[CMP3]], %[[CMP2]]
+; CHECK: %[[CMP3_ACTIVE:.+]] = select i1 %[[CMP2]], i1 %[[CMP3]], i1 false
 ; CHECK: %[[CMP3_ACTIVE_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[CMP3_ACTIVE]])
 ; CHECK: br i1 %[[CMP3_ACTIVE_ANY]], label %[[IFTHEN3PREHEADER:.+]], label %[[IFELSE3PREHEADER:.+]]
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization13.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization13.ll
index 4d54162e00c90..67d4e6542cdb5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization13.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization13.ll
@@ -230,7 +230,7 @@ attributes #2 = { nobuiltin nounwind readonly }
 ; CHECK: %[[TRUNC:.+]] = icmp
 ; FIXME: We shouldn't need to mask this comparison, as it's truly uniform even
 ; on inactive lanes.
-; CHECK: %[[TRUNC_ACTIVE:.+]] = and i1 %[[TRUNC]], {{%.*}}
+; CHECK: %[[TRUNC_ACTIVE:.+]] = select i1 {{%.*}}, i1 %[[TRUNC]], i1 false
 ; CHECK: %[[TRUNC_ACTIVE_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[TRUNC_ACTIVE]])
 ; CHECK: br i1 %[[TRUNC_ACTIVE_ANY]], label %[[SWBB8:.+]], label %[[SWBB:.+]]
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_nested_loops.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_nested_loops.ll
index f156402249999..7108df3732999 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_nested_loops.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_nested_loops.ll
@@ -174,9 +174,9 @@ declare i64 @__mux_get_global_id(i32)
 ; CHECK: %[[ENTRYMASK_FORCOND:.+]] = phi i1 [ true, %entry ], [ %[[FORINC12EXITMASK3:.+]], %[[FORINC12:.+]] ]
 ; CHECK: %[[EXITMASK1:.+]] = phi i1 [ false, %entry ], [ %[[LOOPEXITMASK2:.+]], %[[FORINC12]] ]
 ; CHECK: %[[CMP:.+]] = icmp slt i32 %[[STOREMERGE:.+]], 16
-; CHECK: %[[EDGEMASK_FORBODY:.+]] = and i1 %[[ENTRYMASK_FORCOND]], %[[CMP]]
+; CHECK: %[[EDGEMASK_FORBODY:.+]] = select i1 %[[ENTRYMASK_FORCOND]], i1 %[[CMP]], i1 false
 ; CHECK: %[[NOT_CMP:.+]] = xor i1 %[[CMP]], true
-; CHECK: %[[EDGEMASK_FOREND14:.+]] = and i1 %[[ENTRYMASK_FORCOND]], %[[NOT_CMP]]
+; CHECK: %[[EDGEMASK_FOREND14:.+]] = select i1 %[[ENTRYMASK_FORCOND]], i1 %[[NOT_CMP]], i1 false
 ; CHECK: %[[LOOPEXITMASK2]] = or i1 %[[EXITMASK1]], %[[EDGEMASK_FOREND14]]
 ; CHECK: br label %[[FORBODY:.+]]
 
@@ -187,9 +187,9 @@ declare i64 @__mux_get_global_id(i32)
 ; CHECK: %[[ENTRYMASK_FORCOND3:.+]] = phi i1 [ %[[EDGEMASK_FORBODY:.+]], %[[FORBODY]] ], [ %[[FORBODY6EXITMASK:.+]], %[[FORBODY6:.+]] ]
 ; CHECK: %[[PREVEXITMASK:.+]] = phi i1 [ false, %[[FORBODY]] ], [ %[[FORINC12LOOPEXITMASKUPDATE:.+]], %[[FORBODY6]] ]
 ; CHECK: %[[CMP4:.+]] = icmp slt i32 %[[STOREMERGE]], 24
-; CHECK: %[[EDGEMASK_FORBODY6:.+]] = and i1 %[[ENTRYMASK_FORCOND3]], %[[CMP4]]
+; CHECK: %[[EDGEMASK_FORBODY6:.+]] = select i1 %[[ENTRYMASK_FORCOND3]], i1 %[[CMP4]], i1 false
 ; CHECK: %[[NOT_CMP4:.+]] = xor i1 %[[CMP4]], true
-; CHECK: %[[EDGEMASK_FORINC12:.+]] = and i1 %[[ENTRYMASK_FORCOND3]], %[[NOT_CMP4]]
+; CHECK: %[[EDGEMASK_FORINC12:.+]] = select i1 %[[ENTRYMASK_FORCOND3]], i1 %[[NOT_CMP4]], i1 false
 ; CHECK: %[[FORINC12LOOPEXITMASKUPDATE]] = or i1 %[[PREVEXITMASK]], %[[EDGEMASK_FORINC12]]
 ; CHECK: br label %[[FORBODY6:.+]]
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_order_y.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_order_y.ll
index e3c8c8f136f05..0384d9959e24a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_order_y.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_order_y.ll
@@ -174,9 +174,9 @@ declare i64 @__mux_get_global_id(i32)
 ; CHECK: %[[ENTRYMASK_FORCOND:.+]] = phi i1 [ true, %entry ], [ %[[FORINC12EXITMASK3:.+]], %[[FORINC12:.+]] ]
 ; CHECK: %[[EXITMASK1:.+]] = phi i1 [ false, %entry ], [ %[[LOOPEXITMASK2:.+]], %[[FORINC12]] ]
 ; CHECK: %[[CMP:.+]] = icmp slt i32 %[[STOREMERGE:.+]], 16
-; CHECK: %[[EDGEMASK_FORBODY:.+]] = and i1 %[[ENTRYMASK_FORCOND]], %[[CMP]]
+; CHECK: %[[EDGEMASK_FORBODY:.+]] = select i1 %[[ENTRYMASK_FORCOND]], i1 %[[CMP]], i1 false
 ; CHECK: %[[NOT_CMP:.+]] = xor i1 %[[CMP]], true
-; CHECK: %[[EDGEMASK_FOREND14:.+]] = and i1 %[[ENTRYMASK_FORCOND]], %[[NOT_CMP]]
+; CHECK: %[[EDGEMASK_FOREND14:.+]] = select i1 %[[ENTRYMASK_FORCOND]], i1 %[[NOT_CMP]], i1 false
 ; CHECK: %[[LOOPEXITMASK2]] = or i1 %[[EXITMASK1]], %[[EDGEMASK_FOREND14]]
 ; CHECK: br label %[[FORBODY:.+]]
 
@@ -187,9 +187,9 @@ declare i64 @__mux_get_global_id(i32)
 ; CHECK: %[[ENTRYMASK_FORCOND3:.+]] = phi i1 [ %[[EDGEMASK_FORBODY:.+]], %[[FORBODY]] ], [ %[[FORBODY6EXITMASK:.+]], %[[FORBODY6:.+]] ]
 ; CHECK: %[[PREVEXITMASK:.+]] = phi i1 [ false, %[[FORBODY]] ], [ %[[FORINC12LOOPEXITMASKUPDATE:.+]], %[[FORBODY6]] ]
 ; CHECK: %[[CMP4:.+]] = icmp slt i32 %[[STOREMERGE]], 24
-; CHECK: %[[EDGEMASK_FORBODY6:.+]] = and i1 %[[ENTRYMASK_FORCOND3]], %[[CMP4]]
+; CHECK: %[[EDGEMASK_FORBODY6:.+]] = select i1 %[[ENTRYMASK_FORCOND3]], i1 %[[CMP4]], i1 false
 ; CHECK: %[[NOT_CMP4:.+]] = xor i1 %[[CMP4]], true
-; CHECK: %[[EDGEMASK_FORINC12:.+]] = and i1 %[[ENTRYMASK_FORCOND3]], %[[NOT_CMP4]]
+; CHECK: %[[EDGEMASK_FORINC12:.+]] = select i1 %[[ENTRYMASK_FORCOND3]], i1 %[[NOT_CMP4]], i1 false
 ; CHECK: %[[FORINC12LOOPEXITMASKUPDATE]] = or i1 %[[PREVEXITMASK]], %[[EDGEMASK_FORINC12]]
 ; CHECK: br label %[[FORBODY6:.+]]
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_order_z.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_order_z.ll
index d8a82cce6d422..e6c92b8290d92 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_order_z.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_order_z.ll
@@ -174,9 +174,9 @@ declare i64 @__mux_get_global_id(i32)
 ; CHECK: %[[ENTRYMASK_FORCOND:.+]] = phi i1 [ true, %entry ], [ %[[FORINC12EXITMASK3:.+]], %[[FORINC12:.+]] ]
 ; CHECK: %[[EXITMASK1:.+]] = phi i1 [ false, %entry ], [ %[[LOOPEXITMASK2:.+]], %[[FORINC12]] ]
 ; CHECK: %[[CMP:.+]] = icmp slt i32 %[[STOREMERGE:.+]], 16
-; CHECK: %[[EDGEMASK_FORBODY:.+]] = and i1 %[[ENTRYMASK_FORCOND]], %[[CMP]]
+; CHECK: %[[EDGEMASK_FORBODY:.+]] = select i1 %[[ENTRYMASK_FORCOND]], i1 %[[CMP]], i1 false
 ; CHECK: %[[NOT_CMP:.+]] = xor i1 %[[CMP]], true
-; CHECK: %[[EDGEMASK_FOREND14:.+]] = and i1 %[[ENTRYMASK_FORCOND]], %[[NOT_CMP]]
+; CHECK: %[[EDGEMASK_FOREND14:.+]] = select i1 %[[ENTRYMASK_FORCOND]], i1 %[[NOT_CMP]], i1 false
 ; CHECK: %[[LOOPEXITMASK2]] = or i1 %[[EXITMASK1]], %[[EDGEMASK_FOREND14]]
 ; CHECK: br label %[[FORBODY:.+]]
 
@@ -187,9 +187,9 @@ declare i64 @__mux_get_global_id(i32)
 ; CHECK: %[[ENTRYMASK_FORCOND3:.+]] = phi i1 [ %[[EDGEMASK_FORBODY:.+]], %[[FORBODY]] ], [ %[[FORBODY6EXITMASK:.+]], %[[FORBODY6:.+]] ]
 ; CHECK: %[[PREVEXITMASK:.+]] = phi i1 [ false, %[[FORBODY]] ], [ %[[FORINC12LOOPEXITMASKUPDATE:.+]], %[[FORBODY6]] ]
 ; CHECK: %[[CMP4:.+]] = icmp slt i32 %[[STOREMERGE]], 24
-; CHECK: %[[EDGEMASK_FORBODY6:.+]] = and i1 %[[ENTRYMASK_FORCOND3]], %[[CMP4]]
+; CHECK: %[[EDGEMASK_FORBODY6:.+]] = select i1 %[[ENTRYMASK_FORCOND3]], i1 %[[CMP4]], i1 false
 ; CHECK: %[[NOT_CMP4:.+]] = xor i1 %[[CMP4]], true
-; CHECK: %[[EDGEMASK_FORINC12:.+]] = and i1 %[[ENTRYMASK_FORCOND3]], %[[NOT_CMP4]]
+; CHECK: %[[EDGEMASK_FORINC12:.+]] = select i1 %[[ENTRYMASK_FORCOND3]], i1 %[[NOT_CMP4]], i1 false
 ; CHECK: %[[FORINC12LOOPEXITMASKUPDATE]] = or i1 %[[PREVEXITMASK]], %[[EDGEMASK_FORINC12]]
 ; CHECK: br label %[[FORBODY6:.+]]
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_varying_loop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_varying_loop.ll
index 7c2afc456be6d..77184596228ce 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_varying_loop.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_varying_loop.ll
@@ -167,9 +167,9 @@ declare i64 @__mux_get_global_id(i32)
 ; CHECK: %for.cond.entry_mask = phi i1 [ true, %entry ], [ %for.body.exit_mask, %for.body ]
 ; CHECK: %for.end.loop_exit_mask = phi i1 [ false, %entry ], [ %for.end.loop_exit_mask.update, %for.body ]
 ; CHECK: %cmp = icmp slt i32 %storemerge, 16
-; CHECK: %for.body.exit_mask = and i1 %for.cond.entry_mask, %cmp
+; CHECK: %for.body.exit_mask = select i1 %for.cond.entry_mask, i1 %cmp, i1 false
 ; CHECK: %cmp.not = xor i1 %cmp, true
-; CHECK: %for.end.exit_mask = and i1 %for.cond.entry_mask, %cmp.not
+; CHECK: %for.end.exit_mask = select i1 %for.cond.entry_mask, i1 %cmp.not, i1 false
 ; CHECK: %for.end.loop_exit_mask.update = or i1 %for.end.loop_exit_mask, %for.end.exit_mask
 ; CHECK: br label %for.body
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/divergent_loop_bug.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/divergent_loop_bug.ll
index 8160714150664..cca59611013d3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/divergent_loop_bug.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/divergent_loop_bug.ll
@@ -54,7 +54,7 @@ entry.if.end17_crit_edge:                          ; preds = %entry
 ; CHECK: %cmp11 = fcmp uge float %mul7, 0.000000e+00
 ; CHECK: %cmp14 = fcmp ult float %mul7, 6.400000e+01
 ; CHECK: %or.cond = and i1 %cmp11, %cmp14
-; CHECK: %or.cond_active = and i1 %or.cond, [[CMP_NOT_NOT]]
+; CHECK: %or.cond_active = select i1 [[CMP_NOT_NOT]], i1 %or.cond, i1 false
 ; CHECK: %or.cond_active_any = call i1 @__vecz_b_divergence_any(i1 %or.cond_active)
 ; CHECK: br i1 %or.cond_active_any, label %if.then.if.end_crit_edge, label %if.then16
 if.then:                                           ; preds = %entry
@@ -119,7 +119,7 @@ entry.if.end17_crit_edge:                          ; preds = %entry
 ; CHECK: %cmp11 = fcmp uge float %mul7, 0.000000e+00
 ; CHECK: %cmp14 = fcmp ult float %mul7, 6.400000e+01
 ; CHECK: %or.cond = and i1 %cmp11, %cmp14
-; CHECK: %or.cond_active = and i1 %or.cond, [[CMP_NOT_NOT]]
+; CHECK: %or.cond_active = select i1 [[CMP_NOT_NOT]], i1 %or.cond, i1 false
 ; CHECK: %or.cond_active_any = call i1 @__vecz_b_divergence_any(i1 %or.cond_active)
 ; CHECK: br i1 %or.cond_active_any, label %if.else.crit_edge, label %if.then16
 if.then:                                           ; preds = %entry
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization13.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization13.ll
index 098fd2a80f5ea..e044bef6c2a43 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization13.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization13.ll
@@ -200,7 +200,7 @@ attributes #2 = { nobuiltin nounwind readonly }
 ; CHECK: %[[TRUNC:.+]] = icmp eq i64 %[[TMP]], 0
 ; FIXME: We shouldn't need to mask this comparison, as it's truly uniform even
 ; on inactive lanes.
-; CHECK: %[[TRUNC_ACTIVE:.+]] = and i1 %[[TRUNC]], {{%.*}}
+; CHECK: %[[TRUNC_ACTIVE:.+]] = select i1 {{%.*}}, i1 %[[TRUNC]], i1 false
 ; CHECK: %[[TRUNC_ACTIVE_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[TRUNC_ACTIVE]])
 ; CHECK: br i1 %[[TRUNC_ACTIVE_ANY]], label %[[SWBB8:.+]], label %[[SWBB:.+]]
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_phi.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_phi.ll
index 5fe0c0296a512..0476b1e40d4e2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_phi.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_phi.ll
@@ -100,7 +100,7 @@ declare void @llvm.memset.p0i8.i32(i8*,i8,i32,i32,i1)
 ; Check if the operations that use integer types are vectorized
 ; CHECK: zext <4 x i32>
 ; CHECK: icmp ugt <4 x i64>
-; CHECK: and <4 x i1>
+; CHECK: select <4 x i1>
 ; CHECK: %[[L423:.+]] = call <4 x i32> @__vecz_b_masked_load4_Dv4_ju3ptrDv4_b(ptr %{{.*}}, <4 x i1>
 ; CHECK: call void @__vecz_b_masked_store4_Dv4_ju3ptrDv4_b(<4 x i32> %[[L423]], ptr{{( nonnull)? %.*}}, <4 x i1>
 

From fc3e3f27a0569a729b3427c660168adf17c3c7ec Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Fri, 29 Aug 2025 13:24:33 +0100
Subject: [PATCH 175/182] More poison/undef updates.

Further updates to account for the deprecation of undef: this updates
documentation and variable names in line with the previous update to use
poison more. It updates tests to no longer use undef where we would no
longer generate undef. It restores one use of UndefValue::get where the
replacement with PoisonValue::get was wrong: SPIR-V's OpUndef does not
allow treating it as poison.
---
 .../source/pass_functions.cpp                 | 13 ++++++-----
 .../vecz/source/include/llvm_helpers.h        |  2 +-
 .../transform/packetization_helpers.cpp       | 18 +++++++--------
 .../vecz/source/transform/packetizer.cpp      | 10 ++++-----
 .../vecz/source/vector_target_info.cpp        |  6 ++---
 .../vecz/source/vectorization_context.cpp     | 10 ++++-----
 .../vecz/test/lit/llvm/Boscc/boscc_killer.ll  |  2 +-
 .../vecz/test/lit/llvm/Boscc/boscc_merge2.ll  |  4 ++--
 .../define_interleaved_store.ll               |  6 ++---
 .../define_interleaved_store_as_masked.ll     |  6 ++---
 .../test/lit/llvm/RISCV/broadcast_vector.ll   | 20 ++++++++---------
 .../test/lit/llvm/RISCV/extract_element.ll    | 12 +++++-----
 .../test/lit/llvm/RISCV/insert_element.ll     |  2 +-
 .../lit/llvm/RISCV/select_scalar_vector.ll    |  4 ++--
 .../vecz/test/lit/llvm/RISCV/vp_memops.ll     |  2 +-
 .../llvm/ScalableVectors/broadcast_vector.ll  | 22 +++++++++----------
 .../define_interleaved_store.ll               |  6 ++---
 .../define_interleaved_store_as_masked.ll     |  2 +-
 .../ScalableVectors/define_masked_load.ll     |  2 +-
 .../llvm/ScalableVectors/extract_element.ll   |  8 +++----
 .../llvm/ScalableVectors/insert_element.ll    |  2 +-
 .../ScalableVectors/packetize_mask_varying.ll |  2 +-
 .../test/lit/llvm/ScalableVectors/select.ll   |  4 ++--
 .../ScalableVectors/select_scalar_vector.ll   |  4 ++--
 .../test/lit/llvm/ScalableVectors/shuffle.ll  |  4 ++--
 .../define_interleaved_load_store.ll          |  2 +-
 .../llvm/VectorPredication/load_add_store.ll  |  4 ++--
 .../test/lit/llvm/VectorPredication/udiv.ll   |  2 +-
 .../VectorWidening/define_interleaved_load.ll |  2 +-
 .../define_interleaved_load_as_masked.ll      |  2 +-
 .../llvm/VectorWidening/interleaved_safety.ll |  2 +-
 .../widen_fmin_vector_scalar.ll               |  2 +-
 .../test/lit/llvm/define_interleaved_load.ll  |  6 ++---
 .../llvm/define_interleaved_load_as_masked.ll |  6 ++---
 .../test/lit/llvm/define_interleaved_store.ll |  6 ++---
 .../define_interleaved_store_as_masked.ll     |  6 ++---
 .../test/lit/llvm/define_internal_builtins.ll |  2 +-
 .../vecz/test/lit/llvm/define_masked_load.ll  |  2 +-
 .../vecz/test/lit/llvm/expect_assume.ll       |  2 +-
 .../lit/llvm/insert_element_debug_info.ll     |  4 ++--
 .../test/lit/llvm/instantiate_constants.ll    |  2 +-
 .../vecz/test/lit/llvm/interleaved_safety.ll  |  2 +-
 .../vecz/test/lit/llvm/masked_interleaved.ll  |  4 ++--
 .../lit/llvm/masked_interleaved_as_scatter.ll |  4 ++--
 .../test/lit/llvm/packetization_branch.ll     |  8 +++----
 .../lit/llvm/packetization_uniform_branch.ll  |  4 ++--
 .../lit/llvm/packetize_uniform_conditional.ll |  2 +-
 .../packetize_uniform_default_conditional.ll  |  2 +-
 .../llvm/packetize_uniform_default_reduce.ll  |  2 +-
 .../packetize_uniform_loops_conditional.ll    |  2 +-
 .../llvm/packetize_uniform_loops_reduce.ll    |  4 ++--
 .../test/lit/llvm/packetize_uniform_reduce.ll |  4 ++--
 .../lit/llvm/{undef_ub.ll => poison_ub.ll}    |  6 ++---
 .../test/lit/llvm/predicate_with_switch.ll    |  2 +-
 .../llvm/scalar_splat_in_varying_branch.ll    |  4 ++--
 .../vecz/test/lit/llvm/scalarize-gather.ll    |  2 +-
 .../vecz/test/lit/llvm/scalarize-splat.ll     |  4 ++--
 .../vecz/test/lit/llvm/scan_fact.ll           |  2 +-
 .../vecz/test/lit/llvm/struct_phi.ll          | 10 ++++-----
 .../vecz/test/lit/llvm/subgroup_builtins.ll   |  4 ++--
 .../vecz/test/lit/llvm/subgroup_reductions.ll |  2 +-
 .../test/lit/llvm/uniform_reassociation1.ll   |  6 ++---
 .../test/lit/llvm/uniform_reassociation2.ll   |  4 ++--
 .../test/lit/llvm/uniform_reassociation3.ll   |  4 ++--
 64 files changed, 157 insertions(+), 156 deletions(-)
 rename llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/{undef_ub.ll => poison_ub.ll} (85%)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp
index ce84591aaf7af..e1beea751ca06 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp
@@ -204,24 +204,25 @@ void replaceConstantExpressionWithInstruction(llvm::Constant *const constant) {
       // InsertElement to place it in a new vector and the second is a
       // ShuffleVector to duplicate the value across the vector.
       auto numEls = constantVec->getNumOperands();
-      llvm::Value *undef = llvm::PoisonValue::get(
+      llvm::Value *poison = llvm::PoisonValue::get(
           llvm::FixedVectorType::get(splatVal->getType(), numEls));
       llvm::Type *i32Ty = llvm::Type::getInt32Ty(constant->getContext());
       auto insert = llvm::InsertElementInst::Create(
-          undef, splatVal, llvm::ConstantInt::get(i32Ty, 0));
+          poison, splatVal, llvm::ConstantInt::get(i32Ty, 0));
       insert->insertBefore(useFunc->getEntryBlock().getFirstNonPHIIt());
       llvm::Value *zeros = llvm::ConstantAggregateZero::get(
           llvm::FixedVectorType::get(i32Ty, numEls));
-      newInst = new llvm::ShuffleVectorInst(insert, undef, zeros);
+      newInst = new llvm::ShuffleVectorInst(insert, poison, zeros);
       newInst->insertAfter(insert);
     } else if (llvm::ConstantArray *constantArr =
                    llvm::dyn_cast<llvm::ConstantArray>(constant)) {
       auto numEls = constantArr->getNumOperands();
-      llvm::Value *undef = llvm::PoisonValue::get(constantArr->getType());
+      llvm::Value *poison = llvm::PoisonValue::get(constantArr->getType());
       llvm::Instruction *insertedIns = nullptr;
       for (unsigned int i = 0; i < numEls; i++) {
-        auto *insertNext = llvm::InsertValueInst::Create(
-            insertedIns ? insertedIns : undef, constantArr->getOperand(i), {i});
+        auto *insertNext =
+            llvm::InsertValueInst::Create(insertedIns ? insertedIns : poison,
+                                          constantArr->getOperand(i), {i});
         if (insertedIns) {
           insertNext->insertAfter(insertedIns);
         } else {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/llvm_helpers.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/llvm_helpers.h
index 434981ddb9abc..548a11eb5f3bd 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/llvm_helpers.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/llvm_helpers.h
@@ -40,7 +40,7 @@ llvm::FixedVectorType *getVectorType(llvm::Value *V);
 /// @param[in] T Type to get default value of.
 /// @param[in] V Default value to use for numeric type
 ///
-/// @return Default value, which will be undef for non-numeric types
+/// @return Default value, which will be poison for non-numeric types
 llvm::Value *getDefaultValue(llvm::Type *T, uint64_t V = 0UL);
 
 /// @brief Get the shuffle mask as sequence of integers.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
index c0e996593ebee..bac6b19789706 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
@@ -134,7 +134,7 @@ Value *createOptimalShuffle(IRBuilder<> &B, Value *srcA, Value *srcB,
   // If either operand is a unary shuffle, we can pull a few more tricks..
   // For instance:
   //
-  //    shuffle(shuffle(A, undef, maskA), shuffle(B, undef, maskB), maskC)
+  //    shuffle(shuffle(A, poison, maskA), shuffle(B, poison, maskB), maskC)
   // => shuffle(A, B, shuffle(maskA, adjust(maskB), maskC))
   // where "adjust" refers to adjusting the mask values to refer to the second
   // source vector by adding the width of the first operand to the indices.
@@ -229,9 +229,9 @@ bool createSubSplats(const vecz::TargetInfo &TI, IRBuilder<> &B,
     }
   }
 
-  auto *undef = PoisonValue::get(srcs.front()->getType());
+  auto *poison = PoisonValue::get(srcs.front()->getType());
   for (auto &src : srcs) {
-    src = createOptimalShuffle(B, src, undef, mask);
+    src = createOptimalShuffle(B, src, poison, mask);
   }
   return true;
 }
@@ -456,7 +456,7 @@ PacketRange Packetizer::Result::getAsPacket(unsigned width) const {
     assert(isa<FixedVectorType>(vecTy) && "Must be a fixed vector type here!");
     const unsigned scalarWidth = vecTy->getNumElements() / width;
     if (scalarWidth > 1 || scalar->getType()->isVectorTy()) {
-      auto *const undef = PoisonValue::get(vec->getType());
+      auto *const poison = PoisonValue::get(vec->getType());
 
       // Build shuffle mask to perform the subvector extracts.
       IRBuilder<> B(buildAfter(vec, packetizer.F));
@@ -465,7 +465,7 @@ PacketRange Packetizer::Result::getAsPacket(unsigned width) const {
         for (size_t j = 0; j < scalarWidth; ++j, ++k) {
           mask.push_back(k);
         }
-        packet[i] = createOptimalShuffle(B, vec, undef, mask,
+        packet[i] = createOptimalShuffle(B, vec, poison, mask,
                                          Twine(scalar->getName(), ".split"));
       }
     } else {
@@ -536,7 +536,7 @@ PacketRange Packetizer::Result::widen(unsigned width) const {
   auto *it = parts.begin();
   IRBuilder<> B(buildAfter(parts.back(), packetizer.F));
   if (newWidth > 1) {
-    auto *const undef = PoisonValue::get(vecTy);
+    auto *const poison = PoisonValue::get(vecTy);
 
     // Build shuffle mask to perform the subvector extracts.
     for (size_t i = 0, origIdx = 0; i < width; ++i) {
@@ -549,7 +549,7 @@ PacketRange Packetizer::Result::widen(unsigned width) const {
         mask.push_back(origIdx);
       }
       packet[i] =
-          createOptimalShuffle(B, *it, undef, mask, Twine(name, ".split"));
+          createOptimalShuffle(B, *it, poison, mask, Twine(name, ".split"));
     }
   } else {
     for (size_t i = 0, origIdx = 0; i < width; ++i, ++origIdx) {
@@ -581,9 +581,9 @@ PacketRange Packetizer::Result::narrow(unsigned width) const {
     // Build vectors out of pairs of scalar values
     const auto name = scalar->getName();
     IRBuilder<> B(buildAfter(parts.back(), packetizer.F));
-    Value *undef = PoisonValue::get(FixedVectorType::get(ty, 2));
+    Value *poison = PoisonValue::get(FixedVectorType::get(ty, 2));
     for (size_t i = 0, pairIdx = 0; i < width; ++i, pairIdx += 2) {
-      Value *in = B.CreateInsertElement(undef, parts[pairIdx], B.getInt32(0),
+      Value *in = B.CreateInsertElement(poison, parts[pairIdx], B.getInt32(0),
                                         Twine(name, ".gather"));
       packet[i] = B.CreateInsertElement(in, parts[pairIdx + 1], B.getInt32(1),
                                         Twine(name, ".gather"));
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
index ec7bae09a73ec..7a5118d7643e8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -806,7 +806,7 @@ bool Packetizer::Impl::packetize() {
 void Packetizer::Impl::onFailure() {
   // On failure, clean up pending Phis, which may still be invalid in that they
   // have no incoming operands. For simplicity, just erase and replace all of
-  // them with undef: the failed vectorized function will be removed anyway.
+  // them with poison: the failed vectorized function will be removed anyway.
   for (auto *Phi : pendingPhis) {
     auto &info = packets[Phi];
     assert(info.numInstances > 0 && "A PHI pending packetization has no stub");
@@ -2624,10 +2624,10 @@ ValuePacket Packetizer::Impl::packetizeMemOp(MemOp &op) {
       auto *const newPtrTy = FixedVectorType::get(ptrTy, simdWidth);
 
       auto *const idxVector = ConstantVector::get(indices);
-      auto *const undef = PoisonValue::get(newPtrTy);
+      auto *const poison = PoisonValue::get(newPtrTy);
       for (auto &vecPtr : ptrPacket) {
         vecPtr = B.CreateBitCast(vecPtr, newPtrTy);
-        vecPtr = B.CreateShuffleVector(vecPtr, undef, widenMask);
+        vecPtr = B.CreateShuffleVector(vecPtr, poison, widenMask);
         vecPtr = B.CreateInBoundsGEP(scalarTy, vecPtr, idxVector);
       }
     }
@@ -2792,9 +2792,9 @@ ValuePacket Packetizer::Impl::packetizeMemOp(MemOp &op) {
           }
         }
 
-        auto *const undef = PoisonValue::get(maskPacket.front()->getType());
+        auto *const poison = PoisonValue::get(maskPacket.front()->getType());
         for (auto &vecMask : maskPacket) {
-          vecMask = createOptimalShuffle(B, vecMask, undef, widenMask);
+          vecMask = createOptimalShuffle(B, vecMask, poison, widenMask);
         }
       }
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
index c26d30fe90cb8..e019994cb71ff 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
@@ -994,7 +994,7 @@ llvm::Value *TargetInfo::createVectorSlideUp(llvm::IRBuilder<> &B,
   assert(srcTy &&
          "TargetInfo::createVectorShuffle: source must have vector type");
 
-  auto *const undef = PoisonValue::get(srcTy);
+  auto *const poison = PoisonValue::get(srcTy);
   const auto EC = srcTy->getElementCount();
   if (!EC.isScalable()) {
     // Special case for fixed-width vectors
@@ -1007,11 +1007,11 @@ llvm::Value *TargetInfo::createVectorSlideUp(llvm::IRBuilder<> &B,
     }
 
     auto *const rotate =
-        createOptimalShuffle(B, src, undef, mask, Twine("slide_up"));
+        createOptimalShuffle(B, src, poison, mask, Twine("slide_up"));
     return B.CreateInsertElement(rotate, insert, B.getInt64(0), "slide_in");
   }
 
-  auto *const rotate = B.CreateVectorSplice(undef, src, -1, "slide_up");
+  auto *const rotate = B.CreateVectorSplice(poison, src, -1, "slide_up");
   return B.CreateInsertElement(rotate, insert, B.getInt64(0), "slide_in");
 }
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
index 39939e7385782..57c25642c6c8e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
@@ -811,15 +811,15 @@ bool VectorizationContext::emitMaskedScatterGatherMemOpBody(
 //   Iteration 0:
 //     %e.0 = extractelement %v, 0          (A)
 //     %s.0 = add N, %e.0                   (A)
-//     %v.0 = insertelement undef, %s.0, 0  (<A,U,U,U>)
+//     %v.0 = insertelement poison, %s.0, 0 (<A,P,P,P>)
 //   Iteration 1:
 //     %e.1 = extractelement %v, 1          (B)
 //     %s.1 = add %s.0, %e.1                (A+B)
-//     %v.1 = insertelement  %v.0, %s.1, 1  (<A,A+B,U,U>)
+//     %v.1 = insertelement  %v.0, %s.1, 1  (<A,A+B,P,P>)
 //   Iteration 2:
 //     %e.2 = extractelement %v, 2          (C)
 //     %s.2 = add %s.1, %e.2                (A+B+C)
-//     %v.2 = insertelement  %v.1, %s.2, 2  (<A,A+B,A+B+C,U>)
+//     %v.2 = insertelement  %v.1, %s.2, 2  (<A,A+B,A+B+C,P>)
 //   Iteration 3:
 //     %e.3 = extractelement %v, 3          (D)
 //     %s.3 = add %s.2, %e.2                (A+B+C+D)
@@ -830,11 +830,11 @@ bool VectorizationContext::emitMaskedScatterGatherMemOpBody(
 // Exclusive scans operate by pre-filling the vector with the neutral value,
 // looping from 1 onwards, and extracting from one less than the current
 // iteration:
-//   %z = insertelement undef, N, 0
+//   %z = insertelement poison, N, 0
 //   Iteration 0:
 //     %e.0 = extractelement %v, 0          (A)
 //     %s.0 = add N, %e.0                   (A)
-//     %v.0 = insertelement %z, %s.0, 1     (<N,A,U,U>)
+//     %v.0 = insertelement %z, %s.0, 1     (<N,A,P,P>)
 // This loop operates up to the VL input, if it is a vector-predicated scan.
 // Elements past the vector length will receive a default zero value.
 // Note: This method is not optimal for fixed-length code, but serves as a way
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_killer.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_killer.ll
index 0132338fac6af..b6b34d30c45fc 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_killer.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_killer.ll
@@ -24,7 +24,7 @@ target triple = "spir64-unknown-unknown"
 declare i64 @__mux_get_local_id(i32)
 declare i64 @__mux_get_local_size(i32)
 
-@boscc_killer.shared = internal unnamed_addr addrspace(3) global i32 undef, align 4
+@boscc_killer.shared = internal unnamed_addr addrspace(3) global i32 poison, align 4
 
 ; Function Attrs: convergent nounwind
 define spir_kernel void @boscc_killer(float addrspace(1)* %A, float addrspace(1)* %B, i32 %N, i32 %lda) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge2.ll
index c0cc77227d2c9..bdaf96b9903fe 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge2.ll
@@ -27,8 +27,8 @@ declare spir_func float @_Z3maxff(float, float) #1
 declare i64 @__mux_get_local_id(i32) #1
 declare i64 @__mux_get_group_id(i32) #1
 
-@fuse_conv2d_broadcast_add_relu_1_kernel0.pad_temp_shared = internal addrspace(3) global [640 x float] undef, align 4
-@fuse_conv2d_broadcast_add_relu_1_kernel0.input1_shared = internal addrspace(3) global [1152 x float] undef, align 4
+@fuse_conv2d_broadcast_add_relu_1_kernel0.pad_temp_shared = internal addrspace(3) global [640 x float] poison, align 4
+@fuse_conv2d_broadcast_add_relu_1_kernel0.input1_shared = internal addrspace(3) global [1152 x float] poison, align 4
 
 ; Function Attrs: convergent nounwind
 define spir_kernel void @boscc_merge2(float addrspace(1)* noalias %input0, float addrspace(1)* noalias %input1, float addrspace(1)* noalias %tensor, float addrspace(1)* noalias %input2) #2 {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store.ll
index 637866abbc7ed..a7d72cd259d0d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store.ll
@@ -30,7 +30,7 @@ entry:
   call void @__mux_work_group_barrier(i32 0, i32 2, i32 528) #3
   store double 1.600000e+01, double addrspace(1)* %.cast, align 8
   %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
-  %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
+  %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 poison>
   %vecins7 = shufflevector <4 x double> %vecins5, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call
   %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
@@ -73,8 +73,8 @@ attributes #3 = { nobuiltin nounwind }
 ; Test if the interleaved store is defined correctly
 ; CHECK: define void @__vecz_b_interleaved_store8_4_Dv4_du3ptrU3AS1(<4 x double>{{( %0)?}}, ptr addrspace(1){{( %1)?}})
 ; CHECK: entry:
-; CHECK: %BroadcastAddr.splatinsert = insertelement <4 x ptr addrspace(1)> {{poison|undef}}, ptr addrspace(1) %1, {{i32|i64}} 0
-; CHECK: %BroadcastAddr.splat = shufflevector <4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <4 x ptr addrspace(1)> {{poison|undef}}, <4 x i32> zeroinitializer
+; CHECK: %BroadcastAddr.splatinsert = insertelement <4 x ptr addrspace(1)> poison, ptr addrspace(1) %1, {{i32|i64}} 0
+; CHECK: %BroadcastAddr.splat = shufflevector <4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <4 x ptr addrspace(1)> poison, <4 x i32> zeroinitializer
 ; CHECK: %2 = getelementptr double, <4 x ptr addrspace(1)> %BroadcastAddr.splat, <4 x i64> <i64 0, i64 4, i64 8, i64 12>
 ; CHECK: call void @llvm.masked.scatter.v4f64.v4p1(<4 x double> %0, <4 x ptr addrspace(1)> %2, i32{{( immarg)?}} 8, <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}) #[[ATTRS:[0-9]+]]
 ; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store_as_masked.ll
index e6eaf8579bad4..1f70bde790233 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store_as_masked.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store_as_masked.ll
@@ -30,7 +30,7 @@ entry:
   call void @__mux_work_group_barrier(i32 0, i32 2, i32 528) #3
   store double 1.600000e+01, double addrspace(1)* %.cast, align 8
   %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
-  %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
+  %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 poison>
   %vecins7 = shufflevector <4 x double> %vecins5, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call
   %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
@@ -73,8 +73,8 @@ attributes #3 = { nobuiltin nounwind }
 ; Test if the interleaved store is defined correctly
 ; CHECK: define void @__vecz_b_interleaved_store8_4_Dv4_du3ptrU3AS1(<4 x double>{{( %0)?}}, ptr addrspace(1){{( %1)?}})
 ; CHECK: entry:
-; CHECK: %BroadcastAddr.splatinsert = insertelement <4 x ptr addrspace(1)> {{poison|undef}}, ptr addrspace(1) %1, {{i32|i64}} 0
-; CHECK:  %BroadcastAddr.splat = shufflevector <4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <4 x ptr addrspace(1)> {{poison|undef}}, <4 x i32> zeroinitializer
+; CHECK: %BroadcastAddr.splatinsert = insertelement <4 x ptr addrspace(1)> poison, ptr addrspace(1) %1, {{i32|i64}} 0
+; CHECK:  %BroadcastAddr.splat = shufflevector <4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <4 x ptr addrspace(1)> poison, <4 x i32> zeroinitializer
 ; CHECK:  %2 = getelementptr double, <4 x ptr addrspace(1)> %BroadcastAddr.splat, <4 x i64> <i64 0, i64 4, i64 8, i64 12>
 ; CHECK:  call void @llvm.masked.scatter.v4f64.v4p1(<4 x double> %0, <4 x ptr addrspace(1)> %2, i32{{( immarg)?}} 8, <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}) #[[ATTRS:[0-9]+]]
 ; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/broadcast_vector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/broadcast_vector.ll
index d8d02153048e0..278931266fd77 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/broadcast_vector.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/broadcast_vector.ll
@@ -114,14 +114,14 @@ entry:
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0)
 ; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]]
-; CHECK-NEXT:    store <vscale x 16 x float> shufflevector (<vscale x 16 x float> insertelement (<vscale x 16 x float> {{(undef|poison)}}, float 0x7FF0000020000000, {{i32|i64}} 0), <vscale x 16 x float> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer), ptr addrspace(1) [[ARRAYIDX3]], align 16
+; CHECK-NEXT:    store <vscale x 16 x float> shufflevector (<vscale x 16 x float> insertelement (<vscale x 16 x float> poison, float 0x7FF0000020000000, {{i32|i64}} 0), <vscale x 16 x float> poison, <vscale x 16 x i32> zeroinitializer), ptr addrspace(1) [[ARRAYIDX3]], align 16
 ; CHECK-NEXT:    ret void
 ;
 ; CHECK-LABEL: @__vecz_nxv4_vector_broadcast(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:  [[VS2:%.*]] = call <vscale x 16 x float> @llvm.{{(experimental.)?}}vector.insert.nxv16f32.v4f32(<vscale x 16 x float> poison, <4 x float> [[ADDEND:%.*]], i64 0)
 ; CHECK-NEXT:  [[IDX0:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
-; CHECK-NEXT:  [[VS1:%.*]] = and <vscale x 16 x i32> [[IDX0]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 3, {{i32|i64}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:  [[VS1:%.*]] = and <vscale x 16 x i32> [[IDX0]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 3, {{i32|i64}} 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
 ; CHECK-NEXT:  [[XLEN:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:  [[TMP0:%.*]] = shl i64 [[XLEN]], 4
 ; CHECK-NEXT:  [[TMP1:%.*]] = call <vscale x 16 x float> @llvm.riscv.vrgather.vv.nxv16f32.i64(<vscale x 16 x float> poison, <vscale x 16 x float> [[VS2]], <vscale x 16 x i32> [[VS1]], i64 [[TMP0]])
@@ -138,7 +138,7 @@ entry:
 ; CHECK-NEXT:    [[FIXLEN_ALLOC:%.*]] = alloca <32 x float>, align 128
 ; CHECK-NEXT:    store <32 x float> [[ADDEND:%.*]], ptr [[FIXLEN_ALLOC]], align 128
 ; CHECK-NEXT:    [[IDX0:%.*]] = call <vscale x 128 x i32> @llvm.experimental.stepvector.nxv128i32()
-; CHECK-NEXT:    [[IDX1:%.*]] = and <vscale x 128 x i32> [[IDX0]], shufflevector (<vscale x 128 x i32> insertelement (<vscale x 128 x i32> {{(undef|poison)}}, i32 31, {{i32|i64}} 0), <vscale x 128 x i32> {{(undef|poison)}}, <vscale x 128 x i32> zeroinitializer)
+; CHECK-NEXT:    [[IDX1:%.*]] = and <vscale x 128 x i32> [[IDX0]], shufflevector (<vscale x 128 x i32> insertelement (<vscale x 128 x i32> poison, i32 31, {{i32|i64}} 0), <vscale x 128 x i32> poison, <vscale x 128 x i32> zeroinitializer)
 ; CHECK-NEXT:    [[TMP0:%.*]] = {{s|z}}ext{{( nneg)?}} <vscale x 128 x i32> [[IDX1]] to <vscale x 128 x i64>
 ; CHECK-NEXT:    [[VEC_ALLOC:%.*]] = getelementptr inbounds float, ptr [[FIXLEN_ALLOC]], <vscale x 128 x i64> [[TMP0]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 128 x float> @llvm.masked.gather.nxv128f32.nxv128p0(<vscale x 128 x ptr> [[VEC_ALLOC]], i32 4, <vscale x 128 x i1> shufflevector (<vscale x 128 x i1> insertelement (<vscale x 128 x i1> poison, i1 true, {{i32|i64}} 0), <vscale x 128 x i1> poison, <vscale x 128 x i32> zeroinitializer), <vscale x 128 x float> poison)
@@ -155,13 +155,13 @@ entry:
 ; CHECK-NEXT:    [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0)
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 16 x i32>, ptr addrspace(1) [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[AND1_I_I_I1_I1:%.*]] = and <vscale x 16 x i32> [[TMP1]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 2139095040, {{i32|i64}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
-; CHECK-NEXT:    [[CMP_I_I_I2_I2:%.*]] = icmp ne <vscale x 16 x i32> [[AND1_I_I_I1_I1]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 2139095040, {{i32|i64}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
-; CHECK-NEXT:    [[AND2_I_I_I3_I3:%.*]] = and <vscale x 16 x i32> [[TMP1]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 8388607, {{i32|i64}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[AND1_I_I_I1_I1:%.*]] = and <vscale x 16 x i32> [[TMP1]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 2139095040, {{i32|i64}} 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[CMP_I_I_I2_I2:%.*]] = icmp ne <vscale x 16 x i32> [[AND1_I_I_I1_I1]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 2139095040, {{i32|i64}} 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[AND2_I_I_I3_I3:%.*]] = and <vscale x 16 x i32> [[TMP1]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 8388607, {{i32|i64}} 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
 ; CHECK-NEXT:    [[CMP3_I_I_I4_I4:%.*]] = icmp eq <vscale x 16 x i32> [[AND2_I_I_I3_I3]], zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = or <vscale x 16 x i1> [[CMP_I_I_I2_I2]], [[CMP3_I_I_I4_I4]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <vscale x 16 x i32> [[TMP1]] to <vscale x 16 x float>
-; CHECK-NEXT:    [[TMP4:%.*]] = select <vscale x 16 x i1> [[TMP2]], <vscale x 16 x float> [[TMP3]], <vscale x 16 x float> shufflevector (<vscale x 16 x float> insertelement (<vscale x 16 x float> {{(undef|poison)}}, float 0x7FF0000020000000, {{i32|i64}} 0), <vscale x 16 x float> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP4:%.*]] = select <vscale x 16 x i1> [[TMP2]], <vscale x 16 x float> [[TMP3]], <vscale x 16 x float> shufflevector (<vscale x 16 x float> insertelement (<vscale x 16 x float> poison, float 0x7FF0000020000000, {{i32|i64}} 0), <vscale x 16 x float> poison, <vscale x 16 x i32> zeroinitializer)
 ; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]]
 ; CHECK-NEXT:    store <vscale x 16 x float> [[TMP4]], ptr addrspace(1) [[ARRAYIDX3]], align 16
 ; CHECK-NEXT:    ret void
@@ -172,7 +172,7 @@ entry:
 ; CHECK-NEXT:  [[EXISTINGALLOC:%.*]] = alloca <4 x i32>, align 16
 ; CHECK-NEXT:  [[VS21:%.*]] = call <vscale x 16 x float> @llvm.{{(experimental.)?}}vector.insert.nxv16f32.v4f32(<vscale x 16 x float> poison, <4 x float> [[ADDEND:%.*]], i64 0)
 ; CHECK-NEXT:  [[IDX02:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
-; CHECK-NEXT:  [[VS13:%.*]] = and <vscale x 16 x i32> [[IDX02]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 3, {{i32|i64}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:  [[VS13:%.*]] = and <vscale x 16 x i32> [[IDX02]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 3, {{i32|i64}} 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
 ; CHECK-NEXT:  [[XLEN:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:  [[TMP0:%.*]] = shl i64 [[XLEN]], 4
 ; CHECK-NEXT:  [[TMP1:%.*]] = call <vscale x 16 x float> @llvm.riscv.vrgather.vv.nxv16f32.i64(<vscale x 16 x float> poison, <vscale x 16 x float> [[VS21]], <vscale x 16 x i32> [[VS13]], i64 [[TMP0]])
@@ -195,14 +195,14 @@ entry:
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[VS21:%.*]] = call <vscale x 16 x float> @llvm.{{(experimental.)?}}vector.insert.nxv16f32.v4f32(<vscale x 16 x float> poison, <4 x float> [[WOOF:%.*]], i64 0)
 ; CHECK-NEXT:    [[IDX02:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
-; CHECK-NEXT:    [[VS13:%.*]] = and <vscale x 16 x i32> [[IDX02]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 3, {{i32|i64}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[VS13:%.*]] = and <vscale x 16 x i32> [[IDX02]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 3, {{i32|i64}} 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
 ; CHECK-NEXT:    [[XLEN4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP0:%.*]] = shl i64 [[XLEN4]], 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 16 x float> @llvm.riscv.vrgather.vv.nxv16f32.i64(<vscale x 16 x float> poison, <vscale x 16 x float> [[VS21]], <vscale x 16 x i32> [[VS13]], i64 [[TMP0]])
 ; CHECK-NEXT:    [[TMP2:%.*]] = sext <4 x i1> [[INPUT:%.*]] to <4 x i8>
 ; CHECK-NEXT:    [[VS2:%.*]] = call <vscale x 16 x i8> @llvm.{{(experimental.)?}}vector.insert.nxv16i8.v4i8(<vscale x 16 x i8> poison, <4 x i8> [[TMP2]], i64 0)
 ; CHECK-NEXT:    [[IDX0:%.*]] = call <vscale x 16 x i16> @llvm.experimental.stepvector.nxv16i16()
-; CHECK-NEXT:    [[VS1:%.*]] = and <vscale x 16 x i16> [[IDX0]], shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> {{(undef|poison)}}, i16 3, {{i32|i64}} 0), <vscale x 16 x i16> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[VS1:%.*]] = and <vscale x 16 x i16> [[IDX0]], shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> poison, i16 3, {{i32|i64}} 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer)
 ; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 16 x i8> @llvm.riscv.vrgatherei16.vv.nxv16i8.i64(<vscale x 16 x i8> poison, <vscale x 16 x i8> [[VS2]], <vscale x 16 x i16> [[VS1]], i64 [[TMP0]])
 ; CHECK: [[TMP4:%.*]] = trunc <vscale x 16 x i8> [[TMP3]] to <vscale x 16 x i1>
 ; CHECK: [[TMP5:%.*]] = fcmp oeq <vscale x 16 x float>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/extract_element.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/extract_element.ll
index 63ff3edfca734..0ef996867742b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/extract_element.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/extract_element.ll
@@ -111,7 +111,7 @@ entry:
 ; EE-NEXT:    [[SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[IDX:%.*]], {{(i32|i64)}} 0
 ; EE-NEXT:    [[SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; EE-NEXT:    [[IDX0:%.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
-; EE-NEXT:    [[IDXSCALE:%.*]] = shl <vscale x 4 x i32> [[IDX0]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> {{(undef|poison)}}, i32 2, {{(i32|i64)}} 0), <vscale x 4 x i32> {{(undef|poison)}}, <vscale x 4 x i32> zeroinitializer)
+; EE-NEXT:    [[IDXSCALE:%.*]] = shl <vscale x 4 x i32> [[IDX0]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, {{(i32|i64)}} 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
 ; EE-NEXT:    [[VS1:%.*]] = add <vscale x 4 x i32> [[IDXSCALE]], [[SPLAT]]
 ; EE-NEXT:    [[T3:%.*]] = call <vscale x 16 x i32> @llvm.{{(experimental.)?}}vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[VS1]], i64 0)
 ; EE-NEXT:    [[T4:%.*]] = call <vscale x 16 x float> @llvm.riscv.vrgather.vv.nxv16f32.i64(<vscale x 16 x float> poison, <vscale x 16 x float> [[T1:%.*]], <vscale x 16 x i32> [[T3]], i64 [[TMP2]])
@@ -132,9 +132,9 @@ entry:
 ; EE-UNI-VEC:         [[XLEN:%.*]] = call i64 @llvm.vscale.i64()
 ; EE-UNI-VEC:         [[T3:%.*]] = shl i64 [[XLEN]], 2
 ; EE-UNI-VEC-NEXT:    [[T:%.*]] = trunc <vscale x 4 x i64> [[T2:%.*]] to <vscale x 4 x i32>
-; EE-UNI-VEC-NEXT:    [[I1:%.*]] = and <vscale x 4 x i32> [[T]], trunc (<vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> {{(undef|poison)}}, i64 3, {{(i32|i64)}} 0), <vscale x 4 x i64> {{(undef|poison)}}, <vscale x 4 x i32> zeroinitializer) to <vscale x 4 x i32>)
+; EE-UNI-VEC-NEXT:    [[I1:%.*]] = and <vscale x 4 x i32> [[T]], trunc (<vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 3, {{(i32|i64)}} 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer) to <vscale x 4 x i32>)
 ; EE-UNI-VEC-NEXT:    [[IDX02:%.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
-; EE-UNI-VEC-NEXT:    [[IDXSCALE:%.*]] = shl <vscale x 4 x i32> [[IDX02]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> {{(undef|poison)}}, i32 2, {{(i32|i64)}} 0), <vscale x 4 x i32> {{(undef|poison)}}, <vscale x 4 x i32> zeroinitializer)
+; EE-UNI-VEC-NEXT:    [[IDXSCALE:%.*]] = shl <vscale x 4 x i32> [[IDX02]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, {{(i32|i64)}} 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
 
 ; LLVM 16 deduces add/or equivalence and uses `or` instead.
 ; EE-UNI-VEC-NEXT:    [[VS1:%.*]] = {{add|or}} <vscale x 4 x i32> [[IDXSCALE]], [[I1]]
@@ -147,7 +147,7 @@ entry:
 ; EE-INDICES:         [[XLEN:%.*]] = call i64 @llvm.vscale.i64()
 ; EE-INDICES-NEXT:    [[T4:%.*]] = shl i64 [[XLEN]], 2
 ; EE-INDICES-NEXT:    [[IDX0:%.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
-; EE-INDICES-NEXT:    [[IDXSCALE:%.*]] = shl <vscale x 4 x i32> [[IDX0]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> {{(undef|poison)}}, i32 2, {{(i32|i64)}} 0), <vscale x 4 x i32> {{(undef|poison)}}, <vscale x 4 x i32> zeroinitializer)
+; EE-INDICES-NEXT:    [[IDXSCALE:%.*]] = shl <vscale x 4 x i32> [[IDX0]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, {{(i32|i64)}} 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
 ; EE-INDICES-NEXT:    [[VS1:%.*]] = {{add|or}} <vscale x 4 x i32> [[IDXSCALE]], [[I1:%.*]]
 ; EE-INDICES-NEXT:    [[T5:%.*]] = call <vscale x 16 x i32> @llvm.{{(experimental.)?}}vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[VS1]], i64 0)
 ; EE-INDICES-NEXT:    [[T6:%.*]] = call <vscale x 16 x float> @llvm.riscv.vrgather.vv.nxv16f32.i64(<vscale x 16 x float> poison, <vscale x 16 x float> [[T3:%.*]], <vscale x 16 x i32> [[T5]], i64 [[T4]])
@@ -159,9 +159,9 @@ entry:
 ; EE-BOOL-NEXT:  [[XLEN:%.*]] = call i64 @llvm.vscale.i64()
 ; EE-BOOL-NEXT:  [[T7:%.*]] = shl i64 [[XLEN]], 2
 ; EE-BOOL-NEXT:  [[T8:%.*]] = trunc <vscale x 4 x i64> [[T0:%.*]] to <vscale x 4 x i16>
-; EE-BOOL-NEXT:  [[T9:%.*]] = and <vscale x 4 x i16> [[T8]], trunc (<vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> {{(undef|poison)}}, i64 3, {{(i32|i64)}} 0), <vscale x 4 x i64> {{(undef|poison)}}, <vscale x 4 x i32> zeroinitializer) to <vscale x 4 x i16>)
+; EE-BOOL-NEXT:  [[T9:%.*]] = and <vscale x 4 x i16> [[T8]], trunc (<vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 3, {{(i32|i64)}} 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer) to <vscale x 4 x i16>)
 ; EE-BOOL-NEXT:  [[T10:%.*]] = call <vscale x 4 x i16> @llvm.experimental.stepvector.nxv4i16()
-; EE-BOOL-NEXT:  [[T11:%.*]] = shl <vscale x 4 x i16> [[T10]], shufflevector (<vscale x 4 x i16> insertelement (<vscale x 4 x i16> {{(undef|poison)}}, i16 2, {{(i32|i64)}} 0), <vscale x 4 x i16> {{(undef|poison)}}, <vscale x 4 x i32> zeroinitializer)
+; EE-BOOL-NEXT:  [[T11:%.*]] = shl <vscale x 4 x i16> [[T10]], shufflevector (<vscale x 4 x i16> insertelement (<vscale x 4 x i16> poison, i16 2, {{(i32|i64)}} 0), <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer)
 ; EE-BOOL-NEXT:  [[VS1:%.*]] = {{add|or}} <vscale x 4 x i16> [[T11]], [[T9]]
 ; EE-BOOL-NEXT:  [[T12:%.*]] = call <vscale x 16 x i16> @llvm.{{(experimental.)?}}vector.insert.nxv16i16.nxv4i16(<vscale x 16 x i16> poison, <vscale x 4 x i16> [[VS1]], i64 0)
 ; EE-BOOL-NEXT:  [[T13:%.*]] = call <vscale x 16 x i8> @llvm.riscv.vrgatherei16.vv.nxv16i8.i64(<vscale x 16 x i8> poison, <vscale x 16 x i8> [[T6]], <vscale x 16 x i16> [[T12]], i64 [[T7]])
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/insert_element.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/insert_element.ll
index 2343a0d950e16..9fc9b4a0c2104 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/insert_element.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/insert_element.ll
@@ -121,7 +121,7 @@ entry:
 ; IE-INDICES-NEXT:    [[TMP5:%.*]] = shl i64 [[XLEN]], 4
 ; IE-INDICES-NEXT:    [[VS2:%.*]] = call <vscale x 16 x i32> @llvm.{{(experimental.)?}}vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> {{%.*}}, i64 0)
 ; IE-INDICES:         [[IDX0:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
-; IE-INDICES-NEXT:    [[IDX1:%.*]] = lshr <vscale x 16 x i32> [[IDX0]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> {{(undef|poison)}}, i32 2, {{(i32|i64)}} 0), <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer)
+; IE-INDICES-NEXT:    [[IDX1:%.*]] = lshr <vscale x 16 x i32> [[IDX0]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 2, {{(i32|i64)}} 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
 ; IE-INDICES-NEXT:    [[TMP9:%.*]] = call <vscale x 16 x i32> @llvm.riscv.vrgather.vv.nxv16i32.i64(<vscale x 16 x i32> poison, <vscale x 16 x i32> [[VS2:%.*]], <vscale x 16 x i32> [[IDX1]], i64 [[TMP5]])
 ; IE-INDICES-NEXT:    [[VS25:%.*]] = call <vscale x 16 x float> @llvm.{{(experimental.)?}}vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[FIDX2]], i64 0)
 ; IE-INDICES-NEXT:    [[INNER:%.*]] = and <vscale x 16 x i32> [[IDX0]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 3, {{i32|i64}} 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/select_scalar_vector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/select_scalar_vector.ll
index 055c64a47c627..85684cc5b7e2f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/select_scalar_vector.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/select_scalar_vector.ll
@@ -43,8 +43,8 @@ entry:
 ; CHECK: [[cmp1:%.*]] = icmp slt <vscale x 4 x i32>
 ; CHECK: [[sext:%.*]] = sext <vscale x 4 x i1> [[cmp1]] to <vscale x 4 x i8>
 ; CHECK: [[idx0:%.*]] = call <vscale x 8 x i16> @llvm.experimental.stepvector.nxv8i16()
-; CHECK: [[idx1:%.*]] = lshr <vscale x 8 x i16> [[idx0]], shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> {{(undef|poison)}}, i16 1, {{i32|i64}} 0), <vscale x 8 x i16> {{(undef|poison)}}, <vscale x 8 x i32> zeroinitializer)
+; CHECK: [[idx1:%.*]] = lshr <vscale x 8 x i16> [[idx0]], shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 1, {{i32|i64}} 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
 ; CHECK: [[gather:%.*]] = call <vscale x 8 x i8> @llvm.riscv.vrgatherei16.vv.nxv8i8.i64(<vscale x 8 x i8> poison, <vscale x 8 x i8> [[vs2:%.*]], <vscale x 8 x i16> [[vs1:%.*]], i64 [[xlen:%.*]])
 ; CHECK: [[cmp:%.*]] = trunc <vscale x 8 x i8> [[gather]] to <vscale x 8 x i1>
-; CHECK: [[sel:%.*]] = select <vscale x 8 x i1> [[cmp]], <vscale x 8 x i32> [[rhs]], <vscale x 8 x i32> shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> {{(undef|poison)}}, i32 4, {{i32|i64}} 0), <vscale x 8 x i32> {{(undef|poison)}}, <vscale x 8 x i32> zeroinitializer)
+; CHECK: [[sel:%.*]] = select <vscale x 8 x i1> [[cmp]], <vscale x 8 x i32> [[rhs]], <vscale x 8 x i32> shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 4, {{i32|i64}} 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
 ; CHECK: store <vscale x 8 x i32> [[sel]],
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_memops.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_memops.ll
index 239be0b82fc01..284a99c74ac78 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_memops.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_memops.ll
@@ -94,7 +94,7 @@ ret:
 ; CHECK-LOAD-16-NEXT: [[TMPSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[TMPSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; CHECK-LOAD-16-NEXT: [[TMP5:%.*]] = icmp ult <vscale x 16 x i32> [[TMP4]], [[TMPSPLAT]]
 ; CHECK-LOAD-16-NEXT: [[TMP6:%.*]] = select <vscale x 16 x i1> [[TMP1]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> zeroinitializer
-; CHECK-LOAD-16-NEXT: [[TMP7:%.*]] = call <vscale x 16 x i32> @llvm.masked.load.nxv16i32.p1(ptr addrspace(1) [[TMP0]], i32 4, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i32> {{undef|poison}})
+; CHECK-LOAD-16-NEXT: [[TMP7:%.*]] = call <vscale x 16 x i32> @llvm.masked.load.nxv16i32.p1(ptr addrspace(1) [[TMP0]], i32 4, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i32> poison)
 ; CHECK-LOAD-16-NEXT: ret <vscale x 16 x i32> [[TMP7]]
 
 declare i64 @__mux_get_global_id(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll
index 7de1df26153d4..aa4559aad057e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll
@@ -100,7 +100,7 @@ entry:
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0)
 ; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]]
-; CHECK-NEXT:    store <vscale x 16 x float> {{shufflevector \(<vscale x 16 x float> insertelement \(<vscale x 16 x float> (undef|poison), float 0x7FF8000020000000, (i32|i64) 0\), <vscale x 16 x float> (undef|poison), <vscale x 16 x i32> zeroinitializer\)|splat \(float 0x7FF8000020000000\)}}, ptr addrspace(1) [[ARRAYIDX3]], align 16
+; CHECK-NEXT:    store <vscale x 16 x float> {{shufflevector \(<vscale x 16 x float> insertelement \(<vscale x 16 x float> poison, float 0x7FF8000020000000, (i32|i64) 0\), <vscale x 16 x float> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(float 0x7FF8000020000000\)}}, ptr addrspace(1) [[ARRAYIDX3]], align 16
 ; CHECK-NEXT:    ret void
 
 ; CHECK-LABEL: @__vecz_nxv4_vector_broadcast(
@@ -108,7 +108,7 @@ entry:
 ; CHECK-NEXT:    [[FIXLEN_ALLOC:%.*]] = alloca <4 x float>, align 16
 ; CHECK-NEXT:    store <4 x float> [[ADDEND:%.*]], ptr [[FIXLEN_ALLOC]], align 16
 ; CHECK-NEXT:    [[IDX0:%.*]] = call <vscale x 16 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv16i32()
-; CHECK-NEXT:    [[IDX1:%.*]] = and <vscale x 16 x i32> [[IDX0]], {{shufflevector \(<vscale x 16 x i32> insertelement \(<vscale x 16 x i32> (undef|poison), i32 3, (i32|i64) 0\), <vscale x 16 x i32> (undef|poison), <vscale x 16 x i32> zeroinitializer\)|splat \(i32 3\)}}
+; CHECK-NEXT:    [[IDX1:%.*]] = and <vscale x 16 x i32> [[IDX0]], {{shufflevector \(<vscale x 16 x i32> insertelement \(<vscale x 16 x i32> poison, i32 3, (i32|i64) 0\), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(i32 3\)}}
 ; CHECK-NEXT:    [[TMP0:%.*]] = {{s|z}}ext{{( nneg)?}} <vscale x 16 x i32> [[IDX1]] to <vscale x 16 x i64>
 ; CHECK-NEXT:    [[VEC_ALLOC:%.*]] = getelementptr inbounds float, ptr [[FIXLEN_ALLOC]], <vscale x 16 x i64> [[TMP0]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 16 x float> @llvm.masked.gather.nxv16f32.nxv16p0(<vscale x 16 x ptr> [[VEC_ALLOC]], i32 4, <vscale x 16 x i1> {{shufflevector \(<vscale x 16 x i1> insertelement \(<vscale x 16 x i1> poison, i1 true, (i32|i64) 0\), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(i1 true\)}}, <vscale x 16 x float> poison)
@@ -125,13 +125,13 @@ entry:
 ; CHECK-NEXT:    [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0)
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 16 x i32>, ptr addrspace(1) [[ARRAYIDX]], align 16
-; CHECK-NEXT:    [[AND1_I_I_I1_I1:%.*]] = and <vscale x 16 x i32> [[TMP1]], {{shufflevector \(<vscale x 16 x i32> insertelement \(<vscale x 16 x i32> (undef|poison), i32 2139095040, (i32|i64) 0\), <vscale x 16 x i32> (undef|poison), <vscale x 16 x i32> zeroinitializer\)|splat \(i32 2139095040\)}}
-; CHECK-NEXT:    [[CMP_I_I_I2_I2:%.*]] = icmp ne <vscale x 16 x i32> [[AND1_I_I_I1_I1]], {{shufflevector \(<vscale x 16 x i32> insertelement \(<vscale x 16 x i32> (undef|poison), i32 2139095040, (i32|i64) 0\), <vscale x 16 x i32> (undef|poison), <vscale x 16 x i32> zeroinitializer\)|splat \(i32 2139095040\)}}
-; CHECK-NEXT:    [[AND2_I_I_I3_I3:%.*]] = and <vscale x 16 x i32> [[TMP1]], {{shufflevector \(<vscale x 16 x i32> insertelement \(<vscale x 16 x i32> (undef|poison), i32 8388607, (i32|i64) 0\), <vscale x 16 x i32> (undef|poison), <vscale x 16 x i32> zeroinitializer\)|splat \(i32 8388607\)}}
+; CHECK-NEXT:    [[AND1_I_I_I1_I1:%.*]] = and <vscale x 16 x i32> [[TMP1]], {{shufflevector \(<vscale x 16 x i32> insertelement \(<vscale x 16 x i32> poison, i32 2139095040, (i32|i64) 0\), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(i32 2139095040\)}}
+; CHECK-NEXT:    [[CMP_I_I_I2_I2:%.*]] = icmp ne <vscale x 16 x i32> [[AND1_I_I_I1_I1]], {{shufflevector \(<vscale x 16 x i32> insertelement \(<vscale x 16 x i32> poison, i32 2139095040, (i32|i64) 0\), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(i32 2139095040\)}}
+; CHECK-NEXT:    [[AND2_I_I_I3_I3:%.*]] = and <vscale x 16 x i32> [[TMP1]], {{shufflevector \(<vscale x 16 x i32> insertelement \(<vscale x 16 x i32> poison, i32 8388607, (i32|i64) 0\), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(i32 8388607\)}}
 ; CHECK-NEXT:    [[CMP3_I_I_I4_I4:%.*]] = icmp eq <vscale x 16 x i32> [[AND2_I_I_I3_I3]], zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = or <vscale x 16 x i1> [[CMP_I_I_I2_I2]], [[CMP3_I_I_I4_I4]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <vscale x 16 x i32> [[TMP1]] to <vscale x 16 x float>
-; CHECK-NEXT:    [[TMP4:%.*]] = select <vscale x 16 x i1> [[TMP2]], <vscale x 16 x float> [[TMP3]], <vscale x 16 x float> {{shufflevector \(<vscale x 16 x float> insertelement \(<vscale x 16 x float> (undef|poison), float 0x7FF0000020000000, (i32|i64) 0\), <vscale x 16 x float> (undef|poison), <vscale x 16 x i32> zeroinitializer\)|splat \(float 0x7FF0000020000000\)}}
+; CHECK-NEXT:    [[TMP4:%.*]] = select <vscale x 16 x i1> [[TMP2]], <vscale x 16 x float> [[TMP3]], <vscale x 16 x float> {{shufflevector \(<vscale x 16 x float> insertelement \(<vscale x 16 x float> poison, float 0x7FF0000020000000, (i32|i64) 0\), <vscale x 16 x float> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(float 0x7FF0000020000000\)}}
 ; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]]
 ; CHECK-NEXT:    store <vscale x 16 x float> [[TMP4]], ptr addrspace(1) [[ARRAYIDX3]], align 16
 ; CHECK-NEXT:    ret void
@@ -144,10 +144,10 @@ entry:
 ; CHECK-NEXT:    [[FIXLEN_ALLOC1:%.*]] = alloca <4 x float>, align 16
 ; CHECK-NEXT:    store <4 x float> [[ADDEND:%.*]], ptr [[FIXLEN_ALLOC1]], align 16
 ; CHECK-NEXT:    [[IDX03:%.*]] = call <vscale x 16 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv16i32()
-; CHECK-NEXT:    [[IDX14:%.*]] = and <vscale x 16 x i32> [[IDX03]], {{shufflevector \(<vscale x 16 x i32> insertelement \(<vscale x 16 x i32> (undef|poison), i32 3, (i32|i64) 0\), <vscale x 16 x i32> (undef|poison), <vscale x 16 x i32> zeroinitializer\)|splat \(i32 3\)}}
+; CHECK-NEXT:    [[IDX14:%.*]] = and <vscale x 16 x i32> [[IDX03]], {{shufflevector \(<vscale x 16 x i32> insertelement \(<vscale x 16 x i32> poison, i32 3, (i32|i64) 0\), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(i32 3\)}}
 ; CHECK-NEXT:    [[TMP0:%.*]] = {{s|z}}ext{{( nneg)?}} <vscale x 16 x i32> [[IDX14]] to <vscale x 16 x i64>
 ; CHECK-NEXT:    [[VEC_ALLOC5:%.*]] = getelementptr inbounds float, ptr [[FIXLEN_ALLOC1]], <vscale x 16 x i64> [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 16 x float> @llvm.masked.gather.nxv16f32.nxv16p0(<vscale x 16 x ptr> [[VEC_ALLOC5]], i32 4, <vscale x 16 x i1> {{shufflevector \(<vscale x 16 x i1> insertelement \(<vscale x 16 x i1> poison, i1 true, (i32|i64) 0\), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(i1 true\)}}, <vscale x 16 x float> {{(undef|poison)}})
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 16 x float> @llvm.masked.gather.nxv16f32.nxv16p0(<vscale x 16 x ptr> [[VEC_ALLOC5]], i32 4, <vscale x 16 x i1> {{shufflevector \(<vscale x 16 x i1> insertelement \(<vscale x 16 x i1> poison, i1 true, (i32|i64) 0\), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(i1 true\)}}, <vscale x 16 x float> poison)
 ; CHECK-NEXT:    [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0)
 ; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr [[EXISTING_ALLOC]], align 16
 ; CHECK-NEXT:    store i32 1, ptr [[EXISTING_ALLOC]], align
@@ -155,7 +155,7 @@ entry:
 ; CHECK-NEXT:    store <4 x i32> [[V]], ptr [[FIXLEN_ALLOC]], align 16
 ; CHECK-NEXT:    [[TMP2:%.*]] = {{s|z}}ext{{( nneg)?}} <vscale x 16 x i32> [[IDX14]] to <vscale x 16 x i64>
 ; CHECK-NEXT:    [[VEC_ALLOC:%.*]] = getelementptr inbounds i32, ptr [[FIXLEN_ALLOC]], <vscale x 16 x i64> [[TMP2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 16 x i32> @llvm.masked.gather.nxv16i32.nxv16p0(<vscale x 16 x ptr> [[VEC_ALLOC]], i32 4, <vscale x 16 x i1> {{shufflevector \(<vscale x 16 x i1> insertelement \(<vscale x 16 x i1> poison, i1 true, (i32|i64) 0\), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(i1 true\)}}, <vscale x 16 x i32> {{(undef|poison)}})
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 16 x i32> @llvm.masked.gather.nxv16i32.nxv16p0(<vscale x 16 x ptr> [[VEC_ALLOC]], i32 4, <vscale x 16 x i1> {{shufflevector \(<vscale x 16 x i1> insertelement \(<vscale x 16 x i1> poison, i1 true, (i32|i64) 0\), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(i1 true\)}}, <vscale x 16 x i32> poison)
 ; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr <4 x i32>, ptr addrspace(1) [[OUT2:%.*]], i64 [[CALL]]
 ; CHECK-NEXT:    store <vscale x 16 x i32> [[TMP3]], ptr addrspace(1) [[ARRAYIDX4]], align 16
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]]
@@ -169,11 +169,11 @@ entry:
 ; CHECK-NEXT:  entry:
 ; CHECK:    [[FIXLEN_MASK_ALLOC:%.*]] = alloca <4 x i8>, align 4
 ; CHECK:    [[IDX0:%.*]] = call <vscale x 16 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv16i32()
-; CHECK:    [[IDX1:%.*]] = and <vscale x 16 x i32> [[IDX0]], {{shufflevector \(<vscale x 16 x i32> insertelement \(<vscale x 16 x i32> (undef|poison), i32 3, (i32|i64) 0\), <vscale x 16 x i32> (undef|poison), <vscale x 16 x i32> zeroinitializer\)|splat \(i32 3\)}}
+; CHECK:    [[IDX1:%.*]] = and <vscale x 16 x i32> [[IDX0]], {{shufflevector \(<vscale x 16 x i32> insertelement \(<vscale x 16 x i32> poison, i32 3, (i32|i64) 0\), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(i32 3\)}}
 ; CHECK:    [[SEXT:%.*]] = sext <4 x i1> [[INPUT:%.*]] to <4 x i8>
 ; CHECK:    store <4 x i8> [[SEXT]], ptr [[FIXLEN_MASK_ALLOC]], align 4
 ; CHECK:    [[TMP0:%.*]] = {{s|z}}ext{{( nneg)?}} <vscale x 16 x i32> [[IDX1]] to <vscale x 16 x i64>
 ; CHECK:    [[VEC_ALLOC:%.*]] = getelementptr inbounds i8, ptr [[FIXLEN_MASK_ALLOC]], <vscale x 16 x i64> [[TMP0]]
-; CHECK:    [[TMP1:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[VEC_ALLOC]], i32 1, <vscale x 16 x i1> {{shufflevector \(<vscale x 16 x i1> insertelement \(<vscale x 16 x i1> poison, i1 true, (i32|i64) 0\), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(i1 true\)}}, <vscale x 16 x i8> {{(undef|poison)}})
+; CHECK:    [[TMP1:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[VEC_ALLOC]], i32 1, <vscale x 16 x i1> {{shufflevector \(<vscale x 16 x i1> insertelement \(<vscale x 16 x i1> poison, i1 true, (i32|i64) 0\), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(i1 true\)}}, <vscale x 16 x i8> poison)
 ; CHECK:    [[BMASK:%.*]] = trunc <vscale x 16 x i8> [[TMP1]] to <vscale x 16 x i1>
 ; CHECK:    {{.*}} = and <vscale x 16 x i1> {{.*}}, [[BMASK]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store.ll
index 0a5a36cc138dc..38182a90ba1d4 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store.ll
@@ -29,7 +29,7 @@ entry:
   call void @__mux_work_group_barrier(i32 0, i32 2, i32 528)
   store double 1.600000e+01, double addrspace(1)* %.cast, align 8
   %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
-  %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
+  %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 poison>
   %vecins7 = shufflevector <4 x double> %vecins5, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call
   %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
@@ -56,8 +56,8 @@ declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double
 ; Test if the interleaved store is defined correctly
 ; CHECK: define void @__vecz_b_interleaved_store8_4_u5nxv4du3ptrU3AS1(<vscale x 4 x double>{{( %0)?}}, ptr addrspace(1){{( %1)?}})
 ; CHECK: entry:
-; CHECK: %BroadcastAddr.splatinsert = insertelement <vscale x 4 x ptr addrspace(1)> {{poison|undef}}, ptr addrspace(1) %1, {{i32|i64}} 0
-; CHECK: %BroadcastAddr.splat = shufflevector <vscale x 4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <vscale x 4 x ptr addrspace(1)> {{poison|undef}}, <vscale x 4 x i32> zeroinitializer
+; CHECK: %BroadcastAddr.splatinsert = insertelement <vscale x 4 x ptr addrspace(1)> poison, ptr addrspace(1) %1, {{i32|i64}} 0
+; CHECK: %BroadcastAddr.splat = shufflevector <vscale x 4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <vscale x 4 x ptr addrspace(1)> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK: %2 = call <vscale x 4 x i64> @llvm.{{(experimental\.)?}}stepvector.nxv4i64()
 ; CHECK: %3 = mul <vscale x 4 x i64> {{shufflevector \(<vscale x 4 x i64> insertelement \(<vscale x 4 x i64> poison, i64 4, (i32|i64) 0\), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer\)|splat \(i64 4\)}}, %2
 ; CHECK: %4 = getelementptr double, <vscale x 4 x ptr addrspace(1)> %BroadcastAddr.splat, <vscale x 4 x i64> %3
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store_as_masked.ll
index f6e841d4fc987..cbc0ece32d65c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store_as_masked.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store_as_masked.ll
@@ -29,7 +29,7 @@ entry:
   call void @__mux_work_group_barrier(i32 0, i32 2, i32 528)
   store double 1.600000e+01, double addrspace(1)* %.cast, align 8
   %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
-  %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
+  %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 poison>
   %vecins7 = shufflevector <4 x double> %vecins5, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call
   %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_load.ll
index 39de104569875..da2bfc7651c93 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_load.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_load.ll
@@ -66,5 +66,5 @@ declare i64 @__mux_get_group_id(i32)
 ; Test if the masked load is defined correctly
 ; CHECK: define <vscale x 4 x i32> @__vecz_b_masked_load4_u5nxv4ju3ptrU3AS2u5nxv4b(ptr addrspace(2){{( %0)?}}, <vscale x 4 x i1>{{( %1)?}})
 ; CHECK: entry:
-; CHECK: %2 = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p2(ptr addrspace(2) %0, i32{{( immarg)?}} 4, <vscale x 4 x i1> %1, <vscale x 4 x i32> {{undef|poison}})
+; CHECK: %2 = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p2(ptr addrspace(2) %0, i32{{( immarg)?}} 4, <vscale x 4 x i1> %1, <vscale x 4 x i32> poison)
 ; CHECK: ret <vscale x 4 x i32> %2
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/extract_element.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/extract_element.ll
index be6fc339be0ef..90f54f795b5db 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/extract_element.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/extract_element.ll
@@ -114,8 +114,8 @@ entry:
 ; EE-UNI-VEC: [[T4:%.*]] = shufflevector <vscale x 4 x i64> [[T3]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 ; EE-UNI-VEC: [[STEP:%.*]] = call <vscale x 4 x i64> @llvm.{{(experimental\.)?}}stepvector.nxv4i64()
 ; EE-UNI-VEC: [[T5:%.*]] = add <vscale x 4 x i64> [[T4]], [[STEP]]
-; EE-UNI-VEC: [[MOD:%.*]] = and <vscale x 4 x i64> [[T5]], {{shufflevector \(<vscale x 4 x i64> insertelement \(<vscale x 4 x i64> (undef|poison), i64 3, (i32|i64) 0\), <vscale x 4 x i64> (undef|poison), <vscale x 4 x i32> zeroinitializer\)|splat \(i64 3\)}}
-; EE-UNI-VEC: [[T6:%.*]] = shl <vscale x 4 x i64> [[STEP]], {{shufflevector \(<vscale x 4 x i64> insertelement \(<vscale x 4 x i64> (undef|poison), i64 2, (i32|i64) 0\), <vscale x 4 x i64> (undef|poison), <vscale x 4 x i32> zeroinitializer\)|splat \(i64 2\)}}
+; EE-UNI-VEC: [[MOD:%.*]] = and <vscale x 4 x i64> [[T5]], {{shufflevector \(<vscale x 4 x i64> insertelement \(<vscale x 4 x i64> poison, i64 3, (i32|i64) 0\), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer\)|splat \(i64 3\)}}
+; EE-UNI-VEC: [[T6:%.*]] = shl <vscale x 4 x i64> [[STEP]], {{shufflevector \(<vscale x 4 x i64> insertelement \(<vscale x 4 x i64> poison, i64 2, (i32|i64) 0\), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer\)|splat \(i64 2\)}}
 
 ; LLVM 16 deduces add/or equivalence and uses `or` instead.
 ; EE-UNI-VEC: [[T7:%.*]] = {{add|or}} {{(disjoint )?}}<vscale x 4 x i64> [[T6]], [[MOD]]
@@ -128,10 +128,10 @@ entry:
 ; EE-INDICES: [[ALLOC:%.*]] = alloca <vscale x 16 x float>, align 64
 ; EE-INDICES: [[T0:%.*]] = getelementptr i32, ptr addrspace(1) %idxs, i64 %call
 ; EE-INDICES: [[T2:%.*]] = load <vscale x 4 x i32>, ptr addrspace(1) [[T0]], align 4
-; EE-INDICES: [[T3:%.*]] = and <vscale x 4 x i32> [[T2]], {{shufflevector \(<vscale x 4 x i32> insertelement \(<vscale x 4 x i32> (undef|poison), i32 3, (i32|i64) 0\), <vscale x 4 x i32> (undef|poison), <vscale x 4 x i32> zeroinitializer\)|splat \(i32 3\)}}
+; EE-INDICES: [[T3:%.*]] = and <vscale x 4 x i32> [[T2]], {{shufflevector \(<vscale x 4 x i32> insertelement \(<vscale x 4 x i32> poison, i32 3, (i32|i64) 0\), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer\)|splat \(i32 3\)}}
 ; EE-INDICES: store <vscale x 16 x float> {{.*}}, ptr [[ALLOC]], align 64
 ; EE-INDICES: [[STEP:%.*]] = call <vscale x 4 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv4i32()
-; EE-INDICES: [[T4:%.*]] = shl <vscale x 4 x i32> [[STEP]], {{shufflevector \(<vscale x 4 x i32> insertelement \(<vscale x 4 x i32> (undef|poison), i32 2, (i32|i64) 0\), <vscale x 4 x i32> (undef|poison), <vscale x 4 x i32> zeroinitializer\)|splat \(i32 2\)}}
+; EE-INDICES: [[T4:%.*]] = shl <vscale x 4 x i32> [[STEP]], {{shufflevector \(<vscale x 4 x i32> insertelement \(<vscale x 4 x i32> poison, i32 2, (i32|i64) 0\), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer\)|splat \(i32 2\)}}
 ; EE-INDICES: [[T5:%.*]] = {{add|or}} {{(disjoint )?}}<vscale x 4 x i32> [[T4]], [[T3]]
 ; EE-INDICES: [[IDX:%.*]] = sext <vscale x 4 x i32> [[T5]] to <vscale x 4 x i64>
 ; EE-INDICES: [[ADDR:%.*]] = getelementptr float, ptr [[ALLOC]], <vscale x 4 x i64> [[IDX]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/insert_element.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/insert_element.ll
index 0b6a835147d0c..72f0e80da803b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/insert_element.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/insert_element.ll
@@ -99,7 +99,7 @@ entry:
 ; IE-INDICES: [[VAL:%.*]] = uitofp <vscale x 4 x i64> {{%.*}} to <vscale x 4 x float>
 ; IE-INDICES: store <vscale x 16 x float> {{%.*}}, ptr [[ALLOC]], align 64
 ; IE-INDICES: [[T1:%.*]] = call <vscale x 4 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv4i32()
-; IE-INDICES: [[T2:%.*]] = shl <vscale x 4 x i32> [[T1]], {{shufflevector \(<vscale x 4 x i32> insertelement \(<vscale x 4 x i32> (undef|poison), i32 2, (i32|i64) 0\), <vscale x 4 x i32> (undef|poison), <vscale x 4 x i32> zeroinitializer\)|splat \(i32 2\)}}
+; IE-INDICES: [[T2:%.*]] = shl <vscale x 4 x i32> [[T1]], {{shufflevector \(<vscale x 4 x i32> insertelement \(<vscale x 4 x i32> poison, i32 2, (i32|i64) 0\), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer\)|splat \(i32 2\)}}
 
 ; LLVM 16 deduces add/or equivalence and uses `or` instead.
 ; IE-INDICES: [[T3:%.*]] = {{add|or}} {{(disjoint )?}}<vscale x 4 x i32> [[T2]], {{%.*}}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/packetize_mask_varying.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/packetize_mask_varying.ll
index b1889eafb1f20..94cfaa536a6e1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/packetize_mask_varying.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/packetize_mask_varying.ll
@@ -39,7 +39,7 @@ if.end:
   ret void
 ; CHECK: define spir_kernel void @__vecz_nxv4_mask_varying
 ; CHECK: [[idx0:%.*]] = call <vscale x 16 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv16i32()
-; CHECK: [[idx1:%.*]] = lshr <vscale x 16 x i32> [[idx0]], {{shufflevector \(<vscale x 16 x i32> insertelement \(<vscale x 16 x i32> (undef|poison), i32 2, (i32|i64) 0\), <vscale x 16 x i32> (undef|poison), <vscale x 16 x i32> zeroinitializer\)|splat \(i32 2\)}}
+; CHECK: [[idx1:%.*]] = lshr <vscale x 16 x i32> [[idx0]], {{shufflevector \(<vscale x 16 x i32> insertelement \(<vscale x 16 x i32> poison, i32 2, (i32|i64) 0\), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(i32 2\)}}
 
 ; Note that since we just did a lshr 2 on the input of the extend, it doesn't
 ; make any difference whether it's a zext or sext, but LLVM 16 prefers zext.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select.ll
index e555e306e55e3..55d888bf1b4b8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select.ll
@@ -55,7 +55,7 @@ declare i64 @__mux_get_global_id(i32)
 ; CHECK: [[lhs:%[0-9a-z]+]] = load <vscale x 4 x i32>, ptr
 ; CHECK: [[rhs:%[0-9a-z]+]] = load <vscale x 4 x i32>, ptr
 ; CHECK: [[cmp:%[0-9a-z]+]] = icmp slt <vscale x 4 x i32> [[lhs]], [[rhs]]
-; CHECK: [[sel:%[0-9a-z]+]] = select <vscale x 4 x i1> [[cmp]], <vscale x 4 x i32> [[rhs]], <vscale x 4 x i32> {{shufflevector \(<vscale x 4 x i32> insertelement \(<vscale x 4 x i32> (undef|poison), i32 4, (i32|i64) 0\), <vscale x 4 x i32> (undef|poison), <vscale x 4 x i32> zeroinitializer\)|splat \(i32 4\)}}
+; CHECK: [[sel:%[0-9a-z]+]] = select <vscale x 4 x i1> [[cmp]], <vscale x 4 x i32> [[rhs]], <vscale x 4 x i32> {{shufflevector \(<vscale x 4 x i32> insertelement \(<vscale x 4 x i32> poison, i32 4, (i32|i64) 0\), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer\)|splat \(i32 4\)}}
 ; CHECK: store <vscale x 4 x i32> [[sel]],
 
 ; CHECK: define spir_kernel void @__vecz_nxv4_select_vector_vector
@@ -63,5 +63,5 @@ declare i64 @__mux_get_global_id(i32)
 ; CHECK: [[y:%[0-9a-z]+]] = load <vscale x 8 x i32>, ptr
 ; CHECK: [[z:%[0-9a-z]+]] = load <vscale x 8 x i32>, ptr
 ; CHECK: [[cmp:%[0-9a-z]+]] = icmp slt <vscale x 8 x i32> [[x]], [[y]]
-; CHECK: [[sel:%[0-9a-z]+]] = select <vscale x 8 x i1> [[cmp]], <vscale x 8 x i32> [[z]], <vscale x 8 x i32> {{shufflevector \(<vscale x 8 x i32> insertelement \(<vscale x 8 x i32> (undef|poison), i32 4, (i32|i64) 0\), <vscale x 8 x i32> (undef|poison), <vscale x 8 x i32> zeroinitializer\)|splat \(i32 4\)}}
+; CHECK: [[sel:%[0-9a-z]+]] = select <vscale x 8 x i1> [[cmp]], <vscale x 8 x i32> [[z]], <vscale x 8 x i32> {{shufflevector \(<vscale x 8 x i32> insertelement \(<vscale x 8 x i32> poison, i32 4, (i32|i64) 0\), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer\)|splat \(i32 4\)}}
 ; CHECK: store <vscale x 8 x i32> [[sel]],
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select_scalar_vector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select_scalar_vector.ll
index 62be9049501d1..6f2f36035fa60 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select_scalar_vector.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select_scalar_vector.ll
@@ -44,7 +44,7 @@ entry:
 ; CHECK: [[sext:%.*]] = sext <vscale x 4 x i1> [[cmp1]] to <vscale x 4 x i8>
 ; CHECK: store <vscale x 4 x i8> [[sext]], ptr [[alloc:%.*]], align 4
 ; CHECK: [[idx0:%.*]] = call <vscale x 8 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv8i32()
-; CHECK: [[idx1:%.*]] = lshr <vscale x 8 x i32> [[idx0]], {{shufflevector \(<vscale x 8 x i32> insertelement \(<vscale x 8 x i32> (undef|poison), i32 1, (i32|i64) 0\), <vscale x 8 x i32> (undef|poison), <vscale x 8 x i32> zeroinitializer\)|splat \(i32 1\)}}
+; CHECK: [[idx1:%.*]] = lshr <vscale x 8 x i32> [[idx0]], {{shufflevector \(<vscale x 8 x i32> insertelement \(<vscale x 8 x i32> poison, i32 1, (i32|i64) 0\), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer\)|splat \(i32 1\)}}
 
 ; Note that since we just did a lshr 1 on the input of the extend, it doesn't
 ; make any difference whether it's a zext or sext, but LLVM 16 prefers zext.
@@ -53,5 +53,5 @@ entry:
 ; CHECK: [[addrs:%.*]] = getelementptr i8, ptr [[alloc]], <vscale x 8 x i64> [[sext2]]
 ; CHECK: [[gather:%.*]] = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0(<vscale x 8 x ptr> [[addrs]],
 ; CHECK: [[cmp:%.*]] = trunc <vscale x 8 x i8> [[gather]] to <vscale x 8 x i1>
-; CHECK: [[sel:%.*]] = select <vscale x 8 x i1> [[cmp]], <vscale x 8 x i32> [[rhs]], <vscale x 8 x i32> {{shufflevector \(<vscale x 8 x i32> insertelement \(<vscale x 8 x i32> (undef|poison), i32 4, (i32|i64) 0\), <vscale x 8 x i32> (undef|poison), <vscale x 8 x i32> zeroinitializer\)|splat \(i32 4\)}}
+; CHECK: [[sel:%.*]] = select <vscale x 8 x i1> [[cmp]], <vscale x 8 x i32> [[rhs]], <vscale x 8 x i32> {{shufflevector \(<vscale x 8 x i32> insertelement \(<vscale x 8 x i32> poison, i32 4, (i32|i64) 0\), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer\)|splat \(i32 4\)}}
 ; CHECK: store <vscale x 8 x i32> [[sel]],
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/shuffle.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/shuffle.ll
index 07e95ce18f74a..b39b1ddead43b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/shuffle.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/shuffle.ll
@@ -33,7 +33,7 @@ define spir_kernel void @do_shuffle_splat(i32* %aptr, <4 x i32>* %bptr, <4 x i32
   ret void
 ; CHECK: define spir_kernel void @__vecz_nxv4_do_shuffle_splat
 ; CHECK: [[idx0:%.*]] = call <vscale x 16 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv16i32()
-; CHECK: [[idx1:%.*]] = lshr <vscale x 16 x i32> [[idx0]], {{shufflevector \(<vscale x 16 x i32> insertelement \(<vscale x 16 x i32> (undef|poison), i32 2, (i32|i64) 0\), <vscale x 16 x i32> (undef|poison), <vscale x 16 x i32> zeroinitializer\)|splat \(i32 2\)}}
+; CHECK: [[idx1:%.*]] = lshr <vscale x 16 x i32> [[idx0]], {{shufflevector \(<vscale x 16 x i32> insertelement \(<vscale x 16 x i32> poison, i32 2, (i32|i64) 0\), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(i32 2\)}}
 
 ; Note that since we just did a lshr 2 on the input of the extend, it doesn't
 ; make any difference whether it's a zext or sext, but LLVM 16 prefers zext.
@@ -55,7 +55,7 @@ define spir_kernel void @do_shuffle_splat_uniform(i32 %a, <4 x i32>* %bptr, <4 x
   ret void
 ; CHECK: define spir_kernel void @__vecz_nxv4_do_shuffle_splat_uniform
 ; CHECK: [[ins:%.*]] = insertelement <vscale x 16 x i32> poison, i32 %a, {{(i32|i64)}} 0
-; CHECK: [[splat:%.*]] = shufflevector <vscale x 16 x i32> [[ins]], <vscale x 16 x i32> {{(undef|poison)}}, <vscale x 16 x i32> zeroinitializer
+; CHECK: [[splat:%.*]] = shufflevector <vscale x 16 x i32> [[ins]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; CHECK: store <vscale x 16 x i32> [[splat]], ptr
 }
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_interleaved_load_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_interleaved_load_store.ll
index dd38362fc144c..aafc04d3c9289 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_interleaved_load_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_interleaved_load_store.ll
@@ -28,7 +28,7 @@ entry:
   %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
   store double 1.600000e+01, double addrspace(1)* %.cast, align 8
   %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
-  %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
+  %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 poison>
   %vecins7 = shufflevector <4 x double> %vecins5, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call
   %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/load_add_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/load_add_store.ll
index a4b8c1cb54c8b..39004ea1bb69c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/load_add_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/load_add_store.ll
@@ -58,7 +58,7 @@ entry:
 ; CHECK_1S: [[T1:%.*]] = shl {{(nuw )?}}i64 [[T0]], 2
 ; CHECK_1S: [[T2:%.*]] = call i64 @llvm.umin.i64(i64 [[WREM]], i64 [[T1]])
 ; CHECK_1S: [[VL:%.*]] = trunc {{(nuw )?(nsw )?}}i64 [[T2]] to i32
-; CHECK_1S: [[LHS:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr {{%.*}}, [[TRUEMASK:<vscale x 4 x i1> (shufflevector \(<vscale x 4 x i1> insertelement \(<vscale x 4 x i1> (undef|poison), i1 true, (i32|i64) 0\), <vscale x 4 x i1> (undef|poison), <vscale x 4 x i32> zeroinitializer\)|splat \(i1 true\))]], i32 [[VL]])
+; CHECK_1S: [[LHS:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr {{%.*}}, [[TRUEMASK:<vscale x 4 x i1> (shufflevector \(<vscale x 4 x i1> insertelement \(<vscale x 4 x i1> poison, i1 true, (i32|i64) 0\), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer\)|splat \(i1 true\))]], i32 [[VL]])
 ; CHECK_1S: [[RHS:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr {{%.*}}, [[TRUEMASK]], i32 [[VL]])
 ; CHECK_1S: [[ADD:%.*]] = call <vscale x 4 x i32> @llvm.vp.add.nxv4i32(<vscale x 4 x i32> [[LHS]], <vscale x 4 x i32> [[RHS]], [[TRUEMASK]], i32 [[VL]])
 ; CHECK_1S: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[ADD]], ptr {{%.*}}, [[TRUEMASK]], i32 [[VL]])
@@ -99,7 +99,7 @@ entry:
 ; CHECK_V4_1S: [[VL:%.*]] = trunc {{(nuw )?(nsw )?}}i64 [[T2]] to i32
 ; Each WI performs 4 elements, so multiply the VL by 4
 ; CHECK_V4_1S: [[SVL:%.*]] = shl i32 [[VL]], 2
-; CHECK_V4_1S: [[LHS:%.*]] = call <vscale x 16 x i32> @llvm.vp.load.nxv16i32.p0(ptr {{%.*}}, [[TRUEMASK:<vscale x 16 x i1> (shufflevector \(<vscale x 16 x i1> insertelement \(<vscale x 16 x i1> (undef|poison), i1 true, (i32|i64) 0\), <vscale x 16 x i1> (undef|poison), <vscale x 16 x i32> zeroinitializer\)|splat \(i1 true\))]], i32 [[SVL]])
+; CHECK_V4_1S: [[LHS:%.*]] = call <vscale x 16 x i32> @llvm.vp.load.nxv16i32.p0(ptr {{%.*}}, [[TRUEMASK:<vscale x 16 x i1> (shufflevector \(<vscale x 16 x i1> insertelement \(<vscale x 16 x i1> poison, i1 true, (i32|i64) 0\), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(i1 true\))]], i32 [[SVL]])
 ; CHECK_V4_1S: [[RHS:%.*]] = call <vscale x 16 x i32> @llvm.vp.load.nxv16i32.p0(ptr {{%.*}}, [[TRUEMASK]], i32 [[SVL]])
 ; CHECK_V4_1S: [[ADD:%.*]] = call <vscale x 16 x i32> @llvm.vp.add.nxv16i32(<vscale x 16 x i32> [[LHS]], <vscale x 16 x i32> [[RHS]], [[TRUEMASK]], i32 [[SVL]])
 ; CHECK_V4_1S: call void @llvm.vp.store.nxv16i32.p0(<vscale x 16 x i32> [[ADD]], ptr {{%.*}}, [[TRUEMASK]], i32 [[SVL]])
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/udiv.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/udiv.ll
index 8178c59d943e1..777ac90321664 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/udiv.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/udiv.ll
@@ -44,7 +44,7 @@ entry:
 ; CHECK: [[T1:%.*]] = shl {{(nuw )?}}i64 [[T0]], 1
 ; CHECK: [[T2:%.*]] = call i64 @llvm.umin.i64(i64 [[WREM]], i64 [[T1]])
 ; CHECK: [[VL:%.*]] = trunc i64 [[T2]] to i32
-; CHECK: [[LHS:%.*]] = call <vscale x 2 x i32> @llvm.vp.load.nxv2i32.p0(ptr {{%.*}}, [[TRUEMASK:<vscale x 2 x i1> (shufflevector \(<vscale x 2 x i1> insertelement \(<vscale x 2 x i1> (undef|poison), i1 true, (i32|i64) 0\), <vscale x 2 x i1> (undef|poison), <vscale x 2 x i32> zeroinitializer\)|splat \(i1 true\))]], i32 [[VL]])
+; CHECK: [[LHS:%.*]] = call <vscale x 2 x i32> @llvm.vp.load.nxv2i32.p0(ptr {{%.*}}, [[TRUEMASK:<vscale x 2 x i1> (shufflevector \(<vscale x 2 x i1> insertelement \(<vscale x 2 x i1> poison, i1 true, (i32|i64) 0\), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer\)|splat \(i1 true\))]], i32 [[VL]])
 ; CHECK: [[RHS:%.*]] = call <vscale x 2 x i32> @llvm.vp.load.nxv2i32.p0(ptr {{%.*}}, [[TRUEMASK]], i32 [[VL]])
 ; CHECK: [[ADD:%.*]] = call <vscale x 2 x i32> @llvm.vp.udiv.nxv2i32(<vscale x 2 x i32> [[LHS]], <vscale x 2 x i32> [[RHS]], [[TRUEMASK]], i32 [[VL]])
 ; CHECK: call void @llvm.vp.store.nxv2i32.p0(<vscale x 2 x i32> [[ADD]], ptr {{%.*}}, [[TRUEMASK]], i32 [[VL]])
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load.ll
index 0202575883035..7c71b06f530c3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load.ll
@@ -30,7 +30,7 @@ entry:
   call void @__mux_work_group_barrier(i32 0, i32 2, i32 528) #3
   store double 1.600000e+01, double addrspace(1)* %.cast, align 8
   %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
-  %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
+  %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 poison>
   %vecins7 = shufflevector <4 x double> %vecins5, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call
   %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load_as_masked.ll
index 0202575883035..7c71b06f530c3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load_as_masked.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load_as_masked.ll
@@ -30,7 +30,7 @@ entry:
   call void @__mux_work_group_barrier(i32 0, i32 2, i32 528) #3
   store double 1.600000e+01, double addrspace(1)* %.cast, align 8
   %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
-  %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
+  %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 poison>
   %vecins7 = shufflevector <4 x double> %vecins5, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call
   %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/interleaved_safety.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/interleaved_safety.ll
index 2498cb54e5209..00853a9b28b94 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/interleaved_safety.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/interleaved_safety.ll
@@ -30,7 +30,7 @@ entry:
   call void @__mux_work_group_barrier(i32 0, i32 2, i32 528) #3
   store double 1.600000e+01, double addrspace(1)* %.cast, align 8
   %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
-  %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
+  %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 poison>
   %vecins7 = shufflevector <4 x double> %vecins5, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call
   %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmin_vector_scalar.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmin_vector_scalar.ll
index b433fb0fff646..1251115351205 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmin_vector_scalar.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmin_vector_scalar.ll
@@ -57,7 +57,7 @@ entry:
 ; scalar operand is sub-splatted to the required <16 x float>.
 ; CHECK: %[[LDA:.+]] = load <16 x float>, ptr %{{.+}}
 ; CHECK: %[[LDB:.+]] = load <4 x float>, ptr %{{.+}}
-; CHECK: %[[SPL:.+]] = shufflevector <4 x float> %[[LDB]], <4 x float> {{undef|poison}}, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
+; CHECK: %[[SPL:.+]] = shufflevector <4 x float> %[[LDB]], <4 x float> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
 ; CHECK: %[[RES:.+]] = call <16 x float> @llvm.minnum.v16f32(<16 x float> %[[LDA]], <16 x float> %[[SPL]])
 ; CHECK: store <16 x float> %[[RES]], ptr %{{.+}}
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load.ll
index bc1ab94fcd018..d54d31595e7f8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load.ll
@@ -29,7 +29,7 @@ entry:
   call void @__mux_work_group_barrier(i32 0, i32 2, i32 528) #3
   store double 1.600000e+01, double addrspace(1)* %.cast, align 8
   %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
-  %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
+  %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 poison>
   %vecins7 = shufflevector <4 x double> %vecins5, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call
   %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
@@ -55,8 +55,8 @@ declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double
 
 ; Test if the interleaved load is defined correctly
 ; CHECK: define <4 x double> @__vecz_b_interleaved_load8_4_Dv4_du3ptrU3AS1(ptr addrspace(1){{( %0)?}})
-; CHECK: %BroadcastAddr.splatinsert = insertelement <4 x ptr addrspace(1)> {{poison|undef}}, ptr addrspace(1) %0, {{i32|i64}} 0
-; CHECK: %BroadcastAddr.splat = shufflevector <4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <4 x ptr addrspace(1)> {{poison|undef}}, <4 x i32> zeroinitializer
+; CHECK: %BroadcastAddr.splatinsert = insertelement <4 x ptr addrspace(1)> poison, ptr addrspace(1) %0, {{i32|i64}} 0
+; CHECK: %BroadcastAddr.splat = shufflevector <4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <4 x ptr addrspace(1)> poison, <4 x i32> zeroinitializer
 ; CHECK: %[[TMP1:.*]] = getelementptr double, <4 x ptr addrspace(1)> %BroadcastAddr.splat, <4 x i64> <i64 0, i64 4, i64 8, i64 12>
 ; CHECK: %[[TMP2:.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p1(<4 x ptr addrspace(1)> %[[TMP1]], i32{{( immarg)?}} 8, <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}, <4 x double> poison)
 ; CHECK: ret <4 x double> %[[TMP2]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load_as_masked.ll
index e17494b5cf462..ca5b39de6e149 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load_as_masked.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load_as_masked.ll
@@ -30,7 +30,7 @@ entry:
   call void @__mux_work_group_barrier(i32 0, i32 2, i32 528) #3
   store double 1.600000e+01, double addrspace(1)* %.cast, align 8
   %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
-  %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
+  %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 poison>
   %vecins7 = shufflevector <4 x double> %vecins5, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call
   %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
@@ -72,8 +72,8 @@ attributes #3 = { nobuiltin nounwind }
 
 ; Test if the interleaved load is defined correctly
 ; CHECK: define <4 x double> @__vecz_b_interleaved_load8_4_Dv4_du3ptrU3AS1(ptr addrspace(1){{( %0)?}})
-; CHECK: %BroadcastAddr.splatinsert = insertelement <4 x ptr addrspace(1)> {{poison|undef}}, ptr addrspace(1) %0, {{i32|i64}} 0
-; CHECK: %BroadcastAddr.splat = shufflevector <4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <4 x ptr addrspace(1)> {{poison|undef}}, <4 x i32> zeroinitializer
+; CHECK: %BroadcastAddr.splatinsert = insertelement <4 x ptr addrspace(1)> poison, ptr addrspace(1) %0, {{i32|i64}} 0
+; CHECK: %BroadcastAddr.splat = shufflevector <4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <4 x ptr addrspace(1)> poison, <4 x i32> zeroinitializer
 ; CHECK: %1 = getelementptr double, <4 x ptr addrspace(1)> %BroadcastAddr.splat, <4 x i64> <i64 0, i64 4, i64 8, i64 12>
 ; CHECK: %2 = call <4 x double> @llvm.masked.gather.v4f64.v4p1(<4 x ptr addrspace(1)> %1, i32{{( immarg)?}} 8, <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}, <4 x double> poison)
 ; CHECK: ret <4 x double> %2
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store.ll
index 1c5752a1da9c3..1e8c1c3f67979 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store.ll
@@ -29,7 +29,7 @@ entry:
   call void @__mux_work_group_barrier(i32 0, i32 2, i32 528)
   store double 1.600000e+01, double addrspace(1)* %.cast, align 8
   %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
-  %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
+  %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 poison>
   %vecins7 = shufflevector <4 x double> %vecins5, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call
   %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
@@ -56,8 +56,8 @@ declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double
 ; Test if the interleaved store is defined correctly
 ; CHECK: define void @__vecz_b_interleaved_store8_4_Dv4_du3ptrU3AS1(<4 x double>{{( %0)?}}, ptr addrspace(1){{( %1)?}})
 ; CHECK: entry:
-; CHECK: %BroadcastAddr.splatinsert = insertelement <4 x ptr addrspace(1)> {{poison|undef}}, ptr addrspace(1) %1, {{i32|i64}} 0
-; CHECK: %BroadcastAddr.splat = shufflevector <4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <4 x ptr addrspace(1)> {{poison|undef}}, <4 x i32> zeroinitializer
+; CHECK: %BroadcastAddr.splatinsert = insertelement <4 x ptr addrspace(1)> poison, ptr addrspace(1) %1, {{i32|i64}} 0
+; CHECK: %BroadcastAddr.splat = shufflevector <4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <4 x ptr addrspace(1)> poison, <4 x i32> zeroinitializer
 ; CHECK: %2 = getelementptr double, <4 x ptr addrspace(1)> %BroadcastAddr.splat, <4 x i64> <i64 0, i64 4, i64 8, i64 12>
 ; CHECK: call void @llvm.masked.scatter.v4f64.v4p1(<4 x double> %0, <4 x ptr addrspace(1)> %2, i32{{( immarg)?}} 8, <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}})
 ; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store_as_masked.ll
index 394bc03896118..5fd7ad27aa856 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store_as_masked.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store_as_masked.ll
@@ -30,7 +30,7 @@ entry:
   call void @__mux_work_group_barrier(i32 0, i32 2, i32 528) #3
   store double 1.600000e+01, double addrspace(1)* %.cast, align 8
   %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
-  %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
+  %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 poison>
   %vecins7 = shufflevector <4 x double> %vecins5, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call
   %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
@@ -73,8 +73,8 @@ attributes #3 = { nobuiltin nounwind }
 ; Test if the interleaved store is defined correctly
 ; CHECK: define void @__vecz_b_interleaved_store8_4_Dv4_du3ptrU3AS1(<4 x double>{{( %0)?}}, ptr addrspace(1){{( %1)?}})
 ; CHECK: entry:
-; CHECK: %BroadcastAddr.splatinsert = insertelement <4 x ptr addrspace(1)> {{poison|undef}}, ptr addrspace(1) %1, {{i32|i64}} 0
-; CHECK:  %BroadcastAddr.splat = shufflevector <4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <4 x ptr addrspace(1)> {{poison|undef}}, <4 x i32> zeroinitializer
+; CHECK: %BroadcastAddr.splatinsert = insertelement <4 x ptr addrspace(1)> poison, ptr addrspace(1) %1, {{i32|i64}} 0
+; CHECK:  %BroadcastAddr.splat = shufflevector <4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <4 x ptr addrspace(1)> poison, <4 x i32> zeroinitializer
 ; CHECK:  %2 = getelementptr double, <4 x ptr addrspace(1)> %BroadcastAddr.splat, <4 x i64> <i64 0, i64 4, i64 8, i64 12>
 ; CHECK:  call void @llvm.masked.scatter.v4f64.v4p1(<4 x double> %0, <4 x ptr addrspace(1)> %2, i32{{( immarg)?}} 8, <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}})
 ; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_internal_builtins.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_internal_builtins.ll
index 20733e5eeee21..8de9ec81b534c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_internal_builtins.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_internal_builtins.ll
@@ -27,6 +27,6 @@ define spir_kernel void @dummy(i32 addrspace(2)* %in, i32 addrspace(1)* %out) {
 
 declare <4 x i32> @__vecz_b_masked_load4_Dv4_jPU3AS2Dv4_jDv4_b(<4 x i32> addrspace(2)*, <4 x i1>)
 ; CHECK-LABEL: define <4 x i32> @__vecz_b_masked_load4_Dv4_jPU3AS2Dv4_jDv4_b(ptr addrspace(2){{.*}}, <4 x i1>{{.*}}) {
-; CHECK:   %2 = call <4 x i32> @llvm.masked.load.v4i32.p2(ptr addrspace(2) %0, i32 4, <4 x i1> %1, <4 x i32> {{undef|poison}})
+; CHECK:   %2 = call <4 x i32> @llvm.masked.load.v4i32.p2(ptr addrspace(2) %0, i32 4, <4 x i1> %1, <4 x i32> poison)
 ; CHECK:   ret <4 x i32> %2
 ; CHECK: }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_load.ll
index bd17447cb7889..1b7e191cce0a3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_load.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_load.ll
@@ -86,5 +86,5 @@ attributes #6 = { nounwind }
 ; Test if the masked load is defined correctly
 ; CHECK: define <4 x i32> @__vecz_b_masked_load4_Dv4_ju3ptrU3AS2Dv4_b(ptr addrspace(2){{( %0)?}}, <4 x i1>{{( %1)?}})
 ; CHECK: entry:
-; CHECK: %2 = call <4 x i32> @llvm.masked.load.v4i32.p2(ptr addrspace(2) %0, i32{{( immarg)?}} 4, <4 x i1> %1, <4 x i32> {{undef|poison}})
+; CHECK: %2 = call <4 x i32> @llvm.masked.load.v4i32.p2(ptr addrspace(2) %0, i32{{( immarg)?}} 4, <4 x i1> %1, <4 x i32> poison)
 ; CHECK: ret <4 x i32> %2
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/expect_assume.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/expect_assume.ll
index 604fa9c86da29..ebf2ef88aa2c8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/expect_assume.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/expect_assume.ll
@@ -65,7 +65,7 @@ entry:
 ; CHECK: [[EX1:%.*]] = call i32 @llvm.expect.i32(i32 [[E1]], i32 42)
 ; CHECK: [[EX2:%.*]] = call i32 @llvm.expect.i32(i32 [[E2]], i32 42)
 ; CHECK: [[EX3:%.*]] = call i32 @llvm.expect.i32(i32 [[E3]], i32 42)
-; CHECK: [[C0:%.*]] = insertelement <4 x i32> {{undef|poison}}, i32 [[EX0]], i64 0
+; CHECK: [[C0:%.*]] = insertelement <4 x i32> poison, i32 [[EX0]], i64 0
 ; CHECK: [[C1:%.*]]  = insertelement <4 x i32> [[C0]], i32 [[EX1]], i64 1
 ; CHECK: [[C2:%.*]]  = insertelement <4 x i32> [[C1]], i32 [[EX2]], i64 2
 ; CHECK: [[C3:%.*]]  = insertelement <4 x i32> [[C2]], i32 [[EX3]], i64 3
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insert_element_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insert_element_debug_info.ll
index 5f82e99e6d583..bc53958d2707c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insert_element_debug_info.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insert_element_debug_info.ll
@@ -50,7 +50,7 @@ entry:
 ; FIXME: This llvm.dbg.value marks a 'kill location' and denotes the
 ; termination of the previous value assigned to %tmp - we could probably do
 ; better here by manifesting a vectorized value?
-; CHECK: #dbg_value(i32 {{(poison|undef)}}, [[VAR:![0-9]+]],
+; CHECK: #dbg_value(i32 poison, [[VAR:![0-9]+]],
 ; CHECK-SAME:   !DIExpression({{.*}}),
 ; CHECK-SAME:   !{{[0-9]+}}
   %1 = load i32, i32* %tid, align 4, !dbg !32
@@ -58,7 +58,7 @@ entry:
   %idx.ext = sext i32 %mul to i64, !dbg !32
   %add.ptr = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 %idx.ext, !dbg !32
   %call1 = call spir_func <3 x i32> @_Z6vload3mPKU3AS1i(i64 0, i32 addrspace(1)* %add.ptr) #3, !dbg !32
-  %extractVec = shufflevector <3 x i32> %call1, <3 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>, !dbg !32
+  %extractVec = shufflevector <3 x i32> %call1, <3 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>, !dbg !32
   %storetmp = bitcast <3 x i32>* %tmp to <4 x i32>*, !dbg !32
   store <4 x i32> %extractVec, <4 x i32>* %storetmp, align 16, !dbg !32
   %2 = load <3 x i32>, <3 x i32>* %tmp, align 16, !dbg !33
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/instantiate_constants.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/instantiate_constants.ll
index 16367f8197290..990b7cdcec49f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/instantiate_constants.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/instantiate_constants.ll
@@ -85,7 +85,7 @@ attributes #6 = { convergent nobuiltin nounwind }
 ; CHECK: %[[C1:.+]] = call spir_func float @_Z11vloada_halfmPKDh(i64 0, ptr nonnull %{{.+}})
 ; CHECK: %[[C2:.+]] = call spir_func float @_Z11vloada_halfmPKDh(i64 0, ptr nonnull %{{.+}})
 ; CHECK: %[[C3:.+]] = call spir_func float @_Z11vloada_halfmPKDh(i64 0, ptr nonnull %{{.+}})
-; CHECK: %[[G0:.+]] = insertelement <4 x float> {{undef|poison}}, float %[[C0]], {{(i32|i64)}} 0
+; CHECK: %[[G0:.+]] = insertelement <4 x float> poison, float %[[C0]], {{(i32|i64)}} 0
 ; CHECK: %[[G1:.+]] = insertelement <4 x float> %[[G0]], float %[[C1]], {{(i32|i64)}} 1
 ; CHECK: %[[G2:.+]] = insertelement <4 x float> %[[G1]], float %[[C2]], {{(i32|i64)}} 2
 ; CHECK: %[[G3:.+]] = insertelement <4 x float> %[[G2]], float %[[C3]], {{(i32|i64)}} 3
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_safety.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_safety.ll
index 17768daa08af3..9af442b68e1a2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_safety.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_safety.ll
@@ -30,7 +30,7 @@ entry:
   call void @__mux_work_group_barrier(i32 0, i32 2, i32 528) #3
   store double 1.600000e+01, double addrspace(1)* %.cast, align 8
   %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
-  %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
+  %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 poison>
   %vecins7 = shufflevector <4 x double> %vecins5, <4 x double> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
   %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call
   %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved.ll
index f094ca8e2616a..d88ad53d87e01 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved.ll
@@ -67,8 +67,8 @@ attributes #2 = { nobuiltin }
 
 ; CHECK: define void @__vecz_b_masked_interleaved_store4_2_Dv4_ju3ptrU3AS1Dv4_b(<4 x i32>{{( %0)?}}, ptr addrspace(1){{( %1)?}}, <4 x i1>{{( %2)?}}) [[ATTRS:#[0-9]+]] {
 ; CHECK: entry:
-; CHECK: %BroadcastAddr.splatinsert = insertelement <4 x ptr addrspace(1)> {{poison|undef}}, ptr addrspace(1) %1, {{i32|i64}} 0
-; CHECK: %BroadcastAddr.splat = shufflevector <4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <4 x ptr addrspace(1)> {{poison|undef}}, <4 x i32> zeroinitializer
+; CHECK: %BroadcastAddr.splatinsert = insertelement <4 x ptr addrspace(1)> poison, ptr addrspace(1) %1, {{i32|i64}} 0
+; CHECK: %BroadcastAddr.splat = shufflevector <4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <4 x ptr addrspace(1)> poison, <4 x i32> zeroinitializer
 ; CHECK: %3 = getelementptr i32, <4 x ptr addrspace(1)> %BroadcastAddr.splat, <4 x i64> <i64 0, i64 2, i64 4, i64 6>
 ; CHECK: call void @llvm.masked.scatter.v4i32.v4p1(<4 x i32> %0, <4 x ptr addrspace(1)> %3, i32{{( immarg)?}} 4, <4 x i1> %2) #
 ; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_as_scatter.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_as_scatter.ll
index 1d822b93be424..3999f2cf44a80 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_as_scatter.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_as_scatter.ll
@@ -68,8 +68,8 @@ attributes #2 = { nobuiltin }
 ; CHECK: define void @__vecz_b_masked_interleaved_store4_2_Dv4_ju3ptrU3AS1Dv4_b(<4 x i32>{{( %0)?}}, ptr addrspace(1){{( %1)?}}, <4 x i1>{{( %2)?}}) [[ATTRS:#[0-9]+]] {
 
 ; Check for the address splat
-; CHECK: %[[BROADCASTADDRSPLATINSERT:.+]] = insertelement <4 x ptr addrspace(1)> {{poison|undef}}, ptr addrspace(1) %{{.+}}, {{i32|i64}} 0
-; CHECK: %[[BROADCASTADDRSPLAT:.+]] = shufflevector <4 x ptr addrspace(1)> %[[BROADCASTADDRSPLATINSERT]], <4 x ptr addrspace(1)> {{poison|undef}}, <4 x i32> zeroinitializer
+; CHECK: %[[BROADCASTADDRSPLATINSERT:.+]] = insertelement <4 x ptr addrspace(1)> poison, ptr addrspace(1) %{{.+}}, {{i32|i64}} 0
+; CHECK: %[[BROADCASTADDRSPLAT:.+]] = shufflevector <4 x ptr addrspace(1)> %[[BROADCASTADDRSPLATINSERT]], <4 x ptr addrspace(1)> poison, <4 x i32> zeroinitializer
 ; CHECK: getelementptr i32, <4 x ptr addrspace(1)> %[[BROADCASTADDRSPLAT]], <4 x i64> <i64 0, i64 2, i64 4, i64 6>
 
 ; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_branch.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_branch.ll
index 7631fc7818e40..16b63d1e1c451 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_branch.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_branch.ll
@@ -47,11 +47,11 @@ declare i64 @__mux_get_global_id(i32)
 ; and masked properly
 ; CHECK: define spir_kernel void @__vecz_v4_test_branch(i32 %a, ptr %b)
 ; CHECK: %conv = sext i32 %a to i64
-; CHECK: %[[A_SPLATINSERT:.+]] = insertelement <4 x i64> {{poison|undef}}, i64 %conv, {{i32|i64}} 0
-; CHECK: %[[A_SPLAT:.+]] = shufflevector <4 x i64> %[[A_SPLATINSERT]], <4 x i64> {{poison|undef}}, <4 x i32> zeroinitializer
+; CHECK: %[[A_SPLATINSERT:.+]] = insertelement <4 x i64> poison, i64 %conv, {{i32|i64}} 0
+; CHECK: %[[A_SPLAT:.+]] = shufflevector <4 x i64> %[[A_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK: %call = call i64 @__mux_get_global_id(i32 0)
-; CHECK: %[[GID_SPLATINSERT:.+]] = insertelement <4 x i64> {{poison|undef}}, i64 %call, {{i32|i64}} 0
-; CHECK: %[[GID_SPLAT:.+]] = shufflevector <4 x i64> %[[GID_SPLATINSERT:.+]], <4 x i64> {{poison|undef}}, <4 x i32> zeroinitializer
+; CHECK: %[[GID_SPLATINSERT:.+]] = insertelement <4 x i64> poison, i64 %call, {{i32|i64}} 0
+; CHECK: %[[GID_SPLAT:.+]] = shufflevector <4 x i64> %[[GID_SPLATINSERT:.+]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK: %[[GID:.+]] = add <4 x i64> %[[GID_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
 ; CHECK: %[[CMP3:.+]] = icmp eq <4 x i64> %[[A_SPLAT]], %[[GID]]
 ; CHECK: %[[NOT_CMP4:.+]] = xor <4 x i1> %[[CMP3]], {{<(i1 true(, )?)+>|splat \(i1 true\)}}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_uniform_branch.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_uniform_branch.ll
index faf4e1cb43485..9f82652ea8a23 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_uniform_branch.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_uniform_branch.ll
@@ -84,8 +84,8 @@ declare i64 @__mux_get_global_id(i32)
 ; node is also vectorized properly
 ; CHECK: define spir_kernel void @__vecz_v4_test_uniform_branch(i32 %a, ptr %b)
 ; CHECK: %call = call i64 @__mux_get_global_id(i32 0)
-; CHECK: %[[SPLATINSERT:.+]] = insertelement <4 x i64> {{poison|undef}}, i64 %call, {{i32|i64}} 0
-; CHECK: %[[SPLAT:.+]] = shufflevector <4 x i64> %[[SPLATINSERT]], <4 x i64> {{poison|undef}}, <4 x i32> zeroinitializer
+; CHECK: %[[SPLATINSERT:.+]] = insertelement <4 x i64> poison, i64 %call, {{i32|i64}} 0
+; CHECK: %[[SPLAT:.+]] = shufflevector <4 x i64> %[[SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK: %[[GID:.+]] = add <4 x i64> %[[SPLAT]], <i64 0, i64 1, i64 2, i64 3>
 ; CHECK: %cmp = icmp eq i32 %a, 42
 ; CHECK: br i1 %cmp, label %if.then, label %if.else
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_conditional.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_conditional.ll
index 019d77387fe7c..4c5d2b32da7f4 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_conditional.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_conditional.ll
@@ -152,7 +152,7 @@ if.end:                                           ; preds = %entry, %if.then
 ; values only.
 
 ; CHECK: define spir_kernel void @__vecz_v4_conditional(ptr addrspace(1) %in, ptr addrspace(1) %out)
-; CHECK: insertelement <4 x ptr addrspace(1)> {{poison|undef}}, ptr addrspace(1) %in, {{(i32|i64)}} 0
+; CHECK: insertelement <4 x ptr addrspace(1)> poison, ptr addrspace(1) %in, {{(i32|i64)}} 0
 ; CHECK: shufflevector <4 x ptr addrspace(1)>
 ; CHECK: call <4 x i32> @__vecz_b_gather_load4_Dv4_jDv4_u3ptrU3AS1
 ; CHECK: store <4 x i32>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_conditional.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_conditional.ll
index 91cac389bbd0c..89d5118c8fbc1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_conditional.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_conditional.ll
@@ -153,7 +153,7 @@ if.end:                                           ; preds = %entry, %if.then
 
 ; CHECK: define spir_kernel void @__vecz_v4_conditional(ptr addrspace(1) %in, ptr addrspace(1) %out)
 ; CHECK: load i32, ptr
-; CHECK: insertelement <4 x i32> {{poison|undef}}
+; CHECK: insertelement <4 x i32> poison
 ; CHECK: shufflevector <4 x i32>
 ; CHECK: store <4 x i32>
 ; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_reduce.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_reduce.ll
index a77b4e08bc1b7..6fc47a670a781 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_reduce.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_reduce.ll
@@ -152,7 +152,7 @@ if.end:                                           ; preds = %entry, %if.then
 ; packetized.
 
 ; CHECK: define spir_kernel void @__vecz_v4_reduce(ptr addrspace(3) %in, ptr addrspace(3) %out)
-; CHECK: insertelement <4 x i64> {{poison|undef}}, i64
+; CHECK: insertelement <4 x i64> poison, i64
 ; CHECK: shufflevector <4 x i64>
 ; CHECK: %[[LOCAL_SIZE:[^ ]+]] = call i64 @__mux_get_local_size(i32 0)
 ; CHECK: icmp {{(ugt|ult)}} i64 %[[LOCAL_SIZE]], {{(1|2)}}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_conditional.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_conditional.ll
index e23a4e52ab81e..1a4a89972205f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_conditional.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_conditional.ll
@@ -154,7 +154,7 @@ if.end:                                           ; preds = %entry, %if.then
 ; CHECK: define spir_kernel void @__vecz_v4_conditional(ptr addrspace(1) %in, ptr addrspace(1) %out)
 ; CHECK: load i32, {{(ptr|i32)}}
 ; CHECK: load i32, {{(ptr|i32)}}
-; CHECK: insertelement <4 x i32> {{poison|undef}}
+; CHECK: insertelement <4 x i32> poison
 ; CHECK: shufflevector <4 x i32>
 ; CHECK: store <4 x i32>
 ; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_reduce.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_reduce.ll
index b4b050210fb0c..e251cc4bd07e1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_reduce.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_reduce.ll
@@ -64,8 +64,8 @@ for.end:                                          ; preds = %for.cond
 ; by other uniform values only.
 
 ; CHECK: define spir_kernel void @__vecz_v4_reduce(ptr addrspace(3) %in, ptr addrspace(3) %out)
-; CHECK: insertelement <4 x i64> {{poison|undef}}, i64 %{{.+}}, {{(i32|i64)}} 0
-; CHECK: shufflevector <4 x i64> %{{.+}}, <4 x i64> {{poison|undef}}, <4 x i32> zeroinitializer
+; CHECK: insertelement <4 x i64> poison, i64 %{{.+}}, {{(i32|i64)}} 0
+; CHECK: shufflevector <4 x i64> %{{.+}}, <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK: phi <4 x i32>
 ; CHECK: mul <4 x i32> %{{.+}}, {{<(i32 3(, )?)+>|splat \(i32 3\)}}
 ; CHECK: urem <4 x i64>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_reduce.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_reduce.ll
index 61dd05df92f1a..3815fce8e6637 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_reduce.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_reduce.ll
@@ -64,8 +64,8 @@ for.end:                                          ; preds = %for.cond
 ; values only.
 
 ; CHECK: define spir_kernel void @__vecz_v4_reduce(ptr addrspace(3) %in, ptr addrspace(3) %out)
-; CHECK: insertelement <4 x i64> {{poison|undef}}, i64 %{{.+}}, {{(i32|i64)}} 0
-; CHECK: shufflevector <4 x i64> %{{.+}}, <4 x i64> {{poison|undef}}, <4 x i32> zeroinitializer
+; CHECK: insertelement <4 x i64> poison, i64 %{{.+}}, {{(i32|i64)}} 0
+; CHECK: shufflevector <4 x i64> %{{.+}}, <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK: phi <4 x i32>
 ; CHECK: mul <4 x i32> %{{.+}}, {{<(i32 3(, )?)+>|splat \(i32 3\)}}
 ; CHECK: urem <4 x i64>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/undef_ub.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/poison_ub.ll
similarity index 85%
rename from llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/undef_ub.ll
rename to llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/poison_ub.ll
index c996ab108cda9..027a688a8614a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/undef_ub.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/poison_ub.ll
@@ -28,16 +28,16 @@ declare i32 @__mux_get_local_id(i32) #2
 define spir_kernel void @test() #0 {
 entry:
   %call8 = call i32 @__mux_get_local_id(i32 0) #3
-  %arrayidx = getelementptr inbounds i8, i8 addrspace(1)* undef, i32 %call8
+  %arrayidx = getelementptr inbounds i8, i8 addrspace(1)* poison, i32 %call8
   %0 = load i8, i8 addrspace(1)* %arrayidx, align 1
   %conv9 = uitofp i8 %0 to float
   %phitmp = fptoui float %conv9 to i8
-  %arrayidx16 = getelementptr inbounds i8, i8 addrspace(1)* undef, i32 %call8
+  %arrayidx16 = getelementptr inbounds i8, i8 addrspace(1)* poison, i32 %call8
   store i8 %phitmp, i8 addrspace(1)* %arrayidx16, align 1
   ret void
 }
 
-; The "undefs" in the above IR should "optimize" to a trap call and an unreachable
+; The "poison"s in the above IR should "optimize" to a trap call and an unreachable
 ; terminator instruction.
 ; CHECK: define spir_kernel void @__vecz_v4_test
 ; CHECK: unreachable
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/predicate_with_switch.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/predicate_with_switch.ll
index a0c94959a7fa2..7d52fda58e6f3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/predicate_with_switch.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/predicate_with_switch.ll
@@ -23,7 +23,7 @@ declare i64 @__mux_get_local_id(i32)
 
 declare i64 @__mux_get_global_id(i32)
 
-@predicate_with_switch.tmpIn = internal addrspace(3) global [16 x i32] undef, align 4
+@predicate_with_switch.tmpIn = internal addrspace(3) global [16 x i32] poison, align 4
 
 define spir_kernel void @predicate_with_switch(i32 addrspace(1)* %A, i32 addrspace(1)* %B) #0 {
 entry:
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_in_varying_branch.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_in_varying_branch.ll
index 0f67b19351679..e7b76a778e784 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_in_varying_branch.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_in_varying_branch.ll
@@ -50,6 +50,6 @@ merge:
 
 ; CHECK: define spir_kernel void @__vecz_v4_test
 ; CHECK: %[[LOAD:.+]] = load i32, ptr addrspace(1) %in
-; CHECK: %[[SPLAT_IN:.+]] = insertelement <4 x i32> {{poison|undef}}, i32 %[[LOAD]], {{(i32|i64)}} 0
-; CHECK: %[[SPLAT:.+]] = shufflevector <4 x i32> %[[SPLAT_IN]], <4 x i32> {{poison|undef}}, <4 x i32> zeroinitializer
+; CHECK: %[[SPLAT_IN:.+]] = insertelement <4 x i32> poison, i32 %[[LOAD]], {{(i32|i64)}} 0
+; CHECK: %[[SPLAT:.+]] = shufflevector <4 x i32> %[[SPLAT_IN]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK: call void @__vecz_b_masked_store4_Dv4_ju3ptrU3AS1Dv4_b(<4 x i32> %[[SPLAT]], ptr addrspace(1){{( nonnull)? %.*}}, <4 x i1> %{{.+}})
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-gather.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-gather.ll
index 7dd47ae4ffcc9..7d361eaa47399 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-gather.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-gather.ll
@@ -47,7 +47,7 @@ declare spir_func i32 @not_scalarizable(<4 x i32> noundef)
 ; CHECK:   %[[ADD1:.*]] = add i32 %[[LD]]
 ; CHECK:   %[[ADD2:.*]] = add i32 %[[LD]]
 ; CHECK:   %[[ADD3:.*]] = add i32 %[[LD]]
-; CHECK:   %[[INS0:.*]] = insertelement <4 x i32> {{undef|poison}}, i32 %[[ADD0]], i32 0
+; CHECK:   %[[INS0:.*]] = insertelement <4 x i32> poison, i32 %[[ADD0]], i32 0
 ; CHECK:   %[[INS1:.+]] = insertelement <4 x i32> %[[INS0]], i32 %[[ADD1]], i32 1
 ; CHECK:   %[[INS2:.+]] = insertelement <4 x i32> %[[INS1]], i32 %[[ADD2]], i32 2
 ; CHECK:   %[[INS3:.+]] = insertelement <4 x i32> %[[INS2]], i32 %[[ADD3]], i32 3
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-splat.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-splat.ll
index 492f39758391e..4492b16c1c978 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-splat.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-splat.ll
@@ -41,9 +41,9 @@ declare spir_func float @not_scalarizable(<4 x float> noundef)
 ; CHECK: void @__vecz_v4_splat({{.*}})
 ; CHECK: entry:
 ; CHECK:   %[[LD:.*]] = load float
-; CHECK:   %[[INS0:.*]] = insertelement <4 x float> {{undef|poison}}, float %[[LD]], {{i32|i64}} 0
+; CHECK:   %[[INS0:.*]] = insertelement <4 x float> poison, float %[[LD]], {{i32|i64}} 0
 ; CHECK-NOT: %{{.*}} = insertelement <4 x float> %{{.*}}, float %[[LD]], {{i32|i64}} 1
 ; CHECK-NOT: %{{.*}} = insertelement <4 x float> %{{.*}}, float %[[LD]], {{i32|i64}} 2
 ; CHECK-NOT: %{{.*}} = insertelement <4 x float> %{{.*}}, float %[[LD]], {{i32|i64}} 3
-; CHECK:   %[[SPLAT:.*]] = shufflevector <4 x float> %[[INS0]], <4 x float> {{undef|poison}}, <4 x i32> zeroinitializer
+; CHECK:   %[[SPLAT:.*]] = shufflevector <4 x float> %[[INS0]], <4 x float> poison, <4 x i32> zeroinitializer
 ; CHECK:   %{{.*}} = tail call spir_func float @not_scalarizable(<4 x float> noundef %[[SPLAT]])
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scan_fact.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scan_fact.ll
index 39a73d0f013ba..530b01b7a0d88 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scan_fact.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scan_fact.ll
@@ -21,7 +21,7 @@ source_filename = "kernel.opencl"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "spir64-unknown-unknown"
 
-@scan_fact.temp = internal addrspace(3) global [16 x i32] undef, align 4
+@scan_fact.temp = internal addrspace(3) global [16 x i32] poison, align 4
 
 ; Function Attrs: convergent nounwind readonly
 declare i64 @__mux_get_global_id(i32) #0
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_phi.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_phi.ll
index 0476b1e40d4e2..ee78646485d04 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_phi.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_phi.ll
@@ -29,7 +29,7 @@ entry:
   %o = load i32, i32* %oup
   ; do this little compare + phi to throw off the InstCombine pass and ensure
   ; we end up with a phi %struct_type that must be instantiated
-  %s = insertvalue %struct_type undef, i32 %o, 1
+  %s = insertvalue %struct_type poison, i32 %o, 1
   %cmpcall = icmp ult i64 16, %call
   br i1 %cmpcall, label %lower, label %higher
 
@@ -82,10 +82,10 @@ declare void @llvm.memset.p0i8.i32(i8*,i8,i32,i32,i1)
 ; CHECK: %[[V4:[0-9]+]] = extractelement <4 x i32> %[[V2]], {{(i32|i64)}} 1
 ; CHECK: %[[V5:[0-9]+]] = extractelement <4 x i32> %[[V2]], {{(i32|i64)}} 2
 ; CHECK: %[[V6:[0-9]+]] = extractelement <4 x i32> %[[V2]], {{(i32|i64)}} 3
-; CHECK: %[[S24:.+]] = insertvalue %struct_type undef, i32 %[[V3]], 1
-; CHECK: %[[S25:.+]] = insertvalue %struct_type undef, i32 %[[V4]], 1
-; CHECK: %[[S26:.+]] = insertvalue %struct_type undef, i32 %[[V5]], 1
-; CHECK: %[[S27:.+]] = insertvalue %struct_type undef, i32 %[[V6]], 1
+; CHECK: %[[S24:.+]] = insertvalue %struct_type poison, i32 %[[V3]], 1
+; CHECK: %[[S25:.+]] = insertvalue %struct_type poison, i32 %[[V4]], 1
+; CHECK: %[[S26:.+]] = insertvalue %struct_type poison, i32 %[[V5]], 1
+; CHECK: %[[S27:.+]] = insertvalue %struct_type poison, i32 %[[V6]], 1
 
 ; Check if the phi node has been instantiated
 ; CHECK: phi %struct_type [ %{{.+}}, %entry ], [ %{{.+}}, %for.cond ]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll
index d202a1f67fcbc..d6b074d1d266f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll
@@ -67,7 +67,7 @@ define spir_kernel void @sub_group_broadcast(i32 addrspace(1)* %in, i32 addrspac
 ; CHECK: [[EXT:%.*]] = extractelement <4 x i32> [[LD]], i64 0
 ; CHECK: [[BDCAST:%.*]] = call spir_func i32 @__mux_sub_group_broadcast_i32(i32 [[EXT]], i32 0)
 ; CHECK: [[HEAD:%.*]] = insertelement <4 x i32> poison, i32 [[BDCAST]], i64 0
-; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[HEAD]], <4 x i32> {{(undef|poison)}}, <4 x i32> zeroinitializer
+; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[HEAD]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK: store <4 x i32> [[SPLAT]], ptr addrspace(1)
 }
 
@@ -85,7 +85,7 @@ define spir_kernel void @sub_group_broadcast_wider_than_vf(i32 addrspace(1)* %in
 ; CHECK: [[EXT:%.*]] = extractelement <4 x i32> [[LD]], i64 2
 ; CHECK: [[BDCAST:%.*]] = call spir_func i32 @__mux_sub_group_broadcast_i32(i32 [[EXT]], i32 1)
 ; CHECK: [[HEAD:%.*]] = insertelement <4 x i32> poison, i32 [[BDCAST]], i64 0
-; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[HEAD]], <4 x i32> {{(undef|poison)}}, <4 x i32> zeroinitializer
+; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[HEAD]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK: store <4 x i32> [[SPLAT]], ptr addrspace(1)
 }
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions.ll
index 3bc98c18d60a5..c69d993acdd18 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions.ll
@@ -114,7 +114,7 @@ entry:
 ; scalar.
 ; CHECK: [[CALL:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> {{%.*}})
 ; CHECK: %call1 = tail call spir_func i32 @__mux_sub_group_reduce_add_i32(i32 [[CALL]])
-; CHECK: [[INS:%.*]] = insertelement <4 x i32> {{(undef|poison)}}, i32 %call1, {{(i32|i64)}} 0
+; CHECK: [[INS:%.*]] = insertelement <4 x i32> poison, i32 %call1, {{(i32|i64)}} 0
 ; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[INS]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK: store <4 x i32> [[SPLAT]],
 }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation1.ll
index 10f571c41fa94..6ce5f1cfc7ce4 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation1.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation1.ll
@@ -50,9 +50,9 @@ declare i64 @__mux_get_global_id(i32)
 ; CHECK: %[[REASSOC:.+]] = add i32 %uniform1, %uniform2
 
 ; Ensure there is only one vector splat
-; CHECK: %[[SPLATINS:.+]] = insertelement <4 x i32> {{undef|poison}}, i32 %[[REASSOC]], {{(i32|i64)}} 0
-; CHECK-NOT: insertelement <4 x i32> {{undef|poison}}, i32 %{{.+}}, {{(i32|i64)}} 0
+; CHECK: %[[SPLATINS:.+]] = insertelement <4 x i32> poison, i32 %[[REASSOC]], {{(i32|i64)}} 0
+; CHECK-NOT: insertelement <4 x i32> poison, i32 %{{.+}}, {{(i32|i64)}} 0
 
-; CHECK: %[[SPLAT:.+]] = shufflevector <4 x i32> %[[SPLATINS]], <4 x i32> {{undef|poison}}, <4 x i32> zeroinitializer
+; CHECK: %[[SPLAT:.+]] = shufflevector <4 x i32> %[[SPLATINS]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK: %[[RESULT:.+]] = add <4 x i32> %{{.*}}, %[[SPLAT]]
 ; CHECK: store <4 x i32> %vuu{{.*}}, ptr addrspace(1) %{{.+}}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation2.ll
index fcbf4cf948fec..1315a92a7a9d3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation2.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation2.ll
@@ -50,8 +50,8 @@ declare i64 @__mux_get_global_id(i32)
 
 ; The splat of the uniform value
 ; CHECK: %uniform = load
-; CHECK: %[[SPLATINS:.+]] = insertelement <4 x i32> {{undef|poison}}, i32 %uniform, {{(i32|i64)}} 0
-; CHECK: %[[SPLAT:.+]] = shufflevector <4 x i32> %[[SPLATINS]], <4 x i32> {{undef|poison}}, <4 x i32> zeroinitializer
+; CHECK: %[[SPLATINS:.+]] = insertelement <4 x i32> poison, i32 %uniform, {{(i32|i64)}} 0
+; CHECK: %[[SPLAT:.+]] = shufflevector <4 x i32> %[[SPLATINS]], <4 x i32> poison, <4 x i32> zeroinitializer
 
 ; Ensure the two varyings are added together directly
 ; CHECK: %[[REASSOC:.+]] = add <4 x i32> %[[VARYING1]], %[[VARYING2]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation3.ll
index e8a291afbfb7f..10dab1c06440e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation3.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation3.ll
@@ -50,8 +50,8 @@ declare i64 @__mux_get_global_id(i32)
 
 ; The splat of the uniform value
 ; CHECK: %uniform = load
-; CHECK: %[[SPLATINS:.+]] = insertelement <4 x i32> {{undef|poison}}, i32 %uniform, {{(i32|i64)}} 0
-; CHECK: %[[SPLAT:.+]] = shufflevector <4 x i32> %[[SPLATINS]], <4 x i32> {{undef|poison}}, <4 x i32> zeroinitializer
+; CHECK: %[[SPLATINS:.+]] = insertelement <4 x i32> poison, i32 %uniform, {{(i32|i64)}} 0
+; CHECK: %[[SPLAT:.+]] = shufflevector <4 x i32> %[[SPLATINS]], <4 x i32> poison, <4 x i32> zeroinitializer
 
 ; Ensure the two varyings are added together directly
 ; CHECK: %[[REASSOC:.+]] = add <4 x i32> %[[VARYING1]], %[[VARYING2]]

From a951cdee2b0a6b48ae8007a805c53a96ba972dbc Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Fri, 29 Aug 2025 14:04:48 +0100
Subject: [PATCH 176/182] [NFC] Remove unused %pp-llvm-ver.

A number of tests were still using %pp-llvm-ver despite not having any
LLVM-version-conditional checks.
---
 .../vecz/test/lit/llvm/inlined_function_debug_info.ll          | 3 +--
 .../vecz/test/lit/llvm/insert_element_debug_info.ll            | 3 +--
 .../vecz/test/lit/llvm/packetization_debug_info.ll             | 3 +--
 .../compiler_passes/vecz/test/lit/llvm/phi_node_debug_info.ll  | 3 +--
 .../vecz/test/lit/llvm/scalarization_debug_info.ll             | 3 +--
 5 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/inlined_function_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/inlined_function_debug_info.ll
index c14a5a421f95e..ce041960424b9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/inlined_function_debug_info.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/inlined_function_debug_info.ll
@@ -16,8 +16,7 @@
 
 ; Check VECZ debug info for inlined DILocation metadata nodes
 
-; RUN: %pp-llvm-ver -o %t < %s --llvm-ver %LLVMVER
-; RUN: veczc -k functions_one -vecz-passes=builtin-inlining -vecz-simd-width=4 -S < %s | FileCheck %t
+; RUN: veczc -k functions_one -vecz-passes=builtin-inlining -vecz-simd-width=4 -S < %s | FileCheck %s
 
 ; ModuleID = '/tmp/inlined_function.ll'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insert_element_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insert_element_debug_info.ll
index bc53958d2707c..24947313c290b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insert_element_debug_info.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insert_element_debug_info.ll
@@ -18,8 +18,7 @@
 ; intrinsics across all lanes even when scalarization masks disable some
 ; of the lanes. This occurs when we scalarize insertelement instructions.
 
-; RUN: %pp-llvm-ver -o %t < %s --llvm-ver %LLVMVER
-; RUN: veczc -k unaligned_load -vecz-passes="function(instcombine,adce),scalarize,packetizer,instcombine" -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %t
+; RUN: veczc -k unaligned_load -vecz-passes="function(instcombine,adce),scalarize,packetizer,instcombine" -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_debug_info.ll
index eaaba4463f9e0..456414d1c22ff 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_debug_info.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_debug_info.ll
@@ -17,8 +17,7 @@
 ; Check that debug info is preserved in the vectorized kernel.
 ; Specifically that the packetization pass creates vector types
 ; in the DI for the variables.
-; RUN: %pp-llvm-ver -o %t < %s --llvm-ver %LLVMVER
-; RUN: veczc -k add -S < %s | FileCheck %t
+; RUN: veczc -k add -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_node_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_node_debug_info.ll
index 8e6c5a0954238..aabbd65bf1059 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_node_debug_info.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_node_debug_info.ll
@@ -17,8 +17,7 @@
 ; Check that debug info intrinsics are correctly placed after
 ; phi nodes.
 
-; RUN: %pp-llvm-ver -o %t < %s --llvm-ver %LLVMVER
-; RUN: veczc -vecz-simd-width=4 -S < %s | FileCheck %t
+; RUN: veczc -vecz-simd-width=4 -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_debug_info.ll
index 85189b3651eb5..2e7c1a2202c71 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_debug_info.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_debug_info.ll
@@ -18,8 +18,7 @@
 ; Specifically that the scalarization pass doesn't destroy DI
 ; intrinsics attached to the vector instructions it scalarizes.
 
-; RUN: %pp-llvm-ver -o %t < %s --llvm-ver %LLVMVER
-; RUN: veczc -k mul2 -vecz-passes="scalarize,function(mem2reg)" -vecz-choices=FullScalarization -S < %s | FileCheck %t
+; RUN: veczc -k mul2 -vecz-passes="scalarize,function(mem2reg)" -vecz-choices=FullScalarization -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

From 3720900b7e09d1ab17c0d3cd4d94c03439ffd23b Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Fri, 29 Aug 2025 16:01:01 +0100
Subject: [PATCH 177/182] Switch to LLVM versions 20/21 as supported versions.

LLVM 21.1.0 has been released, so by our policy of supporting the two
most recent LLVM releases, we can now move from LLVM 19/20 to
LLVM 20/21.
---
 .../include/multi_llvm/dibuilder.h            | 101 ------------------
 .../include/multi_llvm/instructions.h         |   3 -
 .../include/multi_llvm/instructions.inc       |  14 +--
 .../include/multi_llvm/intrinsic.h            |  14 +--
 .../include/multi_llvm/loop_utils.h           |  37 -------
 .../include/multi_llvm/multi_llvm.h           |   1 -
 .../compiler_pipeline/source/builtin_info.cpp |   2 +-
 .../optimal_builtin_replacement_pass.cpp      |   3 +-
 .../transform/packetization_helpers.cpp       |   5 +-
 .../source/transform/pre_linearize_pass.cpp   |   2 +-
 .../vecz/source/transform/scalarizer.cpp      |   4 +-
 .../vecz/source/vector_target_info.cpp        |   5 +-
 .../vecz/source/vector_target_info_arm.cpp    |   5 +-
 .../vecz/source/vectorization_helpers.cpp     |   4 +-
 .../packetize_mask_varying.ll                 |   8 +-
 .../vecz/test/lit/llvm/irreducible_loop.ll    |  12 +--
 16 files changed, 21 insertions(+), 199 deletions(-)
 delete mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/dibuilder.h
 delete mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/loop_utils.h

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/dibuilder.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/dibuilder.h
deleted file mode 100644
index 487315a8077a2..0000000000000
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/dibuilder.h
+++ /dev/null
@@ -1,101 +0,0 @@
-// Copyright (C) Codeplay Software Limited
-//
-// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
-// Exceptions; you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-// License for the specific language governing permissions and limitations
-// under the License.
-//
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-#ifndef MULTI_LLVM_DIBUILDER_H_INCLUDED
-#define MULTI_LLVM_DIBUILDER_H_INCLUDED
-
-#include <llvm/IR/DIBuilder.h>
-#include <multi_llvm/llvm_version.h>
-
-#include <type_traits>
-
-namespace multi_llvm {
-// TODO In order to enable the use of OCK in DPC++ which currently uses the
-// older DIBuilder interface, we do not yet condition this on LLVM version, we
-// dynamically detect which version of DIBuilder we have. This should be updated
-// after DPC++'s next pulldown to drop the use of DIBuilderWrapperNeeded and
-// base it entirely on LLVM major version.
-#if LLVM_VERSION_GREATER_EQUAL(20, 0) && 0
-using DIBuilder = llvm::DIBuilder;
-#else
-template <typename DIBuilder>
-struct DIBuilderWrapper : DIBuilder {
-  using DIBuilder::DIBuilder;
-
-  llvm::BasicBlock *getBasicBlock(llvm::InsertPosition InsertPt) {
-    return InsertPt.getBasicBlock();
-  }
-
-  auto insertDeclare(llvm::Value *Storage, llvm::DILocalVariable *VarInfo,
-                     llvm::DIExpression *Expr, const llvm::DILocation *DL,
-                     llvm::BasicBlock *InsertAtEnd) {
-    return DIBuilder::insertDeclare(Storage, VarInfo, Expr, DL, InsertAtEnd);
-  }
-
-  auto insertDeclare(llvm::Value *Storage, llvm::DILocalVariable *VarInfo,
-                     llvm::DIExpression *Expr, const llvm::DILocation *DL,
-                     llvm::BasicBlock::iterator InsertPt) {
-    auto *InsertBB = getBasicBlock(InsertPt);
-    if (InsertPt == InsertBB->end()) {
-      return DIBuilder::insertDeclare(Storage, VarInfo, Expr, DL, InsertBB);
-    } else {
-      return DIBuilder::insertDeclare(Storage, VarInfo, Expr, DL, &*InsertPt);
-    }
-  }
-
-  auto insertDbgValueIntrinsic(llvm::Value *Val, llvm::DILocalVariable *VarInfo,
-                               llvm::DIExpression *Expr,
-                               const llvm::DILocation *DL,
-                               llvm::BasicBlock *InsertAtEnd) {
-    return DIBuilder::insertDbgValueIntrinsic(Val, VarInfo, Expr, DL,
-                                              InsertAtEnd);
-  }
-
-  auto insertDbgValueIntrinsic(llvm::Value *Val, llvm::DILocalVariable *VarInfo,
-                               llvm::DIExpression *Expr,
-                               const llvm::DILocation *DL,
-                               llvm::BasicBlock::iterator InsertPt) {
-    auto *InsertBB = getBasicBlock(InsertPt);
-    if (InsertPt == InsertBB->end()) {
-      return DIBuilder::insertDbgValueIntrinsic(Val, VarInfo, Expr, DL,
-                                                InsertBB);
-    } else {
-      return DIBuilder::insertDbgValueIntrinsic(Val, VarInfo, Expr, DL,
-                                                &*InsertPt);
-    }
-  }
-};
-
-template <typename DIBuilder, typename = void>
-static constexpr bool DIBuilderWrapperNeeded = true;
-
-template <typename DIBuilder>
-static constexpr bool DIBuilderWrapperNeeded<
-    DIBuilder, std::void_t<decltype(std::declval<DIBuilder &>().insertLabel(
-                   std::declval<llvm::DILabel *>(),
-                   std::declval<const llvm::DILocation *>(),
-                   std::declval<llvm::BasicBlock::iterator>()))>> = false;
-
-template <typename DIBuilder>
-using DIBuilderMaybeWrapped =
-    std::conditional_t<DIBuilderWrapperNeeded<DIBuilder>,
-                       DIBuilderWrapper<DIBuilder>, DIBuilder>;
-
-using DIBuilder = DIBuilderMaybeWrapped<llvm::DIBuilder>;
-#endif
-}  // namespace multi_llvm
-
-#endif  // MULTI_LLVM_DIBUILDER_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.h
index 19ec81efc57ef..fabe42ae57a93 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.h
@@ -36,9 +36,6 @@ struct BinOpHelper;
 #define LLVM 20
 #include <multi_llvm/instructions.inc>
 #undef LLVM
-#define LLVM 19
-#include <multi_llvm/instructions.inc>
-#undef LLVM
 
 }  // namespace detail
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.inc b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.inc
index 80ae42c23fe4f..d9049b2584bf4 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.inc
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.inc
@@ -14,20 +14,13 @@
 //
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-#if LLVM == 19
-template <typename T>
-struct BinOpHelper<T, std::enable_if_t<T::LAST_BINOP - T::FIRST_BINOP == 16>>
-#define BINOP_LLVM20(OP, STR)
-#define BINOP_LLVM21(OP, STR)
-#elif LLVM == 20
+#if LLVM == 20
 template <typename T>
 struct BinOpHelper<T, std::enable_if_t<T::LAST_BINOP - T::FIRST_BINOP == 18>>
-#define BINOP_LLVM20(OP, STR) BINOP(OP, STR)
 #define BINOP_LLVM21(OP, STR)
 #elif LLVM == 21
 template <typename T, typename>
 struct BinOpHelper
-#define BINOP_LLVM20(OP, STR) BINOP(OP, STR)
 #define BINOP_LLVM21(OP, STR) BINOP(OP, STR)
 #endif
 {
@@ -51,8 +44,8 @@ struct BinOpHelper
   BINOP_LLVM21(FMinimum, "fminumum") \
   BINOP(UIncWrap, "uincwrap")        \
   BINOP(UDecWrap, "udecwrap")        \
-  BINOP_LLVM20(USubCond, "usubcond") \
-  BINOP_LLVM20(USubSat, "usubsat")
+  BINOP(USubCond, "usubcond")        \
+  BINOP(USubSat, "usubsat")
 
   static std::optional<T> consume_front_with_underscore(
       llvm::StringRef &String) {
@@ -79,6 +72,5 @@ struct BinOpHelper
   }
 
 #undef BINOPS
-#undef BINOP_LLVM20
 #undef BINOP_LLVM21
 };
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/intrinsic.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/intrinsic.h
index f8e5800fa901c..3df2c19ae805f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/intrinsic.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/intrinsic.h
@@ -22,19 +22,7 @@
 
 namespace multi_llvm {
 
-static inline auto GetOrInsertIntrinsicDeclaration(
-    llvm::Module *M, llvm::Intrinsic::ID id,
-    llvm::ArrayRef<llvm::Type *> Tys = {}) {
-#if LLVM_VERSION_GREATER_EQUAL(20, 0)
-  return llvm::Intrinsic::getOrInsertDeclaration(M, id, Tys);
-#else
-  return llvm::Intrinsic::getDeclaration(M, id, Tys);
-#endif
-}
-
-// Drop getAttributes workaround when LLVM 20 is minimum version
-// This can also be simplified once DPC++ catches up with getAttributes
-// with FunctionType as the last argument.
+// Drop getAttributes workaround when LLVM 21 is minimum version
 namespace detail {
 template <typename... T>
 auto getAttributes(T... args)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/loop_utils.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/loop_utils.h
deleted file mode 100644
index 2d6e8ba84b242..0000000000000
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/loop_utils.h
+++ /dev/null
@@ -1,37 +0,0 @@
-// Copyright (C) Codeplay Software Limited
-//
-// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
-// Exceptions; you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-// License for the specific language governing permissions and limitations
-// under the License.
-//
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-#ifndef MULTI_LLVM_LOOP_UTILS_H_INCLUDED
-#define MULTI_LLVM_LOOP_UTILS_H_INCLUDED
-
-#include <llvm/Transforms/Utils/LoopUtils.h>
-#include <multi_llvm/llvm_version.h>
-
-namespace multi_llvm {
-
-inline llvm::Value *createSimpleReduction(llvm::IRBuilderBase &B,
-                                          llvm::Value *Src,
-                                          llvm::RecurKind RdxKind) {
-#if LLVM_VERSION_MAJOR >= 20
-  return llvm::createSimpleReduction(B, Src, RdxKind);
-#else
-  return llvm::createSimpleTargetReduction(B, Src, RdxKind);
-#endif
-}
-
-}  // namespace multi_llvm
-
-#endif  // MULTI_LLVM_LOOP_UTILS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
index f80557fcf4810..34ee6707448a5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
@@ -18,6 +18,5 @@
 #define MULTI_LLVM_MULTI_LLVM_H_INCLUDED
 
 #include <multi_llvm/llvm_version.h>
-#include <multi_llvm/loop_utils.h>
 
 #endif  // MULTI_LLVM_MULTI_LLVM_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/builtin_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/builtin_info.cpp
index 3242031d61c6f..dc85cd0ad508d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/builtin_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/builtin_info.cpp
@@ -545,7 +545,7 @@ Function *BuiltinInfo::getScalarEquivalent(const Builtin &B, Module *M) {
     Type *ScalarType = VecRetTy->getElementType();
     // Get the scalar version of the intrinsic
     Function *ScalarIntrinsic =
-        multi_llvm::GetOrInsertIntrinsicDeclaration(M, IntrinsicID, ScalarType);
+        Intrinsic::getOrInsertDeclaration(M, IntrinsicID, ScalarType);
 
     return ScalarIntrinsic;
   }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/optimal_builtin_replacement_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/optimal_builtin_replacement_pass.cpp
index 832389282760d..e7610f89c5525 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/optimal_builtin_replacement_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/optimal_builtin_replacement_pass.cpp
@@ -28,7 +28,6 @@
 #include <llvm/IR/Intrinsics.h>
 #include <llvm/IR/Module.h>
 #include <llvm/TargetParser/Triple.h>
-#include <multi_llvm/intrinsic.h>
 #include <multi_llvm/vector_type_helper.h>
 
 #define DEBUG_TYPE "ca-optimal-builtins"
@@ -63,7 +62,7 @@ Value *OptimalBuiltinReplacementPass::replaceAbacusCLZ(
   // Get the declaration for the intrinsic
   auto *const ArgTy = Args[0]->getType();
   auto *const Intrinsic =
-      multi_llvm::GetOrInsertIntrinsicDeclaration(M, Intrinsic::ctlz, ArgTy);
+      llvm::Intrinsic::getOrInsertDeclaration(M, Intrinsic::ctlz, ArgTy);
   // If we didn't find the intrinsic or the return type isn't what we
   // expect, skip this optimization
   Function *Callee = CB.getCalledFunction();
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
index bac6b19789706..996881c8b16ab 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
@@ -30,7 +30,6 @@
 #include <llvm/IR/Instructions.h>
 #include <llvm/IR/Intrinsics.h>
 #include <llvm/Transforms/Utils/LoopUtils.h>
-#include <multi_llvm/intrinsic.h>
 #include <multi_llvm/vector_type_helper.h>
 
 #include "debugging.h"
@@ -241,7 +240,7 @@ Value *createMaybeVPReduction(IRBuilderBase &B, Value *Val, RecurKind Kind,
   assert(isa<VectorType>(Val->getType()) && "Must be vector type");
   // If VL is null, it's not a vector-predicated reduction.
   if (!VL) {
-    return multi_llvm::createSimpleReduction(B, Val, Kind);
+    return createSimpleReduction(B, Val, Kind);
   }
   auto IntrinsicOp = Intrinsic::not_intrinsic;
   switch (Kind) {
@@ -290,7 +289,7 @@ Value *createMaybeVPReduction(IRBuilderBase &B, Value *Val, RecurKind Kind,
       break;
   }
 
-  auto *const F = multi_llvm::GetOrInsertIntrinsicDeclaration(
+  auto *const F = Intrinsic::getOrInsertDeclaration(
       B.GetInsertBlock()->getModule(), IntrinsicOp, Val->getType());
   assert(F && "Could not declare vector-predicated reduction intrinsic");
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/pre_linearize_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/pre_linearize_pass.cpp
index 05913dbe3cca1..e82ec366c8f85 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/pre_linearize_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/pre_linearize_pass.cpp
@@ -120,7 +120,7 @@ InstructionCost calculateBoolReductionCost(LLVMContext &context, Module *module,
   auto *F = Function::Create(new_fty, Function::InternalLinkage, "tmp", module);
   auto *BB = BasicBlock::Create(context, "reduce", F);
   IRBuilder<> B(BB);
-  multi_llvm::createSimpleReduction(B, &*F->arg_begin(), RecurKind::And);
+  createSimpleReduction(B, &*F->arg_begin(), RecurKind::And);
   const InstructionCost cost = calculateBlockCost(*BB, TTI);
 
   // We don't really need that function in the module anymore because it's
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
index 5b79287c14959..63b33c02f3a44 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
@@ -20,6 +20,7 @@
 #include <llvm/ADT/SmallPtrSet.h>
 #include <llvm/ADT/Statistic.h>
 #include <llvm/Analysis/InstructionSimplify.h>
+#include <llvm/IR/DIBuilder.h>
 #include <llvm/IR/DebugInfoMetadata.h>
 #include <llvm/IR/IRBuilder.h>
 #include <llvm/IR/Instructions.h>
@@ -27,7 +28,6 @@
 #include <llvm/IR/Intrinsics.h>
 #include <llvm/Support/Debug.h>
 #include <llvm/Support/raw_ostream.h>
-#include <multi_llvm/dibuilder.h>
 #include <multi_llvm/multi_llvm.h>
 #include <multi_llvm/vector_type_helper.h>
 
@@ -683,7 +683,7 @@ void Scalarizer::scalarizeDI(Instruction *Original, const SimdPacket *Packet,
   // instructions and is used to avoid duplicate LLVM dbg.value's.
   SmallPtrSet<Value *, 4> VectorElements;
 
-  multi_llvm::DIBuilder DIB(*Original->getModule(), false);
+  DIBuilder DIB(*Original->getModule(), false);
 
   for (DbgVariableRecord *const DVR : LAM->getAllDbgVariableRecordUsers()) {
     DILocalVariable *DILocal = nullptr;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
index e019994cb71ff..2ab49546fea1e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
@@ -20,7 +20,6 @@
 #include <llvm/MC/TargetRegistry.h>
 #include <llvm/Target/TargetMachine.h>
 #include <llvm/TargetParser/Triple.h>
-#include <multi_llvm/intrinsic.h>
 #include <multi_llvm/target_transform_info.h>
 #include <multi_llvm/vector_type_helper.h>
 
@@ -504,7 +503,7 @@ Value *TargetInfo::createMaskedGatherLoad(IRBuilder<> &B, Type *Ty, Value *Ptr,
       const SmallVector<llvm::Type *, 2> Tys = {Ty, VecPtrTy};
       return B.CreateIntrinsic(llvm::Intrinsic::vp_gather, Tys, Args);
     } else if (Legality.isMaskLegal()) {
-      Function *MaskedGather = multi_llvm::GetOrInsertIntrinsicDeclaration(
+      Function *MaskedGather = Intrinsic::getOrInsertDeclaration(
           F->getParent(), Intrinsic::masked_gather, {Ty, VecPtrTy});
 
       if (MaskedGather) {
@@ -606,7 +605,7 @@ Value *TargetInfo::createMaskedScatterStore(IRBuilder<> &B, Value *Data,
       const SmallVector<llvm::Type *, 2> Tys = {Data->getType(), VecPtrTy};
       return B.CreateIntrinsic(llvm::Intrinsic::vp_scatter, Tys, Args);
     } else if (Legality.isMaskLegal()) {
-      Function *MaskedScatter = multi_llvm::GetOrInsertIntrinsicDeclaration(
+      Function *MaskedScatter = Intrinsic::getOrInsertDeclaration(
           F->getParent(), Intrinsic::masked_scatter, {DataTy, VecPtrTy});
 
       if (MaskedScatter) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_arm.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_arm.cpp
index 892c81483cb84..148db144756f3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_arm.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_arm.cpp
@@ -18,7 +18,6 @@
 #include <llvm/IR/Instructions.h>
 #include <llvm/IR/IntrinsicsAArch64.h>
 #include <llvm/IR/IntrinsicsARM.h>
-#include <multi_llvm/intrinsic.h>
 #include <multi_llvm/vector_type_helper.h>
 
 #include "debugging.h"
@@ -224,7 +223,7 @@ bool TargetInfoArm::optimizeInterleavedGroup(IRBuilder<> &B,
     Tys = {VecTy, PtrTy};
   }
 
-  Function *IntrFn = multi_llvm::GetOrInsertIntrinsicDeclaration(
+  Function *IntrFn = Intrinsic::getOrInsertDeclaration(
       Op0->getModule(), (Intrinsic::ID)IntrID, Tys);
   if (!IntrFn) {
     return false;
@@ -379,7 +378,7 @@ bool TargetInfoAArch64::optimizeInterleavedGroup(
     VecTy = cast<FixedVectorType>(Op0->getType());
   }
 
-  Function *IntrFn = multi_llvm::GetOrInsertIntrinsicDeclaration(
+  Function *IntrFn = Intrinsic::getOrInsertDeclaration(
       Op0->getModule(), (Intrinsic::ID)IntrID, {VecTy, PtrTy});
   if (!IntrFn) {
     return false;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_helpers.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_helpers.cpp
index 38e7752532d45..24c892f7b606a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_helpers.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_helpers.cpp
@@ -19,11 +19,11 @@
 #include <compiler/utils/attributes.h>
 #include <compiler/utils/metadata.h>
 #include <llvm/IR/Attributes.h>
+#include <llvm/IR/DIBuilder.h>
 #include <llvm/IR/IntrinsicInst.h>
 #include <llvm/Support/Debug.h>
 #include <llvm/Support/TypeSize.h>
 #include <llvm/Transforms/Utils/Cloning.h>
-#include <multi_llvm/dibuilder.h>
 
 #include <optional>
 
@@ -274,7 +274,7 @@ void cloneDebugInfo(const VectorizationUnit &VU) {
   }
 
   // Create a DISubprogram entry for the vectorized kernel
-  multi_llvm::DIBuilder DIB(*VU.scalarFunction()->getParent(), false);
+  DIBuilder DIB(*VU.scalarFunction()->getParent(), false);
   DICompileUnit *CU =
       DIB.createCompileUnit(dwarf::DW_LANG_OpenCL, ScalarDI->getFile(), "",
                             ScalarDI->isOptimized(), "", 0);
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/packetize_mask_varying.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/packetize_mask_varying.ll
index 5c1df71ed9475..0ce65b9f4ca00 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/packetize_mask_varying.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/packetize_mask_varying.ll
@@ -14,8 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %pp-llvm-ver -o %t < %s --llvm-ver %LLVMVER
-; RUN: veczc -k mask_varying -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | FileCheck %t
+; RUN: veczc -k mask_varying -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -40,10 +39,7 @@ if.end:
 ; CHECK: define spir_kernel void @__vecz_nxv4_vp_mask_varying
 ; CHECK: [[CMP:%.*]] = icmp slt <vscale x 4 x i64> %{{.*}},
 ; CHECK: [[RED:%.*]] = call i1 @llvm.vp.reduce.or.nxv4i1(i1 false, <vscale x 4 x i1> [[CMP]], {{.*}}, i32 {{.*}})
-; CHECK-LT20: [[REINS:%.*]] = insertelement <4 x i1> poison, i1 [[RED]], {{(i32|i64)}} 0
-; CHECK-LT20: [[RESPLAT:%.*]] = shufflevector <4 x i1> [[REINS]], <4 x i1> poison, <4 x i32> zeroinitializer
-; CHECK-LT20: [[VAL:%.*]] = call <4 x i32> @__vecz_b_masked_load16_Dv4_ju3ptrDv4_b(ptr %aptr, <4 x i1> [[RESPLAT]])
-; CHECK-GE20: [[VAL:%.*]] = load <4 x i32>, ptr %aptr
+; CHECK: [[VAL:%.*]] = load <4 x i32>, ptr %aptr
 }
 
 declare i64 @__mux_get_global_id(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/irreducible_loop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/irreducible_loop.ll
index e81e139e52dad..770a31740a8b2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/irreducible_loop.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/irreducible_loop.ll
@@ -14,8 +14,7 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; RUN: %pp-llvm-ver -o %t < %s --llvm-ver %LLVMVER
-; RUN: veczc -k irreducible_loop -S < %s | FileCheck %t
+; RUN: veczc -k irreducible_loop -S < %s | FileCheck %s
 
 ; ModuleID = 'Unknown buffer'
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -49,17 +48,10 @@ declare i64 @__mux_get_global_id(i32)
 
 ; CHECK: define spir_kernel void @__vecz_v4_irreducible_loop
 ; CHECK: entry:
-; CHECK-LT20:   br label %irr.guard.outer
-
-; CHECK-LT20: irr.guard.outer:                                  ; preds = %irr.guard.pure_exit, %entry
 ; CHECK:   br label %irr.guard
 
-; CHECK-LT20: do.end:                                           ; preds = %irr.guard.pure_exit
-; CHECK-LT20:   ret void
-
 ; CHECK: irr.guard:
 ; CHECK:   br i1 %{{.+}}, label %irr.guard.pure_exit, label %irr.guard
 
 ; CHECK: irr.guard.pure_exit:                              ; preds = %irr.guard
-; CHECK-LT20:   br i1 %{{.+}}, label %do.end, label %irr.guard.outer
-; CHECK-GE20:   ret void
+; CHECK:   ret void

From ac9ee79f4e2474fad76f5c0b0ff56f8504cfce78 Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Fri, 29 Aug 2025 21:45:54 +0100
Subject: [PATCH 178/182] Remove outdated test.

undef_debug_info.ll was originally added to test that we did not
generate debug info containing undef, because we were unable to handle
undef in debug info properly. Later on, once we were able to handle it
properly, I removed (in #375) the logic for keeping undef out of debug
info, at which point this test no longer tested anything useful. This PR
finally removes it.
---
 .../vecz/test/lit/llvm/undef_debug_info.ll    | 116 ------------------
 1 file changed, 116 deletions(-)
 delete mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/undef_debug_info.ll

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/undef_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/undef_debug_info.ll
deleted file mode 100644
index 966bebfc59fe6..0000000000000
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/undef_debug_info.ll
+++ /dev/null
@@ -1,116 +0,0 @@
-; Copyright (C) Codeplay Software Limited
-;
-; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
-; Exceptions; you may not use this file except in compliance with the License.
-; You may obtain a copy of the License at
-;
-;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
-;
-; Unless required by applicable law or agreed to in writing, software
-; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-; License for the specific language governing permissions and limitations
-; under the License.
-;
-; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-; Check that debug info intrinsics aren't created using undef values.
-; These cause the backend to assert in codegen.
-
-; RUN: veczc -k test_fn -S < %s | FileCheck %s
-
-; ModuleID = 'kernel.opencl'
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "spir64-unknown-unknown"
-
-; Function Attrs: nounwind
-define spir_kernel void @test_fn(i16 addrspace(1)* %src, <4 x i16> addrspace(1)* %dst) #0 !dbg !4 {
-entry:
-  %src.addr = alloca i16 addrspace(1)*, align 8
-  %dst.addr = alloca <4 x i16> addrspace(1)*, align 8
-  %tid = alloca i32, align 4
-  %tmp = alloca <4 x i16>, align 8
-  store i16 addrspace(1)* %src, i16 addrspace(1)** %src.addr, align 8
-  call void @llvm.dbg.declare(metadata i16 addrspace(1)** %src.addr, metadata !18, metadata !32), !dbg !33
-  store <4 x i16> addrspace(1)* %dst, <4 x i16> addrspace(1)** %dst.addr, align 8
-  call void @llvm.dbg.declare(metadata <4 x i16> addrspace(1)** %dst.addr, metadata !19, metadata !32), !dbg !33
-  call void @llvm.dbg.declare(metadata i32* %tid, metadata !20, metadata !32), !dbg !34
-  %call = call i64 @__mux_get_global_id(i32 0) #3, !dbg !34
-  %conv = trunc i64 %call to i32, !dbg !34
-  store i32 %conv, i32* %tid, align 4, !dbg !34
-  call void @llvm.dbg.declare(metadata <4 x i16>* %tmp, metadata !22, metadata !32), !dbg !35
-  %0 = load i32, i32* %tid, align 4, !dbg !35
-  %conv1 = sext i32 %0 to i64, !dbg !35
-  %1 = load i16 addrspace(1)*, i16 addrspace(1)** %src.addr, align 8, !dbg !35
-  %call2 = call spir_func <3 x i16> @_Z6vload3mPKU3AS1t(i64 %conv1, i16 addrspace(1)* %1) #3, !dbg !35
-  %call3 = call spir_func <4 x i16> @_Z9as_short4Dv3_t(<3 x i16> %call2) #3, !dbg !35
-  store <4 x i16> %call3, <4 x i16>* %tmp, align 8, !dbg !35
-  %2 = load <4 x i16>, <4 x i16>* %tmp, align 8, !dbg !36
-  %3 = load i32, i32* %tid, align 4, !dbg !36
-  %idxprom = sext i32 %3 to i64, !dbg !36
-  %4 = load <4 x i16> addrspace(1)*, <4 x i16> addrspace(1)** %dst.addr, align 8, !dbg !36
-  %arrayidx = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %4, i64 %idxprom, !dbg !36
-  store <4 x i16> %2, <4 x i16> addrspace(1)* %arrayidx, align 8, !dbg !36
-  ret void, !dbg !37
-}
-
-; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
-
-declare i64 @__mux_get_global_id(i32) #2
-
-declare spir_func <4 x i16> @_Z9as_short4Dv3_t(<3 x i16>) #2
-
-declare spir_func <3 x i16> @_Z6vload3mPKU3AS1t(i64, i16 addrspace(1)*) #2
-
-attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nobuiltin }
-
-!llvm.dbg.cu = !{!0}
-!opencl.kernels = !{!23}
-!llvm.module.flags = !{!30}
-!llvm.ident = !{!31}
-
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.8.1 ", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2)
-!1 = !DIFile(filename: "kernel.opencl", directory: "/tmp")
-!2 = !{}
-!3 = !{!4}
-!4 = distinct !DISubprogram(name: "test_fn", scope: !1, file: !1, line: 2, type: !5, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !17)
-!5 = !DISubroutineType(types: !6)
-!6 = !{null, !7, !11}
-!7 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !8, size: 64, align: 64)
-!8 = !DIDerivedType(tag: DW_TAG_typedef, name: "ushort", file: !9, line: 29, baseType: !10)
-!9 = !DIFile(filename: "builtins/include/builtins/builtins.h", directory: "/tmp")
-!10 = !DIBasicType(name: "unsigned short", size: 16, align: 16, encoding: DW_ATE_unsigned)
-!11 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !12, size: 64, align: 64)
-!12 = !DIDerivedType(tag: DW_TAG_typedef, name: "short4", file: !9, line: 55, baseType: !13)
-!13 = !DICompositeType(tag: DW_TAG_array_type, baseType: !14, size: 64, align: 64, flags: DIFlagVector, elements: !15)
-!14 = !DIBasicType(name: "short", size: 16, align: 16, encoding: DW_ATE_signed)
-!15 = !{!16}
-!16 = !DISubrange(count: 4)
-!17 = !{!18, !19, !20, !22}
-!18 = !DILocalVariable(name: "src", arg: 1, scope: !4, file: !1, line: 2, type: !7)
-!19 = !DILocalVariable(name: "dst", arg: 2, scope: !4, file: !1, line: 2, type: !11)
-!20 = !DILocalVariable(name: "tid", scope: !4, file: !1, line: 4, type: !21)
-!21 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
-!22 = !DILocalVariable(name: "tmp", scope: !4, file: !1, line: 5, type: !12)
-!23 = !{void (i16 addrspace(1)*, <4 x i16> addrspace(1)*)* @test_fn, !24, !25, !26, !27, !28, !29}
-!24 = !{!"kernel_arg_addr_space", i32 1, i32 1}
-!25 = !{!"kernel_arg_access_qual", !"none", !"none"}
-!26 = !{!"kernel_arg_type", !"ushort*", !"short4*"}
-!27 = !{!"kernel_arg_base_type", !"ushort*", !"short __attribute__((ext_vector_type(4)))*"}
-!28 = !{!"kernel_arg_type_qual", !"", !""}
-!29 = !{!"reqd_work_group_size", i32 32, i32 1, i32 1}
-!30 = !{i32 2, !"Debug Info Version", i32 3}
-!31 = !{!"clang version 3.8.1 "}
-!32 = !DIExpression()
-!33 = !DILocation(line: 2, scope: !4)
-!34 = !DILocation(line: 4, scope: !4)
-!35 = !DILocation(line: 5, scope: !4)
-!36 = !DILocation(line: 6, scope: !4)
-!37 = !DILocation(line: 7, scope: !4)
-
-; Vectorized kernel function
-; CHECK: @__vecz_v[[WIDTH:[0-9]+]]_test_fn({{.*}} !dbg {{![0-9]+}}

From 93336499b19b5564f2cbb02ec7bbc6fadd889429 Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Tue, 2 Sep 2025 13:04:02 +0100
Subject: [PATCH 179/182] [NFC] Remove leftover older LLVM handling.

We require LLVM 20 or newer, so a whole lot of version checks are no
longer necessary, and a number of tests that were only running on older
LLVM versions no longer serve a purpose. Remove the checks and tests as
appropriate.
---
 .../lit/llvm/Boscc/partial_linearization22.ll | 348 ------------------
 .../test/lit/llvm/RISCV/broadcast_vector.ll   |   1 -
 .../lit/llvm/RISCV/define_subgroup_scans.ll   |   1 -
 .../llvm/RISCV/define_subgroup_scans_vp.ll    |   1 -
 .../test/lit/llvm/RISCV/extract_element.ll    |   1 -
 .../test/lit/llvm/RISCV/insert_element.ll     |   1 -
 .../vecz/test/lit/llvm/RISCV/lit.local.cfg    |   5 +-
 .../test/lit/llvm/RISCV/packetize_shuffle.ll  |   1 -
 .../lit/llvm/RISCV/packetize_shuffle_bool.ll  |   1 -
 .../llvm/RISCV/packetize_shuffle_concat.ll    |   1 -
 .../llvm/RISCV/packetize_shuffle_narrow.ll    |   1 -
 .../lit/llvm/RISCV/packetize_shuffle_wider.ll |   1 -
 .../lit/llvm/RISCV/select_scalar_vector.ll    |   1 -
 .../vecz/test/lit/llvm/RISCV/vp_memops.ll     |   1 -
 .../vecz/test/lit/llvm/RISCV/vp_vsetvli.ll    |   2 -
 .../define_interleaved_store.ll               |   1 -
 .../define_interleaved_store_as_masked.ll     |   1 -
 .../ScalableVectors/define_masked_load.ll     |   1 -
 .../define_masked_scatter_gather.ll           |   1 -
 .../ScalableVectors/define_subgroup_scans.ll  |   1 -
 .../define_subgroup_scans_vp.ll               |   1 -
 .../llvm/ScalableVectors/extract_element.ll   |   1 -
 .../llvm/ScalableVectors/insert_element.ll    |   1 -
 .../llvm/ScalableVectors/interleaved_load.ll  |   1 -
 .../lit/llvm/ScalableVectors/intrinsics.ll    |   1 -
 .../lit/llvm/ScalableVectors/lit.local.cfg    |  18 -
 .../ScalableVectors/packetize_mask_varying.ll |   1 -
 .../ScalableVectors/select_scalar_vector.ll   |   1 -
 .../test/lit/llvm/ScalableVectors/shuffle.ll  |   1 -
 .../llvm/ScalableVectors/subgroup_builtins.ll |   1 -
 .../ScalableVectors/verification_fail_phi.ll  |   1 -
 .../lit/llvm/ScalableVectors/widen_vload.ll   |   1 -
 .../llvm/ScalableVectors/workitem_funcs.ll    |   1 -
 .../llvm/VectorPredication/boscc_reduction.ll |   1 -
 .../test/lit/llvm/VectorPredication/choice.ll |   1 -
 .../compute_vector_length.ll                  |   1 -
 .../define_interleaved_load_store.ll          |   1 -
 .../define_masked_load_store.ll               |   1 -
 .../define_masked_scatter_gather.ll           |   1 -
 .../define_subgroup_scans.ll                  |   1 -
 .../llvm/VectorPredication/load_add_store.ll  |   2 -
 .../llvm/VectorPredication/scatter_gather.ll  |   1 -
 .../test/lit/llvm/VectorPredication/udiv.ll   |   2 -
 .../test/lit/llvm/VectorWidening/widen_abs.ll |   1 -
 .../lit/llvm/VectorWidening/widen_copysign.ll |   1 -
 .../vecz/test/lit/llvm/gep_elim_opaque.ll     |   1 -
 .../lit/llvm/multiple_vectorization_flags.ll  |   1 -
 .../llvm/partial_linearization22-llvm18.ll    | 264 -------------
 .../test/lit/llvm/partial_linearization22.ll  |   1 -
 .../test/lit/llvm/pass_pipeline_printafter.ll |   1 -
 50 files changed, 1 insertion(+), 683 deletions(-)
 delete mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization22.ll
 delete mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/lit.local.cfg
 delete mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization22-llvm18.ll

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization22.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization22.ll
deleted file mode 100644
index 716511b063592..0000000000000
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization22.ll
+++ /dev/null
@@ -1,348 +0,0 @@
-; Copyright (C) Codeplay Software Limited
-;
-; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
-; Exceptions; you may not use this file except in compliance with the License.
-; You may obtain a copy of the License at
-;
-;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
-;
-; Unless required by applicable law or agreed to in writing, software
-; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-; License for the specific language governing permissions and limitations
-; under the License.
-;
-; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-; REQUIRES: llvm-8.0-only
-; RUN: veczc -k partial_linearization22 -vecz-passes="function(lowerswitch),vecz-loop-rotate,function(loop(indvars)),cfg-convert" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s
-
-; The CFG of the following kernel is:
-;
-;     a
-;     |
-;     b <------.
-;    / \       |
-;   f   c <--. |
-;   |\ / \   | |
-;   | |   d -' |
-;   | |\ / \   |
-;   | | |   e -'
-;   | | |\ /
-;   | | | g
-;   | | |/
-;   | | /
-;    \|/
-;     h
-;
-; * where nodes b, d, and e are uniform branches, and node c is a varying
-;   branch.
-; * where nodes b, d, e and f are divergent.
-;
-; With BOSCC, it will be transformed as follows:
-;
-;     a
-;     |
-;     b <------.     b' <--.
-;    / \       |    /|     |
-;   f   c <--. |   / c' <. |
-;   |\ / \___|_|_ f' |   | |
-;   | |   d -' | `|- d' -' |
-;   | |\ / \   |  |  |     |
-;   | | |   e -'  |  e' ---'
-;   | | |\ /       \ |
-;   | | | g         \|
-;   | | |/           g'
-;   | | /            |
-;    \|/             h'
-;     h ----> & <---'
-;
-; where '&' represents merge blocks of BOSCC regions.
-;
-; __kernel void partial_linearization22(__global int *out, int n) {
-;   int id = get_global_id(0);
-;   int ret = 0;
-;
-;   while (1) {
-;     if (n > 0 && n < 5) {
-;       goto f;
-;     }
-;     while (1) {
-;       if (n <= 2) {
-;         goto f;
-;       } else {
-;         if (ret + id >= n) {
-;           goto d;
-;         }
-;       }
-;       if (n & 1) {
-;         goto h;
-;       }
-;
-; d:
-;       if (n > 3) {
-;         goto e;
-;       }
-;     }
-;
-; e:
-;     if (n & 1) {
-;       goto g;
-;     }
-;   }
-;
-; f:
-;   if (n == 2) {
-;     goto h;
-;   }
-;
-; g:
-;   for (int i = 0; i < n + 1; i++) ret++;
-;   goto h;
-;
-; h:
-;   out[id] = ret;
-; }
-
-; ModuleID = 'Unknown buffer'
-source_filename = "kernel.opencl"
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "spir64-unknown-unknown"
-
-; Function Attrs: convergent nounwind
-define spir_kernel void @partial_linearization22(i32 addrspace(1)* %out, i32 %n) #0 {
-entry:
-  %call = call i64 @__mux_get_global_id(i32 0) #2
-  %conv = trunc i64 %call to i32
-  br label %while.body
-
-while.body:                                       ; preds = %e, %entry
-  %n.off = add i32 %n, -1
-  %0 = icmp ult i32 %n.off, 4
-  %cmp6 = icmp slt i32 %n, 3
-  %or.cond1 = or i1 %cmp6, %0
-  br i1 %or.cond1, label %f, label %if.else
-
-while.body5:                                      ; preds = %d
-  switch i32 %n, label %g [
-    i32 3, label %if.else
-    i32 2, label %h
-  ]
-
-if.else:                                          ; preds = %while.body5, %while.body
-  %cmp9 = icmp sge i32 %conv, %n
-  %and = and i32 %n, 1
-  %tobool = icmp eq i32 %and, 0
-  %or.cond2 = or i1 %tobool, %cmp9
-  br i1 %or.cond2, label %d, label %h
-
-d:                                                ; preds = %if.else
-  %cmp16 = icmp sgt i32 %n, 3
-  br i1 %cmp16, label %e, label %while.body5
-
-e:                                                ; preds = %d
-  %and20 = and i32 %n, 1
-  %tobool21 = icmp eq i32 %and20, 0
-  br i1 %tobool21, label %while.body, label %g
-
-f:                                                ; preds = %while.body
-  %cmp24 = icmp eq i32 %n, 2
-  br i1 %cmp24, label %h, label %g
-
-g:                                                ; preds = %f, %e, %while.body5
-  br label %for.cond
-
-for.cond:                                         ; preds = %for.body, %g
-  %ret.0 = phi i32 [ 0, %g ], [ %inc, %for.body ]
-  %storemerge = phi i32 [ 0, %g ], [ %inc31, %for.body ]
-  %cmp29 = icmp sgt i32 %storemerge, %n
-  br i1 %cmp29, label %h, label %for.body
-
-for.body:                                         ; preds = %for.cond
-  %inc = add nuw nsw i32 %ret.0, 1
-  %inc31 = add nuw nsw i32 %storemerge, 1
-  br label %for.cond
-
-h:                                                ; preds = %for.cond, %f, %if.else, %while.body5
-  %ret.1 = phi i32 [ 0, %f ], [ %ret.0, %for.cond ], [ 0, %if.else ], [ 0, %while.body5 ]
-  %idxprom = sext i32 %conv to i64
-  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
-  store i32 %ret.1, i32 addrspace(1)* %arrayidx, align 4
-  ret void
-}
-
-; Function Attrs: convergent nounwind readonly
-declare i64 @__mux_get_global_id(i32) #1
-
-attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { convergent nobuiltin nounwind readonly }
-
-!llvm.module.flags = !{!0}
-!opencl.ocl.version = !{!1}
-!opencl.spir.version = !{!1}
-!opencl.kernels = !{!2}
-
-!0 = !{i32 1, !"wchar_size", i32 4}
-!1 = !{i32 1, i32 2}
-!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization22, !3, !4, !5, !6, !7, !8}
-!3 = !{!"kernel_arg_addr_space", i32 1, i32 0}
-!4 = !{!"kernel_arg_access_qual", !"none", !"none"}
-!5 = !{!"kernel_arg_type", !"int*", !"int"}
-!6 = !{!"kernel_arg_base_type", !"int*", !"int"}
-!7 = !{!"kernel_arg_type_qual", !"", !""}
-!8 = !{!"kernel_arg_name", !"out", !"n"}
-
-; CHECK: spir_kernel void @__vecz_v4_partial_linearization22
-; CHECK: br i1 true, label %[[WHILEBODYUNIFORM:.+]], label %[[WHILEBODY:.+]]
-
-; CHECK: [[WHILEBODY]]:
-; CHECK: %[[CMP6:.+]] = icmp slt
-; CHECK: %[[ORCOND1:.+]] = or i1 %[[CMP6]]
-; CHECK: %[[ORCOND2:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[ORCOND1]])
-; CHECK: br i1 %[[ORCOND2]], label %[[F:.+]], label %[[IFELSEPREHEADER:.+]]
-
-; CHECK: [[IFELSEPREHEADER]]:
-; CHECK: br label %[[IFELSE:.+]]
-
-; CHECK: [[WHILEBODY5:.+]]:
-; CHECK: br label %[[NODEBLOCK:.+]]
-
-; CHECK: [[NODEBLOCK]]:
-; CHECK: br label %[[LEAFBLOCK1:.+]]
-
-; CHECK: [[LEAFBLOCK1]]:
-; CHECK: br i1 %{{.+}}, label %[[IFELSE]], label %[[IFELSEPUREEXIT:.+]]
-
-; CHECK: [[IFELSEPUREEXIT]]:
-; CHECK: br label %[[E:.+]]
-
-; CHECK: [[LEAFBLOCK:.+]]:
-; CHECK: %[[SWITCHLEAF:.+]] = icmp
-; CHECK: %[[SWITCHLEAFANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[SWITCHLEAF]])
-; CHECK: br i1 %[[SWITCHLEAFANY]], label %[[LEAFBLOCKELSE:.+]], label %[[NEWDEFAULT:.+]]
-
-; CHECK: [[LEAFBLOCKELSE]]:
-; CHECK: br label %[[GLOOPEXIT:.+]]
-
-; CHECK: [[IFELSE]]:
-; CHECK: br label %[[D:.+]]
-
-; CHECK: [[WHILEBODYUNIFORM]]:
-; CHECK: %[[CMP6UNIFORM:cmp.+]] = icmp
-; CHECK: %[[ORCOND1UNIFORM:.+]] = or i1 %[[CMP6UNIFORM]]
-; CHECK: br i1 %[[ORCOND1UNIFORM]], label %[[FUNIFORM:.+]], label %[[IFELSEPREHEADERUNIFORM:.+]]
-
-; CHECK: [[IFELSEPREHEADERUNIFORM]]:
-; CHECK: br label %[[IFELSEUNIFORM:.+]]
-
-; CHECK: [[IFELSEUNIFORM]]:
-; CHECK: br i1 %{{.+}}, label %[[DUNIFORM:.+]], label %[[IFELSEUNIFORMBOSCCINDIR:.+]]
-
-; CHECK: [[DUNIFORM]]:
-; CHECK: %[[CMP16UNIFORM:.+]] = icmp
-; CHECK: br i1 %[[CMP16UNIFORM]], label %[[EUNIFORM:.+]], label %[[NODEBLOCKUNIFORM:.+]]
-
-; CHECK: [[IFELSEUNIFORMBOSCCINDIR]]:
-; CHECK: br i1 %{{.+}}, label %[[HLOOPEXIT3UNIFORM:.+]], label %[[IFELSEUNIFORMBOSCCSTORE:.+]]
-
-; CHECK: [[IFELSEUNIFORMBOSCCSTORE]]:
-; CHECK: br label %[[D]]
-
-; CHECK: [[NODEBLOCKUNIFORM]]:
-; CHECK: %[[PIVOTUNIFORM:.+]] = icmp
-; CHECK: br i1 %[[PIVOTUNIFORM]], label %[[LEAFBLOCKUNIFORM:.+]], label %[[LEAFBLOCK1UNIFORM:.+]]
-
-; CHECK: [[LEAFBLOCK1UNIFORM]]:
-; CHECK: %[[SWITCHLEAF2UNIFORM:.+]] = icmp
-; CHECK: br i1 %[[SWITCHLEAF2UNIFORM]], label %[[IFELSEUNIFORM]], label %[[NEWDEFAULTLOOPEXITUNIFORM:.+]]
-
-; CHECK: [[EUNIFORM]]:
-; CHECK: %[[TOBOOL21UNIFORM:.+]] = icmp
-; CHECK: br i1 %[[TOBOOL21UNIFORM]], label %[[WHILEBODYUNIFORM]], label %[[GLOOPEXITUNIFORM:.+]]
-
-; CHECK: [[HLOOPEXIT3UNIFORM]]:
-; CHECK: br label %[[H:.+]]
-
-; CHECK: [[NEWDEFAULTLOOPEXITUNIFORM]]:
-; CHECK: br label %[[NEWDEFAULTUNIFORM:.+]]
-
-; CHECK: [[LEAFBLOCKUNIFORM]]:
-; CHECK: %[[SWITCHLEAFUNIFORM:.+]] = icmp
-; CHECK: br i1 %[[SWITCHLEAFUNIFORM]], label %[[H]], label %[[NEWDEFAULTUNIFORM]]
-
-; CHECK: [[NEWDEFAULTUNIFORM]]:
-; CHECK: br label %[[GUNIFORM:.+]]
-
-; CHECK: [[GLOOPEXITUNIFORM]]:
-; CHECK: br label %[[GUNIFORM]]
-
-; CHECK: [[FUNIFORM]]:
-; CHECK: %[[CMP24UNIFORM:.+]] = icmp
-; CHECK: br i1 %[[CMP24UNIFORM]], label %[[H]], label %[[GUNIFORM]]
-
-; CHECK: [[GUNIFORM]]:
-; CHECK: br label %[[FORCONDUNIFORM:.+]]
-
-; CHECK: [[FORCONDUNIFORM]]:
-; CHECK: %[[CMP29UNIFORM:.+]] = icmp
-; CHECK: br i1 %[[CMP29UNIFORM]], label %[[HLOOPEXITUNIFORM:.+]], label %[[FORBODYUNIFORM:.+]]
-
-; CHECK: [[FORBODYUNIFORM]]:
-; CHECK: br label %[[FORCONDUNIFORM]]
-
-; CHECK: [[HLOOPEXITUNIFORM]]:
-; CHECK: br label %[[H]]
-
-; CHECK: [[D]]:
-; CHECK: br label %[[WHILEBODY5]]
-
-; CHECK: [[E]]:
-; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]]
-
-; CHECK: [[WHILEBODYPUREEXIT]]:
-; CHECK: %[[CMP24MERGE:.+]] = phi i1 [ %[[CMP24:cmp24]], %[[F]] ], [ false, %[[E]] ]
-; CHECK: br label %[[HLOOPEXIT3:.+]]
-
-; CHECK: [[F]]:
-; CHECK: %[[CMP24]] = icmp
-; CHECK: br label %[[WHILEBODYPUREEXIT]]
-
-; CHECK: [[FELSE:.+]]:
-; CHECK: br label %[[NEWDEFAULT]]
-
-; CHECK: [[FSPLIT:.+]]:
-; CHECK: %[[CMP24MERGEANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[CMP24MERGE]])
-; CHECK: br i1 %[[CMP24MERGEANY]], label %[[NEWDEFAULT]], label %[[G:.+]]
-
-; CHECK: [[NEWDEFAULT]]:
-; CHECK: br label %[[G]]
-
-; CHECK: [[GLOOPEXIT]]:
-; CHECK: br label %[[GLOOPEXITELSE:.+]]
-
-; CHECK: [[GLOOPEXITELSE]]:
-; CHECK: br i1 %{{.+}}, label %[[FELSE]], label %[[FSPLIT]]
-
-; CHECK: [[G]]:
-; CHECK: br label %[[FORCOND:.+]]
-
-; CHECK: [[FORCOND]]:
-; CHECK: %[[CMP29:.+]] = icmp
-; CHECK: %[[CMP29ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[CMP29]])
-; CHECK: br i1 %[[CMP29ANY]], label %[[HLOOPEXIT:.+]], label %[[FORBODY:.+]]
-
-; CHECK: [[FORBODY]]:
-; CHECK: br label %[[FORCOND]]
-
-; CHECK: [[HLOOPEXIT]]:
-; CHECK: br label %[[H]]
-
-; CHECK: [[HLOOPEXIT3]]:
-; CHECK: br label %[[HLOOPEXIT3ELSE:.+]]
-
-; CHECK: [[HLOOPEXIT3ELSE]]:
-; CHECK: br label %[[NEWDEFAULTLOOPEXIT:.+]]
-
-; CHECK: [[H]]:
-; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/broadcast_vector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/broadcast_vector.ll
index 278931266fd77..c1a059ab2f4a5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/broadcast_vector.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/broadcast_vector.ll
@@ -15,7 +15,6 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; REQUIRES: llvm-13+
 ; RUN: veczc -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans.ll
index 938100148aaca..f4ade070647f1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans.ll
@@ -14,7 +14,6 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; REQUIRES: llvm-13+
 ; RUN: veczc -k dummy -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -vecz-passes=define-builtins -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans_vp.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans_vp.ll
index e04749c9803e6..98fd18da0dbe7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans_vp.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans_vp.ll
@@ -14,7 +14,6 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; REQUIRES: llvm-13+
 ; RUN: veczc -k dummy -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -vecz-passes=define-builtins -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/extract_element.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/extract_element.ll
index 0ef996867742b..c151b1160783b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/extract_element.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/extract_element.ll
@@ -14,7 +14,6 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; REQUIRES: llvm-13+
 ; RUN: veczc -k extract_element -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=EE
 ; RUN: not veczc -k extract_element_ilegal -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s
 ; RUN: veczc -k extract_element_uniform -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=EE-UNI
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/insert_element.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/insert_element.ll
index 9fc9b4a0c2104..2ef110b4b99b5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/insert_element.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/insert_element.ll
@@ -14,7 +14,6 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; REQUIRES: llvm-13+
 ; RUN: veczc -k insert_element -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=IE
 ; RUN: veczc -k insert_element_uniform -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=IE-UNI
 ; RUN: veczc -k insert_element_varying_indices -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=IE-INDICES
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/lit.local.cfg b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/lit.local.cfg
index 1b8b665128138..8b3578af8d21e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/lit.local.cfg
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/lit.local.cfg
@@ -17,7 +17,4 @@
 if not 'RISCV' in config.root.targets:
     config.unsupported = True
 
-if config.llvm_version_major >= 14:
-    config.substitutions.append(('%vattr', '+v'))
-else:
-    config.substitutions.append(('%vattr', '+experimental-v'))
+config.substitutions.append(('%vattr', '+v'))
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle.ll
index b614818e0c91b..c80338ff7de9c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle.ll
@@ -14,7 +14,6 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; REQUIRES: llvm-13+
 ; RUN: veczc -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -vecz-passes=packetizer -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_bool.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_bool.ll
index e2a7fc87785db..d0ec0c5e6ce07 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_bool.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_bool.ll
@@ -14,7 +14,6 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; REQUIRES: llvm-13+
 ; RUN: veczc -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -vecz-passes=packetizer -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_concat.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_concat.ll
index 56fac919fb531..cf0b5b3ac8d5a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_concat.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_concat.ll
@@ -14,7 +14,6 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; REQUIRES: llvm-13+
 ; RUN: veczc -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -vecz-passes=packetizer -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_narrow.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_narrow.ll
index 8a40287b77add..8c28d3762451d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_narrow.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_narrow.ll
@@ -14,7 +14,6 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; REQUIRES: llvm-13+
 ; RUN: veczc -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -vecz-passes=packetizer -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_wider.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_wider.ll
index f0877a2590095..789e091a7e7b3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_wider.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_wider.ll
@@ -14,7 +14,6 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; REQUIRES: llvm-13+
 ; RUN: veczc -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -vecz-passes=packetizer -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/select_scalar_vector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/select_scalar_vector.ll
index 85684cc5b7e2f..4999374cd5d80 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/select_scalar_vector.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/select_scalar_vector.ll
@@ -14,7 +14,6 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; REQUIRES: llvm-13+
 ; RUN: veczc -k select_scalar_vector -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_memops.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_memops.ll
index 284a99c74ac78..a206c8b781c32 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_memops.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_memops.ll
@@ -14,7 +14,6 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; REQUIRES: llvm-13+
 ; RUN: veczc -k store_element -vecz-target-triple="riscv64-unknown-unknown" -vecz-target-features=+f,+d,%vattr -vecz-simd-width=4 -vecz-scalable -vecz-choices=VectorPredication -S < %s | FileCheck %s --check-prefix CHECK-STORE-4
 ; RUN: veczc -k store_element -vecz-target-triple="riscv64-unknown-unknown" -vecz-target-features=+f,+d,%vattr -vecz-simd-width=8 -vecz-scalable -vecz-choices=VectorPredication -S < %s | FileCheck %s --check-prefix CHECK-STORE-8
 ; RUN: veczc -k store_element -vecz-target-triple="riscv64-unknown-unknown" -vecz-target-features=+f,+d,%vattr -vecz-simd-width=16 -vecz-scalable -vecz-choices=VectorPredication -S < %s | FileCheck %s --check-prefix CHECK-STORE-16
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_vsetvli.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_vsetvli.ll
index 7710d97c909fd..4c201f654acde 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_vsetvli.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_vsetvli.ll
@@ -14,8 +14,6 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; REQUIRES: llvm-14+
-
 ; RUN: veczc -vecz-target-triple="riscv64-unknown-unknown" -vecz-target-features=+v -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store.ll
index 38182a90ba1d4..315e721aea82d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store.ll
@@ -14,7 +14,6 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; REQUIRES: llvm-13+
 ; RUN: veczc -k f -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store_as_masked.ll
index cbc0ece32d65c..314ee922d7be6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store_as_masked.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store_as_masked.ll
@@ -14,7 +14,6 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; REQUIRES: llvm-13+
 ; RUN: veczc -k f -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_load.ll
index da2bfc7651c93..bec291abbf638 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_load.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_load.ll
@@ -14,7 +14,6 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; REQUIRES: llvm-13+
 ; RUN: veczc -k dont_mask_workitem_builtins -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_scatter_gather.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_scatter_gather.ll
index 642996e2d64c5..24815c1ae56d1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_scatter_gather.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_scatter_gather.ll
@@ -14,7 +14,6 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; REQUIRES: llvm-13+
 ; RUN: veczc -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans.ll
index de83911c22126..e151d82fa7981 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans.ll
@@ -14,7 +14,6 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; REQUIRES: llvm-13+
 ; RUN: veczc -k dummy -vecz-scalable -vecz-simd-width=4 -vecz-passes=define-builtins -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans_vp.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans_vp.ll
index 3bff18980d2aa..cdd9ef6de8e02 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans_vp.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans_vp.ll
@@ -14,7 +14,6 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; REQUIRES: llvm-13+
 ; RUN: veczc -k dummy -vecz-scalable -vecz-simd-width=4 -vecz-passes=define-builtins -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/extract_element.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/extract_element.ll
index 90f54f795b5db..28785e54202a0 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/extract_element.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/extract_element.ll
@@ -14,7 +14,6 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; REQUIRES: llvm-13+
 ; RUN: veczc -k extract_element -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=EE
 ; RUN: veczc -k extract_element_uniform -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=EE-UNI
 ; RUN: veczc -k extract_element_uniform_vec -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=EE-UNI-VEC
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/insert_element.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/insert_element.ll
index 72f0e80da803b..107603f898c7e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/insert_element.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/insert_element.ll
@@ -14,7 +14,6 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; REQUIRES: llvm-13+
 ; RUN: veczc -k insert_element -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=IE
 ; RUN: veczc -k insert_element_uniform -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=IE-UNI
 ; RUN: veczc -k insert_element_varying_indices -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=IE-INDICES
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/interleaved_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/interleaved_load.ll
index cfb29eaffb7d8..212adee0fff9f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/interleaved_load.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/interleaved_load.ll
@@ -14,7 +14,6 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; REQUIRES: llvm-13+
 ; RUN: veczc -k load_interleaved -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/intrinsics.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/intrinsics.ll
index 701861f189541..021b103ac4e56 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/intrinsics.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/intrinsics.ll
@@ -14,7 +14,6 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; REQUIRES: llvm-13+
 ; RUN: veczc -k ctpop -vecz-scalable -vecz-simd-width=2 -S < %s | FileCheck %s --check-prefix CTPOP
 ; RUN: veczc -k ctlz -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix CTLZ
 ; RUN: veczc -k cttz -vecz-scalable -vecz-simd-width=8 -S < %s | FileCheck %s --check-prefix CTTZ
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/lit.local.cfg b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/lit.local.cfg
deleted file mode 100644
index 1ea9ac8a10a70..0000000000000
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/lit.local.cfg
+++ /dev/null
@@ -1,18 +0,0 @@
-# Copyright (C) Codeplay Software Limited
-#
-# Licensed under the Apache License, Version 2.0 (the "License") with LLVM
-# Exceptions; you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-# Scalable vectorization is only supported on LLVM 12+
-config.unsupported |= config.llvm_version_major < 12
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/packetize_mask_varying.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/packetize_mask_varying.ll
index 94cfaa536a6e1..fbadfebf05d4f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/packetize_mask_varying.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/packetize_mask_varying.ll
@@ -14,7 +14,6 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; REQUIRES: llvm-13+
 ; RUN: veczc -k mask_varying -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select_scalar_vector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select_scalar_vector.ll
index 6f2f36035fa60..501f4245ec090 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select_scalar_vector.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select_scalar_vector.ll
@@ -14,7 +14,6 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; REQUIRES: llvm-13+
 ; RUN: veczc -k select_scalar_vector -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/shuffle.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/shuffle.ll
index b39b1ddead43b..9d9f141cf12ff 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/shuffle.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/shuffle.ll
@@ -14,7 +14,6 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; REQUIRES: llvm-13+
 ; RUN: veczc -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll
index ac112bfe44bba..994c87fce14f5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll
@@ -14,7 +14,6 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; REQUIRES: llvm-13+
 ; RUN: veczc -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/verification_fail_phi.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/verification_fail_phi.ll
index 67564ff601810..a4ff1d7c228f4 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/verification_fail_phi.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/verification_fail_phi.ll
@@ -15,7 +15,6 @@
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 ; Check that we fail to vectorize but don't leave behind an invalid function.
-; REQUIRES: llvm-13+
 ; RUN: not veczc -k regression_phis -vecz-scalable -w 1 -vecz-passes=packetizer,verify -S < %s
 
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/widen_vload.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/widen_vload.ll
index 1ce02ded94b4f..43f40444837b3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/widen_vload.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/widen_vload.ll
@@ -14,7 +14,6 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; REQUIRES: llvm-13+
 ; RUN: veczc -k widen_vload -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/workitem_funcs.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/workitem_funcs.ll
index 88e5c930a0be7..c8e7e27514fa9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/workitem_funcs.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/workitem_funcs.ll
@@ -14,7 +14,6 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; REQUIRES: llvm-13+
 ; RUN: veczc -k store_ult -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
 
 ; Check that we can scalably-vectorize a call to get_global_id by using the
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/boscc_reduction.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/boscc_reduction.ll
index 8a161cb1c2194..05f7c9483f2f8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/boscc_reduction.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/boscc_reduction.ll
@@ -14,7 +14,6 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; REQUIRES: llvm-13+
 ; RUN: veczc -k foo -vecz-scalable -vecz-simd-width=2 -vecz-choices=VectorPredication -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/choice.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/choice.ll
index bdef00fb38803..9835c56732a32 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/choice.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/choice.ll
@@ -14,7 +14,6 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; REQUIRES: llvm-13+
 ; Just check that the VectorPredication choice is valid
 ; RUN: veczc -k foo -vecz-simd-width=2 -vecz-choices=VectorPredication -S < %s
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/compute_vector_length.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/compute_vector_length.ll
index b1cd005666d45..38abafeb2cb77 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/compute_vector_length.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/compute_vector_length.ll
@@ -14,7 +14,6 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; REQUIRES: llvm-13+
 ; RUN: veczc -k get_sub_group_size -vecz-simd-width=2 -vecz-choices=VectorPredication -S < %s | FileCheck %s --check-prefix CHECK-F2
 ; RUN: veczc -k get_sub_group_size -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | FileCheck %s --check-prefix CHECK-S4
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_interleaved_load_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_interleaved_load_store.ll
index aafc04d3c9289..a913198ca3f2b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_interleaved_load_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_interleaved_load_store.ll
@@ -14,7 +14,6 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; REQUIRES: llvm-13+
 ; RUN: veczc -k f -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication:FullScalarization -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_load_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_load_store.ll
index 3ae0852b693f5..7ef8742f87f58 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_load_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_load_store.ll
@@ -14,7 +14,6 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; REQUIRES: llvm-13+
 ; RUN: veczc -k dont_mask_workitem_builtins -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_scatter_gather.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_scatter_gather.ll
index c4f7194953b68..5353ab9a90aae 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_scatter_gather.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_scatter_gather.ll
@@ -14,7 +14,6 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; REQUIRES: llvm-13+
 ; RUN: veczc -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_subgroup_scans.ll
index 45292969d3b35..0d1b86390d6d1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_subgroup_scans.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_subgroup_scans.ll
@@ -14,7 +14,6 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; REQUIRES: llvm-13+
 ; RUN: veczc -k dummy -vecz-simd-width=4 -vecz-passes=define-builtins -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/load_add_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/load_add_store.ll
index 39004ea1bb69c..92ac9161e8c89 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/load_add_store.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/load_add_store.ll
@@ -14,8 +14,6 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; REQUIRES: llvm-13+
-
 ; RUN: veczc -k load_add_store_i32 -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | FileCheck %s --check-prefix CHECK_4F
 ; RUN: veczc -k load_add_store_i32 -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | FileCheck %s --check-prefix CHECK_1S
 ; RUN: veczc -k load_add_store_v4i32 -vecz-simd-width=2 -vecz-choices=VectorPredication -S < %s | FileCheck %s --check-prefix CHECK_V4_2F
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/scatter_gather.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/scatter_gather.ll
index 9b1d3f3500bad..9660f9a601365 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/scatter_gather.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/scatter_gather.ll
@@ -14,7 +14,6 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; REQUIRES: llvm-13+
 ; RUN: veczc -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/udiv.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/udiv.ll
index 777ac90321664..e28025d5bccfc 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/udiv.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/udiv.ll
@@ -14,8 +14,6 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; REQUIRES: llvm-13+
-
 ; RUN: veczc -k udiv -vecz-scalable -vecz-simd-width=2 -vecz-choices=VectorPredication -S < %s | FileCheck %s
 
 target triple = "spir64-unknown-unknown"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_abs.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_abs.ll
index 95226cb3c6e9d..67965785ba932 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_abs.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_abs.ll
@@ -14,7 +14,6 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; REQUIRES: llvm-12+
 ; RUN: veczc -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_copysign.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_copysign.ll
index 5b82688d52770..4d07f4a90a961 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_copysign.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_copysign.ll
@@ -14,7 +14,6 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; REQUIRES: llvm-12+
 ; RUN: veczc -vecz-simd-width=4 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_elim_opaque.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_elim_opaque.ll
index 26608325b841f..005dbcaa64966 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_elim_opaque.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_elim_opaque.ll
@@ -14,7 +14,6 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; REQUIRES: llvm-15+
 ; RUN: veczc -k test -vecz-simd-width=4 -vecz-passes=gep-elim -S < %s | FileCheck %s
 
 ; ModuleID = 'kernel.opencl'
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorization_flags.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorization_flags.ll
index 82088d13746cc..6adc22cf4efe8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorization_flags.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorization_flags.ll
@@ -17,7 +17,6 @@
 ; Check some basic properties of the veczc command line interface for multiple
 ; vectorizations works in various configurations. The kernel outputs here are
 ; not interesting, only their names.
-; REQUIRES: llvm-12+
 ; RUN: veczc -w 8 -k foo:4,8,16.2@32s -k bar:,64s -S < %s | FileCheck %s
 
 ; CHECK-DAG: define spir_kernel void @foo
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization22-llvm18.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization22-llvm18.ll
deleted file mode 100644
index 26ca097f327c7..0000000000000
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization22-llvm18.ll
+++ /dev/null
@@ -1,264 +0,0 @@
-; Copyright (C) Codeplay Software Limited
-;
-; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
-; Exceptions; you may not use this file except in compliance with the License.
-; You may obtain a copy of the License at
-;
-;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
-;
-; Unless required by applicable law or agreed to in writing, software
-; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-; License for the specific language governing permissions and limitations
-; under the License.
-;
-; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-; REQUIRES: !llvm-19+
-; RUN: veczc -k partial_linearization22 -vecz-passes="function(lowerswitch),vecz-loop-rotate,indvars,cfg-convert" -S < %s | FileCheck %s
-
-; The CFG of the following kernel is:
-;
-;     a
-;     |
-;     b <------.
-;    / \       |
-;   f   c <--. |
-;   |\ / \   | |
-;   | |   d -' |
-;   | |\ / \   |
-;   | | |   e -'
-;   | | |\ /
-;   | | | g
-;   | | |/
-;   | | /
-;    \|/
-;     h
-;
-; * where nodes b, d, and e are uniform branches, and node c is a varying
-;   branch.
-; * where nodes b, d, e and f are divergent.
-;
-; With partial linearization, it will be transformed as follows:
-;
-;     a
-;     |
-;     b <--.
-;    /|    |
-;   f c <. |
-;   | |  | |
-;   | d -' |
-;   | |    |
-;   | e ---'
-;    \|
-;     g
-;     |
-;     h
-;
-; __kernel void partial_linearization22(__global int *out, int n) {
-;   int id = get_global_id(0);
-;   int ret = 0;
-;
-;   while (1) {
-;     if (n > 0 && n < 5) {
-;       goto f;
-;     }
-;     while (1) {
-;       if (n <= 2) {
-;         goto f;
-;       } else {
-;         if (ret + id >= n) {
-;           goto d;
-;         }
-;       }
-;       if (n & 1) {
-;         goto h;
-;       }
-;
-; d:
-;       if (n > 3) {
-;         goto e;
-;       }
-;     }
-;
-; e:
-;     if (n & 1) {
-;       goto g;
-;     }
-;   }
-;
-; f:
-;   if (n == 2) {
-;     goto h;
-;   }
-;
-; g:
-;   for (int i = 0; i < n + 1; i++) ret++;
-;   goto h;
-;
-; h:
-;   out[id] = ret;
-; }
-
-; ModuleID = 'Unknown buffer'
-source_filename = "kernel.opencl"
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "spir64-unknown-unknown"
-
-; Function Attrs: convergent nounwind
-define spir_kernel void @partial_linearization22(i32 addrspace(1)* %out, i32 %n) #0 {
-entry:
-  %call = call i64 @__mux_get_global_id(i32 0) #2
-  %conv = trunc i64 %call to i32
-  br label %while.body
-
-while.body:                                       ; preds = %e, %entry
-  %n.off = add i32 %n, -1
-  %0 = icmp ult i32 %n.off, 4
-  %cmp6 = icmp slt i32 %n, 3
-  %or.cond1 = or i1 %cmp6, %0
-  br i1 %or.cond1, label %f, label %if.else
-
-while.body5:                                      ; preds = %d
-  switch i32 %n, label %g [
-    i32 3, label %if.else
-    i32 2, label %h
-  ]
-
-if.else:                                          ; preds = %while.body5, %while.body
-  %cmp9 = icmp sge i32 %conv, %n
-  %and = and i32 %n, 1
-  %tobool = icmp eq i32 %and, 0
-  %or.cond2 = or i1 %tobool, %cmp9
-  br i1 %or.cond2, label %d, label %h
-
-d:                                                ; preds = %if.else
-  %cmp16 = icmp sgt i32 %n, 3
-  br i1 %cmp16, label %e, label %while.body5
-
-e:                                                ; preds = %d
-  %and20 = and i32 %n, 1
-  %tobool21 = icmp eq i32 %and20, 0
-  br i1 %tobool21, label %while.body, label %g
-
-f:                                                ; preds = %while.body
-  %cmp24 = icmp eq i32 %n, 2
-  br i1 %cmp24, label %h, label %g
-
-g:                                                ; preds = %f, %e, %while.body5
-  br label %for.cond
-
-for.cond:                                         ; preds = %for.body, %g
-  %ret.0 = phi i32 [ 0, %g ], [ %inc, %for.body ]
-  %storemerge = phi i32 [ 0, %g ], [ %inc31, %for.body ]
-  %cmp29 = icmp sgt i32 %storemerge, %n
-  br i1 %cmp29, label %h, label %for.body
-
-for.body:                                         ; preds = %for.cond
-  %inc = add nuw nsw i32 %ret.0, 1
-  %inc31 = add nuw nsw i32 %storemerge, 1
-  br label %for.cond
-
-h:                                                ; preds = %for.cond, %f, %if.else, %while.body5
-  %ret.1 = phi i32 [ 0, %f ], [ %ret.0, %for.cond ], [ 0, %if.else ], [ 0, %while.body5 ]
-  %idxprom = sext i32 %conv to i64
-  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
-  store i32 %ret.1, i32 addrspace(1)* %arrayidx, align 4
-  ret void
-}
-
-; Function Attrs: convergent nounwind readonly
-declare i64 @__mux_get_global_id(i32) #1
-
-attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { convergent nobuiltin nounwind readonly }
-
-!llvm.module.flags = !{!0}
-!opencl.ocl.version = !{!1}
-!opencl.spir.version = !{!1}
-!opencl.kernels = !{!2}
-
-!0 = !{i32 1, !"wchar_size", i32 4}
-!1 = !{i32 1, i32 2}
-!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization22, !3, !4, !5, !6, !7, !8}
-!3 = !{!"kernel_arg_addr_space", i32 1, i32 0}
-!4 = !{!"kernel_arg_access_qual", !"none", !"none"}
-!5 = !{!"kernel_arg_type", !"int*", !"int"}
-!6 = !{!"kernel_arg_base_type", !"int*", !"int"}
-!7 = !{!"kernel_arg_type_qual", !"", !""}
-!8 = !{!"kernel_arg_name", !"out", !"n"}
-
-; CHECK: spir_kernel void @__vecz_v4_partial_linearization22
-; CHECK: br label %[[WHILEBODY:.+]]
-
-; CHECK: [[WHILEBODY]]:
-; CHECK: %[[CMP6:.+]] = icmp slt
-; CHECK: %[[ORCOND1:.+]] = or i1 %[[CMP6]]
-; CHECK: %[[F_EXIT_MASK:.+]] = select i1
-; CHECK: %[[ORCOND2:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[ORCOND1]])
-; CHECK: br i1 %[[ORCOND2]], label %[[F:.+]], label %[[IFELSEPREHEADER:.+]]
-
-; CHECK: [[IFELSEPREHEADER]]:
-; CHECK: br label %[[IFELSE:.+]]
-
-; CHECK: [[LEAFBLOCK1:.*]]:
-; CHECK: %[[SWITCHLEAF:.+]] = icmp eq i32 %n, 3
-; CHECK: br i1 %{{.+}}, label %[[IFELSE]], label %[[IFELSEPUREEXIT:.+]]
-
-; CHECK: [[IFELSEPUREEXIT]]:
-; CHECK: br label %[[E:.+]]
-
-; CHECK: [[IFELSE]]:
-; CHECK: br label %[[D:.+]]
-
-; CHECK: [[D]]:
-; CHECK: br label %[[LEAFBLOCK1]]
-
-; CHECK: [[E]]:
-; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]]
-
-; CHECK: [[WHILEBODYPUREEXIT]]:
-; CHECK: %[[CMP24MERGE:.+]] = phi i1 [ %[[G_EXIT_MASK:.+]], %[[F]] ], [ false, %[[E]] ]
-; CHECK: br label %[[HLOOPEXIT1:.+]]
-
-; CHECK: [[F]]:
-; CHECK: %[[CMP24:.+]] = icmp eq i32 %n, 2
-; CHECK: %[[G_EXIT_MASK]] = select i1 %[[CMP24]], i1 false, i1 %[[F_EXIT_MASK]]
-; CHECK: br label %[[WHILEBODYPUREEXIT]]
-
-; CHECK: [[FELSE:.+]]:
-; CHECK: br label %[[G:.+]]
-
-; CHECK: [[FSPLIT:.+]]:
-; CHECK: %[[CMP24_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %cmp24.merge)
-; CHECK: br i1 %[[CMP24_ANY]], label %[[H:.+]], label %[[G]]
-
-; CHECK: [[GLOOPEXIT:.+]]:
-; CHECK: br label %[[GLOOPEXITELSE:.+]]
-
-; CHECK: [[GLOOPEXITELSE]]:
-; CHECK: br i1 %{{.+}}, label %[[FELSE]], label %[[FSPLIT]]
-
-; CHECK: [[G]]:
-; CHECK: br label %[[FORCOND:.+]]
-
-; CHECK: [[FORCOND]]:
-; CHECK: br i1 true, label %[[HLOOPEXIT:.+]], label %[[FORBODY:.+]]
-
-; CHECK: [[FORBODY]]:
-; CHECK: br label %[[FORCOND]]
-
-
-
-; CHECK: [[HLOOPEXIT]]:
-; CHECK: br label %[[H:.+]]
-
-; CHECK: [[HLOOPEXIT1]]:
-; CHECK: br label %[[HLOOPEXIT1ELSE:.+]]
-
-; CHECK: [[HLOOPEXIT1ELSE]]:
-; CHECK: br label %[[GLOOPEXIT]]
-
-;; CHECK: [[H]]:
-;; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization22.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization22.ll
index 2b78be90a7e76..7be8b4bbc187d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization22.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization22.ll
@@ -14,7 +14,6 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; REQUIRES: llvm-19+
 ; RUN: veczc -k partial_linearization22 -vecz-passes="function(lower-switch),vecz-loop-rotate,indvars,cfg-convert" -S < %s | FileCheck %s
 
 ; The CFG of the following kernel is:
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline_printafter.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline_printafter.ll
index 2a0c3ad8f0b77..cad700234785a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline_printafter.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline_printafter.ll
@@ -14,7 +14,6 @@
 ;
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-; REQUIRES: llvm-12+
 ; RUN: veczc -k foo -w 2 -vecz-passes scalarize,mask-memops,packetizer -print-after mask-memops -S < %s 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

From d3a5a6998d9b47a20a4499bc16316206144a830c Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald.vandijk@codeplay.com>
Date: Tue, 2 Sep 2025 18:26:05 +0100
Subject: [PATCH 180/182] Re-enable RISC-V lit tests.

These tests were inadvertently disabled and hence had not been kept up
to date with current LLVM versions. This commit enables and updates
them.
---
 .../test/lit/llvm/RISCV/broadcast_vector.ll   | 122 +++++++++---------
 .../lit/llvm/RISCV/define_subgroup_scans.ll   |  14 +-
 .../llvm/RISCV/define_subgroup_scans_vp.ll    |   4 +-
 .../test/lit/llvm/RISCV/extract_element.ll    |  51 ++++----
 .../test/lit/llvm/RISCV/insert_element.ll     |  22 ++--
 .../lit/llvm/RISCV/select_scalar_vector.ll    |   6 +-
 .../vecz/test/lit/llvm/RISCV/vp_memops.ll     |  16 +--
 .../vecz/test/lit/llvm/RISCV/vp_vsetvli.ll    |   4 +-
 8 files changed, 120 insertions(+), 119 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/broadcast_vector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/broadcast_vector.ll
index c1a059ab2f4a5..36b06c64063d8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/broadcast_vector.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/broadcast_vector.ll
@@ -112,23 +112,23 @@ entry:
 ; CHECK-LABEL: @__vecz_nxv4_vector_broadcast_const(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0)
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]]
-; CHECK-NEXT:    store <vscale x 16 x float> shufflevector (<vscale x 16 x float> insertelement (<vscale x 16 x float> poison, float 0x7FF0000020000000, {{i32|i64}} 0), <vscale x 16 x float> poison, <vscale x 16 x i32> zeroinitializer), ptr addrspace(1) [[ARRAYIDX3]], align 16
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]]
+; CHECK-NEXT:    store <vscale x 16 x float> splat (float 0x7FF8000020000000), ptr addrspace(1) [[ARRAYIDX3]], align 16
 ; CHECK-NEXT:    ret void
 ;
 ; CHECK-LABEL: @__vecz_nxv4_vector_broadcast(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:  [[VS2:%.*]] = call <vscale x 16 x float> @llvm.{{(experimental.)?}}vector.insert.nxv16f32.v4f32(<vscale x 16 x float> poison, <4 x float> [[ADDEND:%.*]], i64 0)
-; CHECK-NEXT:  [[IDX0:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
-; CHECK-NEXT:  [[VS1:%.*]] = and <vscale x 16 x i32> [[IDX0]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 3, {{i32|i64}} 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:  [[VS2:%.*]] = call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.v4f32(<vscale x 16 x float> poison, <4 x float> [[ADDEND:%.*]], i64 0)
+; CHECK-NEXT:  [[IDX0:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; CHECK-NEXT:  [[VS1:%.*]] = and <vscale x 16 x i32> [[IDX0]], splat (i32 3)
 ; CHECK-NEXT:  [[XLEN:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:  [[TMP0:%.*]] = shl i64 [[XLEN]], 4
+; CHECK-NEXT:  [[TMP0:%.*]] = shl {{(nuw )?}}i64 [[XLEN]], 4
 ; CHECK-NEXT:  [[TMP1:%.*]] = call <vscale x 16 x float> @llvm.riscv.vrgather.vv.nxv16f32.i64(<vscale x 16 x float> poison, <vscale x 16 x float> [[VS2]], <vscale x 16 x i32> [[VS1]], i64 [[TMP0]])
 ; CHECK-NEXT:  [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0)
-; CHECK-NEXT:  [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]]
-; CHECK-NEXT:  [[TMP3:%.*]] = load <vscale x 16 x float>, ptr addrspace(1) [[ARRAYIDX]], align 4
+; CHECK-NEXT:  [[ARRAYIDX:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]]
+; CHECK-NEXT:  [[TMP3:%.*]] = load <vscale x 16 x float>, ptr addrspace(1) [[ARRAYIDX]], align 16
 ; CHECK-NEXT:  [[TMP4:%.*]] = fadd <vscale x 16 x float> [[TMP3]], [[TMP1]]
-; CHECK-NEXT:  [[ARRAYIDX3:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]]
+; CHECK-NEXT:  [[ARRAYIDX3:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]]
 ; CHECK-NEXT:  store <vscale x 16 x float> [[TMP4]], ptr addrspace(1) [[ARRAYIDX3]], align 16
 ; CHECK-NEXT:  ret void
 ;
@@ -136,73 +136,75 @@ entry:
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[FIXLEN_ALLOC:%.*]] = alloca <32 x float>, align 128
 ; CHECK-NEXT:    store <32 x float> [[ADDEND:%.*]], ptr [[FIXLEN_ALLOC]], align 128
-; CHECK-NEXT:    [[IDX0:%.*]] = call <vscale x 128 x i32> @llvm.experimental.stepvector.nxv128i32()
-; CHECK-NEXT:    [[IDX1:%.*]] = and <vscale x 128 x i32> [[IDX0]], shufflevector (<vscale x 128 x i32> insertelement (<vscale x 128 x i32> poison, i32 31, {{i32|i64}} 0), <vscale x 128 x i32> poison, <vscale x 128 x i32> zeroinitializer)
+; CHECK-NEXT:    [[IDX0:%.*]] = call <vscale x 128 x i32> @llvm.stepvector.nxv128i32()
+; CHECK-NEXT:    [[IDX1:%.*]] = and <vscale x 128 x i32> [[IDX0]], splat (i32 31)
 ; CHECK-NEXT:    [[TMP0:%.*]] = {{s|z}}ext{{( nneg)?}} <vscale x 128 x i32> [[IDX1]] to <vscale x 128 x i64>
 ; CHECK-NEXT:    [[VEC_ALLOC:%.*]] = getelementptr inbounds float, ptr [[FIXLEN_ALLOC]], <vscale x 128 x i64> [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 128 x float> @llvm.masked.gather.nxv128f32.nxv128p0(<vscale x 128 x ptr> [[VEC_ALLOC]], i32 4, <vscale x 128 x i1> shufflevector (<vscale x 128 x i1> insertelement (<vscale x 128 x i1> poison, i1 true, {{i32|i64}} 0), <vscale x 128 x i1> poison, <vscale x 128 x i32> zeroinitializer), <vscale x 128 x float> poison)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 128 x float> @llvm.masked.gather.nxv128f32.nxv128p0(<vscale x 128 x ptr> [[VEC_ALLOC]], i32 4, <vscale x 128 x i1> splat (i1 true), <vscale x 128 x float> poison)
 ; CHECK-NEXT:    [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0)
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds <32 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]]
-; CHECK-NEXT:    [[TMP3:%.*]] = load <vscale x 128 x float>, ptr addrspace(1) [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr <32 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load <vscale x 128 x float>, ptr addrspace(1) [[ARRAYIDX]], align 64
 ; CHECK-NEXT:    [[TMP4:%.*]] = fadd <vscale x 128 x float> [[TMP3]], [[TMP1]]
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds <32 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]]
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr <32 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]]
 ; CHECK-NEXT:    store <vscale x 128 x float> [[TMP4]], ptr addrspace(1) [[ARRAYIDX3]], align 64
 ; CHECK-NEXT:    ret void
 ;
 ; CHECK-LABEL: @__vecz_nxv4_vector_broadcast_regression(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0)
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 16 x i32>, ptr addrspace(1) [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[AND1_I_I_I1_I1:%.*]] = and <vscale x 16 x i32> [[TMP1]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 2139095040, {{i32|i64}} 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
-; CHECK-NEXT:    [[CMP_I_I_I2_I2:%.*]] = icmp ne <vscale x 16 x i32> [[AND1_I_I_I1_I1]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 2139095040, {{i32|i64}} 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
-; CHECK-NEXT:    [[AND2_I_I_I3_I3:%.*]] = and <vscale x 16 x i32> [[TMP1]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 8388607, {{i32|i64}} 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 16 x i32>, ptr addrspace(1) [[ARRAYIDX]], align 16
+; CHECK-NEXT:    [[AND1_I_I_I1_I1:%.*]] = and <vscale x 16 x i32> [[TMP1]], splat (i32 2139095040)
+; CHECK-NEXT:    [[CMP_I_I_I2_I2:%.*]] = icmp ne <vscale x 16 x i32> [[AND1_I_I_I1_I1]], splat (i32 2139095040)
+; CHECK-NEXT:    [[AND2_I_I_I3_I3:%.*]] = and <vscale x 16 x i32> [[TMP1]], splat (i32 8388607)
 ; CHECK-NEXT:    [[CMP3_I_I_I4_I4:%.*]] = icmp eq <vscale x 16 x i32> [[AND2_I_I_I3_I3]], zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = or <vscale x 16 x i1> [[CMP_I_I_I2_I2]], [[CMP3_I_I_I4_I4]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <vscale x 16 x i32> [[TMP1]] to <vscale x 16 x float>
-; CHECK-NEXT:    [[TMP4:%.*]] = select <vscale x 16 x i1> [[TMP2]], <vscale x 16 x float> [[TMP3]], <vscale x 16 x float> shufflevector (<vscale x 16 x float> insertelement (<vscale x 16 x float> poison, float 0x7FF0000020000000, {{i32|i64}} 0), <vscale x 16 x float> poison, <vscale x 16 x i32> zeroinitializer)
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]]
+; CHECK-NEXT:    [[TMP4:%.*]] = select <vscale x 16 x i1> [[TMP2]], <vscale x 16 x float> [[TMP3]], <vscale x 16 x float> splat (float 0x7FF0000020000000)
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]]
 ; CHECK-NEXT:    store <vscale x 16 x float> [[TMP4]], ptr addrspace(1) [[ARRAYIDX3]], align 16
 ; CHECK-NEXT:    ret void
 ;
-;
 ; CHECK-LABEL: @__vecz_nxv4_vector_broadcast_insertpt(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT:  [[EXISTINGALLOC:%.*]] = alloca <4 x i32>, align 16
-; CHECK-NEXT:  [[VS21:%.*]] = call <vscale x 16 x float> @llvm.{{(experimental.)?}}vector.insert.nxv16f32.v4f32(<vscale x 16 x float> poison, <4 x float> [[ADDEND:%.*]], i64 0)
-; CHECK-NEXT:  [[IDX02:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
-; CHECK-NEXT:  [[VS13:%.*]] = and <vscale x 16 x i32> [[IDX02]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 3, {{i32|i64}} 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
-; CHECK-NEXT:  [[XLEN:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:  [[TMP0:%.*]] = shl i64 [[XLEN]], 4
-; CHECK-NEXT:  [[TMP1:%.*]] = call <vscale x 16 x float> @llvm.riscv.vrgather.vv.nxv16f32.i64(<vscale x 16 x float> poison, <vscale x 16 x float> [[VS21]], <vscale x 16 x i32> [[VS13]], i64 [[TMP0]])
-; CHECK-NEXT:  [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0)
-; CHECK-NEXT:  store <4 x i32> zeroinitializer, ptr [[EXISTINGALLOC]], align 16
-; CHECK-NEXT:  store i32 1, ptr [[EXISTINGALLOC]], align 16
-; CHECK-NEXT:  [[V:%.*]] = load <4 x i32>, ptr [[EXISTINGALLOC]], align 16
-; CHECK-NEXT:  [[VS2:%.*]] = call <vscale x 16 x i32> @llvm.{{(experimental.)?}}vector.insert.nxv16i32.v4i32(<vscale x 16 x i32> poison, <4 x i32> [[V]], i64 0)
-; CHECK-NEXT:  [[TMP2:%.*]] = call <vscale x 16 x i32> @llvm.riscv.vrgather.vv.nxv16i32.i64(<vscale x 16 x i32> poison, <vscale x 16 x i32> [[VS2]], <vscale x 16 x i32> [[VS13]], i64 [[TMP0]])
-; CHECK-NEXT:  [[ARRAYIDX4:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[OUT2:%.*]], i64 [[CALL]]
-; CHECK-NEXT:  store <vscale x 16 x i32> [[TMP2]], ptr addrspace(1) [[ARRAYIDX4]], align 16
-; CHECK-NEXT:  [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]]
-; CHECK-NEXT:  [[TMP5:%.*]] = load <vscale x 16 x float>, ptr addrspace(1) [[ARRAYIDX]], align 4
-; CHECK-NEXT:  [[V44:%.*]] = fadd <vscale x 16 x float> [[TMP5]], [[TMP1]]
-; CHECK-NEXT:  [[ARRAYIDX3:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]]
-; CHECK-NEXT:  store <vscale x 16 x float> [[V44]], ptr addrspace(1) [[ARRAYIDX3]], align 16
-; CHECK-NEXT:  ret void
-;
+; CHECK-NEXT:   [[VS22:%.*]] = call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.v4f32(<vscale x 16 x float> poison, <4 x float> [[ADDEND]], i64 0)
+; CHECK-NEXT:   [[IDX03:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; CHECK-NEXT:   [[VS14:%.*]] = and <vscale x 16 x i32> [[IDX03]], splat (i32 3)
+; CHECK-NEXT:   [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:   [[TMP1:%.*]] = shl {{(nuw )?}}i64 [[TMP0]], 4
+; CHECK-NEXT:   [[TMP2:%.*]] = call <vscale x 16 x float> @llvm.riscv.vrgather.vv.nxv16f32.i64(<vscale x 16 x float> poison, <vscale x 16 x float> [[VS22]], <vscale x 16 x i32> [[VS14]], i64 [[TMP1]])
+; CHECK-NEXT:   [[VS2:%.*]] = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.v4i32(<vscale x 16 x i32> poison, <4 x i32> <i32 1, i32 0, i32 0, i32 0>, i64 0)
+; CHECK-NEXT:   [[TMP3:%.*]] = call <vscale x 16 x i32> @llvm.riscv.vrgather.vv.nxv16i32.i64(<vscale x 16 x i32> poison, <vscale x 16 x i32> [[VS2]], <vscale x 16 x i32> [[VS14]], i64 [[TMP1]])
+; CHECK-NEXT:   [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0)
+; CHECK-NEXT:   [[ARRAYIDX4:%.*]] = getelementptr <4 x i32>, ptr addrspace(1) [[OUT2:%.*]], i64 [[CALL]]
+; CHECK-NEXT:   store <vscale x 16 x i32> [[TMP3]], ptr addrspace(1) [[ARRAYIDX4]], align 16
+; CHECK-NEXT:   [[ARRAYIDX:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[IN]], i64 [[CALL]]
+; CHECK-NEXT:   [[TMP4:%.*]] = load <vscale x 16 x float>, ptr addrspace(1) [[ARRAYIDX]], align 16
+; CHECK-NEXT:   [[V45:%.*]] = fadd <vscale x 16 x float> [[TMP4]], [[TMP2]]
+; CHECK-NEXT:   [[ARRAYIDX3:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[OUT]], i64 [[CALL]]
+; CHECK-NEXT:   store <vscale x 16 x float> [[V45]], ptr addrspace(1) [[ARRAYIDX3]], align 16
+; CHECK-NEXT:   ret void
+
 ; CHECK-LABEL: @__vecz_nxv4_vector_mask_broadcast(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[VS21:%.*]] = call <vscale x 16 x float> @llvm.{{(experimental.)?}}vector.insert.nxv16f32.v4f32(<vscale x 16 x float> poison, <4 x float> [[WOOF:%.*]], i64 0)
-; CHECK-NEXT:    [[IDX02:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
-; CHECK-NEXT:    [[VS13:%.*]] = and <vscale x 16 x i32> [[IDX02]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 3, {{i32|i64}} 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
-; CHECK-NEXT:    [[XLEN4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP0:%.*]] = shl i64 [[XLEN4]], 4
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 16 x float> @llvm.riscv.vrgather.vv.nxv16f32.i64(<vscale x 16 x float> poison, <vscale x 16 x float> [[VS21]], <vscale x 16 x i32> [[VS13]], i64 [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = sext <4 x i1> [[INPUT:%.*]] to <4 x i8>
-; CHECK-NEXT:    [[VS2:%.*]] = call <vscale x 16 x i8> @llvm.{{(experimental.)?}}vector.insert.nxv16i8.v4i8(<vscale x 16 x i8> poison, <4 x i8> [[TMP2]], i64 0)
-; CHECK-NEXT:    [[IDX0:%.*]] = call <vscale x 16 x i16> @llvm.experimental.stepvector.nxv16i16()
-; CHECK-NEXT:    [[VS1:%.*]] = and <vscale x 16 x i16> [[IDX0]], shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> poison, i16 3, {{i32|i64}} 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 16 x i8> @llvm.riscv.vrgatherei16.vv.nxv16i8.i64(<vscale x 16 x i8> poison, <vscale x 16 x i8> [[VS2]], <vscale x 16 x i16> [[VS1]], i64 [[TMP0]])
-; CHECK: [[TMP4:%.*]] = trunc <vscale x 16 x i8> [[TMP3]] to <vscale x 16 x i1>
-; CHECK: [[TMP5:%.*]] = fcmp oeq <vscale x 16 x float>
-; CHECK: [[TMP8:%.*]] = and <vscale x 16 x i1> [[TMP5]], [[TMP4]]
+; CHECK-NEXT: entry:
+; CHECK-NEXT:   [[VS21:%.*]] = call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.v4f32(<vscale x 16 x float> poison, <4 x float> [[WOOF:%.*]], i64 0)
+; CHECK-NEXT:   [[IDX02:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; CHECK-NEXT:   [[VS13:%.*]] = and <vscale x 16 x i32> [[IDX02]], splat (i32 3)
+; CHECK-NEXT:   [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:   [[TMP1:%.*]] = shl {{(nuw )?}}i64 [[TMP0]], 4
+; CHECK-NEXT:   [[TMP2:%.*]] = call <vscale x 16 x float> @llvm.riscv.vrgather.vv.nxv16f32.i64(<vscale x 16 x float> poison, <vscale x 16 x float> [[VS21]], <vscale x 16 x i32> [[VS13]], i64 [[TMP1]])
+; CHECK-NEXT:   [[TMP3:%.*]] = sext <4 x i1> [[INPUT:%.*]] to <4 x i8>
+; CHECK-NEXT:   [[VS2:%.*]] = call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v4i8(<vscale x 16 x i8> poison, <4 x i8> [[TMP3]], i64 0)
+; CHECK-NEXT:   [[IDX0:%.*]] = call <vscale x 16 x i16> @llvm.stepvector.nxv16i16()
+; CHECK-NEXT:   [[VS1:%.*]] = and <vscale x 16 x i16> [[IDX0]], splat (i16 3)
+; CHECK-NEXT:   [[TMP4:%.*]] = call <vscale x 16 x i8> @llvm.riscv.vrgatherei16.vv.nxv16i8.i64(<vscale x 16 x i8> poison, <vscale x 16 x i8> [[VS2]], <vscale x 16 x i16> [[VS1]], i64 [[TMP1]])
+; CHECK-NEXT:   [[TMP5:%.*]] = trunc <vscale x 16 x i8> [[TMP4]] to <vscale x 16 x i1>
+; CHECK-NEXT:   [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0)
+; CHECK-NEXT:   [[ARRAYIDX:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[IN]], i64 [[CALL]]
+; CHECK-NEXT:   [[TMP6:%.*]] = load <vscale x 16 x float>, ptr addrspace(1) [[ARRAYIDX]], align 16
+; CHECK-NEXT:   [[TMP7:%.*]] = fcmp oeq <vscale x 16 x float> [[TMP6]], splat (float 1.000000e+00)
+; CHECK-NEXT:   [[TMP8:%.*]] = and <vscale x 16 x i1> [[TMP7]], [[TMP5]]
+; CHECK-NEXT:   [[TMP9:%.*]] = select <vscale x 16 x i1> [[TMP8]], <vscale x 16 x float> [[TMP6]], <vscale x 16 x float> [[TMP2]]
+; CHECK-NEXT:   [[ARRAYIDX3:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[OUT]], i64 [[CALL]]
+; CHECK-NEXT:   store <vscale x 16 x float> [[TMP9]], ptr addrspace(1) [[ARRAYIDX3]], align 16
+; CHECK-NEXT:   ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans.ll
index f4ade070647f1..2895d1848afea 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans.ll
@@ -40,9 +40,9 @@ define spir_kernel void @dummy(i32 addrspace(2)* %in, i32 addrspace(1)* %out) {
 declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_add_u5nxv4j(<vscale x 4 x i32>)
 ; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_add_u5nxv4j(<vscale x 4 x i32>{{.*}}) {
 ; CHECK: entry:
-; CHECK:   %[[STEP:.+]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; CHECK:   %[[STEP:.+]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
 ; CHECK:   %[[SCALE:.+]] = call i32 @llvm.vscale.i32()
-; CHECK:   %[[SIZE:.+]] = mul i32 %[[SCALE]], 4
+; CHECK:   %[[SIZE:.+]] = mul {{(nuw )?}}i32 %[[SCALE]], 4
 ; CHECK:   br label %loop
 ; CHECK: loop:
 ; CHECK:   %[[IV:.+]] = phi i32 [ 1, %entry ], [ %[[N2:.+]], %loop ]
@@ -54,7 +54,7 @@ declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_add_u5nxv4j(<vscal
 
 ;------- target-dependent dynamic shuffle code:
 ; CHECK:   %[[VLSCALE:.+]] = call i64 @llvm.vscale.i64()
-; CHECK:   %[[VL:.+]] = mul i64 %[[VLSCALE]], 4
+; CHECK:   %[[VL:.+]] = mul {{(nuw )?}}i64 %[[VLSCALE]], 4
 ; CHECK:   %[[SHUFFLE:.+]] = call <vscale x 4 x i32> @llvm.riscv.vrgather.vv.nxv4i32.i64({{(<vscale x 4 x i32> poison, )?}}<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %[[MASK]], i64 %[[VL]])
 
 ; CHECK:   %[[ACCUM:.+]] = add <vscale x 4 x i32> %[[VEC]], %{{.+}}
@@ -73,9 +73,9 @@ declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_add_u5nxv4j(<vscal
 declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_add_u5nxv4j(<vscale x 4 x i32>)
 ; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_add_u5nxv4j(<vscale x 4 x i32>{{.*}}) {
 ; CHECK: entry:
-; CHECK:   %[[STEP:.+]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; CHECK:   %[[STEP:.+]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
 ; CHECK:   %[[SCALE:.+]] = call i32 @llvm.vscale.i32()
-; CHECK:   %[[SIZE:.+]] = mul i32 %[[SCALE]], 4
+; CHECK:   %[[SIZE:.+]] = mul {{(nuw )?}}i32 %[[SCALE]], 4
 ; CHECK:   br label %loop
 ; CHECK: loop:
 ; CHECK:   %[[IV:.+]] = phi i32 [ 1, %entry ], [ %[[N2:.+]], %loop ]
@@ -87,7 +87,7 @@ declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_add_u5nxv4j(<vscal
 
 ;------- target-dependent dynamic shuffle code:
 ; CHECK:   %[[VLSCALE:.+]] = call i64 @llvm.vscale.i64()
-; CHECK:   %[[VL:.+]] = mul i64 %[[VLSCALE]], 4
+; CHECK:   %[[VL:.+]] = mul {{(nuw )?}}i64 %[[VLSCALE]], 4
 ; CHECK:   %[[SHUFFLE:.+]] = call <vscale x 4 x i32> @llvm.riscv.vrgather.vv.nxv4i32.i64({{(<vscale x 4 x i32> poison, )?}}<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %[[MASK]], i64 %[[VL]])
 
 ; CHECK:   %[[ACCUM:.+]] = add <vscale x 4 x i32> %[[VEC]], %{{.+}}
@@ -103,7 +103,7 @@ declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_add_u5nxv4j(<vscal
 
 ;------- target-dependent slide-up code:
 ; CHECK:   %[[VLSCALE2:.+]] = call i64 @llvm.vscale.i64()
-; CHECK:   %[[VL2:.+]] = mul i64 %[[VLSCALE2]], 4
+; CHECK:   %[[VL2:.+]] = mul {{(nuw )?}}i64 %[[VLSCALE2]], 4
 ; CHECK:   %[[RESULT:.+]] = call <vscale x 4 x i32> @llvm.riscv.vslide1up.nxv4i32.i32.i64({{(<vscale x 4 x i32> poison, )?}}<vscale x 4 x i32> %[[SCAN]], i32 0, i64 %[[VL2]])
 
 ; CHECK:   ret <vscale x 4 x i32> %[[RESULT]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans_vp.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans_vp.ll
index 98fd18da0dbe7..fbd4bcf657f63 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans_vp.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans_vp.ll
@@ -41,7 +41,7 @@ define spir_kernel void @dummy(i32 addrspace(2)* %in, i32 addrspace(1)* %out) {
 declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4jj(<vscale x 4 x i32>, i32)
 ; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4jj(<vscale x 4 x i32>{{.*}}, i32{{.*}}) {
 ; CHECK: entry:
-; CHECK:   %[[STEP:.+]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; CHECK:   %[[STEP:.+]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
 ; CHECK:   br label %loop
 ; CHECK: loop:
 ; CHECK:   %[[IV:.+]] = phi i32 [ 1, %entry ], [ %[[N2:.+]], %loop ]
@@ -71,7 +71,7 @@ declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4jj(<v
 declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_add_vp_u5nxv4jj(<vscale x 4 x i32>, i32)
 ; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_add_vp_u5nxv4jj(<vscale x 4 x i32>{{.*}}, i32{{.*}}) {
 ; CHECK: entry:
-; CHECK:   %[[STEP:.+]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; CHECK:   %[[STEP:.+]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
 ; CHECK:   br label %loop
 ; CHECK: loop:
 ; CHECK:   %[[IV:.+]] = phi i32 [ 1, %entry ], [ %[[N2:.+]], %loop ]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/extract_element.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/extract_element.ll
index c151b1160783b..8c98f98249bf4 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/extract_element.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/extract_element.ll
@@ -106,15 +106,15 @@ entry:
 
 ; EE-LABEL: @__vecz_nxv4_extract_element(
 ; EE:         [[XLEN:%.*]] = call i64 @llvm.vscale.i64()
-; EE-NEXT:    [[TMP2:%.*]] = shl i64 [[XLEN]], 2
+; EE-NEXT:    [[TMP2:%.*]] = shl {{(nuw )?}}i64 [[XLEN]], 2
 ; EE-NEXT:    [[SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[IDX:%.*]], {{(i32|i64)}} 0
 ; EE-NEXT:    [[SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; EE-NEXT:    [[IDX0:%.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
-; EE-NEXT:    [[IDXSCALE:%.*]] = shl <vscale x 4 x i32> [[IDX0]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, {{(i32|i64)}} 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; EE-NEXT:    [[IDX0:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; EE-NEXT:    [[IDXSCALE:%.*]] = shl <vscale x 4 x i32> [[IDX0]], splat (i32 2)
 ; EE-NEXT:    [[VS1:%.*]] = add <vscale x 4 x i32> [[IDXSCALE]], [[SPLAT]]
-; EE-NEXT:    [[T3:%.*]] = call <vscale x 16 x i32> @llvm.{{(experimental.)?}}vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[VS1]], i64 0)
+; EE-NEXT:    [[T3:%.*]] = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[VS1]], i64 0)
 ; EE-NEXT:    [[T4:%.*]] = call <vscale x 16 x float> @llvm.riscv.vrgather.vv.nxv16f32.i64(<vscale x 16 x float> poison, <vscale x 16 x float> [[T1:%.*]], <vscale x 16 x i32> [[T3]], i64 [[TMP2]])
-; EE-NEXT:    [[T5:%.*]] = call <vscale x 4 x float> @llvm.{{(experimental.)?}}vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[T4]], i64 0)
+; EE-NEXT:    [[T5:%.*]] = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[T4]], i64 0)
 
 ; Both the vector and index are uniform, so check we're not unnecessarily packetizing 
 
@@ -129,40 +129,39 @@ entry:
 
 ; EE-UNI-VEC-LABEL: @__vecz_nxv4_extract_element_uniform_vec(
 ; EE-UNI-VEC:         [[XLEN:%.*]] = call i64 @llvm.vscale.i64()
-; EE-UNI-VEC:         [[T3:%.*]] = shl i64 [[XLEN]], 2
+; EE-UNI-VEC:         [[T3:%.*]] = shl {{(nuw )?}}i64 [[XLEN]], 2
 ; EE-UNI-VEC-NEXT:    [[T:%.*]] = trunc <vscale x 4 x i64> [[T2:%.*]] to <vscale x 4 x i32>
-; EE-UNI-VEC-NEXT:    [[I1:%.*]] = and <vscale x 4 x i32> [[T]], trunc (<vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 3, {{(i32|i64)}} 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer) to <vscale x 4 x i32>)
-; EE-UNI-VEC-NEXT:    [[IDX02:%.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
-; EE-UNI-VEC-NEXT:    [[IDXSCALE:%.*]] = shl <vscale x 4 x i32> [[IDX02]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, {{(i32|i64)}} 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; EE-UNI-VEC-NEXT:    [[I1:%.*]] = and <vscale x 4 x i32> [[T]], {{splat \(i32 3\)|trunc \(<vscale x 4 x i64> splat \(i64 3\) to <vscale x 4 x i32>\)}}
+; EE-UNI-VEC-NEXT:    [[IDX02:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; EE-UNI-VEC-NEXT:    [[IDXSCALE:%.*]] = shl <vscale x 4 x i32> [[IDX02]], splat (i32 2)
 
-; LLVM 16 deduces add/or equivalence and uses `or` instead.
-; EE-UNI-VEC-NEXT:    [[VS1:%.*]] = {{add|or}} <vscale x 4 x i32> [[IDXSCALE]], [[I1]]
+; EE-UNI-VEC-NEXT:    [[VS1:%.*]] = or disjoint <vscale x 4 x i32> [[IDXSCALE]], [[I1]]
 
-; EE-UNI-VEC-NEXT:    [[T4:%.*]] = call <vscale x 16 x i32> @llvm.{{(experimental.)?}}vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[VS1]], i64 0)
+; EE-UNI-VEC-NEXT:    [[T4:%.*]] = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[VS1]], i64 0)
 ; EE-UNI-VEC-NEXT:    [[T5:%.*]] = call <vscale x 16 x float> @llvm.riscv.vrgather.vv.nxv16f32.i64(<vscale x 16 x float> poison, <vscale x 16 x float> [[T1:%.*]], <vscale x 16 x i32> [[T4]], i64 [[T3]])
-; EE-UNI-VEC-NEXT:    [[T6:%.*]] = call <vscale x 4 x float> @llvm.{{(experimental.)?}}vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[T5]], i64 0)
+; EE-UNI-VEC-NEXT:    [[T6:%.*]] = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[T5]], i64 0)
 
 ; EE-INDICES-LABEL: @__vecz_nxv4_extract_element_varying_indices(
 ; EE-INDICES:         [[XLEN:%.*]] = call i64 @llvm.vscale.i64()
-; EE-INDICES-NEXT:    [[T4:%.*]] = shl i64 [[XLEN]], 2
-; EE-INDICES-NEXT:    [[IDX0:%.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
-; EE-INDICES-NEXT:    [[IDXSCALE:%.*]] = shl <vscale x 4 x i32> [[IDX0]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, {{(i32|i64)}} 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; EE-INDICES-NEXT:    [[VS1:%.*]] = {{add|or}} <vscale x 4 x i32> [[IDXSCALE]], [[I1:%.*]]
-; EE-INDICES-NEXT:    [[T5:%.*]] = call <vscale x 16 x i32> @llvm.{{(experimental.)?}}vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[VS1]], i64 0)
+; EE-INDICES-NEXT:    [[T4:%.*]] = shl {{(nuw )?}}i64 [[XLEN]], 2
+; EE-INDICES-NEXT:    [[IDX0:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; EE-INDICES-NEXT:    [[IDXSCALE:%.*]] = shl <vscale x 4 x i32> [[IDX0]], splat (i32 2)
+; EE-INDICES-NEXT:    [[VS1:%.*]] = or disjoint <vscale x 4 x i32> [[IDXSCALE]], [[I1:%.*]]
+; EE-INDICES-NEXT:    [[T5:%.*]] = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[VS1]], i64 0)
 ; EE-INDICES-NEXT:    [[T6:%.*]] = call <vscale x 16 x float> @llvm.riscv.vrgather.vv.nxv16f32.i64(<vscale x 16 x float> poison, <vscale x 16 x float> [[T3:%.*]], <vscale x 16 x i32> [[T5]], i64 [[T4]])
-; EE-INDICES-NEXT:    [[T7:%.*]] = call <vscale x 4 x float> @llvm.{{(experimental.)?}}vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[T6]], i64 0)
+; EE-INDICES-NEXT:    [[T7:%.*]] = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[T6]], i64 0)
 
 ; Check we promote from i1 to i8 before doing our memops and use vrgatherei16.
 ; EE-BOOL-LABEL: @__vecz_nxv4_extract_element_bool(
 ; EE-BOOL:       [[T6:%.*]] = sext <vscale x 16 x i1> [[T5:%.*]] to <vscale x 16 x i8>
 ; EE-BOOL-NEXT:  [[XLEN:%.*]] = call i64 @llvm.vscale.i64()
-; EE-BOOL-NEXT:  [[T7:%.*]] = shl i64 [[XLEN]], 2
+; EE-BOOL-NEXT:  [[T7:%.*]] = shl {{(nuw )?}}i64 [[XLEN]], 2
 ; EE-BOOL-NEXT:  [[T8:%.*]] = trunc <vscale x 4 x i64> [[T0:%.*]] to <vscale x 4 x i16>
-; EE-BOOL-NEXT:  [[T9:%.*]] = and <vscale x 4 x i16> [[T8]], trunc (<vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 3, {{(i32|i64)}} 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer) to <vscale x 4 x i16>)
-; EE-BOOL-NEXT:  [[T10:%.*]] = call <vscale x 4 x i16> @llvm.experimental.stepvector.nxv4i16()
-; EE-BOOL-NEXT:  [[T11:%.*]] = shl <vscale x 4 x i16> [[T10]], shufflevector (<vscale x 4 x i16> insertelement (<vscale x 4 x i16> poison, i16 2, {{(i32|i64)}} 0), <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer)
-; EE-BOOL-NEXT:  [[VS1:%.*]] = {{add|or}} <vscale x 4 x i16> [[T11]], [[T9]]
-; EE-BOOL-NEXT:  [[T12:%.*]] = call <vscale x 16 x i16> @llvm.{{(experimental.)?}}vector.insert.nxv16i16.nxv4i16(<vscale x 16 x i16> poison, <vscale x 4 x i16> [[VS1]], i64 0)
+; EE-BOOL-NEXT:  [[T9:%.*]] = and <vscale x 4 x i16> [[T8]], {{splat \(i16 3\)|trunc \(<vscale x 4 x i64> splat \(i64 3\) to <vscale x 4 x i16>\)}}
+; EE-BOOL-NEXT:  [[T10:%.*]] = call <vscale x 4 x i16> @llvm.stepvector.nxv4i16()
+; EE-BOOL-NEXT:  [[T11:%.*]] = shl <vscale x 4 x i16> [[T10]], splat (i16 2)
+; EE-BOOL-NEXT:  [[VS1:%.*]] = or disjoint <vscale x 4 x i16> [[T11]], [[T9]]
+; EE-BOOL-NEXT:  [[T12:%.*]] = call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv4i16(<vscale x 16 x i16> poison, <vscale x 4 x i16> [[VS1]], i64 0)
 ; EE-BOOL-NEXT:  [[T13:%.*]] = call <vscale x 16 x i8> @llvm.riscv.vrgatherei16.vv.nxv16i8.i64(<vscale x 16 x i8> poison, <vscale x 16 x i8> [[T6]], <vscale x 16 x i16> [[T12]], i64 [[T7]])
-; EE-BOOL-NEXT:  [[T14:%.*]] = call <vscale x 4 x i8> @llvm.{{(experimental.)?}}vector.extract.nxv4i8.nxv16i8(<vscale x 16 x i8> [[T13]], i64 0)
+; EE-BOOL-NEXT:  [[T14:%.*]] = call <vscale x 4 x i8> @llvm.vector.extract.nxv4i8.nxv16i8(<vscale x 16 x i8> [[T13]], i64 0)
 ; EE-BOOL-NEXT:  [[T15:%.*]] = trunc <vscale x 4 x i8> [[T14]] to <vscale x 4 x i1>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/insert_element.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/insert_element.ll
index 2ef110b4b99b5..782982d0447ee 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/insert_element.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/insert_element.ll
@@ -99,13 +99,13 @@ entry:
 ; IE:         [[SPLATINSERT:%.*]] = insertelement <vscale x 4 x float> poison, float [[VAL:%.*]], {{(i32|i64)}} 0
 ; IE:         [[SPLAT:%.*]] = shufflevector <vscale x 4 x float> [[SPLATINSERT]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
 ; IE:         [[XLEN:%.*]] = call i64 @llvm.vscale.i64()
-; IE-NEXT:    [[TMP2:%.*]] = shl i64 [[XLEN]], 4
+; IE-NEXT:    [[TMP2:%.*]] = shl {{(nuw )?}}i64 [[XLEN]], 4
 ; IE-NEXT:    [[SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[IDX:%.*]], {{(i32|i64)}} 0
 ; IE-NEXT:    [[SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; IE-NEXT:    [[ELTS:%.*]] = call <vscale x 16 x float> @llvm.{{(experimental.)?}}vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[SPLAT]], i64 0)
-; IE-NEXT:    [[STEP:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
-; IE-NEXT:    [[INNER:%.*]] = and <vscale x 16 x i32> [[STEP]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 3, {{i32|i64}} 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
-; IE-NEXT:    [[OUTER:%.*]] = lshr <vscale x 16 x i32> [[STEP]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 2, {{i32|i64}} 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
+; IE-NEXT:    [[ELTS:%.*]] = call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[SPLAT]], i64 0)
+; IE-NEXT:    [[STEP:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; IE-NEXT:    [[INNER:%.*]] = and <vscale x 16 x i32> [[STEP]], splat (i32 3)
+; IE-NEXT:    [[OUTER:%.*]] = lshr <vscale x 16 x i32> [[STEP]], splat (i32 2)
 ; IE-NEXT:    [[VM:%.*]] = icmp eq <vscale x 16 x i32> [[SPLAT2]], [[INNER]]
 ; IE-NEXT:    [[TMP8:%.*]] = call <vscale x 16 x float> @llvm.riscv.vrgather.vv.mask.nxv16f32.i64(<vscale x 16 x float> [[TMP1:%.*]], <vscale x 16 x float> [[ELTS]], <vscale x 16 x i32> [[OUTER]], <vscale x 16 x i1> [[VM]], i64 [[TMP2]]{{(, i64 1)?}})
 
@@ -117,13 +117,13 @@ entry:
 ; IE-INDICES-LABEL: @__vecz_nxv4_insert_element_varying_indices(
 ; IE-INDICES:         [[FIDX2:%.*]] = uitofp <vscale x 4 x i64> [[TMP0:%.*]] to <vscale x 4 x float>
 ; IE-INDICES-NEXT:    [[XLEN:%.*]] = call i64 @llvm.vscale.i64()
-; IE-INDICES-NEXT:    [[TMP5:%.*]] = shl i64 [[XLEN]], 4
-; IE-INDICES-NEXT:    [[VS2:%.*]] = call <vscale x 16 x i32> @llvm.{{(experimental.)?}}vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> {{%.*}}, i64 0)
-; IE-INDICES:         [[IDX0:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
-; IE-INDICES-NEXT:    [[IDX1:%.*]] = lshr <vscale x 16 x i32> [[IDX0]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 2, {{(i32|i64)}} 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
+; IE-INDICES-NEXT:    [[TMP5:%.*]] = shl {{(nuw )?}}i64 [[XLEN]], 4
+; IE-INDICES-NEXT:    [[VS2:%.*]] = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> {{%.*}}, i64 0)
+; IE-INDICES:         [[IDX0:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; IE-INDICES-NEXT:    [[IDX1:%.*]] = lshr <vscale x 16 x i32> [[IDX0]], splat (i32 2)
 ; IE-INDICES-NEXT:    [[TMP9:%.*]] = call <vscale x 16 x i32> @llvm.riscv.vrgather.vv.nxv16i32.i64(<vscale x 16 x i32> poison, <vscale x 16 x i32> [[VS2:%.*]], <vscale x 16 x i32> [[IDX1]], i64 [[TMP5]])
-; IE-INDICES-NEXT:    [[VS25:%.*]] = call <vscale x 16 x float> @llvm.{{(experimental.)?}}vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[FIDX2]], i64 0)
-; IE-INDICES-NEXT:    [[INNER:%.*]] = and <vscale x 16 x i32> [[IDX0]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 3, {{i32|i64}} 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
+; IE-INDICES-NEXT:    [[VS25:%.*]] = call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[FIDX2]], i64 0)
+; IE-INDICES-NEXT:    [[INNER:%.*]] = and <vscale x 16 x i32> [[IDX0]], splat (i32 3)
 ; IE-INDICES-NEXT:    [[VM:%.*]] = icmp eq <vscale x 16 x i32> [[TMP9]], [[INNER]]
 ; IE-INDICES-NEXT:    [[TMP11:%.*]] = call <vscale x 16 x float> @llvm.riscv.vrgather.vv.mask.nxv16f32.i64(<vscale x 16 x float> [[TMP4:%.*]], <vscale x 16 x float> [[VS25]], <vscale x 16 x i32> [[IDX1]], <vscale x 16 x i1> [[VM]], i64 [[TMP5]]{{(, i64 1)?}})
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/select_scalar_vector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/select_scalar_vector.ll
index 4999374cd5d80..8af9cb06320bf 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/select_scalar_vector.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/select_scalar_vector.ll
@@ -41,9 +41,9 @@ entry:
 ; CHECK: [[rhs:%.*]] = load <vscale x 8 x i32>, ptr
 ; CHECK: [[cmp1:%.*]] = icmp slt <vscale x 4 x i32>
 ; CHECK: [[sext:%.*]] = sext <vscale x 4 x i1> [[cmp1]] to <vscale x 4 x i8>
-; CHECK: [[idx0:%.*]] = call <vscale x 8 x i16> @llvm.experimental.stepvector.nxv8i16()
-; CHECK: [[idx1:%.*]] = lshr <vscale x 8 x i16> [[idx0]], shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 1, {{i32|i64}} 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK: [[idx0:%.*]] = call <vscale x 8 x i16> @llvm.stepvector.nxv8i16()
+; CHECK: [[idx1:%.*]] = lshr <vscale x 8 x i16> [[idx0]], splat (i16 1)
 ; CHECK: [[gather:%.*]] = call <vscale x 8 x i8> @llvm.riscv.vrgatherei16.vv.nxv8i8.i64(<vscale x 8 x i8> poison, <vscale x 8 x i8> [[vs2:%.*]], <vscale x 8 x i16> [[vs1:%.*]], i64 [[xlen:%.*]])
 ; CHECK: [[cmp:%.*]] = trunc <vscale x 8 x i8> [[gather]] to <vscale x 8 x i1>
-; CHECK: [[sel:%.*]] = select <vscale x 8 x i1> [[cmp]], <vscale x 8 x i32> [[rhs]], <vscale x 8 x i32> shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 4, {{i32|i64}} 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK: [[sel:%.*]] = select <vscale x 8 x i1> [[cmp]], <vscale x 8 x i32> [[rhs]], <vscale x 8 x i32> splat (i32 4)
 ; CHECK: store <vscale x 8 x i32> [[sel]],
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_memops.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_memops.ll
index a206c8b781c32..7ab9888ec9b91 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_memops.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_memops.ll
@@ -39,19 +39,19 @@ ret:
   ret void
 }
 
-; CHECK-STORE-4:       define void @__vecz_b_masked_store4_vp_u5nxv4ju3ptrU3AS1u5nxv4bj(<vscale x 4 x i32> [[TMP0:%.*]], ptr addrspace(1) [[TMP1:%.*]], <vscale x 4 x i1> [[TMP2:%.*]], i32 [[TMP3:%.*]]) {
+; CHECK-STORE-4:       define void @__vecz_b_masked_store4_vp_u5nxv4ju3ptrU3AS1u5nxv4bj(<vscale x 4 x i32> [[TMP0:%.*]], ptr addrspace(1) [[TMP1:%.*]], <vscale x 4 x i1> [[TMP2:%.*]], i32 [[TMP3:%.*]])
 ; CHECK-STORE-4-NEXT:  entry:
 ; CHECK-STORE-4-NEXT:    call void @llvm.vp.store.nxv4i32.p1(<vscale x 4 x i32> [[TMP0]], ptr addrspace(1) [[TMP1]], <vscale x 4 x i1> [[TMP2]], i32 [[TMP3]])
 ; CHECK-STORE-4-NEXT:    ret void
 
-; CHECK-STORE-8:       define void @__vecz_b_masked_store4_vp_u5nxv8ju3ptrU3AS1u5nxv8bj(<vscale x 8 x i32> [[TMP0:%.*]], ptr addrspace(1) [[TMP1:%.*]], <vscale x 8 x i1> [[TMP2:%.*]], i32 [[TMP3:%.*]]) {
+; CHECK-STORE-8:       define void @__vecz_b_masked_store4_vp_u5nxv8ju3ptrU3AS1u5nxv8bj(<vscale x 8 x i32> [[TMP0:%.*]], ptr addrspace(1) [[TMP1:%.*]], <vscale x 8 x i1> [[TMP2:%.*]], i32 [[TMP3:%.*]])
 ; CHECK-STORE-8-NEXT:  entry:
 ; CHECK-STORE-8-NEXT:    call void @llvm.vp.store.nxv8i32.p1(<vscale x 8 x i32> [[TMP0]], ptr addrspace(1) [[TMP1]], <vscale x 8 x i1> [[TMP2]], i32 [[TMP3]])
 ; CHECK-STORE-8-NEXT:    ret void
 
-; CHECK-STORE-16:       define void @__vecz_b_masked_store4_vp_u6nxv16ju3ptrU3AS1u6nxv16bj(<vscale x 16 x i32> [[TMP0:%.*]], ptr addrspace(1) [[TMP1:%.*]], <vscale x 16 x i1> [[TMP2:%.*]], i32 [[TMP3:%.*]]) {
+; CHECK-STORE-16:       define void @__vecz_b_masked_store4_vp_u6nxv16ju3ptrU3AS1u6nxv16bj(<vscale x 16 x i32> [[TMP0:%.*]], ptr addrspace(1) [[TMP1:%.*]], <vscale x 16 x i1> [[TMP2:%.*]], i32 [[TMP3:%.*]])
 ; CHECK-STORE-16-NEXT:  entry:
-; CHECK-STORE-16-NEXT:    [[TMP5:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
+; CHECK-STORE-16-NEXT:    [[TMP5:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
 ; CHECK-STORE-16-NEXT:    [[SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP3]], {{i32|i64}} 0
 ; CHECK-STORE-16-NEXT:    [[SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; CHECK-STORE-16-NEXT:    [[TMP6:%.*]] = icmp ult <vscale x 16 x i32> [[TMP5]], [[SPLAT]]
@@ -76,19 +76,19 @@ ret:
   ret void
 }
 
-; CHECK-LOAD-4:      define <vscale x 4 x i32> @__vecz_b_masked_load4_vp_u5nxv4ju3ptrU3AS1u5nxv4bj(ptr addrspace(1) [[TMP0:%.*]], <vscale x 4 x i1> [[TMP1:%.*]], i32 [[TMP2:%.*]]) {
+; CHECK-LOAD-4:      define <vscale x 4 x i32> @__vecz_b_masked_load4_vp_u5nxv4ju3ptrU3AS1u5nxv4bj(ptr addrspace(1) [[TMP0:%.*]], <vscale x 4 x i1> [[TMP1:%.*]], i32 [[TMP2:%.*]])
 ; CHECK-LOAD-4-NEXT: entry:
 ; CHECK-LOAD-4-NEXT:   [[TMP4:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p1(ptr addrspace(1) [[TMP0]], <vscale x 4 x i1> [[TMP1]], i32 [[TMP2]])
 ; CHECK-LOAD-4-NEXT:   ret <vscale x 4 x i32> [[TMP4]]
 
-; CHECK-LOAD-8:      define <vscale x 8 x i32> @__vecz_b_masked_load4_vp_u5nxv8ju3ptrU3AS1u5nxv8bj(ptr addrspace(1) [[TMP0:%.*]], <vscale x 8 x i1> [[TMP1:%.*]], i32 [[TMP2:%.*]]) {
+; CHECK-LOAD-8:      define <vscale x 8 x i32> @__vecz_b_masked_load4_vp_u5nxv8ju3ptrU3AS1u5nxv8bj(ptr addrspace(1) [[TMP0:%.*]], <vscale x 8 x i1> [[TMP1:%.*]], i32 [[TMP2:%.*]])
 ; CHECK-LOAD-8-NEXT: entry:
 ; CHECK-LOAD-8-NEXT:   [[TMP4:%.*]] = call <vscale x 8 x i32> @llvm.vp.load.nxv8i32.p1(ptr addrspace(1) [[TMP0]], <vscale x 8 x i1> [[TMP1]], i32 [[TMP2]])
 ; CHECK-LOAD-8-NEXT:   ret <vscale x 8 x i32> [[TMP4]]
 
-; CHECK-LOAD-16:      define <vscale x 16 x i32> @__vecz_b_masked_load4_vp_u6nxv16ju3ptrU3AS1u6nxv16bj(ptr addrspace(1) [[TMP0:%.*]], <vscale x 16 x i1> [[TMP1:%.*]], i32 [[TMP2:%.*]]) {
+; CHECK-LOAD-16:      define <vscale x 16 x i32> @__vecz_b_masked_load4_vp_u6nxv16ju3ptrU3AS1u6nxv16bj(ptr addrspace(1) [[TMP0:%.*]], <vscale x 16 x i1> [[TMP1:%.*]], i32 [[TMP2:%.*]])
 ; CHECK-LOAD-16-NEXT: entry:
-; CHECK-LOAD-16-NEXT: [[TMP4:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
+; CHECK-LOAD-16-NEXT: [[TMP4:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
 ; CHECK-LOAD-16-NEXT: [[TMPSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP2]], {{i32|i64}} 0
 ; CHECK-LOAD-16-NEXT: [[TMPSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[TMPSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; CHECK-LOAD-16-NEXT: [[TMP5:%.*]] = icmp ult <vscale x 16 x i32> [[TMP4]], [[TMPSPLAT]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_vsetvli.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_vsetvli.ll
index 4c201f654acde..7823d56291ac5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_vsetvli.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_vsetvli.ll
@@ -38,8 +38,8 @@ entry:
 ; CHECK: %local.id = call i64 @__mux_get_local_id(i32 0)
 ; CHECK: %local.size = call i64 @__mux_get_local_size(i32 0)
 ; CHECK: %work.remaining = sub nuw nsw i64 %local.size, %local.id
-; CHECK: %[[vli64:.+]] = call i64 @llvm.riscv.vsetvli.opt.i64(i64 %work.remaining, i64 2, i64 1)
-; CHECK: %[[vl:.+]] = trunc i64 %[[vli64]] to i32
+; CHECK: %[[vli64:.+]] = call i64 @llvm.riscv.vsetvli.i64(i64 %work.remaining, i64 2, i64 1)
+; CHECK: %[[vl:.+]] = trunc nuw i64 %[[vli64]] to i32
 ; CHECK: %[[lhs:.+]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0({{.*}}, i32 %[[vl]])
 ; CHECK: %[[rhs:.+]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0({{.*}}, i32 %[[vl]])
 ; CHECK: %[[sum:.+]] = call <vscale x 4 x i32> @llvm.vp.add.nxv4i32(<vscale x 4 x i32> %[[lhs]], <vscale x 4 x i32> %[[rhs]], {{.*}}, i32 %[[vl]])

From 25f934d92a9d7f1771a10a926710a7cf6341c1db Mon Sep 17 00:00:00 2001
From: Colin Davidson <colin.davidson@codeplay.com>
Date: Fri, 5 Sep 2025 13:43:44 +0100
Subject: [PATCH 181/182] [SYCL][NATIVE_CPU] Resolve formatting issues on
 compiler_pipeline

Apply the clang-format-20 and Python formatting fixes that were required
after integrating the oneAPI Construction Kit pipeline.
---
 .../include/compiler/utils/address_spaces.h   |    6 +-
 .../include/compiler/utils/attributes.h       |    8 +-
 .../include/compiler/utils/barrier_regions.h  |   19 +-
 .../include/compiler/utils/builtin_info.h     |   97 +-
 .../include/compiler/utils/cl_builtin_info.h  |   24 +-
 .../compiler/utils/define_mux_builtins_pass.h |    8 +-
 .../include/compiler/utils/device_info.h      |   14 +-
 .../include/compiler/utils/dma.h              |    8 +-
 .../utils/encode_kernel_metadata_pass.h       |    8 +-
 .../compiler/utils/group_collective_helpers.h |    8 +-
 .../include/compiler/utils/mangling.h         |   28 +-
 .../include/compiler/utils/metadata.h         |   28 +-
 .../utils/optimal_builtin_replacement_pass.h  |   10 +-
 .../include/compiler/utils/pass_functions.h   |    8 +-
 .../include/compiler/utils/pass_machinery.h   |   10 +-
 .../compiler/utils/prepare_barriers_pass.h    |    8 +-
 ...eplace_local_module_scope_variables_pass.h |    8 +-
 .../include/compiler/utils/scheduling.h       |    8 +-
 .../compiler/utils/sub_group_analysis.h       |   18 +-
 .../compiler/utils/target_extension_types.h   |   62 +-
 .../utils/unique_opaque_structs_pass.h        |    8 +-
 .../compiler/utils/work_item_loops_pass.h     |   10 +-
 .../include/multi_llvm/instructions.h         |   10 +-
 .../include/multi_llvm/instructions.inc       |   60 +-
 .../include/multi_llvm/intrinsic.h            |    8 +-
 .../include/multi_llvm/llvm_version.h         |   16 +-
 .../include/multi_llvm/multi_llvm.h           |    2 +-
 .../multi_llvm/target_transform_info.h        |    6 +-
 .../include/multi_llvm/targetinfo.h           |    6 +-
 .../include/multi_llvm/vector_type_helper.h   |    4 +-
 .../compiler_pipeline/source/attributes.cpp   |   28 +-
 .../source/barrier_regions.cpp                |   26 +-
 .../compiler_pipeline/source/builtin_info.cpp | 1223 +++++----
 .../source/cl_builtin_info.cpp                | 2395 +++++++++--------
 .../source/define_mux_builtins_pass.cpp       |    5 +-
 .../compiler_pipeline/source/dma.cpp          |    4 +-
 .../source/encode_kernel_metadata_pass.cpp    |   10 +-
 .../source/group_collective_helpers.cpp       |   66 +-
 .../compiler_pipeline/source/mangling.cpp     |  232 +-
 .../compiler_pipeline/source/metadata.cpp     |   12 +-
 .../source/mux_builtin_info.cpp               |  833 +++---
 .../optimal_builtin_replacement_pass.cpp      |   10 +-
 .../source/pass_functions.cpp                 |   91 +-
 .../source/pass_machinery.cpp                 |    8 +-
 .../source/prepare_barriers_pass.cpp          |    5 +-
 ...lace_local_module_scope_variables_pass.cpp |    2 +-
 .../compiler_pipeline/source/scheduling.cpp   |    4 +-
 .../source/sub_group_analysis.cpp             |   26 +-
 .../source/target_extension_types.cpp         |   30 +-
 .../source/unique_opaque_structs_pass.cpp     |   21 +-
 .../source/work_item_loops_pass.cpp           |  341 ++-
 .../compiler_passes/vecz/include/vecz/pass.h  |   20 +-
 .../vecz/include/vecz/vecz_choices.h          |   10 +-
 .../vecz/include/vecz/vecz_target_info.h      |   90 +-
 .../source/analysis/divergence_analysis.cpp   |    8 +-
 .../analysis/instantiation_analysis.cpp       |    4 +-
 .../source/analysis/liveness_analysis.cpp     |   10 +-
 .../analysis/packetization_analysis.cpp       |    6 +-
 .../source/analysis/simd_width_analysis.cpp   |    6 +-
 .../analysis/uniform_value_analysis.cpp       |   12 +-
 .../vectorizable_function_analysis.cpp        |   23 +-
 .../analysis/vectorization_unit_analysis.cpp  |   10 +-
 .../vecz/source/control_flow_boscc.cpp        |   11 +-
 .../vecz/source/control_flow_roscc.cpp        |    2 +-
 .../compiler_passes/vecz/source/debugging.cpp |    2 +-
 .../include/analysis/control_flow_analysis.h  |   10 +-
 .../include/analysis/divergence_analysis.h    |   14 +-
 .../include/analysis/instantiation_analysis.h |    6 +-
 .../include/analysis/liveness_analysis.h      |   12 +-
 .../include/analysis/packetization_analysis.h |   14 +-
 .../include/analysis/simd_width_analysis.h    |    8 +-
 .../source/include/analysis/stride_analysis.h |   16 +-
 .../include/analysis/uniform_value_analysis.h |   10 +-
 .../analysis/vectorizable_function_analysis.h |    8 +-
 .../analysis/vectorization_unit_analysis.h    |   16 +-
 .../vecz/source/include/control_flow_boscc.h  |   20 +-
 .../vecz/source/include/control_flow_roscc.h  |   10 +-
 .../vecz/source/include/debugging.h           |  106 +-
 .../vecz/source/include/ir_cleanup.h          |    8 +-
 .../vecz/source/include/llvm_helpers.h        |    4 +-
 .../vecz/source/include/memory_operations.h   |   59 +-
 .../vecz/source/include/offset_info.h         |    8 +-
 .../vecz/source/include/reachability.h        |   10 +-
 .../vecz/source/include/simd_packet.h         |    4 +-
 .../transform/common_gep_elimination_pass.h   |    8 +-
 .../transform/control_flow_conversion_pass.h  |   16 +-
 .../inline_post_vectorization_pass.h          |    6 +-
 .../include/transform/instantiation_pass.h    |    8 +-
 .../interleaved_group_combine_pass.h          |    8 +-
 .../include/transform/packetization_helpers.h |   10 +-
 .../include/transform/packetization_pass.h    |    6 +-
 .../source/include/transform/packetizer.h     |   12 +-
 .../vecz/source/include/transform/passes.h    |   32 +-
 .../include/transform/printf_scalarizer.h     |   22 +-
 .../include/transform/scalarization_pass.h    |   10 +-
 .../source/include/transform/scalarizer.h     |   10 +-
 .../transform/ternary_transform_pass.h        |    6 +-
 .../source/include/vectorization_context.h    |   29 +-
 .../source/include/vectorization_helpers.h    |    6 +-
 .../source/include/vectorization_heuristics.h |    6 +-
 .../vecz/source/include/vectorization_unit.h  |   15 +-
 .../vecz/source/include/vectorizer.h          |    6 +-
 .../vecz/source/include/vecz_pass_builder.h   |   10 +-
 .../vecz/source/ir_cleanup.cpp                |    2 +-
 .../vecz/source/memory_operations.cpp         |  114 +-
 .../vecz/source/offset_info.cpp               |  129 +-
 .../compiler_passes/vecz/source/pass.cpp      |   12 +-
 .../vecz/source/reachability.cpp              |    2 +-
 .../control_flow_conversion_pass.cpp          |  167 +-
 .../inline_post_vectorization_pass.cpp        |    6 +-
 .../source/transform/instantiation_pass.cpp   |   14 +-
 .../interleaved_group_combine_pass.cpp        |   18 +-
 .../transform/loop_rotate_custom_pass.cpp     |    9 +-
 .../transform/packetization_helpers.cpp       |   92 +-
 .../source/transform/packetization_pass.cpp   |    5 +-
 .../vecz/source/transform/packetizer.cpp      |  345 ++-
 .../vecz/source/transform/passes.cpp          |    2 +-
 .../source/transform/pre_linearize_pass.cpp   |   98 +-
 .../source/transform/printf_scalarizer.cpp    |  277 +-
 .../source/transform/scalarization_pass.cpp   |    2 +-
 .../vecz/source/transform/scalarizer.cpp      |  232 +-
 .../transform/simplify_infinite_loop_pass.cpp |    7 +-
 .../transform/ternary_transform_pass.cpp      |    2 +-
 .../transform/uniform_reassociation_pass.cpp  |    8 +-
 .../vecz/source/vector_target_info.cpp        |   78 +-
 .../vecz/source/vector_target_info_arm.cpp    |  102 +-
 .../vecz/source/vector_target_info_riscv.cpp  |   76 +-
 .../vecz/source/vectorization_choices.cpp     |    4 +-
 .../vecz/source/vectorization_context.cpp     |   79 +-
 .../vecz/source/vectorization_helpers.cpp     |   13 +-
 .../vecz/source/vectorization_heuristics.cpp  |   90 +-
 .../vecz/source/vectorization_unit.cpp        |   17 +-
 .../vecz/source/vectorizer.cpp                |   13 +-
 .../vecz/source/vecz_pass_builder.cpp         |   32 +-
 .../vecz/tools/source/veczc.cpp               |   78 +-
 135 files changed, 4401 insertions(+), 4397 deletions(-)

diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/address_spaces.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/address_spaces.h
index 42097cdcb900e..228097d1434d8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/address_spaces.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/address_spaces.h
@@ -32,7 +32,7 @@ enum {
   Generic = 4,
 };
 }
-}  // namespace utils
-}  // namespace compiler
+} // namespace utils
+} // namespace compiler
 
-#endif  // COMPILER_UTILS_ADDRESS_SPACES_H_INCLUDED
+#endif // COMPILER_UTILS_ADDRESS_SPACES_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/attributes.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/attributes.h
index 177eaa0a432d8..3ea0a5fad08ca 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/attributes.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/attributes.h
@@ -24,7 +24,7 @@
 namespace llvm {
 class CallInst;
 class Function;
-}  // namespace llvm
+} // namespace llvm
 
 namespace compiler {
 namespace utils {
@@ -180,7 +180,7 @@ bool hasNoExplicitSubgroups(const llvm::Function &F);
 /// Currently always returns 1!
 unsigned getMuxSubgroupSize(const llvm::Function &F);
 
-}  // namespace utils
-}  // namespace compiler
+} // namespace utils
+} // namespace compiler
 
-#endif  // COMPILER_UTILS_ATTRIBUTES_H_INCLUDED
+#endif // COMPILER_UTILS_ATTRIBUTES_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/barrier_regions.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/barrier_regions.h
index 9bae40595d480..701ac4d0f3102 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/barrier_regions.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/barrier_regions.h
@@ -45,7 +45,7 @@ class Module;
 class StructType;
 class Type;
 class Value;
-}  // namespace llvm
+} // namespace llvm
 
 namespace compiler {
 namespace utils {
@@ -84,17 +84,14 @@ struct BarrierRegion {
 };
 
 class Barrier {
- public:
+public:
   /// @brief Type for ids of new kernel functions
   using kernel_id_map_t = std::map<unsigned, llvm::Function *>;
 
   Barrier(llvm::Module &m, llvm::Function &f, bool IsDebug)
       : live_var_mem_ty_(nullptr),
-        size_t_bytes(compiler::utils::getSizeTypeBytes(m)),
-        module_(m),
-        func_(f),
-        is_debug_(IsDebug),
-        max_live_var_alignment(0) {}
+        size_t_bytes(compiler::utils::getSizeTypeBytes(m)), module_(m),
+        func_(f), is_debug_(IsDebug), max_live_var_alignment(0) {}
 
   /// @brief perform the Barrier Region analysis and kernel splitting
   void Run(llvm::ModuleAnalysisManager &mam);
@@ -204,7 +201,7 @@ class Barrier {
                            const char *name, bool reuse = false);
   };
 
- private:
+private:
   /// @brief The first is set for livein and the second is set for liveout
   using live_in_out_t =
       std::pair<llvm::DenseSet<llvm::Value *>, llvm::DenseSet<llvm::Value *>>;
@@ -362,7 +359,7 @@ class Barrier {
   void SeperateKernelWithBarrier();
 };
 
-}  // namespace utils
-}  // namespace compiler
+} // namespace utils
+} // namespace compiler
 
-#endif  // COMPILER_UTILS_BARRIER_REGIONS_H_INCLUDED
+#endif // COMPILER_UTILS_BARRIER_REGIONS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/builtin_info.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/builtin_info.h
index 8b77ed7ee38da..b88b82aab6123 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/builtin_info.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/builtin_info.h
@@ -294,7 +294,7 @@ constexpr const char set_local_id[] = "__mux_set_local_id";
 constexpr const char set_sub_group_id[] = "__mux_set_sub_group_id";
 constexpr const char set_num_sub_groups[] = "__mux_set_num_sub_groups";
 constexpr const char set_max_sub_group_size[] = "__mux_set_max_sub_group_size";
-}  // namespace MuxBuiltins
+} // namespace MuxBuiltins
 
 static inline llvm::Type *getPointerReturnPointeeTy(const llvm::Function &F,
                                                     BuiltinProperties Props) {
@@ -336,7 +336,7 @@ class BILangInfoConcept;
 /// It contains an optional BILangInfoConcept implementation to provide builtin
 /// information on a target-by-target basis.
 class BuiltinInfo {
- public:
+public:
   // Default-construct a BuiltinInfo without a concrete set of language-level
   // builtins.
   BuiltinInfo() : MuxImpl(std::make_unique<BIMuxInfoConcept>()) {}
@@ -398,9 +398,10 @@ class BuiltinInfo {
   /// the 3 dimensions that this target supports.
   /// @param[in] MaxGlobalSizes The maximum global work-group sizes in each of
   /// the 3 dimensions that this target supports.
-  std::optional<llvm::ConstantRange> getBuiltinRange(
-      llvm::CallInst &CI, std::array<std::optional<uint64_t>, 3> MaxLocalSizes,
-      std::array<std::optional<uint64_t>, 3> MaxGlobalSizes) const;
+  std::optional<llvm::ConstantRange>
+  getBuiltinRange(llvm::CallInst &CI,
+                  std::array<std::optional<uint64_t>, 3> MaxLocalSizes,
+                  std::array<std::optional<uint64_t>, 3> MaxGlobalSizes) const;
 
   /// @brief Lowers a call to a language-level builtin to an instruction
   /// sequences calling a mux builtin.
@@ -447,8 +448,8 @@ class BuiltinInfo {
 
   /// @brief Returns the mux builtin ID matching the group collective, or
   /// eBuiltinInvalid.
-  static std::optional<BuiltinID> getMuxGroupCollective(
-      const GroupCollective &Group);
+  static std::optional<BuiltinID>
+  getMuxGroupCollective(const GroupCollective &Group);
 
   /// @brief Returns true if the mux builtin has a barrier ID as its first
   /// operand.
@@ -474,8 +475,9 @@ class BuiltinInfo {
   ///
   /// @param OverloadInfo An array of types required to resolve certain
   /// overloadable builtins, e.g., group builtins.
-  static std::string getMuxBuiltinName(
-      BuiltinID ID, llvm::ArrayRef<llvm::Type *> OverloadInfo = {});
+  static std::string
+  getMuxBuiltinName(BuiltinID ID,
+                    llvm::ArrayRef<llvm::Type *> OverloadInfo = {});
 
   /// @brief Mangles a type using the LLVM intrinsic scheme
   ///
@@ -495,8 +497,8 @@ class BuiltinInfo {
   /// if it was unable to demangle a type.
   ///
   /// @see getMangledTypeStr
-  static std::pair<llvm::Type *, llvm::StringRef> getDemangledTypeFromStr(
-      llvm::StringRef TyStr, llvm::LLVMContext &Ctx);
+  static std::pair<llvm::Type *, llvm::StringRef>
+  getDemangledTypeFromStr(llvm::StringRef TyStr, llvm::LLVMContext &Ctx);
 
   /// @brief Defines the body of a ComputeMux builtin declaration
   ///
@@ -507,17 +509,17 @@ class BuiltinInfo {
   ///
   /// @param OverloadInfo An array of types required to resolve certain
   /// overloadable builtins, e.g., group builtins.
-  llvm::Function *defineMuxBuiltin(
-      BuiltinID, llvm::Module &M,
-      llvm::ArrayRef<llvm::Type *> OverloadInfo = {});
+  llvm::Function *
+  defineMuxBuiltin(BuiltinID, llvm::Module &M,
+                   llvm::ArrayRef<llvm::Type *> OverloadInfo = {});
 
   /// @brief Gets a ComputeMux builtin from the module, or declares it
   ///
   /// @param OverloadInfo An array of types required to resolve certain
   /// overloadable builtins, e.g., group builtins.
-  llvm::Function *getOrDeclareMuxBuiltin(
-      BuiltinID, llvm::Module &M,
-      llvm::ArrayRef<llvm::Type *> OverloadInfo = {});
+  llvm::Function *
+  getOrDeclareMuxBuiltin(BuiltinID, llvm::Module &M,
+                         llvm::ArrayRef<llvm::Type *> OverloadInfo = {});
 
   struct SchedParamInfo {
     /// @brief An identifier providing resolution for targets to identify
@@ -564,8 +566,8 @@ class BuiltinInfo {
   ///
   /// This function does not have to fill in SchedParamInfo::ArgVal, as this
   /// query is not specific to one function.
-  llvm::SmallVector<SchedParamInfo, 4> getMuxSchedulingParameters(
-      llvm::Module &);
+  llvm::SmallVector<SchedParamInfo, 4>
+  getMuxSchedulingParameters(llvm::Module &);
 
   /// @brief Returns target-specific scheduling parameters from a concrete
   /// function.
@@ -582,8 +584,8 @@ class BuiltinInfo {
   /// parameters.
   ///
   /// If not set, this function returns an empty list.
-  llvm::SmallVector<SchedParamInfo, 4> getFunctionSchedulingParameters(
-      llvm::Function &);
+  llvm::SmallVector<SchedParamInfo, 4>
+  getFunctionSchedulingParameters(llvm::Function &);
 
   /// @brief Responsible for initializing a scheduling parameter for which
   /// PassedExternally is 'false'.
@@ -641,7 +643,7 @@ class BuiltinInfo {
     return false;
   }
 
- private:
+private:
   /// @brief Try to identify a builtin function.
   /// @param[in] F The function to identify.
   /// @return Valid builtin ID if the name was identified, as well as any types
@@ -666,18 +668,18 @@ class BuiltinInfo {
 /// information and transformations to an instance of BuiltinInfo. All methods
 /// are to be called through from the equivalent methods in BuiltinInfo.
 class BIMuxInfoConcept {
- public:
+public:
   virtual ~BIMuxInfoConcept() = default;
 
   /// @brief See BuiltinInfo::defineMuxBuiltin.
-  virtual llvm::Function *defineMuxBuiltin(
-      BuiltinID, llvm::Module &M,
-      llvm::ArrayRef<llvm::Type *> OverloadInfo = {});
+  virtual llvm::Function *
+  defineMuxBuiltin(BuiltinID, llvm::Module &M,
+                   llvm::ArrayRef<llvm::Type *> OverloadInfo = {});
 
   /// @brief See BuiltinInfo::getOrDeclareMuxBuiltin.
-  virtual llvm::Function *getOrDeclareMuxBuiltin(
-      BuiltinID, llvm::Module &M,
-      llvm::ArrayRef<llvm::Type *> OverloadInfo = {});
+  virtual llvm::Function *
+  getOrDeclareMuxBuiltin(BuiltinID, llvm::Module &M,
+                         llvm::ArrayRef<llvm::Type *> OverloadInfo = {});
 
   /// @brief See BuiltinInfo::getMuxSchedulingParameters
   virtual llvm::SmallVector<BuiltinInfo::SchedParamInfo, 4>
@@ -710,9 +712,10 @@ class BIMuxInfoConcept {
   virtual llvm::Type *getRemappedTargetExtTy(llvm::Type *Ty, llvm::Module &M);
 
   /// @see BuiltinInfo::getBuiltinRange
-  virtual std::optional<llvm::ConstantRange> getBuiltinRange(
-      llvm::CallInst &, BuiltinID ID, std::array<std::optional<uint64_t>, 3>,
-      std::array<std::optional<uint64_t>, 3>) const;
+  virtual std::optional<llvm::ConstantRange>
+  getBuiltinRange(llvm::CallInst &, BuiltinID ID,
+                  std::array<std::optional<uint64_t>, 3>,
+                  std::array<std::optional<uint64_t>, 3>) const;
 
   enum MemScope : uint32_t {
     MemScopeCrossDevice = 0,
@@ -736,7 +739,7 @@ class BIMuxInfoConcept {
     MemSemanticsCrossWorkGroupMemory = 0x200,
   };
 
- protected:
+protected:
   llvm::Function *defineGetGlobalId(llvm::Module &M);
   llvm::Function *defineGetGlobalSize(llvm::Module &M);
   llvm::Function *defineGetLocalLinearId(llvm::Module &M);
@@ -783,14 +786,14 @@ class BIMuxInfoConcept {
 /// transformations to an instance of BuiltinInfo. All methods are to be called
 /// through from the equivalent methods in BuiltinInfo.
 class BILangInfoConcept {
- public:
+public:
   virtual ~BILangInfoConcept() = default;
 
   /// @see BuiltinInfo::getBuiltinsModule
   virtual llvm::Module *getBuiltinsModule() { return nullptr; }
   /// @see BuiltinInfo::analyzeBuiltin
-  virtual std::optional<Builtin> analyzeBuiltin(
-      const llvm::Function &F) const = 0;
+  virtual std::optional<Builtin>
+  analyzeBuiltin(const llvm::Function &F) const = 0;
   /// @see BuiltinInfo::isBuiltinUniform
   virtual BuiltinUniformity isBuiltinUniform(const Builtin &B,
                                              const llvm::CallInst *,
@@ -802,13 +805,13 @@ class BILangInfoConcept {
   virtual llvm::Function *getScalarEquivalent(const Builtin &B,
                                               llvm::Module *M) = 0;
   /// @see BuiltinInfo::emitBuiltinInline
-  virtual llvm::Value *emitBuiltinInline(
-      llvm::Function *Builtin, llvm::IRBuilder<> &B,
-      llvm::ArrayRef<llvm::Value *> Args) = 0;
+  virtual llvm::Value *
+  emitBuiltinInline(llvm::Function *Builtin, llvm::IRBuilder<> &B,
+                    llvm::ArrayRef<llvm::Value *> Args) = 0;
   /// @see BuiltinInfo::getBuiltinRange
-  virtual std::optional<llvm::ConstantRange> getBuiltinRange(
-      llvm::CallInst &, std::array<std::optional<uint64_t>, 3>,
-      std::array<std::optional<uint64_t>, 3>) const {
+  virtual std::optional<llvm::ConstantRange>
+  getBuiltinRange(llvm::CallInst &, std::array<std::optional<uint64_t>, 3>,
+                  std::array<std::optional<uint64_t>, 3>) const {
     return std::nullopt;
   }
 
@@ -826,7 +829,7 @@ class BuiltinInfoAnalysis
     : public llvm::AnalysisInfoMixin<BuiltinInfoAnalysis> {
   friend AnalysisInfoMixin<BuiltinInfoAnalysis>;
 
- public:
+public:
   using Result = BuiltinInfo;
   using CallbackFn = std::function<Result(const llvm::Module &)>;
 
@@ -842,7 +845,7 @@ class BuiltinInfoAnalysis
   /// @brief Return the name of the pass.
   static llvm::StringRef name() { return "BuiltinInfo analysis"; }
 
- private:
+private:
   /// @brief Unique pass identifier.
   static llvm::AnalysisKey Key;
 
@@ -851,7 +854,7 @@ class BuiltinInfoAnalysis
 };
 
 /// @}
-}  // namespace utils
-}  // namespace compiler
+} // namespace utils
+} // namespace compiler
 
-#endif  // COMPILER_UTILS_BUILTIN_INFO_H_INCLUDED
+#endif // COMPILER_UTILS_BUILTIN_INFO_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/cl_builtin_info.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/cl_builtin_info.h
index 5e6f3fe26e9b6..16be8450d5124 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/cl_builtin_info.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/cl_builtin_info.h
@@ -36,10 +36,10 @@ std::unique_ptr<BILangInfoConcept> createCLBuiltinInfo(llvm::Module *builtins);
 
 /// @brief Builtin loader base class.
 class CLBuiltinLoader {
- protected:
+protected:
   CLBuiltinLoader() = default;
 
- public:
+public:
   virtual ~CLBuiltinLoader() = default;
 
   /// @brief Load a builtin function.
@@ -59,7 +59,7 @@ class CLBuiltinLoader {
 
 /// @brief Simple Builtin loader wrapping a given builtins module.
 class SimpleCLBuiltinLoader final : public CLBuiltinLoader {
- public:
+public:
   SimpleCLBuiltinLoader(llvm::Module *builtins) : BuiltinModule(builtins) {}
 
   ~SimpleCLBuiltinLoader() = default;
@@ -67,7 +67,7 @@ class SimpleCLBuiltinLoader final : public CLBuiltinLoader {
   /// @brief Expose any builtins Module
   virtual llvm::Module *getBuiltinsModule() override { return BuiltinModule; }
 
- private:
+private:
   /// @brief Loaded builtins module.
   llvm::Module *BuiltinModule;
 };
@@ -75,7 +75,7 @@ class SimpleCLBuiltinLoader final : public CLBuiltinLoader {
 ///  @brief A class that encapsulates information and transformations concerning
 /// compiler OpenCL builtin functions.
 class CLBuiltinInfo : public BILangInfoConcept {
- public:
+public:
   /// @brief Constructs a CLBuiltinInfo from a given Builtins module
   CLBuiltinInfo(llvm::Module *Builtins);
 
@@ -108,12 +108,12 @@ class CLBuiltinInfo : public BILangInfoConcept {
   /// @see BuiltinInfo::getPrintfBuiltin
   std::optional<BuiltinID> getPrintfBuiltin() const override;
 
- private:
+private:
   std::optional<BuiltinID> identifyBuiltin(const llvm::Function &) const;
 
-  llvm::Function *materializeBuiltin(
-      llvm::StringRef BuiltinName, llvm::Module *DestM = nullptr,
-      BuiltinMatFlags Flags = eBuiltinMatDefault);
+  llvm::Function *
+  materializeBuiltin(llvm::StringRef BuiltinName, llvm::Module *DestM = nullptr,
+                     BuiltinMatFlags Flags = eBuiltinMatDefault);
 
   llvm::Instruction *lowerGroupBuiltinToMuxBuiltin(llvm::CallInst &CI,
                                                    BuiltinID ID,
@@ -210,7 +210,7 @@ class CLBuiltinInfo : public BILangInfoConcept {
 };
 
 /// @}
-}  // namespace utils
-}  // namespace compiler
+} // namespace utils
+} // namespace compiler
 
-#endif  // COMPILER_UTILS_CL_BUILTIN_INFO_H_INCLUDED
+#endif // COMPILER_UTILS_CL_BUILTIN_INFO_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/define_mux_builtins_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/define_mux_builtins_pass.h
index 525c125a886f5..af33fbce17788 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/define_mux_builtins_pass.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/define_mux_builtins_pass.h
@@ -26,11 +26,11 @@ namespace utils {
 
 class DefineMuxBuiltinsPass final
     : public llvm::PassInfoMixin<DefineMuxBuiltinsPass> {
- public:
+public:
   llvm::PreservedAnalyses run(llvm::Module &, llvm::ModuleAnalysisManager &);
 };
 
-}  // namespace utils
-}  // namespace compiler
+} // namespace utils
+} // namespace compiler
 
-#endif  // COMPILER_UTILS_DEFINE_MUX_BUILTINS_PASS_H_INCLUDED
+#endif // COMPILER_UTILS_DEFINE_MUX_BUILTINS_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/device_info.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/device_info.h
index 6ec701f758159..c1002430aadc1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/device_info.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/device_info.h
@@ -68,9 +68,7 @@ struct DeviceInfo {
   /// @param max_work_width  The maximum number of work-items of a work-group
   /// allowed to execute in one invocation of a kernel.
   DeviceInfo(uint32_t h, uint32_t f, uint32_t d, uint32_t max_work_width)
-      : half_capabilities(h),
-        float_capabilities(f),
-        double_capabilities(d),
+      : half_capabilities(h), float_capabilities(f), double_capabilities(d),
         max_work_width(max_work_width) {}
 
   uint32_t half_capabilities = 0;
@@ -99,7 +97,7 @@ struct DeviceInfo {
 class DeviceInfoAnalysis : public llvm::AnalysisInfoMixin<DeviceInfoAnalysis> {
   friend AnalysisInfoMixin<DeviceInfoAnalysis>;
 
- public:
+public:
   using Result = DeviceInfo;
 
   DeviceInfoAnalysis() = default;
@@ -113,7 +111,7 @@ class DeviceInfoAnalysis : public llvm::AnalysisInfoMixin<DeviceInfoAnalysis> {
   /// @brief Return the name of the pass.
   static llvm::StringRef name() { return "Device info analysis"; }
 
- private:
+private:
   /// @brief Optional device information
   std::optional<Result> Info;
 
@@ -121,7 +119,7 @@ class DeviceInfoAnalysis : public llvm::AnalysisInfoMixin<DeviceInfoAnalysis> {
   static llvm::AnalysisKey Key;
 };
 
-}  // namespace utils
-}  // namespace compiler
+} // namespace utils
+} // namespace compiler
 
-#endif  // COMPILER_UTILS_DEVICE_INFO_H_INCLUDED
+#endif // COMPILER_UTILS_DEVICE_INFO_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/dma.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/dma.h
index a5c13add7e21d..815188761f272 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/dma.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/dma.h
@@ -30,7 +30,7 @@ namespace llvm {
 class BasicBlock;
 class Module;
 class Value;
-}  // namespace llvm
+} // namespace llvm
 
 namespace compiler {
 namespace utils {
@@ -85,7 +85,7 @@ void buildThreadCheck(llvm::BasicBlock *entryBlock, llvm::BasicBlock *trueBlock,
 llvm::StructType *getOrCreateMuxDMAEventType(llvm::Module &m);
 
 /// @}
-}  // namespace utils
-}  // namespace compiler
+} // namespace utils
+} // namespace compiler
 
-#endif  // COMPILER_UTILS_DMA_H_INCLUDED
+#endif // COMPILER_UTILS_DMA_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/encode_kernel_metadata_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/encode_kernel_metadata_pass.h
index d3557ddf3034f..261a5bbc7d4f8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/encode_kernel_metadata_pass.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/encode_kernel_metadata_pass.h
@@ -50,11 +50,11 @@ struct EncodeKernelMetadataPass
 
   llvm::PreservedAnalyses run(llvm::Module &M, llvm::ModuleAnalysisManager &AM);
 
- private:
+private:
   std::string KernelName;
   std::optional<std::array<uint64_t, 3>> LocalSizes;
 };
-}  // namespace utils
-}  // namespace compiler
+} // namespace utils
+} // namespace compiler
 
-#endif  // COMPILER_UTILS_ENCODE_KERNEL_METADATA_PASS_H_INCLUDED
+#endif // COMPILER_UTILS_ENCODE_KERNEL_METADATA_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/group_collective_helpers.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/group_collective_helpers.h
index c565c3c93870f..fcbd07825fb22 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/group_collective_helpers.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/group_collective_helpers.h
@@ -27,7 +27,7 @@ namespace llvm {
 class Constant;
 class Function;
 class Type;
-}  // namespace llvm
+} // namespace llvm
 
 namespace compiler {
 namespace utils {
@@ -106,7 +106,7 @@ struct GroupCollective {
   /// @brief Returns true for work-group collective operations.
   bool isWorkGroupScope() const { return Scope == ScopeKind::WorkGroup; }
 };
-}  // namespace utils
-}  // namespace compiler
+} // namespace utils
+} // namespace compiler
 
-#endif  // COMPILER_UTILS_GROUP_COLLECTIVE_HELPERS_H_INCLUDED
+#endif // COMPILER_UTILS_GROUP_COLLECTIVE_HELPERS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/mangling.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/mangling.h
index abbd1abca093e..66e6a89bd5d43 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/mangling.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/mangling.h
@@ -31,7 +31,7 @@ namespace llvm {
 class LLVMContext;
 class Type;
 class raw_ostream;
-}  // namespace llvm
+} // namespace llvm
 
 namespace compiler {
 namespace utils {
@@ -59,7 +59,7 @@ enum TypeQualifier : int32_t {
 class TypeQualifiers final {
   using StorageT = uint64_t;
 
- public:
+public:
   /// @brief Create a type qualifier list with no qualifiers.
   TypeQualifiers();
   /// @brief Create a type qualifier list with one qualifiers.
@@ -124,7 +124,7 @@ class TypeQualifiers final {
   /// @brief Determine whether two qualifier lists are different.
   bool operator!=(const TypeQualifiers &other) { return !(*this == other); }
 
- private:
+private:
   /// @brief Set the number of type qualifiers contained in the list.
   void setCount(StorageT newCount);
 
@@ -148,7 +148,7 @@ class TypeQualifiers final {
 
 /// @brief Helps with light parsing such as demangling function names.
 class Lexer final {
- public:
+public:
   /// @brief Create a new lexer with the given text.
   ///
   /// @param[in] text Text to lex.
@@ -216,7 +216,7 @@ class Lexer final {
   /// @return true if any whitespace was consumed or false otherwise
   bool ConsumeWhitespace();
 
- private:
+private:
   /// @brief Text to lex.
   llvm::StringRef Text;
   /// @brief Current lexing position into the text.
@@ -225,7 +225,7 @@ class Lexer final {
 
 /// @brief Converts between mangled and non-mangled function names.
 class NameMangler final {
- public:
+public:
   /// @brief Create a new name mangler.
   ///
   /// @param[in] context LLVM context to use.
@@ -298,10 +298,10 @@ class NameMangler final {
   ///     Quals[1] = (SignedIntQual)
   ///
   /// @return Demangled name or an empty string on failure
-  llvm::StringRef demangleName(
-      llvm::StringRef Name, llvm::SmallVectorImpl<llvm::Type *> &Types,
-      llvm::SmallVectorImpl<llvm::Type *> &PointerElementTypes,
-      llvm::SmallVectorImpl<TypeQualifiers> &Quals);
+  llvm::StringRef
+  demangleName(llvm::StringRef Name, llvm::SmallVectorImpl<llvm::Type *> &Types,
+               llvm::SmallVectorImpl<llvm::Type *> &PointerElementTypes,
+               llvm::SmallVectorImpl<TypeQualifiers> &Quals);
 
   /// @brief Remove the mangling of a function name.
   ///
@@ -310,7 +310,7 @@ class NameMangler final {
   /// @return Demangled name or original name if not mangled.
   llvm::StringRef demangleName(llvm::StringRef Name);
 
- private:
+private:
   /// @brief Try to mangle the given qualified type. This only works for simple
   /// types that do not require string manipulation.
   ///
@@ -402,7 +402,7 @@ class NameMangler final {
   /// @brief LLVM context used to access LLVM types.
   llvm::LLVMContext *Context;
 };
-}  // namespace utils
-}  // namespace compiler
+} // namespace utils
+} // namespace compiler
 
-#endif  // COMPILER_UTILS_MANGLING_H_INCLUDED
+#endif // COMPILER_UTILS_MANGLING_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/metadata.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/metadata.h
index 6950169a68eb2..eda860477aaee 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/metadata.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/metadata.h
@@ -26,7 +26,7 @@
 namespace llvm {
 class Function;
 class Module;
-}  // namespace llvm
+} // namespace llvm
 
 namespace compiler {
 namespace utils {
@@ -109,8 +109,8 @@ bool parseOrigToVeczFnLinkMetadata(
 /// @return On success, a pair containing a pointer to the original kernel
 /// function and the vectorization factor used as the key. The original
 /// function may be null. On decoding failure, std::nullopt.
-std::optional<LinkMetadataResult> parseVeczToOrigFnLinkMetadata(
-    llvm::Function &f);
+std::optional<LinkMetadataResult>
+parseVeczToOrigFnLinkMetadata(llvm::Function &f);
 
 /// @brief Drops "base" vectorization metadata from a function, if present.
 ///
@@ -191,15 +191,15 @@ void encodeLocalSizeMetadata(llvm::Function &f,
 ///
 /// @param[in] f Function from which to decode the metadata
 /// @returns The local size array if present, else `std::nullopt`
-std::optional<std::array<uint64_t, 3>> getLocalSizeMetadata(
-    const llvm::Function &f);
+std::optional<std::array<uint64_t, 3>>
+getLocalSizeMetadata(const llvm::Function &f);
 
 /// @brief Drops all !mux_scheduled_fn metadata from a function.
 void dropSchedulingParameterMetadata(llvm::Function &f);
 
 /// @brief Retrieves the indices of scheduling parameters from the function.
-llvm::SmallVector<int, 4> getSchedulingParameterFunctionMetadata(
-    const llvm::Function &f);
+llvm::SmallVector<int, 4>
+getSchedulingParameterFunctionMetadata(const llvm::Function &f);
 
 /// @brief Sets scheduling-parameter metadata on the given function
 void setSchedulingParameterFunctionMetadata(llvm::Function &f,
@@ -228,8 +228,8 @@ std::optional<unsigned> isSchedulingParameter(const llvm::Function &f,
 /// @param[in] f Kernel for extraction.
 ///
 /// @return The work group size or std::nullopt if there is no such metadata.
-std::optional<std::array<uint64_t, 3>> parseRequiredWGSMetadata(
-    const llvm::Function &f);
+std::optional<std::array<uint64_t, 3>>
+parseRequiredWGSMetadata(const llvm::Function &f);
 
 /// @brief Extracts the required work group size from an opencl.kernels subnode,
 /// which is similar to the function metadata, but the size is stored under
@@ -238,8 +238,8 @@ std::optional<std::array<uint64_t, 3>> parseRequiredWGSMetadata(
 /// @param[in] node Kernel's subnode for extraction.
 ///
 /// @return The work group size or std::nullopt if there is no such metadata.
-std::optional<std::array<uint64_t, 3>> parseRequiredWGSMetadata(
-    const llvm::MDNode &node);
+std::optional<std::array<uint64_t, 3>>
+parseRequiredWGSMetadata(const llvm::MDNode &node);
 
 /// @brief Extracts the maximum work dimension from a kernel's function
 /// metadata
@@ -290,7 +290,7 @@ void encodeReqdSubgroupSizeMetadata(llvm::Function &f, uint32_t size);
 /// @returns The required sub-group size if present, else `std::nullopt`
 std::optional<uint32_t> getReqdSubgroupSize(const llvm::Function &f);
 
-}  // namespace utils
-}  // namespace compiler
+} // namespace utils
+} // namespace compiler
 
-#endif  // COMPILER_UTILS_METADATA_H_INCLUDED
+#endif // COMPILER_UTILS_METADATA_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/optimal_builtin_replacement_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/optimal_builtin_replacement_pass.h
index ec32ecec950c3..678b753b98a7e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/optimal_builtin_replacement_pass.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/optimal_builtin_replacement_pass.h
@@ -50,7 +50,7 @@ namespace utils {
 /// * Invoking emitBuiltinInline from BuiltinInfo analysis
 class OptimalBuiltinReplacementPass
     : public llvm::PassInfoMixin<OptimalBuiltinReplacementPass> {
- public:
+public:
   using ReplacementFnTy = std::function<llvm::Value *(
       llvm::CallBase &, llvm::StringRef,
       const llvm::SmallVectorImpl<llvm::Type *> &,
@@ -103,13 +103,13 @@ class OptimalBuiltinReplacementPass
       const llvm::SmallVectorImpl<llvm::Type *> &,
       const llvm::SmallVectorImpl<compiler::utils::TypeQualifiers> &);
 
- private:
+private:
   std::vector<ReplacementFnTy> replacements;
 
   llvm::Value *replaceBuiltinWithInlineIR(llvm::CallBase &CB) const;
 };
 
-}  // namespace utils
-}  // namespace compiler
+} // namespace utils
+} // namespace compiler
 
-#endif  // COMPILER_UTILS_OPTIMAL_BUILTIN_REPLACEMENT_PASS_H_INCLUDED
+#endif // COMPILER_UTILS_OPTIMAL_BUILTIN_REPLACEMENT_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/pass_functions.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/pass_functions.h
index 3d33531c350da..b60847eb53f1f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/pass_functions.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/pass_functions.h
@@ -43,7 +43,7 @@ class ModulePass;
 class Type;
 class Value;
 class IRBuilderBase;
-}  // namespace llvm
+} // namespace llvm
 
 namespace compiler {
 namespace utils {
@@ -313,7 +313,7 @@ llvm::CallInst *createCallToWrappedFunction(
 llvm::Value *createBinOpForRecurKind(llvm::IRBuilderBase &B, llvm::Value *LHS,
                                      llvm::Value *RHS, llvm::RecurKind Kind);
 /// @}
-}  // namespace utils
-}  // namespace compiler
+} // namespace utils
+} // namespace compiler
 
-#endif  // COMPILER_UTILS_PASS_FUNCTIONS_H_INCLUDED
+#endif // COMPILER_UTILS_PASS_FUNCTIONS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/pass_machinery.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/pass_machinery.h
index 9d1e8516867cd..671cc9baf7051 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/pass_machinery.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/pass_machinery.h
@@ -45,7 +45,7 @@ extern DebugLogging DebugPasses;
 /// @brief A class that manages the lifetime and initialization of all
 /// components required to set up a new-style LLVM pass manager.
 class PassMachinery {
- public:
+public:
   PassMachinery(llvm::LLVMContext &Ctx, llvm::TargetMachine *TM,
                 bool VerifyEach = false,
                 DebugLogging debugLogLevel = DebugLogging::None);
@@ -105,7 +105,7 @@ class PassMachinery {
   llvm::TargetMachine *getTM() { return TM; }
   const llvm::TargetMachine *getTM() const { return TM; }
 
- protected:
+protected:
   /// @brief TargetMachine to be used for passes. May be nullptr.
   llvm::TargetMachine *TM;
   // Note: the order here is important! They must be destructed in this order.
@@ -142,7 +142,7 @@ void printPassName(llvm::StringRef PassName, llvm::raw_ostream &OS);
 void printPassName(llvm::StringRef PassName, llvm::StringRef Params,
                    llvm::raw_ostream &OS);
 
-}  // namespace utils
-}  // namespace compiler
+} // namespace utils
+} // namespace compiler
 
-#endif  // COMPILER_UTILS_PASS_MACHINERY_H_INCLUDED
+#endif // COMPILER_UTILS_PASS_MACHINERY_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/prepare_barriers_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/prepare_barriers_pass.h
index 743846da1c109..4bdcb2da83969 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/prepare_barriers_pass.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/prepare_barriers_pass.h
@@ -36,10 +36,10 @@ namespace utils {
 /// Runs over all kernels with "kernel entry point" metadata.
 class PrepareBarriersPass final
     : public llvm::PassInfoMixin<PrepareBarriersPass> {
- public:
+public:
   llvm::PreservedAnalyses run(llvm::Module &, llvm::ModuleAnalysisManager &);
 };
-}  // namespace utils
-}  // namespace compiler
+} // namespace utils
+} // namespace compiler
 
-#endif  // COMPILER_UTILS_PREPARE_BARRIERS_PASS_H_INCLUDED
+#endif // COMPILER_UTILS_PREPARE_BARRIERS_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/replace_local_module_scope_variables_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/replace_local_module_scope_variables_pass.h
index 9c94da90a7da6..bde53d712aab7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/replace_local_module_scope_variables_pass.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/replace_local_module_scope_variables_pass.h
@@ -35,10 +35,10 @@ namespace utils {
 /// Runs over all kernels with "kernel" metadata.
 class ReplaceLocalModuleScopeVariablesPass final
     : public llvm::PassInfoMixin<ReplaceLocalModuleScopeVariablesPass> {
- public:
+public:
   llvm::PreservedAnalyses run(llvm::Module &, llvm::ModuleAnalysisManager &);
 };
-}  // namespace utils
-}  // namespace compiler
+} // namespace utils
+} // namespace compiler
 
-#endif  // COMPILER_UTILS_REPLACE_LOCAL_MODULE_SCOPE_VARIABLES_PASS_H_INCLUDED
+#endif // COMPILER_UTILS_REPLACE_LOCAL_MODULE_SCOPE_VARIABLES_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/scheduling.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/scheduling.h
index e5742b324f96a..08c923b5e56f8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/scheduling.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/scheduling.h
@@ -29,7 +29,7 @@ class Function;
 class Module;
 class StructType;
 class Argument;
-}  // namespace llvm
+} // namespace llvm
 
 namespace compiler {
 namespace utils {
@@ -137,7 +137,7 @@ void populateStructSetterFunction(llvm::Function &F,
                                   llvm::StructType *const structTy,
                                   uint32_t structFieldIdx, bool hasRankArg);
 
-}  // namespace utils
-}  // namespace compiler
+} // namespace utils
+} // namespace compiler
 
-#endif  // COMPILER_UTILS_SCHEDULING_H_INCLUDED
+#endif // COMPILER_UTILS_SCHEDULING_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/sub_group_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/sub_group_analysis.h
index 726538f5beef2..af615f3a6f4bf 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/sub_group_analysis.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/sub_group_analysis.h
@@ -52,7 +52,7 @@ class GlobalSubgroupInfo {
 
   compiler::utils::BuiltinInfo &BI;
 
- public:
+public:
   GlobalSubgroupInfo(llvm::Module &M, BuiltinInfo &);
 
   compiler::utils::BuiltinInfo &getBuiltinInfo() { return BI; }
@@ -73,15 +73,15 @@ class GlobalSubgroupInfo {
 
   /// @brief Returns true if the provided function is a mux sub-group
   /// collective builtin or sub-group barrier.
-  std::optional<compiler::utils::Builtin> isMuxSubgroupBuiltin(
-      const llvm::Function *F) const;
+  std::optional<compiler::utils::Builtin>
+  isMuxSubgroupBuiltin(const llvm::Function *F) const;
 };
 
 /// @brief Computes and returns the GlobalSubgroupInfo for a Module.
 class SubgroupAnalysis : public llvm::AnalysisInfoMixin<SubgroupAnalysis> {
   friend AnalysisInfoMixin<SubgroupAnalysis>;
 
- public:
+public:
   using Result = GlobalSubgroupInfo;
 
   explicit SubgroupAnalysis() {}
@@ -92,7 +92,7 @@ class SubgroupAnalysis : public llvm::AnalysisInfoMixin<SubgroupAnalysis> {
   /// @brief Return the name of the pass.
   static llvm::StringRef name() { return "Sub-group analysis"; }
 
- private:
+private:
   /// @brief Unique pass identifier.
   static llvm::AnalysisKey Key;
 };
@@ -103,13 +103,13 @@ class SubgroupAnalysisPrinterPass
     : public llvm::PassInfoMixin<SubgroupAnalysisPrinterPass> {
   llvm::raw_ostream &OS;
 
- public:
+public:
   explicit SubgroupAnalysisPrinterPass(llvm::raw_ostream &OS) : OS(OS) {}
 
   llvm::PreservedAnalyses run(llvm::Module &M, llvm::ModuleAnalysisManager &AM);
 };
 
-}  // namespace utils
-}  // namespace compiler
+} // namespace utils
+} // namespace compiler
 
-#endif  // COMPILER_UTILS_SUB_GROUP_ANALYSIS_H_INCLUDED
+#endif // COMPILER_UTILS_SUB_GROUP_ANALYSIS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/target_extension_types.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/target_extension_types.h
index e8c2c226590d9..c8c97f7848a2e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/target_extension_types.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/target_extension_types.h
@@ -20,7 +20,7 @@
 namespace llvm {
 class Type;
 class LLVMContext;
-}  // namespace llvm
+} // namespace llvm
 
 namespace compiler {
 namespace utils {
@@ -53,10 +53,10 @@ enum ImageTyDimensionalityParam {
 
 /// @brief Values the 'depth' parameter of a "spirv.Image" type may hold.
 enum ImageTyDepthParam {
-  ImageDepthNone = 0,  // Not a depth image
-  ImageDepth,          // A depth image
-  ImageDepthUnknown,   // No indication as to whether this is a depth or
-                       // non-depth image
+  ImageDepthNone = 0, // Not a depth image
+  ImageDepth,         // A depth image
+  ImageDepthUnknown,  // No indication as to whether this is a depth or
+                      // non-depth image
 };
 
 /// @brief Values the 'arrayed' parameter of a "spirv.Image" type may hold.
@@ -73,10 +73,10 @@ enum ImageTyMSParam {
 
 /// @brief Values the 'Sampled' parameter of a "spirv.Image" type may hold.
 enum ImageTySampledParam {
-  ImageSampledRuntime = 0,      // only known at run time
-  ImageSampledCompat,           // compatible with sampling operations
-  ImageSampledReadWriteCompat,  // compatiable with read/write operations (a
-                                // storage or subpass data image)
+  ImageSampledRuntime = 0,     // only known at run time
+  ImageSampledCompat,          // compatible with sampling operations
+  ImageSampledReadWriteCompat, // compatible with read/write operations (a
+                               // storage or subpass data image)
 };
 
 enum ImageTyAccessQualParam {
@@ -98,47 +98,47 @@ llvm::Type *getSamplerTy(llvm::LLVMContext &Ctx);
 /// @brief Returns the TargetExtType representing an 'image1d_t' type.
 ///
 /// Note: Only intended for use LLVM 17+ - throws 'unreachable' otherwise.
-llvm::Type *getImage1DTy(
-    llvm::LLVMContext &Ctx,
-    ImageTyAccessQualParam AccessQual = ImageAccessQualReadOnly);
+llvm::Type *
+getImage1DTy(llvm::LLVMContext &Ctx,
+             ImageTyAccessQualParam AccessQual = ImageAccessQualReadOnly);
 
 /// @brief Returns the TargetExtType representing an 'image1d_array_t' type.
 ///
 /// Note: Only intended for use LLVM 17+ - throws 'unreachable' otherwise.
-llvm::Type *getImage1DArrayTy(
-    llvm::LLVMContext &Ctx,
-    ImageTyAccessQualParam AccessQual = ImageAccessQualReadOnly);
+llvm::Type *
+getImage1DArrayTy(llvm::LLVMContext &Ctx,
+                  ImageTyAccessQualParam AccessQual = ImageAccessQualReadOnly);
 
 /// @brief Returns the TargetExtType representing an 'image1d_buffer_t' type.
 ///
 /// Note: Only intended for use LLVM 17+ - throws 'unreachable' otherwise.
-llvm::Type *getImage1DBufferTy(
-    llvm::LLVMContext &Ctx,
-    ImageTyAccessQualParam AccessQual = ImageAccessQualReadOnly);
+llvm::Type *
+getImage1DBufferTy(llvm::LLVMContext &Ctx,
+                   ImageTyAccessQualParam AccessQual = ImageAccessQualReadOnly);
 
 /// @brief Returns the TargetExtType representing an 'image2d_t' type.
 ///
 /// Note: Only intended for use LLVM 17+ - throws 'unreachable' otherwise.
-llvm::Type *getImage2DTy(
-    llvm::LLVMContext &Ctx, bool Depth = false, bool MS = false,
-    ImageTyAccessQualParam AccessQual = ImageAccessQualReadOnly);
+llvm::Type *
+getImage2DTy(llvm::LLVMContext &Ctx, bool Depth = false, bool MS = false,
+             ImageTyAccessQualParam AccessQual = ImageAccessQualReadOnly);
 
 /// @brief Returns the TargetExtType representing an 'image2d_array_t' type.
 ///
 /// Note: Only intended for use LLVM 17+ - throws 'unreachable' otherwise.
-llvm::Type *getImage2DArrayTy(
-    llvm::LLVMContext &Ctx, bool Depth = false, bool MS = false,
-    ImageTyAccessQualParam AccessQual = ImageAccessQualReadOnly);
+llvm::Type *
+getImage2DArrayTy(llvm::LLVMContext &Ctx, bool Depth = false, bool MS = false,
+                  ImageTyAccessQualParam AccessQual = ImageAccessQualReadOnly);
 
 /// @brief Returns the TargetExtType representing an 'image3d_t' type.
 ///
 /// Note: Only intended for use LLVM 17+ - throws 'unreachable' otherwise.
-llvm::Type *getImage3DTy(
-    llvm::LLVMContext &Ctx,
-    ImageTyAccessQualParam AccessQual = ImageAccessQualReadOnly);
+llvm::Type *
+getImage3DTy(llvm::LLVMContext &Ctx,
+             ImageTyAccessQualParam AccessQual = ImageAccessQualReadOnly);
 
-}  // namespace tgtext
-}  // namespace utils
-}  // namespace compiler
+} // namespace tgtext
+} // namespace utils
+} // namespace compiler
 
-#endif  // COMPILER_UTILS_TARGET_EXTENSION_TYPES_H_INCLUDED
+#endif // COMPILER_UTILS_TARGET_EXTENSION_TYPES_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/unique_opaque_structs_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/unique_opaque_structs_pass.h
index e56e847f1da12..88dd7a6fb0c50 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/unique_opaque_structs_pass.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/unique_opaque_structs_pass.h
@@ -44,12 +44,12 @@ namespace utils {
 /// problematic types and replacing them with their unsuffixed version.
 class UniqueOpaqueStructsPass
     : public llvm::PassInfoMixin<UniqueOpaqueStructsPass> {
- public:
+public:
   UniqueOpaqueStructsPass() = default;
   llvm::PreservedAnalyses run(llvm::Module &, llvm::ModuleAnalysisManager &);
 };
 
-}  // namespace utils
-}  // namespace compiler
+} // namespace utils
+} // namespace compiler
 
-#endif  // COMPILER_UTILS_UNIQUE_OPAQUE_STRUCTS_PASS_H_INCLUDED
+#endif // COMPILER_UTILS_UNIQUE_OPAQUE_STRUCTS_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/work_item_loops_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/work_item_loops_pass.h
index ddf7e65b3e91b..21c7b62dff496 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/work_item_loops_pass.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/work_item_loops_pass.h
@@ -71,14 +71,14 @@ struct WorkItemLoopsPassOptions {
 /// Runs over all kernels with "kernel entry point" metadata. Work-item orders
 /// are sourced from the "work item order" function metadata on each kernel.
 class WorkItemLoopsPass final : public llvm::PassInfoMixin<WorkItemLoopsPass> {
- public:
+public:
   /// @brief Constructor.
   WorkItemLoopsPass(const WorkItemLoopsPassOptions &Options)
       : IsDebug(Options.IsDebug), ForceNoTail(Options.ForceNoTail) {}
 
   llvm::PreservedAnalyses run(llvm::Module &, llvm::ModuleAnalysisManager &);
 
- private:
+private:
   /// @brief Make the work-item-loop wrapper function.
   /// This creates a wrapper function that iterates over a work group, calling
   /// the kernel for each work item, respecting the semantics of any barriers
@@ -110,7 +110,7 @@ class WorkItemLoopsPass final : public llvm::PassInfoMixin<WorkItemLoopsPass> {
   const bool IsDebug;
   const bool ForceNoTail;
 };
-}  // namespace utils
-}  // namespace compiler
+} // namespace utils
+} // namespace compiler
 
-#endif  // COMPILER_UTILS_WORK_ITEM_LOOPS_PASS_H_INCLUDED
+#endif // COMPILER_UTILS_WORK_ITEM_LOOPS_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.h
index fabe42ae57a93..8da2fdcae20dd 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.h
@@ -37,10 +37,10 @@ struct BinOpHelper;
 #include <multi_llvm/instructions.inc>
 #undef LLVM
 
-}  // namespace detail
+} // namespace detail
 
-static std::optional<llvm::AtomicRMWInst::BinOp> consume_binop_with_underscore(
-    llvm::StringRef &String) {
+static std::optional<llvm::AtomicRMWInst::BinOp>
+consume_binop_with_underscore(llvm::StringRef &String) {
   return multi_llvm::detail::BinOpHelper<>::consume_front_with_underscore(
       String);
 }
@@ -49,6 +49,6 @@ static llvm::StringRef to_string(llvm::AtomicRMWInst::BinOp BinOp) {
   return multi_llvm::detail::BinOpHelper<>::to_string(BinOp);
 }
 
-}  // namespace multi_llvm
+} // namespace multi_llvm
 
-#endif  // MULTI_LLVM_MULTI_INSTRUCTIONS_H_INCLUDED
+#endif // MULTI_LLVM_MULTI_INSTRUCTIONS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.inc b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.inc
index d9049b2584bf4..787822d16859b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.inc
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.inc
@@ -24,34 +24,34 @@ struct BinOpHelper
 #define BINOP_LLVM21(OP, STR) BINOP(OP, STR)
 #endif
 {
-#define BINOPS()                     \
-  BINOP(Xchg, "xchg")                \
-  BINOP(Add, "add")                  \
-  BINOP(Sub, "sub")                  \
-  BINOP(And, "and")                  \
-  BINOP(Nand, "nand")                \
-  BINOP(Or, "or")                    \
-  BINOP(Xor, "xor")                  \
-  BINOP(Max, "max")                  \
-  BINOP(Min, "min")                  \
-  BINOP(UMax, "umax")                \
-  BINOP(UMin, "umin")                \
-  BINOP(FAdd, "fadd")                \
-  BINOP(FSub, "fsub")                \
-  BINOP(FMax, "fmax")                \
-  BINOP(FMin, "fmin")                \
-  BINOP_LLVM21(FMaximum, "fmaximum") \
-  BINOP_LLVM21(FMinimum, "fminumum") \
-  BINOP(UIncWrap, "uincwrap")        \
-  BINOP(UDecWrap, "udecwrap")        \
-  BINOP(USubCond, "usubcond")        \
+#define BINOPS()                                                               \
+  BINOP(Xchg, "xchg")                                                          \
+  BINOP(Add, "add")                                                            \
+  BINOP(Sub, "sub")                                                            \
+  BINOP(And, "and")                                                            \
+  BINOP(Nand, "nand")                                                          \
+  BINOP(Or, "or")                                                              \
+  BINOP(Xor, "xor")                                                            \
+  BINOP(Max, "max")                                                            \
+  BINOP(Min, "min")                                                            \
+  BINOP(UMax, "umax")                                                          \
+  BINOP(UMin, "umin")                                                          \
+  BINOP(FAdd, "fadd")                                                          \
+  BINOP(FSub, "fsub")                                                          \
+  BINOP(FMax, "fmax")                                                          \
+  BINOP(FMin, "fmin")                                                          \
+  BINOP_LLVM21(FMaximum, "fmaximum")                                           \
+  BINOP_LLVM21(FMinimum, "fminumum")                                           \
+  BINOP(UIncWrap, "uincwrap")                                                  \
+  BINOP(UDecWrap, "udecwrap")                                                  \
+  BINOP(USubCond, "usubcond")                                                  \
   BINOP(USubSat, "usubsat")
 
-  static std::optional<T> consume_front_with_underscore(
-      llvm::StringRef &String) {
-#define BINOP(BINOP, STR)              \
-  if (String.consume_front(STR "_")) { \
-    return T::BINOP;                   \
+  static std::optional<T>
+  consume_front_with_underscore(llvm::StringRef &String) {
+#define BINOP(BINOP, STR)                                                      \
+  if (String.consume_front(STR "_")) {                                         \
+    return T::BINOP;                                                           \
   }
     BINOPS()
 #undef BINOP
@@ -60,13 +60,13 @@ struct BinOpHelper
 
   static llvm::StringRef to_string(T BinOp) {
     switch (BinOp) {
-#define BINOP(BINOP, STR) \
-  case T::BINOP:          \
+#define BINOP(BINOP, STR)                                                      \
+  case T::BINOP:                                                               \
     return STR;
       BINOPS()
 #undef BINOP
-      case T::BAD_BINOP:
-        break;
+    case T::BAD_BINOP:
+      break;
     }
     llvm_unreachable("Unexpected BinOp");
   }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/intrinsic.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/intrinsic.h
index 3df2c19ae805f..cecbb7f02ddae 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/intrinsic.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/intrinsic.h
@@ -34,7 +34,7 @@ auto getAttributes(T... args, llvm::FunctionType *)
     -> decltype(llvm::Intrinsic::getAttributes(args...)) {
   return llvm::Intrinsic::getAttributes(args...);
 }
-}  // namespace detail
+} // namespace detail
 
 namespace Intrinsic {
 static inline auto getAttributes(llvm::LLVMContext &C, llvm::Intrinsic::ID ID,
@@ -42,8 +42,8 @@ static inline auto getAttributes(llvm::LLVMContext &C, llvm::Intrinsic::ID ID,
   return detail::getAttributes<llvm::LLVMContext &, llvm::Intrinsic::ID>(C, ID,
                                                                          FT);
 }
-}  // namespace Intrinsic
+} // namespace Intrinsic
 
-}  // namespace multi_llvm
+} // namespace multi_llvm
 
-#endif  // MULTI_LLVM_MULTI_INTRINSIC_H_INCLUDED
+#endif // MULTI_LLVM_MULTI_INTRINSIC_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/llvm_version.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/llvm_version.h
index 55cc6fca85f8f..802471f4562cc 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/llvm_version.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/llvm_version.h
@@ -18,21 +18,21 @@
 
 #include <llvm/Config/llvm-config.h>
 
-#define LLVM_VERSION_EQUAL(MAJOR, MINOR) \
+#define LLVM_VERSION_EQUAL(MAJOR, MINOR)                                       \
   (LLVM_VERSION_MAJOR == (MAJOR) && LLVM_VERSION_MINOR == (MINOR))
 
-#define LLVM_VERSION_LESS(MAJOR, MINOR) \
-  ((LLVM_VERSION_MAJOR < (MAJOR)) ||    \
+#define LLVM_VERSION_LESS(MAJOR, MINOR)                                        \
+  ((LLVM_VERSION_MAJOR < (MAJOR)) ||                                           \
    (LLVM_VERSION_MAJOR == (MAJOR) && LLVM_VERSION_MINOR < (MINOR)))
 
-#define LLVM_VERSION_LESS_EQUAL(MAJOR, MINOR) \
+#define LLVM_VERSION_LESS_EQUAL(MAJOR, MINOR)                                  \
   (LLVM_VERSION_EQUAL(MAJOR, MINOR) || LLVM_VERSION_LESS(MAJOR, MINOR))
 
-#define LLVM_VERSION_GREATER(MAJOR, MINOR) \
-  ((LLVM_VERSION_MAJOR > (MAJOR)) ||       \
+#define LLVM_VERSION_GREATER(MAJOR, MINOR)                                     \
+  ((LLVM_VERSION_MAJOR > (MAJOR)) ||                                           \
    (LLVM_VERSION_MAJOR == (MAJOR) && LLVM_VERSION_MINOR > (MINOR)))
 
-#define LLVM_VERSION_GREATER_EQUAL(MAJOR, MINOR) \
+#define LLVM_VERSION_GREATER_EQUAL(MAJOR, MINOR)                               \
   (LLVM_VERSION_EQUAL(MAJOR, MINOR) || LLVM_VERSION_GREATER(MAJOR, MINOR))
 
-#endif  // MULTI_LLVM_LLVM_VERSION_H_INCLUDED
+#endif // MULTI_LLVM_LLVM_VERSION_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
index 34ee6707448a5..ea350fd4bdec2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
@@ -19,4 +19,4 @@
 
 #include <multi_llvm/llvm_version.h>
 
-#endif  // MULTI_LLVM_MULTI_LLVM_H_INCLUDED
+#endif // MULTI_LLVM_MULTI_LLVM_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/target_transform_info.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/target_transform_info.h
index eaab65e1eeb1f..6d8e608b860bd 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/target_transform_info.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/target_transform_info.h
@@ -57,7 +57,7 @@ auto isLegalMaskedStoreImpl(const TargetTransformInfo &TTI, llvm::Type *Ty,
 }
 #endif
 
-}  // namespace detail
+} // namespace detail
 
 bool isLegalMaskedLoad(const llvm::TargetTransformInfo &TTI, llvm::Type *Ty,
                        llvm::Align Alignment, unsigned AddrSpace) {
@@ -69,6 +69,6 @@ bool isLegalMaskedStore(const llvm::TargetTransformInfo &TTI, llvm::Type *Ty,
   return detail::isLegalMaskedStoreImpl(TTI, Ty, Alignment, AddrSpace);
 }
 
-}  // namespace multi_llvm
+} // namespace multi_llvm
 
-#endif  // MULTI_LLVM_TARGET_TRANSFORM_INFO_H_INCLUDED
+#endif // MULTI_LLVM_TARGET_TRANSFORM_INFO_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/targetinfo.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/targetinfo.h
index bae892aaa2194..576b04f284d8e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/targetinfo.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/targetinfo.h
@@ -44,7 +44,7 @@ auto createTargetInfo(clang::DiagnosticsEngine &Diags,
       Diags, std::make_shared<clang::TargetOptions>(Opts));
 }
 
-}  // namespace detail
+} // namespace detail
 
 struct TargetInfo {
   static clang::TargetInfo *CreateTargetInfo(clang::DiagnosticsEngine &Diags,
@@ -53,6 +53,6 @@ struct TargetInfo {
   }
 };
 
-}  // namespace multi_llvm
+} // namespace multi_llvm
 
-#endif  // MULTI_LLVM_TARGET_TARGETINFO_H_INCLUDED
+#endif // MULTI_LLVM_TARGET_TARGETINFO_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/vector_type_helper.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/vector_type_helper.h
index 269e0d28c8b22..d13b9d531b8a9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/vector_type_helper.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/vector_type_helper.h
@@ -64,6 +64,6 @@ inline unsigned getVectorKnownMinNumElements(llvm::Type *ty) {
 inline unsigned getVectorKnownMinNumElements(const llvm::Type *ty) {
   return getVectorElementCount(ty).getKnownMinValue();
 }
-}  // namespace multi_llvm
+} // namespace multi_llvm
 
-#endif  // MULTI_LLVM_VECTOR_TYPE_HELPER_H_INCLUDED
+#endif // MULTI_LLVM_VECTOR_TYPE_HELPER_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/attributes.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/attributes.cpp
index 6a92014ea6f48..98d63c713e0d4 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/attributes.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/attributes.cpp
@@ -154,18 +154,18 @@ static constexpr const char *BarrierScheduleAttrName = "mux-barrier-schedule";
 void setBarrierSchedule(CallInst &CI, BarrierSchedule Sched) {
   StringRef Val = "unknown";
   switch (Sched) {
-    case BarrierSchedule::Unordered:
-      Val = "unordered";
-      break;
-    case BarrierSchedule::Once:
-      Val = "once";
-      break;
-    case BarrierSchedule::ScalarTail:
-      Val = "scalar-tail";
-      break;
-    case BarrierSchedule::Linear:
-      Val = "linear";
-      break;
+  case BarrierSchedule::Unordered:
+    Val = "unordered";
+    break;
+  case BarrierSchedule::Once:
+    Val = "once";
+    break;
+  case BarrierSchedule::ScalarTail:
+    Val = "scalar-tail";
+    break;
+  case BarrierSchedule::Linear:
+    Val = "linear";
+    break;
   }
 
   const Attribute Attr =
@@ -202,5 +202,5 @@ unsigned getMuxSubgroupSize(const llvm::Function &) {
   // hard-coding the constant 1 in places that will eventually need updated.
   return 1;
 }
-}  // namespace utils
-}  // namespace compiler
+} // namespace utils
+} // namespace compiler
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
index 494ca6d0727ea..df6cf77da1b8e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
@@ -52,8 +52,9 @@ using AlignIntTy = uint64_t;
 
 /// @brief it returns true if and only if the instruction is a work group
 /// collective call, and returns false otherwise.
-std::optional<compiler::utils::GroupCollective> getWorkGroupCollectiveCall(
-    Instruction *inst, compiler::utils::BuiltinInfo &bi) {
+std::optional<compiler::utils::GroupCollective>
+getWorkGroupCollectiveCall(Instruction *inst,
+                           compiler::utils::BuiltinInfo &bi) {
   auto *const ci = dyn_cast_or_null<CallInst>(inst);
   if (!ci) {
     return std::nullopt;
@@ -295,7 +296,7 @@ bool isStructWithScalables(Type *ty) {
   return false;
 }
 
-}  // namespace
+} // namespace
 
 Value *compiler::utils::Barrier::LiveValuesHelper::getExtractValueGEP(
     const Value *live) {
@@ -560,8 +561,8 @@ void compiler::utils::Barrier::SplitBlockwithBarrier() {
     const auto barrier_id = kBarrier_StartNewID + id->getZExtValue();
 
     if (is_debug_) {
-      assert(entry_stub != nullptr);  // Guaranteed as is_debug_ is const.
-      assert(exit_stub != nullptr);   // Guaranteed as is_debug_ is const.
+      assert(entry_stub != nullptr); // Guaranteed as is_debug_ is const.
+      assert(exit_stub != nullptr);  // Guaranteed as is_debug_ is const.
 
       // Create call instructions invoking debug stubs for every barrier. We
       // don't insert these into a basic block yet since we want to insert
@@ -757,7 +758,7 @@ void compiler::utils::Barrier::FindLiveVariables() {
       assert(!isa<AllocaInst>(inst) && "Alloca found outside entry block!");
     }
   }
-#endif  // ndef NDEBUG
+#endif // ndef NDEBUG
 
   // Put all the original allocas into the barrier struct, in case they get
   // indirectly referenced from the other side of a barrier.
@@ -969,7 +970,7 @@ void compiler::utils::Barrier::MakeLiveVariableMemType() {
   // Pad the end of the struct to the max alignment as we are creating an
   // array
   offset = PadTypeToAlignment(field_tys, offset, max_live_var_alignment);
-  live_var_mem_size_fixed = offset;  // No more offsets required.
+  live_var_mem_size_fixed = offset; // No more offsets required.
 
   // Now deal with any scalable members. We reset the offset to zero because
   // scalables are indexed bytewise starting from the beginning of the
@@ -992,7 +993,7 @@ void compiler::utils::Barrier::MakeLiveVariableMemType() {
   // array
   offset =
       PadTypeToAlignment(field_tys_scalable, offset, max_live_var_alignment);
-  live_var_mem_size_scalable = offset;  // No more offsets required.
+  live_var_mem_size_scalable = offset; // No more offsets required.
 
   LLVMContext &context = module_.getContext();
   // if the barrier contains scalables, add a flexible byte array on the end
@@ -1415,12 +1416,14 @@ BasicBlock *compiler::utils::Barrier::CloneBasicBlock(
     BasicBlock *bb, ValueToValueMapTy &vmap, const Twine &name_suffix,
     live_variable_mem_t &live_defs_info, Function *F) {
   BasicBlock *new_bb = BasicBlock::Create(bb->getContext(), "", F);
-  if (bb->hasName()) new_bb->setName(bb->getName() + name_suffix);
+  if (bb->hasName())
+    new_bb->setName(bb->getName() + name_suffix);
 
   // Loop over all instructions, and copy them over.
   for (Instruction &i : *bb) {
     Instruction *new_inst = i.clone();
-    if (i.hasName()) new_inst->setName(i.getName() + name_suffix);
+    if (i.hasName())
+      new_inst->setName(i.getName() + name_suffix);
     new_inst->insertInto(new_bb, new_bb->end());
 
     // Record live variables' defs which are in current kernel.
@@ -1435,7 +1438,8 @@ BasicBlock *compiler::utils::Barrier::CloneBasicBlock(
 
 /// @brief Seperate kernel function with barrier boundary.
 void compiler::utils::Barrier::SeperateKernelWithBarrier() {
-  if (barriers_.empty()) return;
+  if (barriers_.empty())
+    return;
 
   for (auto &[i, region] : barrier_region_id_map_) {
     kernel_id_map_[region.id] = GenerateNewKernel(region);
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/builtin_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/builtin_info.cpp
index dc85cd0ad508d..372280d135302 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/builtin_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/builtin_info.cpp
@@ -90,21 +90,21 @@ BuiltinInfo::identifyMuxBuiltin(const Function &F) const {
           .Default(std::nullopt);
   if (ID) {
     switch (*ID) {
-      default:
-        return {{*ID, {}}};
-      case eMuxBuiltinDMARead1D:
-      case eMuxBuiltinDMARead2D:
-      case eMuxBuiltinDMARead3D:
-      case eMuxBuiltinDMAWrite1D:
-      case eMuxBuiltinDMAWrite2D:
-      case eMuxBuiltinDMAWrite3D:
-        // Return the event type used by these builtins. The event type is
-        // required to declare/define these builtins, so return it here for
-        // the sake of completeness. The event type doesn't change the
-        // builtins' name (i.e., it's not mangled) as it's required to be
-        // consistent at any single snapshot of the module, though it may
-        // change through time.
-        return {{*ID, {F.getReturnType()}}};
+    default:
+      return {{*ID, {}}};
+    case eMuxBuiltinDMARead1D:
+    case eMuxBuiltinDMARead2D:
+    case eMuxBuiltinDMARead3D:
+    case eMuxBuiltinDMAWrite1D:
+    case eMuxBuiltinDMAWrite2D:
+    case eMuxBuiltinDMAWrite3D:
+      // Return the event type used by these builtins. The event type is
+      // required to declare/define these builtins, so return it here for
+      // the sake of completeness. The event type doesn't change the
+      // builtins' name (i.e., it's not mangled) as it's required to be
+      // consistent at any single snapshot of the module, though it may
+      // change through time.
+      return {{*ID, {F.getReturnType()}}};
     }
   }
 
@@ -118,9 +118,9 @@ BuiltinInfo::identifyMuxBuiltin(const Function &F) const {
     return std::nullopt;
   }
 
-#define SCOPED_GROUP_OP(OP)                 \
-  (IsSubgroupOp   ? eMuxBuiltinSubgroup##OP \
-   : IsVecgroupOp ? eMuxBuiltinVecgroup##OP \
+#define SCOPED_GROUP_OP(OP)                                                    \
+  (IsSubgroupOp   ? eMuxBuiltinSubgroup##OP                                    \
+   : IsVecgroupOp ? eMuxBuiltinVecgroup##OP                                    \
                   : eMuxBuiltinWorkgroup##OP)
 
   // Most group operations have one argument, except for broadcasts. Despite
@@ -159,7 +159,7 @@ BuiltinInfo::identifyMuxBuiltin(const Function &F) const {
     Name = Name.drop_front(Group.size());
 
     if (Group == "logical") {
-      Name = Name.drop_front();  // Drop the underscore
+      Name = Name.drop_front(); // Drop the underscore
       auto NextIdx = Name.find_first_of('_');
       auto RealGroup = Name.substr(0, NextIdx);
       Group += "_" + RealGroup.str();
@@ -279,44 +279,43 @@ BuiltinUniformity BuiltinInfo::isBuiltinUniform(const Builtin &B,
                                                 const CallInst *CI,
                                                 unsigned SimdDimIdx) const {
   switch (B.ID) {
-    default:
-      break;
-    case eMuxBuiltinGetGlobalId:
-    case eMuxBuiltinGetLocalId: {
-      // We need to know the dimension requested from these builtins at compile
-      // time to infer their uniformity.
-      if (!CI || CI->arg_empty()) {
-        return eBuiltinUniformityNever;
-      }
-      auto *Rank = dyn_cast<ConstantInt>(CI->getArgOperand(0));
-      if (!Rank) {
-        // The Rank is some function, which "might" evaluate to zero
-        // sometimes, so we let the packetizer sort it out with some
-        // conditional magic.
-        // TODO Make sure this can never go haywire in weird edge cases.
-        // Where we have one get_global_id() dependent on another, this is
-        // not packetized correctly. Doing so is very hard!  We should
-        // probably just fail to packetize in this case.  We might also be
-        // able to return eBuiltinUniformityNever here, in cases where we can
-        // prove that the value can never be zero.
-        return eBuiltinUniformityMaybeInstanceID;
-      }
-      // Only vectorize on selected dimension. The value of get_global_id with
-      // other ranks is uniform.
-      if (Rank->getZExtValue() == SimdDimIdx) {
-        return eBuiltinUniformityInstanceID;
-      }
-
-      return eBuiltinUniformityAlways;
+  default:
+    break;
+  case eMuxBuiltinGetGlobalId:
+  case eMuxBuiltinGetLocalId: {
+    // We need to know the dimension requested from these builtins at compile
+    // time to infer their uniformity.
+    if (!CI || CI->arg_empty()) {
+      return eBuiltinUniformityNever;
+    }
+    auto *Rank = dyn_cast<ConstantInt>(CI->getArgOperand(0));
+    if (!Rank) {
+      // The Rank is some function, which "might" evaluate to zero
+      // sometimes, so we let the packetizer sort it out with some
+      // conditional magic.
+      // TODO Make sure this can never go haywire in weird edge cases.
+      // Where we have one get_global_id() dependent on another, this is
+      // not packetized correctly. Doing so is very hard!  We should
+      // probably just fail to packetize in this case.  We might also be
+      // able to return eBuiltinUniformityNever here, in cases where we can
+      // prove that the value can never be zero.
+      return eBuiltinUniformityMaybeInstanceID;
     }
-    case eMuxBuiltinGetSubGroupLocalId:
+    // Only vectorize on selected dimension. The value of get_global_id with
+    // other ranks is uniform.
+    if (Rank->getZExtValue() == SimdDimIdx) {
       return eBuiltinUniformityInstanceID;
-    case eMuxBuiltinGetLocalLinearId:
-    case eMuxBuiltinGetGlobalLinearId:
-      // TODO: This is fine for vectorizing in the x-axis, but currently we do
-      // not support vectorizing along y or z.
-      return SimdDimIdx ? eBuiltinUniformityNever
-                        : eBuiltinUniformityInstanceID;
+    }
+
+    return eBuiltinUniformityAlways;
+  }
+  case eMuxBuiltinGetSubGroupLocalId:
+    return eBuiltinUniformityInstanceID;
+  case eMuxBuiltinGetLocalLinearId:
+  case eMuxBuiltinGetGlobalLinearId:
+    // TODO: This is fine for vectorizing in the x-axis, but currently we do
+    // not support vectorizing along y or z.
+    return SimdDimIdx ? eBuiltinUniformityNever : eBuiltinUniformityInstanceID;
   }
 
   // Reductions and broadcasts are always uniform
@@ -343,68 +342,68 @@ std::optional<Builtin> BuiltinInfo::analyzeBuiltin(const Function &F) const {
     const bool NoSideEffect = F.onlyReadsMemory();
     bool SafeIntrinsic = false;
     switch (IntrID) {
-      default:
-        SafeIntrinsic = false;
-        break;
-      case Intrinsic::smin:
-      case Intrinsic::smax:
-      case Intrinsic::umin:
-      case Intrinsic::umax:
-      case Intrinsic::abs:
-      case Intrinsic::ctlz:
-      case Intrinsic::cttz:
-      case Intrinsic::sqrt:
-      case Intrinsic::sin:
-      case Intrinsic::cos:
-      case Intrinsic::pow:
-      case Intrinsic::exp:
-      case Intrinsic::exp2:
-      case Intrinsic::log:
-      case Intrinsic::log10:
-      case Intrinsic::log2:
-      case Intrinsic::fma:
-      case Intrinsic::fabs:
-      case Intrinsic::minnum:
-      case Intrinsic::maxnum:
-      case Intrinsic::copysign:
-      case Intrinsic::floor:
-      case Intrinsic::ceil:
-      case Intrinsic::trunc:
-      case Intrinsic::rint:
-      case Intrinsic::nearbyint:
-      case Intrinsic::round:
-      case Intrinsic::ctpop:
-      case Intrinsic::fmuladd:
-      case Intrinsic::fshl:
-      case Intrinsic::fshr:
-      case Intrinsic::sadd_sat:
-      case Intrinsic::uadd_sat:
-      case Intrinsic::ssub_sat:
-      case Intrinsic::usub_sat:
-      case Intrinsic::bitreverse:
-        // All these function are overloadable and have both scalar and vector
-        // versions.
-        Properties |= eBuiltinPropertyVectorEquivalent;
-        SafeIntrinsic = true;
-        break;
-      case Intrinsic::assume:
-      case Intrinsic::dbg_declare:
-      case Intrinsic::dbg_value:
-      case Intrinsic::invariant_start:
-      case Intrinsic::invariant_end:
-      case Intrinsic::lifetime_start:
-      case Intrinsic::lifetime_end:
-      case Intrinsic::objectsize:
-      case Intrinsic::ptr_annotation:
-      case Intrinsic::var_annotation:
-      case Intrinsic::experimental_noalias_scope_decl:
-        SafeIntrinsic = true;
-        break;
-      case Intrinsic::memset:
-      case Intrinsic::memcpy:
-        Properties |= eBuiltinPropertyNoVectorEquivalent;
-        Properties |= eBuiltinPropertySideEffects;
-        break;
+    default:
+      SafeIntrinsic = false;
+      break;
+    case Intrinsic::smin:
+    case Intrinsic::smax:
+    case Intrinsic::umin:
+    case Intrinsic::umax:
+    case Intrinsic::abs:
+    case Intrinsic::ctlz:
+    case Intrinsic::cttz:
+    case Intrinsic::sqrt:
+    case Intrinsic::sin:
+    case Intrinsic::cos:
+    case Intrinsic::pow:
+    case Intrinsic::exp:
+    case Intrinsic::exp2:
+    case Intrinsic::log:
+    case Intrinsic::log10:
+    case Intrinsic::log2:
+    case Intrinsic::fma:
+    case Intrinsic::fabs:
+    case Intrinsic::minnum:
+    case Intrinsic::maxnum:
+    case Intrinsic::copysign:
+    case Intrinsic::floor:
+    case Intrinsic::ceil:
+    case Intrinsic::trunc:
+    case Intrinsic::rint:
+    case Intrinsic::nearbyint:
+    case Intrinsic::round:
+    case Intrinsic::ctpop:
+    case Intrinsic::fmuladd:
+    case Intrinsic::fshl:
+    case Intrinsic::fshr:
+    case Intrinsic::sadd_sat:
+    case Intrinsic::uadd_sat:
+    case Intrinsic::ssub_sat:
+    case Intrinsic::usub_sat:
+    case Intrinsic::bitreverse:
+      // All these function are overloadable and have both scalar and vector
+      // versions.
+      Properties |= eBuiltinPropertyVectorEquivalent;
+      SafeIntrinsic = true;
+      break;
+    case Intrinsic::assume:
+    case Intrinsic::dbg_declare:
+    case Intrinsic::dbg_value:
+    case Intrinsic::invariant_start:
+    case Intrinsic::invariant_end:
+    case Intrinsic::lifetime_start:
+    case Intrinsic::lifetime_end:
+    case Intrinsic::objectsize:
+    case Intrinsic::ptr_annotation:
+    case Intrinsic::var_annotation:
+    case Intrinsic::experimental_noalias_scope_decl:
+      SafeIntrinsic = true;
+      break;
+    case Intrinsic::memset:
+    case Intrinsic::memcpy:
+      Properties |= eBuiltinPropertyNoVectorEquivalent;
+      Properties |= eBuiltinPropertySideEffects;
+      break;
     }
     if (NoSideEffect || SafeIntrinsic) {
       Properties |= eBuiltinPropertyNoSideEffects;
@@ -434,49 +433,49 @@ std::optional<Builtin> BuiltinInfo::analyzeBuiltin(const Function &F) const {
   bool IsConvergent = false;
   unsigned Properties = eBuiltinPropertyNone;
   switch (ID) {
-    default:
-      break;
-    case eMuxBuiltinMemBarrier:
-      Properties = eBuiltinPropertySideEffects;
-      break;
-    case eMuxBuiltinSubGroupBarrier:
-    case eMuxBuiltinWorkGroupBarrier:
-      IsConvergent = true;
-      Properties = eBuiltinPropertyExecutionFlow | eBuiltinPropertySideEffects;
-      break;
-    case eMuxBuiltinDMARead1D:
-    case eMuxBuiltinDMARead2D:
-    case eMuxBuiltinDMARead3D:
-    case eMuxBuiltinDMAWrite1D:
-    case eMuxBuiltinDMAWrite2D:
-    case eMuxBuiltinDMAWrite3D:
-    case eMuxBuiltinDMAWait:
-      // Our DMA builtins, by default, rely on thread checks against specific
-      // work-item IDs, so they must be convergent.
-      IsConvergent = true;
-      Properties = eBuiltinPropertyNoSideEffects;
-      break;
-    case eMuxBuiltinGetWorkDim:
-    case eMuxBuiltinGetGroupId:
-    case eMuxBuiltinGetGlobalSize:
-    case eMuxBuiltinGetGlobalOffset:
-    case eMuxBuiltinGetLocalSize:
-    case eMuxBuiltinGetNumGroups:
-    case eMuxBuiltinGetGlobalLinearId:
-    case eMuxBuiltinGetLocalLinearId:
-    case eMuxBuiltinGetGlobalId:
-    case eMuxBuiltinGetSubGroupLocalId:
-      Properties = eBuiltinPropertyWorkItem | eBuiltinPropertyRematerializable;
-      break;
-    case eMuxBuiltinGetLocalId:
-      Properties = eBuiltinPropertyWorkItem | eBuiltinPropertyLocalID |
-                   eBuiltinPropertyRematerializable;
-      break;
-    case eMuxBuiltinIsFTZ:
-    case eMuxBuiltinIsEmbeddedProfile:
-    case eMuxBuiltinUseFast:
-      Properties = eBuiltinPropertyNoSideEffects;
-      break;
+  default:
+    break;
+  case eMuxBuiltinMemBarrier:
+    Properties = eBuiltinPropertySideEffects;
+    break;
+  case eMuxBuiltinSubGroupBarrier:
+  case eMuxBuiltinWorkGroupBarrier:
+    IsConvergent = true;
+    Properties = eBuiltinPropertyExecutionFlow | eBuiltinPropertySideEffects;
+    break;
+  case eMuxBuiltinDMARead1D:
+  case eMuxBuiltinDMARead2D:
+  case eMuxBuiltinDMARead3D:
+  case eMuxBuiltinDMAWrite1D:
+  case eMuxBuiltinDMAWrite2D:
+  case eMuxBuiltinDMAWrite3D:
+  case eMuxBuiltinDMAWait:
+    // Our DMA builtins, by default, rely on thread checks against specific
+    // work-item IDs, so they must be convergent.
+    IsConvergent = true;
+    Properties = eBuiltinPropertyNoSideEffects;
+    break;
+  case eMuxBuiltinGetWorkDim:
+  case eMuxBuiltinGetGroupId:
+  case eMuxBuiltinGetGlobalSize:
+  case eMuxBuiltinGetGlobalOffset:
+  case eMuxBuiltinGetLocalSize:
+  case eMuxBuiltinGetNumGroups:
+  case eMuxBuiltinGetGlobalLinearId:
+  case eMuxBuiltinGetLocalLinearId:
+  case eMuxBuiltinGetGlobalId:
+  case eMuxBuiltinGetSubGroupLocalId:
+    Properties = eBuiltinPropertyWorkItem | eBuiltinPropertyRematerializable;
+    break;
+  case eMuxBuiltinGetLocalId:
+    Properties = eBuiltinPropertyWorkItem | eBuiltinPropertyLocalID |
+                 eBuiltinPropertyRematerializable;
+    break;
+  case eMuxBuiltinIsFTZ:
+  case eMuxBuiltinIsEmbeddedProfile:
+  case eMuxBuiltinUseFast:
+    Properties = eBuiltinPropertyNoSideEffects;
+    break;
   }
 
   // Group functions are convergent.
@@ -491,8 +490,8 @@ std::optional<Builtin> BuiltinInfo::analyzeBuiltin(const Function &F) const {
   return Builtin{F, ID, (BuiltinProperties)Properties, OverloadInfo};
 }
 
-std::optional<BuiltinCall> BuiltinInfo::analyzeBuiltinCall(
-    const CallInst &CI, unsigned SimdDimIdx) const {
+std::optional<BuiltinCall>
+BuiltinInfo::analyzeBuiltinCall(const CallInst &CI, unsigned SimdDimIdx) const {
   if (auto *const callee = dyn_cast<Function>(CI.getCalledOperand())) {
     if (const auto B = analyzeBuiltin(*callee)) {
       const auto U = isBuiltinUniform(*B, &CI, SimdDimIdx);
@@ -648,25 +647,25 @@ std::string BuiltinInfo::getMangledTypeStr(Type *Ty) {
 
   if (Ty) {
     switch (Ty->getTypeID()) {
-      default:
-        break;
-      case Type::HalfTyID:
-        return "f16";
-      case Type::BFloatTyID:
-        return "bf16";
-      case Type::FloatTyID:
-        return "f32";
-      case Type::DoubleTyID:
-        return "f64";
-      case Type::IntegerTyID:
-        return "i" + utostr(cast<IntegerType>(Ty)->getBitWidth());
+    default:
+      break;
+    case Type::HalfTyID:
+      return "f16";
+    case Type::BFloatTyID:
+      return "bf16";
+    case Type::FloatTyID:
+      return "f32";
+    case Type::DoubleTyID:
+      return "f64";
+    case Type::IntegerTyID:
+      return "i" + utostr(cast<IntegerType>(Ty)->getBitWidth());
     }
   }
   llvm_unreachable("Unhandled type");
 }
 
-std::pair<Type *, StringRef> BuiltinInfo::getDemangledTypeFromStr(
-    StringRef TyStr, LLVMContext &Ctx) {
+std::pair<Type *, StringRef>
+BuiltinInfo::getDemangledTypeFromStr(StringRef TyStr, LLVMContext &Ctx) {
   const bool IsScalable = TyStr.consume_front("nx");
   if (TyStr.consume_front("v")) {
     unsigned EC;
@@ -702,199 +701,199 @@ std::string BuiltinInfo::getMuxBuiltinName(BuiltinID ID,
                                            ArrayRef<Type *> OverloadInfo) {
   assert(isMuxBuiltinID(ID));
   switch (ID) {
-    default:
-      break;
-    case eMuxBuiltinIsFTZ:
-      return MuxBuiltins::isftz;
-    case eMuxBuiltinUseFast:
-      return MuxBuiltins::usefast;
-    case eMuxBuiltinIsEmbeddedProfile:
-      return MuxBuiltins::isembeddedprofile;
-    case eMuxBuiltinGetGlobalSize:
-      return MuxBuiltins::get_global_size;
-    case eMuxBuiltinGetGlobalId:
-      return MuxBuiltins::get_global_id;
-    case eMuxBuiltinGetGlobalOffset:
-      return MuxBuiltins::get_global_offset;
-    case eMuxBuiltinGetLocalSize:
-      return MuxBuiltins::get_local_size;
-    case eMuxBuiltinGetLocalId:
-      return MuxBuiltins::get_local_id;
-    case eMuxBuiltinSetLocalId:
-      return MuxBuiltins::set_local_id;
-    case eMuxBuiltinGetSubGroupId:
-      return MuxBuiltins::get_sub_group_id;
-    case eMuxBuiltinSetSubGroupId:
-      return MuxBuiltins::set_sub_group_id;
-    case eMuxBuiltinGetNumGroups:
-      return MuxBuiltins::get_num_groups;
-    case eMuxBuiltinGetNumSubGroups:
-      return MuxBuiltins::get_num_sub_groups;
-    case eMuxBuiltinSetNumSubGroups:
-      return MuxBuiltins::set_num_sub_groups;
-    case eMuxBuiltinGetMaxSubGroupSize:
-      return MuxBuiltins::get_max_sub_group_size;
-    case eMuxBuiltinSetMaxSubGroupSize:
-      return MuxBuiltins::set_max_sub_group_size;
-    case eMuxBuiltinGetGroupId:
-      return MuxBuiltins::get_group_id;
-    case eMuxBuiltinGetWorkDim:
-      return MuxBuiltins::get_work_dim;
-    case eMuxBuiltinDMARead1D:
-      return MuxBuiltins::dma_read_1d;
-    case eMuxBuiltinDMARead2D:
-      return MuxBuiltins::dma_read_2d;
-    case eMuxBuiltinDMARead3D:
-      return MuxBuiltins::dma_read_3d;
-    case eMuxBuiltinDMAWrite1D:
-      return MuxBuiltins::dma_write_1d;
-    case eMuxBuiltinDMAWrite2D:
-      return MuxBuiltins::dma_write_2d;
-    case eMuxBuiltinDMAWrite3D:
-      return MuxBuiltins::dma_write_3d;
-    case eMuxBuiltinDMAWait:
-      return MuxBuiltins::dma_wait;
-    case eMuxBuiltinGetGlobalLinearId:
-      return MuxBuiltins::get_global_linear_id;
-    case eMuxBuiltinGetLocalLinearId:
-      return MuxBuiltins::get_local_linear_id;
-    case eMuxBuiltinGetEnqueuedLocalSize:
-      return MuxBuiltins::get_enqueued_local_size;
-    case eMuxBuiltinGetSubGroupSize:
-      return MuxBuiltins::get_sub_group_size;
-    case eMuxBuiltinGetSubGroupLocalId:
-      return MuxBuiltins::get_sub_group_local_id;
-    case eMuxBuiltinMemBarrier:
-      return MuxBuiltins::mem_barrier;
-    case eMuxBuiltinWorkGroupBarrier:
-      return MuxBuiltins::work_group_barrier;
-    case eMuxBuiltinSubGroupBarrier:
-      return MuxBuiltins::sub_group_barrier;
+  default:
+    break;
+  case eMuxBuiltinIsFTZ:
+    return MuxBuiltins::isftz;
+  case eMuxBuiltinUseFast:
+    return MuxBuiltins::usefast;
+  case eMuxBuiltinIsEmbeddedProfile:
+    return MuxBuiltins::isembeddedprofile;
+  case eMuxBuiltinGetGlobalSize:
+    return MuxBuiltins::get_global_size;
+  case eMuxBuiltinGetGlobalId:
+    return MuxBuiltins::get_global_id;
+  case eMuxBuiltinGetGlobalOffset:
+    return MuxBuiltins::get_global_offset;
+  case eMuxBuiltinGetLocalSize:
+    return MuxBuiltins::get_local_size;
+  case eMuxBuiltinGetLocalId:
+    return MuxBuiltins::get_local_id;
+  case eMuxBuiltinSetLocalId:
+    return MuxBuiltins::set_local_id;
+  case eMuxBuiltinGetSubGroupId:
+    return MuxBuiltins::get_sub_group_id;
+  case eMuxBuiltinSetSubGroupId:
+    return MuxBuiltins::set_sub_group_id;
+  case eMuxBuiltinGetNumGroups:
+    return MuxBuiltins::get_num_groups;
+  case eMuxBuiltinGetNumSubGroups:
+    return MuxBuiltins::get_num_sub_groups;
+  case eMuxBuiltinSetNumSubGroups:
+    return MuxBuiltins::set_num_sub_groups;
+  case eMuxBuiltinGetMaxSubGroupSize:
+    return MuxBuiltins::get_max_sub_group_size;
+  case eMuxBuiltinSetMaxSubGroupSize:
+    return MuxBuiltins::set_max_sub_group_size;
+  case eMuxBuiltinGetGroupId:
+    return MuxBuiltins::get_group_id;
+  case eMuxBuiltinGetWorkDim:
+    return MuxBuiltins::get_work_dim;
+  case eMuxBuiltinDMARead1D:
+    return MuxBuiltins::dma_read_1d;
+  case eMuxBuiltinDMARead2D:
+    return MuxBuiltins::dma_read_2d;
+  case eMuxBuiltinDMARead3D:
+    return MuxBuiltins::dma_read_3d;
+  case eMuxBuiltinDMAWrite1D:
+    return MuxBuiltins::dma_write_1d;
+  case eMuxBuiltinDMAWrite2D:
+    return MuxBuiltins::dma_write_2d;
+  case eMuxBuiltinDMAWrite3D:
+    return MuxBuiltins::dma_write_3d;
+  case eMuxBuiltinDMAWait:
+    return MuxBuiltins::dma_wait;
+  case eMuxBuiltinGetGlobalLinearId:
+    return MuxBuiltins::get_global_linear_id;
+  case eMuxBuiltinGetLocalLinearId:
+    return MuxBuiltins::get_local_linear_id;
+  case eMuxBuiltinGetEnqueuedLocalSize:
+    return MuxBuiltins::get_enqueued_local_size;
+  case eMuxBuiltinGetSubGroupSize:
+    return MuxBuiltins::get_sub_group_size;
+  case eMuxBuiltinGetSubGroupLocalId:
+    return MuxBuiltins::get_sub_group_local_id;
+  case eMuxBuiltinMemBarrier:
+    return MuxBuiltins::mem_barrier;
+  case eMuxBuiltinWorkGroupBarrier:
+    return MuxBuiltins::work_group_barrier;
+  case eMuxBuiltinSubGroupBarrier:
+    return MuxBuiltins::sub_group_barrier;
   }
 
-    // A sneaky macro to do case statements on all scopes of a group operation.
-    // Note that it is missing a leading 'case' and a trailing ':' to trick
-    // clang-format into formatting it like a regular case statement.
-#define CASE_GROUP_OP_ALL_SCOPES(OP)                      \
-  eMuxBuiltinVecgroup##OP : case eMuxBuiltinSubgroup##OP: \
+  // A sneaky macro to do case statements on all scopes of a group operation.
+  // Note that it is missing a leading 'case' and a trailing ':' to trick
+  // clang-format into formatting it like a regular case statement.
+#define CASE_GROUP_OP_ALL_SCOPES(OP)                                           \
+  eMuxBuiltinVecgroup##OP : case eMuxBuiltinSubgroup##OP:                      \
   case eMuxBuiltinWorkgroup##OP
 
   std::string BaseName = [](BuiltinID ID) {
     // For simplicity, return all group operations as 'work_group' and replace
     // the string with 'sub_group' or 'vec_group' post-hoc.
     switch (ID) {
-      default:
-        return "";
-      case CASE_GROUP_OP_ALL_SCOPES(All):
-        return "__mux_work_group_all";
-      case CASE_GROUP_OP_ALL_SCOPES(Any):
-        return "__mux_work_group_any";
-      case CASE_GROUP_OP_ALL_SCOPES(Broadcast):
-        return "__mux_work_group_broadcast";
-      case CASE_GROUP_OP_ALL_SCOPES(ReduceAdd):
-        return "__mux_work_group_reduce_add";
-      case CASE_GROUP_OP_ALL_SCOPES(ReduceFAdd):
-        return "__mux_work_group_reduce_fadd";
-      case CASE_GROUP_OP_ALL_SCOPES(ReduceSMin):
-        return "__mux_work_group_reduce_smin";
-      case CASE_GROUP_OP_ALL_SCOPES(ReduceUMin):
-        return "__mux_work_group_reduce_umin";
-      case CASE_GROUP_OP_ALL_SCOPES(ReduceFMin):
-        return "__mux_work_group_reduce_fmin";
-      case CASE_GROUP_OP_ALL_SCOPES(ReduceSMax):
-        return "__mux_work_group_reduce_smax";
-      case CASE_GROUP_OP_ALL_SCOPES(ReduceUMax):
-        return "__mux_work_group_reduce_umax";
-      case CASE_GROUP_OP_ALL_SCOPES(ReduceFMax):
-        return "__mux_work_group_reduce_fmax";
-      case CASE_GROUP_OP_ALL_SCOPES(ReduceMul):
-        return "__mux_work_group_reduce_mul";
-      case CASE_GROUP_OP_ALL_SCOPES(ReduceFMul):
-        return "__mux_work_group_reduce_fmul";
-      case CASE_GROUP_OP_ALL_SCOPES(ReduceAnd):
-        return "__mux_work_group_reduce_and";
-      case CASE_GROUP_OP_ALL_SCOPES(ReduceOr):
-        return "__mux_work_group_reduce_or";
-      case CASE_GROUP_OP_ALL_SCOPES(ReduceXor):
-        return "__mux_work_group_reduce_xor";
-      case CASE_GROUP_OP_ALL_SCOPES(ReduceLogicalAnd):
-        return "__mux_work_group_reduce_logical_and";
-      case CASE_GROUP_OP_ALL_SCOPES(ReduceLogicalOr):
-        return "__mux_work_group_reduce_logical_or";
-      case CASE_GROUP_OP_ALL_SCOPES(ReduceLogicalXor):
-        return "__mux_work_group_reduce_logical_xor";
-      case CASE_GROUP_OP_ALL_SCOPES(ScanAddInclusive):
-        return "__mux_work_group_scan_inclusive_add";
-      case CASE_GROUP_OP_ALL_SCOPES(ScanFAddInclusive):
-        return "__mux_work_group_scan_inclusive_fadd";
-      case CASE_GROUP_OP_ALL_SCOPES(ScanAddExclusive):
-        return "__mux_work_group_scan_exclusive_add";
-      case CASE_GROUP_OP_ALL_SCOPES(ScanFAddExclusive):
-        return "__mux_work_group_scan_exclusive_fadd";
-      case CASE_GROUP_OP_ALL_SCOPES(ScanSMinInclusive):
-        return "__mux_work_group_scan_inclusive_smin";
-      case CASE_GROUP_OP_ALL_SCOPES(ScanUMinInclusive):
-        return "__mux_work_group_scan_inclusive_umin";
-      case CASE_GROUP_OP_ALL_SCOPES(ScanFMinInclusive):
-        return "__mux_work_group_scan_inclusive_fmin";
-      case CASE_GROUP_OP_ALL_SCOPES(ScanSMinExclusive):
-        return "__mux_work_group_scan_exclusive_smin";
-      case CASE_GROUP_OP_ALL_SCOPES(ScanUMinExclusive):
-        return "__mux_work_group_scan_exclusive_umin";
-      case CASE_GROUP_OP_ALL_SCOPES(ScanFMinExclusive):
-        return "__mux_work_group_scan_exclusive_fmin";
-      case CASE_GROUP_OP_ALL_SCOPES(ScanSMaxInclusive):
-        return "__mux_work_group_scan_inclusive_smax";
-      case CASE_GROUP_OP_ALL_SCOPES(ScanUMaxInclusive):
-        return "__mux_work_group_scan_inclusive_umax";
-      case CASE_GROUP_OP_ALL_SCOPES(ScanFMaxInclusive):
-        return "__mux_work_group_scan_inclusive_fmax";
-      case CASE_GROUP_OP_ALL_SCOPES(ScanSMaxExclusive):
-        return "__mux_work_group_scan_exclusive_smax";
-      case CASE_GROUP_OP_ALL_SCOPES(ScanUMaxExclusive):
-        return "__mux_work_group_scan_exclusive_umax";
-      case CASE_GROUP_OP_ALL_SCOPES(ScanFMaxExclusive):
-        return "__mux_work_group_scan_exclusive_fmax";
-      case CASE_GROUP_OP_ALL_SCOPES(ScanMulInclusive):
-        return "__mux_work_group_scan_inclusive_mul";
-      case CASE_GROUP_OP_ALL_SCOPES(ScanFMulInclusive):
-        return "__mux_work_group_scan_inclusive_fmul";
-      case CASE_GROUP_OP_ALL_SCOPES(ScanMulExclusive):
-        return "__mux_work_group_scan_exclusive_mul";
-      case CASE_GROUP_OP_ALL_SCOPES(ScanFMulExclusive):
-        return "__mux_work_group_scan_exclusive_fmul";
-      case CASE_GROUP_OP_ALL_SCOPES(ScanAndInclusive):
-        return "__mux_work_group_scan_inclusive_and";
-      case CASE_GROUP_OP_ALL_SCOPES(ScanAndExclusive):
-        return "__mux_work_group_scan_exclusive_and";
-      case CASE_GROUP_OP_ALL_SCOPES(ScanOrInclusive):
-        return "__mux_work_group_scan_inclusive_or";
-      case CASE_GROUP_OP_ALL_SCOPES(ScanOrExclusive):
-        return "__mux_work_group_scan_exclusive_or";
-      case CASE_GROUP_OP_ALL_SCOPES(ScanXorInclusive):
-        return "__mux_work_group_scan_inclusive_xor";
-      case CASE_GROUP_OP_ALL_SCOPES(ScanXorExclusive):
-        return "__mux_work_group_scan_exclusive_xor";
-      case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalAndInclusive):
-        return "__mux_work_group_scan_inclusive_logical_and";
-      case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalAndExclusive):
-        return "__mux_work_group_scan_exclusive_logical_and";
-      case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalOrInclusive):
-        return "__mux_work_group_scan_inclusive_logical_or";
-      case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalOrExclusive):
-        return "__mux_work_group_scan_exclusive_logical_or";
-      case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalXorInclusive):
-        return "__mux_work_group_scan_inclusive_logical_xor";
-      case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalXorExclusive):
-        return "__mux_work_group_scan_exclusive_logical_xor";
-      case eMuxBuiltinSubgroupShuffle:
-        return "__mux_work_group_shuffle";
-      case eMuxBuiltinSubgroupShuffleUp:
-        return "__mux_work_group_shuffle_up";
-      case eMuxBuiltinSubgroupShuffleDown:
-        return "__mux_work_group_shuffle_down";
-      case eMuxBuiltinSubgroupShuffleXor:
-        return "__mux_work_group_shuffle_xor";
+    default:
+      return "";
+    case CASE_GROUP_OP_ALL_SCOPES(All):
+      return "__mux_work_group_all";
+    case CASE_GROUP_OP_ALL_SCOPES(Any):
+      return "__mux_work_group_any";
+    case CASE_GROUP_OP_ALL_SCOPES(Broadcast):
+      return "__mux_work_group_broadcast";
+    case CASE_GROUP_OP_ALL_SCOPES(ReduceAdd):
+      return "__mux_work_group_reduce_add";
+    case CASE_GROUP_OP_ALL_SCOPES(ReduceFAdd):
+      return "__mux_work_group_reduce_fadd";
+    case CASE_GROUP_OP_ALL_SCOPES(ReduceSMin):
+      return "__mux_work_group_reduce_smin";
+    case CASE_GROUP_OP_ALL_SCOPES(ReduceUMin):
+      return "__mux_work_group_reduce_umin";
+    case CASE_GROUP_OP_ALL_SCOPES(ReduceFMin):
+      return "__mux_work_group_reduce_fmin";
+    case CASE_GROUP_OP_ALL_SCOPES(ReduceSMax):
+      return "__mux_work_group_reduce_smax";
+    case CASE_GROUP_OP_ALL_SCOPES(ReduceUMax):
+      return "__mux_work_group_reduce_umax";
+    case CASE_GROUP_OP_ALL_SCOPES(ReduceFMax):
+      return "__mux_work_group_reduce_fmax";
+    case CASE_GROUP_OP_ALL_SCOPES(ReduceMul):
+      return "__mux_work_group_reduce_mul";
+    case CASE_GROUP_OP_ALL_SCOPES(ReduceFMul):
+      return "__mux_work_group_reduce_fmul";
+    case CASE_GROUP_OP_ALL_SCOPES(ReduceAnd):
+      return "__mux_work_group_reduce_and";
+    case CASE_GROUP_OP_ALL_SCOPES(ReduceOr):
+      return "__mux_work_group_reduce_or";
+    case CASE_GROUP_OP_ALL_SCOPES(ReduceXor):
+      return "__mux_work_group_reduce_xor";
+    case CASE_GROUP_OP_ALL_SCOPES(ReduceLogicalAnd):
+      return "__mux_work_group_reduce_logical_and";
+    case CASE_GROUP_OP_ALL_SCOPES(ReduceLogicalOr):
+      return "__mux_work_group_reduce_logical_or";
+    case CASE_GROUP_OP_ALL_SCOPES(ReduceLogicalXor):
+      return "__mux_work_group_reduce_logical_xor";
+    case CASE_GROUP_OP_ALL_SCOPES(ScanAddInclusive):
+      return "__mux_work_group_scan_inclusive_add";
+    case CASE_GROUP_OP_ALL_SCOPES(ScanFAddInclusive):
+      return "__mux_work_group_scan_inclusive_fadd";
+    case CASE_GROUP_OP_ALL_SCOPES(ScanAddExclusive):
+      return "__mux_work_group_scan_exclusive_add";
+    case CASE_GROUP_OP_ALL_SCOPES(ScanFAddExclusive):
+      return "__mux_work_group_scan_exclusive_fadd";
+    case CASE_GROUP_OP_ALL_SCOPES(ScanSMinInclusive):
+      return "__mux_work_group_scan_inclusive_smin";
+    case CASE_GROUP_OP_ALL_SCOPES(ScanUMinInclusive):
+      return "__mux_work_group_scan_inclusive_umin";
+    case CASE_GROUP_OP_ALL_SCOPES(ScanFMinInclusive):
+      return "__mux_work_group_scan_inclusive_fmin";
+    case CASE_GROUP_OP_ALL_SCOPES(ScanSMinExclusive):
+      return "__mux_work_group_scan_exclusive_smin";
+    case CASE_GROUP_OP_ALL_SCOPES(ScanUMinExclusive):
+      return "__mux_work_group_scan_exclusive_umin";
+    case CASE_GROUP_OP_ALL_SCOPES(ScanFMinExclusive):
+      return "__mux_work_group_scan_exclusive_fmin";
+    case CASE_GROUP_OP_ALL_SCOPES(ScanSMaxInclusive):
+      return "__mux_work_group_scan_inclusive_smax";
+    case CASE_GROUP_OP_ALL_SCOPES(ScanUMaxInclusive):
+      return "__mux_work_group_scan_inclusive_umax";
+    case CASE_GROUP_OP_ALL_SCOPES(ScanFMaxInclusive):
+      return "__mux_work_group_scan_inclusive_fmax";
+    case CASE_GROUP_OP_ALL_SCOPES(ScanSMaxExclusive):
+      return "__mux_work_group_scan_exclusive_smax";
+    case CASE_GROUP_OP_ALL_SCOPES(ScanUMaxExclusive):
+      return "__mux_work_group_scan_exclusive_umax";
+    case CASE_GROUP_OP_ALL_SCOPES(ScanFMaxExclusive):
+      return "__mux_work_group_scan_exclusive_fmax";
+    case CASE_GROUP_OP_ALL_SCOPES(ScanMulInclusive):
+      return "__mux_work_group_scan_inclusive_mul";
+    case CASE_GROUP_OP_ALL_SCOPES(ScanFMulInclusive):
+      return "__mux_work_group_scan_inclusive_fmul";
+    case CASE_GROUP_OP_ALL_SCOPES(ScanMulExclusive):
+      return "__mux_work_group_scan_exclusive_mul";
+    case CASE_GROUP_OP_ALL_SCOPES(ScanFMulExclusive):
+      return "__mux_work_group_scan_exclusive_fmul";
+    case CASE_GROUP_OP_ALL_SCOPES(ScanAndInclusive):
+      return "__mux_work_group_scan_inclusive_and";
+    case CASE_GROUP_OP_ALL_SCOPES(ScanAndExclusive):
+      return "__mux_work_group_scan_exclusive_and";
+    case CASE_GROUP_OP_ALL_SCOPES(ScanOrInclusive):
+      return "__mux_work_group_scan_inclusive_or";
+    case CASE_GROUP_OP_ALL_SCOPES(ScanOrExclusive):
+      return "__mux_work_group_scan_exclusive_or";
+    case CASE_GROUP_OP_ALL_SCOPES(ScanXorInclusive):
+      return "__mux_work_group_scan_inclusive_xor";
+    case CASE_GROUP_OP_ALL_SCOPES(ScanXorExclusive):
+      return "__mux_work_group_scan_exclusive_xor";
+    case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalAndInclusive):
+      return "__mux_work_group_scan_inclusive_logical_and";
+    case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalAndExclusive):
+      return "__mux_work_group_scan_exclusive_logical_and";
+    case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalOrInclusive):
+      return "__mux_work_group_scan_inclusive_logical_or";
+    case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalOrExclusive):
+      return "__mux_work_group_scan_exclusive_logical_or";
+    case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalXorInclusive):
+      return "__mux_work_group_scan_inclusive_logical_xor";
+    case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalXorExclusive):
+      return "__mux_work_group_scan_exclusive_logical_xor";
+    case eMuxBuiltinSubgroupShuffle:
+      return "__mux_work_group_shuffle";
+    case eMuxBuiltinSubgroupShuffleUp:
+      return "__mux_work_group_shuffle_up";
+    case eMuxBuiltinSubgroupShuffleDown:
+      return "__mux_work_group_shuffle_down";
+    case eMuxBuiltinSubgroupShuffleXor:
+      return "__mux_work_group_shuffle_xor";
     }
   }(ID);
 
@@ -966,181 +965,181 @@ std::optional<GroupCollective> BuiltinInfo::isMuxGroupCollective(BuiltinID ID) {
   // A sneaky macro to do case statements on all scopes of a group operation.
   // Note that it is missing a leading 'case' and a trailing ':' to trick
   // clang-format into formatting it like a regular case statement.
-#define CASE_GROUP_OP_ALL_SCOPES(OP)                      \
-  eMuxBuiltinVecgroup##OP : case eMuxBuiltinSubgroup##OP: \
+#define CASE_GROUP_OP_ALL_SCOPES(OP)                                           \
+  eMuxBuiltinVecgroup##OP : case eMuxBuiltinSubgroup##OP:                      \
   case eMuxBuiltinWorkgroup##OP
 
   switch (ID) {
-    default:
-      llvm_unreachable("Unhandled mux group builtin");
-    case CASE_GROUP_OP_ALL_SCOPES(All):
-      Collective.Op = GroupCollective::OpKind::All;
-      break;
-    case CASE_GROUP_OP_ALL_SCOPES(Any):
-      Collective.Op = GroupCollective::OpKind::Any;
-      break;
-    case CASE_GROUP_OP_ALL_SCOPES(Broadcast):
-      Collective.Op = GroupCollective::OpKind::Broadcast;
-      break;
-    case CASE_GROUP_OP_ALL_SCOPES(ReduceLogicalAnd):
-    case CASE_GROUP_OP_ALL_SCOPES(ReduceLogicalOr):
-    case CASE_GROUP_OP_ALL_SCOPES(ReduceLogicalXor):
-      Collective.IsLogical = true;
-      [[fallthrough]];
+  default:
+    llvm_unreachable("Unhandled mux group builtin");
+  case CASE_GROUP_OP_ALL_SCOPES(All):
+    Collective.Op = GroupCollective::OpKind::All;
+    break;
+  case CASE_GROUP_OP_ALL_SCOPES(Any):
+    Collective.Op = GroupCollective::OpKind::Any;
+    break;
+  case CASE_GROUP_OP_ALL_SCOPES(Broadcast):
+    Collective.Op = GroupCollective::OpKind::Broadcast;
+    break;
+  case CASE_GROUP_OP_ALL_SCOPES(ReduceLogicalAnd):
+  case CASE_GROUP_OP_ALL_SCOPES(ReduceLogicalOr):
+  case CASE_GROUP_OP_ALL_SCOPES(ReduceLogicalXor):
+    Collective.IsLogical = true;
+    [[fallthrough]];
+  case CASE_GROUP_OP_ALL_SCOPES(ReduceAdd):
+  case CASE_GROUP_OP_ALL_SCOPES(ReduceFAdd):
+  case CASE_GROUP_OP_ALL_SCOPES(ReduceMul):
+  case CASE_GROUP_OP_ALL_SCOPES(ReduceFMul):
+  case CASE_GROUP_OP_ALL_SCOPES(ReduceSMin):
+  case CASE_GROUP_OP_ALL_SCOPES(ReduceUMin):
+  case CASE_GROUP_OP_ALL_SCOPES(ReduceFMin):
+  case CASE_GROUP_OP_ALL_SCOPES(ReduceSMax):
+  case CASE_GROUP_OP_ALL_SCOPES(ReduceUMax):
+  case CASE_GROUP_OP_ALL_SCOPES(ReduceFMax):
+  case CASE_GROUP_OP_ALL_SCOPES(ReduceAnd):
+  case CASE_GROUP_OP_ALL_SCOPES(ReduceOr):
+  case CASE_GROUP_OP_ALL_SCOPES(ReduceXor):
+    Collective.Op = GroupCollective::OpKind::Reduction;
+    break;
+  case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalAndInclusive):
+  case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalOrInclusive):
+  case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalXorInclusive):
+    Collective.IsLogical = true;
+    [[fallthrough]];
+  case CASE_GROUP_OP_ALL_SCOPES(ScanAddInclusive):
+  case CASE_GROUP_OP_ALL_SCOPES(ScanFAddInclusive):
+  case CASE_GROUP_OP_ALL_SCOPES(ScanMulInclusive):
+  case CASE_GROUP_OP_ALL_SCOPES(ScanFMulInclusive):
+  case CASE_GROUP_OP_ALL_SCOPES(ScanSMinInclusive):
+  case CASE_GROUP_OP_ALL_SCOPES(ScanUMinInclusive):
+  case CASE_GROUP_OP_ALL_SCOPES(ScanFMinInclusive):
+  case CASE_GROUP_OP_ALL_SCOPES(ScanSMaxInclusive):
+  case CASE_GROUP_OP_ALL_SCOPES(ScanUMaxInclusive):
+  case CASE_GROUP_OP_ALL_SCOPES(ScanFMaxInclusive):
+  case CASE_GROUP_OP_ALL_SCOPES(ScanAndInclusive):
+  case CASE_GROUP_OP_ALL_SCOPES(ScanOrInclusive):
+  case CASE_GROUP_OP_ALL_SCOPES(ScanXorInclusive):
+    Collective.Op = GroupCollective::OpKind::ScanInclusive;
+    break;
+  case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalAndExclusive):
+  case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalOrExclusive):
+  case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalXorExclusive):
+    Collective.IsLogical = true;
+    [[fallthrough]];
+  case CASE_GROUP_OP_ALL_SCOPES(ScanAddExclusive):
+  case CASE_GROUP_OP_ALL_SCOPES(ScanFAddExclusive):
+  case CASE_GROUP_OP_ALL_SCOPES(ScanMulExclusive):
+  case CASE_GROUP_OP_ALL_SCOPES(ScanFMulExclusive):
+  case CASE_GROUP_OP_ALL_SCOPES(ScanSMinExclusive):
+  case CASE_GROUP_OP_ALL_SCOPES(ScanUMinExclusive):
+  case CASE_GROUP_OP_ALL_SCOPES(ScanFMinExclusive):
+  case CASE_GROUP_OP_ALL_SCOPES(ScanSMaxExclusive):
+  case CASE_GROUP_OP_ALL_SCOPES(ScanUMaxExclusive):
+  case CASE_GROUP_OP_ALL_SCOPES(ScanFMaxExclusive):
+  case CASE_GROUP_OP_ALL_SCOPES(ScanAndExclusive):
+  case CASE_GROUP_OP_ALL_SCOPES(ScanOrExclusive):
+  case CASE_GROUP_OP_ALL_SCOPES(ScanXorExclusive):
+    Collective.Op = GroupCollective::OpKind::ScanExclusive;
+    break;
+  case eMuxBuiltinSubgroupShuffle:
+    Collective.Op = GroupCollective::OpKind::Shuffle;
+    break;
+  case eMuxBuiltinSubgroupShuffleUp:
+    Collective.Op = GroupCollective::OpKind::ShuffleUp;
+    break;
+  case eMuxBuiltinSubgroupShuffleDown:
+    Collective.Op = GroupCollective::OpKind::ShuffleDown;
+    break;
+  case eMuxBuiltinSubgroupShuffleXor:
+    Collective.Op = GroupCollective::OpKind::ShuffleXor;
+    break;
+  }
+
+  // Then the recurrence kind.
+  if (Collective.Op == GroupCollective::OpKind::All) {
+    Collective.Recurrence = RecurKind::And;
+  } else if (Collective.Op == GroupCollective::OpKind::Any) {
+    Collective.Recurrence = RecurKind::Or;
+  } else if (Collective.Op == GroupCollective::OpKind::Reduction ||
+             Collective.Op == GroupCollective::OpKind::ScanExclusive ||
+             Collective.Op == GroupCollective::OpKind::ScanInclusive) {
+    switch (ID) {
     case CASE_GROUP_OP_ALL_SCOPES(ReduceAdd):
-    case CASE_GROUP_OP_ALL_SCOPES(ReduceFAdd):
-    case CASE_GROUP_OP_ALL_SCOPES(ReduceMul):
-    case CASE_GROUP_OP_ALL_SCOPES(ReduceFMul):
-    case CASE_GROUP_OP_ALL_SCOPES(ReduceSMin):
-    case CASE_GROUP_OP_ALL_SCOPES(ReduceUMin):
-    case CASE_GROUP_OP_ALL_SCOPES(ReduceFMin):
-    case CASE_GROUP_OP_ALL_SCOPES(ReduceSMax):
-    case CASE_GROUP_OP_ALL_SCOPES(ReduceUMax):
-    case CASE_GROUP_OP_ALL_SCOPES(ReduceFMax):
-    case CASE_GROUP_OP_ALL_SCOPES(ReduceAnd):
-    case CASE_GROUP_OP_ALL_SCOPES(ReduceOr):
-    case CASE_GROUP_OP_ALL_SCOPES(ReduceXor):
-      Collective.Op = GroupCollective::OpKind::Reduction;
-      break;
-    case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalAndInclusive):
-    case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalOrInclusive):
-    case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalXorInclusive):
-      Collective.IsLogical = true;
-      [[fallthrough]];
     case CASE_GROUP_OP_ALL_SCOPES(ScanAddInclusive):
-    case CASE_GROUP_OP_ALL_SCOPES(ScanFAddInclusive):
-    case CASE_GROUP_OP_ALL_SCOPES(ScanMulInclusive):
-    case CASE_GROUP_OP_ALL_SCOPES(ScanFMulInclusive):
-    case CASE_GROUP_OP_ALL_SCOPES(ScanSMinInclusive):
-    case CASE_GROUP_OP_ALL_SCOPES(ScanUMinInclusive):
-    case CASE_GROUP_OP_ALL_SCOPES(ScanFMinInclusive):
-    case CASE_GROUP_OP_ALL_SCOPES(ScanSMaxInclusive):
-    case CASE_GROUP_OP_ALL_SCOPES(ScanUMaxInclusive):
-    case CASE_GROUP_OP_ALL_SCOPES(ScanFMaxInclusive):
-    case CASE_GROUP_OP_ALL_SCOPES(ScanAndInclusive):
-    case CASE_GROUP_OP_ALL_SCOPES(ScanOrInclusive):
-    case CASE_GROUP_OP_ALL_SCOPES(ScanXorInclusive):
-      Collective.Op = GroupCollective::OpKind::ScanInclusive;
-      break;
-    case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalAndExclusive):
-    case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalOrExclusive):
-    case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalXorExclusive):
-      Collective.IsLogical = true;
-      [[fallthrough]];
     case CASE_GROUP_OP_ALL_SCOPES(ScanAddExclusive):
+      Collective.Recurrence = RecurKind::Add;
+      break;
+    case CASE_GROUP_OP_ALL_SCOPES(ReduceFAdd):
+    case CASE_GROUP_OP_ALL_SCOPES(ScanFAddInclusive):
     case CASE_GROUP_OP_ALL_SCOPES(ScanFAddExclusive):
+      Collective.Recurrence = RecurKind::FAdd;
+      break;
+    case CASE_GROUP_OP_ALL_SCOPES(ReduceMul):
+    case CASE_GROUP_OP_ALL_SCOPES(ScanMulInclusive):
     case CASE_GROUP_OP_ALL_SCOPES(ScanMulExclusive):
+      Collective.Recurrence = RecurKind::Mul;
+      break;
+    case CASE_GROUP_OP_ALL_SCOPES(ReduceFMul):
+    case CASE_GROUP_OP_ALL_SCOPES(ScanFMulInclusive):
     case CASE_GROUP_OP_ALL_SCOPES(ScanFMulExclusive):
+      Collective.Recurrence = RecurKind::FMul;
+      break;
+    case CASE_GROUP_OP_ALL_SCOPES(ReduceSMin):
+    case CASE_GROUP_OP_ALL_SCOPES(ScanSMinInclusive):
     case CASE_GROUP_OP_ALL_SCOPES(ScanSMinExclusive):
+      Collective.Recurrence = RecurKind::SMin;
+      break;
+    case CASE_GROUP_OP_ALL_SCOPES(ReduceUMin):
+    case CASE_GROUP_OP_ALL_SCOPES(ScanUMinInclusive):
     case CASE_GROUP_OP_ALL_SCOPES(ScanUMinExclusive):
+      Collective.Recurrence = RecurKind::UMin;
+      break;
+    case CASE_GROUP_OP_ALL_SCOPES(ReduceFMin):
+    case CASE_GROUP_OP_ALL_SCOPES(ScanFMinInclusive):
     case CASE_GROUP_OP_ALL_SCOPES(ScanFMinExclusive):
+      Collective.Recurrence = RecurKind::FMin;
+      break;
+    case CASE_GROUP_OP_ALL_SCOPES(ReduceSMax):
+    case CASE_GROUP_OP_ALL_SCOPES(ScanSMaxInclusive):
     case CASE_GROUP_OP_ALL_SCOPES(ScanSMaxExclusive):
+      Collective.Recurrence = RecurKind::SMax;
+      break;
+    case CASE_GROUP_OP_ALL_SCOPES(ReduceUMax):
+    case CASE_GROUP_OP_ALL_SCOPES(ScanUMaxInclusive):
     case CASE_GROUP_OP_ALL_SCOPES(ScanUMaxExclusive):
-    case CASE_GROUP_OP_ALL_SCOPES(ScanFMaxExclusive):
-    case CASE_GROUP_OP_ALL_SCOPES(ScanAndExclusive):
-    case CASE_GROUP_OP_ALL_SCOPES(ScanOrExclusive):
-    case CASE_GROUP_OP_ALL_SCOPES(ScanXorExclusive):
-      Collective.Op = GroupCollective::OpKind::ScanExclusive;
+      Collective.Recurrence = RecurKind::UMax;
       break;
-    case eMuxBuiltinSubgroupShuffle:
-      Collective.Op = GroupCollective::OpKind::Shuffle;
+    case CASE_GROUP_OP_ALL_SCOPES(ReduceFMax):
+    case CASE_GROUP_OP_ALL_SCOPES(ScanFMaxInclusive):
+    case CASE_GROUP_OP_ALL_SCOPES(ScanFMaxExclusive):
+      Collective.Recurrence = RecurKind::FMax;
       break;
-    case eMuxBuiltinSubgroupShuffleUp:
-      Collective.Op = GroupCollective::OpKind::ShuffleUp;
+    case CASE_GROUP_OP_ALL_SCOPES(ReduceAnd):
+    case CASE_GROUP_OP_ALL_SCOPES(ReduceLogicalAnd):
+    case CASE_GROUP_OP_ALL_SCOPES(ScanAndInclusive):
+    case CASE_GROUP_OP_ALL_SCOPES(ScanAndExclusive):
+    case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalAndInclusive):
+    case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalAndExclusive):
+      Collective.Recurrence = RecurKind::And;
       break;
-    case eMuxBuiltinSubgroupShuffleDown:
-      Collective.Op = GroupCollective::OpKind::ShuffleDown;
+    case CASE_GROUP_OP_ALL_SCOPES(ReduceOr):
+    case CASE_GROUP_OP_ALL_SCOPES(ReduceLogicalOr):
+    case CASE_GROUP_OP_ALL_SCOPES(ScanOrInclusive):
+    case CASE_GROUP_OP_ALL_SCOPES(ScanOrExclusive):
+    case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalOrInclusive):
+    case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalOrExclusive):
+      Collective.Recurrence = RecurKind::Or;
       break;
-    case eMuxBuiltinSubgroupShuffleXor:
-      Collective.Op = GroupCollective::OpKind::ShuffleXor;
+    case CASE_GROUP_OP_ALL_SCOPES(ReduceXor):
+    case CASE_GROUP_OP_ALL_SCOPES(ReduceLogicalXor):
+    case CASE_GROUP_OP_ALL_SCOPES(ScanXorInclusive):
+    case CASE_GROUP_OP_ALL_SCOPES(ScanXorExclusive):
+    case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalXorInclusive):
+    case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalXorExclusive):
+      Collective.Recurrence = RecurKind::Xor;
       break;
-  }
-
-  // Then the recurrence kind.
-  if (Collective.Op == GroupCollective::OpKind::All) {
-    Collective.Recurrence = RecurKind::And;
-  } else if (Collective.Op == GroupCollective::OpKind::Any) {
-    Collective.Recurrence = RecurKind::Or;
-  } else if (Collective.Op == GroupCollective::OpKind::Reduction ||
-             Collective.Op == GroupCollective::OpKind::ScanExclusive ||
-             Collective.Op == GroupCollective::OpKind::ScanInclusive) {
-    switch (ID) {
-      case CASE_GROUP_OP_ALL_SCOPES(ReduceAdd):
-      case CASE_GROUP_OP_ALL_SCOPES(ScanAddInclusive):
-      case CASE_GROUP_OP_ALL_SCOPES(ScanAddExclusive):
-        Collective.Recurrence = RecurKind::Add;
-        break;
-      case CASE_GROUP_OP_ALL_SCOPES(ReduceFAdd):
-      case CASE_GROUP_OP_ALL_SCOPES(ScanFAddInclusive):
-      case CASE_GROUP_OP_ALL_SCOPES(ScanFAddExclusive):
-        Collective.Recurrence = RecurKind::FAdd;
-        break;
-      case CASE_GROUP_OP_ALL_SCOPES(ReduceMul):
-      case CASE_GROUP_OP_ALL_SCOPES(ScanMulInclusive):
-      case CASE_GROUP_OP_ALL_SCOPES(ScanMulExclusive):
-        Collective.Recurrence = RecurKind::Mul;
-        break;
-      case CASE_GROUP_OP_ALL_SCOPES(ReduceFMul):
-      case CASE_GROUP_OP_ALL_SCOPES(ScanFMulInclusive):
-      case CASE_GROUP_OP_ALL_SCOPES(ScanFMulExclusive):
-        Collective.Recurrence = RecurKind::FMul;
-        break;
-      case CASE_GROUP_OP_ALL_SCOPES(ReduceSMin):
-      case CASE_GROUP_OP_ALL_SCOPES(ScanSMinInclusive):
-      case CASE_GROUP_OP_ALL_SCOPES(ScanSMinExclusive):
-        Collective.Recurrence = RecurKind::SMin;
-        break;
-      case CASE_GROUP_OP_ALL_SCOPES(ReduceUMin):
-      case CASE_GROUP_OP_ALL_SCOPES(ScanUMinInclusive):
-      case CASE_GROUP_OP_ALL_SCOPES(ScanUMinExclusive):
-        Collective.Recurrence = RecurKind::UMin;
-        break;
-      case CASE_GROUP_OP_ALL_SCOPES(ReduceFMin):
-      case CASE_GROUP_OP_ALL_SCOPES(ScanFMinInclusive):
-      case CASE_GROUP_OP_ALL_SCOPES(ScanFMinExclusive):
-        Collective.Recurrence = RecurKind::FMin;
-        break;
-      case CASE_GROUP_OP_ALL_SCOPES(ReduceSMax):
-      case CASE_GROUP_OP_ALL_SCOPES(ScanSMaxInclusive):
-      case CASE_GROUP_OP_ALL_SCOPES(ScanSMaxExclusive):
-        Collective.Recurrence = RecurKind::SMax;
-        break;
-      case CASE_GROUP_OP_ALL_SCOPES(ReduceUMax):
-      case CASE_GROUP_OP_ALL_SCOPES(ScanUMaxInclusive):
-      case CASE_GROUP_OP_ALL_SCOPES(ScanUMaxExclusive):
-        Collective.Recurrence = RecurKind::UMax;
-        break;
-      case CASE_GROUP_OP_ALL_SCOPES(ReduceFMax):
-      case CASE_GROUP_OP_ALL_SCOPES(ScanFMaxInclusive):
-      case CASE_GROUP_OP_ALL_SCOPES(ScanFMaxExclusive):
-        Collective.Recurrence = RecurKind::FMax;
-        break;
-      case CASE_GROUP_OP_ALL_SCOPES(ReduceAnd):
-      case CASE_GROUP_OP_ALL_SCOPES(ReduceLogicalAnd):
-      case CASE_GROUP_OP_ALL_SCOPES(ScanAndInclusive):
-      case CASE_GROUP_OP_ALL_SCOPES(ScanAndExclusive):
-      case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalAndInclusive):
-      case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalAndExclusive):
-        Collective.Recurrence = RecurKind::And;
-        break;
-      case CASE_GROUP_OP_ALL_SCOPES(ReduceOr):
-      case CASE_GROUP_OP_ALL_SCOPES(ReduceLogicalOr):
-      case CASE_GROUP_OP_ALL_SCOPES(ScanOrInclusive):
-      case CASE_GROUP_OP_ALL_SCOPES(ScanOrExclusive):
-      case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalOrInclusive):
-      case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalOrExclusive):
-        Collective.Recurrence = RecurKind::Or;
-        break;
-      case CASE_GROUP_OP_ALL_SCOPES(ReduceXor):
-      case CASE_GROUP_OP_ALL_SCOPES(ReduceLogicalXor):
-      case CASE_GROUP_OP_ALL_SCOPES(ScanXorInclusive):
-      case CASE_GROUP_OP_ALL_SCOPES(ScanXorExclusive):
-      case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalXorInclusive):
-      case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalXorExclusive):
-        Collective.Recurrence = RecurKind::Xor;
-        break;
-      default:
-        llvm_unreachable("Unhandled mux group operation");
+    default:
+      llvm_unreachable("Unhandled mux group operation");
     }
   } else if (!Collective.isBroadcast() && !Collective.isShuffleLike()) {
     llvm_unreachable("Unhandled mux group operation");
@@ -1150,100 +1149,100 @@ std::optional<GroupCollective> BuiltinInfo::isMuxGroupCollective(BuiltinID ID) {
 #undef CASE_GROUP_OP_ALL_SCOPES
 }
 
-std::optional<BuiltinID> BuiltinInfo::getMuxGroupCollective(
-    const GroupCollective &Group) {
-#define SIMPLE_SCOPE_SWITCH(OP)                     \
-  do {                                              \
-    switch (Group.Scope) {                          \
-      case GroupCollective::ScopeKind::SubGroup:    \
-        return eMuxBuiltinSubgroup##OP;             \
-      case GroupCollective::ScopeKind::WorkGroup:   \
-        return eMuxBuiltinWorkgroup##OP;            \
-      case GroupCollective::ScopeKind::VectorGroup: \
-        return eMuxBuiltinVecgroup##OP;             \
-    }                                               \
-    llvm_unreachable("Impossible scope kind");      \
+std::optional<BuiltinID>
+BuiltinInfo::getMuxGroupCollective(const GroupCollective &Group) {
+#define SIMPLE_SCOPE_SWITCH(OP)                                                \
+  do {                                                                         \
+    switch (Group.Scope) {                                                     \
+    case GroupCollective::ScopeKind::SubGroup:                                 \
+      return eMuxBuiltinSubgroup##OP;                                          \
+    case GroupCollective::ScopeKind::WorkGroup:                                \
+      return eMuxBuiltinWorkgroup##OP;                                         \
+    case GroupCollective::ScopeKind::VectorGroup:                              \
+      return eMuxBuiltinVecgroup##OP;                                          \
+    }                                                                          \
+    llvm_unreachable("Impossible scope kind");                                 \
   } while (0)
 
-#define COMPLEX_SCOPE_SWITCH(OP, SUFFIX)               \
-  do {                                                 \
-    switch (Group.Recurrence) {                        \
-      default:                                         \
-        llvm_unreachable("Unhandled recursion kind");  \
-      case RecurKind::Add:                             \
-        SIMPLE_SCOPE_SWITCH(OP##Add##SUFFIX);          \
-      case RecurKind::Mul:                             \
-        SIMPLE_SCOPE_SWITCH(OP##Mul##SUFFIX);          \
-      case RecurKind::FAdd:                            \
-        SIMPLE_SCOPE_SWITCH(OP##FAdd##SUFFIX);         \
-      case RecurKind::FMul:                            \
-        SIMPLE_SCOPE_SWITCH(OP##FMul##SUFFIX);         \
-      case RecurKind::SMin:                            \
-        SIMPLE_SCOPE_SWITCH(OP##SMin##SUFFIX);         \
-      case RecurKind::UMin:                            \
-        SIMPLE_SCOPE_SWITCH(OP##UMin##SUFFIX);         \
-      case RecurKind::FMin:                            \
-        SIMPLE_SCOPE_SWITCH(OP##FMin##SUFFIX);         \
-      case RecurKind::SMax:                            \
-        SIMPLE_SCOPE_SWITCH(OP##SMax##SUFFIX);         \
-      case RecurKind::UMax:                            \
-        SIMPLE_SCOPE_SWITCH(OP##UMax##SUFFIX);         \
-      case RecurKind::FMax:                            \
-        SIMPLE_SCOPE_SWITCH(OP##FMax##SUFFIX);         \
-      case RecurKind::And:                             \
-        if (Group.IsLogical) {                         \
-          SIMPLE_SCOPE_SWITCH(OP##LogicalAnd##SUFFIX); \
-        } else {                                       \
-          SIMPLE_SCOPE_SWITCH(OP##And##SUFFIX);        \
-        }                                              \
-      case RecurKind::Or:                              \
-        if (Group.IsLogical) {                         \
-          SIMPLE_SCOPE_SWITCH(OP##LogicalOr##SUFFIX);  \
-        } else {                                       \
-          SIMPLE_SCOPE_SWITCH(OP##Or##SUFFIX);         \
-        }                                              \
-      case RecurKind::Xor:                             \
-        if (Group.IsLogical) {                         \
-          SIMPLE_SCOPE_SWITCH(OP##LogicalXor##SUFFIX); \
-        } else {                                       \
-          SIMPLE_SCOPE_SWITCH(OP##Xor##SUFFIX);        \
-        }                                              \
-    }                                                  \
+#define COMPLEX_SCOPE_SWITCH(OP, SUFFIX)                                       \
+  do {                                                                         \
+    switch (Group.Recurrence) {                                                \
+    default:                                                                   \
+      llvm_unreachable("Unhandled recursion kind");                            \
+    case RecurKind::Add:                                                       \
+      SIMPLE_SCOPE_SWITCH(OP##Add##SUFFIX);                                    \
+    case RecurKind::Mul:                                                       \
+      SIMPLE_SCOPE_SWITCH(OP##Mul##SUFFIX);                                    \
+    case RecurKind::FAdd:                                                      \
+      SIMPLE_SCOPE_SWITCH(OP##FAdd##SUFFIX);                                   \
+    case RecurKind::FMul:                                                      \
+      SIMPLE_SCOPE_SWITCH(OP##FMul##SUFFIX);                                   \
+    case RecurKind::SMin:                                                      \
+      SIMPLE_SCOPE_SWITCH(OP##SMin##SUFFIX);                                   \
+    case RecurKind::UMin:                                                      \
+      SIMPLE_SCOPE_SWITCH(OP##UMin##SUFFIX);                                   \
+    case RecurKind::FMin:                                                      \
+      SIMPLE_SCOPE_SWITCH(OP##FMin##SUFFIX);                                   \
+    case RecurKind::SMax:                                                      \
+      SIMPLE_SCOPE_SWITCH(OP##SMax##SUFFIX);                                   \
+    case RecurKind::UMax:                                                      \
+      SIMPLE_SCOPE_SWITCH(OP##UMax##SUFFIX);                                   \
+    case RecurKind::FMax:                                                      \
+      SIMPLE_SCOPE_SWITCH(OP##FMax##SUFFIX);                                   \
+    case RecurKind::And:                                                       \
+      if (Group.IsLogical) {                                                   \
+        SIMPLE_SCOPE_SWITCH(OP##LogicalAnd##SUFFIX);                           \
+      } else {                                                                 \
+        SIMPLE_SCOPE_SWITCH(OP##And##SUFFIX);                                  \
+      }                                                                        \
+    case RecurKind::Or:                                                        \
+      if (Group.IsLogical) {                                                   \
+        SIMPLE_SCOPE_SWITCH(OP##LogicalOr##SUFFIX);                            \
+      } else {                                                                 \
+        SIMPLE_SCOPE_SWITCH(OP##Or##SUFFIX);                                   \
+      }                                                                        \
+    case RecurKind::Xor:                                                       \
+      if (Group.IsLogical) {                                                   \
+        SIMPLE_SCOPE_SWITCH(OP##LogicalXor##SUFFIX);                           \
+      } else {                                                                 \
+        SIMPLE_SCOPE_SWITCH(OP##Xor##SUFFIX);                                  \
+      }                                                                        \
+    }                                                                          \
   } while (0)
 
   switch (Group.Op) {
-    case GroupCollective::OpKind::All:
-      SIMPLE_SCOPE_SWITCH(All);
-    case GroupCollective::OpKind::Any:
-      SIMPLE_SCOPE_SWITCH(Any);
-    case GroupCollective::OpKind::Broadcast:
-      SIMPLE_SCOPE_SWITCH(Broadcast);
-    case GroupCollective::OpKind::Reduction:
-      COMPLEX_SCOPE_SWITCH(Reduce, );
-    case GroupCollective::OpKind::ScanExclusive:
-      COMPLEX_SCOPE_SWITCH(Scan, Exclusive);
-    case GroupCollective::OpKind::ScanInclusive:
-      COMPLEX_SCOPE_SWITCH(Scan, Inclusive);
+  case GroupCollective::OpKind::All:
+    SIMPLE_SCOPE_SWITCH(All);
+  case GroupCollective::OpKind::Any:
+    SIMPLE_SCOPE_SWITCH(Any);
+  case GroupCollective::OpKind::Broadcast:
+    SIMPLE_SCOPE_SWITCH(Broadcast);
+  case GroupCollective::OpKind::Reduction:
+    COMPLEX_SCOPE_SWITCH(Reduce, );
+  case GroupCollective::OpKind::ScanExclusive:
+    COMPLEX_SCOPE_SWITCH(Scan, Exclusive);
+  case GroupCollective::OpKind::ScanInclusive:
+    COMPLEX_SCOPE_SWITCH(Scan, Inclusive);
+    break;
+  case GroupCollective::OpKind::Shuffle:
+  case GroupCollective::OpKind::ShuffleUp:
+  case GroupCollective::OpKind::ShuffleDown:
+  case GroupCollective::OpKind::ShuffleXor:
+    if (!Group.isSubGroupScope()) {
       break;
+    }
+    switch (Group.Op) {
+    default:
+      llvm_unreachable("Unhandled op");
     case GroupCollective::OpKind::Shuffle:
+      return eMuxBuiltinSubgroupShuffle;
     case GroupCollective::OpKind::ShuffleUp:
+      return eMuxBuiltinSubgroupShuffleUp;
     case GroupCollective::OpKind::ShuffleDown:
+      return eMuxBuiltinSubgroupShuffleDown;
     case GroupCollective::OpKind::ShuffleXor:
-      if (!Group.isSubGroupScope()) {
-        break;
-      }
-      switch (Group.Op) {
-        default:
-          llvm_unreachable("Unhandled op");
-        case GroupCollective::OpKind::Shuffle:
-          return eMuxBuiltinSubgroupShuffle;
-        case GroupCollective::OpKind::ShuffleUp:
-          return eMuxBuiltinSubgroupShuffleUp;
-        case GroupCollective::OpKind::ShuffleDown:
-          return eMuxBuiltinSubgroupShuffleDown;
-        case GroupCollective::OpKind::ShuffleXor:
-          return eMuxBuiltinSubgroupShuffleXor;
-      }
+      return eMuxBuiltinSubgroupShuffleXor;
+    }
   }
   return std::nullopt;
 #undef COMPLEX_SCOPE_SWITCH
@@ -1255,17 +1254,17 @@ bool BuiltinInfo::isOverloadableMuxBuiltinID(BuiltinID ID) {
     return false;
   }
   switch (ID) {
-    default:
-      return isMuxGroupCollective(ID).has_value();
-    case eMuxBuiltinDMARead1D:
-    case eMuxBuiltinDMAWrite1D:
-    case eMuxBuiltinDMARead2D:
-    case eMuxBuiltinDMAWrite2D:
-    case eMuxBuiltinDMARead3D:
-    case eMuxBuiltinDMAWrite3D:
-      return true;
+  default:
+    return isMuxGroupCollective(ID).has_value();
+  case eMuxBuiltinDMARead1D:
+  case eMuxBuiltinDMAWrite1D:
+  case eMuxBuiltinDMARead2D:
+  case eMuxBuiltinDMAWrite2D:
+  case eMuxBuiltinDMARead3D:
+  case eMuxBuiltinDMAWrite3D:
+    return true;
   }
 }
 
-}  // namespace utils
-}  // namespace compiler
+} // namespace utils
+} // namespace compiler
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp
index a60dc0deb09ec..20b934795d20c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp
@@ -44,8 +44,8 @@ namespace stdcompat {
 using ::ilogb;
 #else
 using std::ilogb;
-#endif  // __ANDROID__
-}  // namespace stdcompat
+#endif // __ANDROID__
+} // namespace stdcompat
 
 namespace {
 /// @brief Identifiers for recognized OpenCL builtins.
@@ -415,7 +415,7 @@ enum CLBuiltinID : compiler::utils::BuiltinID {
   // 6.3 Conversions & Type Casting Examples
   eCLBuiltinAs,
 };
-}  // namespace
+} // namespace
 
 namespace {
 using namespace llvm;
@@ -429,7 +429,7 @@ bool isValidVecWidth(unsigned w) {
 
 /// @brief Copy global variables to a module on demand.
 class GlobalValueMaterializer final : public llvm::ValueMaterializer {
- public:
+public:
   /// @brief Create a new global variable materializer.
   /// @param[in] M Module to materialize the variables in.
   GlobalValueMaterializer(Module &M) : DestM(M) {}
@@ -460,13 +460,13 @@ class GlobalValueMaterializer final : public llvm::ValueMaterializer {
     return NewGV;
   }
 
- private:
+private:
   /// @brief Modules to materialize variables in.
   Module &DestM;
   /// @brief Materialized variables.
   std::vector<GlobalVariable *> Variables;
 };
-}  // namespace
+} // namespace
 
 namespace compiler {
 namespace utils {
@@ -841,8 +841,8 @@ Function *CLBuiltinInfo::materializeBuiltin(StringRef BuiltinName,
   return Loader->materializeBuiltin(BuiltinName, DestM, Flags);
 }
 
-std::optional<BuiltinID> CLBuiltinInfo::identifyBuiltin(
-    const Function &F) const {
+std::optional<BuiltinID>
+CLBuiltinInfo::identifyBuiltin(const Function &F) const {
   NameMangler Mangler(nullptr);
   const StringRef Name = F.getName();
   const CLBuiltinEntry *entry = Builtins;
@@ -937,8 +937,8 @@ BuiltinUniformity CLBuiltinInfo::isBuiltinUniform(const Builtin &,
   return eBuiltinUniformityLikeInputs;
 }
 
-std::optional<Builtin> CLBuiltinInfo::analyzeBuiltin(
-    const Function &Callee) const {
+std::optional<Builtin>
+CLBuiltinInfo::analyzeBuiltin(const Function &Callee) const {
   const auto ID = identifyBuiltin(Callee);
   if (!ID) {
     return std::nullopt;
@@ -947,277 +947,277 @@ std::optional<Builtin> CLBuiltinInfo::analyzeBuiltin(
   bool IsConvergent = false;
   unsigned Properties = eBuiltinPropertyNone;
   switch (*ID) {
-    default:
-      // Assume convergence on unknown builtins.
-      IsConvergent = true;
-      break;
-    case eBuiltinUnknown: {
-      // Assume convergence on unknown builtins.
-      IsConvergent = true;
-      // If we know that this is an OpenCL builtin, but we don't have any
-      // special information about it, we can determine if it has side effects
-      // or not by its return type and its paramaters. This depends on being
-      // able to identify all the "special" builtins, such as barriers and
-      // fences.
-      bool HasSideEffects = false;
-
-      // Void functions have side effects
-      if (Callee.getReturnType() == Type::getVoidTy(Callee.getContext())) {
+  default:
+    // Assume convergence on unknown builtins.
+    IsConvergent = true;
+    break;
+  case eBuiltinUnknown: {
+    // Assume convergence on unknown builtins.
+    IsConvergent = true;
+    // If we know that this is an OpenCL builtin, but we don't have any
+    // special information about it, we can determine if it has side effects
+    // or not by its return type and its paramaters. This depends on being
+    // able to identify all the "special" builtins, such as barriers and
+    // fences.
+    bool HasSideEffects = false;
+
+    // Void functions have side effects
+    if (Callee.getReturnType() == Type::getVoidTy(Callee.getContext())) {
+      HasSideEffects = true;
+    }
+    // Functions that take pointers probably have side effects
+    for (const auto &arg : Callee.args()) {
+      if (arg.getType()->isPointerTy()) {
         HasSideEffects = true;
       }
-      // Functions that take pointers probably have side effects
-      for (const auto &arg : Callee.args()) {
-        if (arg.getType()->isPointerTy()) {
-          HasSideEffects = true;
-        }
-      }
-      Properties |= HasSideEffects ? eBuiltinPropertySideEffects
-                                   : eBuiltinPropertyNoSideEffects;
-    } break;
-    case eCLBuiltinBarrier:
-      IsConvergent = true;
-      Properties |= eBuiltinPropertyExecutionFlow;
-      Properties |= eBuiltinPropertySideEffects;
-      Properties |= eBuiltinPropertyLowerToMuxBuiltin;
-      break;
-    case eCLBuiltinMemFence:
-    case eCLBuiltinReadMemFence:
-    case eCLBuiltinWriteMemFence:
-      Properties |= eBuiltinPropertySupportsInstantiation;
-      Properties |= eBuiltinPropertyLowerToMuxBuiltin;
-      break;
-    case eCLBuiltinPrintf:
-      Properties |= eBuiltinPropertySideEffects;
-      Properties |= eBuiltinPropertySupportsInstantiation;
-      break;
-    case eCLBuiltinAsyncWorkGroupCopy:
-    case eCLBuiltinAsyncWorkGroupStridedCopy:
-    case eCLBuiltinWaitGroupEvents:
-    case eCLBuiltinAsyncWorkGroupCopy2D2D:
-    case eCLBuiltinAsyncWorkGroupCopy3D3D:
-      // Our implementation of these builtins uses thread checks against
-      // specific work-item IDs, so they are convergent.
-      IsConvergent = true;
-      Properties |= eBuiltinPropertyNoSideEffects;
-      Properties |= eBuiltinPropertyLowerToMuxBuiltin;
-      break;
-    case eCLBuiltinAtomicAdd:
-    case eCLBuiltinAtomicSub:
-    case eCLBuiltinAtomicXchg:
-    case eCLBuiltinAtomicInc:
-    case eCLBuiltinAtomicDec:
-    case eCLBuiltinAtomicCmpxchg:
-    case eCLBuiltinAtomicMin:
-    case eCLBuiltinAtomicMax:
-    case eCLBuiltinAtomicAnd:
-    case eCLBuiltinAtomicOr:
-    case eCLBuiltinAtomicXor:
-      Properties |= eBuiltinPropertySideEffects;
-      Properties |= eBuiltinPropertySupportsInstantiation;
-      Properties |= eBuiltinPropertyAtomic;
-      break;
-    case eCLBuiltinGetWorkDim:
-    case eCLBuiltinGetGroupId:
-    case eCLBuiltinGetGlobalSize:
-    case eCLBuiltinGetGlobalOffset:
-    case eCLBuiltinGetNumGroups:
-    case eCLBuiltinGetGlobalId:
-    case eCLBuiltinGetLocalSize:
-    case eCLBuiltinGetEnqueuedLocalSize:
-    case eCLBuiltinGetLocalLinearId:
-    case eCLBuiltinGetGlobalLinearId:
-    case eCLBuiltinGetSubgroupLocalId:
-      Properties |= eBuiltinPropertyWorkItem;
-      Properties |= eBuiltinPropertyRematerializable;
-      Properties |= eBuiltinPropertyLowerToMuxBuiltin;
-      break;
-    case eCLBuiltinGetLocalId:
-      Properties |= eBuiltinPropertyWorkItem;
-      Properties |= eBuiltinPropertyLocalID;
-      Properties |= eBuiltinPropertyRematerializable;
-      Properties |= eBuiltinPropertyLowerToMuxBuiltin;
-      break;
-    case eCLBuiltinDot:
-    case eCLBuiltinCross:
-    case eCLBuiltinFastDistance:
-    case eCLBuiltinFastLength:
-    case eCLBuiltinFastNormalize:
-      Properties |= eBuiltinPropertyReduction;
-      Properties |= eBuiltinPropertyNoVectorEquivalent;
-      Properties |= eBuiltinPropertyCanEmitInline;
-      break;
-    case eCLBuiltinDistance:
-    case eCLBuiltinLength:
-    case eCLBuiltinNormalize:
-      Properties |= eBuiltinPropertyReduction;
-      Properties |= eBuiltinPropertyNoVectorEquivalent;
-      // XXX The inline implementation seems to have precision issues. The dot
-      // product can overflow to +inf which results in the wrong result.
-      // See redmine #6427 and #9115
-      // Properties |= eBuiltinPropertyCanEmitInline;
-      break;
-    case eCLBuiltinIsEqual:
-    case eCLBuiltinIsNotEqual:
-    case eCLBuiltinIsGreater:
-    case eCLBuiltinIsGreaterEqual:
-    case eCLBuiltinIsLess:
-    case eCLBuiltinIsLessEqual:
-    case eCLBuiltinIsLessGreater:
-    case eCLBuiltinIsOrdered:
-    case eCLBuiltinIsUnordered:
-    case eCLBuiltinIsFinite:
-    case eCLBuiltinIsInf:
-    case eCLBuiltinIsNan:
-    case eCLBuiltinIsNormal:
-    case eCLBuiltinSignBit:
-      // Scalar variants return '0' or '1', vector variants '0' or '111...1'.
-      Properties |= eBuiltinPropertyNoVectorEquivalent;
-      Properties |= eBuiltinPropertyCanEmitInline;
-      Properties |= eBuiltinPropertySupportsInstantiation;
-      break;
-    case eCLBuiltinAny:
-    case eCLBuiltinAll:
-      Properties |= eBuiltinPropertyNoVectorEquivalent;
-      Properties |= eBuiltinPropertyCanEmitInline;
-      break;
-    case eCLBuiltinFract:
-    case eCLBuiltinModF:
-    case eCLBuiltinSinCos:
-      Properties |= eBuiltinPropertyPointerReturnEqualRetTy;
-      break;
-    case eCLBuiltinFrexp:
-    case eCLBuiltinLGammaR:
-    case eCLBuiltinRemquo:
-      Properties |= eBuiltinPropertyPointerReturnEqualIntRetTy;
-      break;
-    case eCLBuiltinShuffle:
-    case eCLBuiltinShuffle2:
-      // While there are vector equivalents for these builtins, they require a
-      // modified mask, so we cannot use them by simply packetizing their
-      // arguments.
-      Properties |= eBuiltinPropertyNoVectorEquivalent;
-      Properties |= eBuiltinPropertyCanEmitInline;
-      break;
-    case eCLBuiltinFMax:
-    case eCLBuiltinFMin:
-    case eCLBuiltinAddSat:
-    case eCLBuiltinSubSat:
-      Properties |= eBuiltinPropertyCanEmitInline;
-      break;
-    case eCLBuiltinConvertChar:
-    case eCLBuiltinConvertShort:
-    case eCLBuiltinConvertInt:
-    case eCLBuiltinConvertLong:
-    case eCLBuiltinConvertUChar:
-    case eCLBuiltinConvertUShort:
-    case eCLBuiltinConvertUInt:
-    case eCLBuiltinConvertULong:
-      Properties |= eBuiltinPropertyCanEmitInline;
-      break;
-    case eCLBuiltinVLoad:
-    case eCLBuiltinVLoadHalf:
-      Properties |= eBuiltinPropertyNoSideEffects;
-      Properties |= eBuiltinPropertyNoVectorEquivalent;
-      Properties |= eBuiltinPropertyCanEmitInline;
-      break;
-    case eCLBuiltinVStore:
-    case eCLBuiltinVStoreHalf:
-      Properties |= eBuiltinPropertySideEffects;
-      Properties |= eBuiltinPropertyNoVectorEquivalent;
-      Properties |= eBuiltinPropertyCanEmitInline;
-      break;
-    case eCLBuiltinSelect:
-    case eCLBuiltinAs:
-      // Some of these builtins do have vector equivalents, but since we can
-      // emit all variants inline, we mark them as having none for simplicity.
-      Properties |= eBuiltinPropertyNoVectorEquivalent;
-      Properties |= eBuiltinPropertyCanEmitInline;
-      break;
-    case eCLBuiltinWorkGroupBarrier:
-    case eCLBuiltinSubGroupBarrier:
-      IsConvergent = true;
-      LLVM_FALLTHROUGH;
-    case eCLBuiltinAtomicWorkItemFence:
-      Properties |= eBuiltinPropertyLowerToMuxBuiltin;
-      break;
-    case eCLBuiltinGetSubgroupSize:
-    case eCLBuiltinGetMaxSubgroupSize:
-    case eCLBuiltinGetNumSubgroups:
-    case eCLBuiltinGetEnqueuedNumSubgroups:
-    case eCLBuiltinGetSubgroupId:
-      Properties |= eBuiltinPropertyLowerToMuxBuiltin;
-      break;
-      // Subgroup collectives
-    case eCLBuiltinSubgroupAll:
-    case eCLBuiltinSubgroupAny:
-    case eCLBuiltinSubgroupBroadcast:
-    case eCLBuiltinSubgroupReduceAdd:
-    case eCLBuiltinSubgroupReduceMin:
-    case eCLBuiltinSubgroupReduceMax:
-    case eCLBuiltinSubgroupScanAddInclusive:
-    case eCLBuiltinSubgroupScanAddExclusive:
-    case eCLBuiltinSubgroupScanMinInclusive:
-    case eCLBuiltinSubgroupScanMinExclusive:
-    case eCLBuiltinSubgroupScanMaxInclusive:
-    case eCLBuiltinSubgroupScanMaxExclusive:
-    case eCLBuiltinSubgroupReduceMul:
-    case eCLBuiltinSubgroupReduceAnd:
-    case eCLBuiltinSubgroupReduceOr:
-    case eCLBuiltinSubgroupReduceXor:
-    case eCLBuiltinSubgroupReduceLogicalAnd:
-    case eCLBuiltinSubgroupReduceLogicalOr:
-    case eCLBuiltinSubgroupReduceLogicalXor:
-    case eCLBuiltinSubgroupScanMulInclusive:
-    case eCLBuiltinSubgroupScanMulExclusive:
-    case eCLBuiltinSubgroupScanAndInclusive:
-    case eCLBuiltinSubgroupScanAndExclusive:
-    case eCLBuiltinSubgroupScanOrInclusive:
-    case eCLBuiltinSubgroupScanOrExclusive:
-    case eCLBuiltinSubgroupScanXorInclusive:
-    case eCLBuiltinSubgroupScanXorExclusive:
-    case eCLBuiltinSubgroupScanLogicalAndInclusive:
-    case eCLBuiltinSubgroupScanLogicalAndExclusive:
-    case eCLBuiltinSubgroupScanLogicalOrInclusive:
-    case eCLBuiltinSubgroupScanLogicalOrExclusive:
-    case eCLBuiltinSubgroupScanLogicalXorInclusive:
-    case eCLBuiltinSubgroupScanLogicalXorExclusive:
-      // Work-group collectives
-    case eCLBuiltinWorkgroupAll:
-    case eCLBuiltinWorkgroupAny:
-    case eCLBuiltinWorkgroupBroadcast:
-    case eCLBuiltinWorkgroupReduceAdd:
-    case eCLBuiltinWorkgroupReduceMin:
-    case eCLBuiltinWorkgroupReduceMax:
-    case eCLBuiltinWorkgroupScanAddInclusive:
-    case eCLBuiltinWorkgroupScanAddExclusive:
-    case eCLBuiltinWorkgroupScanMinInclusive:
-    case eCLBuiltinWorkgroupScanMinExclusive:
-    case eCLBuiltinWorkgroupScanMaxInclusive:
-    case eCLBuiltinWorkgroupScanMaxExclusive:
-    case eCLBuiltinWorkgroupReduceMul:
-    case eCLBuiltinWorkgroupReduceAnd:
-    case eCLBuiltinWorkgroupReduceOr:
-    case eCLBuiltinWorkgroupReduceXor:
-    case eCLBuiltinWorkgroupReduceLogicalAnd:
-    case eCLBuiltinWorkgroupReduceLogicalOr:
-    case eCLBuiltinWorkgroupReduceLogicalXor:
-    case eCLBuiltinWorkgroupScanMulInclusive:
-    case eCLBuiltinWorkgroupScanMulExclusive:
-    case eCLBuiltinWorkgroupScanAndInclusive:
-    case eCLBuiltinWorkgroupScanAndExclusive:
-    case eCLBuiltinWorkgroupScanOrInclusive:
-    case eCLBuiltinWorkgroupScanOrExclusive:
-    case eCLBuiltinWorkgroupScanXorInclusive:
-    case eCLBuiltinWorkgroupScanXorExclusive:
-    case eCLBuiltinWorkgroupScanLogicalAndInclusive:
-    case eCLBuiltinWorkgroupScanLogicalAndExclusive:
-    case eCLBuiltinWorkgroupScanLogicalOrInclusive:
-    case eCLBuiltinWorkgroupScanLogicalOrExclusive:
-    case eCLBuiltinWorkgroupScanLogicalXorInclusive:
-    case eCLBuiltinWorkgroupScanLogicalXorExclusive:
-      IsConvergent = true;
-      Properties |= eBuiltinPropertyLowerToMuxBuiltin;
-      break;
+    }
+    Properties |= HasSideEffects ? eBuiltinPropertySideEffects
+                                 : eBuiltinPropertyNoSideEffects;
+  } break;
+  case eCLBuiltinBarrier:
+    IsConvergent = true;
+    Properties |= eBuiltinPropertyExecutionFlow;
+    Properties |= eBuiltinPropertySideEffects;
+    Properties |= eBuiltinPropertyLowerToMuxBuiltin;
+    break;
+  case eCLBuiltinMemFence:
+  case eCLBuiltinReadMemFence:
+  case eCLBuiltinWriteMemFence:
+    Properties |= eBuiltinPropertySupportsInstantiation;
+    Properties |= eBuiltinPropertyLowerToMuxBuiltin;
+    break;
+  case eCLBuiltinPrintf:
+    Properties |= eBuiltinPropertySideEffects;
+    Properties |= eBuiltinPropertySupportsInstantiation;
+    break;
+  case eCLBuiltinAsyncWorkGroupCopy:
+  case eCLBuiltinAsyncWorkGroupStridedCopy:
+  case eCLBuiltinWaitGroupEvents:
+  case eCLBuiltinAsyncWorkGroupCopy2D2D:
+  case eCLBuiltinAsyncWorkGroupCopy3D3D:
+    // Our implementation of these builtins uses thread checks against
+    // specific work-item IDs, so they are convergent.
+    IsConvergent = true;
+    Properties |= eBuiltinPropertyNoSideEffects;
+    Properties |= eBuiltinPropertyLowerToMuxBuiltin;
+    break;
+  case eCLBuiltinAtomicAdd:
+  case eCLBuiltinAtomicSub:
+  case eCLBuiltinAtomicXchg:
+  case eCLBuiltinAtomicInc:
+  case eCLBuiltinAtomicDec:
+  case eCLBuiltinAtomicCmpxchg:
+  case eCLBuiltinAtomicMin:
+  case eCLBuiltinAtomicMax:
+  case eCLBuiltinAtomicAnd:
+  case eCLBuiltinAtomicOr:
+  case eCLBuiltinAtomicXor:
+    Properties |= eBuiltinPropertySideEffects;
+    Properties |= eBuiltinPropertySupportsInstantiation;
+    Properties |= eBuiltinPropertyAtomic;
+    break;
+  case eCLBuiltinGetWorkDim:
+  case eCLBuiltinGetGroupId:
+  case eCLBuiltinGetGlobalSize:
+  case eCLBuiltinGetGlobalOffset:
+  case eCLBuiltinGetNumGroups:
+  case eCLBuiltinGetGlobalId:
+  case eCLBuiltinGetLocalSize:
+  case eCLBuiltinGetEnqueuedLocalSize:
+  case eCLBuiltinGetLocalLinearId:
+  case eCLBuiltinGetGlobalLinearId:
+  case eCLBuiltinGetSubgroupLocalId:
+    Properties |= eBuiltinPropertyWorkItem;
+    Properties |= eBuiltinPropertyRematerializable;
+    Properties |= eBuiltinPropertyLowerToMuxBuiltin;
+    break;
+  case eCLBuiltinGetLocalId:
+    Properties |= eBuiltinPropertyWorkItem;
+    Properties |= eBuiltinPropertyLocalID;
+    Properties |= eBuiltinPropertyRematerializable;
+    Properties |= eBuiltinPropertyLowerToMuxBuiltin;
+    break;
+  case eCLBuiltinDot:
+  case eCLBuiltinCross:
+  case eCLBuiltinFastDistance:
+  case eCLBuiltinFastLength:
+  case eCLBuiltinFastNormalize:
+    Properties |= eBuiltinPropertyReduction;
+    Properties |= eBuiltinPropertyNoVectorEquivalent;
+    Properties |= eBuiltinPropertyCanEmitInline;
+    break;
+  case eCLBuiltinDistance:
+  case eCLBuiltinLength:
+  case eCLBuiltinNormalize:
+    Properties |= eBuiltinPropertyReduction;
+    Properties |= eBuiltinPropertyNoVectorEquivalent;
+    // XXX The inline implementation seems to have precision issues. The dot
+    // product can overflow to +inf which results in the wrong result.
+    // See redmine #6427 and #9115
+    // Properties |= eBuiltinPropertyCanEmitInline;
+    break;
+  case eCLBuiltinIsEqual:
+  case eCLBuiltinIsNotEqual:
+  case eCLBuiltinIsGreater:
+  case eCLBuiltinIsGreaterEqual:
+  case eCLBuiltinIsLess:
+  case eCLBuiltinIsLessEqual:
+  case eCLBuiltinIsLessGreater:
+  case eCLBuiltinIsOrdered:
+  case eCLBuiltinIsUnordered:
+  case eCLBuiltinIsFinite:
+  case eCLBuiltinIsInf:
+  case eCLBuiltinIsNan:
+  case eCLBuiltinIsNormal:
+  case eCLBuiltinSignBit:
+    // Scalar variants return '0' or '1', vector variants '0' or '111...1'.
+    Properties |= eBuiltinPropertyNoVectorEquivalent;
+    Properties |= eBuiltinPropertyCanEmitInline;
+    Properties |= eBuiltinPropertySupportsInstantiation;
+    break;
+  case eCLBuiltinAny:
+  case eCLBuiltinAll:
+    Properties |= eBuiltinPropertyNoVectorEquivalent;
+    Properties |= eBuiltinPropertyCanEmitInline;
+    break;
+  case eCLBuiltinFract:
+  case eCLBuiltinModF:
+  case eCLBuiltinSinCos:
+    Properties |= eBuiltinPropertyPointerReturnEqualRetTy;
+    break;
+  case eCLBuiltinFrexp:
+  case eCLBuiltinLGammaR:
+  case eCLBuiltinRemquo:
+    Properties |= eBuiltinPropertyPointerReturnEqualIntRetTy;
+    break;
+  case eCLBuiltinShuffle:
+  case eCLBuiltinShuffle2:
+    // While there are vector equivalents for these builtins, they require a
+    // modified mask, so we cannot use them by simply packetizing their
+    // arguments.
+    Properties |= eBuiltinPropertyNoVectorEquivalent;
+    Properties |= eBuiltinPropertyCanEmitInline;
+    break;
+  case eCLBuiltinFMax:
+  case eCLBuiltinFMin:
+  case eCLBuiltinAddSat:
+  case eCLBuiltinSubSat:
+    Properties |= eBuiltinPropertyCanEmitInline;
+    break;
+  case eCLBuiltinConvertChar:
+  case eCLBuiltinConvertShort:
+  case eCLBuiltinConvertInt:
+  case eCLBuiltinConvertLong:
+  case eCLBuiltinConvertUChar:
+  case eCLBuiltinConvertUShort:
+  case eCLBuiltinConvertUInt:
+  case eCLBuiltinConvertULong:
+    Properties |= eBuiltinPropertyCanEmitInline;
+    break;
+  case eCLBuiltinVLoad:
+  case eCLBuiltinVLoadHalf:
+    Properties |= eBuiltinPropertyNoSideEffects;
+    Properties |= eBuiltinPropertyNoVectorEquivalent;
+    Properties |= eBuiltinPropertyCanEmitInline;
+    break;
+  case eCLBuiltinVStore:
+  case eCLBuiltinVStoreHalf:
+    Properties |= eBuiltinPropertySideEffects;
+    Properties |= eBuiltinPropertyNoVectorEquivalent;
+    Properties |= eBuiltinPropertyCanEmitInline;
+    break;
+  case eCLBuiltinSelect:
+  case eCLBuiltinAs:
+    // Some of these builtins do have vector equivalents, but since we can
+    // emit all variants inline, we mark them as having none for simplicity.
+    Properties |= eBuiltinPropertyNoVectorEquivalent;
+    Properties |= eBuiltinPropertyCanEmitInline;
+    break;
+  case eCLBuiltinWorkGroupBarrier:
+  case eCLBuiltinSubGroupBarrier:
+    IsConvergent = true;
+    LLVM_FALLTHROUGH;
+  case eCLBuiltinAtomicWorkItemFence:
+    Properties |= eBuiltinPropertyLowerToMuxBuiltin;
+    break;
+  case eCLBuiltinGetSubgroupSize:
+  case eCLBuiltinGetMaxSubgroupSize:
+  case eCLBuiltinGetNumSubgroups:
+  case eCLBuiltinGetEnqueuedNumSubgroups:
+  case eCLBuiltinGetSubgroupId:
+    Properties |= eBuiltinPropertyLowerToMuxBuiltin;
+    break;
+    // Subgroup collectives
+  case eCLBuiltinSubgroupAll:
+  case eCLBuiltinSubgroupAny:
+  case eCLBuiltinSubgroupBroadcast:
+  case eCLBuiltinSubgroupReduceAdd:
+  case eCLBuiltinSubgroupReduceMin:
+  case eCLBuiltinSubgroupReduceMax:
+  case eCLBuiltinSubgroupScanAddInclusive:
+  case eCLBuiltinSubgroupScanAddExclusive:
+  case eCLBuiltinSubgroupScanMinInclusive:
+  case eCLBuiltinSubgroupScanMinExclusive:
+  case eCLBuiltinSubgroupScanMaxInclusive:
+  case eCLBuiltinSubgroupScanMaxExclusive:
+  case eCLBuiltinSubgroupReduceMul:
+  case eCLBuiltinSubgroupReduceAnd:
+  case eCLBuiltinSubgroupReduceOr:
+  case eCLBuiltinSubgroupReduceXor:
+  case eCLBuiltinSubgroupReduceLogicalAnd:
+  case eCLBuiltinSubgroupReduceLogicalOr:
+  case eCLBuiltinSubgroupReduceLogicalXor:
+  case eCLBuiltinSubgroupScanMulInclusive:
+  case eCLBuiltinSubgroupScanMulExclusive:
+  case eCLBuiltinSubgroupScanAndInclusive:
+  case eCLBuiltinSubgroupScanAndExclusive:
+  case eCLBuiltinSubgroupScanOrInclusive:
+  case eCLBuiltinSubgroupScanOrExclusive:
+  case eCLBuiltinSubgroupScanXorInclusive:
+  case eCLBuiltinSubgroupScanXorExclusive:
+  case eCLBuiltinSubgroupScanLogicalAndInclusive:
+  case eCLBuiltinSubgroupScanLogicalAndExclusive:
+  case eCLBuiltinSubgroupScanLogicalOrInclusive:
+  case eCLBuiltinSubgroupScanLogicalOrExclusive:
+  case eCLBuiltinSubgroupScanLogicalXorInclusive:
+  case eCLBuiltinSubgroupScanLogicalXorExclusive:
+    // Work-group collectives
+  case eCLBuiltinWorkgroupAll:
+  case eCLBuiltinWorkgroupAny:
+  case eCLBuiltinWorkgroupBroadcast:
+  case eCLBuiltinWorkgroupReduceAdd:
+  case eCLBuiltinWorkgroupReduceMin:
+  case eCLBuiltinWorkgroupReduceMax:
+  case eCLBuiltinWorkgroupScanAddInclusive:
+  case eCLBuiltinWorkgroupScanAddExclusive:
+  case eCLBuiltinWorkgroupScanMinInclusive:
+  case eCLBuiltinWorkgroupScanMinExclusive:
+  case eCLBuiltinWorkgroupScanMaxInclusive:
+  case eCLBuiltinWorkgroupScanMaxExclusive:
+  case eCLBuiltinWorkgroupReduceMul:
+  case eCLBuiltinWorkgroupReduceAnd:
+  case eCLBuiltinWorkgroupReduceOr:
+  case eCLBuiltinWorkgroupReduceXor:
+  case eCLBuiltinWorkgroupReduceLogicalAnd:
+  case eCLBuiltinWorkgroupReduceLogicalOr:
+  case eCLBuiltinWorkgroupReduceLogicalXor:
+  case eCLBuiltinWorkgroupScanMulInclusive:
+  case eCLBuiltinWorkgroupScanMulExclusive:
+  case eCLBuiltinWorkgroupScanAndInclusive:
+  case eCLBuiltinWorkgroupScanAndExclusive:
+  case eCLBuiltinWorkgroupScanOrInclusive:
+  case eCLBuiltinWorkgroupScanOrExclusive:
+  case eCLBuiltinWorkgroupScanXorInclusive:
+  case eCLBuiltinWorkgroupScanXorExclusive:
+  case eCLBuiltinWorkgroupScanLogicalAndInclusive:
+  case eCLBuiltinWorkgroupScanLogicalAndExclusive:
+  case eCLBuiltinWorkgroupScanLogicalOrInclusive:
+  case eCLBuiltinWorkgroupScanLogicalOrExclusive:
+  case eCLBuiltinWorkgroupScanLogicalXorInclusive:
+  case eCLBuiltinWorkgroupScanLogicalXorExclusive:
+    IsConvergent = true;
+    Properties |= eBuiltinPropertyLowerToMuxBuiltin;
+    break;
   }
 
   if (!IsConvergent) {
@@ -1270,8 +1270,8 @@ Function *CLBuiltinInfo::getVectorEquivalent(const Builtin &B, unsigned Width,
         Type *NewType = OldPtrTy;
         TypeQualifiers NewQuals;
         TypeQualifiers EleQuals = OldQuals;
-        NewQuals.push_back(EleQuals.pop_front());  // Pointer qualifier
-        NewQuals.push_back(eTypeQualNone);         // Vector qualifier
+        NewQuals.push_back(EleQuals.pop_front()); // Pointer qualifier
+        NewQuals.push_back(eTypeQualNone);        // Vector qualifier
         NewQuals.push_back(EleQuals);
 
         VectorTypes.push_back(NewType);
@@ -1286,8 +1286,8 @@ Function *CLBuiltinInfo::getVectorEquivalent(const Builtin &B, unsigned Width,
     }
     TypeQualifiers NewQuals;
     Type *NewType = FixedVectorType::get(OldTy, Width);
-    NewQuals.push_back(eTypeQualNone);  // Vector qualifier
-    NewQuals.push_back(OldQuals);       // Element qualifier
+    NewQuals.push_back(eTypeQualNone); // Vector qualifier
+    NewQuals.push_back(OldQuals);      // Element qualifier
 
     VectorTypes.push_back(NewType);
     VectorQuals.push_back(NewQuals);
@@ -1453,9 +1453,8 @@ Function *CLBuiltinInfo::getScalarEquivalent(const Builtin &B, Module *M) {
 /// (assumed builtin) Function is known to possess the given qualifier.
 /// @return true if the parameter is known to have the qualifier, false if not,
 /// and None on error.
-static std::optional<bool> paramHasTypeQual(const Function &F,
-                                            unsigned ParamIdx,
-                                            TypeQualifier Q) {
+static std::optional<bool>
+paramHasTypeQual(const Function &F, unsigned ParamIdx, TypeQualifier Q) {
   // Demangle the function name to get the type qualifiers.
   SmallVector<Type *, 2> Types;
   SmallVector<TypeQualifiers, 2> Quals;
@@ -1490,77 +1489,77 @@ Value *CLBuiltinInfo::emitBuiltinInline(Function *F, IRBuilder<> &B,
     // the source operand is signed or not. It is not possible to do this based
     // solely on the BuiltinID.
     switch (*BuiltinID) {
-        // 6.2 Explicit Conversions
-      case eCLBuiltinConvertChar:
-      case eCLBuiltinConvertShort:
-      case eCLBuiltinConvertInt:
-      case eCLBuiltinConvertLong:
-      case eCLBuiltinConvertUChar:
-      case eCLBuiltinConvertUShort:
-      case eCLBuiltinConvertUInt:
-      case eCLBuiltinConvertULong:
-        return emitBuiltinInlineConvert(F, *BuiltinID, B, Args);
-        // 6.12.3 Integer Functions
-      case eCLBuiltinAddSat:
-      case eCLBuiltinSubSat: {
-        std::optional<bool> IsParamSignedOrNone =
-            paramHasTypeQual(*F, 0, eTypeQualSignedInt);
-        if (!IsParamSignedOrNone.has_value()) {
-          return nullptr;
-        }
-        const bool IsSigned = *IsParamSignedOrNone;
-        const Intrinsic::ID IntrinsicOpc = [=] {
-          if (BuiltinID == eCLBuiltinSubSat) {
-            return IsSigned ? Intrinsic::ssub_sat : Intrinsic::usub_sat;
-          } else {
-            return IsSigned ? Intrinsic::sadd_sat : Intrinsic::uadd_sat;
-          }
-        }();
-        return emitBuiltinInlineAsLLVMBinaryIntrinsic(B, Args[0], Args[1],
-                                                      IntrinsicOpc);
+      // 6.2 Explicit Conversions
+    case eCLBuiltinConvertChar:
+    case eCLBuiltinConvertShort:
+    case eCLBuiltinConvertInt:
+    case eCLBuiltinConvertLong:
+    case eCLBuiltinConvertUChar:
+    case eCLBuiltinConvertUShort:
+    case eCLBuiltinConvertUInt:
+    case eCLBuiltinConvertULong:
+      return emitBuiltinInlineConvert(F, *BuiltinID, B, Args);
+      // 6.12.3 Integer Functions
+    case eCLBuiltinAddSat:
+    case eCLBuiltinSubSat: {
+      std::optional<bool> IsParamSignedOrNone =
+          paramHasTypeQual(*F, 0, eTypeQualSignedInt);
+      if (!IsParamSignedOrNone.has_value()) {
+        return nullptr;
       }
-      case eCLBuiltinVLoad: {
-        NameMangler Mangler(&F->getContext());
-        Lexer L(Mangler.demangleName(F->getName()));
-        if (L.Consume("vload")) {
-          unsigned Width = 0;
-          if (L.ConsumeInteger(Width)) {
-            return emitBuiltinInlineVLoad(F, Width, B, Args);
-          }
-        }
-      } break;
-      case eCLBuiltinVLoadHalf: {
-        NameMangler Mangler(&F->getContext());
-        const auto name = Mangler.demangleName(F->getName());
-        if (name == "vload_half") {
-          // TODO handle "vload_halfn"
-          return emitBuiltinInlineVLoadHalf(F, B, Args);
+      const bool IsSigned = *IsParamSignedOrNone;
+      const Intrinsic::ID IntrinsicOpc = [=] {
+        if (BuiltinID == eCLBuiltinSubSat) {
+          return IsSigned ? Intrinsic::ssub_sat : Intrinsic::usub_sat;
+        } else {
+          return IsSigned ? Intrinsic::sadd_sat : Intrinsic::uadd_sat;
         }
-      } break;
-      case eCLBuiltinVStore: {
-        NameMangler Mangler(&F->getContext());
-        Lexer L(Mangler.demangleName(F->getName()));
-        if (L.Consume("vstore")) {
-          unsigned Width = 0;
-          if (L.ConsumeInteger(Width)) {
-            return emitBuiltinInlineVStore(F, Width, B, Args);
-          }
+      }();
+      return emitBuiltinInlineAsLLVMBinaryIntrinsic(B, Args[0], Args[1],
+                                                    IntrinsicOpc);
+    }
+    case eCLBuiltinVLoad: {
+      NameMangler Mangler(&F->getContext());
+      Lexer L(Mangler.demangleName(F->getName()));
+      if (L.Consume("vload")) {
+        unsigned Width = 0;
+        if (L.ConsumeInteger(Width)) {
+          return emitBuiltinInlineVLoad(F, Width, B, Args);
         }
-      } break;
-      case eCLBuiltinVStoreHalf: {
-        NameMangler Mangler(&F->getContext());
-        Lexer L(Mangler.demangleName(F->getName()));
-        if (L.Consume("vstore_half")) {
-          // TODO handle "vstore_halfn"
-          return emitBuiltinInlineVStoreHalf(F, L.TextLeft(), B, Args);
+      }
+    } break;
+    case eCLBuiltinVLoadHalf: {
+      NameMangler Mangler(&F->getContext());
+      const auto name = Mangler.demangleName(F->getName());
+      if (name == "vload_half") {
+        // TODO handle "vload_halfn"
+        return emitBuiltinInlineVLoadHalf(F, B, Args);
+      }
+    } break;
+    case eCLBuiltinVStore: {
+      NameMangler Mangler(&F->getContext());
+      Lexer L(Mangler.demangleName(F->getName()));
+      if (L.Consume("vstore")) {
+        unsigned Width = 0;
+        if (L.ConsumeInteger(Width)) {
+          return emitBuiltinInlineVStore(F, Width, B, Args);
         }
-      } break;
-      case eCLBuiltinSelect:
-        return emitBuiltinInlineSelect(F, B, Args);
-      case eCLBuiltinAs:
-        return emitBuiltinInlineAs(F, B, Args);
-      default:
-        break;
+      }
+    } break;
+    case eCLBuiltinVStoreHalf: {
+      NameMangler Mangler(&F->getContext());
+      Lexer L(Mangler.demangleName(F->getName()));
+      if (L.Consume("vstore_half")) {
+        // TODO handle "vstore_halfn"
+        return emitBuiltinInlineVStoreHalf(F, L.TextLeft(), B, Args);
+      }
+    } break;
+    case eCLBuiltinSelect:
+      return emitBuiltinInlineSelect(F, B, Args);
+    case eCLBuiltinAs:
+      return emitBuiltinInlineAs(F, B, Args);
+    default:
+      break;
     }
     return emitBuiltinInline(*BuiltinID, B, Args);
   }
@@ -1571,54 +1570,54 @@ Value *CLBuiltinInfo::emitBuiltinInline(Function *F, IRBuilder<> &B,
 Value *CLBuiltinInfo::emitBuiltinInline(BuiltinID BuiltinID, IRBuilder<> &B,
                                         ArrayRef<Value *> Args) {
   switch (BuiltinID) {
-    default:
-      return nullptr;
+  default:
+    return nullptr;
 
-    case eCLBuiltinDot:
-    case eCLBuiltinCross:
-    case eCLBuiltinLength:
-    case eCLBuiltinDistance:
-    case eCLBuiltinNormalize:
-    case eCLBuiltinFastLength:
-    case eCLBuiltinFastDistance:
-    case eCLBuiltinFastNormalize:
-      return emitBuiltinInlineGeometrics(BuiltinID, B, Args);
-    // 6.12.2 Math Functions
-    case eCLBuiltinFMax:
-      return emitBuiltinInlineAsLLVMBinaryIntrinsic(B, Args[0], Args[1],
-                                                    llvm::Intrinsic::maxnum);
-    case eCLBuiltinFMin:
-      return emitBuiltinInlineAsLLVMBinaryIntrinsic(B, Args[0], Args[1],
-                                                    llvm::Intrinsic::minnum);
-    // 6.12.6 Relational Functions
-    case eCLBuiltinAll:
-      return emitBuiltinInlineAll(B, Args);
-    case eCLBuiltinAny:
-      return emitBuiltinInlineAny(B, Args);
-    case eCLBuiltinIsEqual:
-    case eCLBuiltinIsNotEqual:
-    case eCLBuiltinIsGreater:
-    case eCLBuiltinIsGreaterEqual:
-    case eCLBuiltinIsLess:
-    case eCLBuiltinIsLessEqual:
-    case eCLBuiltinIsLessGreater:
-    case eCLBuiltinIsOrdered:
-    case eCLBuiltinIsUnordered:
-      return emitBuiltinInlineRelationalsWithTwoArguments(BuiltinID, B, Args);
-    case eCLBuiltinIsFinite:
-    case eCLBuiltinIsInf:
-    case eCLBuiltinIsNan:
-    case eCLBuiltinIsNormal:
-    case eCLBuiltinSignBit:
-      assert(Args.size() == 1 && "Invalid number of arguments");
-      return emitBuiltinInlineRelationalsWithOneArgument(BuiltinID, B, Args[0]);
-    // 6.12.12 Miscellaneous Vector Functions
-    case eCLBuiltinShuffle:
-    case eCLBuiltinShuffle2:
-      return emitBuiltinInlineShuffle(BuiltinID, B, Args);
-
-    case eCLBuiltinPrintf:
-      return emitBuiltinInlinePrintf(BuiltinID, B, Args);
+  case eCLBuiltinDot:
+  case eCLBuiltinCross:
+  case eCLBuiltinLength:
+  case eCLBuiltinDistance:
+  case eCLBuiltinNormalize:
+  case eCLBuiltinFastLength:
+  case eCLBuiltinFastDistance:
+  case eCLBuiltinFastNormalize:
+    return emitBuiltinInlineGeometrics(BuiltinID, B, Args);
+  // 6.12.2 Math Functions
+  case eCLBuiltinFMax:
+    return emitBuiltinInlineAsLLVMBinaryIntrinsic(B, Args[0], Args[1],
+                                                  llvm::Intrinsic::maxnum);
+  case eCLBuiltinFMin:
+    return emitBuiltinInlineAsLLVMBinaryIntrinsic(B, Args[0], Args[1],
+                                                  llvm::Intrinsic::minnum);
+  // 6.12.6 Relational Functions
+  case eCLBuiltinAll:
+    return emitBuiltinInlineAll(B, Args);
+  case eCLBuiltinAny:
+    return emitBuiltinInlineAny(B, Args);
+  case eCLBuiltinIsEqual:
+  case eCLBuiltinIsNotEqual:
+  case eCLBuiltinIsGreater:
+  case eCLBuiltinIsGreaterEqual:
+  case eCLBuiltinIsLess:
+  case eCLBuiltinIsLessEqual:
+  case eCLBuiltinIsLessGreater:
+  case eCLBuiltinIsOrdered:
+  case eCLBuiltinIsUnordered:
+    return emitBuiltinInlineRelationalsWithTwoArguments(BuiltinID, B, Args);
+  case eCLBuiltinIsFinite:
+  case eCLBuiltinIsInf:
+  case eCLBuiltinIsNan:
+  case eCLBuiltinIsNormal:
+  case eCLBuiltinSignBit:
+    assert(Args.size() == 1 && "Invalid number of arguments");
+    return emitBuiltinInlineRelationalsWithOneArgument(BuiltinID, B, Args[0]);
+  // 6.12.12 Miscellaneous Vector Functions
+  case eCLBuiltinShuffle:
+  case eCLBuiltinShuffle2:
+    return emitBuiltinInlineShuffle(BuiltinID, B, Args);
+
+  case eCLBuiltinPrintf:
+    return emitBuiltinInlinePrintf(BuiltinID, B, Args);
   }
 }
 
@@ -1627,25 +1626,25 @@ Value *CLBuiltinInfo::emitBuiltinInlineGeometrics(BuiltinID BuiltinID,
                                                   ArrayRef<Value *> Args) {
   Value *Src = nullptr;
   switch (BuiltinID) {
-    default:
+  default:
+    return nullptr;
+  case eCLBuiltinDot:
+    return emitBuiltinInlineDot(B, Args);
+  case eCLBuiltinCross:
+    return emitBuiltinInlineCross(B, Args);
+  case eCLBuiltinLength:
+  case eCLBuiltinFastLength:
+    return emitBuiltinInlineLength(B, Args);
+  case eCLBuiltinDistance:
+  case eCLBuiltinFastDistance:
+    if (Args.size() != 2) {
       return nullptr;
-    case eCLBuiltinDot:
-      return emitBuiltinInlineDot(B, Args);
-    case eCLBuiltinCross:
-      return emitBuiltinInlineCross(B, Args);
-    case eCLBuiltinLength:
-    case eCLBuiltinFastLength:
-      return emitBuiltinInlineLength(B, Args);
-    case eCLBuiltinDistance:
-    case eCLBuiltinFastDistance:
-      if (Args.size() != 2) {
-        return nullptr;
-      }
-      Src = B.CreateFSub(Args[0], Args[1], "distance");
-      return emitBuiltinInlineLength(B, ArrayRef<Value *>(&Src, 1));
-    case eCLBuiltinNormalize:
-    case eCLBuiltinFastNormalize:
-      return emitBuiltinInlineNormalize(B, Args);
+    }
+    Src = B.CreateFSub(Args[0], Args[1], "distance");
+    return emitBuiltinInlineLength(B, ArrayRef<Value *>(&Src, 1));
+  case eCLBuiltinNormalize:
+  case eCLBuiltinFastNormalize:
+    return emitBuiltinInlineNormalize(B, Args);
   }
 }
 
@@ -2064,33 +2063,33 @@ Value *CLBuiltinInfo::emitBuiltinInlineConvert(Function *F, BuiltinID builtinID,
   bool DstIsSigned = false;
   auto &Ctx = B.getContext();
   switch (builtinID) {
-    case eCLBuiltinConvertChar:
-      DstIsSigned = true;
-      LLVM_FALLTHROUGH;
-    case eCLBuiltinConvertUChar:
-      DstTy = IntegerType::getInt8Ty(Ctx);
-      break;
-    case eCLBuiltinConvertShort:
-      DstIsSigned = true;
-      LLVM_FALLTHROUGH;
-    case eCLBuiltinConvertUShort:
-      DstTy = IntegerType::getInt16Ty(Ctx);
-      break;
-    case eCLBuiltinConvertInt:
-      DstIsSigned = true;
-      LLVM_FALLTHROUGH;
-    case eCLBuiltinConvertUInt:
-      DstTy = IntegerType::getInt32Ty(Ctx);
-      break;
-    case eCLBuiltinConvertLong:
-      DstIsSigned = true;
-      LLVM_FALLTHROUGH;
-    case eCLBuiltinConvertULong:
-      DstTy = IntegerType::getInt64Ty(Ctx);
-      break;
-
-    default:
-      return nullptr;
+  case eCLBuiltinConvertChar:
+    DstIsSigned = true;
+    LLVM_FALLTHROUGH;
+  case eCLBuiltinConvertUChar:
+    DstTy = IntegerType::getInt8Ty(Ctx);
+    break;
+  case eCLBuiltinConvertShort:
+    DstIsSigned = true;
+    LLVM_FALLTHROUGH;
+  case eCLBuiltinConvertUShort:
+    DstTy = IntegerType::getInt16Ty(Ctx);
+    break;
+  case eCLBuiltinConvertInt:
+    DstIsSigned = true;
+    LLVM_FALLTHROUGH;
+  case eCLBuiltinConvertUInt:
+    DstTy = IntegerType::getInt32Ty(Ctx);
+    break;
+  case eCLBuiltinConvertLong:
+    DstIsSigned = true;
+    LLVM_FALLTHROUGH;
+  case eCLBuiltinConvertULong:
+    DstTy = IntegerType::getInt64Ty(Ctx);
+    break;
+
+  default:
+    return nullptr;
   }
   if (!DstTy) {
     return nullptr;
@@ -2357,36 +2356,36 @@ Value *CLBuiltinInfo::emitBuiltinInlineRelationalsWithTwoArguments(
   CmpInst::Predicate Pred = CmpInst::FCMP_FALSE;
   CmpInst::Predicate Pred2 = CmpInst::FCMP_FALSE;
   switch (BuiltinID) {
-    default:
-      return nullptr;
-    case eCLBuiltinIsEqual:
-      Pred = CmpInst::FCMP_OEQ;
-      break;
-    case eCLBuiltinIsNotEqual:
-      Pred = CmpInst::FCMP_UNE;
-      break;
-    case eCLBuiltinIsGreater:
-      Pred = CmpInst::FCMP_OGT;
-      break;
-    case eCLBuiltinIsGreaterEqual:
-      Pred = CmpInst::FCMP_OGE;
-      break;
-    case eCLBuiltinIsLess:
-      Pred = CmpInst::FCMP_OLT;
-      break;
-    case eCLBuiltinIsLessEqual:
-      Pred = CmpInst::FCMP_OLE;
-      break;
-    case eCLBuiltinIsLessGreater:
-      Pred = CmpInst::FCMP_OLT;
-      Pred2 = CmpInst::FCMP_OGT;
-      break;
-    case eCLBuiltinIsOrdered:
-      Pred = CmpInst::FCMP_ORD;
-      break;
-    case eCLBuiltinIsUnordered:
-      Pred = CmpInst::FCMP_UNO;
-      break;
+  default:
+    return nullptr;
+  case eCLBuiltinIsEqual:
+    Pred = CmpInst::FCMP_OEQ;
+    break;
+  case eCLBuiltinIsNotEqual:
+    Pred = CmpInst::FCMP_UNE;
+    break;
+  case eCLBuiltinIsGreater:
+    Pred = CmpInst::FCMP_OGT;
+    break;
+  case eCLBuiltinIsGreaterEqual:
+    Pred = CmpInst::FCMP_OGE;
+    break;
+  case eCLBuiltinIsLess:
+    Pred = CmpInst::FCMP_OLT;
+    break;
+  case eCLBuiltinIsLessEqual:
+    Pred = CmpInst::FCMP_OLE;
+    break;
+  case eCLBuiltinIsLessGreater:
+    Pred = CmpInst::FCMP_OLT;
+    Pred2 = CmpInst::FCMP_OGT;
+    break;
+  case eCLBuiltinIsOrdered:
+    Pred = CmpInst::FCMP_ORD;
+    break;
+  case eCLBuiltinIsUnordered:
+    Pred = CmpInst::FCMP_UNO;
+    break;
   }
 
   if (Args.size() != 2) {
@@ -2496,39 +2495,39 @@ Value *CLBuiltinInfo::emitBuiltinInlineRelationalsWithOneArgument(
 
   // Emit the IR that will calculate the result
   switch (BuiltinID) {
-    default:
-      llvm_unreachable("Invalid Builtin ID");
-      break;
-    case eCLBuiltinIsFinite:
-      Result = B.CreateAnd(STArg, NonSignMask);
-      Result = B.CreateICmpSLT(Result, ExponentMask);
-      break;
-    case eCLBuiltinIsInf:
-      Result = B.CreateAnd(STArg, NonSignMask);
-      Result = B.CreateICmpEQ(Result, ExponentMask);
-      break;
-    case eCLBuiltinIsNan: {
-      Result = B.CreateAnd(STArg, NonSignMask);
-      // This checks if the exponent is all ones (the same as the ExponentMask)
-      // and also if the significant (the mantissa) is not zero. If the mantissa
-      // is zero then it would be infinite, not NaN.
-      Value *ExponentAllOnes =
-          B.CreateICmpEQ(ExponentMask, B.CreateAnd(ExponentMask, Result));
-      Value *MantissaNotZero =
-          B.CreateICmpSGT(B.CreateAnd(MantissaMask, Result), Zero);
-      Result = B.CreateAnd(ExponentAllOnes, MantissaNotZero);
-      break;
-    }
-    case eCLBuiltinIsNormal: {
-      Result = B.CreateAnd(STArg, NonSignMask);
-      Value *ExponentBitsNotAllSet = B.CreateICmpSLT(Result, ExponentMask);
-      Value *ExponentBitsNonZero = B.CreateICmpSGT(Result, MantissaMask);
-      Result = B.CreateAnd(ExponentBitsNotAllSet, ExponentBitsNonZero);
-      break;
-    }
-    case eCLBuiltinSignBit:
-      Result = B.CreateICmpSLT(STArg, Zero);
-      break;
+  default:
+    llvm_unreachable("Invalid Builtin ID");
+    break;
+  case eCLBuiltinIsFinite:
+    Result = B.CreateAnd(STArg, NonSignMask);
+    Result = B.CreateICmpSLT(Result, ExponentMask);
+    break;
+  case eCLBuiltinIsInf:
+    Result = B.CreateAnd(STArg, NonSignMask);
+    Result = B.CreateICmpEQ(Result, ExponentMask);
+    break;
+  case eCLBuiltinIsNan: {
+    Result = B.CreateAnd(STArg, NonSignMask);
+    // This checks if the exponent is all ones (the same as the ExponentMask)
+    // and also if the significand (the mantissa) is not zero. If the mantissa
+    // is zero then it would be infinite, not NaN.
+    Value *ExponentAllOnes =
+        B.CreateICmpEQ(ExponentMask, B.CreateAnd(ExponentMask, Result));
+    Value *MantissaNotZero =
+        B.CreateICmpSGT(B.CreateAnd(MantissaMask, Result), Zero);
+    Result = B.CreateAnd(ExponentAllOnes, MantissaNotZero);
+    break;
+  }
+  case eCLBuiltinIsNormal: {
+    Result = B.CreateAnd(STArg, NonSignMask);
+    Value *ExponentBitsNotAllSet = B.CreateICmpSLT(Result, ExponentMask);
+    Value *ExponentBitsNonZero = B.CreateICmpSGT(Result, MantissaMask);
+    Result = B.CreateAnd(ExponentBitsNotAllSet, ExponentBitsNonZero);
+    break;
+  }
+  case eCLBuiltinSignBit:
+    Result = B.CreateICmpSLT(STArg, Zero);
+    break;
   }
 
   // Convert the i1 result from the comparison instruction to the type that the
@@ -2649,17 +2648,17 @@ static std::optional<unsigned> parseMemFenceFlagsParam(Value *const P) {
     // cl_mem_fence_flags is a bitfield and can be 0 or a combination of
     // CLK_(GLOBAL|LOCAL|IMAGE)_MEM_FENCE values ORed together.
     switch (Flags->getZExtValue()) {
-      case 0:
-        return std::nullopt;
-      case CLK_LOCAL_MEM_FENCE:
-        return BIMuxInfoConcept::MemSemanticsWorkGroupMemory;
-      case CLK_GLOBAL_MEM_FENCE:
-        return BIMuxInfoConcept::MemSemanticsCrossWorkGroupMemory;
-      case CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE:
-        return (BIMuxInfoConcept::MemSemanticsWorkGroupMemory |
-                BIMuxInfoConcept::MemSemanticsCrossWorkGroupMemory);
-      default:
-        llvm_unreachable("unhandled memory fence flags");
+    case 0:
+      return std::nullopt;
+    case CLK_LOCAL_MEM_FENCE:
+      return BIMuxInfoConcept::MemSemanticsWorkGroupMemory;
+    case CLK_GLOBAL_MEM_FENCE:
+      return BIMuxInfoConcept::MemSemanticsCrossWorkGroupMemory;
+    case CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE:
+      return (BIMuxInfoConcept::MemSemanticsWorkGroupMemory |
+              BIMuxInfoConcept::MemSemanticsCrossWorkGroupMemory);
+    default:
+      llvm_unreachable("unhandled memory fence flags");
     }
   }
   return std::nullopt;
@@ -2668,21 +2667,21 @@ static std::optional<unsigned> parseMemFenceFlagsParam(Value *const P) {
 static std::optional<unsigned> parseMemoryScopeParam(Value *const P) {
   if (auto *const Scope = dyn_cast<ConstantInt>(P)) {
     switch (Scope->getZExtValue()) {
-      case memory_scope_work_item:
-        return BIMuxInfoConcept::MemScopeWorkItem;
-      case memory_scope_sub_group:
-        return BIMuxInfoConcept::MemScopeSubGroup;
-      case memory_scope_work_group:
-        return BIMuxInfoConcept::MemScopeWorkGroup;
-      case memory_scope_device:
-        return BIMuxInfoConcept::MemScopeDevice;
-      // 3.3.5. memory_scope_all_devices is an alias for
-      // memory_scope_all_svm_devices.
-      case memory_scope_all_devices:
-      case memory_scope_all_svm_devices:
-        return BIMuxInfoConcept::MemScopeCrossDevice;
-      default:
-        llvm_unreachable("unhandled memory scope");
+    case memory_scope_work_item:
+      return BIMuxInfoConcept::MemScopeWorkItem;
+    case memory_scope_sub_group:
+      return BIMuxInfoConcept::MemScopeSubGroup;
+    case memory_scope_work_group:
+      return BIMuxInfoConcept::MemScopeWorkGroup;
+    case memory_scope_device:
+      return BIMuxInfoConcept::MemScopeDevice;
+    // 3.3.5. memory_scope_all_devices is an alias for
+    // memory_scope_all_svm_devices.
+    case memory_scope_all_devices:
+    case memory_scope_all_svm_devices:
+      return BIMuxInfoConcept::MemScopeCrossDevice;
+    default:
+      llvm_unreachable("unhandled memory scope");
     }
   }
   return std::nullopt;
@@ -2691,18 +2690,18 @@ static std::optional<unsigned> parseMemoryScopeParam(Value *const P) {
 static std::optional<unsigned> parseMemoryOrderParam(Value *const P) {
   if (auto *const Order = dyn_cast<ConstantInt>(P)) {
     switch (Order->getZExtValue()) {
-      case memory_order_relaxed:
-        return BIMuxInfoConcept::MemSemanticsRelaxed;
-      case memory_order_acquire:
-        return BIMuxInfoConcept::MemSemanticsAcquire;
-      case memory_order_release:
-        return BIMuxInfoConcept::MemSemanticsRelease;
-      case memory_order_acq_rel:
-        return BIMuxInfoConcept::MemSemanticsAcquireRelease;
-      case memory_order_seq_cst:
-        return BIMuxInfoConcept::MemSemanticsSequentiallyConsistent;
-      default:
-        llvm_unreachable("unhandled memory order");
+    case memory_order_relaxed:
+      return BIMuxInfoConcept::MemSemanticsRelaxed;
+    case memory_order_acquire:
+      return BIMuxInfoConcept::MemSemanticsAcquire;
+    case memory_order_release:
+      return BIMuxInfoConcept::MemSemanticsRelease;
+    case memory_order_acq_rel:
+      return BIMuxInfoConcept::MemSemanticsAcquireRelease;
+    case memory_order_seq_cst:
+      return BIMuxInfoConcept::MemSemanticsSequentiallyConsistent;
+    default:
+      llvm_unreachable("unhandled memory order");
     }
   }
   return std::nullopt;
@@ -2713,50 +2712,51 @@ static std::optional<unsigned> parseMemoryOrderParam(Value *const P) {
 // are identical.
 static std::optional<BuiltinID> get1To1BuiltinLowering(BuiltinID CLBuiltinID) {
   switch (CLBuiltinID) {
-    default:
-      return std::nullopt;
-    case eCLBuiltinGetWorkDim:
-      return eMuxBuiltinGetWorkDim;
-    case eCLBuiltinGetGroupId:
-      return eMuxBuiltinGetGroupId;
-    case eCLBuiltinGetGlobalSize:
-      return eMuxBuiltinGetGlobalSize;
-    case eCLBuiltinGetGlobalOffset:
-      return eMuxBuiltinGetGlobalOffset;
-    case eCLBuiltinGetLocalId:
-      return eMuxBuiltinGetLocalId;
-    case eCLBuiltinGetLocalSize:
-      return eMuxBuiltinGetLocalSize;
-    case eCLBuiltinGetEnqueuedLocalSize:
-      return eMuxBuiltinGetEnqueuedLocalSize;
-    case eCLBuiltinGetNumGroups:
-      return eMuxBuiltinGetNumGroups;
-    case eCLBuiltinGetGlobalId:
-      return eMuxBuiltinGetGlobalId;
-    case eCLBuiltinGetLocalLinearId:
-      return eMuxBuiltinGetLocalLinearId;
-    case eCLBuiltinGetGlobalLinearId:
-      return eMuxBuiltinGetGlobalLinearId;
-    case eCLBuiltinGetSubgroupSize:
-      return eMuxBuiltinGetSubGroupSize;
-    case eCLBuiltinGetMaxSubgroupSize:
-      return eMuxBuiltinGetMaxSubGroupSize;
-    case eCLBuiltinGetSubgroupLocalId:
-      return eMuxBuiltinGetSubGroupLocalId;
-    case eCLBuiltinGetNumSubgroups:
-      return eMuxBuiltinGetNumSubGroups;
-    case eCLBuiltinGetEnqueuedNumSubgroups:
-      // Note - this is mapping to the same builtin as
-      // eCLBuiltinGetNumSubgroups, as we don't currently support
-      // non-uniform work-group sizes.
-      return eMuxBuiltinGetNumSubGroups;
-    case eCLBuiltinGetSubgroupId:
-      return eMuxBuiltinGetSubGroupId;
+  default:
+    return std::nullopt;
+  case eCLBuiltinGetWorkDim:
+    return eMuxBuiltinGetWorkDim;
+  case eCLBuiltinGetGroupId:
+    return eMuxBuiltinGetGroupId;
+  case eCLBuiltinGetGlobalSize:
+    return eMuxBuiltinGetGlobalSize;
+  case eCLBuiltinGetGlobalOffset:
+    return eMuxBuiltinGetGlobalOffset;
+  case eCLBuiltinGetLocalId:
+    return eMuxBuiltinGetLocalId;
+  case eCLBuiltinGetLocalSize:
+    return eMuxBuiltinGetLocalSize;
+  case eCLBuiltinGetEnqueuedLocalSize:
+    return eMuxBuiltinGetEnqueuedLocalSize;
+  case eCLBuiltinGetNumGroups:
+    return eMuxBuiltinGetNumGroups;
+  case eCLBuiltinGetGlobalId:
+    return eMuxBuiltinGetGlobalId;
+  case eCLBuiltinGetLocalLinearId:
+    return eMuxBuiltinGetLocalLinearId;
+  case eCLBuiltinGetGlobalLinearId:
+    return eMuxBuiltinGetGlobalLinearId;
+  case eCLBuiltinGetSubgroupSize:
+    return eMuxBuiltinGetSubGroupSize;
+  case eCLBuiltinGetMaxSubgroupSize:
+    return eMuxBuiltinGetMaxSubGroupSize;
+  case eCLBuiltinGetSubgroupLocalId:
+    return eMuxBuiltinGetSubGroupLocalId;
+  case eCLBuiltinGetNumSubgroups:
+    return eMuxBuiltinGetNumSubGroups;
+  case eCLBuiltinGetEnqueuedNumSubgroups:
+    // Note - this is mapping to the same builtin as
+    // eCLBuiltinGetNumSubgroups, as we don't currently support
+    // non-uniform work-group sizes.
+    return eMuxBuiltinGetNumSubGroups;
+  case eCLBuiltinGetSubgroupId:
+    return eMuxBuiltinGetSubGroupId;
   }
 }
 
-Instruction *CLBuiltinInfo::lowerBuiltinToMuxBuiltin(
-    CallInst &CI, BIMuxInfoConcept &BIMuxImpl) {
+Instruction *
+CLBuiltinInfo::lowerBuiltinToMuxBuiltin(CallInst &CI,
+                                        BIMuxInfoConcept &BIMuxImpl) {
   auto &M = *CI.getModule();
   auto *const F = CI.getCalledFunction();
   if (!F) {
@@ -2789,111 +2789,108 @@ Instruction *CLBuiltinInfo::lowerBuiltinToMuxBuiltin(
       BIMuxInfoConcept::MemSemanticsSequentiallyConsistent;
 
   switch (*ID) {
-    default:
-      // Sub-group and work-group builtins need lowering to their mux
-      // equivalents.
-      if (auto *const NewI =
-              lowerGroupBuiltinToMuxBuiltin(CI, *ID, BIMuxImpl)) {
-        return NewI;
-      }
-      return nullptr;
-    case eCLBuiltinSubGroupBarrier:
-      CtrlBarrierID = eMuxBuiltinSubGroupBarrier;
-      DefaultMemScope = BIMuxInfoConcept::MemScopeSubGroup;
-      LLVM_FALLTHROUGH;
-    case eCLBuiltinBarrier:
-    case eCLBuiltinWorkGroupBarrier: {
-      // Memory Scope which the barrier controls. Defaults to 'workgroup' or
-      // 'subgroup' scope depending on the barrier, but sub_group_barrier and
-      // work_group_barrier can optionally provide a scope.
-      unsigned ScopeVal = DefaultMemScope;
-      if ((ID == eCLBuiltinSubGroupBarrier ||
-           ID == eCLBuiltinWorkGroupBarrier) &&
-          F->arg_size() == 2) {
-        if (auto Scope = parseMemoryScopeParam(CI.getOperand(1))) {
-          ScopeVal = *Scope;
-        }
-      }
-
-      const unsigned SemanticsVal =
-          DefaultMemOrder |
-          parseMemFenceFlagsParam(CI.getOperand(0)).value_or(0);
-
-      auto *const CtrlBarrier =
-          BIMuxImpl.getOrDeclareMuxBuiltin(CtrlBarrierID, M);
-
-      auto *const BarrierID = ConstantInt::get(I32Ty, 0);
-      auto *const Scope = ConstantInt::get(I32Ty, ScopeVal);
-      auto *const Semantics = ConstantInt::get(I32Ty, SemanticsVal);
-      auto *const NewCI = B.CreateCall(
-          CtrlBarrier, {BarrierID, Scope, Semantics}, CI.getName());
-      NewCI->setAttributes(CtrlBarrier->getAttributes());
-      NewCI->takeName(&CI);
-      return NewCI;
+  default:
+    // Sub-group and work-group builtins need lowering to their mux
+    // equivalents.
+    if (auto *const NewI = lowerGroupBuiltinToMuxBuiltin(CI, *ID, BIMuxImpl)) {
+      return NewI;
     }
-    case eCLBuiltinAtomicWorkItemFence:
-      // atomic_work_item_fence has two parameters which we can parse.
-      DefaultMemOrder =
-          parseMemoryOrderParam(CI.getOperand(1)).value_or(DefaultMemOrder);
-      DefaultMemScope =
-          parseMemoryScopeParam(CI.getOperand(2)).value_or(DefaultMemScope);
-      LLVM_FALLTHROUGH;
-    case eCLBuiltinMemFence:
-    case eCLBuiltinReadMemFence:
-    case eCLBuiltinWriteMemFence: {
-      // The deprecated 'fence' builtins default to memory_scope_work_group and
-      // have one possible order each.
-      if (ID == eCLBuiltinMemFence) {
-        DefaultMemOrder = BIMuxInfoConcept::MemSemanticsAcquireRelease;
-      } else if (ID == eCLBuiltinReadMemFence) {
-        DefaultMemOrder = BIMuxInfoConcept::MemSemanticsAcquire;
-      } else if (ID == eCLBuiltinWriteMemFence) {
-        DefaultMemOrder = BIMuxInfoConcept::MemSemanticsRelease;
+    return nullptr;
+  case eCLBuiltinSubGroupBarrier:
+    CtrlBarrierID = eMuxBuiltinSubGroupBarrier;
+    DefaultMemScope = BIMuxInfoConcept::MemScopeSubGroup;
+    LLVM_FALLTHROUGH;
+  case eCLBuiltinBarrier:
+  case eCLBuiltinWorkGroupBarrier: {
+    // Memory Scope which the barrier controls. Defaults to 'workgroup' or
+    // 'subgroup' scope depending on the barrier, but sub_group_barrier and
+    // work_group_barrier can optionally provide a scope.
+    unsigned ScopeVal = DefaultMemScope;
+    if ((ID == eCLBuiltinSubGroupBarrier || ID == eCLBuiltinWorkGroupBarrier) &&
+        F->arg_size() == 2) {
+      if (auto Scope = parseMemoryScopeParam(CI.getOperand(1))) {
+        ScopeVal = *Scope;
       }
-      const unsigned SemanticsVal =
-          DefaultMemOrder |
-          parseMemFenceFlagsParam(CI.getOperand(0)).value_or(0);
-      auto *const MemBarrier =
-          BIMuxImpl.getOrDeclareMuxBuiltin(eMuxBuiltinMemBarrier, M);
-      auto *const Scope = ConstantInt::get(I32Ty, DefaultMemScope);
-      auto *const Semantics = ConstantInt::get(I32Ty, SemanticsVal);
-      auto *const NewCI =
-          B.CreateCall(MemBarrier, {Scope, Semantics}, CI.getName());
-      NewCI->setAttributes(MemBarrier->getAttributes());
-      NewCI->takeName(&CI);
-      return NewCI;
     }
-    case eCLBuiltinAsyncWorkGroupCopy:
-    case eCLBuiltinAsyncWorkGroupStridedCopy:
-    case eCLBuiltinAsyncWorkGroupCopy2D2D:
-    case eCLBuiltinAsyncWorkGroupCopy3D3D:
-      return lowerAsyncBuiltinToMuxBuiltin(CI, *ID, BIMuxImpl);
-    case eCLBuiltinWaitGroupEvents: {
-      auto *const MuxWait =
-          BIMuxImpl.getOrDeclareMuxBuiltin(eMuxBuiltinDMAWait, M);
-      assert(MuxWait && "Could not get/declare __mux_dma_wait");
-      auto *const Count = CI.getArgOperand(0);
-      auto *Events = CI.getArgOperand(1);
-
-      assert(Events->getType()->isPointerTy() &&
-             (Events->getType()->getPointerAddressSpace() ==
-                  compiler::utils::AddressSpace::Private ||
-              Events->getType()->getPointerAddressSpace() ==
-                  compiler::utils::AddressSpace::Generic) &&
-             "Pointer to event must be in address space 0 or 4.");
-
-      Events = B.CreatePointerBitCastOrAddrSpaceCast(
-          Events, PointerType::getUnqual(Ctx), "mux.events");
-      auto *const NewCI = B.CreateCall(MuxWait, {Count, Events}, CI.getName());
-      NewCI->setAttributes(MuxWait->getAttributes());
-      NewCI->takeName(&CI);
-      return NewCI;
+
+    const unsigned SemanticsVal =
+        DefaultMemOrder | parseMemFenceFlagsParam(CI.getOperand(0)).value_or(0);
+
+    auto *const CtrlBarrier =
+        BIMuxImpl.getOrDeclareMuxBuiltin(CtrlBarrierID, M);
+
+    auto *const BarrierID = ConstantInt::get(I32Ty, 0);
+    auto *const Scope = ConstantInt::get(I32Ty, ScopeVal);
+    auto *const Semantics = ConstantInt::get(I32Ty, SemanticsVal);
+    auto *const NewCI =
+        B.CreateCall(CtrlBarrier, {BarrierID, Scope, Semantics}, CI.getName());
+    NewCI->setAttributes(CtrlBarrier->getAttributes());
+    NewCI->takeName(&CI);
+    return NewCI;
+  }
+  case eCLBuiltinAtomicWorkItemFence:
+    // atomic_work_item_fence has two parameters which we can parse.
+    DefaultMemOrder =
+        parseMemoryOrderParam(CI.getOperand(1)).value_or(DefaultMemOrder);
+    DefaultMemScope =
+        parseMemoryScopeParam(CI.getOperand(2)).value_or(DefaultMemScope);
+    LLVM_FALLTHROUGH;
+  case eCLBuiltinMemFence:
+  case eCLBuiltinReadMemFence:
+  case eCLBuiltinWriteMemFence: {
+    // The deprecated 'fence' builtins default to memory_scope_work_group and
+    // have one possible order each.
+    if (ID == eCLBuiltinMemFence) {
+      DefaultMemOrder = BIMuxInfoConcept::MemSemanticsAcquireRelease;
+    } else if (ID == eCLBuiltinReadMemFence) {
+      DefaultMemOrder = BIMuxInfoConcept::MemSemanticsAcquire;
+    } else if (ID == eCLBuiltinWriteMemFence) {
+      DefaultMemOrder = BIMuxInfoConcept::MemSemanticsRelease;
     }
+    const unsigned SemanticsVal =
+        DefaultMemOrder | parseMemFenceFlagsParam(CI.getOperand(0)).value_or(0);
+    auto *const MemBarrier =
+        BIMuxImpl.getOrDeclareMuxBuiltin(eMuxBuiltinMemBarrier, M);
+    auto *const Scope = ConstantInt::get(I32Ty, DefaultMemScope);
+    auto *const Semantics = ConstantInt::get(I32Ty, SemanticsVal);
+    auto *const NewCI =
+        B.CreateCall(MemBarrier, {Scope, Semantics}, CI.getName());
+    NewCI->setAttributes(MemBarrier->getAttributes());
+    NewCI->takeName(&CI);
+    return NewCI;
+  }
+  case eCLBuiltinAsyncWorkGroupCopy:
+  case eCLBuiltinAsyncWorkGroupStridedCopy:
+  case eCLBuiltinAsyncWorkGroupCopy2D2D:
+  case eCLBuiltinAsyncWorkGroupCopy3D3D:
+    return lowerAsyncBuiltinToMuxBuiltin(CI, *ID, BIMuxImpl);
+  case eCLBuiltinWaitGroupEvents: {
+    auto *const MuxWait =
+        BIMuxImpl.getOrDeclareMuxBuiltin(eMuxBuiltinDMAWait, M);
+    assert(MuxWait && "Could not get/declare __mux_dma_wait");
+    auto *const Count = CI.getArgOperand(0);
+    auto *Events = CI.getArgOperand(1);
+
+    assert(Events->getType()->isPointerTy() &&
+           (Events->getType()->getPointerAddressSpace() ==
+                compiler::utils::AddressSpace::Private ||
+            Events->getType()->getPointerAddressSpace() ==
+                compiler::utils::AddressSpace::Generic) &&
+           "Pointer to event must be in address space 0 or 4.");
+
+    Events = B.CreatePointerBitCastOrAddrSpaceCast(
+        Events, PointerType::getUnqual(Ctx), "mux.events");
+    auto *const NewCI = B.CreateCall(MuxWait, {Count, Events}, CI.getName());
+    NewCI->setAttributes(MuxWait->getAttributes());
+    NewCI->takeName(&CI);
+    return NewCI;
+  }
   }
 }
 
-Instruction *CLBuiltinInfo::lowerGroupBuiltinToMuxBuiltin(
-    CallInst &CI, BuiltinID ID, BIMuxInfoConcept &BIMuxImpl) {
+Instruction *
+CLBuiltinInfo::lowerGroupBuiltinToMuxBuiltin(CallInst &CI, BuiltinID ID,
+                                             BIMuxInfoConcept &BIMuxImpl) {
   auto &M = *CI.getModule();
   auto *const F = CI.getCalledFunction();
   assert(F && "No calling function?");
@@ -2907,230 +2904,230 @@ Instruction *CLBuiltinInfo::lowerGroupBuiltinToMuxBuiltin(
   bool RecheckOpType = false;
   BaseBuiltinID MuxBuiltinID;
   switch (ID) {
-    default:
-      return nullptr;
-    case eCLBuiltinSubgroupAll:
-      MuxBuiltinID = eMuxBuiltinSubgroupAll;
-      break;
-    case eCLBuiltinSubgroupAny:
-      MuxBuiltinID = eMuxBuiltinSubgroupAny;
-      break;
-    case eCLBuiltinSubgroupBroadcast:
-      MuxBuiltinID = eMuxBuiltinSubgroupBroadcast;
-      break;
-    case eCLBuiltinSubgroupReduceAdd:
-      RecheckOpType = true;
-      MuxBuiltinID = eMuxBuiltinSubgroupReduceAdd;
-      break;
-    case eCLBuiltinSubgroupReduceMin:
-      RecheckOpType = true;
-      MuxBuiltinID = eMuxBuiltinSubgroupReduceUMin;
-      break;
-    case eCLBuiltinSubgroupReduceMax:
-      RecheckOpType = true;
-      MuxBuiltinID = eMuxBuiltinSubgroupReduceUMax;
-      break;
-    case eCLBuiltinSubgroupReduceMul:
-      RecheckOpType = true;
-      MuxBuiltinID = eMuxBuiltinSubgroupReduceMul;
-      break;
-    case eCLBuiltinSubgroupReduceAnd:
-      MuxBuiltinID = eMuxBuiltinSubgroupReduceAnd;
-      break;
-    case eCLBuiltinSubgroupReduceOr:
-      MuxBuiltinID = eMuxBuiltinSubgroupReduceOr;
-      break;
-    case eCLBuiltinSubgroupReduceXor:
-      MuxBuiltinID = eMuxBuiltinSubgroupReduceXor;
-      break;
-    case eCLBuiltinSubgroupReduceLogicalAnd:
-      MuxBuiltinID = eMuxBuiltinSubgroupReduceLogicalAnd;
-      break;
-    case eCLBuiltinSubgroupReduceLogicalOr:
-      MuxBuiltinID = eMuxBuiltinSubgroupReduceLogicalOr;
-      break;
-    case eCLBuiltinSubgroupReduceLogicalXor:
-      MuxBuiltinID = eMuxBuiltinSubgroupReduceLogicalXor;
-      break;
-    case eCLBuiltinSubgroupScanAddInclusive:
-      RecheckOpType = true;
-      MuxBuiltinID = eMuxBuiltinSubgroupScanAddInclusive;
-      break;
-    case eCLBuiltinSubgroupScanAddExclusive:
-      RecheckOpType = true;
-      MuxBuiltinID = eMuxBuiltinSubgroupScanAddExclusive;
-      break;
-    case eCLBuiltinSubgroupScanMinInclusive:
-      RecheckOpType = true;
-      MuxBuiltinID = eMuxBuiltinSubgroupScanUMinInclusive;
-      break;
-    case eCLBuiltinSubgroupScanMinExclusive:
-      RecheckOpType = true;
-      MuxBuiltinID = eMuxBuiltinSubgroupScanUMinExclusive;
-      break;
-    case eCLBuiltinSubgroupScanMaxInclusive:
-      RecheckOpType = true;
-      MuxBuiltinID = eMuxBuiltinSubgroupScanUMaxInclusive;
-      break;
-    case eCLBuiltinSubgroupScanMaxExclusive:
-      RecheckOpType = true;
-      MuxBuiltinID = eMuxBuiltinSubgroupScanUMaxExclusive;
-      break;
-    case eCLBuiltinSubgroupScanMulInclusive:
-      RecheckOpType = true;
-      MuxBuiltinID = eMuxBuiltinSubgroupScanMulInclusive;
-      break;
-    case eCLBuiltinSubgroupScanMulExclusive:
-      RecheckOpType = true;
-      MuxBuiltinID = eMuxBuiltinSubgroupScanMulExclusive;
-      break;
-    case eCLBuiltinSubgroupScanAndInclusive:
-      MuxBuiltinID = eMuxBuiltinSubgroupScanAndInclusive;
-      break;
-    case eCLBuiltinSubgroupScanAndExclusive:
-      MuxBuiltinID = eMuxBuiltinSubgroupScanAndExclusive;
-      break;
-    case eCLBuiltinSubgroupScanOrInclusive:
-      MuxBuiltinID = eMuxBuiltinSubgroupScanOrInclusive;
-      break;
-    case eCLBuiltinSubgroupScanOrExclusive:
-      MuxBuiltinID = eMuxBuiltinSubgroupScanOrExclusive;
-      break;
-    case eCLBuiltinSubgroupScanXorInclusive:
-      MuxBuiltinID = eMuxBuiltinSubgroupScanXorInclusive;
-      break;
-    case eCLBuiltinSubgroupScanXorExclusive:
-      MuxBuiltinID = eMuxBuiltinSubgroupScanXorExclusive;
-      break;
-    case eCLBuiltinSubgroupScanLogicalAndInclusive:
-      MuxBuiltinID = eMuxBuiltinSubgroupScanLogicalAndInclusive;
-      break;
-    case eCLBuiltinSubgroupScanLogicalAndExclusive:
-      MuxBuiltinID = eMuxBuiltinSubgroupScanLogicalAndExclusive;
-      break;
-    case eCLBuiltinSubgroupScanLogicalOrInclusive:
-      MuxBuiltinID = eMuxBuiltinSubgroupScanLogicalOrInclusive;
-      break;
-    case eCLBuiltinSubgroupScanLogicalOrExclusive:
-      MuxBuiltinID = eMuxBuiltinSubgroupScanLogicalOrExclusive;
-      break;
-    case eCLBuiltinSubgroupScanLogicalXorInclusive:
-      MuxBuiltinID = eMuxBuiltinSubgroupScanLogicalXorInclusive;
-      break;
-    case eCLBuiltinSubgroupScanLogicalXorExclusive:
-      MuxBuiltinID = eMuxBuiltinSubgroupScanLogicalXorExclusive;
-      break;
-    case eCLBuiltinWorkgroupAll:
-      MuxBuiltinID = eMuxBuiltinWorkgroupAll;
-      break;
-    case eCLBuiltinWorkgroupAny:
-      MuxBuiltinID = eMuxBuiltinWorkgroupAny;
-      break;
-    case eCLBuiltinWorkgroupBroadcast:
-      MuxBuiltinID = eMuxBuiltinWorkgroupBroadcast;
-      break;
-    case eCLBuiltinWorkgroupReduceAdd:
-      RecheckOpType = true;
-      MuxBuiltinID = eMuxBuiltinWorkgroupReduceAdd;
-      break;
-    case eCLBuiltinWorkgroupReduceMin:
-      RecheckOpType = true;
-      MuxBuiltinID = eMuxBuiltinWorkgroupReduceUMin;
-      break;
-    case eCLBuiltinWorkgroupReduceMax:
-      RecheckOpType = true;
-      MuxBuiltinID = eMuxBuiltinWorkgroupReduceUMax;
-      break;
-    case eCLBuiltinWorkgroupReduceMul:
-      RecheckOpType = true;
-      MuxBuiltinID = eMuxBuiltinWorkgroupReduceMul;
-      break;
-    case eCLBuiltinWorkgroupReduceAnd:
-      MuxBuiltinID = eMuxBuiltinWorkgroupReduceAnd;
-      break;
-    case eCLBuiltinWorkgroupReduceOr:
-      MuxBuiltinID = eMuxBuiltinWorkgroupReduceOr;
-      break;
-    case eCLBuiltinWorkgroupReduceXor:
-      MuxBuiltinID = eMuxBuiltinWorkgroupReduceXor;
-      break;
-    case eCLBuiltinWorkgroupReduceLogicalAnd:
-      MuxBuiltinID = eMuxBuiltinWorkgroupReduceLogicalAnd;
-      break;
-    case eCLBuiltinWorkgroupReduceLogicalOr:
-      MuxBuiltinID = eMuxBuiltinWorkgroupReduceLogicalOr;
-      break;
-    case eCLBuiltinWorkgroupReduceLogicalXor:
-      MuxBuiltinID = eMuxBuiltinWorkgroupReduceLogicalXor;
-      break;
-    case eCLBuiltinWorkgroupScanAddInclusive:
-      RecheckOpType = true;
-      MuxBuiltinID = eMuxBuiltinWorkgroupScanAddInclusive;
-      break;
-    case eCLBuiltinWorkgroupScanAddExclusive:
-      RecheckOpType = true;
-      MuxBuiltinID = eMuxBuiltinWorkgroupScanAddExclusive;
-      break;
-    case eCLBuiltinWorkgroupScanMinInclusive:
-      RecheckOpType = true;
-      MuxBuiltinID = eMuxBuiltinWorkgroupScanUMinInclusive;
-      break;
-    case eCLBuiltinWorkgroupScanMinExclusive:
-      RecheckOpType = true;
-      MuxBuiltinID = eMuxBuiltinWorkgroupScanUMinExclusive;
-      break;
-    case eCLBuiltinWorkgroupScanMaxInclusive:
-      RecheckOpType = true;
-      MuxBuiltinID = eMuxBuiltinWorkgroupScanUMaxInclusive;
-      break;
-    case eCLBuiltinWorkgroupScanMaxExclusive:
-      RecheckOpType = true;
-      MuxBuiltinID = eMuxBuiltinWorkgroupScanUMaxExclusive;
-      break;
-    case eCLBuiltinWorkgroupScanMulInclusive:
-      RecheckOpType = true;
-      MuxBuiltinID = eMuxBuiltinWorkgroupScanMulInclusive;
-      break;
-    case eCLBuiltinWorkgroupScanMulExclusive:
-      RecheckOpType = true;
-      MuxBuiltinID = eMuxBuiltinWorkgroupScanMulExclusive;
-      break;
-    case eCLBuiltinWorkgroupScanAndInclusive:
-      MuxBuiltinID = eMuxBuiltinWorkgroupScanAndInclusive;
-      break;
-    case eCLBuiltinWorkgroupScanAndExclusive:
-      MuxBuiltinID = eMuxBuiltinWorkgroupScanAndExclusive;
-      break;
-    case eCLBuiltinWorkgroupScanOrInclusive:
-      MuxBuiltinID = eMuxBuiltinWorkgroupScanOrInclusive;
-      break;
-    case eCLBuiltinWorkgroupScanOrExclusive:
-      MuxBuiltinID = eMuxBuiltinWorkgroupScanOrExclusive;
-      break;
-    case eCLBuiltinWorkgroupScanXorInclusive:
-      MuxBuiltinID = eMuxBuiltinWorkgroupScanXorInclusive;
-      break;
-    case eCLBuiltinWorkgroupScanXorExclusive:
-      MuxBuiltinID = eMuxBuiltinWorkgroupScanXorExclusive;
-      break;
-    case eCLBuiltinWorkgroupScanLogicalAndInclusive:
-      MuxBuiltinID = eMuxBuiltinWorkgroupScanLogicalAndInclusive;
-      break;
-    case eCLBuiltinWorkgroupScanLogicalAndExclusive:
-      MuxBuiltinID = eMuxBuiltinWorkgroupScanLogicalAndExclusive;
-      break;
-    case eCLBuiltinWorkgroupScanLogicalOrInclusive:
-      MuxBuiltinID = eMuxBuiltinWorkgroupScanLogicalOrInclusive;
-      break;
-    case eCLBuiltinWorkgroupScanLogicalOrExclusive:
-      MuxBuiltinID = eMuxBuiltinWorkgroupScanLogicalOrExclusive;
-      break;
-    case eCLBuiltinWorkgroupScanLogicalXorInclusive:
-      MuxBuiltinID = eMuxBuiltinWorkgroupScanLogicalXorInclusive;
-      break;
-    case eCLBuiltinWorkgroupScanLogicalXorExclusive:
-      MuxBuiltinID = eMuxBuiltinWorkgroupScanLogicalXorExclusive;
-      break;
+  default:
+    return nullptr;
+  case eCLBuiltinSubgroupAll:
+    MuxBuiltinID = eMuxBuiltinSubgroupAll;
+    break;
+  case eCLBuiltinSubgroupAny:
+    MuxBuiltinID = eMuxBuiltinSubgroupAny;
+    break;
+  case eCLBuiltinSubgroupBroadcast:
+    MuxBuiltinID = eMuxBuiltinSubgroupBroadcast;
+    break;
+  case eCLBuiltinSubgroupReduceAdd:
+    RecheckOpType = true;
+    MuxBuiltinID = eMuxBuiltinSubgroupReduceAdd;
+    break;
+  case eCLBuiltinSubgroupReduceMin:
+    RecheckOpType = true;
+    MuxBuiltinID = eMuxBuiltinSubgroupReduceUMin;
+    break;
+  case eCLBuiltinSubgroupReduceMax:
+    RecheckOpType = true;
+    MuxBuiltinID = eMuxBuiltinSubgroupReduceUMax;
+    break;
+  case eCLBuiltinSubgroupReduceMul:
+    RecheckOpType = true;
+    MuxBuiltinID = eMuxBuiltinSubgroupReduceMul;
+    break;
+  case eCLBuiltinSubgroupReduceAnd:
+    MuxBuiltinID = eMuxBuiltinSubgroupReduceAnd;
+    break;
+  case eCLBuiltinSubgroupReduceOr:
+    MuxBuiltinID = eMuxBuiltinSubgroupReduceOr;
+    break;
+  case eCLBuiltinSubgroupReduceXor:
+    MuxBuiltinID = eMuxBuiltinSubgroupReduceXor;
+    break;
+  case eCLBuiltinSubgroupReduceLogicalAnd:
+    MuxBuiltinID = eMuxBuiltinSubgroupReduceLogicalAnd;
+    break;
+  case eCLBuiltinSubgroupReduceLogicalOr:
+    MuxBuiltinID = eMuxBuiltinSubgroupReduceLogicalOr;
+    break;
+  case eCLBuiltinSubgroupReduceLogicalXor:
+    MuxBuiltinID = eMuxBuiltinSubgroupReduceLogicalXor;
+    break;
+  case eCLBuiltinSubgroupScanAddInclusive:
+    RecheckOpType = true;
+    MuxBuiltinID = eMuxBuiltinSubgroupScanAddInclusive;
+    break;
+  case eCLBuiltinSubgroupScanAddExclusive:
+    RecheckOpType = true;
+    MuxBuiltinID = eMuxBuiltinSubgroupScanAddExclusive;
+    break;
+  case eCLBuiltinSubgroupScanMinInclusive:
+    RecheckOpType = true;
+    MuxBuiltinID = eMuxBuiltinSubgroupScanUMinInclusive;
+    break;
+  case eCLBuiltinSubgroupScanMinExclusive:
+    RecheckOpType = true;
+    MuxBuiltinID = eMuxBuiltinSubgroupScanUMinExclusive;
+    break;
+  case eCLBuiltinSubgroupScanMaxInclusive:
+    RecheckOpType = true;
+    MuxBuiltinID = eMuxBuiltinSubgroupScanUMaxInclusive;
+    break;
+  case eCLBuiltinSubgroupScanMaxExclusive:
+    RecheckOpType = true;
+    MuxBuiltinID = eMuxBuiltinSubgroupScanUMaxExclusive;
+    break;
+  case eCLBuiltinSubgroupScanMulInclusive:
+    RecheckOpType = true;
+    MuxBuiltinID = eMuxBuiltinSubgroupScanMulInclusive;
+    break;
+  case eCLBuiltinSubgroupScanMulExclusive:
+    RecheckOpType = true;
+    MuxBuiltinID = eMuxBuiltinSubgroupScanMulExclusive;
+    break;
+  case eCLBuiltinSubgroupScanAndInclusive:
+    MuxBuiltinID = eMuxBuiltinSubgroupScanAndInclusive;
+    break;
+  case eCLBuiltinSubgroupScanAndExclusive:
+    MuxBuiltinID = eMuxBuiltinSubgroupScanAndExclusive;
+    break;
+  case eCLBuiltinSubgroupScanOrInclusive:
+    MuxBuiltinID = eMuxBuiltinSubgroupScanOrInclusive;
+    break;
+  case eCLBuiltinSubgroupScanOrExclusive:
+    MuxBuiltinID = eMuxBuiltinSubgroupScanOrExclusive;
+    break;
+  case eCLBuiltinSubgroupScanXorInclusive:
+    MuxBuiltinID = eMuxBuiltinSubgroupScanXorInclusive;
+    break;
+  case eCLBuiltinSubgroupScanXorExclusive:
+    MuxBuiltinID = eMuxBuiltinSubgroupScanXorExclusive;
+    break;
+  case eCLBuiltinSubgroupScanLogicalAndInclusive:
+    MuxBuiltinID = eMuxBuiltinSubgroupScanLogicalAndInclusive;
+    break;
+  case eCLBuiltinSubgroupScanLogicalAndExclusive:
+    MuxBuiltinID = eMuxBuiltinSubgroupScanLogicalAndExclusive;
+    break;
+  case eCLBuiltinSubgroupScanLogicalOrInclusive:
+    MuxBuiltinID = eMuxBuiltinSubgroupScanLogicalOrInclusive;
+    break;
+  case eCLBuiltinSubgroupScanLogicalOrExclusive:
+    MuxBuiltinID = eMuxBuiltinSubgroupScanLogicalOrExclusive;
+    break;
+  case eCLBuiltinSubgroupScanLogicalXorInclusive:
+    MuxBuiltinID = eMuxBuiltinSubgroupScanLogicalXorInclusive;
+    break;
+  case eCLBuiltinSubgroupScanLogicalXorExclusive:
+    MuxBuiltinID = eMuxBuiltinSubgroupScanLogicalXorExclusive;
+    break;
+  case eCLBuiltinWorkgroupAll:
+    MuxBuiltinID = eMuxBuiltinWorkgroupAll;
+    break;
+  case eCLBuiltinWorkgroupAny:
+    MuxBuiltinID = eMuxBuiltinWorkgroupAny;
+    break;
+  case eCLBuiltinWorkgroupBroadcast:
+    MuxBuiltinID = eMuxBuiltinWorkgroupBroadcast;
+    break;
+  case eCLBuiltinWorkgroupReduceAdd:
+    RecheckOpType = true;
+    MuxBuiltinID = eMuxBuiltinWorkgroupReduceAdd;
+    break;
+  case eCLBuiltinWorkgroupReduceMin:
+    RecheckOpType = true;
+    MuxBuiltinID = eMuxBuiltinWorkgroupReduceUMin;
+    break;
+  case eCLBuiltinWorkgroupReduceMax:
+    RecheckOpType = true;
+    MuxBuiltinID = eMuxBuiltinWorkgroupReduceUMax;
+    break;
+  case eCLBuiltinWorkgroupReduceMul:
+    RecheckOpType = true;
+    MuxBuiltinID = eMuxBuiltinWorkgroupReduceMul;
+    break;
+  case eCLBuiltinWorkgroupReduceAnd:
+    MuxBuiltinID = eMuxBuiltinWorkgroupReduceAnd;
+    break;
+  case eCLBuiltinWorkgroupReduceOr:
+    MuxBuiltinID = eMuxBuiltinWorkgroupReduceOr;
+    break;
+  case eCLBuiltinWorkgroupReduceXor:
+    MuxBuiltinID = eMuxBuiltinWorkgroupReduceXor;
+    break;
+  case eCLBuiltinWorkgroupReduceLogicalAnd:
+    MuxBuiltinID = eMuxBuiltinWorkgroupReduceLogicalAnd;
+    break;
+  case eCLBuiltinWorkgroupReduceLogicalOr:
+    MuxBuiltinID = eMuxBuiltinWorkgroupReduceLogicalOr;
+    break;
+  case eCLBuiltinWorkgroupReduceLogicalXor:
+    MuxBuiltinID = eMuxBuiltinWorkgroupReduceLogicalXor;
+    break;
+  case eCLBuiltinWorkgroupScanAddInclusive:
+    RecheckOpType = true;
+    MuxBuiltinID = eMuxBuiltinWorkgroupScanAddInclusive;
+    break;
+  case eCLBuiltinWorkgroupScanAddExclusive:
+    RecheckOpType = true;
+    MuxBuiltinID = eMuxBuiltinWorkgroupScanAddExclusive;
+    break;
+  case eCLBuiltinWorkgroupScanMinInclusive:
+    RecheckOpType = true;
+    MuxBuiltinID = eMuxBuiltinWorkgroupScanUMinInclusive;
+    break;
+  case eCLBuiltinWorkgroupScanMinExclusive:
+    RecheckOpType = true;
+    MuxBuiltinID = eMuxBuiltinWorkgroupScanUMinExclusive;
+    break;
+  case eCLBuiltinWorkgroupScanMaxInclusive:
+    RecheckOpType = true;
+    MuxBuiltinID = eMuxBuiltinWorkgroupScanUMaxInclusive;
+    break;
+  case eCLBuiltinWorkgroupScanMaxExclusive:
+    RecheckOpType = true;
+    MuxBuiltinID = eMuxBuiltinWorkgroupScanUMaxExclusive;
+    break;
+  case eCLBuiltinWorkgroupScanMulInclusive:
+    RecheckOpType = true;
+    MuxBuiltinID = eMuxBuiltinWorkgroupScanMulInclusive;
+    break;
+  case eCLBuiltinWorkgroupScanMulExclusive:
+    RecheckOpType = true;
+    MuxBuiltinID = eMuxBuiltinWorkgroupScanMulExclusive;
+    break;
+  case eCLBuiltinWorkgroupScanAndInclusive:
+    MuxBuiltinID = eMuxBuiltinWorkgroupScanAndInclusive;
+    break;
+  case eCLBuiltinWorkgroupScanAndExclusive:
+    MuxBuiltinID = eMuxBuiltinWorkgroupScanAndExclusive;
+    break;
+  case eCLBuiltinWorkgroupScanOrInclusive:
+    MuxBuiltinID = eMuxBuiltinWorkgroupScanOrInclusive;
+    break;
+  case eCLBuiltinWorkgroupScanOrExclusive:
+    MuxBuiltinID = eMuxBuiltinWorkgroupScanOrExclusive;
+    break;
+  case eCLBuiltinWorkgroupScanXorInclusive:
+    MuxBuiltinID = eMuxBuiltinWorkgroupScanXorInclusive;
+    break;
+  case eCLBuiltinWorkgroupScanXorExclusive:
+    MuxBuiltinID = eMuxBuiltinWorkgroupScanXorExclusive;
+    break;
+  case eCLBuiltinWorkgroupScanLogicalAndInclusive:
+    MuxBuiltinID = eMuxBuiltinWorkgroupScanLogicalAndInclusive;
+    break;
+  case eCLBuiltinWorkgroupScanLogicalAndExclusive:
+    MuxBuiltinID = eMuxBuiltinWorkgroupScanLogicalAndExclusive;
+    break;
+  case eCLBuiltinWorkgroupScanLogicalOrInclusive:
+    MuxBuiltinID = eMuxBuiltinWorkgroupScanLogicalOrInclusive;
+    break;
+  case eCLBuiltinWorkgroupScanLogicalOrExclusive:
+    MuxBuiltinID = eMuxBuiltinWorkgroupScanLogicalOrExclusive;
+    break;
+  case eCLBuiltinWorkgroupScanLogicalXorInclusive:
+    MuxBuiltinID = eMuxBuiltinWorkgroupScanLogicalXorInclusive;
+    break;
+  case eCLBuiltinWorkgroupScanLogicalXorExclusive:
+    MuxBuiltinID = eMuxBuiltinWorkgroupScanLogicalXorExclusive;
+    break;
   }
 
   if (RecheckOpType) {
@@ -3154,128 +3151,140 @@ Instruction *CLBuiltinInfo::lowerGroupBuiltinToMuxBuiltin(
 
     const bool IsFP = ArgumentTypes[0]->isFloatingPointTy();
     switch (MuxBuiltinID) {
-      default:
-        llvm_unreachable("unknown group operation for which to check the type");
-      case eMuxBuiltinSubgroupReduceAdd:
-        if (IsFP) MuxBuiltinID = eMuxBuiltinSubgroupReduceFAdd;
-        break;
-      case eMuxBuiltinSubgroupReduceMul:
-        if (IsFP) MuxBuiltinID = eMuxBuiltinSubgroupReduceFMul;
-        break;
-      case eMuxBuiltinSubgroupReduceUMin:
-        if (IsFP) {
-          MuxBuiltinID = eMuxBuiltinSubgroupReduceFMin;
-        } else if (IsSignedInt) {
-          MuxBuiltinID = eMuxBuiltinSubgroupReduceSMin;
-        }
-        break;
-      case eMuxBuiltinSubgroupReduceUMax:
-        if (IsFP) {
-          MuxBuiltinID = eMuxBuiltinSubgroupReduceFMax;
-        } else if (IsSignedInt) {
-          MuxBuiltinID = eMuxBuiltinSubgroupReduceSMax;
-        }
-        break;
-      case eMuxBuiltinSubgroupScanAddInclusive:
-        if (IsFP) MuxBuiltinID = eMuxBuiltinSubgroupScanFAddInclusive;
-        break;
-      case eMuxBuiltinSubgroupScanAddExclusive:
-        if (IsFP) MuxBuiltinID = eMuxBuiltinSubgroupScanFAddExclusive;
-        break;
-      case eMuxBuiltinSubgroupScanMulInclusive:
-        if (IsFP) MuxBuiltinID = eMuxBuiltinSubgroupScanFMulInclusive;
-        break;
-      case eMuxBuiltinSubgroupScanMulExclusive:
-        if (IsFP) MuxBuiltinID = eMuxBuiltinSubgroupScanFMulExclusive;
-        break;
-      case eMuxBuiltinSubgroupScanUMinInclusive:
-        if (IsFP) {
-          MuxBuiltinID = eMuxBuiltinSubgroupScanFMinInclusive;
-        } else if (IsSignedInt) {
-          MuxBuiltinID = eMuxBuiltinSubgroupScanSMinInclusive;
-        }
-        break;
-      case eMuxBuiltinSubgroupScanUMinExclusive:
-        if (IsFP) {
-          MuxBuiltinID = eMuxBuiltinSubgroupScanFMinExclusive;
-        } else if (IsSignedInt) {
-          MuxBuiltinID = eMuxBuiltinSubgroupScanSMinExclusive;
-        }
-        break;
-      case eMuxBuiltinSubgroupScanUMaxInclusive:
-        if (IsFP) {
-          MuxBuiltinID = eMuxBuiltinSubgroupScanFMaxInclusive;
-        } else if (IsSignedInt) {
-          MuxBuiltinID = eMuxBuiltinSubgroupScanSMaxInclusive;
-        }
-        break;
-      case eMuxBuiltinSubgroupScanUMaxExclusive:
-        if (IsFP) {
-          MuxBuiltinID = eMuxBuiltinSubgroupScanFMaxExclusive;
-        } else if (IsSignedInt) {
-          MuxBuiltinID = eMuxBuiltinSubgroupScanSMaxExclusive;
-        }
-        break;
-      case eMuxBuiltinWorkgroupReduceAdd:
-        if (IsFP) MuxBuiltinID = eMuxBuiltinWorkgroupReduceFAdd;
-        break;
-      case eMuxBuiltinWorkgroupReduceMul:
-        if (IsFP) MuxBuiltinID = eMuxBuiltinWorkgroupReduceFMul;
-        break;
-      case eMuxBuiltinWorkgroupReduceUMin:
-        if (IsFP) {
-          MuxBuiltinID = eMuxBuiltinWorkgroupReduceFMin;
-        } else if (IsSignedInt) {
-          MuxBuiltinID = eMuxBuiltinWorkgroupReduceSMin;
-        }
-        break;
-      case eMuxBuiltinWorkgroupReduceUMax:
-        if (IsFP) {
-          MuxBuiltinID = eMuxBuiltinWorkgroupReduceFMax;
-        } else if (IsSignedInt) {
-          MuxBuiltinID = eMuxBuiltinWorkgroupReduceSMax;
-        }
-        break;
-      case eMuxBuiltinWorkgroupScanAddInclusive:
-        if (IsFP) MuxBuiltinID = eMuxBuiltinWorkgroupScanFAddInclusive;
-        break;
-      case eMuxBuiltinWorkgroupScanAddExclusive:
-        if (IsFP) MuxBuiltinID = eMuxBuiltinWorkgroupScanFAddExclusive;
-        break;
-      case eMuxBuiltinWorkgroupScanMulInclusive:
-        if (IsFP) MuxBuiltinID = eMuxBuiltinWorkgroupScanFMulInclusive;
-        break;
-      case eMuxBuiltinWorkgroupScanMulExclusive:
-        if (IsFP) MuxBuiltinID = eMuxBuiltinWorkgroupScanFMulExclusive;
-        break;
-      case eMuxBuiltinWorkgroupScanUMinInclusive:
-        if (IsFP) {
-          MuxBuiltinID = eMuxBuiltinWorkgroupScanFMinInclusive;
-        } else if (IsSignedInt) {
-          MuxBuiltinID = eMuxBuiltinWorkgroupScanSMinInclusive;
-        }
-        break;
-      case eMuxBuiltinWorkgroupScanUMinExclusive:
-        if (IsFP) {
-          MuxBuiltinID = eMuxBuiltinWorkgroupScanFMinExclusive;
-        } else if (IsSignedInt) {
-          MuxBuiltinID = eMuxBuiltinWorkgroupScanSMinExclusive;
-        }
-        break;
-      case eMuxBuiltinWorkgroupScanUMaxInclusive:
-        if (IsFP) {
-          MuxBuiltinID = eMuxBuiltinWorkgroupScanFMaxInclusive;
-        } else if (IsSignedInt) {
-          MuxBuiltinID = eMuxBuiltinWorkgroupScanSMaxInclusive;
-        }
-        break;
-      case eMuxBuiltinWorkgroupScanUMaxExclusive:
-        if (IsFP) {
-          MuxBuiltinID = eMuxBuiltinWorkgroupScanFMaxExclusive;
-        } else if (IsSignedInt) {
-          MuxBuiltinID = eMuxBuiltinWorkgroupScanSMaxExclusive;
-        }
-        break;
+    default:
+      llvm_unreachable("unknown group operation for which to check the type");
+    case eMuxBuiltinSubgroupReduceAdd:
+      if (IsFP)
+        MuxBuiltinID = eMuxBuiltinSubgroupReduceFAdd;
+      break;
+    case eMuxBuiltinSubgroupReduceMul:
+      if (IsFP)
+        MuxBuiltinID = eMuxBuiltinSubgroupReduceFMul;
+      break;
+    case eMuxBuiltinSubgroupReduceUMin:
+      if (IsFP) {
+        MuxBuiltinID = eMuxBuiltinSubgroupReduceFMin;
+      } else if (IsSignedInt) {
+        MuxBuiltinID = eMuxBuiltinSubgroupReduceSMin;
+      }
+      break;
+    case eMuxBuiltinSubgroupReduceUMax:
+      if (IsFP) {
+        MuxBuiltinID = eMuxBuiltinSubgroupReduceFMax;
+      } else if (IsSignedInt) {
+        MuxBuiltinID = eMuxBuiltinSubgroupReduceSMax;
+      }
+      break;
+    case eMuxBuiltinSubgroupScanAddInclusive:
+      if (IsFP)
+        MuxBuiltinID = eMuxBuiltinSubgroupScanFAddInclusive;
+      break;
+    case eMuxBuiltinSubgroupScanAddExclusive:
+      if (IsFP)
+        MuxBuiltinID = eMuxBuiltinSubgroupScanFAddExclusive;
+      break;
+    case eMuxBuiltinSubgroupScanMulInclusive:
+      if (IsFP)
+        MuxBuiltinID = eMuxBuiltinSubgroupScanFMulInclusive;
+      break;
+    case eMuxBuiltinSubgroupScanMulExclusive:
+      if (IsFP)
+        MuxBuiltinID = eMuxBuiltinSubgroupScanFMulExclusive;
+      break;
+    case eMuxBuiltinSubgroupScanUMinInclusive:
+      if (IsFP) {
+        MuxBuiltinID = eMuxBuiltinSubgroupScanFMinInclusive;
+      } else if (IsSignedInt) {
+        MuxBuiltinID = eMuxBuiltinSubgroupScanSMinInclusive;
+      }
+      break;
+    case eMuxBuiltinSubgroupScanUMinExclusive:
+      if (IsFP) {
+        MuxBuiltinID = eMuxBuiltinSubgroupScanFMinExclusive;
+      } else if (IsSignedInt) {
+        MuxBuiltinID = eMuxBuiltinSubgroupScanSMinExclusive;
+      }
+      break;
+    case eMuxBuiltinSubgroupScanUMaxInclusive:
+      if (IsFP) {
+        MuxBuiltinID = eMuxBuiltinSubgroupScanFMaxInclusive;
+      } else if (IsSignedInt) {
+        MuxBuiltinID = eMuxBuiltinSubgroupScanSMaxInclusive;
+      }
+      break;
+    case eMuxBuiltinSubgroupScanUMaxExclusive:
+      if (IsFP) {
+        MuxBuiltinID = eMuxBuiltinSubgroupScanFMaxExclusive;
+      } else if (IsSignedInt) {
+        MuxBuiltinID = eMuxBuiltinSubgroupScanSMaxExclusive;
+      }
+      break;
+    case eMuxBuiltinWorkgroupReduceAdd:
+      if (IsFP)
+        MuxBuiltinID = eMuxBuiltinWorkgroupReduceFAdd;
+      break;
+    case eMuxBuiltinWorkgroupReduceMul:
+      if (IsFP)
+        MuxBuiltinID = eMuxBuiltinWorkgroupReduceFMul;
+      break;
+    case eMuxBuiltinWorkgroupReduceUMin:
+      if (IsFP) {
+        MuxBuiltinID = eMuxBuiltinWorkgroupReduceFMin;
+      } else if (IsSignedInt) {
+        MuxBuiltinID = eMuxBuiltinWorkgroupReduceSMin;
+      }
+      break;
+    case eMuxBuiltinWorkgroupReduceUMax:
+      if (IsFP) {
+        MuxBuiltinID = eMuxBuiltinWorkgroupReduceFMax;
+      } else if (IsSignedInt) {
+        MuxBuiltinID = eMuxBuiltinWorkgroupReduceSMax;
+      }
+      break;
+    case eMuxBuiltinWorkgroupScanAddInclusive:
+      if (IsFP)
+        MuxBuiltinID = eMuxBuiltinWorkgroupScanFAddInclusive;
+      break;
+    case eMuxBuiltinWorkgroupScanAddExclusive:
+      if (IsFP)
+        MuxBuiltinID = eMuxBuiltinWorkgroupScanFAddExclusive;
+      break;
+    case eMuxBuiltinWorkgroupScanMulInclusive:
+      if (IsFP)
+        MuxBuiltinID = eMuxBuiltinWorkgroupScanFMulInclusive;
+      break;
+    case eMuxBuiltinWorkgroupScanMulExclusive:
+      if (IsFP)
+        MuxBuiltinID = eMuxBuiltinWorkgroupScanFMulExclusive;
+      break;
+    case eMuxBuiltinWorkgroupScanUMinInclusive:
+      if (IsFP) {
+        MuxBuiltinID = eMuxBuiltinWorkgroupScanFMinInclusive;
+      } else if (IsSignedInt) {
+        MuxBuiltinID = eMuxBuiltinWorkgroupScanSMinInclusive;
+      }
+      break;
+    case eMuxBuiltinWorkgroupScanUMinExclusive:
+      if (IsFP) {
+        MuxBuiltinID = eMuxBuiltinWorkgroupScanFMinExclusive;
+      } else if (IsSignedInt) {
+        MuxBuiltinID = eMuxBuiltinWorkgroupScanSMinExclusive;
+      }
+      break;
+    case eMuxBuiltinWorkgroupScanUMaxInclusive:
+      if (IsFP) {
+        MuxBuiltinID = eMuxBuiltinWorkgroupScanFMaxInclusive;
+      } else if (IsSignedInt) {
+        MuxBuiltinID = eMuxBuiltinWorkgroupScanSMaxInclusive;
+      }
+      break;
+    case eMuxBuiltinWorkgroupScanUMaxExclusive:
+      if (IsFP) {
+        MuxBuiltinID = eMuxBuiltinWorkgroupScanFMaxExclusive;
+      } else if (IsSignedInt) {
+        MuxBuiltinID = eMuxBuiltinWorkgroupScanSMaxExclusive;
+      }
+      break;
     }
   }
 
@@ -3344,8 +3353,9 @@ Instruction *CLBuiltinInfo::lowerGroupBuiltinToMuxBuiltin(
   return SExt;
 }
 
-Instruction *CLBuiltinInfo::lowerAsyncBuiltinToMuxBuiltin(
-    CallInst &CI, BuiltinID ID, BIMuxInfoConcept &BIMuxImpl) {
+Instruction *
+CLBuiltinInfo::lowerAsyncBuiltinToMuxBuiltin(CallInst &CI, BuiltinID ID,
+                                             BIMuxInfoConcept &BIMuxImpl) {
   assert((ID == eCLBuiltinAsyncWorkGroupCopy ||
           ID == eCLBuiltinAsyncWorkGroupStridedCopy ||
           ID == eCLBuiltinAsyncWorkGroupCopy2D2D ||
@@ -3358,168 +3368,165 @@ Instruction *CLBuiltinInfo::lowerAsyncBuiltinToMuxBuiltin(
   const auto &DL = M.getDataLayout();
 
   switch (ID) {
-    default:
-      llvm_unreachable("Unhandled builtin");
-    case eCLBuiltinAsyncWorkGroupCopy:
-    case eCLBuiltinAsyncWorkGroupStridedCopy: {
-      NameMangler Mangler(&Ctx);
-
-      // Do a full demangle to determing the pointer element type of the first
-      // argument.
-      SmallVector<Type *, 4> BuiltinArgTypes, BuiltinArgPointeeTypes;
-      SmallVector<compiler::utils::TypeQualifiers, 4> BuiltinArgQuals;
-
-      [[maybe_unused]] const StringRef BuiltinName = Mangler.demangleName(
-          CI.getCalledFunction()->getName(), BuiltinArgTypes,
-          BuiltinArgPointeeTypes, BuiltinArgQuals);
-      assert(!BuiltinName.empty() && BuiltinArgTypes[0]->isPointerTy() &&
-             BuiltinArgPointeeTypes[0] && "Could not demangle async builtin");
-
-      auto *const DataTy = BuiltinArgPointeeTypes[0];
-      const bool IsStrided = ID == eCLBuiltinAsyncWorkGroupStridedCopy;
-
-      auto *const Dst = CI.getArgOperand(0);
-      auto *const Src = CI.getArgOperand(1);
-      auto *const NumElements = CI.getArgOperand(2);
-      auto *const EventIn = CI.getArgOperand(3 + IsStrided);
-
-      // Find out which way the DMA is going and declare the appropriate mux
-      // builtin.
-      const bool IsRead = Dst->getType()->getPointerAddressSpace() ==
-                          compiler::utils::AddressSpace::Local;
-      const auto ElementTypeWidthInBytes =
-          DL.getTypeAllocSize(DataTy).getFixedValue();
-      auto *const ElementSize =
-          ConstantInt::get(NumElements->getType(), ElementTypeWidthInBytes);
-
-      auto *const WidthInBytes =
-          IsStrided ? ElementSize
-                    : B.CreateMul(ElementSize, NumElements, "width.bytes");
-
-      const BuiltinID MuxBuiltinID = [&] {
-        if (IsRead) {
-          return IsStrided ? eMuxBuiltinDMARead2D : eMuxBuiltinDMARead1D;
-        } else {
-          return IsStrided ? eMuxBuiltinDMAWrite2D : eMuxBuiltinDMAWrite1D;
-        }
-      }();
-
-      auto *const MuxDMA =
-          BIMuxImpl.getOrDeclareMuxBuiltin(MuxBuiltinID, M, EventIn->getType());
-      assert(MuxDMA && "Could not get/declare mux dma read/write");
-
-      CallInst *NewCI = nullptr;
-      if (!IsStrided) {
-        NewCI = B.CreateCall(MuxDMA, {Dst, Src, WidthInBytes, EventIn},
-                             "mux.out.event");
+  default:
+    llvm_unreachable("Unhandled builtin");
+  case eCLBuiltinAsyncWorkGroupCopy:
+  case eCLBuiltinAsyncWorkGroupStridedCopy: {
+    NameMangler Mangler(&Ctx);
+
+    // Do a full demangle to determing the pointer element type of the first
+    // argument.
+    SmallVector<Type *, 4> BuiltinArgTypes, BuiltinArgPointeeTypes;
+    SmallVector<compiler::utils::TypeQualifiers, 4> BuiltinArgQuals;
+
+    [[maybe_unused]] const StringRef BuiltinName =
+        Mangler.demangleName(CI.getCalledFunction()->getName(), BuiltinArgTypes,
+                             BuiltinArgPointeeTypes, BuiltinArgQuals);
+    assert(!BuiltinName.empty() && BuiltinArgTypes[0]->isPointerTy() &&
+           BuiltinArgPointeeTypes[0] && "Could not demangle async builtin");
+
+    auto *const DataTy = BuiltinArgPointeeTypes[0];
+    const bool IsStrided = ID == eCLBuiltinAsyncWorkGroupStridedCopy;
+
+    auto *const Dst = CI.getArgOperand(0);
+    auto *const Src = CI.getArgOperand(1);
+    auto *const NumElements = CI.getArgOperand(2);
+    auto *const EventIn = CI.getArgOperand(3 + IsStrided);
+
+    // Find out which way the DMA is going and declare the appropriate mux
+    // builtin.
+    const bool IsRead = Dst->getType()->getPointerAddressSpace() ==
+                        compiler::utils::AddressSpace::Local;
+    const auto ElementTypeWidthInBytes =
+        DL.getTypeAllocSize(DataTy).getFixedValue();
+    auto *const ElementSize =
+        ConstantInt::get(NumElements->getType(), ElementTypeWidthInBytes);
+
+    auto *const WidthInBytes =
+        IsStrided ? ElementSize
+                  : B.CreateMul(ElementSize, NumElements, "width.bytes");
+
+    const BuiltinID MuxBuiltinID = [&] {
+      if (IsRead) {
+        return IsStrided ? eMuxBuiltinDMARead2D : eMuxBuiltinDMARead1D;
       } else {
-        // The stride from async_work_group_strided_copy is in elements, but the
-        // stride in the __mux builtins are in bytes so we need to scale the
-        // value.
-        auto *const Stride = CI.getArgOperand(3);
-        auto *const StrideInBytes =
-            B.CreateMul(ElementSize, Stride, "stride.bytes");
-
-        // For async_work_group_strided_copy, the stride only applies to the
-        // global memory, as we are doing scatters/gathers.
-        auto *const DstStride = IsRead ? ElementSize : StrideInBytes;
-        auto *const SrcStride = IsRead ? StrideInBytes : ElementSize;
-
-        NewCI = B.CreateCall(MuxDMA,
-                             {Dst, Src, WidthInBytes, DstStride, SrcStride,
-                              NumElements, EventIn},
-                             "mux.out.event");
+        return IsStrided ? eMuxBuiltinDMAWrite2D : eMuxBuiltinDMAWrite1D;
       }
-      NewCI->setAttributes(MuxDMA->getAttributes());
-      NewCI->takeName(&CI);
-      return NewCI;
-    }
-    case eCLBuiltinAsyncWorkGroupCopy2D2D: {
-      // Unpack the arguments for ease of access.
-      auto *const Dst = CI.getArgOperand(0);
-      auto *const DstOffset = CI.getArgOperand(1);
-      auto *const Src = CI.getArgOperand(2);
-      auto *const SrcOffset = CI.getArgOperand(3);
-      auto *const NumBytesPerEl = CI.getArgOperand(4);
-      auto *const NumElsPerLine = CI.getArgOperand(5);
-      auto *const NumLines = CI.getArgOperand(6);
-      auto *const SrcTotalLineLength = CI.getArgOperand(7);
-      auto *const DstTotalLineLength = CI.getArgOperand(8);
-      auto *const EventIn = CI.getArgOperand(9);
-
-      // Find out which way the DMA is going and declare the appropriate mux
-      // builtin.
-      const bool IsRead = Dst->getType()->getPointerAddressSpace() ==
-                          compiler::utils::AddressSpace::Local;
-      auto *const MuxDMA = BIMuxImpl.getOrDeclareMuxBuiltin(
-          IsRead ? eMuxBuiltinDMARead2D : eMuxBuiltinDMAWrite2D, M,
-          EventIn->getType());
-      assert(MuxDMA && "Could not get/declare mux dma read/write");
-
-      auto *const DstOffsetBytes = B.CreateMul(DstOffset, NumBytesPerEl);
-      auto *const SrcOffsetBytes = B.CreateMul(SrcOffset, NumBytesPerEl);
-      auto *const LineSizeBytes = B.CreateMul(NumElsPerLine, NumBytesPerEl);
-      auto *const ByteTy = B.getInt8Ty();
-      auto *const DstWithOffset = B.CreateGEP(ByteTy, Dst, DstOffsetBytes);
-      auto *const SrcWithOffset = B.CreateGEP(ByteTy, Src, SrcOffsetBytes);
-      auto *const SrcStrideBytes =
-          B.CreateMul(SrcTotalLineLength, NumBytesPerEl);
-      auto *const DstStrideBytes =
-          B.CreateMul(DstTotalLineLength, NumBytesPerEl);
-      auto *const NewCI = B.CreateCall(
-          MuxDMA, {DstWithOffset, SrcWithOffset, LineSizeBytes, DstStrideBytes,
-                   SrcStrideBytes, NumLines, EventIn});
-      NewCI->setAttributes(MuxDMA->getAttributes());
-      NewCI->takeName(&CI);
-      return NewCI;
-    }
-    case eCLBuiltinAsyncWorkGroupCopy3D3D: {
-      auto *const Dst = CI.getArgOperand(0);
-      auto *const DstOffset = CI.getArgOperand(1);
-      auto *const Src = CI.getArgOperand(2);
-      auto *const SrcOffset = CI.getArgOperand(3);
-      auto *const NumBytesPerEl = CI.getArgOperand(4);
-      auto *const NumElsPerLine = CI.getArgOperand(5);
-      auto *const NumLines = CI.getArgOperand(6);
-      auto *const NumPlanes = CI.getArgOperand(7);
-      auto *const SrcTotalLineLength = CI.getArgOperand(8);
-      auto *const SrcTotalPlaneArea = CI.getArgOperand(9);
-      auto *const DstTotalLineLength = CI.getArgOperand(10);
-      auto *const DstTotalPlaneArea = CI.getArgOperand(11);
-      auto *const EventIn = CI.getArgOperand(12);
-
-      // Find out which way the DMA is going and declare the appropriate mux
-      // builtin.
-      const bool IsRead = Dst->getType()->getPointerAddressSpace() ==
-                          compiler::utils::AddressSpace::Local;
-      auto *const MuxDMA = BIMuxImpl.getOrDeclareMuxBuiltin(
-          IsRead ? eMuxBuiltinDMARead3D : eMuxBuiltinDMAWrite3D, M,
-          EventIn->getType());
-      assert(MuxDMA && "Could not get/declare mux dma read/write");
-
-      auto *const DstOffsetBytes = B.CreateMul(DstOffset, NumBytesPerEl);
-      auto *const SrcOffsetBytes = B.CreateMul(SrcOffset, NumBytesPerEl);
-      auto *const LineSizeBytes = B.CreateMul(NumElsPerLine, NumBytesPerEl);
-      auto *const ByteTy = B.getInt8Ty();
-      auto *const DstWithOffset = B.CreateGEP(ByteTy, Dst, DstOffsetBytes);
-      auto *const SrcWithOffset = B.CreateGEP(ByteTy, Src, SrcOffsetBytes);
-      auto *const SrcLineStrideBytes =
-          B.CreateMul(SrcTotalLineLength, NumBytesPerEl);
-      auto *const DstLineStrideBytes =
-          B.CreateMul(DstTotalLineLength, NumBytesPerEl);
-      auto *const SrcPlaneStrideBytes =
-          B.CreateMul(SrcTotalPlaneArea, NumBytesPerEl);
-      auto *const DstPlaneStrideBytes =
-          B.CreateMul(DstTotalPlaneArea, NumBytesPerEl);
-      auto *const NewCI =
-          B.CreateCall(MuxDMA, {DstWithOffset, SrcWithOffset, LineSizeBytes,
-                                DstLineStrideBytes, SrcLineStrideBytes,
-                                NumLines, DstPlaneStrideBytes,
-                                SrcPlaneStrideBytes, NumPlanes, EventIn});
-      NewCI->setAttributes(MuxDMA->getAttributes());
-      NewCI->takeName(&CI);
-      return NewCI;
+    }();
+
+    auto *const MuxDMA =
+        BIMuxImpl.getOrDeclareMuxBuiltin(MuxBuiltinID, M, EventIn->getType());
+    assert(MuxDMA && "Could not get/declare mux dma read/write");
+
+    CallInst *NewCI = nullptr;
+    if (!IsStrided) {
+      NewCI = B.CreateCall(MuxDMA, {Dst, Src, WidthInBytes, EventIn},
+                           "mux.out.event");
+    } else {
+      // The stride from async_work_group_strided_copy is in elements, but the
+      // stride in the __mux builtins are in bytes so we need to scale the
+      // value.
+      auto *const Stride = CI.getArgOperand(3);
+      auto *const StrideInBytes =
+          B.CreateMul(ElementSize, Stride, "stride.bytes");
+
+      // For async_work_group_strided_copy, the stride only applies to the
+      // global memory, as we are doing scatters/gathers.
+      auto *const DstStride = IsRead ? ElementSize : StrideInBytes;
+      auto *const SrcStride = IsRead ? StrideInBytes : ElementSize;
+
+      NewCI = B.CreateCall(
+          MuxDMA,
+          {Dst, Src, WidthInBytes, DstStride, SrcStride, NumElements, EventIn},
+          "mux.out.event");
     }
+    NewCI->setAttributes(MuxDMA->getAttributes());
+    NewCI->takeName(&CI);
+    return NewCI;
+  }
+  case eCLBuiltinAsyncWorkGroupCopy2D2D: {
+    // Unpack the arguments for ease of access.
+    auto *const Dst = CI.getArgOperand(0);
+    auto *const DstOffset = CI.getArgOperand(1);
+    auto *const Src = CI.getArgOperand(2);
+    auto *const SrcOffset = CI.getArgOperand(3);
+    auto *const NumBytesPerEl = CI.getArgOperand(4);
+    auto *const NumElsPerLine = CI.getArgOperand(5);
+    auto *const NumLines = CI.getArgOperand(6);
+    auto *const SrcTotalLineLength = CI.getArgOperand(7);
+    auto *const DstTotalLineLength = CI.getArgOperand(8);
+    auto *const EventIn = CI.getArgOperand(9);
+
+    // Find out which way the DMA is going and declare the appropriate mux
+    // builtin.
+    const bool IsRead = Dst->getType()->getPointerAddressSpace() ==
+                        compiler::utils::AddressSpace::Local;
+    auto *const MuxDMA = BIMuxImpl.getOrDeclareMuxBuiltin(
+        IsRead ? eMuxBuiltinDMARead2D : eMuxBuiltinDMAWrite2D, M,
+        EventIn->getType());
+    assert(MuxDMA && "Could not get/declare mux dma read/write");
+
+    auto *const DstOffsetBytes = B.CreateMul(DstOffset, NumBytesPerEl);
+    auto *const SrcOffsetBytes = B.CreateMul(SrcOffset, NumBytesPerEl);
+    auto *const LineSizeBytes = B.CreateMul(NumElsPerLine, NumBytesPerEl);
+    auto *const ByteTy = B.getInt8Ty();
+    auto *const DstWithOffset = B.CreateGEP(ByteTy, Dst, DstOffsetBytes);
+    auto *const SrcWithOffset = B.CreateGEP(ByteTy, Src, SrcOffsetBytes);
+    auto *const SrcStrideBytes = B.CreateMul(SrcTotalLineLength, NumBytesPerEl);
+    auto *const DstStrideBytes = B.CreateMul(DstTotalLineLength, NumBytesPerEl);
+    auto *const NewCI = B.CreateCall(
+        MuxDMA, {DstWithOffset, SrcWithOffset, LineSizeBytes, DstStrideBytes,
+                 SrcStrideBytes, NumLines, EventIn});
+    NewCI->setAttributes(MuxDMA->getAttributes());
+    NewCI->takeName(&CI);
+    return NewCI;
+  }
+  case eCLBuiltinAsyncWorkGroupCopy3D3D: {
+    auto *const Dst = CI.getArgOperand(0);
+    auto *const DstOffset = CI.getArgOperand(1);
+    auto *const Src = CI.getArgOperand(2);
+    auto *const SrcOffset = CI.getArgOperand(3);
+    auto *const NumBytesPerEl = CI.getArgOperand(4);
+    auto *const NumElsPerLine = CI.getArgOperand(5);
+    auto *const NumLines = CI.getArgOperand(6);
+    auto *const NumPlanes = CI.getArgOperand(7);
+    auto *const SrcTotalLineLength = CI.getArgOperand(8);
+    auto *const SrcTotalPlaneArea = CI.getArgOperand(9);
+    auto *const DstTotalLineLength = CI.getArgOperand(10);
+    auto *const DstTotalPlaneArea = CI.getArgOperand(11);
+    auto *const EventIn = CI.getArgOperand(12);
+
+    // Find out which way the DMA is going and declare the appropriate mux
+    // builtin.
+    const bool IsRead = Dst->getType()->getPointerAddressSpace() ==
+                        compiler::utils::AddressSpace::Local;
+    auto *const MuxDMA = BIMuxImpl.getOrDeclareMuxBuiltin(
+        IsRead ? eMuxBuiltinDMARead3D : eMuxBuiltinDMAWrite3D, M,
+        EventIn->getType());
+    assert(MuxDMA && "Could not get/declare mux dma read/write");
+
+    auto *const DstOffsetBytes = B.CreateMul(DstOffset, NumBytesPerEl);
+    auto *const SrcOffsetBytes = B.CreateMul(SrcOffset, NumBytesPerEl);
+    auto *const LineSizeBytes = B.CreateMul(NumElsPerLine, NumBytesPerEl);
+    auto *const ByteTy = B.getInt8Ty();
+    auto *const DstWithOffset = B.CreateGEP(ByteTy, Dst, DstOffsetBytes);
+    auto *const SrcWithOffset = B.CreateGEP(ByteTy, Src, SrcOffsetBytes);
+    auto *const SrcLineStrideBytes =
+        B.CreateMul(SrcTotalLineLength, NumBytesPerEl);
+    auto *const DstLineStrideBytes =
+        B.CreateMul(DstTotalLineLength, NumBytesPerEl);
+    auto *const SrcPlaneStrideBytes =
+        B.CreateMul(SrcTotalPlaneArea, NumBytesPerEl);
+    auto *const DstPlaneStrideBytes =
+        B.CreateMul(DstTotalPlaneArea, NumBytesPerEl);
+    auto *const NewCI = B.CreateCall(
+        MuxDMA, {DstWithOffset, SrcWithOffset, LineSizeBytes,
+                 DstLineStrideBytes, SrcLineStrideBytes, NumLines,
+                 DstPlaneStrideBytes, SrcPlaneStrideBytes, NumPlanes, EventIn});
+    NewCI->setAttributes(MuxDMA->getAttributes());
+    NewCI->takeName(&CI);
+    return NewCI;
+  }
   }
 
   return nullptr;
@@ -3643,5 +3650,5 @@ Function *CLBuiltinLoader::materializeBuiltin(StringRef BuiltinName,
 
   return cast<Function>(ValueMap[SrcBuiltin]);
 }
-}  // namespace utils
-}  // namespace compiler
+} // namespace utils
+} // namespace compiler
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/define_mux_builtins_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/define_mux_builtins_pass.cpp
index 61f8d0a83d7b1..a176ace88c196 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/define_mux_builtins_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/define_mux_builtins_pass.cpp
@@ -22,8 +22,9 @@
 
 using namespace llvm;
 
-PreservedAnalyses compiler::utils::DefineMuxBuiltinsPass::run(
-    Module &M, ModuleAnalysisManager &AM) {
+PreservedAnalyses
+compiler::utils::DefineMuxBuiltinsPass::run(Module &M,
+                                            ModuleAnalysisManager &AM) {
   bool Changed = false;
   auto &BI = AM.getResult<BuiltinInfoAnalysis>(M);
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/dma.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/dma.cpp
index 310fa182fc1d3..66cb934125195 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/dma.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/dma.cpp
@@ -70,5 +70,5 @@ llvm::StructType *getOrCreateMuxDMAEventType(llvm::Module &m) {
 
   return llvm::StructType::create(m.getContext(), MuxBuiltins::dma_event_type);
 }
-}  // namespace utils
-}  // namespace compiler
+} // namespace utils
+} // namespace compiler
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/encode_kernel_metadata_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/encode_kernel_metadata_pass.cpp
index 3e065a052f8f0..5b4db40b0d6be 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/encode_kernel_metadata_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/encode_kernel_metadata_pass.cpp
@@ -21,8 +21,9 @@
 
 using namespace llvm;
 
-PreservedAnalyses compiler::utils::TransferKernelMetadataPass::run(
-    Module &M, ModuleAnalysisManager &) {
+PreservedAnalyses
+compiler::utils::TransferKernelMetadataPass::run(Module &M,
+                                                 ModuleAnalysisManager &) {
   SmallVector<KernelInfo, 4> Kernels;
   populateKernelList(M, Kernels);
 
@@ -39,8 +40,9 @@ PreservedAnalyses compiler::utils::TransferKernelMetadataPass::run(
   return PreservedAnalyses::all();
 }
 
-PreservedAnalyses compiler::utils::EncodeKernelMetadataPass::run(
-    Module &M, ModuleAnalysisManager &) {
+PreservedAnalyses
+compiler::utils::EncodeKernelMetadataPass::run(Module &M,
+                                               ModuleAnalysisManager &) {
   if (auto *F = M.getFunction(KernelName)) {
     setOrigFnName(*F);
     setIsKernelEntryPt(*F);
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/group_collective_helpers.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/group_collective_helpers.cpp
index 04b15ff2cf79e..ace34338bb5d5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/group_collective_helpers.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/group_collective_helpers.cpp
@@ -25,40 +25,38 @@ using namespace llvm;
 static llvm::Constant *getNeutralIdentityHelper(RecurKind Kind, Type *Ty,
                                                 bool UseNaN, bool UseFZero) {
   switch (Kind) {
-    default:
-      return nullptr;
-    case RecurKind::And:
-      return ConstantInt::getAllOnesValue(Ty);
-    case RecurKind::Or:
-    case RecurKind::Add:
-    case RecurKind::Xor:
-      return ConstantInt::getNullValue(Ty);
-    case RecurKind::SMin:
-      return ConstantInt::get(
-          Ty, APInt::getSignedMaxValue(Ty->getScalarSizeInBits()));
-    case RecurKind::SMax:
-      return ConstantInt::get(
-          Ty, APInt::getSignedMinValue(Ty->getScalarSizeInBits()));
-    case RecurKind::UMin:
-      return ConstantInt::get(Ty,
-                              APInt::getMaxValue(Ty->getScalarSizeInBits()));
-    case RecurKind::UMax:
-      return ConstantInt::get(Ty,
-                              APInt::getMinValue(Ty->getScalarSizeInBits()));
-    case RecurKind::FAdd:
-      // -0.0 + 0.0 = 0.0 meaning -0.0 (not 0.0) is the neutral value for floats
-      // under addition.
-      return UseFZero ? ConstantFP::get(Ty, 0.0) : ConstantFP::get(Ty, -0.0);
-    case RecurKind::FMin:
-      return UseNaN ? ConstantFP::getQNaN(Ty, /*Negative*/ false)
-                    : ConstantFP::getInfinity(Ty, /*Negative*/ false);
-    case RecurKind::FMax:
-      return UseNaN ? ConstantFP::getQNaN(Ty, /*Negative*/ true)
-                    : ConstantFP::getInfinity(Ty, /*Negative*/ true);
-    case RecurKind::Mul:
-      return ConstantInt::get(Ty, 1);
-    case RecurKind::FMul:
-      return ConstantFP::get(Ty, 1.0);
+  default:
+    return nullptr;
+  case RecurKind::And:
+    return ConstantInt::getAllOnesValue(Ty);
+  case RecurKind::Or:
+  case RecurKind::Add:
+  case RecurKind::Xor:
+    return ConstantInt::getNullValue(Ty);
+  case RecurKind::SMin:
+    return ConstantInt::get(
+        Ty, APInt::getSignedMaxValue(Ty->getScalarSizeInBits()));
+  case RecurKind::SMax:
+    return ConstantInt::get(
+        Ty, APInt::getSignedMinValue(Ty->getScalarSizeInBits()));
+  case RecurKind::UMin:
+    return ConstantInt::get(Ty, APInt::getMaxValue(Ty->getScalarSizeInBits()));
+  case RecurKind::UMax:
+    return ConstantInt::get(Ty, APInt::getMinValue(Ty->getScalarSizeInBits()));
+  case RecurKind::FAdd:
+    // -0.0 + 0.0 = 0.0 meaning -0.0 (not 0.0) is the neutral value for floats
+    // under addition.
+    return UseFZero ? ConstantFP::get(Ty, 0.0) : ConstantFP::get(Ty, -0.0);
+  case RecurKind::FMin:
+    return UseNaN ? ConstantFP::getQNaN(Ty, /*Negative*/ false)
+                  : ConstantFP::getInfinity(Ty, /*Negative*/ false);
+  case RecurKind::FMax:
+    return UseNaN ? ConstantFP::getQNaN(Ty, /*Negative*/ true)
+                  : ConstantFP::getInfinity(Ty, /*Negative*/ true);
+  case RecurKind::Mul:
+    return ConstantInt::get(Ty, 1);
+  case RecurKind::FMul:
+    return ConstantFP::get(Ty, 1.0);
   }
 }
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mangling.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mangling.cpp
index d911e18b51977..d31b3022c7eb5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mangling.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mangling.cpp
@@ -49,10 +49,10 @@ std::string NameMangler::mangleName(StringRef Name, ArrayRef<Type *> Tys,
   return MangledName;
 }
 
-StringRef NameMangler::demangleName(
-    StringRef Name, SmallVectorImpl<llvm::Type *> &Types,
-    SmallVectorImpl<llvm::Type *> &PointerElementTypes,
-    SmallVectorImpl<TypeQualifiers> &Quals) {
+StringRef
+NameMangler::demangleName(StringRef Name, SmallVectorImpl<llvm::Type *> &Types,
+                          SmallVectorImpl<llvm::Type *> &PointerElementTypes,
+                          SmallVectorImpl<TypeQualifiers> &Quals) {
   // Parse the name part.
   Lexer L(Name);
   Name = demangleName(L);
@@ -152,49 +152,49 @@ bool NameMangler::emitSubstitution(raw_ostream &O, Type *Ty,
 bool NameMangler::isTypeBuiltin(Type *Ty, TypeQualifiers &Quals) {
   (void)Quals;
   switch (Ty->getTypeID()) {
-    default:
-    case Type::StructTyID:
-    case Type::ArrayTyID:
-    case Type::PointerTyID:
-    case Type::FixedVectorTyID:
-      return false;
-    case Type::VoidTyID:
-    case Type::HalfTyID:
-    case Type::FloatTyID:
-    case Type::DoubleTyID:
-    case Type::IntegerTyID:
-      return true;
+  default:
+  case Type::StructTyID:
+  case Type::ArrayTyID:
+  case Type::PointerTyID:
+  case Type::FixedVectorTyID:
+    return false;
+  case Type::VoidTyID:
+  case Type::HalfTyID:
+  case Type::FloatTyID:
+  case Type::DoubleTyID:
+  case Type::IntegerTyID:
+    return true;
   }
 }
 
 const char *NameMangler::mangleSimpleType(Type *Ty, TypeQualifier Qual) {
   const bool IsSigned = (Qual & eTypeQualSignedInt);
   switch (Ty->getTypeID()) {
+  default:
+    break;
+  case Type::VoidTyID:
+    return "v";
+  case Type::HalfTyID:
+    return "Dh";
+  case Type::FloatTyID:
+    return "f";
+  case Type::DoubleTyID:
+    return "d";
+  case Type::IntegerTyID:
+    switch (cast<IntegerType>(Ty)->getBitWidth()) {
     default:
       break;
-    case Type::VoidTyID:
-      return "v";
-    case Type::HalfTyID:
-      return "Dh";
-    case Type::FloatTyID:
-      return "f";
-    case Type::DoubleTyID:
-      return "d";
-    case Type::IntegerTyID:
-      switch (cast<IntegerType>(Ty)->getBitWidth()) {
-        default:
-          break;
-        case 1:
-          return "b";  // bool
-        case 8:
-          return IsSigned ? "c" : "h";
-        case 16:
-          return IsSigned ? "s" : "t";
-        case 32:
-          return IsSigned ? "i" : "j";
-        case 64:
-          return IsSigned ? "l" : "m";
-      }
+    case 1:
+      return "b"; // bool
+    case 8:
+      return IsSigned ? "c" : "h";
+    case 16:
+      return IsSigned ? "s" : "t";
+    case 32:
+      return IsSigned ? "i" : "j";
+    case 64:
+      return IsSigned ? "l" : "m";
+    }
   }
   return nullptr;
 }
@@ -272,54 +272,54 @@ bool NameMangler::demangleSimpleType(Lexer &L, Type *&Ty, TypeQualifier &Qual) {
   }
 
   switch (c) {
-    default:
+  default:
+    return false;
+  case 'v':
+    Ty = llvm::Type::getVoidTy(*Context);
+    break;
+  case 'D':
+    if (!L.Consume("Dh")) {
       return false;
-    case 'v':
-      Ty = llvm::Type::getVoidTy(*Context);
-      break;
-    case 'D':
-      if (!L.Consume("Dh")) {
-        return false;
-      }
-      Ty = llvm::Type::getHalfTy(*Context);
-      return true;
-    case 'f':
-      Ty = llvm::Type::getFloatTy(*Context);
-      break;
-    case 'd':
-      Ty = llvm::Type::getDoubleTy(*Context);
-      break;
-    case 'b':
-      Ty = llvm::Type::getInt1Ty(*Context);
-      break;
-    case 'c':
-    case 'h':
-      Ty = llvm::Type::getInt8Ty(*Context);
-      if (c == 'c') {
-        Qual = eTypeQualSignedInt;
-      }
-      break;
-    case 's':
-    case 't':
-      Ty = llvm::Type::getInt16Ty(*Context);
-      if (c == 's') {
-        Qual = eTypeQualSignedInt;
-      }
-      break;
-    case 'i':
-    case 'j':
-      Ty = llvm::Type::getInt32Ty(*Context);
-      if (c == 'i') {
-        Qual = eTypeQualSignedInt;
-      }
-      break;
-    case 'l':
-    case 'm':
-      Ty = llvm::Type::getInt64Ty(*Context);
-      if (c == 'l') {
-        Qual = eTypeQualSignedInt;
-      }
-      break;
+    }
+    Ty = llvm::Type::getHalfTy(*Context);
+    return true;
+  case 'f':
+    Ty = llvm::Type::getFloatTy(*Context);
+    break;
+  case 'd':
+    Ty = llvm::Type::getDoubleTy(*Context);
+    break;
+  case 'b':
+    Ty = llvm::Type::getInt1Ty(*Context);
+    break;
+  case 'c':
+  case 'h':
+    Ty = llvm::Type::getInt8Ty(*Context);
+    if (c == 'c') {
+      Qual = eTypeQualSignedInt;
+    }
+    break;
+  case 's':
+  case 't':
+    Ty = llvm::Type::getInt16Ty(*Context);
+    if (c == 's') {
+      Qual = eTypeQualSignedInt;
+    }
+    break;
+  case 'i':
+  case 'j':
+    Ty = llvm::Type::getInt32Ty(*Context);
+    if (c == 'i') {
+      Qual = eTypeQualSignedInt;
+    }
+    break;
+  case 'l':
+  case 'm':
+    Ty = llvm::Type::getInt64Ty(*Context);
+    if (c == 'l') {
+      Qual = eTypeQualSignedInt;
+    }
+    break;
   }
   L.Consume();
   return true;
@@ -354,20 +354,20 @@ std::optional<std::string> NameMangler::mangleBuiltinType(Type *Ty) {
   std::string MangledName = "ocl_image";
 
   switch (Dim) {
-    default:
-      return std::nullopt;
-    case tgtext::ImageDim1D:
-      MangledName += "1d";
-      break;
-    case tgtext::ImageDim2D:
-      MangledName += "2d";
-      break;
-    case tgtext::ImageDim3D:
-      MangledName += "3d";
-      break;
-    case tgtext::ImageDimBuffer:
-      MangledName += "1dbuffer";
-      break;
+  default:
+    return std::nullopt;
+  case tgtext::ImageDim1D:
+    MangledName += "1d";
+    break;
+  case tgtext::ImageDim2D:
+    MangledName += "2d";
+    break;
+  case tgtext::ImageDim3D:
+    MangledName += "3d";
+    break;
+  case tgtext::ImageDimBuffer:
+    MangledName += "1dbuffer";
+    break;
   }
 
   if (Arrayed == tgtext::ImageArrayed) {
@@ -511,20 +511,20 @@ static std::optional<PointerASQuals> demanglePointerQuals(Lexer &L) {
   }
 
   switch (L.Current()) {
-    default:
-      break;
-    case 'K':
-      PointerQual = eTypeQualPointerConst;
-      L.Consume();
-      break;
-    case 'r':
-      PointerQual = eTypeQualPointerRestrict;
-      L.Consume();
-      break;
-    case 'V':
-      PointerQual = eTypeQualPointerVolatile;
-      L.Consume();
-      break;
+  default:
+    break;
+  case 'K':
+    PointerQual = eTypeQualPointerConst;
+    L.Consume();
+    break;
+  case 'r':
+    PointerQual = eTypeQualPointerRestrict;
+    L.Consume();
+    break;
+  case 'V':
+    PointerQual = eTypeQualPointerVolatile;
+    L.Consume();
+    break;
   }
 
   if (!DemangledAS && L.Consume("U3AS") && !L.ConsumeInteger(AddressSpace)) {
@@ -885,5 +885,5 @@ bool Lexer::ConsumeWhitespace() {
 
   return consumed;
 }
-}  // namespace utils
-}  // namespace compiler
+} // namespace utils
+} // namespace compiler
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/metadata.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/metadata.cpp
index 85a9ef5dd2929..985008873c7a8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/metadata.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/metadata.cpp
@@ -291,8 +291,8 @@ std::optional<unsigned> isSchedulingParameter(const Function &f, unsigned idx) {
 }
 
 // Uses the format of a metadata node directly applied to a function.
-std::optional<std::array<uint64_t, 3>> parseRequiredWGSMetadata(
-    const Function &f) {
+std::optional<std::array<uint64_t, 3>>
+parseRequiredWGSMetadata(const Function &f) {
   if (auto mdnode = f.getMetadata(ReqdWGSizeMD)) {
     std::array<uint64_t, 3> wgs = {0, 1, 1};
     assert(mdnode->getNumOperands() >= 1 && mdnode->getNumOperands() <= 3 &&
@@ -306,8 +306,8 @@ std::optional<std::array<uint64_t, 3>> parseRequiredWGSMetadata(
 }
 
 // Uses the format of a metadata node that's a part of the opencl.kernels node.
-std::optional<std::array<uint64_t, 3>> parseRequiredWGSMetadata(
-    const MDNode &node) {
+std::optional<std::array<uint64_t, 3>>
+parseRequiredWGSMetadata(const MDNode &node) {
   for (uint32_t i = 1; i < node.getNumOperands(); ++i) {
     MDNode *const subNode = cast<MDNode>(node.getOperand(i));
     MDString *const operandName = cast<MDString>(subNode->getOperand(0));
@@ -391,5 +391,5 @@ std::optional<uint32_t> getReqdSubgroupSize(const Function &f) {
   return std::nullopt;
 }
 
-}  // namespace utils
-}  // namespace compiler
+} // namespace utils
+} // namespace compiler
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mux_builtin_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mux_builtin_info.cpp
index 788d493310eb6..51268147e1345 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mux_builtin_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mux_builtin_info.cpp
@@ -50,33 +50,33 @@ static Function *defineLocalWorkItemBuiltin(BIMuxInfoConcept &BI, BuiltinID ID,
   bool HasRankArg = false;
   std::optional<WorkItemInfoStructField::Type> WIFieldIdx;
   switch (ID) {
-    default:
-      return nullptr;
-    case eMuxBuiltinSetLocalId:
-      IsSetter = true;
-      LLVM_FALLTHROUGH;
-    case eMuxBuiltinGetLocalId:
-      HasRankArg = true;
-      WIFieldIdx = WorkItemInfoStructField::local_id;
-      break;
-    case eMuxBuiltinSetSubGroupId:
-      IsSetter = true;
-      LLVM_FALLTHROUGH;
-    case eMuxBuiltinGetSubGroupId:
-      WIFieldIdx = WorkItemInfoStructField::sub_group_id;
-      break;
-    case eMuxBuiltinSetNumSubGroups:
-      IsSetter = true;
-      LLVM_FALLTHROUGH;
-    case eMuxBuiltinGetNumSubGroups:
-      WIFieldIdx = WorkItemInfoStructField::num_sub_groups;
-      break;
-    case eMuxBuiltinSetMaxSubGroupSize:
-      IsSetter = true;
-      LLVM_FALLTHROUGH;
-    case eMuxBuiltinGetMaxSubGroupSize:
-      WIFieldIdx = WorkItemInfoStructField::max_sub_group_size;
-      break;
+  default:
+    return nullptr;
+  case eMuxBuiltinSetLocalId:
+    IsSetter = true;
+    LLVM_FALLTHROUGH;
+  case eMuxBuiltinGetLocalId:
+    HasRankArg = true;
+    WIFieldIdx = WorkItemInfoStructField::local_id;
+    break;
+  case eMuxBuiltinSetSubGroupId:
+    IsSetter = true;
+    LLVM_FALLTHROUGH;
+  case eMuxBuiltinGetSubGroupId:
+    WIFieldIdx = WorkItemInfoStructField::sub_group_id;
+    break;
+  case eMuxBuiltinSetNumSubGroups:
+    IsSetter = true;
+    LLVM_FALLTHROUGH;
+  case eMuxBuiltinGetNumSubGroups:
+    WIFieldIdx = WorkItemInfoStructField::num_sub_groups;
+    break;
+  case eMuxBuiltinSetMaxSubGroupSize:
+    IsSetter = true;
+    LLVM_FALLTHROUGH;
+  case eMuxBuiltinGetMaxSubGroupSize:
+    WIFieldIdx = WorkItemInfoStructField::max_sub_group_size;
+    break;
   }
 
   Function *F = M.getFunction(BuiltinInfo::getMuxBuiltinName(ID));
@@ -110,29 +110,29 @@ static Function *defineLocalWorkGroupBuiltin(BIMuxInfoConcept &BI, BuiltinID ID,
   size_t DefaultVal = 0;
   std::optional<WorkGroupInfoStructField::Type> WGFieldIdx;
   switch (ID) {
-    default:
-      return nullptr;
-    case eMuxBuiltinGetLocalSize:
-      DefaultVal = 1;
-      WGFieldIdx = WorkGroupInfoStructField::local_size;
-      break;
-    case eMuxBuiltinGetGroupId:
-      DefaultVal = 0;
-      WGFieldIdx = WorkGroupInfoStructField::group_id;
-      break;
-    case eMuxBuiltinGetNumGroups:
-      DefaultVal = 1;
-      WGFieldIdx = WorkGroupInfoStructField::num_groups;
-      break;
-    case eMuxBuiltinGetGlobalOffset:
-      DefaultVal = 0;
-      WGFieldIdx = WorkGroupInfoStructField::global_offset;
-      break;
-    case eMuxBuiltinGetWorkDim:
-      DefaultVal = 1;
-      HasRankArg = false;
-      WGFieldIdx = WorkGroupInfoStructField::work_dim;
-      break;
+  default:
+    return nullptr;
+  case eMuxBuiltinGetLocalSize:
+    DefaultVal = 1;
+    WGFieldIdx = WorkGroupInfoStructField::local_size;
+    break;
+  case eMuxBuiltinGetGroupId:
+    DefaultVal = 0;
+    WGFieldIdx = WorkGroupInfoStructField::group_id;
+    break;
+  case eMuxBuiltinGetNumGroups:
+    DefaultVal = 1;
+    WGFieldIdx = WorkGroupInfoStructField::num_groups;
+    break;
+  case eMuxBuiltinGetGlobalOffset:
+    DefaultVal = 0;
+    WGFieldIdx = WorkGroupInfoStructField::global_offset;
+    break;
+  case eMuxBuiltinGetWorkDim:
+    DefaultVal = 1;
+    HasRankArg = false;
+    WGFieldIdx = WorkGroupInfoStructField::work_dim;
+    break;
   }
 
   Function *F = M.getFunction(BuiltinInfo::getMuxBuiltinName(ID));
@@ -167,61 +167,61 @@ static Function *defineSubGroupGroupOpBuiltin(Function &F,
 
   [&] {
     switch (GroupOp.Op) {
-      case GroupCollective::OpKind::Any:
-      case GroupCollective::OpKind::All:
-      case GroupCollective::OpKind::Broadcast:
-      case GroupCollective::OpKind::Reduction:
-      case GroupCollective::OpKind::ScanInclusive:
-        // In the trivial size=1 case, all of these operations just return the
-        // argument back again
-        B.CreateRet(Arg);
-        return;
-      case GroupCollective::OpKind::ScanExclusive: {
-        // In the trivial size=1 case, exclusive scans return the identity.
-        assert(!OverloadInfo.empty());
-        auto *const IdentityVal =
-            getIdentityVal(GroupOp.Recurrence, OverloadInfo[0]);
-        assert(IdentityVal && "Unable to deduce identity val");
-        B.CreateRet(IdentityVal);
-        return;
-      }
-      case GroupCollective::OpKind::Shuffle:
-      case GroupCollective::OpKind::ShuffleXor:
-        // In the trivial size=1 case, all of these operations just return the
-        // argument back again. Any computed shuffle index other than the only
-        // one in the sub-group would be out of bounds anyway.
-        B.CreateRet(Arg);
-        return;
-      case GroupCollective::OpKind::ShuffleUp: {
-        auto *const Prev = F.getArg(0);
-        auto *const Curr = F.getArg(1);
-        auto *const Delta = F.getArg(2);
-        // In the trivial size=1 case, negative delta is the desired index
-        // (since we're subtracting it from zero). If it's greater than zero and
-        // less than the size, we return 'current', else if it's less than zero
-        // and greater than or equal to the negative size, we return 'prev'. So
-        // if 'delta' is zero, return 'current', else return 'prev'. Anything
-        // else is out of bounds so we can simplify things here.
-        auto *const EqZero = B.CreateICmpEQ(Delta, B.getInt32(0), "eqzero");
-        auto *const Sel = B.CreateSelect(EqZero, Curr, Prev, "sel");
-        B.CreateRet(Sel);
-        return;
-      }
-      case GroupCollective::OpKind::ShuffleDown: {
-        auto *const Curr = F.getArg(0);
-        auto *const Next = F.getArg(1);
-        auto *const Delta = F.getArg(2);
-        // In the trivial size=1 case, the delta is the desired index (since
-        // we're adding it to zero). If it's less than the size, we return
-        // 'current', else if it's greater or equal to the size but less than
-        // twice the size, we return 'next'. So if 'delta' is zero, return
-        // 'current', else return 'next'. Anything else is out of bounds so we
-        // can simplify things here.
-        auto *const EqZero = B.CreateICmpEQ(Delta, B.getInt32(0), "eqzero");
-        auto *const Sel = B.CreateSelect(EqZero, Curr, Next, "sel");
-        B.CreateRet(Sel);
-        return;
-      }
+    case GroupCollective::OpKind::Any:
+    case GroupCollective::OpKind::All:
+    case GroupCollective::OpKind::Broadcast:
+    case GroupCollective::OpKind::Reduction:
+    case GroupCollective::OpKind::ScanInclusive:
+      // In the trivial size=1 case, all of these operations just return the
+      // argument back again
+      B.CreateRet(Arg);
+      return;
+    case GroupCollective::OpKind::ScanExclusive: {
+      // In the trivial size=1 case, exclusive scans return the identity.
+      assert(!OverloadInfo.empty());
+      auto *const IdentityVal =
+          getIdentityVal(GroupOp.Recurrence, OverloadInfo[0]);
+      assert(IdentityVal && "Unable to deduce identity val");
+      B.CreateRet(IdentityVal);
+      return;
+    }
+    case GroupCollective::OpKind::Shuffle:
+    case GroupCollective::OpKind::ShuffleXor:
+      // In the trivial size=1 case, all of these operations just return the
+      // argument back again. Any computed shuffle index other than the only
+      // one in the sub-group would be out of bounds anyway.
+      B.CreateRet(Arg);
+      return;
+    case GroupCollective::OpKind::ShuffleUp: {
+      auto *const Prev = F.getArg(0);
+      auto *const Curr = F.getArg(1);
+      auto *const Delta = F.getArg(2);
+      // In the trivial size=1 case, negative delta is the desired index
+      // (since we're subtracting it from zero). If it's greater than zero and
+      // less than the size, we return 'current', else if it's less than zero
+      // and greater than or equal to the negative size, we return 'prev'. So
+      // if 'delta' is zero, return 'current', else return 'prev'. Anything
+      // else is out of bounds so we can simplify things here.
+      auto *const EqZero = B.CreateICmpEQ(Delta, B.getInt32(0), "eqzero");
+      auto *const Sel = B.CreateSelect(EqZero, Curr, Prev, "sel");
+      B.CreateRet(Sel);
+      return;
+    }
+    case GroupCollective::OpKind::ShuffleDown: {
+      auto *const Curr = F.getArg(0);
+      auto *const Next = F.getArg(1);
+      auto *const Delta = F.getArg(2);
+      // In the trivial size=1 case, the delta is the desired index (since
+      // we're adding it to zero). If it's less than the size, we return
+      // 'current', else if it's greater or equal to the size but less than
+      // twice the size, we return 'next'. So if 'delta' is zero, return
+      // 'current', else return 'next'. Anything else is out of bounds so we
+      // can simplify things here.
+      auto *const EqZero = B.CreateICmpEQ(Delta, B.getInt32(0), "eqzero");
+      auto *const Sel = B.CreateSelect(EqZero, Curr, Next, "sel");
+      B.CreateRet(Sel);
+      return;
+    }
     }
 
     llvm_unreachable("Unhandled group operation");
@@ -790,41 +790,41 @@ Function *BIMuxInfoConcept::defineMuxBuiltin(BuiltinID ID, Module &M,
   }
 
   switch (ID) {
-    default:
-      break;
-    case eMuxBuiltinGetGlobalId:
-      return defineGetGlobalId(M);
-    case eMuxBuiltinGetGlobalSize:
-      return defineGetGlobalSize(M);
-    case eMuxBuiltinGetLocalLinearId:
-      return defineGetLocalLinearId(M);
-    case eMuxBuiltinGetGlobalLinearId:
-      return defineGetGlobalLinearId(M);
-    case eMuxBuiltinGetEnqueuedLocalSize:
-      return defineGetEnqueuedLocalSize(M);
-    // Just handle the memory synchronization requirements of any barrier
-    // builtin. We assume that the control requirements of work-group and
-    // sub-group control barriers have been handled by earlier passes.
-    case eMuxBuiltinMemBarrier:
-      return defineMemBarrier(*F, 0, 1);
-    case eMuxBuiltinSubGroupBarrier:
-    case eMuxBuiltinWorkGroupBarrier:
-      return defineMemBarrier(*F, 1, 2);
-    case eMuxBuiltinDMARead1D:
-    case eMuxBuiltinDMAWrite1D:
-      return defineDMA1D(*F);
-    case eMuxBuiltinDMARead2D:
-    case eMuxBuiltinDMAWrite2D:
-      return defineDMA2D(*F);
-    case eMuxBuiltinDMARead3D:
-    case eMuxBuiltinDMAWrite3D:
-      return defineDMA3D(*F);
-    case eMuxBuiltinDMAWait:
-      return defineDMAWait(*F);
-    case eMuxBuiltinGetSubGroupSize:
-      return defineGetSubGroupSize(*F);
-    case eMuxBuiltinGetSubGroupLocalId:
-      return defineGetSubGroupLocalId(*F);
+  default:
+    break;
+  case eMuxBuiltinGetGlobalId:
+    return defineGetGlobalId(M);
+  case eMuxBuiltinGetGlobalSize:
+    return defineGetGlobalSize(M);
+  case eMuxBuiltinGetLocalLinearId:
+    return defineGetLocalLinearId(M);
+  case eMuxBuiltinGetGlobalLinearId:
+    return defineGetGlobalLinearId(M);
+  case eMuxBuiltinGetEnqueuedLocalSize:
+    return defineGetEnqueuedLocalSize(M);
+  // Just handle the memory synchronization requirements of any barrier
+  // builtin. We assume that the control requirements of work-group and
+  // sub-group control barriers have been handled by earlier passes.
+  case eMuxBuiltinMemBarrier:
+    return defineMemBarrier(*F, 0, 1);
+  case eMuxBuiltinSubGroupBarrier:
+  case eMuxBuiltinWorkGroupBarrier:
+    return defineMemBarrier(*F, 1, 2);
+  case eMuxBuiltinDMARead1D:
+  case eMuxBuiltinDMAWrite1D:
+    return defineDMA1D(*F);
+  case eMuxBuiltinDMARead2D:
+  case eMuxBuiltinDMAWrite2D:
+    return defineDMA2D(*F);
+  case eMuxBuiltinDMARead3D:
+  case eMuxBuiltinDMAWrite3D:
+    return defineDMA3D(*F);
+  case eMuxBuiltinDMAWait:
+    return defineDMAWait(*F);
+  case eMuxBuiltinGetSubGroupSize:
+    return defineGetSubGroupSize(*F);
+  case eMuxBuiltinGetSubGroupLocalId:
+    return defineGetSubGroupLocalId(*F);
   }
 
   if (auto *const NewF = defineLocalWorkItemBuiltin(*this, ID, M)) {
@@ -847,32 +847,32 @@ Function *BIMuxInfoConcept::defineMuxBuiltin(BuiltinID ID, Module &M,
 
 bool BIMuxInfoConcept::requiresSchedulingParameters(BuiltinID ID) {
   switch (ID) {
-    default:
-      return false;
-    case eMuxBuiltinGetLocalId:
-    case eMuxBuiltinSetLocalId:
-    case eMuxBuiltinGetSubGroupId:
-    case eMuxBuiltinSetSubGroupId:
-    case eMuxBuiltinGetNumSubGroups:
-    case eMuxBuiltinSetNumSubGroups:
-    case eMuxBuiltinGetMaxSubGroupSize:
-    case eMuxBuiltinSetMaxSubGroupSize:
-    case eMuxBuiltinGetLocalLinearId:
-      // Work-item struct only
-      return true;
-    case eMuxBuiltinGetWorkDim:
-    case eMuxBuiltinGetGroupId:
-    case eMuxBuiltinGetNumGroups:
-    case eMuxBuiltinGetGlobalSize:
-    case eMuxBuiltinGetLocalSize:
-    case eMuxBuiltinGetGlobalOffset:
-    case eMuxBuiltinGetEnqueuedLocalSize:
-      // Work-group struct only
-      return true;
-    case eMuxBuiltinGetGlobalId:
-    case eMuxBuiltinGetGlobalLinearId:
-      // Work-item and work-group structs
-      return true;
+  default:
+    return false;
+  case eMuxBuiltinGetLocalId:
+  case eMuxBuiltinSetLocalId:
+  case eMuxBuiltinGetSubGroupId:
+  case eMuxBuiltinSetSubGroupId:
+  case eMuxBuiltinGetNumSubGroups:
+  case eMuxBuiltinSetNumSubGroups:
+  case eMuxBuiltinGetMaxSubGroupSize:
+  case eMuxBuiltinSetMaxSubGroupSize:
+  case eMuxBuiltinGetLocalLinearId:
+    // Work-item struct only
+    return true;
+  case eMuxBuiltinGetWorkDim:
+  case eMuxBuiltinGetGroupId:
+  case eMuxBuiltinGetNumGroups:
+  case eMuxBuiltinGetGlobalSize:
+  case eMuxBuiltinGetLocalSize:
+  case eMuxBuiltinGetGlobalOffset:
+  case eMuxBuiltinGetEnqueuedLocalSize:
+    // Work-group struct only
+    return true;
+  case eMuxBuiltinGetGlobalId:
+  case eMuxBuiltinGetGlobalLinearId:
+    // Work-item and work-group structs
+    return true;
   }
 }
 
@@ -902,8 +902,9 @@ Type *BIMuxInfoConcept::getRemappedTargetExtTy(Type *Ty, Module &M) {
   return nullptr;
 }
 
-Function *BIMuxInfoConcept::getOrDeclareMuxBuiltin(
-    BuiltinID ID, Module &M, ArrayRef<Type *> OverloadInfo) {
+Function *
+BIMuxInfoConcept::getOrDeclareMuxBuiltin(BuiltinID ID, Module &M,
+                                         ArrayRef<Type *> OverloadInfo) {
   assert(BuiltinInfo::isMuxBuiltinID(ID) && "Only handling mux builtins");
   auto FnName = BuiltinInfo::getMuxBuiltinName(ID, OverloadInfo);
   if (auto *const F = M.getFunction(FnName)) {
@@ -920,236 +921,232 @@ Function *BIMuxInfoConcept::getOrDeclareMuxBuiltin(
   SmallVector<std::string, 4> ParamNames;
 
   switch (ID) {
-    // Ranked Getters
-    case eMuxBuiltinGetLocalId:
-    case eMuxBuiltinGetGlobalId:
-    case eMuxBuiltinGetLocalSize:
-    case eMuxBuiltinGetGlobalSize:
-    case eMuxBuiltinGetGlobalOffset:
-    case eMuxBuiltinGetNumGroups:
-    case eMuxBuiltinGetGroupId:
-    case eMuxBuiltinGetEnqueuedLocalSize:
+  // Ranked Getters
+  case eMuxBuiltinGetLocalId:
+  case eMuxBuiltinGetGlobalId:
+  case eMuxBuiltinGetLocalSize:
+  case eMuxBuiltinGetGlobalSize:
+  case eMuxBuiltinGetGlobalOffset:
+  case eMuxBuiltinGetNumGroups:
+  case eMuxBuiltinGetGroupId:
+  case eMuxBuiltinGetEnqueuedLocalSize:
+    ParamTys.push_back(Int32Ty);
+    ParamNames.push_back("idx");
+    LLVM_FALLTHROUGH;
+  // Unranked Getters
+  case eMuxBuiltinGetWorkDim:
+  case eMuxBuiltinGetSubGroupId:
+  case eMuxBuiltinGetNumSubGroups:
+  case eMuxBuiltinGetSubGroupSize:
+  case eMuxBuiltinGetMaxSubGroupSize:
+  case eMuxBuiltinGetSubGroupLocalId:
+  case eMuxBuiltinGetLocalLinearId:
+  case eMuxBuiltinGetGlobalLinearId: {
+    // Some builtins return uint, others return size_t
+    RetTy =
+        (ID == eMuxBuiltinGetWorkDim || ID == eMuxBuiltinGetSubGroupId ||
+         ID == eMuxBuiltinGetNumSubGroups || ID == eMuxBuiltinGetSubGroupSize ||
+         ID == eMuxBuiltinGetMaxSubGroupSize ||
+         ID == eMuxBuiltinGetSubGroupLocalId)
+            ? Int32Ty
+            : SizeTy;
+    // All of our mux getters are readonly - they may never write data
+    AB.addMemoryAttr(MemoryEffects::readOnly());
+    break;
+  }
+  // Ranked Setters
+  case eMuxBuiltinSetLocalId:
+    ParamTys.push_back(Int32Ty);
+    ParamNames.push_back("idx");
+    LLVM_FALLTHROUGH;
+  // Unranked Setters
+  case eMuxBuiltinSetSubGroupId:
+  case eMuxBuiltinSetNumSubGroups:
+  case eMuxBuiltinSetMaxSubGroupSize: {
+    RetTy = VoidTy;
+    ParamTys.push_back(ID == eMuxBuiltinSetLocalId ? SizeTy : Int32Ty);
+    ParamNames.push_back("val");
+    break;
+  }
+  case eMuxBuiltinMemBarrier: {
+    RetTy = VoidTy;
+    for (auto PName : {"scope", "semantics"}) {
       ParamTys.push_back(Int32Ty);
-      ParamNames.push_back("idx");
-      LLVM_FALLTHROUGH;
-    // Unranked Getters
-    case eMuxBuiltinGetWorkDim:
-    case eMuxBuiltinGetSubGroupId:
-    case eMuxBuiltinGetNumSubGroups:
-    case eMuxBuiltinGetSubGroupSize:
-    case eMuxBuiltinGetMaxSubGroupSize:
-    case eMuxBuiltinGetSubGroupLocalId:
-    case eMuxBuiltinGetLocalLinearId:
-    case eMuxBuiltinGetGlobalLinearId: {
-      // Some builtins return uint, others return size_t
-      RetTy = (ID == eMuxBuiltinGetWorkDim || ID == eMuxBuiltinGetSubGroupId ||
-               ID == eMuxBuiltinGetNumSubGroups ||
-               ID == eMuxBuiltinGetSubGroupSize ||
-               ID == eMuxBuiltinGetMaxSubGroupSize ||
-               ID == eMuxBuiltinGetSubGroupLocalId)
-                  ? Int32Ty
-                  : SizeTy;
-      // All of our mux getters are readonly - they may never write data
-      AB.addMemoryAttr(MemoryEffects::readOnly());
-      break;
+      ParamNames.push_back(PName);
     }
-    // Ranked Setters
-    case eMuxBuiltinSetLocalId:
+    AB.addAttribute(Attribute::NoMerge);
+    AB.addAttribute(Attribute::NoDuplicate);
+    AB.addAttribute(Attribute::Convergent);
+    break;
+  }
+  case eMuxBuiltinSubGroupBarrier:
+  case eMuxBuiltinWorkGroupBarrier: {
+    RetTy = VoidTy;
+    for (auto PName : {"id", "scope", "semantics"}) {
       ParamTys.push_back(Int32Ty);
-      ParamNames.push_back("idx");
-      LLVM_FALLTHROUGH;
-    // Unranked Setters
-    case eMuxBuiltinSetSubGroupId:
-    case eMuxBuiltinSetNumSubGroups:
-    case eMuxBuiltinSetMaxSubGroupSize: {
-      RetTy = VoidTy;
-      ParamTys.push_back(ID == eMuxBuiltinSetLocalId ? SizeTy : Int32Ty);
-      ParamNames.push_back("val");
-      break;
-    }
-    case eMuxBuiltinMemBarrier: {
-      RetTy = VoidTy;
-      for (auto PName : {"scope", "semantics"}) {
-        ParamTys.push_back(Int32Ty);
-        ParamNames.push_back(PName);
-      }
-      AB.addAttribute(Attribute::NoMerge);
-      AB.addAttribute(Attribute::NoDuplicate);
-      AB.addAttribute(Attribute::Convergent);
-      break;
-    }
-    case eMuxBuiltinSubGroupBarrier:
-    case eMuxBuiltinWorkGroupBarrier: {
-      RetTy = VoidTy;
-      for (auto PName : {"id", "scope", "semantics"}) {
-        ParamTys.push_back(Int32Ty);
-        ParamNames.push_back(PName);
-      }
-      AB.addAttribute(Attribute::NoMerge);
-      AB.addAttribute(Attribute::NoDuplicate);
-      AB.addAttribute(Attribute::Convergent);
-      break;
+      ParamNames.push_back(PName);
     }
-    case eMuxBuiltinDMAWait:
-      RetTy = VoidTy;
-      // Num events
-      ParamTys.push_back(Int32Ty);
-      ParamNames.push_back("num_events");
-      // The events list
-      ParamTys.push_back(PointerType::getUnqual(Ctx));
-      ParamNames.push_back("events");
-      AB.addAttribute(Attribute::Convergent);
-      break;
-    case eMuxBuiltinDMARead1D:
-    case eMuxBuiltinDMAWrite1D: {
-      // We need to be told the target event type to declare this builtin.
-      assert(!OverloadInfo.empty() && "Missing event type");
-      auto *const EventTy = OverloadInfo[0];
-      RetTy = EventTy;
-      const bool IsRead = ID == eMuxBuiltinDMARead1D;
-
-      PointerType *const LocalPtrTy =
-          PointerType::get(Ctx, AddressSpace::Local);
-      PointerType *const GlobalPtrTy =
-          PointerType::get(Ctx, AddressSpace::Global);
-
-      ParamTys.push_back(IsRead ? LocalPtrTy : GlobalPtrTy);
-      ParamNames.push_back("dst");
-
-      ParamTys.push_back(IsRead ? GlobalPtrTy : LocalPtrTy);
-      ParamNames.push_back("src");
+    AB.addAttribute(Attribute::NoMerge);
+    AB.addAttribute(Attribute::NoDuplicate);
+    AB.addAttribute(Attribute::Convergent);
+    break;
+  }
+  case eMuxBuiltinDMAWait:
+    RetTy = VoidTy;
+    // Num events
+    ParamTys.push_back(Int32Ty);
+    ParamNames.push_back("num_events");
+    // The events list
+    ParamTys.push_back(PointerType::getUnqual(Ctx));
+    ParamNames.push_back("events");
+    AB.addAttribute(Attribute::Convergent);
+    break;
+  case eMuxBuiltinDMARead1D:
+  case eMuxBuiltinDMAWrite1D: {
+    // We need to be told the target event type to declare this builtin.
+    assert(!OverloadInfo.empty() && "Missing event type");
+    auto *const EventTy = OverloadInfo[0];
+    RetTy = EventTy;
+    const bool IsRead = ID == eMuxBuiltinDMARead1D;
+
+    PointerType *const LocalPtrTy = PointerType::get(Ctx, AddressSpace::Local);
+    PointerType *const GlobalPtrTy =
+        PointerType::get(Ctx, AddressSpace::Global);
+
+    ParamTys.push_back(IsRead ? LocalPtrTy : GlobalPtrTy);
+    ParamNames.push_back("dst");
+
+    ParamTys.push_back(IsRead ? GlobalPtrTy : LocalPtrTy);
+    ParamNames.push_back("src");
+
+    ParamTys.push_back(SizeTy);
+    ParamNames.push_back("num_bytes");
+
+    ParamTys.push_back(EventTy);
+    ParamNames.push_back("event");
+    break;
+  }
+  case eMuxBuiltinDMARead2D:
+  case eMuxBuiltinDMAWrite2D: {
+    // We need to be told the target event type to declare this builtin.
+    assert(!OverloadInfo.empty() && "Missing event type");
+    auto *const EventTy = OverloadInfo[0];
+    RetTy = EventTy;
+    const bool IsRead = ID == eMuxBuiltinDMARead2D;
 
-      ParamTys.push_back(SizeTy);
-      ParamNames.push_back("num_bytes");
+    PointerType *const LocalPtrTy = PointerType::get(Ctx, AddressSpace::Local);
+    PointerType *const GlobalPtrTy =
+        PointerType::get(Ctx, AddressSpace::Global);
 
-      ParamTys.push_back(EventTy);
-      ParamNames.push_back("event");
-      break;
-    }
-    case eMuxBuiltinDMARead2D:
-    case eMuxBuiltinDMAWrite2D: {
-      // We need to be told the target event type to declare this builtin.
-      assert(!OverloadInfo.empty() && "Missing event type");
-      auto *const EventTy = OverloadInfo[0];
-      RetTy = EventTy;
-      const bool IsRead = ID == eMuxBuiltinDMARead2D;
-
-      PointerType *const LocalPtrTy =
-          PointerType::get(Ctx, AddressSpace::Local);
-      PointerType *const GlobalPtrTy =
-          PointerType::get(Ctx, AddressSpace::Global);
-
-      ParamTys.push_back(IsRead ? LocalPtrTy : GlobalPtrTy);
-      ParamNames.push_back("dst");
-
-      ParamTys.push_back(IsRead ? GlobalPtrTy : LocalPtrTy);
-      ParamNames.push_back("src");
-
-      for (auto &P : {"num_bytes", "dst_stride", "src_stride", "height"}) {
-        ParamTys.push_back(SizeTy);
-        ParamNames.push_back(P);
-      }
+    ParamTys.push_back(IsRead ? LocalPtrTy : GlobalPtrTy);
+    ParamNames.push_back("dst");
 
-      ParamTys.push_back(EventTy);
-      ParamNames.push_back("event");
-      break;
+    ParamTys.push_back(IsRead ? GlobalPtrTy : LocalPtrTy);
+    ParamNames.push_back("src");
+
+    for (auto &P : {"num_bytes", "dst_stride", "src_stride", "height"}) {
+      ParamTys.push_back(SizeTy);
+      ParamNames.push_back(P);
     }
-    case eMuxBuiltinDMARead3D:
-    case eMuxBuiltinDMAWrite3D: {
-      // We need to be told the target event type to declare this builtin.
-      assert(!OverloadInfo.empty() && "Missing event type");
-      auto *const EventTy = OverloadInfo[0];
-      RetTy = EventTy;
-      const bool IsRead = ID == eMuxBuiltinDMARead3D;
-
-      PointerType *const LocalPtrTy =
-          PointerType::get(Ctx, AddressSpace::Local);
-      PointerType *const GlobalPtrTy =
-          PointerType::get(Ctx, AddressSpace::Global);
-
-      ParamTys.push_back(IsRead ? LocalPtrTy : GlobalPtrTy);
-      ParamNames.push_back("dst");
-
-      ParamTys.push_back(IsRead ? GlobalPtrTy : LocalPtrTy);
-      ParamNames.push_back("src");
-
-      for (auto &P :
-           {"num_bytes", "dst_line_stride", "src_line_stride", "height",
-            "dst_plane_stride", "src_plane_stride", "depth"}) {
-        ParamTys.push_back(SizeTy);
-        ParamNames.push_back(P);
-      }
 
-      ParamTys.push_back(EventTy);
-      ParamNames.push_back("event");
-      break;
+    ParamTys.push_back(EventTy);
+    ParamNames.push_back("event");
+    break;
+  }
+  case eMuxBuiltinDMARead3D:
+  case eMuxBuiltinDMAWrite3D: {
+    // We need to be told the target event type to declare this builtin.
+    assert(!OverloadInfo.empty() && "Missing event type");
+    auto *const EventTy = OverloadInfo[0];
+    RetTy = EventTy;
+    const bool IsRead = ID == eMuxBuiltinDMARead3D;
+
+    PointerType *const LocalPtrTy = PointerType::get(Ctx, AddressSpace::Local);
+    PointerType *const GlobalPtrTy =
+        PointerType::get(Ctx, AddressSpace::Global);
+
+    ParamTys.push_back(IsRead ? LocalPtrTy : GlobalPtrTy);
+    ParamNames.push_back("dst");
+
+    ParamTys.push_back(IsRead ? GlobalPtrTy : LocalPtrTy);
+    ParamNames.push_back("src");
+
+    for (auto &P : {"num_bytes", "dst_line_stride", "src_line_stride", "height",
+                    "dst_plane_stride", "src_plane_stride", "depth"}) {
+      ParamTys.push_back(SizeTy);
+      ParamNames.push_back(P);
     }
-    default:
-      // Group builtins are more easily found using this helper rather than
-      // explicitly enumerating each switch case.
-      if (auto Group = BuiltinInfo::isMuxGroupCollective(ID)) {
-        RetTy = OverloadInfo.front();
-        AB.addAttribute(Attribute::Convergent);
-        switch (Group->Op) {
-          default:
-            ParamTys.push_back(RetTy);
-            ParamNames.push_back("val");
-            break;
-          case GroupCollective::OpKind::Broadcast:
-            ParamTys.push_back(RetTy);
-            ParamNames.push_back("val");
-            // Broadcasts additionally add ID parameters
-            if (Group->isSubGroupScope()) {
-              ParamTys.push_back(Int32Ty);
-              ParamNames.push_back("lid");
-            } else {
-              ParamTys.push_back(SizeTy);
-              ParamNames.push_back("lidx");
-              ParamTys.push_back(SizeTy);
-              ParamNames.push_back("lidy");
-              ParamTys.push_back(SizeTy);
-              ParamNames.push_back("lidz");
-            }
-            break;
-          case GroupCollective::OpKind::Shuffle:
-            ParamTys.push_back(RetTy);
-            ParamNames.push_back("val");
-            ParamTys.push_back(Int32Ty);
-            ParamNames.push_back("lid");
-            break;
-          case GroupCollective::OpKind::ShuffleXor:
-            ParamTys.push_back(RetTy);
-            ParamNames.push_back("val");
-            ParamTys.push_back(Int32Ty);
-            ParamNames.push_back("xor_val");
-            break;
-          case GroupCollective::OpKind::ShuffleUp:
-            ParamTys.push_back(RetTy);
-            ParamNames.push_back("prev");
-            ParamTys.push_back(RetTy);
-            ParamNames.push_back("curr");
-            ParamTys.push_back(Int32Ty);
-            ParamNames.push_back("delta");
-            break;
-          case GroupCollective::OpKind::ShuffleDown:
-            ParamTys.push_back(RetTy);
-            ParamNames.push_back("curr");
-            ParamTys.push_back(RetTy);
-            ParamNames.push_back("next");
-            ParamTys.push_back(Int32Ty);
-            ParamNames.push_back("delta");
-            break;
-        }
-        // All work-group operations have a 'barrier id' operand as their first
-        // parameter.
-        if (Group->isWorkGroupScope()) {
-          ParamTys.insert(ParamTys.begin(), Int32Ty);
-          ParamNames.insert(ParamNames.begin(), "id");
+
+    ParamTys.push_back(EventTy);
+    ParamNames.push_back("event");
+    break;
+  }
+  default:
+    // Group builtins are more easily found using this helper rather than
+    // explicitly enumerating each switch case.
+    if (auto Group = BuiltinInfo::isMuxGroupCollective(ID)) {
+      RetTy = OverloadInfo.front();
+      AB.addAttribute(Attribute::Convergent);
+      switch (Group->Op) {
+      default:
+        ParamTys.push_back(RetTy);
+        ParamNames.push_back("val");
+        break;
+      case GroupCollective::OpKind::Broadcast:
+        ParamTys.push_back(RetTy);
+        ParamNames.push_back("val");
+        // Broadcasts additionally add ID parameters
+        if (Group->isSubGroupScope()) {
+          ParamTys.push_back(Int32Ty);
+          ParamNames.push_back("lid");
+        } else {
+          ParamTys.push_back(SizeTy);
+          ParamNames.push_back("lidx");
+          ParamTys.push_back(SizeTy);
+          ParamNames.push_back("lidy");
+          ParamTys.push_back(SizeTy);
+          ParamNames.push_back("lidz");
         }
-      } else {
-        // Unknown mux builtin
-        return nullptr;
+        break;
+      case GroupCollective::OpKind::Shuffle:
+        ParamTys.push_back(RetTy);
+        ParamNames.push_back("val");
+        ParamTys.push_back(Int32Ty);
+        ParamNames.push_back("lid");
+        break;
+      case GroupCollective::OpKind::ShuffleXor:
+        ParamTys.push_back(RetTy);
+        ParamNames.push_back("val");
+        ParamTys.push_back(Int32Ty);
+        ParamNames.push_back("xor_val");
+        break;
+      case GroupCollective::OpKind::ShuffleUp:
+        ParamTys.push_back(RetTy);
+        ParamNames.push_back("prev");
+        ParamTys.push_back(RetTy);
+        ParamNames.push_back("curr");
+        ParamTys.push_back(Int32Ty);
+        ParamNames.push_back("delta");
+        break;
+      case GroupCollective::OpKind::ShuffleDown:
+        ParamTys.push_back(RetTy);
+        ParamNames.push_back("curr");
+        ParamTys.push_back(RetTy);
+        ParamNames.push_back("next");
+        ParamTys.push_back(Int32Ty);
+        ParamNames.push_back("delta");
+        break;
+      }
+      // All work-group operations have a 'barrier id' operand as their first
+      // parameter.
+      if (Group->isWorkGroupScope()) {
+        ParamTys.insert(ParamTys.begin(), Int32Ty);
+        ParamNames.insert(ParamNames.begin(), "id");
       }
+    } else {
+      // Unknown mux builtin
+      return nullptr;
+    }
   }
 
   assert(RetTy);
@@ -1283,40 +1280,40 @@ std::optional<llvm::ConstantRange> BIMuxInfoConcept::getBuiltinRange(
   std::array<std::optional<uint64_t>, 3> *SizesPtr = &MaxGlobalSizes;
 
   switch (ID) {
-    default:
+  default:
+    return std::nullopt;
+  case eMuxBuiltinGetWorkDim:
+    return ConstantRange::getNonEmpty(APInt(Bits, 1), APInt(Bits, 4));
+  case eMuxBuiltinGetLocalId:
+  case eMuxBuiltinGetLocalSize:
+  case eMuxBuiltinGetEnqueuedLocalSize:
+    // Use the local sizes array, and fall through to common handling.
+    SizesPtr = &MaxLocalSizes;
+    [[fallthrough]];
+  case eMuxBuiltinGetGlobalSize: {
+    auto *DimIdx = CI.getOperand(0);
+    if (!isa<ConstantInt>(DimIdx)) {
+      return std::nullopt;
+    }
+    const uint64_t DimVal = cast<ConstantInt>(DimIdx)->getZExtValue();
+    if (DimVal >= SizesPtr->size()) {
       return std::nullopt;
-    case eMuxBuiltinGetWorkDim:
-      return ConstantRange::getNonEmpty(APInt(Bits, 1), APInt(Bits, 4));
-    case eMuxBuiltinGetLocalId:
-    case eMuxBuiltinGetLocalSize:
-    case eMuxBuiltinGetEnqueuedLocalSize:
-      // Use the local sizes array, and fall through to common handling.
-      SizesPtr = &MaxLocalSizes;
-      [[fallthrough]];
-    case eMuxBuiltinGetGlobalSize: {
-      auto *DimIdx = CI.getOperand(0);
-      if (!isa<ConstantInt>(DimIdx)) {
-        return std::nullopt;
-      }
-      const uint64_t DimVal = cast<ConstantInt>(DimIdx)->getZExtValue();
-      if (DimVal >= SizesPtr->size()) {
-        return std::nullopt;
-      }
-      const std::optional<uint64_t> Size = (*SizesPtr)[DimVal];
-      if (!Size) {
-        return std::nullopt;
-      }
-      // ID builtins range [0,size) (exclusive), and size builtins [1,size]
-      // (inclusive). Thus offset the range by 1 at each low/high end when
-      // returning the range for a size builtin.
-      const int SizeAdjust = ID == eMuxBuiltinGetLocalSize ||
-                             ID == eMuxBuiltinGetEnqueuedLocalSize ||
-                             ID == eMuxBuiltinGetGlobalSize;
-      return ConstantRange::getNonEmpty(APInt(Bits, SizeAdjust),
-                                        APInt(Bits, Size.value() + SizeAdjust));
     }
+    const std::optional<uint64_t> Size = (*SizesPtr)[DimVal];
+    if (!Size) {
+      return std::nullopt;
+    }
+    // ID builtins range [0,size) (exclusive), and size builtins [1,size]
+    // (inclusive). Thus offset the range by 1 at each low/high end when
+    // returning the range for a size builtin.
+    const int SizeAdjust = ID == eMuxBuiltinGetLocalSize ||
+                           ID == eMuxBuiltinGetEnqueuedLocalSize ||
+                           ID == eMuxBuiltinGetGlobalSize;
+    return ConstantRange::getNonEmpty(APInt(Bits, SizeAdjust),
+                                      APInt(Bits, Size.value() + SizeAdjust));
+  }
   }
 }
 
-}  // namespace utils
-}  // namespace compiler
+} // namespace utils
+} // namespace compiler
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/optimal_builtin_replacement_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/optimal_builtin_replacement_pass.cpp
index e7610f89c5525..f735b1d1e6b8f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/optimal_builtin_replacement_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/optimal_builtin_replacement_pass.cpp
@@ -46,7 +46,7 @@ void removeCallSite(CallBase &CB, LazyCallGraph &CG) {
   }
 }
 
-}  // namespace
+} // namespace
 
 namespace compiler {
 namespace utils {
@@ -187,8 +187,8 @@ OptimalBuiltinReplacementPass::OptimalBuiltinReplacementPass() {
   replacements.emplace_back(replaceAbacusFMinFMax);
 }
 
-Value *OptimalBuiltinReplacementPass::replaceBuiltinWithInlineIR(
-    CallBase &CB) const {
+Value *
+OptimalBuiltinReplacementPass::replaceBuiltinWithInlineIR(CallBase &CB) const {
   auto *M = CB.getModule();
   NameMangler mangler(&M->getContext());
 
@@ -315,5 +315,5 @@ PreservedAnalyses OptimalBuiltinReplacementPass::run(LazyCallGraph::SCC &C,
 
   return Modified ? PreservedAnalyses::none() : PreservedAnalyses::all();
 }
-}  // namespace utils
-}  // namespace compiler
+} // namespace utils
+} // namespace compiler
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp
index e1beea751ca06..d1e46ee67b290 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp
@@ -71,9 +71,8 @@ uint64_t computeApproximatePrivateMemoryUsage(const llvm::Function &fn) {
   return bytes;
 }
 
-static llvm::SmallVector<llvm::Constant *> getNewOps(llvm::Constant *constant,
-                                                     llvm::Constant *from,
-                                                     llvm::Constant *to) {
+static llvm::SmallVector<llvm::Constant *>
+getNewOps(llvm::Constant *constant, llvm::Constant *from, llvm::Constant *to) {
   llvm::SmallVector<llvm::Constant *> newOps;
   // iterate through the constant and create a vector of old and new
   // ones
@@ -328,7 +327,7 @@ bool cloneFunctionsAddArg(
       // Copy names over for the parameters
       llvm::Function::arg_iterator DestI = newFunc->arg_begin();
       for (const auto &I : func.args()) {
-        (*DestI).setName(I.getName());  // Copy the name over...
+        (*DestI).setName(I.getName()); // Copy the name over...
         DestI++;
       }
 
@@ -504,7 +503,8 @@ llvm::BasicBlock *createLoop(llvm::BasicBlock *entry, llvm::BasicBlock *exit,
   // Set up all of our user PHIs
   for (unsigned i = 0, e = currIVs.size(); i != e; i++) {
     // For convenience to callers, permit nullptr and skip over it.
-    if (!currIVs[i]) continue;
+    if (!currIVs[i])
+      continue;
 
     auto *const phi = loopIR.CreatePHI(currIVs[i]->getType(), 2);
     llvm::cast<llvm::PHINode>(phi)->addIncoming(currIVs[i],
@@ -529,7 +529,8 @@ llvm::BasicBlock *createLoop(llvm::BasicBlock *entry, llvm::BasicBlock *exit,
 
   // Update all of our PHIs
   for (unsigned i = 0, e = currIVs.size(); i != e; i++) {
-    if (!currIVs[i]) continue;
+    if (!currIVs[i])
+      continue;
     llvm::cast<llvm::PHINode>(currIVs[i])->addIncoming(nextIVs[i], latch);
   }
 
@@ -577,9 +578,10 @@ llvm::IntegerType *getSizeType(const llvm::Module &m) {
                                 dataLayout.getPointerSizeInBits(0));
 }
 
-static llvm::Function *createKernelWrapperFunctionImpl(
-    llvm::Function &F, llvm::Function &NewFunction, llvm::StringRef Suffix,
-    llvm::StringRef OldSuffix) {
+static llvm::Function *
+createKernelWrapperFunctionImpl(llvm::Function &F, llvm::Function &NewFunction,
+                                llvm::StringRef Suffix,
+                                llvm::StringRef OldSuffix) {
   // Make sure we take a copy of the basename as we're going to change the
   // original function's name from underneath the StringRef.
   const std::string baseName = getOrSetBaseFnName(NewFunction, F).str();
@@ -651,9 +653,10 @@ llvm::Function *createKernelWrapperFunction(llvm::Function &F,
   return createKernelWrapperFunctionImpl(F, *NewFunction, Suffix, OldSuffix);
 }
 
-llvm::Function *createKernelWrapperFunction(
-    llvm::Module &M, llvm::Function &F, llvm::ArrayRef<llvm::Type *> ArgTypes,
-    llvm::StringRef Suffix, llvm::StringRef OldSuffix) {
+llvm::Function *
+createKernelWrapperFunction(llvm::Module &M, llvm::Function &F,
+                            llvm::ArrayRef<llvm::Type *> ArgTypes,
+                            llvm::StringRef Suffix, llvm::StringRef OldSuffix) {
   llvm::FunctionType *NewFunctionType =
       llvm::FunctionType::get(F.getReturnType(), ArgTypes, false);
 
@@ -699,38 +702,38 @@ llvm::CallInst *createCallToWrappedFunction(
 llvm::Value *createBinOpForRecurKind(llvm::IRBuilderBase &B, llvm::Value *LHS,
                                      llvm::Value *RHS, llvm::RecurKind Kind) {
   switch (Kind) {
-    default:
-      llvm_unreachable("Unexpected Kind");
-    case llvm::RecurKind::None:
-      return nullptr;
-    case llvm::RecurKind::Add:
-      return B.CreateAdd(LHS, RHS);
-    case llvm::RecurKind::Mul:
-      return B.CreateMul(LHS, RHS);
-    case llvm::RecurKind::Or:
-      return B.CreateOr(LHS, RHS);
-    case llvm::RecurKind::And:
-      return B.CreateAnd(LHS, RHS);
-    case llvm::RecurKind::Xor:
-      return B.CreateXor(LHS, RHS);
-    case llvm::RecurKind::SMin:
-      return B.CreateBinaryIntrinsic(llvm::Intrinsic::smin, LHS, RHS);
-    case llvm::RecurKind::UMin:
-      return B.CreateBinaryIntrinsic(llvm::Intrinsic::umin, LHS, RHS);
-    case llvm::RecurKind::SMax:
-      return B.CreateBinaryIntrinsic(llvm::Intrinsic::smax, LHS, RHS);
-    case llvm::RecurKind::UMax:
-      return B.CreateBinaryIntrinsic(llvm::Intrinsic::umax, LHS, RHS);
-    case llvm::RecurKind::FAdd:
-      return B.CreateFAdd(LHS, RHS);
-    case llvm::RecurKind::FMul:
-      return B.CreateFMul(LHS, RHS);
-    case llvm::RecurKind::FMin:
-      return B.CreateBinaryIntrinsic(llvm::Intrinsic::minnum, LHS, RHS);
-    case llvm::RecurKind::FMax:
-      return B.CreateBinaryIntrinsic(llvm::Intrinsic::maxnum, LHS, RHS);
+  default:
+    llvm_unreachable("Unexpected Kind");
+  case llvm::RecurKind::None:
+    return nullptr;
+  case llvm::RecurKind::Add:
+    return B.CreateAdd(LHS, RHS);
+  case llvm::RecurKind::Mul:
+    return B.CreateMul(LHS, RHS);
+  case llvm::RecurKind::Or:
+    return B.CreateOr(LHS, RHS);
+  case llvm::RecurKind::And:
+    return B.CreateAnd(LHS, RHS);
+  case llvm::RecurKind::Xor:
+    return B.CreateXor(LHS, RHS);
+  case llvm::RecurKind::SMin:
+    return B.CreateBinaryIntrinsic(llvm::Intrinsic::smin, LHS, RHS);
+  case llvm::RecurKind::UMin:
+    return B.CreateBinaryIntrinsic(llvm::Intrinsic::umin, LHS, RHS);
+  case llvm::RecurKind::SMax:
+    return B.CreateBinaryIntrinsic(llvm::Intrinsic::smax, LHS, RHS);
+  case llvm::RecurKind::UMax:
+    return B.CreateBinaryIntrinsic(llvm::Intrinsic::umax, LHS, RHS);
+  case llvm::RecurKind::FAdd:
+    return B.CreateFAdd(LHS, RHS);
+  case llvm::RecurKind::FMul:
+    return B.CreateFMul(LHS, RHS);
+  case llvm::RecurKind::FMin:
+    return B.CreateBinaryIntrinsic(llvm::Intrinsic::minnum, LHS, RHS);
+  case llvm::RecurKind::FMax:
+    return B.CreateBinaryIntrinsic(llvm::Intrinsic::maxnum, LHS, RHS);
   }
 }
 
-}  // namespace utils
-}  // namespace compiler
+} // namespace utils
+} // namespace compiler
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_machinery.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_machinery.cpp
index 7f6e262177bbd..c9d66624db7ef 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_machinery.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_machinery.cpp
@@ -116,8 +116,8 @@ void PassMachinery::registerLLVMAnalyses() {
   PB.registerLoopAnalyses(LAM);
 }
 
-}  // namespace utils
-}  // namespace compiler
+} // namespace utils
+} // namespace compiler
 
 namespace compiler {
 namespace utils {
@@ -130,5 +130,5 @@ void printPassName(StringRef PassName, StringRef Params, raw_ostream &OS) {
   OS << "  " << PassName << "<" << Params << ">\n";
 }
 
-}  // namespace utils
-}  // namespace compiler
+} // namespace utils
+} // namespace compiler
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/prepare_barriers_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/prepare_barriers_pass.cpp
index c610e76ee5c67..32d9feb5b41bd 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/prepare_barriers_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/prepare_barriers_pass.cpp
@@ -29,8 +29,9 @@ using namespace llvm;
 
 #define DEBUG_TYPE "ca-barriers"
 
-PreservedAnalyses compiler::utils::PrepareBarriersPass::run(
-    Module &M, ModuleAnalysisManager &AM) {
+PreservedAnalyses
+compiler::utils::PrepareBarriersPass::run(Module &M,
+                                          ModuleAnalysisManager &AM) {
   SmallPtrSet<Function *, 4> Kernels;
   auto &BI = AM.getResult<BuiltinInfoAnalysis>(M);
   for (auto &F : M.functions()) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp
index e8f6152b685f0..396bc347f7fa1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp
@@ -223,7 +223,7 @@ bool addParamToAllRequiredFunctions(llvm::Module &module,
       nullptr /*updateMetaDataCallback*/);
 }
 
-}  // namespace
+} // namespace
 
 PreservedAnalyses compiler::utils::ReplaceLocalModuleScopeVariablesPass::run(
     Module &M, ModuleAnalysisManager &) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/scheduling.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/scheduling.cpp
index 9bc40bf4282f7..a05ff3e077c80 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/scheduling.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/scheduling.cpp
@@ -151,5 +151,5 @@ void populateStructGetterFunction(llvm::Function &F, Argument &structPtrArg,
   ir.CreateRet(ret);
 }
 
-}  // namespace utils
-}  // namespace compiler
+} // namespace utils
+} // namespace compiler
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/sub_group_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/sub_group_analysis.cpp
index 1a122433268bd..8b421ccaf4c30 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/sub_group_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/sub_group_analysis.cpp
@@ -101,8 +101,8 @@ bool GlobalSubgroupInfo::usesSubgroups(const llvm::Function &F) const {
   return !I->second->UsedSubgroupBuiltins.empty();
 }
 
-std::optional<Builtin> GlobalSubgroupInfo::isMuxSubgroupBuiltin(
-    const Function *F) const {
+std::optional<Builtin>
+GlobalSubgroupInfo::isMuxSubgroupBuiltin(const Function *F) const {
   if (!F) {
     return std::nullopt;
   }
@@ -112,15 +112,15 @@ std::optional<Builtin> GlobalSubgroupInfo::isMuxSubgroupBuiltin(
   }
 
   switch (SGBuiltin->ID) {
-    default:
-      break;
-    case eMuxBuiltinSubGroupBarrier:
-    case eMuxBuiltinGetSubGroupSize:
-    case eMuxBuiltinGetMaxSubGroupSize:
-    case eMuxBuiltinGetNumSubGroups:
-    case eMuxBuiltinGetSubGroupId:
-    case eMuxBuiltinGetSubGroupLocalId:
-      return SGBuiltin;
+  default:
+    break;
+  case eMuxBuiltinSubGroupBarrier:
+  case eMuxBuiltinGetSubGroupSize:
+  case eMuxBuiltinGetMaxSubGroupSize:
+  case eMuxBuiltinGetNumSubGroups:
+  case eMuxBuiltinGetSubGroupId:
+  case eMuxBuiltinGetSubGroupLocalId:
+    return SGBuiltin;
   }
 
   if (auto GroupOp = BI.isMuxGroupCollective(SGBuiltin->ID);
@@ -168,5 +168,5 @@ PreservedAnalyses SubgroupAnalysisPrinterPass::run(Module &M,
 
   return PreservedAnalyses::all();
 }
-}  // namespace utils
-}  // namespace compiler
+} // namespace utils
+} // namespace compiler
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/target_extension_types.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/target_extension_types.cpp
index 37c739e22f4f6..1b6f0de967602 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/target_extension_types.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/target_extension_types.cpp
@@ -34,10 +34,11 @@ Type *getSamplerTy(LLVMContext &Ctx) {
   return TargetExtType::get(Ctx, "spirv.Sampler");
 }
 
-[[maybe_unused]] static Type *getImageTyHelper(
-    LLVMContext &Ctx, ImageTyDimensionalityParam Dim, ImageTyDepthParam Depth,
-    ImageTyArrayedParam Arrayed, ImageTyMSParam MS, ImageTySampledParam Sampled,
-    ImageTyAccessQualParam AccessQual) {
+[[maybe_unused]] static Type *
+getImageTyHelper(LLVMContext &Ctx, ImageTyDimensionalityParam Dim,
+                 ImageTyDepthParam Depth, ImageTyArrayedParam Arrayed,
+                 ImageTyMSParam MS, ImageTySampledParam Sampled,
+                 ImageTyAccessQualParam AccessQual) {
   unsigned IntParams[7];
   IntParams[ImageTyDimensionalityIdx] = Dim;
   IntParams[ImageTyDepthIdx] = Depth;
@@ -50,17 +51,18 @@ Type *getSamplerTy(LLVMContext &Ctx) {
                             IntParams);
 }
 
-[[maybe_unused]] static Type *getOpenCLImageTyHelper(
-    LLVMContext &Ctx, ImageTyDimensionalityParam Dim,
-    ImageTyArrayedParam Arrayed, ImageTyDepthParam Depth, ImageTyMSParam MS,
-    ImageTyAccessQualParam AccessQual) {
+[[maybe_unused]] static Type *
+getOpenCLImageTyHelper(LLVMContext &Ctx, ImageTyDimensionalityParam Dim,
+                       ImageTyArrayedParam Arrayed, ImageTyDepthParam Depth,
+                       ImageTyMSParam MS, ImageTyAccessQualParam AccessQual) {
   return getImageTyHelper(Ctx, Dim, Depth, Arrayed, MS, ImageSampledRuntime,
                           AccessQual);
 }
 
-[[maybe_unused]] static Type *getOpenCLImageTyHelper(
-    LLVMContext &Ctx, ImageTyDimensionalityParam Dim,
-    ImageTyArrayedParam Arrayed, ImageTyAccessQualParam AccessQual) {
+[[maybe_unused]] static Type *
+getOpenCLImageTyHelper(LLVMContext &Ctx, ImageTyDimensionalityParam Dim,
+                       ImageTyArrayedParam Arrayed,
+                       ImageTyAccessQualParam AccessQual) {
   return getOpenCLImageTyHelper(Ctx, Dim, Arrayed, ImageDepthNone,
                                 ImageMSSingleSampled, AccessQual);
 }
@@ -96,6 +98,6 @@ Type *getImage3DTy(LLVMContext &Ctx, ImageTyAccessQualParam AccessQual) {
   return getOpenCLImageTyHelper(Ctx, ImageDim3D, ImageNonArrayed, AccessQual);
 }
 
-}  // namespace tgtext
-}  // namespace utils
-}  // namespace compiler
+} // namespace tgtext
+} // namespace utils
+} // namespace compiler
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/unique_opaque_structs_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/unique_opaque_structs_pass.cpp
index c7a75ae3dd4bc..dafbd1484f3c8 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/unique_opaque_structs_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/unique_opaque_structs_pass.cpp
@@ -96,8 +96,8 @@ static bool shouldClone(compiler::utils::StructTypeRemapper &StructTypeRemapper,
 /// @param module Module referencing the types in the context.
 ///
 /// @return The map of suffixed structures to the unsuffixed structures.
-static compiler::utils::StructMap uniqueOpaqueSuffixedStructs(
-    llvm::Module &module) {
+static compiler::utils::StructMap
+uniqueOpaqueSuffixedStructs(llvm::Module &module) {
   StructMap map;
   for (auto *structTy : module.getIdentifiedStructTypes()) {
     if (!structTy->isOpaque()) {
@@ -133,9 +133,10 @@ static compiler::utils::StructMap uniqueOpaqueSuffixedStructs(
 /// @param[in] StructTypeRemapper Map from suffixed opaque structs to
 /// unsuffixed opaque structs.
 /// @param[out] WorkList vector of functions that need to be processed.
-static void populateWorkList(
-    Module &Module, compiler::utils::StructTypeRemapper &StructTypeRemapper,
-    SmallVectorImpl<Function *> &WorkList) {
+static void
+populateWorkList(Module &Module,
+                 compiler::utils::StructTypeRemapper &StructTypeRemapper,
+                 SmallVectorImpl<Function *> &WorkList) {
   for (auto &Function : Module) {
     // We don't need to touch intrinsics.
     if (Function.isIntrinsic()) {
@@ -172,9 +173,9 @@ static void removeOldFunctions(const SmallVectorImpl<Function *> &OldFuncs) {
 /// @param[in] StructTypeRemapper Map from suffixed opaque structs to
 /// unsuffixed opaque structs.
 /// @param[in] OldFuncs list of functions to clone and update.
-static void replaceRemappedTypeRefs(
-    compiler::utils::StructTypeRemapper &StructTypeRemapper,
-    const SmallVectorImpl<Function *> &OldFuncs) {
+static void
+replaceRemappedTypeRefs(compiler::utils::StructTypeRemapper &StructTypeRemapper,
+                        const SmallVectorImpl<Function *> &OldFuncs) {
   // Maps the old functions to their new versions with updated types.
   // Note: it is important we do this before cloning to catch the case that
   // functions A and B both need updating, but function A calls function B and
@@ -279,5 +280,5 @@ PreservedAnalyses UniqueOpaqueStructsPass::run(Module &Module,
   // modified.
   return PreservedAnalyses::none();
 }
-}  // namespace utils
-}  // namespace compiler
+} // namespace utils
+} // namespace compiler
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
index 71663cb7f4314..4569df09a5495 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp
@@ -45,7 +45,7 @@ namespace utils {
 ///
 /// It adds additional fields used when creating wrapper kernels.
 class BarrierWithLiveVars : public Barrier {
- public:
+public:
   BarrierWithLiveVars(llvm::Module &m, llvm::Function &f,
                       VectorizationInfo vf_info, bool IsDebug)
       : Barrier(m, f, IsDebug), vf_info(vf_info) {}
@@ -67,7 +67,7 @@ class BarrierWithLiveVars : public Barrier {
   AllocaInst *getDebugAddr() const { return debug_addr; }
   void setDebugAddr(AllocaInst *ai) { debug_addr = ai; }
 
- private:
+private:
   VectorizationInfo vf_info;
 
   // Alloca representing the memory for the live variables for a given kernel,
@@ -93,8 +93,8 @@ class BarrierWithLiveVars : public Barrier {
   Value *structSize = nullptr;
 };
 
-}  // namespace utils
-}  // namespace compiler
+} // namespace utils
+} // namespace compiler
 
 namespace {
 
@@ -103,12 +103,8 @@ struct ScheduleGenerator {
                     const compiler::utils::BarrierWithLiveVars &barrierMain,
                     const compiler::utils::BarrierWithLiveVars *barrierTail,
                     compiler::utils::BuiltinInfo &BI)
-      : module(m),
-        context(m.getContext()),
-        barrierMain(barrierMain),
-        barrierTail(barrierTail),
-        BI(BI),
-        i32Ty(Type::getInt32Ty(context)) {
+      : module(m), context(m.getContext()), barrierMain(barrierMain),
+        barrierTail(barrierTail), BI(BI), i32Ty(Type::getInt32Ty(context)) {
     set_local_id =
         BI.getOrDeclareMuxBuiltin(compiler::utils::eMuxBuiltinSetLocalId, m);
     set_subgroup_id =
@@ -141,9 +137,9 @@ struct ScheduleGenerator {
 
   DILocation *wrapperDbgLoc = nullptr;
 
-  Value *createLinearLiveVarsPtr(
-      const compiler::utils::BarrierWithLiveVars &barrier, IRBuilder<> &ir,
-      Value *index) {
+  Value *
+  createLinearLiveVarsPtr(const compiler::utils::BarrierWithLiveVars &barrier,
+                          IRBuilder<> &ir, Value *index) {
     Value *const mem_space = barrier.getMemSpace();
     if (!mem_space) {
       return nullptr;
@@ -193,9 +189,9 @@ struct ScheduleGenerator {
     return createLinearLiveVarsPtr(barrier, ir, offset);
   }
 
-  void recreateDebugIntrinsics(
-      const compiler::utils::BarrierWithLiveVars &barrier, BasicBlock *block,
-      StoreInst *SI) {
+  void
+  recreateDebugIntrinsics(const compiler::utils::BarrierWithLiveVars &barrier,
+                          BasicBlock *block, StoreInst *SI) {
     DIBuilder DIB(module, /*AllowUnresolved*/ false);
     auto RecreateDebugIntrinsic = [&](DILocalVariable *const old_var,
                                       const unsigned live_var_offset) {
@@ -240,11 +236,12 @@ struct ScheduleGenerator {
     }
   }
 
-  void createWorkItemLoopBody(
-      const compiler::utils::BarrierWithLiveVars &barrier, IRBuilder<> &ir,
-      BasicBlock *block, unsigned i, Value *dim_0, Value *dim_1, Value *dim_2,
-      Value *accumulator = nullptr, Value *VF = nullptr,
-      Value *offset = nullptr) {
+  void
+  createWorkItemLoopBody(const compiler::utils::BarrierWithLiveVars &barrier,
+                         IRBuilder<> &ir, BasicBlock *block, unsigned i,
+                         Value *dim_0, Value *dim_1, Value *dim_2,
+                         Value *accumulator = nullptr, Value *VF = nullptr,
+                         Value *offset = nullptr) {
     auto new_kernel_args = args;
     if (accumulator) {
       new_kernel_args.push_back(accumulator);
@@ -305,10 +302,10 @@ struct ScheduleGenerator {
 
   // Create a 1D loop to execute all the work items in a 'barrier', reducing
   // across an accumulator.
-  std::pair<BasicBlock *, Value *> makeReductionLoop(
-      const compiler::utils::BarrierWithLiveVars &barrier,
-      const compiler::utils::GroupCollective &WGC, BasicBlock *block, Value *op,
-      Value *accumulator) {
+  std::pair<BasicBlock *, Value *>
+  makeReductionLoop(const compiler::utils::BarrierWithLiveVars &barrier,
+                    const compiler::utils::GroupCollective &WGC,
+                    BasicBlock *block, Value *op, Value *accumulator) {
     auto *const accTy = accumulator->getType();
     Function *const func = block->getParent();
 
@@ -398,8 +395,9 @@ struct ScheduleGenerator {
     }
   }
 
-  std::optional<compiler::utils::GroupCollective> getBarrierGroupCollective(
-      const compiler::utils::BarrierWithLiveVars &Barrier, unsigned BarrierID) {
+  std::optional<compiler::utils::GroupCollective>
+  getBarrierGroupCollective(const compiler::utils::BarrierWithLiveVars &Barrier,
+                            unsigned BarrierID) {
     auto *const BarrierCall = Barrier.getBarrierCall(BarrierID);
     if (!BarrierCall) {
       return std::nullopt;
@@ -424,157 +422,156 @@ struct ScheduleGenerator {
     }
 
     switch (Info->Op) {
-      case compiler::utils::GroupCollective::OpKind::Reduction:
-      case compiler::utils::GroupCollective::OpKind::All:
-      case compiler::utils::GroupCollective::OpKind::Any: {
-        auto *const ty = groupCall->getType();
-        auto *const accumulator =
-            compiler::utils::getNeutralVal(Info->Recurrence, ty);
-        auto [loop_exit_block, accum] = makeReductionLoop(
-            barrierMain, *Info, block, groupCall->getOperand(1), accumulator);
-        if (barrierTail) {
-          auto *const groupTailInst = barrierTail->getBarrierCall(barrierID);
-          std::tie(loop_exit_block, accum) =
-              makeReductionLoop(*barrierTail, *Info, loop_exit_block,
-                                groupTailInst->getOperand(1), accum);
-        }
-        if (groupCall->hasName()) {
-          accum->takeName(groupCall);
-        }
-        return std::make_tuple(loop_exit_block, accum, Info);
+    case compiler::utils::GroupCollective::OpKind::Reduction:
+    case compiler::utils::GroupCollective::OpKind::All:
+    case compiler::utils::GroupCollective::OpKind::Any: {
+      auto *const ty = groupCall->getType();
+      auto *const accumulator =
+          compiler::utils::getNeutralVal(Info->Recurrence, ty);
+      auto [loop_exit_block, accum] = makeReductionLoop(
+          barrierMain, *Info, block, groupCall->getOperand(1), accumulator);
+      if (barrierTail) {
+        auto *const groupTailInst = barrierTail->getBarrierCall(barrierID);
+        std::tie(loop_exit_block, accum) =
+            makeReductionLoop(*barrierTail, *Info, loop_exit_block,
+                              groupTailInst->getOperand(1), accum);
       }
-      case compiler::utils::GroupCollective::OpKind::ScanInclusive:
-      case compiler::utils::GroupCollective::OpKind::ScanExclusive: {
-        auto *const ty = groupCall->getType();
-        auto *const accumulator =
-            compiler::utils::getIdentityVal(Info->Recurrence, ty);
-        return {block, accumulator, Info};
+      if (groupCall->hasName()) {
+        accum->takeName(groupCall);
       }
-      case compiler::utils::GroupCollective::OpKind::Broadcast: {
-        // First we need to get the item ID values from the barrier struct.
-        // These should be uniform but they may still be variables. It should
-        // be safe to get them from the barrier struct at index zero.
-        auto *const zero =
-            Constant::getNullValue(compiler::utils::getSizeType(module));
-
-        Function *const func = block->getParent();
-        BasicBlock *mainUniformBlock = block;
-        BasicBlock *tailUniformBlock = nullptr;
-
-        auto *const totalSize = barrierMain.getTotalSize();
-        if (auto *const loopLimitConst = dyn_cast<Constant>(totalSize)) {
-          // If we know for a fact that the main struct has at least one item,
-          // we can just use that. Otherwise, we need to use the tail struct.
-          if (loopLimitConst->isZeroValue()) {
-            mainUniformBlock = nullptr;
-            if (barrierTail) {
-              tailUniformBlock = block;
-            }
+      return std::make_tuple(loop_exit_block, accum, Info);
+    }
+    case compiler::utils::GroupCollective::OpKind::ScanInclusive:
+    case compiler::utils::GroupCollective::OpKind::ScanExclusive: {
+      auto *const ty = groupCall->getType();
+      auto *const accumulator =
+          compiler::utils::getIdentityVal(Info->Recurrence, ty);
+      return {block, accumulator, Info};
+    }
+    case compiler::utils::GroupCollective::OpKind::Broadcast: {
+      // First we need to get the item ID values from the barrier struct.
+      // These should be uniform but they may still be variables. It should
+      // be safe to get them from the barrier struct at index zero.
+      auto *const zero =
+          Constant::getNullValue(compiler::utils::getSizeType(module));
+
+      Function *const func = block->getParent();
+      BasicBlock *mainUniformBlock = block;
+      BasicBlock *tailUniformBlock = nullptr;
+
+      auto *const totalSize = barrierMain.getTotalSize();
+      if (auto *const loopLimitConst = dyn_cast<Constant>(totalSize)) {
+        // If we know for a fact that the main struct has at least one item,
+        // we can just use that. Otherwise, we need to use the tail struct.
+        if (loopLimitConst->isZeroValue()) {
+          mainUniformBlock = nullptr;
+          if (barrierTail) {
+            tailUniformBlock = block;
           }
-        } else if (barrierTail) {
-          // If we have a variable number of main items, it could be zero at
-          // runtime, so we need an alternative way to get the values.
-          mainUniformBlock =
-              BasicBlock::Create(context, "ca_main_uniform_load", func);
-          tailUniformBlock =
-              BasicBlock::Create(context, "ca_tail_uniform_load", func);
-
-          auto *const needTail = CmpInst::Create(
-              Instruction::ICmp, CmpInst::ICMP_EQ, totalSize, zero, "", block);
-          BranchInst::Create(tailUniformBlock, mainUniformBlock, needTail,
-                             block);
         }
+      } else if (barrierTail) {
+        // If we have a variable number of main items, it could be zero at
+        // runtime, so we need an alternative way to get the values.
+        mainUniformBlock =
+            BasicBlock::Create(context, "ca_main_uniform_load", func);
+        tailUniformBlock =
+            BasicBlock::Create(context, "ca_tail_uniform_load", func);
+
+        auto *const needTail = CmpInst::Create(
+            Instruction::ICmp, CmpInst::ICMP_EQ, totalSize, zero, "", block);
+        BranchInst::Create(tailUniformBlock, mainUniformBlock, needTail, block);
+      }
 
-        if (!mainUniformBlock && !tailUniformBlock) {
-          return {block, nullptr, std::nullopt};
-        }
+      if (!mainUniformBlock && !tailUniformBlock) {
+        return {block, nullptr, std::nullopt};
+      }
 
-        Value *idsMain[] = {zero, zero, zero};
-        Value *idsTail[] = {zero, zero, zero};
-        if (mainUniformBlock) {
-          idsMain[0] = groupCall->getOperand(2);
-          idsMain[1] = groupCall->getOperand(3);
-          idsMain[2] = groupCall->getOperand(4);
-          getUniformValues(mainUniformBlock, barrierMain, idsMain);
-        }
+      Value *idsMain[] = {zero, zero, zero};
+      Value *idsTail[] = {zero, zero, zero};
+      if (mainUniformBlock) {
+        idsMain[0] = groupCall->getOperand(2);
+        idsMain[1] = groupCall->getOperand(3);
+        idsMain[2] = groupCall->getOperand(4);
+        getUniformValues(mainUniformBlock, barrierMain, idsMain);
+      }
 
-        if (tailUniformBlock) {
-          auto *const tailGroupCall = barrierTail->getBarrierCall(barrierID);
-          assert(tailGroupCall &&
-                 "No corresponding work group broadcast in tail kernel");
-          idsTail[0] = tailGroupCall->getOperand(2);
-          idsTail[1] = tailGroupCall->getOperand(3);
-          idsTail[2] = tailGroupCall->getOperand(4);
-          getUniformValues(tailUniformBlock, *barrierTail, idsTail);
-
-          if (mainUniformBlock) {
-            // If both barrier structs had to be used, we need to merge the
-            // result.
-            block = BasicBlock::Create(context, "ca_merge_uniform_load", func);
-            BranchInst::Create(block, tailUniformBlock);
-            BranchInst::Create(block, mainUniformBlock);
-
-            for (size_t i = 0; i != 3; ++i) {
-              auto *mergePhi = PHINode::Create(idsMain[i]->getType(), 2,
-                                               "uniform_merge", block);
-              mergePhi->addIncoming(idsMain[i], mainUniformBlock);
-              mergePhi->addIncoming(idsTail[i], tailUniformBlock);
-              idsMain[i] = mergePhi;
-            }
-          } else {
-            // Otherwise we can use the tail.
-            for (size_t i = 0; i != 3; ++i) {
-              idsMain[i] = idsTail[i];
-            }
+      if (tailUniformBlock) {
+        auto *const tailGroupCall = barrierTail->getBarrierCall(barrierID);
+        assert(tailGroupCall &&
+               "No corresponding work group broadcast in tail kernel");
+        idsTail[0] = tailGroupCall->getOperand(2);
+        idsTail[1] = tailGroupCall->getOperand(3);
+        idsTail[2] = tailGroupCall->getOperand(4);
+        getUniformValues(tailUniformBlock, *barrierTail, idsTail);
+
+        if (mainUniformBlock) {
+          // If both barrier structs had to be used, we need to merge the
+          // result.
+          block = BasicBlock::Create(context, "ca_merge_uniform_load", func);
+          BranchInst::Create(block, tailUniformBlock);
+          BranchInst::Create(block, mainUniformBlock);
+
+          for (size_t i = 0; i != 3; ++i) {
+            auto *mergePhi = PHINode::Create(idsMain[i]->getType(), 2,
+                                             "uniform_merge", block);
+            mergePhi->addIncoming(idsMain[i], mainUniformBlock);
+            mergePhi->addIncoming(idsTail[i], tailUniformBlock);
+            idsMain[i] = mergePhi;
+          }
+        } else {
+          // Otherwise we can use the tail.
+          for (size_t i = 0; i != 3; ++i) {
+            idsMain[i] = idsTail[i];
           }
         }
+      }
 
-        IRBuilder<> ir(block);
-        auto *const op = groupCall->getOperand(1);
+      IRBuilder<> ir(block);
+      auto *const op = groupCall->getOperand(1);
 
-        // Compute the address of the value in the main barrier struct
-        auto *const VF = ir.CreateElementCount(
-            compiler::utils::getSizeType(module), barrierMain.getVFInfo().vf);
-        auto *const liveVars = createLiveVarsPtr(barrierMain, ir, idsMain[0],
-                                                 idsMain[1], idsMain[2], VF);
-        compiler::utils::Barrier::LiveValuesHelper live_values(barrierMain,
-                                                               block, liveVars);
-        auto *const GEPmain = live_values.getGEP(op);
-        assert(GEPmain && "Could not get broadcasted value");
+      // Compute the address of the value in the main barrier struct
+      auto *const VF = ir.CreateElementCount(
+          compiler::utils::getSizeType(module), barrierMain.getVFInfo().vf);
+      auto *const liveVars = createLiveVarsPtr(barrierMain, ir, idsMain[0],
+                                               idsMain[1], idsMain[2], VF);
+      compiler::utils::Barrier::LiveValuesHelper live_values(barrierMain, block,
+                                                             liveVars);
+      auto *const GEPmain = live_values.getGEP(op);
+      assert(GEPmain && "Could not get broadcasted value");
 
-        if (barrierTail) {
-          const bool VP = barrierTail->getVFInfo().IsVectorPredicated;
+      if (barrierTail) {
+        const bool VP = barrierTail->getVFInfo().IsVectorPredicated;
 
-          // Compute the address of the value in the tail barrier struct
-          auto *const offsetDim0 = ir.CreateSub(idsMain[0], mainLoopLimit);
-          auto *const liveVarsTail =
-              createLiveVarsPtr(*barrierTail, ir, offsetDim0, idsMain[1],
-                                idsMain[2], VP ? VF : nullptr);
-          compiler::utils::Barrier::LiveValuesHelper live_values(
-              *barrierTail, block, liveVarsTail);
+        // Compute the address of the value in the tail barrier struct
+        auto *const offsetDim0 = ir.CreateSub(idsMain[0], mainLoopLimit);
+        auto *const liveVarsTail =
+            createLiveVarsPtr(*barrierTail, ir, offsetDim0, idsMain[1],
+                              idsMain[2], VP ? VF : nullptr);
+        compiler::utils::Barrier::LiveValuesHelper live_values(
+            *barrierTail, block, liveVarsTail);
 
-          auto *const opTail =
-              barrierTail->getBarrierCall(barrierID)->getOperand(1);
-          auto *const GEPtail = live_values.getGEP(opTail);
-          assert(GEPtail && "Could not get tail-broadcasted value");
+        auto *const opTail =
+            barrierTail->getBarrierCall(barrierID)->getOperand(1);
+        auto *const GEPtail = live_values.getGEP(opTail);
+        assert(GEPtail && "Could not get tail-broadcasted value");
 
-          // Select the main GEP or the tail GEP to load from
-          auto *const cond = ir.CreateICmpUGE(idsMain[0], mainLoopLimit);
+        // Select the main GEP or the tail GEP to load from
+        auto *const cond = ir.CreateICmpUGE(idsMain[0], mainLoopLimit);
 
-          auto *const select = ir.CreateSelect(cond, GEPtail, GEPmain);
+        auto *const select = ir.CreateSelect(cond, GEPtail, GEPmain);
 
-          auto *const result = ir.CreateLoad(op->getType(), select);
-          result->takeName(groupCall);
+        auto *const result = ir.CreateLoad(op->getType(), select);
+        result->takeName(groupCall);
 
-          return {block, result, Info};
-        } else {
-          auto *const result = ir.CreateLoad(op->getType(), GEPmain);
-          result->takeName(groupCall);
-          return {block, result, Info};
-        }
+        return {block, result, Info};
+      } else {
+        auto *const result = ir.CreateLoad(op->getType(), GEPmain);
+        result->takeName(groupCall);
+        return {block, result, Info};
       }
-      default:
-        break;
+    }
+    default:
+      break;
     }
     return {block, nullptr, std::nullopt};
   }
@@ -1311,7 +1308,7 @@ void setUpLiveVarsAlloca(compiler::utils::BarrierWithLiveVars &barrier,
   }
 }
 
-}  // namespace
+} // namespace
 
 Function *compiler::utils::WorkItemLoopsPass::makeWrapperFunction(
     BarrierWithLiveVars &barrierMain, BarrierWithLiveVars *barrierTail,
@@ -1635,18 +1632,18 @@ Function *compiler::utils::WorkItemLoopsPass::makeWrapperFunction(
 
       auto *const exitBlock = [&]() {
         switch (barrierMain.getSchedule(i)) {
-          case BarrierSchedule::Unordered:
-          case BarrierSchedule::ScalarTail:
-            if (tailInfo && tailInfo->IsVectorPredicated) {
-              return schedule.makeLinearWorkItemLoops(block, i);
-            }
-            return schedule.makeWorkItemLoops(block, i);
+        case BarrierSchedule::Unordered:
+        case BarrierSchedule::ScalarTail:
+          if (tailInfo && tailInfo->IsVectorPredicated) {
+            return schedule.makeLinearWorkItemLoops(block, i);
+          }
+          return schedule.makeWorkItemLoops(block, i);
 
-          case BarrierSchedule::Once:
-            return schedule.makeRunOneWorkItem(block, i);
+        case BarrierSchedule::Once:
+          return schedule.makeRunOneWorkItem(block, i);
 
-          case BarrierSchedule::Linear:
-            return schedule.makeLinearWorkItemLoops(block, i);
+        case BarrierSchedule::Linear:
+          return schedule.makeLinearWorkItemLoops(block, i);
         }
 
         llvm_unreachable("Unexpected barrier schedule enum");
@@ -1773,8 +1770,8 @@ struct BarrierWrapperInfo {
   Function *SkippedTailF = nullptr;
 };
 
-PreservedAnalyses compiler::utils::WorkItemLoopsPass::run(
-    Module &M, ModuleAnalysisManager &MAM) {
+PreservedAnalyses
+compiler::utils::WorkItemLoopsPass::run(Module &M, ModuleAnalysisManager &MAM) {
   // Cache the functions we're interested in as this pass introduces new ones
   // which we don't want to run over.
   SmallVector<BarrierWrapperInfo, 4> MainTailPairs;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/pass.h
index ef5a8ad656fa0..d7e59337fc261 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/pass.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/pass.h
@@ -35,13 +35,13 @@ class ModulePass;
 class StringRef;
 class Module;
 class TargetMachine;
-}  // namespace llvm
+} // namespace llvm
 
 namespace compiler {
 namespace utils {
 class BuiltinInfo;
-}  // namespace utils
-}  // namespace compiler
+} // namespace utils
+} // namespace compiler
 
 namespace vecz {
 /// @addtogroup vecz
@@ -80,8 +80,8 @@ std::optional<VeczPassOptions> getReqdSubgroupSizeOpts(llvm::Function &);
 /// preferring ones which fit the known local work-group size and powers of
 /// two. The device's sub-group sizes can be sorted such that preferable sizes
 /// are placed towards the front.
-std::optional<VeczPassOptions> getAutoSubgroupSizeOpts(
-    llvm::Function &, llvm::ModuleAnalysisManager &);
+std::optional<VeczPassOptions>
+getAutoSubgroupSizeOpts(llvm::Function &, llvm::ModuleAnalysisManager &);
 
 /// @brief Analysis pass which determines on which functions @ref RunVeczPass
 /// should operate.
@@ -103,7 +103,7 @@ class VeczPassOptionsAnalysis
     return true;
   };
 
- public:
+public:
   VeczPassOptionsAnalysis() = default;
   /// @brief explicit constructor which uses the given callback to determine
   /// whether vectorization should be performed on the passed function. If the
@@ -123,7 +123,7 @@ class VeczPassOptionsPrinterPass
     : public llvm::PassInfoMixin<VeczPassOptionsPrinterPass> {
   llvm::raw_ostream &OS;
 
- public:
+public:
   explicit VeczPassOptionsPrinterPass(llvm::raw_ostream &OS) : OS(OS) {}
 
   llvm::PreservedAnalyses run(llvm::Module &, llvm::ModuleAnalysisManager &);
@@ -139,12 +139,12 @@ class VeczPassOptionsPrinterPass
 /// manager's ModuleAnalysisManager is configured with a custom @ref
 /// `VeczShouldRunOnFunctionAnalysis`
 class RunVeczPass : public llvm::PassInfoMixin<RunVeczPass> {
- public:
+public:
   /// @brief llvm's entry point for the PassManager
   llvm::PreservedAnalyses run(llvm::Module &, llvm::ModuleAnalysisManager &);
 };
 
 /// @}
-}  // namespace vecz
+} // namespace vecz
 
-#endif  // VECZ_PASS_H
+#endif // VECZ_PASS_H
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_choices.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_choices.h
index 231cc228830ee..64ed72c120d98 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_choices.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_choices.h
@@ -29,7 +29,7 @@
 namespace llvm {
 class StringRef;
 class Twine;
-}  // namespace llvm
+} // namespace llvm
 
 namespace vecz {
 
@@ -39,7 +39,7 @@ namespace vecz {
 /// related. Since they are not always the best choice for a given target, they
 /// are controlled at runtime by this class.
 class VectorizationChoices {
- public:
+public:
   VectorizationChoices();
   ~VectorizationChoices() = default;
 
@@ -274,7 +274,7 @@ class VectorizationChoices {
 
   static llvm::ArrayRef<ChoiceInfo> queryAvailableChoices();
 
- private:
+private:
   /// @brief All the choices enabled
   llvm::SmallSet<Choice, 2> Enabled;
 
@@ -290,5 +290,5 @@ class VectorizationChoices {
                                      llvm::Twine Msg);
 };
 
-}  // namespace vecz
-#endif  // VECZ_VECZ_CHOICES_H_INCLUDED
+} // namespace vecz
+#endif // VECZ_VECZ_CHOICES_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_target_info.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_target_info.h
index 97a91c9266c9d..490247e70c995 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_target_info.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_target_info.h
@@ -30,7 +30,7 @@ namespace llvm {
 class TargetMachine;
 class TargetTransformInfo;
 class Type;
-}  // namespace llvm
+} // namespace llvm
 
 namespace vecz {
 class VectorizationContext;
@@ -55,7 +55,7 @@ enum InterleavedOperation : int {
 /// @brief Used by the vectorizer to query for target capabilities and
 /// materialize memory intrinsics.
 class TargetInfo {
- public:
+public:
   /// @brief Create a new vector target info instance.
   /// @param[in] tm LLVM target machine that will be used for compilation, can
   /// be NULL if no target data is available.
@@ -192,9 +192,10 @@ class TargetInfo {
   /// @param[in] alignment Alignment of the load
   ///
   /// @return IR value that results from the interleaved vector store.
-  virtual llvm::Value *createInterleavedStore(
-      llvm::IRBuilder<> &builder, llvm::Value *data, llvm::Value *ptr,
-      llvm::Value *stride, llvm::Value *evl, unsigned alignment) const;
+  virtual llvm::Value *
+  createInterleavedStore(llvm::IRBuilder<> &builder, llvm::Value *data,
+                         llvm::Value *ptr, llvm::Value *stride,
+                         llvm::Value *evl, unsigned alignment) const;
 
   /// @brief Create a masked interleaved vector load.
   ///        Only lanes with a non-zero mask will be loaded from the address.
@@ -212,10 +213,11 @@ class TargetInfo {
   /// @param[in] alignment Alignment of the load
   ///
   /// @return IR value that results from the masked interleaved vector load.
-  virtual llvm::Value *createMaskedInterleavedLoad(
-      llvm::IRBuilder<> &builder, llvm::Type *ty, llvm::Value *ptr,
-      llvm::Value *mask, llvm::Value *stride, llvm::Value *evl,
-      unsigned alignment) const;
+  virtual llvm::Value *
+  createMaskedInterleavedLoad(llvm::IRBuilder<> &builder, llvm::Type *ty,
+                              llvm::Value *ptr, llvm::Value *mask,
+                              llvm::Value *stride, llvm::Value *evl,
+                              unsigned alignment) const;
 
   /// @brief Create a masked interleaved vector store.
   ///        Only lanes with a non-zero mask will be stored to the address.
@@ -233,10 +235,11 @@ class TargetInfo {
   /// @param[in] alignment Alignment of the load
   ///
   /// @return IR value that results from the masked interleaved vector store.
-  virtual llvm::Value *createMaskedInterleavedStore(
-      llvm::IRBuilder<> &builder, llvm::Value *data, llvm::Value *ptr,
-      llvm::Value *mask, llvm::Value *stride, llvm::Value *evl,
-      unsigned alignment) const;
+  virtual llvm::Value *
+  createMaskedInterleavedStore(llvm::IRBuilder<> &builder, llvm::Value *data,
+                               llvm::Value *ptr, llvm::Value *mask,
+                               llvm::Value *stride, llvm::Value *evl,
+                               unsigned alignment) const;
 
   /// @brief Create a gather vector load.
   ///        Vector lanes are loaded from different memory addresses.
@@ -314,9 +317,10 @@ class TargetInfo {
   /// @param[in] alignment Alignment of the store.
   ///
   /// @return IR value that results from the masked scatter vector store.
-  virtual llvm::Value *createMaskedScatterStore(
-      llvm::IRBuilder<> &builder, llvm::Value *data, llvm::Value *ptr,
-      llvm::Value *mask, llvm::Value *evl, unsigned alignment) const;
+  virtual llvm::Value *
+  createMaskedScatterStore(llvm::IRBuilder<> &builder, llvm::Value *data,
+                           llvm::Value *ptr, llvm::Value *mask,
+                           llvm::Value *evl, unsigned alignment) const;
 
   /// @brief Create a scalable extractelement instruction. Note that the
   /// operands are expected to have been pre-packetized before passing to this
@@ -349,9 +353,10 @@ class TargetInfo {
   /// @param[in] vector Vector to broadcast.
   /// @param[in] VL Vector length.
   /// @param[in] factor Broadcast factor.
-  virtual llvm::Value *createOuterScalableBroadcast(
-      llvm::IRBuilder<> &builder, llvm::Value *vector, llvm::Value *VL,
-      llvm::ElementCount factor) const;
+  virtual llvm::Value *
+  createOuterScalableBroadcast(llvm::IRBuilder<> &builder, llvm::Value *vector,
+                               llvm::Value *VL,
+                               llvm::ElementCount factor) const;
 
   /// @brief Create an inner broadcast of a vector. An inner broadcast is one
   /// where a vector with length V has its lanes individually and sequentially
@@ -364,9 +369,10 @@ class TargetInfo {
   /// @param[in] vector Vector to broadcast.
   /// @param[in] VL Vector length.
   /// @param[in] factor Broadcast factor.
-  virtual llvm::Value *createInnerScalableBroadcast(
-      llvm::IRBuilder<> &builder, llvm::Value *vector, llvm::Value *VL,
-      llvm::ElementCount factor) const;
+  virtual llvm::Value *
+  createInnerScalableBroadcast(llvm::IRBuilder<> &builder, llvm::Value *vector,
+                               llvm::Value *VL,
+                               llvm::ElementCount factor) const;
 
   /// @brief Utility function for packetizing an insertelement instruction by a
   /// scalable factor. Note that the operands are expected to have been
@@ -498,9 +504,10 @@ class TargetInfo {
   /// @param[in] width the widest SIMD width to consider
   /// @return the widest SIMD width that is expected to fit into registers, or
   ///         zero if the set can never fit into registers.
-  virtual unsigned estimateSimdWidth(
-      const llvm::TargetTransformInfo &TTI,
-      const llvm::ArrayRef<const llvm::Value *> vals, unsigned width) const;
+  virtual unsigned
+  estimateSimdWidth(const llvm::TargetTransformInfo &TTI,
+                    const llvm::ArrayRef<const llvm::Value *> vals,
+                    unsigned width) const;
 
   /// @brief Get the preferred vector width for the given scalar type
   ///
@@ -522,11 +529,11 @@ class TargetInfo {
   /// binary vp intrinsic.
   virtual bool isVPVectorLegal(const llvm::Function &F, llvm::Type *Ty) const;
 
- protected:
+protected:
   /// @brief This type indicates legality of a VP/Masked memory operation in a
   /// target.
   class VPMemOpLegality {
-   public:
+  public:
     constexpr VPMemOpLegality() = default;
     constexpr VPMemOpLegality(bool VPLegal, bool MaskLegal)
         : VPLegal(VPLegal), MaskLegal(MaskLegal) {}
@@ -545,7 +552,7 @@ class TargetInfo {
     /// operation.
     constexpr bool isMaskLegal() const { return MaskLegal; }
 
-   private:
+  private:
     bool VPLegal = false;
     bool MaskLegal = false;
   };
@@ -617,16 +624,17 @@ class TargetInfo {
   /// @brief LLVM target machine that will be used for compilation.
   llvm::TargetMachine *TM_;
 
- private:
+private:
   /// @brief Helper function to check legality of memory operations.
   ///
   /// @return Illegal in LLVM < 13 and check leagality in LLVM >= 13.
-  VPMemOpLegality checkMemOpLegality(
-      const llvm::Function *F,
-      llvm::function_ref<bool(const llvm::TargetTransformInfo &, llvm::Type *,
-                              unsigned, unsigned)>
-          Checker,
-      llvm::Type *Ty, unsigned Alignment, unsigned AddrSpace) const;
+  VPMemOpLegality
+  checkMemOpLegality(const llvm::Function *F,
+                     llvm::function_ref<bool(const llvm::TargetTransformInfo &,
+                                             llvm::Type *, unsigned, unsigned)>
+                         Checker,
+                     llvm::Type *Ty, unsigned Alignment,
+                     unsigned AddrSpace) const;
 
   /// @brief Create a broadcast of a vector.
   ///
@@ -646,7 +654,7 @@ class TargetInfo {
 class TargetInfoAnalysis : public llvm::AnalysisInfoMixin<TargetInfoAnalysis> {
   friend AnalysisInfoMixin<TargetInfoAnalysis>;
 
- public:
+public:
   struct Result {
     Result(std::unique_ptr<TargetInfo> &&I) : Info(std::move(I)) {}
     /// Handle the invalidation of this information.
@@ -681,7 +689,7 @@ class TargetInfoAnalysis : public llvm::AnalysisInfoMixin<TargetInfoAnalysis> {
   /// @brief Return the name of the pass.
   static llvm::StringRef name() { return "TargetInfo analysis"; }
 
- private:
+private:
   /// @brief Unique pass identifier.
   static llvm::AnalysisKey Key;
 
@@ -699,10 +707,10 @@ std::unique_ptr<TargetInfo> createTargetInfoRISCV(llvm::TargetMachine *tm);
 /// @param[in] tm LLVM target machine that will be used for compilation, can
 /// be NULL if no target data is available.
 /// @return The new TargetInfo instance.
-std::unique_ptr<TargetInfo> createTargetInfoFromTargetMachine(
-    llvm::TargetMachine *tm);
+std::unique_ptr<TargetInfo>
+createTargetInfoFromTargetMachine(llvm::TargetMachine *tm);
 
 /// @}
-}  // namespace vecz
+} // namespace vecz
 
-#endif  // VECZ_VECZ_TARGET_INFO_H_INCLUDED
+#endif // VECZ_VECZ_TARGET_INFO_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/divergence_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/divergence_analysis.cpp
index 64d99ee71e8c7..39c78b01a2bf4 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/divergence_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/divergence_analysis.cpp
@@ -38,7 +38,7 @@ using namespace llvm;
 
 namespace {
 using RPOT = ReversePostOrderTraversal<Function *>;
-}  // namespace
+} // namespace
 
 BlockQueue::BlockQueue(const DivergenceResult &dr,
                        const DenseSet<BasicBlock *> &blocks)
@@ -427,9 +427,11 @@ void DivergenceResult::markByAll(BasicBlock &src) {
           // If we are not in a loop, or the loop we live in does not diverge
           // nor does the one englobing us if it exists, then mark by_all.
           if (DLoopTag) {
-            if (DLoopTag->isLoopDivergent()) continue;
+            if (DLoopTag->isLoopDivergent())
+              continue;
             Loop *parentLoop = DLoopTag->loop->getParentLoop();
-            if (parentLoop && !isByAll(*parentLoop->getHeader())) continue;
+            if (parentLoop && !isByAll(*parentLoop->getHeader()))
+              continue;
           }
           queue.push(DIndex);
         }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/instantiation_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/instantiation_analysis.cpp
index e6076fcb634d7..f14239789e598 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/instantiation_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/instantiation_analysis.cpp
@@ -106,7 +106,7 @@ bool analyzeAlloca(const VectorizationContext &Ctx, AllocaInst *alloca) {
   const uint64_t align = alloca->getAlign().value();
   return (align != 0 && (memSize % align) != 0);
 }
-}  // namespace
+} // namespace
 
 namespace vecz {
 bool needsInstantiation(const VectorizationContext &Ctx, Instruction &I) {
@@ -132,4 +132,4 @@ bool needsInstantiation(const VectorizationContext &Ctx, Instruction &I) {
     return analyzeType(I.getType());
   }
 }
-}  // namespace vecz
+} // namespace vecz
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/liveness_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/liveness_analysis.cpp
index 2a41508892371..6bdcf9c295412 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/liveness_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/liveness_analysis.cpp
@@ -69,15 +69,15 @@ inline bool pushOnce(BlockLivenessInfo::LiveSet &s, Value *V) {
   return true;
 }
 
-}  // namespace
+} // namespace
 
 class LivenessResult::Impl {
- public:
+public:
   Impl(LivenessResult &lr) : LR(lr) {}
 
   void recalculate();
 
- private:
+private:
   LivenessResult &LR;
 
   void computeByVar(const BasicBlock &BB);
@@ -107,8 +107,8 @@ size_t LivenessResult::getMaxLiveVirtualRegisters() const {
   return maxNumberOfLiveValues;
 }
 
-const BlockLivenessInfo &LivenessResult::getBlockInfo(
-    const BasicBlock *BB) const {
+const BlockLivenessInfo &
+LivenessResult::getBlockInfo(const BasicBlock *BB) const {
   auto found = BlockInfos.find(BB);
   assert(found != BlockInfos.end() && "No liveness information for BasicBlock");
   return found->second;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/packetization_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/packetization_analysis.cpp
index 88eb38c4fe860..d5230a303e3c3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/packetization_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/packetization_analysis.cpp
@@ -41,7 +41,7 @@ bool isDivergenceReduction(const Function &F) {
   return (L.Consume(VectorizationContext::InternalBuiltinPrefix) &&
           L.Consume("divergence_"));
 }
-}  // namespace
+} // namespace
 
 llvm::AnalysisKey PacketizationAnalysis::Key;
 
@@ -169,8 +169,8 @@ void PacketizationAnalysisResult::markForPacketization(Value *V) {
   }
 }
 
-PacketizationAnalysisResult PacketizationAnalysis::run(
-    Function &F, llvm::FunctionAnalysisManager &AM) {
+PacketizationAnalysisResult
+PacketizationAnalysis::run(Function &F, llvm::FunctionAnalysisManager &AM) {
   auto &SAR = AM.getResult<StrideAnalysis>(F);
   return Result(F, SAR);
 }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/simd_width_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/simd_width_analysis.cpp
index f92ba7358ed51..9354efd65bb12 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/simd_width_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/simd_width_analysis.cpp
@@ -73,7 +73,7 @@ bool definedOrUsedInLoop(Value *V, Loop *L) {
   }
   return false;
 }
-}  // namespace
+} // namespace
 
 // Avoid Spill implementation. It focus on avoiding register spill by optimizing
 // register pressure.
@@ -166,8 +166,8 @@ unsigned SimdWidthAnalysis::avoidSpillImpl(Function &F,
   return SimdWidth;
 }
 
-SimdWidthAnalysis::Result SimdWidthAnalysis::run(
-    Function &F, llvm::FunctionAnalysisManager &AM) {
+SimdWidthAnalysis::Result
+SimdWidthAnalysis::run(Function &F, llvm::FunctionAnalysisManager &AM) {
   const TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
   const VectorizationUnit &VU =
       AM.getResult<VectorizationUnitAnalysis>(F).getVU();
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp
index f9f2e84a59958..0d24a43a81921 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp
@@ -111,7 +111,7 @@ bool isTrueUniformInternal(const Value *V, unsigned Depth) {
   return false;
 }
 
-}  // namespace
+} // namespace
 
 UniformValueResult::UniformValueResult(Function &F, VectorizationUnit &vu)
     : F(F), VU(vu), Ctx(VU.context()), dimension(VU.dimension()) {}
@@ -161,8 +161,9 @@ bool UniformValueResult::isTrueUniform(const Value *V) {
 /// @param[in] BI BuiltinInfo for platform-specific builtin IDs
 /// @return true if the instruction is a call to a reduction or broadcast
 /// builtin.
-static bool isGroupBroadcastOrReduction(
-    const Instruction &I, const compiler::utils::BuiltinInfo &BI) {
+static bool
+isGroupBroadcastOrReduction(const Instruction &I,
+                            const compiler::utils::BuiltinInfo &BI) {
   if (!isa<CallInst>(&I)) {
     return false;
   }
@@ -491,8 +492,9 @@ Value *UniformValueResult::extractMemBase(Value *Address) {
 
 llvm::AnalysisKey UniformValueAnalysis::Key;
 
-UniformValueResult UniformValueAnalysis::run(
-    llvm::Function &F, llvm::FunctionAnalysisManager &AM) {
+UniformValueResult
+UniformValueAnalysis::run(llvm::Function &F,
+                          llvm::FunctionAnalysisManager &AM) {
   VectorizationUnit &VU = AM.getResult<VectorizationUnitAnalysis>(F).getVU();
   UniformValueResult Res(F, VU);
   std::vector<Value *> Roots;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorizable_function_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorizable_function_analysis.cpp
index 7365e339c1a86..edf0101ba883a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorizable_function_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorizable_function_analysis.cpp
@@ -51,14 +51,14 @@ namespace {
 bool canVectorize(const Instruction &I, const VectorizationContext &Ctx) {
   // Certain instructions just cannot appear.
   switch (I.getOpcode()) {
-    default:
-      break;
-    case Instruction::IndirectBr:
-    case Instruction::VAArg:
-    case Instruction::Invoke:
-    case Instruction::Resume:
-    case Instruction::LandingPad:
-      return false;
+  default:
+    break;
+  case Instruction::IndirectBr:
+  case Instruction::VAArg:
+  case Instruction::Invoke:
+  case Instruction::Resume:
+  case Instruction::LandingPad:
+    return false;
   }
 
   // User function calls.
@@ -120,10 +120,11 @@ bool canVectorize(const Function &F, const VectorizationContext &Ctx) {
   return true;
 }
 
-}  // namespace
+} // namespace
 
-VectorizableFunctionAnalysis::Result VectorizableFunctionAnalysis::run(
-    llvm::Function &F, llvm::FunctionAnalysisManager &AM) {
+VectorizableFunctionAnalysis::Result
+VectorizableFunctionAnalysis::run(llvm::Function &F,
+                                  llvm::FunctionAnalysisManager &AM) {
   Result res;
   auto &Ctx = AM.getResult<VectorizationContextAnalysis>(F).getContext();
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorization_unit_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorization_unit_analysis.cpp
index f53afec4161fe..484da6c6c8eae 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorization_unit_analysis.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorization_unit_analysis.cpp
@@ -22,8 +22,9 @@ using namespace vecz;
 
 llvm::AnalysisKey VectorizationUnitAnalysis::Key;
 
-VectorizationUnitAnalysis::Result VectorizationUnitAnalysis::run(
-    llvm::Function &F, llvm::FunctionAnalysisManager &) {
+VectorizationUnitAnalysis::Result
+VectorizationUnitAnalysis::run(llvm::Function &F,
+                               llvm::FunctionAnalysisManager &) {
   return Result{Ctx.getActiveVU(&F)};
 }
 
@@ -32,7 +33,8 @@ VectorizationUnitAnalysis::Result VectorizationUnitAnalysis::run(
 
 llvm::AnalysisKey VectorizationContextAnalysis::Key;
 
-VectorizationContextAnalysis::Result VectorizationContextAnalysis::run(
-    llvm::Function &, llvm::FunctionAnalysisManager &) {
+VectorizationContextAnalysis::Result
+VectorizationContextAnalysis::run(llvm::Function &,
+                                  llvm::FunctionAnalysisManager &) {
   return Result{Context};
 }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_boscc.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_boscc.cpp
index ea693925d67a3..b6099bad61731 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_boscc.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_boscc.cpp
@@ -73,7 +73,7 @@ bool isTrivialBlock(const BasicBlock &BB) {
   return true;
 }
 
-}  // namespace
+} // namespace
 
 /// @brief Check whether a uniform region is viable and worth keeping.
 /// @param[in] region the region to check
@@ -1083,9 +1083,8 @@ bool ControlFlowConversionState::BOSCCGadget::blendFinalize() {
       for (Instruction &I : *connectionPoint) {
         if (PHINode *PHI = dyn_cast<PHINode>(&I)) {
           const int idx = PHI->getBasicBlockIndex(target);
-          VECZ_ERROR_IF(idx == -1,
-                        "Connection point PHIs must have incoming "
-                        "block from the target");
+          VECZ_ERROR_IF(idx == -1, "Connection point PHIs must have incoming "
+                                   "block from the target");
           if (Instruction *incoming =
                   dyn_cast<Instruction>(PHI->getIncomingValue(idx))) {
             LLVM_DEBUG(dbgs()
@@ -1173,8 +1172,8 @@ void ControlFlowConversionState::BOSCCGadget::addInRegions(BasicBlock *newB,
   }
 }
 
-Value *ControlFlowConversionState::BOSCCGadget::getUniformV(
-    Value *predicatedV) {
+Value *
+ControlFlowConversionState::BOSCCGadget::getUniformV(Value *predicatedV) {
   auto uniformVIt = VMap.find(predicatedV);
   if (uniformVIt != VMap.end()) {
     return uniformVIt->second;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_roscc.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_roscc.cpp
index d0174fb3ba4bc..02f6e9e68ca9b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_roscc.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_roscc.cpp
@@ -64,7 +64,7 @@ bool isReturnBlock(const llvm::BasicBlock &BB) {
 
   return isa<ReturnInst>(T);
 }
-}  // namespace
+} // namespace
 
 bool ControlFlowConversionState::ROSCCGadget::run(Function &F) {
   bool changed = false;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/debugging.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/debugging.cpp
index 1ddb912ca5a55..9d30786cf3d39 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/debugging.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/debugging.cpp
@@ -85,4 +85,4 @@ void emitVeczRemark(const Function *F, const Value *V, StringRef Msg) {
 void emitVeczRemark(const Function *F, StringRef Msg) {
   emitVeczRemark(F, nullptr, Msg);
 }
-}  // namespace vecz
+} // namespace vecz
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/control_flow_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/control_flow_analysis.h
index f6409f00bb7a8..f538de0e6bed4 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/control_flow_analysis.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/control_flow_analysis.h
@@ -23,7 +23,7 @@
 
 namespace llvm {
 class BasicBlock;
-}  // namespace llvm
+} // namespace llvm
 
 namespace vecz {
 
@@ -68,7 +68,7 @@ struct CFGResult {
 /// @brief Analysis that determines whether a function can have divergent
 /// control flow and so whether CFG conversion is needed or not.
 class CFGAnalysis : public llvm::AnalysisInfoMixin<CFGAnalysis> {
- public:
+public:
   /// @brief Create a new CFG analysis object.
   CFGAnalysis() = default;
 
@@ -87,12 +87,12 @@ class CFGAnalysis : public llvm::AnalysisInfoMixin<CFGAnalysis> {
   /// @brief Analysis name.
   static llvm::StringRef name() { return "CFG analysis"; }
 
- private:
+private:
   friend llvm::AnalysisInfoMixin<CFGAnalysis>;
   /// @brief Unique identifier for the analysis.
   static llvm::AnalysisKey Key;
 };
 
-}  // namespace vecz
+} // namespace vecz
 
-#endif  // VECZ_ANALYSIS_CONTROL_FLOW_ANALYSIS_H_INCLUDED
+#endif // VECZ_ANALYSIS_CONTROL_FLOW_ANALYSIS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/divergence_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/divergence_analysis.h
index 8027f90d742fe..cb66e38ba1bde 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/divergence_analysis.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/divergence_analysis.h
@@ -35,7 +35,7 @@
 namespace llvm {
 class BasicBlock;
 class Loop;
-}  // namespace llvm
+} // namespace llvm
 
 namespace vecz {
 struct BasicBlockTag;
@@ -208,7 +208,7 @@ using DivergenceInfo = llvm::DenseSet<llvm::BasicBlock *>;
 
 /// @brief Holds the result of Divergence Analysis for a given function.
 class DivergenceResult {
- public:
+public:
   /// @brief Create a new DA result for the given unit.
   /// @param[in] AM FunctionAnalysisManager providing analyses.
   DivergenceResult(llvm::Function &F, llvm::FunctionAnalysisManager &AM);
@@ -379,7 +379,7 @@ class DivergenceResult {
     return divCausingBlocks;
   }
 
- private:
+private:
   friend class DivergenceAnalysis;
 
   /// @brief Mark a block div causing and mark blocks that are control dependent
@@ -452,7 +452,7 @@ class DivergenceResult {
 class DivergenceAnalysis : public llvm::AnalysisInfoMixin<DivergenceAnalysis> {
   friend llvm::AnalysisInfoMixin<DivergenceAnalysis>;
 
- public:
+public:
   /// @brief Create a new analysis object.
   DivergenceAnalysis() = default;
 
@@ -471,10 +471,10 @@ class DivergenceAnalysis : public llvm::AnalysisInfoMixin<DivergenceAnalysis> {
   /// @brief Return the name of the pass.
   static llvm::StringRef name() { return "Divergence analysis"; }
 
- private:
+private:
   /// @brief Unique identifier for the pass.
   static llvm::AnalysisKey Key;
 };
-}  // namespace vecz
+} // namespace vecz
 
-#endif  // VECZ_ANALYSIS_DIVERGENCE_ANALYSIS_H_INCLUDED
+#endif // VECZ_ANALYSIS_DIVERGENCE_ANALYSIS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/instantiation_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/instantiation_analysis.h
index 6859f39690193..daf31e624a35d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/instantiation_analysis.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/instantiation_analysis.h
@@ -19,7 +19,7 @@
 
 namespace llvm {
 class Instruction;
-}  // namespace llvm
+} // namespace llvm
 
 namespace vecz {
 class VectorizationContext;
@@ -31,6 +31,6 @@ class VectorizationContext;
 ///
 /// @return true iff the instruction requires instantiation.
 bool needsInstantiation(const VectorizationContext &Ctx, llvm::Instruction &I);
-}  // namespace vecz
+} // namespace vecz
 
-#endif  // VECZ_ANALYSIS_INSTANTIATION_ANALYSIS_H_INCLUDED
+#endif // VECZ_ANALYSIS_INSTANTIATION_ANALYSIS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/liveness_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/liveness_analysis.h
index d9d8d7fb264de..e36188b41dff5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/liveness_analysis.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/liveness_analysis.h
@@ -32,7 +32,7 @@ class LoopInfo;
 class Function;
 class BasicBlock;
 class Value;
-}  // namespace llvm
+} // namespace llvm
 
 namespace vecz {
 class VectorizationUnit;
@@ -46,7 +46,7 @@ struct BlockLivenessInfo {
 };
 
 class LivenessResult {
- public:
+public:
   LivenessResult(llvm::Function &F) : F(F) {}
 
   LivenessResult() = delete;
@@ -59,7 +59,7 @@ class LivenessResult {
   size_t getMaxLiveVirtualRegisters() const;
   const BlockLivenessInfo &getBlockInfo(const llvm::BasicBlock *) const;
 
- private:
+private:
   class Impl;
 
   llvm::Function &F;
@@ -76,7 +76,7 @@ class LivenessResult {
 class LivenessAnalysis : public llvm::AnalysisInfoMixin<LivenessAnalysis> {
   friend llvm::AnalysisInfoMixin<LivenessAnalysis>;
 
- public:
+public:
   using Result = LivenessResult;
 
   LivenessAnalysis() = default;
@@ -95,6 +95,6 @@ class LivenessAnalysis : public llvm::AnalysisInfoMixin<LivenessAnalysis> {
   static llvm::AnalysisKey Key;
 };
 
-}  // namespace vecz
+} // namespace vecz
 
-#endif  // VECZ_ANALYSIS_LIVENESS_ANALYSIS_H
+#endif // VECZ_ANALYSIS_LIVENESS_ANALYSIS_H
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/packetization_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/packetization_analysis.h
index f87fe45434a5f..ddd8c97d0c8f2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/packetization_analysis.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/packetization_analysis.h
@@ -29,7 +29,7 @@
 namespace llvm {
 class Function;
 class Value;
-}  // namespace llvm
+} // namespace llvm
 
 namespace vecz {
 
@@ -38,7 +38,7 @@ struct UniformValueResult;
 
 /// @brief Holds the result of Packetization Analysis for a given function.
 class PacketizationAnalysisResult {
- public:
+public:
   /// @brief The function being analyzed
   llvm::Function &F;
   /// @brief The Stride Analysis Result to use during analysis
@@ -64,7 +64,7 @@ class PacketizationAnalysisResult {
     return toPacketize.contains(V);
   }
 
- private:
+private:
   void markForPacketization(llvm::Value *V);
 
   /// @brief The set of instructions that need to be packetized.
@@ -79,7 +79,7 @@ class PacketizationAnalysis
     : public llvm::AnalysisInfoMixin<PacketizationAnalysis> {
   friend AnalysisInfoMixin<PacketizationAnalysis>;
 
- public:
+public:
   /// @brief Create a new analysis object.
   PacketizationAnalysis() {}
 
@@ -96,11 +96,11 @@ class PacketizationAnalysis
   /// @brief Return the name of the pass.
   static llvm::StringRef name() { return "Packetization analysis"; }
 
- private:
+private:
   /// @brief Unique identifier for the pass.
   static llvm::AnalysisKey Key;
 };
 
-}  // namespace vecz
+} // namespace vecz
 
-#endif  // VECZ_ANALYSIS_PACKETIZATION_ANALYSIS_H_INCLUDED
+#endif // VECZ_ANALYSIS_PACKETIZATION_ANALYSIS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/simd_width_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/simd_width_analysis.h
index 55f79f9866c8f..bee7f0f1c0046 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/simd_width_analysis.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/simd_width_analysis.h
@@ -34,7 +34,7 @@ class LivenessResult;
 class SimdWidthAnalysis : public llvm::AnalysisInfoMixin<SimdWidthAnalysis> {
   friend AnalysisInfoMixin<SimdWidthAnalysis>;
 
- public:
+public:
   /// @brief Create a new instance of the pass.
   SimdWidthAnalysis() = default;
 
@@ -53,7 +53,7 @@ class SimdWidthAnalysis : public llvm::AnalysisInfoMixin<SimdWidthAnalysis> {
   /// @brief Return the name of the pass.
   static llvm::StringRef name() { return "SIMD width analysis"; }
 
- private:
+private:
   unsigned avoidSpillImpl(llvm::Function &, llvm::FunctionAnalysisManager &,
                           unsigned MinWidth = 2);
 
@@ -63,6 +63,6 @@ class SimdWidthAnalysis : public llvm::AnalysisInfoMixin<SimdWidthAnalysis> {
   /// @brief Unique pass identifier.
   static llvm::AnalysisKey Key;
 };
-}  // namespace vecz
+} // namespace vecz
 
-#endif  // VECZ_ANALYSIS_SIMD_WIDTH_ANALYSIS_H_INCLUDED
+#endif // VECZ_ANALYSIS_SIMD_WIDTH_ANALYSIS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/stride_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/stride_analysis.h
index 7f5a1cbaf7293..8b6d641e29681 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/stride_analysis.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/stride_analysis.h
@@ -32,7 +32,7 @@
 namespace llvm {
 class Function;
 class Value;
-}  // namespace llvm
+} // namespace llvm
 
 namespace vecz {
 
@@ -40,7 +40,7 @@ struct UniformValueResult;
 
 /// @brief Holds the result of Stride Analysis for a given function.
 class StrideAnalysisResult {
- public:
+public:
   /// @brief The function being analyzed
   llvm::Function &F;
   /// @brief The Uniform Value Result to use during analysis
@@ -91,7 +91,7 @@ class StrideAnalysisResult {
   llvm::Value *buildMemoryStride(llvm::IRBuilder<> &B, llvm::Value *Ptr,
                                  llvm::Type *EleTy) const;
 
- private:
+private:
   /// @brief A map of values onto OffsetInfos that were already analyzed.
   llvm::DenseMap<llvm::Value *, OffsetInfo> analyzed;
 };
@@ -101,7 +101,7 @@ class StrideAnalysisResult {
 class StrideAnalysis : public llvm::AnalysisInfoMixin<StrideAnalysis> {
   friend AnalysisInfoMixin<StrideAnalysis>;
 
- public:
+public:
   /// @brief Create a new analysis object.
   StrideAnalysis() {}
 
@@ -118,7 +118,7 @@ class StrideAnalysis : public llvm::AnalysisInfoMixin<StrideAnalysis> {
   /// @brief Return the name of the pass.
   static llvm::StringRef name() { return "Stride analysis"; }
 
- private:
+private:
   /// @brief Unique identifier for the pass.
   static llvm::AnalysisKey Key;
 };
@@ -129,13 +129,13 @@ class StrideAnalysisPrinterPass
     : public llvm::PassInfoMixin<StrideAnalysisPrinterPass> {
   llvm::raw_ostream &OS;
 
- public:
+public:
   explicit StrideAnalysisPrinterPass(llvm::raw_ostream &OS) : OS(OS) {}
 
   llvm::PreservedAnalyses run(llvm::Function &F,
                               llvm::FunctionAnalysisManager &AM);
 };
 
-}  // namespace vecz
+} // namespace vecz
 
-#endif  // VECZ_ANALYSIS_STRIDE_ANALYSIS_H_INCLUDED
+#endif // VECZ_ANALYSIS_STRIDE_ANALYSIS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/uniform_value_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/uniform_value_analysis.h
index b0a083ad69b5e..a221e6cba1447 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/uniform_value_analysis.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/uniform_value_analysis.h
@@ -31,7 +31,7 @@
 namespace llvm {
 class Value;
 class Instruction;
-}  // namespace llvm
+} // namespace llvm
 
 namespace vecz {
 
@@ -171,7 +171,7 @@ class UniformValueAnalysis
     : public llvm::AnalysisInfoMixin<UniformValueAnalysis> {
   friend AnalysisInfoMixin<UniformValueAnalysis>;
 
- public:
+public:
   /// @brief Create a new analysis object.
   UniformValueAnalysis() {}
 
@@ -190,11 +190,11 @@ class UniformValueAnalysis
   /// @brief Return the name of the pass.
   static llvm::StringRef name() { return "Uniform value analysis"; }
 
- private:
+private:
   /// @brief Unique identifier for the pass.
   static llvm::AnalysisKey Key;
 };
 
-}  // namespace vecz
+} // namespace vecz
 
-#endif  // VECZ_ANALYSIS_UNIFORM_VALUE_RANGE_ANALYSIS_H_INCLUDED
+#endif // VECZ_ANALYSIS_UNIFORM_VALUE_RANGE_ANALYSIS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorizable_function_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorizable_function_analysis.h
index e82d297c4d5ec..6bc813caeea0e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorizable_function_analysis.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorizable_function_analysis.h
@@ -35,7 +35,7 @@ class VectorizableFunctionAnalysis
     : public llvm::AnalysisInfoMixin<VectorizableFunctionAnalysis> {
   friend AnalysisInfoMixin<VectorizableFunctionAnalysis>;
 
- public:
+public:
   /// @brief Create a new instance of the pass.
   VectorizableFunctionAnalysis() = default;
 
@@ -61,11 +61,11 @@ class VectorizableFunctionAnalysis
   /// @brief Return the name of the pass.
   static llvm::StringRef name() { return "Vectorizable Function analysis"; }
 
- private:
+private:
   /// @brief Unique pass identifier.
   static llvm::AnalysisKey Key;
 };
 
-}  // namespace vecz
+} // namespace vecz
 
-#endif  // VECZ_ANALYSIS_VECTORIZABLE_FUNCTION_ANALYSIS_H_INCLUDED
+#endif // VECZ_ANALYSIS_VECTORIZABLE_FUNCTION_ANALYSIS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorization_unit_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorization_unit_analysis.h
index a2f60888635a9..7244236587d2f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorization_unit_analysis.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorization_unit_analysis.h
@@ -36,7 +36,7 @@ class VectorizationUnitAnalysis
     : public llvm::AnalysisInfoMixin<VectorizationUnitAnalysis> {
   friend AnalysisInfoMixin<VectorizationUnitAnalysis>;
 
- public:
+public:
   /// @brief Create a new instance of the pass.
   VectorizationUnitAnalysis(const VectorizationContext &Ctx) : Ctx(Ctx) {}
 
@@ -44,7 +44,7 @@ class VectorizationUnitAnalysis
   class Result {
     VectorizationUnit *VU = nullptr;
 
-   public:
+  public:
     Result() = default;
     Result(VectorizationUnit *VU) : VU(VU) {}
     VectorizationUnit &getVU() {
@@ -70,7 +70,7 @@ class VectorizationUnitAnalysis
   /// @brief Return the name of the pass.
   static llvm::StringRef name() { return "VectorizationUnit analysis"; }
 
- private:
+private:
   const VectorizationContext &Ctx;
   /// @brief Unique pass identifier.
   static llvm::AnalysisKey Key;
@@ -81,7 +81,7 @@ class VectorizationContextAnalysis
     : public llvm::AnalysisInfoMixin<VectorizationContextAnalysis> {
   friend AnalysisInfoMixin<VectorizationContextAnalysis>;
 
- public:
+public:
   /// @brief Create a new instance of the pass.
   VectorizationContextAnalysis(VectorizationContext &Ctx) : Context(Ctx) {}
 
@@ -89,7 +89,7 @@ class VectorizationContextAnalysis
   class Result {
     VectorizationContext &Ctx;
 
-   public:
+  public:
     Result(VectorizationContext &Ctx) : Ctx(Ctx) {}
     VectorizationContext &getContext() { return Ctx; }
     const VectorizationContext &getContext() const { return Ctx; }
@@ -111,11 +111,11 @@ class VectorizationContextAnalysis
   /// @brief Return the name of the pass.
   static llvm::StringRef name() { return "VectorizationContext analysis"; }
 
- private:
+private:
   VectorizationContext &Context;
   /// @brief Unique pass identifier.
   static llvm::AnalysisKey Key;
 };
-}  // namespace vecz
+} // namespace vecz
 
-#endif  // VECZ_ANALYSIS_VECTORIZATION_UNIT_H_INCLUDED
+#endif // VECZ_ANALYSIS_VECTORIZATION_UNIT_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/control_flow_boscc.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/control_flow_boscc.h
index 5cb6ca77eeefd..cad9caaa7bead 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/control_flow_boscc.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/control_flow_boscc.h
@@ -39,23 +39,17 @@ class Instruction;
 class BasicBlock;
 class Function;
 class Loop;
-}  // namespace llvm
+} // namespace llvm
 
 namespace vecz {
 
 class LivenessResult;
 
 class ControlFlowConversionState::BOSCCGadget final {
- public:
+public:
   BOSCCGadget(ControlFlowConversionState &Pass)
-      : PassState(Pass),
-        F(Pass.F),
-        AM(Pass.AM),
-        DT(Pass.DT),
-        PDT(Pass.PDT),
-        LI(Pass.LI),
-        DR(Pass.DR),
-        RC(Pass.RC.get()) {}
+      : PassState(Pass), F(Pass.F), AM(Pass.AM), DT(Pass.DT), PDT(Pass.PDT),
+        LI(Pass.LI), DR(Pass.DR), RC(Pass.RC.get()) {}
 
   /// @brief Region of code that will remain uniform after vectorization.
   ///
@@ -182,7 +176,7 @@ class ControlFlowConversionState::BOSCCGadget final {
   /// @return true if no problem occured, false otherwise.
   bool cleanUp();
 
- private:
+private:
   ControlFlowConversionState &PassState;
   llvm::Function &F;
   llvm::FunctionAnalysisManager &AM;
@@ -268,6 +262,6 @@ class ControlFlowConversionState::BOSCCGadget final {
   /// @returns true if no errors occurred.
   bool computeBlockOrdering();
 };
-}  // namespace vecz
+} // namespace vecz
 
-#endif  // VECZ_CONTROL_FLOW_BOSCC_H_INCLUDED
+#endif // VECZ_CONTROL_FLOW_BOSCC_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/control_flow_roscc.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/control_flow_roscc.h
index 3ed4dd469f797..187299c997307 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/control_flow_roscc.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/control_flow_roscc.h
@@ -30,7 +30,7 @@ namespace llvm {
 class Instruction;
 class BasicBlock;
 class Loop;
-}  // namespace llvm
+} // namespace llvm
 
 namespace vecz {
 
@@ -38,19 +38,19 @@ namespace vecz {
 ///        "Return On Superword Condition Code" and optimizes non-uniform
 ///        branches to the function return block(s).
 class ControlFlowConversionState::ROSCCGadget final {
- public:
+public:
   ROSCCGadget(ControlFlowConversionState &Pass)
       : UVR(Pass.UVR), DT(Pass.DT), PDT(Pass.PDT), LI(Pass.LI) {}
 
   /// @brief perform the ROSCC transformation
   bool run(llvm::Function &F);
 
- private:
+private:
   UniformValueResult *UVR = nullptr;
   llvm::DominatorTree *DT = nullptr;
   llvm::PostDominatorTree *PDT = nullptr;
   llvm::LoopInfo *LI = nullptr;
 };
-}  // namespace vecz
+} // namespace vecz
 
-#endif  // VECZ_CONTROL_FLOW_ROSCC_H_INCLUDED
+#endif // VECZ_CONTROL_FLOW_ROSCC_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/debugging.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/debugging.h
index 6faa2f2b52f15..0be9fa33de99a 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/debugging.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/debugging.h
@@ -46,23 +46,13 @@ struct VeczFailResult {
   /// @brief For functions that return a boolean value
   operator bool() const { return false; }
   /// @brief For functions that return a pointer
-  template <typename T>
-  operator T *() const {
-    return nullptr;
-  }
+  template <typename T> operator T *() const { return nullptr; }
   /// @brief For functions that return an std::shared_ptr
-  template <typename T>
-  operator std::shared_ptr<T>() const {
-    return nullptr;
-  }
+  template <typename T> operator std::shared_ptr<T>() const { return nullptr; }
   /// @brief For functions that return an std::unique_ptr
-  template <typename T>
-  operator std::unique_ptr<T>() const {
-    return nullptr;
-  }
+  template <typename T> operator std::unique_ptr<T>() const { return nullptr; }
   /// @brief For functions that return an llvm::Optional
-  template <typename T>
-  operator std::optional<T>() const {
+  template <typename T> operator std::optional<T>() const {
     return std::nullopt;
   }
 
@@ -111,36 +101,36 @@ struct AnalysisFailResult : public internal::VeczFailResult {
 
 #define VECZ_FAIL() return vecz::internal::VeczFailResult()
 
-#define VECZ_FAIL_IF(cond) \
-  do {                     \
-    if (cond) {            \
-      VECZ_FAIL();         \
-    }                      \
+#define VECZ_FAIL_IF(cond)                                                     \
+  do {                                                                         \
+    if (cond) {                                                                \
+      VECZ_FAIL();                                                             \
+    }                                                                          \
   } while (false)
 
-#define VECZ_STAT_FAIL_IF(cond, stat) \
-  do {                                \
-    if (cond) {                       \
-      ++stat;                         \
-      VECZ_FAIL();                    \
-    }                                 \
+#define VECZ_STAT_FAIL_IF(cond, stat)                                          \
+  do {                                                                         \
+    if (cond) {                                                                \
+      ++stat;                                                                  \
+      VECZ_FAIL();                                                             \
+    }                                                                          \
   } while (false)
 
-#define VECZ_ERROR_IF(cond, message) \
-  do {                               \
-    if (cond) {                      \
-      VECZ_ERROR(message);           \
-    }                                \
+#define VECZ_ERROR_IF(cond, message)                                           \
+  do {                                                                         \
+    if (cond) {                                                                \
+      VECZ_ERROR(message);                                                     \
+    }                                                                          \
   } while (false)
 
 #ifdef NDEBUG
 
-#define VECZ_ERROR(message)                                             \
-  do {                                                                  \
-    llvm::errs() << "!! Vecz: ERROR in " << __FILE__ << ":" << __LINE__ \
-                 << "\n";                                               \
-    llvm::errs() << "!! Reason: " << message << "\n";                   \
-    VECZ_FAIL();                                                        \
+#define VECZ_ERROR(message)                                                    \
+  do {                                                                         \
+    llvm::errs() << "!! Vecz: ERROR in " << __FILE__ << ":" << __LINE__        \
+                 << "\n";                                                      \
+    llvm::errs() << "!! Reason: " << message << "\n";                          \
+    VECZ_FAIL();                                                               \
   } while (false)
 
 #define VECZ_WARN_IF(cond, message) /* Nothing */
@@ -148,32 +138,32 @@ struct AnalysisFailResult : public internal::VeczFailResult {
 
 #else /* !NDEBUG */
 
-#define VECZ_ERROR(message)                                             \
-  do {                                                                  \
-    llvm::errs() << "!! Vecz: ERROR in " << __FILE__ << ":" << __LINE__ \
-                 << "\n";                                               \
-    llvm::errs() << "!! Reason: " << (message) << "\n";                 \
-    std::abort();                                                       \
+#define VECZ_ERROR(message)                                                    \
+  do {                                                                         \
+    llvm::errs() << "!! Vecz: ERROR in " << __FILE__ << ":" << __LINE__        \
+                 << "\n";                                                      \
+    llvm::errs() << "!! Reason: " << (message) << "\n";                        \
+    std::abort();                                                              \
   } while (false)
 
-#define VECZ_WARN_IF(cond, message)                                         \
-  do {                                                                      \
-    if (cond) {                                                             \
-      llvm::errs() << "!! Vecz: WARNING in " << __FILE__ << ":" << __LINE__ \
-                   << "\n";                                                 \
-      llvm::errs() << "!! Reason: " << (message) << "\n";                   \
-    }                                                                       \
+#define VECZ_WARN_IF(cond, message)                                            \
+  do {                                                                         \
+    if (cond) {                                                                \
+      llvm::errs() << "!! Vecz: WARNING in " << __FILE__ << ":" << __LINE__    \
+                   << "\n";                                                    \
+      llvm::errs() << "!! Reason: " << (message) << "\n";                      \
+    }                                                                          \
   } while (false)
 
-#define VECZ_UNREACHABLE(message)                                         \
-  do {                                                                    \
-    llvm::errs() << "!! Vecz: UNREACHABLE reached in " << __FILE__ << ":" \
-                 << __LINE__ << "\n";                                     \
-    llvm::errs() << "!! Message: " << (message) << "\n";                  \
-    std::abort();                                                         \
+#define VECZ_UNREACHABLE(message)                                              \
+  do {                                                                         \
+    llvm::errs() << "!! Vecz: UNREACHABLE reached in " << __FILE__ << ":"      \
+                 << __LINE__ << "\n";                                          \
+    llvm::errs() << "!! Message: " << (message) << "\n";                       \
+    std::abort();                                                              \
   } while (false)
 #endif /* NDEBUG */
-}  // namespace internal
+} // namespace internal
 
 #define VECZ_UNUSED(x) ((void)(x))
 
@@ -205,6 +195,6 @@ void emitVeczRemark(const llvm::Function *F, const llvm::Value *V,
 /// @param[in] Msg The main remark message text
 void emitVeczRemark(const llvm::Function *F, llvm::StringRef Msg);
 
-}  // namespace vecz
+} // namespace vecz
 
-#endif  // VECZ_DEBUGGING_H_INCLUDED
+#endif // VECZ_DEBUGGING_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/ir_cleanup.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/ir_cleanup.h
index bf971807ee2bd..1321237311322 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/ir_cleanup.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/ir_cleanup.h
@@ -25,7 +25,7 @@ class Instruction;
 
 namespace vecz {
 class IRCleanup {
- public:
+public:
   /// @brief Mark the instruction as needing deletion. It will only be deleted
   /// if it is unused. This is used to mark instructions with side-effects
   /// (e.g. call, load, store and leaves) that have been replaced and are no
@@ -42,11 +42,11 @@ class IRCleanup {
   /// @param[in] I Instruction to delete.
   static void deleteInstructionNow(llvm::Instruction *I);
 
- private:
+private:
   /// @brief Instructions that have been marked for deletion.
   llvm::SmallPtrSet<llvm::Instruction *, 16> InstructionsToDelete;
 };
 
-}  // namespace vecz
+} // namespace vecz
 
-#endif  // VECZ_VECTORIZATION_UNIT_H_INCLUDED
+#endif // VECZ_VECTORIZATION_UNIT_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/llvm_helpers.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/llvm_helpers.h
index 548a11eb5f3bd..d4aafaa610cc5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/llvm_helpers.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/llvm_helpers.h
@@ -49,6 +49,6 @@ llvm::Value *getDefaultValue(llvm::Type *T, uint64_t V = 0UL);
 ///
 /// @return Array of integers representing the Shuffle mask
 llvm::ArrayRef<int> getShuffleVecMask(llvm::ShuffleVectorInst *Shuffle);
-}  // namespace vecz
+} // namespace vecz
 
-#endif  // VECZ_LLVM_HELPERS_H_INCLUDED
+#endif // VECZ_LLVM_HELPERS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/memory_operations.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/memory_operations.h
index a28100507c7e5..a02bb446174d4 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/memory_operations.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/memory_operations.h
@@ -37,7 +37,7 @@ class Function;
 class Instruction;
 class Value;
 class Type;
-}  // namespace llvm
+} // namespace llvm
 
 namespace vecz {
 
@@ -108,10 +108,11 @@ llvm::CallInst *createMaskedStore(VectorizationContext &Ctx, llvm::Value *Data,
 /// @param[in] IsVP true if defining a vector-predicated operation
 ///
 /// @return (Masked) interleaved builtin function.
-llvm::Function *getOrCreateInterleavedMemOpFn(
-    VectorizationContext &Ctx, llvm::Type *DataTy, llvm::PointerType *PtrTy,
-    llvm::Value *Stride, llvm::Type *MaskTy, unsigned Alignment, bool IsLoad,
-    bool IsVP);
+llvm::Function *
+getOrCreateInterleavedMemOpFn(VectorizationContext &Ctx, llvm::Type *DataTy,
+                              llvm::PointerType *PtrTy, llvm::Value *Stride,
+                              llvm::Type *MaskTy, unsigned Alignment,
+                              bool IsLoad, bool IsVP);
 
 /// @brief Create a call to a (masked) interleaved load builtin function. Also
 /// known as a strided load.
@@ -282,7 +283,7 @@ class MemOpDesc {
 
   friend struct MemOp;
 
- public:
+public:
   /// @brief Create an invalid memory operation.
   MemOpDesc();
 
@@ -406,8 +407,8 @@ class MemOpDesc {
   ///
   /// @return A MemOpDesc if the given function is a masked interleaved memory
   /// operation. std::nullopt otherwise.
-  static std::optional<MemOpDesc> analyzeMaskedInterleavedMemOp(
-      llvm::Function &F);
+  static std::optional<MemOpDesc>
+  analyzeMaskedInterleavedMemOp(llvm::Function &F);
 
   /// @brief Determine whether the given function is a scatter/gather memory
   /// operation or not. If that's the case, the descriptor is populated and
@@ -427,39 +428,39 @@ class MemOpDesc {
   ///
   /// @return A MemOpDesc if the given function is a masked scatter/gather
   /// operation. std::nullopt otherwise.
-  static std::optional<MemOpDesc> analyzeMaskedScatterGatherMemOp(
-      llvm::Function &F);
+  static std::optional<MemOpDesc>
+  analyzeMaskedScatterGatherMemOp(llvm::Function &F);
 
   /// @brief Determine whether the operation is a load or not.
   bool isLoad() const {
     switch (Kind) {
-      default:
-        return false;
-      case MemOpKind::LoadInstruction:
-      case MemOpKind::LoadCall:
-        return true;
+    default:
+      return false;
+    case MemOpKind::LoadInstruction:
+    case MemOpKind::LoadCall:
+      return true;
     }
   }
 
   /// @brief Determine whether the operation is a store or not.
   bool isStore() const {
     switch (Kind) {
-      default:
-        return false;
-      case MemOpKind::StoreInstruction:
-      case MemOpKind::StoreCall:
-        return true;
+    default:
+      return false;
+    case MemOpKind::StoreInstruction:
+    case MemOpKind::StoreCall:
+      return true;
     }
   }
 
   /// @brief Determine whether the operation is an instruction or not.
   bool isLoadStoreInst() const {
     switch (Kind) {
-      default:
-        return false;
-      case MemOpKind::LoadInstruction:
-      case MemOpKind::StoreInstruction:
-        return true;
+    default:
+      return false;
+    case MemOpKind::LoadInstruction:
+    case MemOpKind::StoreInstruction:
+      return true;
     }
   }
 
@@ -568,7 +569,7 @@ struct MemOp {
     return Desc.getStrideAsConstantInt();
   }
 
- private:
+private:
   /// @brief Access an operand of the call instruction.
   ///
   /// @param[in] OpIdx Index of the operand to access.
@@ -608,7 +609,7 @@ inline llvm::IntegerType *getSizeTy(llvm::Module &M) {
 inline llvm::IntegerType *getSizeTy(llvm::IRBuilder<> &B) {
   return getSizeTy(*(B.GetInsertBlock()->getModule()));
 }
-}  // namespace
-}  // namespace vecz
+} // namespace
+} // namespace vecz
 
-#endif  // VECZ_MEMORY_OPERATIONS_H_INCLUDED
+#endif // VECZ_MEMORY_OPERATIONS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/offset_info.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/offset_info.h
index 1715fbd8b5aeb..2ad2d60a3a78c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/offset_info.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/offset_info.h
@@ -28,7 +28,7 @@ namespace llvm {
 class CallInst;
 class Value;
 class Type;
-}  // namespace llvm
+} // namespace llvm
 
 namespace vecz {
 
@@ -144,7 +144,7 @@ struct OffsetInfo {
   /// @return Reference to the current object for chaining.
   OffsetInfo &manifest(llvm::IRBuilder<> &B, StrideAnalysisResult &SAR);
 
- private:
+private:
   /// @brief Mark this offset with the given flag.
   /// @return Reference to the current object for chaining.
   OffsetInfo &setKind(OffsetKind Kind);
@@ -263,6 +263,6 @@ struct OffsetInfo {
   OffsetInfo &copyStrideAndBitMaskFrom(const OffsetInfo &Other);
 };
 
-}  // namespace vecz
+} // namespace vecz
 
-#endif  // #define VECZ_OFFSET_INFO_H_INCLUDED
+#endif // #define VECZ_OFFSET_INFO_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/reachability.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/reachability.h
index a8b87da22aada..2506c79921928 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/reachability.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/reachability.h
@@ -32,13 +32,13 @@ class DominatorTree;
 class Function;
 class LoopInfo;
 class PostDominatorTree;
-}  // namespace llvm
+} // namespace llvm
 
 namespace vecz {
 
 /// @brief A data structure to handle reachability queries
 class Reachability {
- public:
+public:
   /// @brief Construct the Reachability computation from a Dominator Tree
   ///        and a Post-Dominator Tree, that are used to speed up the queries.
   /// @param[in] DT the Dominator Tree
@@ -77,7 +77,7 @@ class Reachability {
   /// @return True if "to" is reachable from "from"
   bool isReachable(llvm::BasicBlock *from, llvm::BasicBlock *to) const;
 
- private:
+private:
   /// @brief Internal implementation of isReachable
   ///
   /// @param[in] from the graph node index to start from
@@ -111,6 +111,6 @@ class Reachability {
   /// @brief A mapping between BasicBlock pointers and graph node indices.
   llvm::DenseMap<llvm::BasicBlock *, size_t> indexMap;
 };
-}  // namespace vecz
+} // namespace vecz
 
-#endif  // VECZ_REACHABILITY_H_INCLUDED
+#endif // VECZ_REACHABILITY_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/simd_packet.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/simd_packet.h
index 07329e4384b18..40acd42336a0e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/simd_packet.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/simd_packet.h
@@ -94,6 +94,6 @@ struct SimdPacket : public llvm::SmallVector<llvm::Value *, 4> {
   PacketMask Mask;
 };
 
-}  // namespace vecz
+} // namespace vecz
 
-#endif  // VECZ_SIMD_PACKET_H_INCLUDED
+#endif // VECZ_SIMD_PACKET_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/common_gep_elimination_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/common_gep_elimination_pass.h
index b76cccdf4d998..15f848257d446 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/common_gep_elimination_pass.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/common_gep_elimination_pass.h
@@ -32,7 +32,7 @@ class VectorizationUnit;
 /// packetization pass.
 class CommonGEPEliminationPass
     : public llvm::PassInfoMixin<CommonGEPEliminationPass> {
- public:
+public:
   static void *ID() { return (void *)&PassID; };
 
   /// @brief Remove duplicate GEP instructions.
@@ -47,10 +47,10 @@ class CommonGEPEliminationPass
   /// @brief Pass name.
   static llvm::StringRef name() { return "Common GEP Elimination pass"; }
 
- private:
+private:
   /// @brief Identifier for the pass.
   static char PassID;
 };
-}  // namespace vecz
+} // namespace vecz
 
-#endif  // VECZ_TRANSFORM_COMMON_GEP_ELIMINATION_PASS_H_INCLUDED
+#endif // VECZ_TRANSFORM_COMMON_GEP_ELIMINATION_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/control_flow_conversion_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/control_flow_conversion_pass.h
index 7b6dd46175da6..9cffc83720217 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/control_flow_conversion_pass.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/control_flow_conversion_pass.h
@@ -35,7 +35,7 @@ class DominatorTree;
 class PostDominatorTree;
 class PreservedAnalyses;
 class LoopInfo;
-}  // namespace llvm
+} // namespace llvm
 
 namespace vecz {
 struct BasicBlockTag;
@@ -54,7 +54,7 @@ class Reachability;
 /// a function.
 class ControlFlowConversionPass
     : public llvm::PassInfoMixin<ControlFlowConversionPass> {
- public:
+public:
   /// @brief Unique identifier for the pass.
   static void *ID() { return (void *)&PassID; }
 
@@ -72,17 +72,17 @@ class ControlFlowConversionPass
     return "Control flow to data flow conversion";
   }
 
- private:
+private:
   /// @brief Unique identifier for the pass.
   static char PassID;
 };
 
 class ControlFlowConversionState {
- public:
+public:
   /// @brief The actual implementation of this pass
   class Impl;
 
- protected:
+protected:
   ControlFlowConversionState(llvm::Function &,
                              llvm::FunctionAnalysisManager &AM);
 
@@ -107,7 +107,7 @@ class ControlFlowConversionState {
   std::unique_ptr<BOSCCGadget> BOSCC;
   std::unique_ptr<Reachability> RC;
 
- private:
+private:
   struct MaskInfo {
     /// @brief Mask that describes which lanes have exited the block.
     llvm::SmallDenseMap<llvm::BasicBlock *, llvm::Value *, 4> exitMasks;
@@ -150,6 +150,6 @@ class ControlFlowConversionState {
 };
 
 /// @}
-}  // namespace vecz
+} // namespace vecz
 
-#endif  // VECZ_TRANSFORM_CONTROL_FLOW_CONVERSION_PASS_H_INCLUDED
+#endif // VECZ_TRANSFORM_CONTROL_FLOW_CONVERSION_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/inline_post_vectorization_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/inline_post_vectorization_pass.h
index 1a4b7cb74e109..bcd63aa00cac6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/inline_post_vectorization_pass.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/inline_post_vectorization_pass.h
@@ -30,7 +30,7 @@ namespace vecz {
 /// after vectorization.
 class InlinePostVectorizationPass
     : public llvm::PassInfoMixin<InlinePostVectorizationPass> {
- public:
+public:
   /// @brief Create a new pass object.
   InlinePostVectorizationPass() {}
 
@@ -44,6 +44,6 @@ class InlinePostVectorizationPass
   /// @return pointer to text description.
   static llvm::StringRef name() { return "Inline Post Vectorization pass"; }
 };
-}  // namespace vecz
+} // namespace vecz
 
-#endif  // VECZ_TRANSFORM_INLINE_POST_VECTORIZATION_PASS_H_INCLUDED
+#endif // VECZ_TRANSFORM_INLINE_POST_VECTORIZATION_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/instantiation_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/instantiation_pass.h
index d2a400ed261b8..ce9140ad64586 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/instantiation_pass.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/instantiation_pass.h
@@ -35,7 +35,7 @@ struct MemOp;
 /// are instantiated (i.e. duplicated with lane ID substitution), starting from
 /// the leaves.
 class InstantiationPass {
- public:
+public:
   /// @brief Create a new instantiation pass.
   ///
   /// @param[in] PP The packetizer object to call back to when required.
@@ -50,7 +50,7 @@ class InstantiationPass {
   /// @return Instantiated value.
   PacketRange instantiate(llvm::Value *V);
 
- private:
+private:
   /// @brief Duplicates an instruction across all SIMD Lanes.
   ///
   /// @param[in] I The instruction to duplicate across lanes
@@ -108,6 +108,6 @@ class InstantiationPass {
   VectorizationContext &Ctx;
   Packetizer &packetizer;
 };
-}  // namespace vecz
+} // namespace vecz
 
-#endif  // VECZ_TRANSFORM_INSTANTIATION_PASS_H_INCLUDED
+#endif // VECZ_TRANSFORM_INSTANTIATION_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/interleaved_group_combine_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/interleaved_group_combine_pass.h
index 88efc00d560bc..ae6deb613826c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/interleaved_group_combine_pass.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/interleaved_group_combine_pass.h
@@ -37,7 +37,7 @@ class VectorizationUnit;
 /// @brief Combine groups of interleaved memory operations.
 class InterleavedGroupCombinePass
     : public llvm::PassInfoMixin<InterleavedGroupCombinePass> {
- public:
+public:
   /// @brief Create a new pass object.
   ///
   /// @param[in] kind Kind of interleaved operation to combine.
@@ -61,7 +61,7 @@ class InterleavedGroupCombinePass
     return "Combine interleaved memory instructions";
   }
 
- private:
+private:
   /// @brief Information about an interleaved operation.
   struct InterleavedOpInfo;
 
@@ -89,6 +89,6 @@ class InterleavedGroupCombinePass
   llvm::ScalarEvolution *scalarEvolution;
 };
 
-}  // namespace vecz
+} // namespace vecz
 
-#endif  // VECZ_TRANSFORM_INTERLEAVED_GROUP_COMBINE_PASS_H_INCLUDED
+#endif // VECZ_TRANSFORM_INTERLEAVED_GROUP_COMBINE_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_helpers.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_helpers.h
index 24c81f90084b8..1c9cfe79dac53 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_helpers.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_helpers.h
@@ -36,7 +36,7 @@ namespace llvm {
 class Value;
 class ShuffleVectorInst;
 class Twine;
-}  // namespace llvm
+} // namespace llvm
 
 namespace vecz {
 class TargetInfo;
@@ -133,7 +133,7 @@ llvm::Value *createIndexSequence(llvm::IRBuilder<> &Builder,
 /// The range is represented by its integer starting index and length, so that
 /// it remains valid if the vector re-allocates its storage.
 class PacketRange {
- public:
+public:
   using value_type = llvm::Value *;
   using iterator = value_type *;
   using const_iterator = const value_type *;
@@ -185,7 +185,7 @@ class PacketRange {
   /// @returns false if length is zero, true otherwise
   operator bool() const { return length != 0; }
 
- private:
+private:
   std::vector<llvm::Value *> &data;
   const size_t start;
   const size_t length;
@@ -256,6 +256,6 @@ inline llvm::Type *getWideType(llvm::Type *ty, llvm::ElementCount factor) {
   ty = vecTy->getElementType();
   return llvm::VectorType::get(ty, factor * elts);
 }
-}  // namespace vecz
+} // namespace vecz
 
-#endif  // VECZ_TRANSFORM_PACKETIZATION_HELPERS_H_INCLUDED
+#endif // VECZ_TRANSFORM_PACKETIZATION_HELPERS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_pass.h
index ccc52a26912d7..fb5b49bc106ba 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_pass.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_pass.h
@@ -34,7 +34,7 @@ class VectorizationUnit;
 /// @brief Vectorization pass where scalar instructions that need it are
 /// packetized, starting from leaves.
 class PacketizationPass : public llvm::PassInfoMixin<PacketizationPass> {
- public:
+public:
   /// @brief Create a new packetization pass object.
   PacketizationPass() = default;
 
@@ -72,6 +72,6 @@ class PacketizationPass : public llvm::PassInfoMixin<PacketizationPass> {
 };
 
 /// @}
-}  // namespace vecz
+} // namespace vecz
 
-#endif  // VECZ_TRANSFORM_PACKETIZATION_PASS_H_INCLUDED
+#endif // VECZ_TRANSFORM_PACKETIZATION_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetizer.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetizer.h
index 4f030ee15fa8f..4e9ff96a07e56 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetizer.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetizer.h
@@ -50,11 +50,11 @@ class VectorizationChoices;
 
 /// @brief The implementation of the packetization process
 class Packetizer {
- public:
+public:
   class Result {
     friend class Packetizer;
 
-   public:
+  public:
     Result() = delete;
     Result(const Result &) = default;
     constexpr Result(Result &&) = default;
@@ -97,7 +97,7 @@ class Packetizer {
     void getPacketValues(unsigned width,
                          llvm::SmallVectorImpl<llvm::Value *> &vals) const;
 
-   private:
+  private:
     Packetizer &packetizer;
     llvm::Value *const scalar;
     PacketInfo *const info;
@@ -170,7 +170,7 @@ class Packetizer {
     IC.deleteInstructionLater(I);
   }
 
- private:
+private:
   Packetizer(llvm::Function &, llvm::FunctionAnalysisManager &AM,
              llvm::ElementCount Width, unsigned Dim);
   Packetizer() = delete;
@@ -229,6 +229,6 @@ class Packetizer {
 };
 
 /// @}
-}  // namespace vecz
+} // namespace vecz
 
-#endif  // VECZ_TRANSFORM_PACKETIZER_H_INCLUDED
+#endif // VECZ_TRANSFORM_PACKETIZER_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/passes.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/passes.h
index 1f62cc200c967..bbc9cd6428a2c 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/passes.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/passes.h
@@ -27,13 +27,13 @@
 namespace compiler {
 namespace utils {
 class BuiltinInfo;
-}  // namespace utils
-}  // namespace compiler
+} // namespace utils
+} // namespace compiler
 
 namespace vecz {
 class SimplifyInfiniteLoopPass
     : public llvm::PassInfoMixin<SimplifyInfiniteLoopPass> {
- public:
+public:
   SimplifyInfiniteLoopPass() = default;
 
   llvm::PreservedAnalyses run(llvm::Loop &L, llvm::LoopAnalysisManager &,
@@ -44,7 +44,7 @@ class SimplifyInfiniteLoopPass
 /// @brief This pass replaces calls to builtins that require special attention
 /// (e.g. there is no scalar or vector equivalent) with inline implementations.
 class BuiltinInliningPass : public llvm::PassInfoMixin<BuiltinInliningPass> {
- public:
+public:
   /// @brief Create a new pass object.
   BuiltinInliningPass() = default;
 
@@ -58,7 +58,7 @@ class BuiltinInliningPass : public llvm::PassInfoMixin<BuiltinInliningPass> {
   /// @return pointer to text description.
   static llvm::StringRef name() { return "OpenCL builtin inlining pass"; }
 
- private:
+private:
   /// @brief Process a call site, inlining it or marking it as needing inlining
   /// if required.
   ///
@@ -73,7 +73,7 @@ class BuiltinInliningPass : public llvm::PassInfoMixin<BuiltinInliningPass> {
 /// away by LLVM's Mem2Reg pass, for example in the presence of bitcasts. It is
 /// however much simpler than LLVM's.
 class BasicMem2RegPass : public llvm::PassInfoMixin<BasicMem2RegPass> {
- public:
+public:
   BasicMem2RegPass() {};
 
   /// @brief The entry point to the pass.
@@ -86,7 +86,7 @@ class BasicMem2RegPass : public llvm::PassInfoMixin<BasicMem2RegPass> {
   /// @return pointer to text description.
   static llvm::StringRef name() { return "Basic Mem2Reg Pass"; }
 
- private:
+private:
   /// @brief Determine whether the alloca can be promoted or not.
   ///
   /// This is the case when it is inside the entry block, there is at most one
@@ -104,7 +104,7 @@ class BasicMem2RegPass : public llvm::PassInfoMixin<BasicMem2RegPass> {
 };
 
 class PreLinearizePass : public llvm::PassInfoMixin<PreLinearizePass> {
- public:
+public:
   PreLinearizePass() = default;
 
   llvm::PreservedAnalyses run(llvm::Function &F,
@@ -116,7 +116,7 @@ class PreLinearizePass : public llvm::PassInfoMixin<PreLinearizePass> {
 /// @brief Wraps llvm's LoopRotatePass but retricts the range of loops on which
 /// it works.
 class VeczLoopRotatePass : public llvm::PassInfoMixin<VeczLoopRotatePass> {
- public:
+public:
   VeczLoopRotatePass() {}
 
   llvm::PreservedAnalyses run(llvm::Loop &L, llvm::LoopAnalysisManager &,
@@ -127,7 +127,7 @@ class VeczLoopRotatePass : public llvm::PassInfoMixin<VeczLoopRotatePass> {
 };
 
 class RemoveIntPtrPass : public llvm::PassInfoMixin<RemoveIntPtrPass> {
- public:
+public:
   RemoveIntPtrPass() = default;
 
   static llvm::StringRef name() { return "Remove IntPtr instructions"; }
@@ -138,7 +138,7 @@ class RemoveIntPtrPass : public llvm::PassInfoMixin<RemoveIntPtrPass> {
 
 class SquashSmallVectorsPass
     : public llvm::PassInfoMixin<SquashSmallVectorsPass> {
- public:
+public:
   SquashSmallVectorsPass() = default;
 
   static llvm::StringRef name() { return "Squash Small Vectors"; }
@@ -151,7 +151,7 @@ class SquashSmallVectorsPass
 /// not needed or can be converted to non-masked operations.
 class SimplifyMaskedMemOpsPass
     : public llvm::PassInfoMixin<SimplifyMaskedMemOpsPass> {
- public:
+public:
   /// @brief Create a new pass object.
   SimplifyMaskedMemOpsPass() = default;
 
@@ -173,7 +173,7 @@ class SimplifyMaskedMemOpsPass
 /// @brief reassociate uniform binary operators and split branches
 class UniformReassociationPass
     : public llvm::PassInfoMixin<UniformReassociationPass> {
- public:
+public:
   UniformReassociationPass() = default;
 
   static llvm::StringRef name() { return "Reassociate uniform binops"; }
@@ -185,7 +185,7 @@ class UniformReassociationPass
 /// @brief Removes uniform divergence reductions created by CFG conversion
 class DivergenceCleanupPass
     : public llvm::PassInfoMixin<DivergenceCleanupPass> {
- public:
+public:
   /// @brief Create a new pass object.
   DivergenceCleanupPass() = default;
 
@@ -204,6 +204,6 @@ class DivergenceCleanupPass
   }
 };
 
-}  // namespace vecz
+} // namespace vecz
 
-#endif  // VECZ_TRANSFORM_PASSES_H_INCLUDED
+#endif // VECZ_TRANSFORM_PASSES_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/printf_scalarizer.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/printf_scalarizer.h
index f10dbf27048de..2d4885059b3db 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/printf_scalarizer.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/printf_scalarizer.h
@@ -25,12 +25,11 @@ namespace llvm {
 class Module;
 class User;
 class Instruction;
-template <typename T, unsigned N>
-class SmallVector;
+template <typename T, unsigned N> class SmallVector;
 class GlobalVariable;
 class Value;
 class CallInst;
-}  // namespace llvm
+} // namespace llvm
 
 namespace vecz {
 
@@ -66,9 +65,10 @@ std::string GetFormatStringAsString(llvm::Value *op);
 /// @param[in]  new_format_string The scalarized format string to create a
 /// global variable from.
 /// @return The newly created global variable for the format string.
-llvm::GlobalVariable *GetNewFormatStringAsGlobalVar(
-    llvm::Module &module, llvm::GlobalVariable *const string_value,
-    const std::string &new_format_string);
+llvm::GlobalVariable *
+GetNewFormatStringAsGlobalVar(llvm::Module &module,
+                              llvm::GlobalVariable *const string_value,
+                              const std::string &new_format_string);
 
 /// @brief This function transforms an OpenCL printf format string into a
 /// C99-conformant one.
@@ -109,9 +109,9 @@ EnumPrintfError ScalarizeAndCheckFormatString(const std::string &str,
 /// @param[in] new_format_string_gvar The module-level global variable for the
 /// new format string.
 /// @return A new call instruction to the new printf function.
-llvm::Instruction *BuildNewPrintfCall(
-    llvm::Module &module, llvm::CallInst *const old_inst,
-    llvm::GlobalVariable *const new_format_string_gvar);
-}  // namespace vecz
+llvm::Instruction *
+BuildNewPrintfCall(llvm::Module &module, llvm::CallInst *const old_inst,
+                   llvm::GlobalVariable *const new_format_string_gvar);
+} // namespace vecz
 
-#endif  // VECZ_TRANSFORM_PRINTF_SCALARIZER_H_INCLUDED
+#endif // VECZ_TRANSFORM_PRINTF_SCALARIZER_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/scalarization_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/scalarization_pass.h
index 71425d2da4a7b..a494a1945e0a6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/scalarization_pass.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/scalarization_pass.h
@@ -26,7 +26,7 @@
 
 namespace llvm {
 class Function;
-}  // namespace llvm
+} // namespace llvm
 
 namespace vecz {
 
@@ -39,7 +39,7 @@ class VectorizationUnit;
 /// @brief Scalarization pass where vector instructions that need it are
 /// scalarized, starting from leaves.
 class ScalarizationPass : public llvm::PassInfoMixin<ScalarizationPass> {
- public:
+public:
   /// @brief Create a new scalarizaation pass.
   ScalarizationPass();
 
@@ -58,11 +58,11 @@ class ScalarizationPass : public llvm::PassInfoMixin<ScalarizationPass> {
   /// @brief Name of the pass.
   static llvm::StringRef name() { return "Function scalarization"; }
 
- private:
+private:
   static char PassID;
 };
 
 /// @}
-}  // namespace vecz
+} // namespace vecz
 
-#endif  // VECZ_TRANSFORM_SCALARIZATION_PASS_H_INCLUDED
+#endif // VECZ_TRANSFORM_SCALARIZATION_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/scalarizer.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/scalarizer.h
index bcd25451fabb9..ecb2136c6b73d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/scalarizer.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/scalarizer.h
@@ -50,7 +50,7 @@ class InsertElementInst;
 class PHINode;
 class ExtractElementInst;
 class IntrinsicInst;
-}  // namespace llvm
+} // namespace llvm
 
 namespace vecz {
 
@@ -66,7 +66,7 @@ struct SimdPacket;
 
 /// @brief Holds the result of scalarization analysis for a given function.
 class Scalarizer {
- public:
+public:
   /// @brief Create new scalarization results for the function.
   ///
   /// @param[in] F Function to scalarize.
@@ -88,7 +88,7 @@ class Scalarizer {
   /// @brief Get the list of instructions that failed to scalarize
   const FailureSet &failures() const { return Failures; }
 
- private:
+private:
   /// @brief Vectorization context for the function to scalarize.
   VectorizationContext &Ctx;
   llvm::Function &F;
@@ -318,6 +318,6 @@ class Scalarizer {
 };
 
 /// @}
-}  // namespace vecz
+} // namespace vecz
 
-#endif  // VECZ_TRANSFORM_SCALARIZER_H_INCLUDED
+#endif // VECZ_TRANSFORM_SCALARIZER_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/ternary_transform_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/ternary_transform_pass.h
index 8453a47a5ae03..a428b84ba9aa9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/ternary_transform_pass.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/ternary_transform_pass.h
@@ -29,7 +29,7 @@ namespace vecz {
 /// @brief This pass tries to transform selects with pointer operands,
 /// transforms to individual GEPs followed by masked memory operations.
 class TernaryTransformPass : public llvm::PassInfoMixin<TernaryTransformPass> {
- public:
+public:
   TernaryTransformPass() = default;
 
   /// @brief The entry point to the pass.
@@ -44,6 +44,6 @@ class TernaryTransformPass : public llvm::PassInfoMixin<TernaryTransformPass> {
   // @brief Pass name.
   static llvm::StringRef name() { return "Ternary transform pass"; }
 };
-}  // namespace vecz
+} // namespace vecz
 
-#endif  // VECZ_TRANSFORM_TERNARY_TRANSFORM_PASS_H_INCLUDED
+#endif // VECZ_TRANSFORM_TERNARY_TRANSFORM_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_context.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_context.h
index 96a73a5962cfb..9d231c8b7b1d7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_context.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_context.h
@@ -37,13 +37,13 @@
 
 namespace llvm {
 class TargetTransformInfo;
-}  // namespace llvm
+} // namespace llvm
 
 namespace compiler {
 namespace utils {
 class BuiltinInfo;
-}  // namespace utils
-}  // namespace compiler
+} // namespace utils
+} // namespace compiler
 
 namespace vecz {
 class MemOpDesc;
@@ -58,7 +58,7 @@ using ActiveUnitMap = llvm::DenseMap<llvm::PoisoningVH<const llvm::Function>,
 
 /// @brief Holds global (per-module) vectorization state.
 class VectorizationContext {
- public:
+public:
   /// @brief Create a new vectorization context object.
   ///
   /// @param[in] target Module in which vectorization happens.
@@ -195,8 +195,8 @@ class VectorizationContext {
   /// @param[in] F The function to check
   /// @return A MaskedAtomic instance detailing the atomic operation if the
   /// function is a masked atomic, or std::nullopt otherwise
-  std::optional<MaskedAtomic> isMaskedAtomicFunction(
-      const llvm::Function &F) const;
+  std::optional<MaskedAtomic>
+  isMaskedAtomicFunction(const llvm::Function &F) const;
   /// @brief Get (if it exists already) or create the function representing the
   /// masked version of an atomicrmw/cmpxchg operation.
   ///
@@ -204,9 +204,10 @@ class VectorizationContext {
   /// @param[in] Choices Choices to mangle into the function name
   /// @param[in] VF The vectorization factor of the atomic operation
   /// @return The masked version of the function
-  llvm::Function *getOrCreateMaskedAtomicFunction(
-      MaskedAtomic &I, const VectorizationChoices &Choices,
-      llvm::ElementCount VF);
+  llvm::Function *
+  getOrCreateMaskedAtomicFunction(MaskedAtomic &I,
+                                  const VectorizationChoices &Choices,
+                                  llvm::ElementCount VF);
 
   /// @brief Create a VectorizationUnit to use to vectorize the given scalar
   /// function.
@@ -254,7 +255,7 @@ class VectorizationContext {
 
   static const char *InternalBuiltinPrefix;
 
- private:
+private:
   /// @brief Determine whether this scalar builtin function can be safely
   /// expanded at vector call sites, i.e. it has not side effects.
   ///
@@ -359,7 +360,7 @@ class VectorizationContext {
 /// @brief Implement internal builtins.
 class DefineInternalBuiltinsPass
     : public llvm::PassInfoMixin<DefineInternalBuiltinsPass> {
- public:
+public:
   /// @brief Create a new pass object.
   DefineInternalBuiltinsPass() {}
 
@@ -376,12 +377,12 @@ class DefineInternalBuiltinsPass
 
   static llvm::StringRef name() { return "Define internal builtins"; }
 
- private:
+private:
   /// @brief Identifier for the DefineInternalBuiltin pass.
   static char PassID;
 };
 
 /// @}
-}  // namespace vecz
+} // namespace vecz
 
-#endif  // VECZ_VECTORIZATION_CONTEXT_H_INCLUDED
+#endif // VECZ_VECTORIZATION_CONTEXT_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_helpers.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_helpers.h
index 978b02a6f202c..c865601b90a55 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_helpers.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_helpers.h
@@ -24,7 +24,7 @@
 namespace llvm {
 class Function;
 class StringRef;
-}  // namespace llvm
+} // namespace llvm
 
 namespace vecz {
 class VectorizationUnit;
@@ -77,6 +77,6 @@ void cloneDebugInfo(const VectorizationUnit &VU);
 /// vectorized one. Obviously, the kernel itself has to be cloned before
 /// calling this function.
 void cloneOpenCLMetadata(const VectorizationUnit &VU);
-}  // namespace vecz
+} // namespace vecz
 
-#endif  // VECZ_VECTORIZATION_HELPERS_H_INCLUDED
+#endif // VECZ_VECTORIZATION_HELPERS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_heuristics.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_heuristics.h
index 129f6af29b362..e80949be23143 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_heuristics.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_heuristics.h
@@ -21,7 +21,7 @@
 
 namespace llvm {
 class Function;
-}  // namespace llvm
+} // namespace llvm
 
 namespace vecz {
 class VectorizationContext;
@@ -38,6 +38,6 @@ class VectorizationContext;
 bool shouldVectorize(llvm::Function &F, VectorizationContext &Ctx,
                      llvm::ElementCount VF, unsigned SimdDimIdx);
 
-}  // namespace vecz
+} // namespace vecz
 
-#endif  // VECZ_VECTORIZATION_HEURISTICS_H_INCLUDED
+#endif // VECZ_VECTORIZATION_HEURISTICS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_unit.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_unit.h
index d6ff8aa20eca3..820b83d53ad86 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_unit.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_unit.h
@@ -33,13 +33,13 @@ class FunctionType;
 class Module;
 class Instruction;
 class Argument;
-}  // namespace llvm
+} // namespace llvm
 
 namespace vecz {
 namespace internal {
 struct VeczFailResult;
 struct AnalysisFailResult;
-}  // namespace internal
+} // namespace internal
 
 struct SimdPacket;
 struct UniformValueResult;
@@ -47,8 +47,7 @@ class ValueTagMap;
 class VectorizationContext;
 class VectorizationChoices;
 
-template <typename T>
-class AnalysisWrapper;
+template <typename T> class AnalysisWrapper;
 
 /// @brief Describe an argument of a function that needs to be vectorized.
 struct VectorizerTargetArgument {
@@ -98,7 +97,7 @@ struct VectorizationResult {
 
 /// @brief Describe a function that needs to be vectorized.
 class VectorizationUnit {
- public:
+public:
   /// @brief Create a new vectorization unit for the given scalar function.
   ///
   /// @param[in] F Function to vectorize.
@@ -226,7 +225,7 @@ class VectorizationUnit {
   /// @return The Choices
   const VectorizationChoices &choices() const { return Choices; };
 
- private:
+private:
   /// @brief Context this function is vectorized in.
   VectorizationContext &Ctx;
   /// @brief Which Vecz code generation choices are enabled and which not
@@ -254,6 +253,6 @@ class VectorizationUnit {
   llvm::SmallPtrSet<const llvm::Instruction *, 4> ArgumentPlaceholders;
 };
 
-}  // namespace vecz
+} // namespace vecz
 
-#endif  // VECZ_VECTORIZATION_UNIT_H_INCLUDED
+#endif // VECZ_VECTORIZATION_UNIT_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorizer.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorizer.h
index 1e3771957e96e..483a46af5c681 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorizer.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorizer.h
@@ -26,7 +26,7 @@
 
 namespace llvm {
 class Function;
-}  // namespace llvm
+} // namespace llvm
 
 namespace vecz {
 
@@ -69,6 +69,6 @@ bool createVectorizedFunctionMetadata(VectorizationUnit &VU);
 /// @param[in] VU the vectorization Unit of to create metadata for
 /// @returns true iff vectorization succeeded.
 void trackVeczSuccessFailure(VectorizationUnit &VU);
-}  // namespace vecz
+} // namespace vecz
 
-#endif  // VECZ_VECTORIZER_H_INCLUDED
+#endif // VECZ_VECTORIZER_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vecz_pass_builder.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vecz_pass_builder.h
index e707974569cde..a51e66c4ec024 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vecz_pass_builder.h
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vecz_pass_builder.h
@@ -28,7 +28,7 @@ namespace llvm {
 class Module;
 class TargetTransformInfo;
 class TargetMachine;
-}  // namespace llvm
+} // namespace llvm
 
 namespace vecz {
 class VectorizationContext;
@@ -36,7 +36,7 @@ class VectorizationContext;
 /// @brief A class that manages the lifetime and initialization of all
 /// components required to set up an LLVM pass manager to run Vecz passes.
 class VeczPassMachinery final : public compiler::utils::PassMachinery {
- public:
+public:
   /// @brief Construct the pass machinery.
   /// The base class method `initialize(TargetInfo)` must also be called.
   ///
@@ -51,7 +51,7 @@ class VeczPassMachinery final : public compiler::utils::PassMachinery {
 
   virtual void registerPasses() override;
 
- private:
+private:
   virtual void addClassToPassNames() override;
   virtual void registerPassCallbacks() override;
 
@@ -63,6 +63,6 @@ class VeczPassMachinery final : public compiler::utils::PassMachinery {
 /// @param[in] PM The Module Pass Manager to build.
 /// @return true on success.
 bool buildPassPipeline(llvm::ModulePassManager &PM);
-}  // namespace vecz
+} // namespace vecz
 
-#endif  // VECZ_VECZ_PASS_BUILDER_H_INCLUDED
+#endif // VECZ_VECZ_PASS_BUILDER_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/ir_cleanup.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/ir_cleanup.cpp
index f7b3317f3b681..28ec40fc6f7c7 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/ir_cleanup.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/ir_cleanup.cpp
@@ -75,7 +75,7 @@ bool AreUsersDead(Instruction *I,
   return true;
 }
 
-}  // namespace
+} // namespace
 
 void IRCleanup::deleteInstructionLater(llvm::Instruction *I) {
   if (InstructionsToDelete.insert(I).second) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/memory_operations.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/memory_operations.cpp
index d5aa4b62f3f8d..aedcd49128678 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/memory_operations.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/memory_operations.cpp
@@ -407,18 +407,10 @@ llvm::CallInst *vecz::createScatter(VectorizationContext &Ctx,
 }
 
 MemOpDesc::MemOpDesc()
-    : DataTy(nullptr),
-      PtrTy(nullptr),
-      MaskTy(nullptr),
-      Kind(MemOpKind::Invalid),
-      AccessKind(MemOpAccessKind::Native),
-      IsVLOp(false),
-      Alignment(1),
-      Stride(nullptr),
-      DataOpIdx(-1),
-      PtrOpIdx(-1),
-      MaskOpIdx(-1),
-      VLOpIdx(-1) {}
+    : DataTy(nullptr), PtrTy(nullptr), MaskTy(nullptr),
+      Kind(MemOpKind::Invalid), AccessKind(MemOpAccessKind::Native),
+      IsVLOp(false), Alignment(1), Stride(nullptr), DataOpIdx(-1), PtrOpIdx(-1),
+      MaskOpIdx(-1), VLOpIdx(-1) {}
 
 bool MemOpDesc::isStrideConstantInt() const {
   return Stride && isa<ConstantInt>(Stride);
@@ -744,8 +736,8 @@ std::optional<MemOpDesc> MemOpDesc::analyzeScatterGatherMemOp(Function &F) {
   return std::nullopt;
 }
 
-std::optional<MemOpDesc> MemOpDesc::analyzeMaskedScatterGatherMemOp(
-    Function &F) {
+std::optional<MemOpDesc>
+MemOpDesc::analyzeMaskedScatterGatherMemOp(Function &F) {
   const StringRef MangledName = F.getName();
   compiler::utils::Lexer L(MangledName);
   if (!L.Consume(VectorizationContext::InternalBuiltinPrefix)) {
@@ -849,23 +841,23 @@ std::optional<MemOp> MemOp::get(llvm::CallInst *CI,
   std::optional<MemOpDesc> Desc;
   if (Function *Caller = CI->getCalledFunction()) {
     switch (AccessKind) {
-      default:
-        return std::nullopt;
-      case MemOpAccessKind::Masked:
-        Desc = MemOpDesc::analyzeMaskedMemOp(*Caller);
-        break;
-      case MemOpAccessKind::Interleaved:
-        Desc = MemOpDesc::analyzeInterleavedMemOp(*Caller);
-        break;
-      case MemOpAccessKind::MaskedInterleaved:
-        Desc = MemOpDesc::analyzeMaskedInterleavedMemOp(*Caller);
-        break;
-      case MemOpAccessKind::ScatterGather:
-        Desc = MemOpDesc::analyzeScatterGatherMemOp(*Caller);
-        break;
-      case MemOpAccessKind::MaskedScatterGather:
-        Desc = MemOpDesc::analyzeMaskedScatterGatherMemOp(*Caller);
-        break;
+    default:
+      return std::nullopt;
+    case MemOpAccessKind::Masked:
+      Desc = MemOpDesc::analyzeMaskedMemOp(*Caller);
+      break;
+    case MemOpAccessKind::Interleaved:
+      Desc = MemOpDesc::analyzeInterleavedMemOp(*Caller);
+      break;
+    case MemOpAccessKind::MaskedInterleaved:
+      Desc = MemOpDesc::analyzeMaskedInterleavedMemOp(*Caller);
+      break;
+    case MemOpAccessKind::ScatterGather:
+      Desc = MemOpDesc::analyzeScatterGatherMemOp(*Caller);
+      break;
+    case MemOpAccessKind::MaskedScatterGather:
+      Desc = MemOpDesc::analyzeMaskedScatterGatherMemOp(*Caller);
+      break;
     }
   }
   if (!Desc) {
@@ -908,25 +900,25 @@ llvm::Value *MemOp::getDataOperand() const {
 
 llvm::Value *MemOp::getPointerOperand() const {
   switch (Desc.getKind()) {
-    default:
-      return nullptr;
-    case MemOpKind::LoadInstruction:
-      return cast<LoadInst>(Ins)->getPointerOperand();
-    case MemOpKind::StoreInstruction:
-      return cast<StoreInst>(Ins)->getPointerOperand();
-    case MemOpKind::LoadCall:
-    case MemOpKind::StoreCall:
-      return getCallOperand(Desc.getPointerOperandIndex());
+  default:
+    return nullptr;
+  case MemOpKind::LoadInstruction:
+    return cast<LoadInst>(Ins)->getPointerOperand();
+  case MemOpKind::StoreInstruction:
+    return cast<StoreInst>(Ins)->getPointerOperand();
+  case MemOpKind::LoadCall:
+  case MemOpKind::StoreCall:
+    return getCallOperand(Desc.getPointerOperandIndex());
   }
 }
 
 llvm::Value *MemOp::getMaskOperand() const {
   switch (Desc.getKind()) {
-    default:
-      return nullptr;
-    case MemOpKind::LoadCall:
-    case MemOpKind::StoreCall:
-      return getCallOperand(Desc.getMaskOperandIndex());
+  default:
+    return nullptr;
+  case MemOpKind::LoadCall:
+  case MemOpKind::StoreCall:
+    return getCallOperand(Desc.getMaskOperandIndex());
   }
 }
 
@@ -943,27 +935,27 @@ bool MemOp::setDataOperand(Value *V) {
 
 bool MemOp::setPointerOperand(Value *V) {
   switch (Desc.getKind()) {
-    default:
-      return false;
-    case MemOpKind::LoadInstruction:
-      cast<LoadInst>(Ins)->setOperand(0, V);
-      return true;
-    case MemOpKind::StoreInstruction:
-      cast<StoreInst>(Ins)->setOperand(1, V);
-      return true;
-    case MemOpKind::LoadCall:
-    case MemOpKind::StoreCall:
-      return setCallOperand(Desc.getPointerOperandIndex(), V);
+  default:
+    return false;
+  case MemOpKind::LoadInstruction:
+    cast<LoadInst>(Ins)->setOperand(0, V);
+    return true;
+  case MemOpKind::StoreInstruction:
+    cast<StoreInst>(Ins)->setOperand(1, V);
+    return true;
+  case MemOpKind::LoadCall:
+  case MemOpKind::StoreCall:
+    return setCallOperand(Desc.getPointerOperandIndex(), V);
   }
 }
 
 bool MemOp::setMaskOperand(Value *V) {
   switch (Desc.getKind()) {
-    default:
-      return false;
-    case MemOpKind::LoadCall:
-    case MemOpKind::StoreCall:
-      return setCallOperand(Desc.getMaskOperandIndex(), V);
+  default:
+    return false;
+  case MemOpKind::LoadCall:
+  case MemOpKind::StoreCall:
+    return setCallOperand(Desc.getMaskOperandIndex(), V);
   }
 }
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp
index 32cabec59b297..a93fdacbdc9a2 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp
@@ -93,14 +93,11 @@ OffsetKind combineKinds(OffsetKind LHS, OffsetKind RHS) {
   // Uniform values are all that's left.
   return eOffsetUniformVariable;
 }
-}  // namespace
+} // namespace
 
 OffsetInfo::OffsetInfo(StrideAnalysisResult &SAR, Value *V)
-    : Kind(eOffsetMayDiverge),
-      ActualValue(V),
-      StrideInt(0),
-      ManifestStride(nullptr),
-      BitMask(~uint64_t(0)) {
+    : Kind(eOffsetMayDiverge), ActualValue(V), StrideInt(0),
+      ManifestStride(nullptr), BitMask(~uint64_t(0)) {
   auto *const ty = V->getType();
   if (ty->isIntegerTy()) {
     analyze(V, SAR);
@@ -207,24 +204,24 @@ OffsetInfo &OffsetInfo::analyze(Value *Offset, StrideAnalysisResult &SAR) {
     }
 
     switch (BOp->getOpcode()) {
-      default:
-        return setMayDiverge();
-      case Instruction::Add:
-        return combineAdd(LHS, RHS);
-      case Instruction::Sub:
-        return combineSub(LHS, RHS);
-      case Instruction::And:
-        return combineAnd(LHS, RHS);
-      case Instruction::Or:
-        return combineOr(LHS, RHS);
-      case Instruction::Xor:
-        return combineXor(LHS, RHS);
-      case Instruction::Mul:
-        return combineMul(LHS, RHS);
-      case Instruction::Shl:
-        return combineShl(LHS, RHS);
-      case Instruction::AShr:
-        return combineAShr(LHS, RHS);
+    default:
+      return setMayDiverge();
+    case Instruction::Add:
+      return combineAdd(LHS, RHS);
+    case Instruction::Sub:
+      return combineSub(LHS, RHS);
+    case Instruction::And:
+      return combineAnd(LHS, RHS);
+    case Instruction::Or:
+      return combineOr(LHS, RHS);
+    case Instruction::Xor:
+      return combineXor(LHS, RHS);
+    case Instruction::Mul:
+      return combineMul(LHS, RHS);
+    case Instruction::Shl:
+      return combineShl(LHS, RHS);
+    case Instruction::AShr:
+      return combineAShr(LHS, RHS);
     }
   }
 
@@ -324,30 +321,30 @@ OffsetInfo &OffsetInfo::analyze(Value *Offset, StrideAnalysisResult &SAR) {
     const auto &BI = SAR.UVR.Ctx.builtins();
     if (const auto Builtin = BI.analyzeBuiltinCall(*CI, SAR.UVR.dimension)) {
       switch (Builtin->uniformity) {
-        default:
-        case compiler::utils::eBuiltinUniformityMaybeInstanceID:
-        case compiler::utils::eBuiltinUniformityNever:
-          return setMayDiverge();
-        case compiler::utils::eBuiltinUniformityLikeInputs:
-          break;
-        case compiler::utils::eBuiltinUniformityAlways:
-          return setKind(eOffsetUniformVariable);
-        case compiler::utils::eBuiltinUniformityInstanceID:
-          if (Builtin->properties & compiler::utils::eBuiltinPropertyLocalID) {
-            // If the local size is unknown (represented by zero), the resulting
-            // mask will be ~0ULL (all ones). Potentially, it is possible to use
-            // the CL_DEVICE_MAX_WORK_ITEM_SIZES property as an upper bound in
-            // this case.
-            uint64_t LocalBitMask = SAR.UVR.VU.getLocalSize() - 1;
-            LocalBitMask |= LocalBitMask >> 32;
-            LocalBitMask |= LocalBitMask >> 16;
-            LocalBitMask |= LocalBitMask >> 8;
-            LocalBitMask |= LocalBitMask >> 4;
-            LocalBitMask |= LocalBitMask >> 2;
-            LocalBitMask |= LocalBitMask >> 1;
-            BitMask = LocalBitMask;
-          }
-          return setStride(1);
+      default:
+      case compiler::utils::eBuiltinUniformityMaybeInstanceID:
+      case compiler::utils::eBuiltinUniformityNever:
+        return setMayDiverge();
+      case compiler::utils::eBuiltinUniformityLikeInputs:
+        break;
+      case compiler::utils::eBuiltinUniformityAlways:
+        return setKind(eOffsetUniformVariable);
+      case compiler::utils::eBuiltinUniformityInstanceID:
+        if (Builtin->properties & compiler::utils::eBuiltinPropertyLocalID) {
+          // If the local size is unknown (represented by zero), the resulting
+          // mask will be ~0ULL (all ones). Potentially, it is possible to use
+          // the CL_DEVICE_MAX_WORK_ITEM_SIZES property as an upper bound in
+          // this case.
+          uint64_t LocalBitMask = SAR.UVR.VU.getLocalSize() - 1;
+          LocalBitMask |= LocalBitMask >> 32;
+          LocalBitMask |= LocalBitMask >> 16;
+          LocalBitMask |= LocalBitMask >> 8;
+          LocalBitMask |= LocalBitMask >> 4;
+          LocalBitMask |= LocalBitMask >> 2;
+          LocalBitMask |= LocalBitMask >> 1;
+          BitMask = LocalBitMask;
+        }
+        return setStride(1);
       }
     }
   }
@@ -552,24 +549,24 @@ OffsetInfo &OffsetInfo::manifest(IRBuilder<> &B, StrideAnalysisResult &SAR) {
     // Build strides immediately before their instructions
     B.SetInsertPoint(BOp);
     switch (BOp->getOpcode()) {
-      default:
-        return *this;
-      case Instruction::Add:
-        return manifestAdd(B, LHS, RHS);
-      case Instruction::Sub:
-        return manifestSub(B, LHS, RHS);
-      case Instruction::And:
-        return manifestAnd(B, LHS, RHS);
-      case Instruction::Or:
-        return manifestOr(B, LHS, RHS);
-      case Instruction::Xor:
-        return manifestXor(B, LHS, RHS);
-      case Instruction::Mul:
-        return manifestMul(B, LHS, RHS);
-      case Instruction::Shl:
-        return manifestShl(B, LHS, RHS);
-      case Instruction::AShr:
-        return manifestAShr(B, LHS, RHS);
+    default:
+      return *this;
+    case Instruction::Add:
+      return manifestAdd(B, LHS, RHS);
+    case Instruction::Sub:
+      return manifestSub(B, LHS, RHS);
+    case Instruction::And:
+      return manifestAnd(B, LHS, RHS);
+    case Instruction::Or:
+      return manifestOr(B, LHS, RHS);
+    case Instruction::Xor:
+      return manifestXor(B, LHS, RHS);
+    case Instruction::Mul:
+      return manifestMul(B, LHS, RHS);
+    case Instruction::Shl:
+      return manifestShl(B, LHS, RHS);
+    case Instruction::AShr:
+      return manifestAShr(B, LHS, RHS);
     }
   }
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp
index cd5df6d5011e9..5d27b424d9d00 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp
@@ -48,9 +48,9 @@ using namespace llvm;
 /// @brief Provide debug logging for Vecz's PassManager
 ///
 /// This flag is intended for testing and debugging purposes.
-static cl::opt<bool> DebugVeczPipeline(
-    "debug-vecz-pipeline",
-    cl::desc("Enable debug logging of the vecz PassManager"));
+static cl::opt<bool>
+    DebugVeczPipeline("debug-vecz-pipeline",
+                      cl::desc("Enable debug logging of the vecz PassManager"));
 
 /// @brief Provide debug logging for Vecz's PassManager
 ///
@@ -270,8 +270,8 @@ std::optional<VeczPassOptions> getReqdSubgroupSizeOpts(Function &F) {
   return std::nullopt;
 }
 
-std::optional<VeczPassOptions> getAutoSubgroupSizeOpts(
-    Function &F, ModuleAnalysisManager &AM) {
+std::optional<VeczPassOptions>
+getAutoSubgroupSizeOpts(Function &F, ModuleAnalysisManager &AM) {
   // If there's a required sub-group size, we must return a vectorization
   // factor that gets us there.
   if (auto opts = getReqdSubgroupSizeOpts(F)) {
@@ -361,4 +361,4 @@ std::optional<VeczPassOptions> getAutoSubgroupSizeOpts(
   return vecz_opts;
 }
 
-}  // namespace vecz
+} // namespace vecz
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/reachability.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/reachability.cpp
index 4744a3fab1897..4c2ac445b32c3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/reachability.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/reachability.cpp
@@ -278,4 +278,4 @@ bool Reachability::isReachable(BasicBlock *from, BasicBlock *to) const {
   return from == to || isReachableImpl(fromI->second, toI->second);
 }
 
-}  // namespace vecz
+} // namespace vecz
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
index 8df91eeb9769a..175e1f043729d 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
@@ -63,13 +63,13 @@ using namespace llvm;
 using namespace vecz;
 
 class ControlFlowConversionState::Impl : public ControlFlowConversionState {
- public:
+public:
   Impl(Function &F, FunctionAnalysisManager &AM)
       : ControlFlowConversionState(F, AM) {}
 
   PreservedAnalyses run(Function &, FunctionAnalysisManager &);
 
- private:
+private:
   /// @brief utility struct used by LinearizeCFG to allow block retargeting
   /// info to be stored in a single contiguous vector of variable-length
   /// subvectors. This avoids having to use a vector of vectors, and all
@@ -271,8 +271,9 @@ class ControlFlowConversionState::Impl : public ControlFlowConversionState {
   /// @param[in] LTag The loop whose live values are being handled.
   /// @param[in] exitBlocks List of exit blocks before any transformation
   /// @return true if no problem occurred, false otherwise.
-  bool blendDivergentLoopLiveValues(
-      LoopTag &LTag, const SmallVectorImpl<BasicBlock *> &exitBlocks);
+  bool
+  blendDivergentLoopLiveValues(LoopTag &LTag,
+                               const SmallVectorImpl<BasicBlock *> &exitBlocks);
 
   /// @brief Generate blend instruction for loop exit masks at the latch.
   ///
@@ -280,9 +281,10 @@ class ControlFlowConversionState::Impl : public ControlFlowConversionState {
   /// @param[in] exitEdges List of exit edges before any transformation
   /// @param[in] exitBlocks List of exit blocks before any transformation
   /// @return true if no problem occurred, false otherwise.
-  bool blendDivergentLoopExitMasks(
-      LoopTag &LTag, const SmallVectorImpl<Loop::Edge> &exitEdges,
-      const SmallVectorImpl<BasicBlock *> &exitBlocks);
+  bool
+  blendDivergentLoopExitMasks(LoopTag &LTag,
+                              const SmallVectorImpl<Loop::Edge> &exitEdges,
+                              const SmallVectorImpl<BasicBlock *> &exitBlocks);
 
   /// @brief Replace uses of loop values outside of a divergent loop.
   ///
@@ -419,7 +421,7 @@ static bool isBranchCondTrulyUniform(Value *cond, UniformValueResult &UVR) {
 
   return UVR.isTrueUniform(cmp);
 }
-}  // namespace
+} // namespace
 
 ////////////////////////////////////////////////////////////////////////////////
 
@@ -433,13 +435,12 @@ PreservedAnalyses ControlFlowConversionPass::run(Function &F,
 
 ControlFlowConversionState::ControlFlowConversionState(
     Function &F, FunctionAnalysisManager &AM)
-    : F(F),
-      AM(AM),
-      VU(AM.getResult<VectorizationUnitAnalysis>(F).getVU()),
+    : F(F), AM(AM), VU(AM.getResult<VectorizationUnitAnalysis>(F).getVU()),
       Ctx(AM.getResult<VectorizationContextAnalysis>(F).getContext()) {}
 
-PreservedAnalyses ControlFlowConversionState::Impl::run(
-    Function &F, FunctionAnalysisManager &AM) {
+PreservedAnalyses
+ControlFlowConversionState::Impl::run(Function &F,
+                                      FunctionAnalysisManager &AM) {
   const auto &CFGR = AM.getResult<CFGAnalysis>(F);
   if (CFGR.getFailed()) {
     ++VeczCFGFail;
@@ -1233,52 +1234,52 @@ bool ControlFlowConversionState::Impl::tryApplyMaskToBinOp(
     // so it is sufficient to use the mask generated from the CFG.
     bool isUnsigned = false;
     switch (binOp->getOpcode()) {
-      case Instruction::UDiv:
-      case Instruction::URem:
-        isUnsigned = true;
-        LLVM_FALLTHROUGH;
-      case Instruction::SDiv:
-      case Instruction::SRem: {
-        auto *divisor = binOp->getOperand(1);
-        // no need to mask divides by a constant..
-        if (auto *C = dyn_cast<Constant>(divisor)) {
-          if (C->isZeroValue()) {
-            // Divides by constant zero can be a NOP since there is no
-            // division by zero exception in OpenCL.
-            auto *nop = binOp->getOperand(0);
-            I.replaceAllUsesWith(nop);
-            toDelete.emplace_back(&I, nop);
-          }
-        } else {
-          auto &masked = safeDivisors[divisor];
-          if (!masked) {
-            // NOTE this function does not check for the pattern
-            // "select (x eq 0) 1, x" or equivalent, so we might want to
-            // write it ourselves, but Instruction Combining cleans it up.
-            // NOTE that for a signed division, we also have to consider the
-            // potential overflow situation, which is not so simple
-            if (isUnsigned &&
-                isKnownNonZero(divisor, F.getParent()->getDataLayout())) {
-              // Static analysis concluded it can't be zero, so we don't need
-              // to do anything.
-              masked = divisor;
-            } else {
-              auto *SI = SelectInst::Create(
-                  mask, divisor, ConstantInt::get(divisor->getType(), 1),
-                  divisor->getName() + ".masked");
-              SI->insertBefore(I.getIterator());
-              masked = SI;
-            }
+    case Instruction::UDiv:
+    case Instruction::URem:
+      isUnsigned = true;
+      LLVM_FALLTHROUGH;
+    case Instruction::SDiv:
+    case Instruction::SRem: {
+      auto *divisor = binOp->getOperand(1);
+      // no need to mask divides by a constant..
+      if (auto *C = dyn_cast<Constant>(divisor)) {
+        if (C->isZeroValue()) {
+          // Divides by constant zero can be a NOP since there is no
+          // division by zero exception in OpenCL.
+          auto *nop = binOp->getOperand(0);
+          I.replaceAllUsesWith(nop);
+          toDelete.emplace_back(&I, nop);
+        }
+      } else {
+        auto &masked = safeDivisors[divisor];
+        if (!masked) {
+          // NOTE this function does not check for the pattern
+          // "select (x eq 0) 1, x" or equivalent, so we might want to
+          // write it ourselves, but Instruction Combining cleans it up.
+          // NOTE that for a signed division, we also have to consider the
+          // potential overflow situation, which is not so simple
+          if (isUnsigned &&
+              isKnownNonZero(divisor, F.getParent()->getDataLayout())) {
+            // Static analysis concluded it can't be zero, so we don't need
+            // to do anything.
+            masked = divisor;
+          } else {
+            auto *SI = SelectInst::Create(
+                mask, divisor, ConstantInt::get(divisor->getType(), 1),
+                divisor->getName() + ".masked");
+            SI->insertBefore(I.getIterator());
+            masked = SI;
           }
+        }
 
-          if (masked != divisor) {
-            binOp->setOperand(1, masked);
-          }
+        if (masked != divisor) {
+          binOp->setOperand(1, masked);
         }
-      } break;
+      }
+    } break;
 
-      default:
-        break;
+    default:
+      break;
     }
     return true;
   } else {
@@ -1345,7 +1346,7 @@ bool ControlFlowConversionState::Impl::applyMaskToCall(CallInst *CI,
   if (!callee) {
     callee = dyn_cast<Function>(CI->getCalledOperand()->stripPointerCasts());
   }
-  VECZ_FAIL_IF(!callee);  // TODO: Support indirect function calls.
+  VECZ_FAIL_IF(!callee); // TODO: Support indirect function calls.
   // Check to see if this is a function that we know we won't be able to
   // handle in any other way.
   VECZ_FAIL_IF(callee->cannotDuplicate());
@@ -1878,35 +1879,35 @@ bool ControlFlowConversionState::Impl::rewireDivergentLoopExitBlocks(
 
   auto removeSuccessor = [this](Instruction *T, unsigned succIdx) {
     switch (T->getOpcode()) {
-      default:
-        // Any other kind of Terminator cannot be handled and until
-        // proven otherwise, should not.
-        break;
-      case Instruction::Br: {
-        const unsigned keepIdx = succIdx == 0 ? 1 : 0;
-        auto *newT = BranchInst::Create(T->getSuccessor(keepIdx));
-        newT->insertBefore(T->getIterator());
+    default:
+      // Any other kind of Terminator cannot be handled and until
+      // proven otherwise, should not.
+      break;
+    case Instruction::Br: {
+      const unsigned keepIdx = succIdx == 0 ? 1 : 0;
+      auto *newT = BranchInst::Create(T->getSuccessor(keepIdx));
+      newT->insertBefore(T->getIterator());
 
-        updateMaps(T, newT);
+      updateMaps(T, newT);
 
-        IRCleanup::deleteInstructionNow(T);
-        break;
-      }
-      case Instruction::Switch: {
-        SwitchInst *SI = cast<SwitchInst>(T);
-        if (succIdx == 0) {
-          SI->setDefaultDest(SI->getSuccessor(1));
-          SI->removeCase(SI->case_begin());
-        } else {
-          SI->removeCase(std::next(SI->case_begin(), succIdx - 1));
-        }
-        break;
-      }
-      case Instruction::IndirectBr: {
-        IndirectBrInst *IBI = cast<IndirectBrInst>(T);
-        IBI->removeDestination(succIdx);
-        break;
+      IRCleanup::deleteInstructionNow(T);
+      break;
+    }
+    case Instruction::Switch: {
+      SwitchInst *SI = cast<SwitchInst>(T);
+      if (succIdx == 0) {
+        SI->setDefaultDest(SI->getSuccessor(1));
+        SI->removeCase(SI->case_begin());
+      } else {
+        SI->removeCase(std::next(SI->case_begin(), succIdx - 1));
       }
+      break;
+    }
+    case Instruction::IndirectBr: {
+      IndirectBrInst *IBI = cast<IndirectBrInst>(T);
+      IBI->removeDestination(succIdx);
+      break;
+    }
     }
   };
 
@@ -2394,7 +2395,7 @@ void removeDeferrals(BasicBlock *src, DenseDeferralMap &deferrals) {
     deferrals.erase(deferredIt);
   }
 }
-}  // namespace
+} // namespace
 
 bool ControlFlowConversionState::Impl::computeNewTargets(Linearization &lin) {
   // The entry block cannot be targeted.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/inline_post_vectorization_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/inline_post_vectorization_pass.cpp
index 24896a5af6134..3953e4257e1a9 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/inline_post_vectorization_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/inline_post_vectorization_pass.cpp
@@ -76,10 +76,10 @@ Value *processCallSite(CallInst *CI, bool &NeedLLVMInline,
   return CI;
 }
 
-}  // namespace
+} // namespace
 
-PreservedAnalyses InlinePostVectorizationPass::run(
-    Function &F, FunctionAnalysisManager &AM) {
+PreservedAnalyses
+InlinePostVectorizationPass::run(Function &F, FunctionAnalysisManager &AM) {
   bool modified = false;
   bool needToRunInliner = false;
   auto &BI =
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/instantiation_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/instantiation_pass.cpp
index 1f509ba022787..4235885c8a564 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/instantiation_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/instantiation_pass.cpp
@@ -100,15 +100,15 @@ PacketRange InstantiationPass::instantiateInternal(Value *V) {
 PacketRange InstantiationPass::instantiateInstruction(Instruction *Ins) {
   // Figure out what kind of instruction it is and try to instantiate it.
   switch (Ins->getOpcode()) {
-    default:
-      // No special handling of this Instruction so just clone across lanes..
-      break;
+  default:
+    // No special handling of this Instruction so just clone across lanes..
+    break;
 
-    case Instruction::Call:
-      return assignInstance(instantiateCall(cast<CallInst>(Ins)), Ins);
+  case Instruction::Call:
+    return assignInstance(instantiateCall(cast<CallInst>(Ins)), Ins);
 
-    case Instruction::Alloca:
-      return assignInstance(instantiateAlloca(cast<AllocaInst>(Ins)), Ins);
+  case Instruction::Alloca:
+    return assignInstance(instantiateAlloca(cast<AllocaInst>(Ins)), Ins);
   }
 
   return assignInstance(instantiateByCloning(Ins), Ins);
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/interleaved_group_combine_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/interleaved_group_combine_pass.cpp
index 31ce6983d3eab..f2308d7bb050f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/interleaved_group_combine_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/interleaved_group_combine_pass.cpp
@@ -186,7 +186,7 @@ bool canMoveDown(const SmallVectorImpl<Value *> &Group, Instruction *IA) {
   return false;
 }
 
-}  // namespace
+} // namespace
 
 bool InterleavedGroupCombinePass::InterleavedGroupInfo::canDeinterleaveMask(
     const Instruction &Mask) const {
@@ -237,8 +237,8 @@ bool InterleavedGroupCombinePass::InterleavedGroupInfo::canDeinterleaveMask(
   return true;
 }
 
-PreservedAnalyses InterleavedGroupCombinePass::run(
-    Function &F, FunctionAnalysisManager &AM) {
+PreservedAnalyses
+InterleavedGroupCombinePass::run(Function &F, FunctionAnalysisManager &AM) {
   auto &Ctx = AM.getResult<VectorizationContextAnalysis>(F).getContext();
   IRCleanup IC;
 
@@ -492,11 +492,13 @@ bool InterleavedGroupCombinePass::findGroup(
 
       // If the same offset occurs several times, we can still de-interleave
       // the unique ones, and maybe catch the rest the next time round.
-      InfoE = Group.Info.erase(
-          std::unique(InfoB, InfoE,
-                      [](const GroupMemberInfo &a, const GroupMemberInfo &b)
-                          -> bool { return a.Offset == b.Offset; }),
-          InfoE);
+      InfoE =
+          Group.Info.erase(std::unique(InfoB, InfoE,
+                                       [](const GroupMemberInfo &a,
+                                          const GroupMemberInfo &b) -> bool {
+                                         return a.Offset == b.Offset;
+                                       }),
+                           InfoE);
 
       if (Group.Info.size() <= 1) {
         // This could happen if our entire group has the same address, in
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/loop_rotate_custom_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/loop_rotate_custom_pass.cpp
index 949977f889a03..b1274d91cf196 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/loop_rotate_custom_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/loop_rotate_custom_pass.cpp
@@ -14,15 +14,16 @@
 //
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
+#include "transform/passes.h"
 #include "llvm/Transforms/Scalar/LoopPassManager.h"
 #include "llvm/Transforms/Scalar/LoopRotation.h"
-#include "transform/passes.h"
 
 using namespace llvm;
 
-llvm::PreservedAnalyses vecz::VeczLoopRotatePass::run(
-    llvm::Loop &L, llvm::LoopAnalysisManager &LAM,
-    llvm::LoopStandardAnalysisResults &AR, llvm::LPMUpdater &LU) {
+llvm::PreservedAnalyses
+vecz::VeczLoopRotatePass::run(llvm::Loop &L, llvm::LoopAnalysisManager &LAM,
+                              llvm::LoopStandardAnalysisResults &AR,
+                              llvm::LPMUpdater &LU) {
   // Only process loops whose latch cannot exit the loop and its predecessors
   // cannot either.
   if (L.isLoopExiting(L.getLoopLatch())) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
index 996881c8b16ab..cbeb82b3c47f6 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp
@@ -65,7 +65,7 @@ Value *createFixedBroadcastOfScalableVector(const vecz::TargetInfo &TI,
   assert(!factor.isScalable());
   return scalableBroadcastHelper(subvec, factor, TI, B, /*URem*/ false);
 }
-}  // namespace
+} // namespace
 
 namespace vecz {
 IRBuilder<> buildAfter(Value *V, Function &F, bool IsPhi) {
@@ -244,49 +244,49 @@ Value *createMaybeVPReduction(IRBuilderBase &B, Value *Val, RecurKind Kind,
   }
   auto IntrinsicOp = Intrinsic::not_intrinsic;
   switch (Kind) {
-    default:
-      break;
-    case RecurKind::None:
-      return nullptr;
-    case RecurKind::Add:
-      IntrinsicOp = Intrinsic::vp_reduce_add;
-      break;
-    case RecurKind::Mul:
-      IntrinsicOp = Intrinsic::vp_reduce_mul;
-      break;
-    case RecurKind::Or:
-      IntrinsicOp = Intrinsic::vp_reduce_or;
-      break;
-    case RecurKind::And:
-      IntrinsicOp = Intrinsic::vp_reduce_and;
-      break;
-    case RecurKind::Xor:
-      IntrinsicOp = Intrinsic::vp_reduce_xor;
-      break;
-    case RecurKind::FAdd:
-      IntrinsicOp = Intrinsic::vp_reduce_fadd;
-      break;
-    case RecurKind::FMul:
-      IntrinsicOp = Intrinsic::vp_reduce_fmul;
-      break;
-    case RecurKind::SMin:
-      IntrinsicOp = Intrinsic::vp_reduce_smin;
-      break;
-    case RecurKind::SMax:
-      IntrinsicOp = Intrinsic::vp_reduce_smax;
-      break;
-    case RecurKind::UMin:
-      IntrinsicOp = Intrinsic::vp_reduce_umin;
-      break;
-    case RecurKind::UMax:
-      IntrinsicOp = Intrinsic::vp_reduce_umax;
-      break;
-    case RecurKind::FMin:
-      IntrinsicOp = Intrinsic::vp_reduce_fmin;
-      break;
-    case RecurKind::FMax:
-      IntrinsicOp = Intrinsic::vp_reduce_fmax;
-      break;
+  default:
+    break;
+  case RecurKind::None:
+    return nullptr;
+  case RecurKind::Add:
+    IntrinsicOp = Intrinsic::vp_reduce_add;
+    break;
+  case RecurKind::Mul:
+    IntrinsicOp = Intrinsic::vp_reduce_mul;
+    break;
+  case RecurKind::Or:
+    IntrinsicOp = Intrinsic::vp_reduce_or;
+    break;
+  case RecurKind::And:
+    IntrinsicOp = Intrinsic::vp_reduce_and;
+    break;
+  case RecurKind::Xor:
+    IntrinsicOp = Intrinsic::vp_reduce_xor;
+    break;
+  case RecurKind::FAdd:
+    IntrinsicOp = Intrinsic::vp_reduce_fadd;
+    break;
+  case RecurKind::FMul:
+    IntrinsicOp = Intrinsic::vp_reduce_fmul;
+    break;
+  case RecurKind::SMin:
+    IntrinsicOp = Intrinsic::vp_reduce_smin;
+    break;
+  case RecurKind::SMax:
+    IntrinsicOp = Intrinsic::vp_reduce_smax;
+    break;
+  case RecurKind::UMin:
+    IntrinsicOp = Intrinsic::vp_reduce_umin;
+    break;
+  case RecurKind::UMax:
+    IntrinsicOp = Intrinsic::vp_reduce_umax;
+    break;
+  case RecurKind::FMin:
+    IntrinsicOp = Intrinsic::vp_reduce_fmin;
+    break;
+  case RecurKind::FMax:
+    IntrinsicOp = Intrinsic::vp_reduce_fmax;
+    break;
   }
 
   auto *const F = Intrinsic::getOrInsertDeclaration(
@@ -334,7 +334,7 @@ Value *createIndexSequence(IRBuilder<> &Builder, VectorType *VecTy,
   return ConstantVector::get(Indices);
 }
 
-}  // namespace vecz
+} // namespace vecz
 
 PacketRange PacketInfo::getRange(std::vector<llvm::Value *> &d,
                                  unsigned width) const {
@@ -663,7 +663,7 @@ Value *scalableBroadcastHelper(Value *subvec, ElementCount factor,
 
   return gather;
 }
-}  // namespace
+} // namespace
 
 const Packetizer::Result &Packetizer::Result::broadcast(unsigned width) const {
   const auto factor = packetizer.width().divideCoefficientBy(width);
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_pass.cpp
index efe1cefbc54b4..e45e2d91bf9d5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_pass.cpp
@@ -38,9 +38,8 @@ using namespace llvm;
 
 STATISTIC(VeczPacketizeFail,
           "Number of kernels that failed to packetize [ID#P80]");
-STATISTIC(VeczSimdAnalysisFail,
-          "Number of kernels that SIMD Width Analysis "
-          "suggested not to packetize [ID#P81]");
+STATISTIC(VeczSimdAnalysisFail, "Number of kernels that SIMD Width Analysis "
+                                "suggested not to packetize [ID#P81]");
 
 char PacketizationPass::PassID = 0;
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
index 7a5118d7643e8..5e0a1fbc7e12e 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -77,11 +77,11 @@ STATISTIC(VeczPacketizeFailStride,
 
 // Just a little macro that can return an empty SmallVector, as a drop-in
 // replacement for VECZ_FAIL_IF..
-#define PACK_FAIL_IF(cond) \
-  do {                     \
-    if (cond) {            \
-      return {};           \
-    }                      \
+#define PACK_FAIL_IF(cond)                                                     \
+  do {                                                                         \
+    if (cond) {                                                                \
+      return {};                                                               \
+    }                                                                          \
   } while (false)
 
 namespace {
@@ -96,7 +96,7 @@ Type *getPaddedType(Type *Ty) {
   }
   return Ty;
 }
-}  // namespace
+} // namespace
 
 using ValuePacket = SmallVector<Value *, 16>;
 
@@ -106,7 +106,7 @@ using ValuePacket = SmallVector<Value *, 16>;
 /// Packetizer, while also ensuring that a Packetizer cannot be created except
 /// as the base class of its own implementation.
 class Packetizer::Impl : public Packetizer {
- public:
+public:
   Impl(llvm::Function &F, llvm::FunctionAnalysisManager &AM, ElementCount Width,
        unsigned Dim);
   Impl() = delete;
@@ -144,9 +144,9 @@ class Packetizer::Impl : public Packetizer {
   ValuePacket packetizeAndGet(Value *V, unsigned Width);
 
   /// @brief Helper to produce a Result from a Packet
-  Packetizer::Result getPacketizationResult(
-      Instruction *I, const SmallVectorImpl<Value *> &Packet,
-      bool UpdateStats = false);
+  Packetizer::Result
+  getPacketizationResult(Instruction *I, const SmallVectorImpl<Value *> &Packet,
+                         bool UpdateStats = false);
 
   /// @brief Packetize the given value from the function, only if it is a
   /// varying value. Ensures Mask Varying values are handled correctly.
@@ -207,8 +207,8 @@ class Packetizer::Impl : public Packetizer {
   ///
   /// @return The group collective data if the instruction is a call to any of
   /// the mux subgroup shuffle builtins; std::nullopt otherwise.
-  std::optional<compiler::utils::GroupCollective> isSubgroupShuffleLike(
-      Instruction *I);
+  std::optional<compiler::utils::GroupCollective>
+  isSubgroupShuffleLike(Instruction *I);
   /// @brief Packetize a sub-group shuffle builtin
   ///
   /// Note - not any shuffle-like operation, but specifically the 'shuffle'
@@ -227,8 +227,9 @@ class Packetizer::Impl : public Packetizer {
   /// @param[in] ShuffleXor Shuffle to packetize.
   ///
   /// @return Packetized instructions.
-  Result packetizeSubgroupShuffleXor(
-      Instruction *Ins, compiler::utils::GroupCollective ShuffleXor);
+  Result
+  packetizeSubgroupShuffleXor(Instruction *Ins,
+                              compiler::utils::GroupCollective ShuffleXor);
   /// @brief Packetize a sub-group shuffle-up or shuffle-down builtin
   ///
   /// Note - not any shuffle-like operation, but specifically the 'shuffle_up'
@@ -298,8 +299,9 @@ class Packetizer::Impl : public Packetizer {
   /// @param[in] AtomicInfo Information about the masked atomic.
   ///
   /// @return Packetized instruction.
-  ValuePacket packetizeMaskedAtomic(
-      CallInst &CI, VectorizationContext::MaskedAtomic AtomicInfo);
+  ValuePacket
+  packetizeMaskedAtomic(CallInst &CI,
+                        VectorizationContext::MaskedAtomic AtomicInfo);
   /// @brief Packetize a GEP instruction.
   ///
   /// @param[in] GEP Instruction to packetize.
@@ -440,15 +442,11 @@ class Packetizer::Impl : public Packetizer {
 
 Packetizer::Packetizer(llvm::Function &F, llvm::FunctionAnalysisManager &AM,
                        ElementCount Width, unsigned Dim)
-    : AM(AM),
-      VU(AM.getResult<VectorizationUnitAnalysis>(F).getVU()),
+    : AM(AM), VU(AM.getResult<VectorizationUnitAnalysis>(F).getVU()),
       Ctx(AM.getResult<VectorizationContextAnalysis>(F).getContext()),
-      Choices(VU.choices()),
-      UVR(AM.getResult<UniformValueAnalysis>(F)),
+      Choices(VU.choices()), UVR(AM.getResult<UniformValueAnalysis>(F)),
       SAR(AM.getResult<StrideAnalysis>(F)),
-      PAR(AM.getResult<PacketizationAnalysis>(F)),
-      F(F),
-      SimdWidth(Width),
+      PAR(AM.getResult<PacketizationAnalysis>(F)), F(F), SimdWidth(Width),
       Dimension(Dim) {}
 
 Packetizer::Impl::Impl(llvm::Function &F, llvm::FunctionAnalysisManager &AM,
@@ -997,24 +995,24 @@ Packetizer::Result Packetizer::Impl::packetize(Value *V) {
 
   if (auto shuffle = isSubgroupShuffleLike(Ins)) {
     switch (shuffle->Op) {
-      default:
-        break;
-      case compiler::utils::GroupCollective::OpKind::Shuffle:
-        if (auto *s = packetizeSubgroupShuffle(Ins)) {
-          return broadcast(s);
-        }
-        break;
-      case compiler::utils::GroupCollective::OpKind::ShuffleXor:
-        if (auto s = packetizeSubgroupShuffleXor(Ins, *shuffle)) {
-          return s;
-        }
-        break;
-      case compiler::utils::GroupCollective::OpKind::ShuffleUp:
-      case compiler::utils::GroupCollective::OpKind::ShuffleDown:
-        if (auto s = packetizeSubgroupShuffleUpDown(Ins, *shuffle)) {
-          return s;
-        }
-        break;
+    default:
+      break;
+    case compiler::utils::GroupCollective::OpKind::Shuffle:
+      if (auto *s = packetizeSubgroupShuffle(Ins)) {
+        return broadcast(s);
+      }
+      break;
+    case compiler::utils::GroupCollective::OpKind::ShuffleXor:
+      if (auto s = packetizeSubgroupShuffleXor(Ins, *shuffle)) {
+        return s;
+      }
+      break;
+    case compiler::utils::GroupCollective::OpKind::ShuffleUp:
+    case compiler::utils::GroupCollective::OpKind::ShuffleDown:
+      if (auto s = packetizeSubgroupShuffleUpDown(Ins, *shuffle)) {
+        return s;
+      }
+      break;
     }
     // We can't packetize all sub-group shuffle-like operations, but we also
     // can't vectorize or instantiate them - so provide a diagnostic saying as
@@ -1128,61 +1126,61 @@ Packetizer::Result Packetizer::Impl::packetizeInstruction(Instruction *Ins) {
 
   // Figure out what kind of instruction it is and try to vectorize it.
   switch (Ins->getOpcode()) {
-    default:
-      if (Ins->isBinaryOp()) {
-        results = packetizeBinaryOp(cast<BinaryOperator>(Ins));
-      } else if (Ins->isCast()) {
-        results = packetizeCast(cast<CastInst>(Ins));
-      } else if (Ins->isUnaryOp()) {
-        results = packetizeUnaryOp(cast<UnaryOperator>(Ins));
-      }
-      break;
-
-    case Instruction::PHI:
-      results = packetizePHI(cast<PHINode>(Ins));
-      break;
-    case Instruction::GetElementPtr:
-      results = packetizeGEP(cast<GetElementPtrInst>(Ins));
-      break;
-    case Instruction::Store:
-      results = packetizeStore(cast<StoreInst>(Ins));
-      break;
-    case Instruction::Load:
-      results = packetizeLoad(cast<LoadInst>(Ins));
-      break;
-    case Instruction::Call:
-      results = packetizeCall(cast<CallInst>(Ins));
-      break;
-    case Instruction::ICmp:
-      results = packetizeICmp(cast<ICmpInst>(Ins));
-      break;
-    case Instruction::FCmp:
-      results = packetizeFCmp(cast<FCmpInst>(Ins));
-      break;
-    case Instruction::Select:
-      results = packetizeSelect(cast<SelectInst>(Ins));
-      break;
-    case Instruction::InsertElement:
-      results = packetizeInsertElement(cast<InsertElementInst>(Ins));
-      break;
-    case Instruction::ExtractElement:
-      results = packetizeExtractElement(cast<ExtractElementInst>(Ins));
-      break;
-    case Instruction::InsertValue:
-      results = packetizeInsertValue(cast<InsertValueInst>(Ins));
-      break;
-    case Instruction::ExtractValue:
-      results = packetizeExtractValue(cast<ExtractValueInst>(Ins));
-      break;
-    case Instruction::ShuffleVector:
-      results = packetizeShuffleVector(cast<ShuffleVectorInst>(Ins));
-      break;
-    case Instruction::Freeze:
-      results = packetizeFreeze(cast<FreezeInst>(Ins));
-      break;
-    case Instruction::AtomicCmpXchg:
-      results = packetizeAtomicCmpXchg(cast<AtomicCmpXchgInst>(Ins));
-      break;
+  default:
+    if (Ins->isBinaryOp()) {
+      results = packetizeBinaryOp(cast<BinaryOperator>(Ins));
+    } else if (Ins->isCast()) {
+      results = packetizeCast(cast<CastInst>(Ins));
+    } else if (Ins->isUnaryOp()) {
+      results = packetizeUnaryOp(cast<UnaryOperator>(Ins));
+    }
+    break;
+
+  case Instruction::PHI:
+    results = packetizePHI(cast<PHINode>(Ins));
+    break;
+  case Instruction::GetElementPtr:
+    results = packetizeGEP(cast<GetElementPtrInst>(Ins));
+    break;
+  case Instruction::Store:
+    results = packetizeStore(cast<StoreInst>(Ins));
+    break;
+  case Instruction::Load:
+    results = packetizeLoad(cast<LoadInst>(Ins));
+    break;
+  case Instruction::Call:
+    results = packetizeCall(cast<CallInst>(Ins));
+    break;
+  case Instruction::ICmp:
+    results = packetizeICmp(cast<ICmpInst>(Ins));
+    break;
+  case Instruction::FCmp:
+    results = packetizeFCmp(cast<FCmpInst>(Ins));
+    break;
+  case Instruction::Select:
+    results = packetizeSelect(cast<SelectInst>(Ins));
+    break;
+  case Instruction::InsertElement:
+    results = packetizeInsertElement(cast<InsertElementInst>(Ins));
+    break;
+  case Instruction::ExtractElement:
+    results = packetizeExtractElement(cast<ExtractElementInst>(Ins));
+    break;
+  case Instruction::InsertValue:
+    results = packetizeInsertValue(cast<InsertValueInst>(Ins));
+    break;
+  case Instruction::ExtractValue:
+    results = packetizeExtractValue(cast<ExtractValueInst>(Ins));
+    break;
+  case Instruction::ShuffleVector:
+    results = packetizeShuffleVector(cast<ShuffleVectorInst>(Ins));
+    break;
+  case Instruction::Freeze:
+    results = packetizeFreeze(cast<FreezeInst>(Ins));
+    break;
+  case Instruction::AtomicCmpXchg:
+    results = packetizeAtomicCmpXchg(cast<AtomicCmpXchgInst>(Ins));
+    break;
   }
 
   if (auto res = getPacketizationResult(Ins, results, /*update stats*/ true)) {
@@ -2103,14 +2101,14 @@ ValuePacket Packetizer::Impl::packetizeCall(CallInst *CI) {
     // cases can set independently what operand must be skipped.
     SmallVector<bool, maxOperands> operandsToSkip(maxOperands, false);
     switch (IntrID) {
-      case Intrinsic::abs:
-      case Intrinsic::ctlz:
-      case Intrinsic::cttz:
-        // def abs [LLVMMatchType<0>, llvm_i1_ty]
-        operandsToSkip = {false, true};
-        break;
-      default:
-        break;
+    case Intrinsic::abs:
+    case Intrinsic::ctlz:
+    case Intrinsic::cttz:
+      // def abs [LLVMMatchType<0>, llvm_i1_ty]
+      operandsToSkip = {false, true};
+      break;
+    default:
+      break;
     }
 
     auto *const ty = CI->getType();
@@ -2140,7 +2138,7 @@ ValuePacket Packetizer::Impl::packetizeCall(CallInst *CI) {
     }
 
     const auto name = CI->getName();
-    Type *const types[1] = {wideTy};  // because LLVM 13 is a numpty
+    Type *const types[1] = {wideTy}; // because LLVM 13 is a numpty
     Value *opVals[maxOperands];
     for (unsigned i = 0; i < packetWidth; ++i) {
       for (unsigned j = 0; j < n; ++j) {
@@ -2276,8 +2274,9 @@ ValuePacket Packetizer::Impl::packetizeCall(CallInst *CI) {
   return results;
 }
 
-ValuePacket Packetizer::Impl::packetizeGroupScan(
-    CallInst *CI, compiler::utils::GroupCollective Scan) {
+ValuePacket
+Packetizer::Impl::packetizeGroupScan(CallInst *CI,
+                                     compiler::utils::GroupCollective Scan) {
   ValuePacket results;
 
   Function *callee = CI->getCalledFunction();
@@ -2304,46 +2303,46 @@ ValuePacket Packetizer::Impl::packetizeGroupScan(
   bool opIsSignedInt = false;
 
   switch (Scan.Recurrence) {
-    default:
-      assert(false && "Impossible subgroup scan kind");
-      return results;
-    case llvm::RecurKind::Add:
-    case llvm::RecurKind::FAdd:
-      op = "add";
-      break;
-    case llvm::RecurKind::SMin:
-      op = "smin";
-      opIsSignedInt = true;
-      break;
-    case llvm::RecurKind::UMin:
-      op = "umin";
-      break;
-    case llvm::RecurKind::FMin:
-      op = "min";
-      break;
-    case llvm::RecurKind::SMax:
-      op = "smax";
-      opIsSignedInt = true;
-      break;
-    case llvm::RecurKind::UMax:
-      op = "umax";
-      break;
-    case llvm::RecurKind::FMax:
-      op = "max";
-      break;
-    case llvm::RecurKind::Mul:
-    case llvm::RecurKind::FMul:
-      op = "mul";
-      break;
-    case llvm::RecurKind::And:
-      op = "and";
-      break;
-    case llvm::RecurKind::Or:
-      op = "or";
-      break;
-    case llvm::RecurKind::Xor:
-      op = "xor";
-      break;
+  default:
+    assert(false && "Impossible subgroup scan kind");
+    return results;
+  case llvm::RecurKind::Add:
+  case llvm::RecurKind::FAdd:
+    op = "add";
+    break;
+  case llvm::RecurKind::SMin:
+    op = "smin";
+    opIsSignedInt = true;
+    break;
+  case llvm::RecurKind::UMin:
+    op = "umin";
+    break;
+  case llvm::RecurKind::FMin:
+    op = "min";
+    break;
+  case llvm::RecurKind::SMax:
+    op = "smax";
+    opIsSignedInt = true;
+    break;
+  case llvm::RecurKind::UMax:
+    op = "umax";
+    break;
+  case llvm::RecurKind::FMax:
+    op = "max";
+    break;
+  case llvm::RecurKind::Mul:
+  case llvm::RecurKind::FMul:
+    op = "mul";
+    break;
+  case llvm::RecurKind::And:
+    op = "and";
+    break;
+  case llvm::RecurKind::Or:
+    op = "or";
+    break;
+  case llvm::RecurKind::Xor:
+    op = "xor";
+    break;
   }
 
   // Now create the mangled builtin function name.
@@ -2450,20 +2449,20 @@ Value *Packetizer::Impl::vectorizeInstruction(Instruction *Ins) {
   // Figure out what kind of instruction it is and try to vectorize it.
   Value *Result = nullptr;
   switch (Ins->getOpcode()) {
-    default:
-      break;
-    case Instruction::Call:
-      Result = vectorizeCall(cast<CallInst>(Ins));
-      break;
-    case Instruction::Ret:
-      Result = vectorizeReturn(cast<ReturnInst>(Ins));
-      break;
-    case Instruction::Alloca:
-      Result = vectorizeAlloca(cast<AllocaInst>(Ins));
-      break;
-    case Instruction::ExtractValue:
-      Result = vectorizeExtractValue(cast<ExtractValueInst>(Ins));
-      break;
+  default:
+    break;
+  case Instruction::Call:
+    Result = vectorizeCall(cast<CallInst>(Ins));
+    break;
+  case Instruction::Ret:
+    Result = vectorizeReturn(cast<ReturnInst>(Ins));
+    break;
+  case Instruction::Alloca:
+    Result = vectorizeAlloca(cast<AllocaInst>(Ins));
+    break;
+  case Instruction::ExtractValue:
+    Result = vectorizeExtractValue(cast<ExtractValueInst>(Ins));
+    break;
   }
 
   if (Result) {
@@ -3082,8 +3081,8 @@ ValuePacket Packetizer::Impl::packetizeFreeze(FreezeInst *FreezeI) {
   return results;
 }
 
-ValuePacket Packetizer::Impl::packetizeAtomicCmpXchg(
-    AtomicCmpXchgInst *AtomicI) {
+ValuePacket
+Packetizer::Impl::packetizeAtomicCmpXchg(AtomicCmpXchgInst *AtomicI) {
   ValuePacket results;
 
   VectorizationContext::MaskedAtomic MA;
@@ -3541,8 +3540,8 @@ Value *Packetizer::Impl::vectorizeExtractValue(ExtractValueInst *ExtractValue) {
   return Result;
 }
 
-ValuePacket Packetizer::Impl::packetizeInsertElement(
-    InsertElementInst *InsertElement) {
+ValuePacket
+Packetizer::Impl::packetizeInsertElement(InsertElementInst *InsertElement) {
   ValuePacket results;
   Value *Result = nullptr;
 
@@ -3716,8 +3715,8 @@ ValuePacket Packetizer::Impl::packetizeInsertElement(
   return results;
 }
 
-ValuePacket Packetizer::Impl::packetizeExtractElement(
-    ExtractElementInst *ExtractElement) {
+ValuePacket
+Packetizer::Impl::packetizeExtractElement(ExtractElementInst *ExtractElement) {
   ValuePacket results;
   Value *Result = nullptr;
 
@@ -3846,8 +3845,8 @@ ValuePacket Packetizer::Impl::packetizeExtractElement(
   return results;
 }
 
-ValuePacket Packetizer::Impl::packetizeInsertValue(
-    InsertValueInst *InsertValue) {
+ValuePacket
+Packetizer::Impl::packetizeInsertValue(InsertValueInst *InsertValue) {
   ValuePacket results;
 
   Value *const Val = InsertValue->getInsertedValueOperand();
@@ -3887,8 +3886,8 @@ ValuePacket Packetizer::Impl::packetizeInsertValue(
   return results;
 }
 
-ValuePacket Packetizer::Impl::packetizeExtractValue(
-    ExtractValueInst *ExtractValue) {
+ValuePacket
+Packetizer::Impl::packetizeExtractValue(ExtractValueInst *ExtractValue) {
   ValuePacket results;
 
   Value *const Aggregate = ExtractValue->getAggregateOperand();
@@ -3910,8 +3909,8 @@ ValuePacket Packetizer::Impl::packetizeExtractValue(
   return results;
 }
 
-ValuePacket Packetizer::Impl::packetizeShuffleVector(
-    ShuffleVectorInst *Shuffle) {
+ValuePacket
+Packetizer::Impl::packetizeShuffleVector(ShuffleVectorInst *Shuffle) {
   Value *const srcA = Shuffle->getOperand(0);
   Value *const srcB = Shuffle->getOperand(1);
   assert(srcA && "Could not get operand 0 from Shuffle");
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/passes.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/passes.cpp
index d24d615dcf764..a496b0fdb44c1 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/passes.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/passes.cpp
@@ -177,4 +177,4 @@ PreservedAnalyses SimplifyMaskedMemOpsPass::run(Function &F,
   return Preserved;
 }
 
-}  // namespace vecz
+} // namespace vecz
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/pre_linearize_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/pre_linearize_pass.cpp
index e82ec366c8f85..b72ab38121384 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/pre_linearize_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/pre_linearize_pass.cpp
@@ -156,64 +156,64 @@ bool hoistInstructions(BasicBlock &BB, BranchInst &Branch, bool exceptions) {
     // constant.
     bool isUnsigned = false;
     switch (binOp->getOpcode()) {
-      default:
-        break;
-      case Instruction::UDiv:
-      case Instruction::URem:
-        isUnsigned = true;
-        LLVM_FALLTHROUGH;
-      case Instruction::SDiv:
-      case Instruction::SRem: {
-        auto *divisor = binOp->getOperand(1);
-        if (auto *C = dyn_cast<Constant>(divisor)) {
-          if (C->isZeroValue()) {
-            // Divides by constant zero can be a NOP since there is no
-            // division by zero exception in OpenCL.
-            I.replaceAllUsesWith(binOp->getOperand(0));
-            I.eraseFromParent();
-          }
-        } else {
-          // if the divisor could be illegal, we need to guard it with a
-          // select instruction generated from the branch condition.
-          auto &masked = safeDivisors[divisor];
-          if (!masked) {
-            // NOTE this function does not check for the pattern
-            // "select (x eq 0) 1, x" or equivalent, so we might want to
-            // write it ourselves, but Instruction Combining cleans it
-            // up. NOTE that for a signed division, we also have to
-            // consider the potential overflow situation, which is not
-            // so simple
-            if (isUnsigned && isKnownNonZero(divisor, DL)) {
-              // Static analysis concluded it can't be zero, so we don't
-              // need to do anything.
-              masked = divisor;
+    default:
+      break;
+    case Instruction::UDiv:
+    case Instruction::URem:
+      isUnsigned = true;
+      LLVM_FALLTHROUGH;
+    case Instruction::SDiv:
+    case Instruction::SRem: {
+      auto *divisor = binOp->getOperand(1);
+      if (auto *C = dyn_cast<Constant>(divisor)) {
+        if (C->isZeroValue()) {
+          // Divides by constant zero can be a NOP since there is no
+          // division by zero exception in OpenCL.
+          I.replaceAllUsesWith(binOp->getOperand(0));
+          I.eraseFromParent();
+        }
+      } else {
+        // if the divisor could be illegal, we need to guard it with a
+        // select instruction generated from the branch condition.
+        auto &masked = safeDivisors[divisor];
+        if (!masked) {
+          // NOTE this function does not check for the pattern
+          // "select (x eq 0) 1, x" or equivalent, so we might want to
+          // write it ourselves, but Instruction Combining cleans it
+          // up. NOTE that for a signed division, we also have to
+          // consider the potential overflow situation, which is not
+          // so simple
+          if (isUnsigned && isKnownNonZero(divisor, DL)) {
+            // Static analysis concluded it can't be zero, so we don't
+            // need to do anything.
+            masked = divisor;
+          } else {
+            Value *one = ConstantInt::get(divisor->getType(), 1);
+            Value *cond = Branch.getCondition();
+
+            Instruction *SI;
+            if (TrueBranch) {
+              SI = SelectInst::Create(cond, divisor, one,
+                                      divisor->getName() + ".hoist_guard");
             } else {
-              Value *one = ConstantInt::get(divisor->getType(), 1);
-              Value *cond = Branch.getCondition();
-
-              Instruction *SI;
-              if (TrueBranch) {
-                SI = SelectInst::Create(cond, divisor, one,
-                                        divisor->getName() + ".hoist_guard");
-              } else {
-                SI = SelectInst::Create(cond, one, divisor,
-                                        divisor->getName() + ".hoist_guard");
-              }
-              SI->insertBefore(I.getIterator());
-              masked = SI;
+              SI = SelectInst::Create(cond, one, divisor,
+                                      divisor->getName() + ".hoist_guard");
             }
+            SI->insertBefore(I.getIterator());
+            masked = SI;
           }
+        }
 
-          if (masked != divisor) {
-            binOp->setOperand(1, masked);
-          }
+        if (masked != divisor) {
+          binOp->setOperand(1, masked);
         }
-      } break;
+      }
+    } break;
     }
   }
   return modified;
 }
-}  // namespace
+} // namespace
 
 PreservedAnalyses PreLinearizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/printf_scalarizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/printf_scalarizer.cpp
index 734a1e1fb93d0..d59a65037555b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/printf_scalarizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/printf_scalarizer.cpp
@@ -82,9 +82,10 @@ static bool IncrementPtr(const char **fmt) {
   return false;
 }
 
-GlobalVariable *GetNewFormatStringAsGlobalVar(
-    Module &module, GlobalVariable *const string_value,
-    const std::string &new_format_string) {
+GlobalVariable *
+GetNewFormatStringAsGlobalVar(Module &module,
+                              GlobalVariable *const string_value,
+                              const std::string &new_format_string) {
   const ArrayRef<uint8_t> Elts((const uint8_t *)new_format_string.data(),
                                new_format_string.size());
   Constant *new_format_string_const =
@@ -186,61 +187,61 @@ EnumPrintfError ScalarizeAndCheckFormatString(const std::string &str,
           return kPrintfError_invalidFormatString;
         }
         switch (*fmt) {
-          default:
-            LLVM_DEBUG(dbgs() << "Unexpected character in format string \""
-                              << str.c_str() << "\"");
+        default:
+          LLVM_DEBUG(dbgs() << "Unexpected character in format string \""
+                            << str.c_str() << "\"");
+          return kPrintfError_invalidFormatString;
+        case '1':
+          // Must be 16, else error
+          if (IncrementPtr(&fmt)) {
+            LLVM_DEBUG(dbgs()
+                       << "Expected vector width of 16 in format string \""
+                       << str.c_str() << "\"");
             return kPrintfError_invalidFormatString;
-          case '1':
-            // Must be 16, else error
-            if (IncrementPtr(&fmt)) {
-              LLVM_DEBUG(dbgs()
-                         << "Expected vector width of 16 in format string \""
-                         << str.c_str() << "\"");
-              return kPrintfError_invalidFormatString;
-            }
-            if (*fmt != '6') {
-              LLVM_DEBUG(dbgs()
-                         << "Expected vector width of 16 in format string \""
-                         << str.c_str() << "\"");
-              return kPrintfError_invalidFormatString;
-            }
-            vector_length = 16u;
-            break;
-          case '2':
-            vector_length = 2u;
-            break;
-          case '3':
-            vector_length = 3u;
-            // Lookahead for vectors of width 32. We know that we won't go out
-            // of bounds because worst case scenario there should be a null byte
-            // after the '3'.
-            if (*(fmt + 1) == '2') {
-              IncrementPtr(&fmt);
-              vector_length = 32u;
-            }
-            break;
-          case '4':
-            vector_length = 4u;
-            break;
-          case '6':
-            // Must be 64, else error
-            if (IncrementPtr(&fmt)) {
-              LLVM_DEBUG(dbgs()
-                         << "Expected vector width of 64 in format string \""
-                         << str.c_str() << "\"");
-              return kPrintfError_invalidFormatString;
-            }
-            if (*fmt != '4') {
-              LLVM_DEBUG(dbgs()
-                         << "Expected vector width of 64 in format string \""
-                         << str.c_str() << "\"");
-              return kPrintfError_invalidFormatString;
-            }
-            vector_length = 64u;
-            break;
-          case '8':
-            vector_length = 8u;
-            break;
+          }
+          if (*fmt != '6') {
+            LLVM_DEBUG(dbgs()
+                       << "Expected vector width of 16 in format string \""
+                       << str.c_str() << "\"");
+            return kPrintfError_invalidFormatString;
+          }
+          vector_length = 16u;
+          break;
+        case '2':
+          vector_length = 2u;
+          break;
+        case '3':
+          vector_length = 3u;
+          // Lookahead for vectors of width 32. We know that we won't go out
+          // of bounds because worst case scenario there should be a null byte
+          // after the '3'.
+          if (*(fmt + 1) == '2') {
+            IncrementPtr(&fmt);
+            vector_length = 32u;
+          }
+          break;
+        case '4':
+          vector_length = 4u;
+          break;
+        case '6':
+          // Must be 64, else error
+          if (IncrementPtr(&fmt)) {
+            LLVM_DEBUG(dbgs()
+                       << "Expected vector width of 64 in format string \""
+                       << str.c_str() << "\"");
+            return kPrintfError_invalidFormatString;
+          }
+          if (*fmt != '4') {
+            LLVM_DEBUG(dbgs()
+                       << "Expected vector width of 64 in format string \""
+                       << str.c_str() << "\"");
+            return kPrintfError_invalidFormatString;
+          }
+          vector_length = 64u;
+          break;
+        case '8':
+          vector_length = 8u;
+          break;
         }
         if (IncrementPtr(&fmt)) {
           LLVM_DEBUG(dbgs() << "Unexpected \\0 character in format string \""
@@ -265,76 +266,74 @@ EnumPrintfError ScalarizeAndCheckFormatString(const std::string &str,
       if (has_supplied_length_modifier) {
         bool consume_next_char = true;
         switch (*fmt) {
-          default:
-            // The 'j', 'z', 't', and 'L' length modifiers are not supported by
-            // OpenCL C.
-            LLVM_DEBUG(dbgs() << "Unsupported length modifier '" << *fmt
-                              << "'specifier in format string \"" << str.c_str()
-                              << "\"");
+        default:
+          // The 'j', 'z', 't', and 'L' length modifiers are not supported by
+          // OpenCL C.
+          LLVM_DEBUG(dbgs() << "Unsupported length modifier '" << *fmt
+                            << "'specifier in format string \"" << str.c_str()
+                            << "\"");
+          return kPrintfError_invalidFormatString;
+        case 'h':
+          if (IncrementPtr(&fmt)) {
+            LLVM_DEBUG(dbgs() << "Unexpected \\0 character in format string \""
+                              << str.c_str() << "\"");
             return kPrintfError_invalidFormatString;
-          case 'h':
-            if (IncrementPtr(&fmt)) {
-              LLVM_DEBUG(dbgs()
-                         << "Unexpected \\0 character in format string \""
-                         << str.c_str() << "\"");
-              return kPrintfError_invalidFormatString;
-            }
-            if (*fmt == 'h') {
-              specifier_string += "hh";
-            } else if (*fmt == 'l') {
-              // Native printf doesn't recognize 'hl' so we don't
-              // add it to the new format string.  Luckily, 'hl'
-              // is sizeof(int) - the same as the default on
-              // native printf!
-
-              // Additionally, 'hl' modifier may only be used in
-              // conjunction with the vector specifier
-              if (!is_vector) {
-                LLVM_DEBUG(dbgs()
-                           << "Unexpected \\0 character in format string \""
-                           << str.c_str() << "\"");
-                return kPrintfError_invalidFormatString;
-              }
-            } else {
-              specifier_string += 'h';
-              // We've already incremented the ptr and we found nothing; don't
-              // do it again
-              consume_next_char = false;
-            }
-            break;
-          case 'l':
-            specifier_string += *fmt;
-            // Check ahead to see if the user is using the invalid 'll' length
-            // modifier
-            if (IncrementPtr(&fmt)) {
+          }
+          if (*fmt == 'h') {
+            specifier_string += "hh";
+          } else if (*fmt == 'l') {
+            // Native printf doesn't recognize 'hl' so we don't
+            // add it to the new format string.  Luckily, 'hl'
+            // is sizeof(int) - the same as the default on
+            // native printf!
+
+            // Additionally, 'hl' modifier may only be used in
+            // conjunction with the vector specifier
+            if (!is_vector) {
               LLVM_DEBUG(dbgs()
                          << "Unexpected \\0 character in format string \""
                          << str.c_str() << "\"");
               return kPrintfError_invalidFormatString;
             }
-            if (*fmt == 'l') {
-              LLVM_DEBUG(dbgs()
-                         << "The 'll' length specifier is invalid in OpenCL "
-                            "printf\n  > "
-                         << str.c_str() << "\"");
-              return kPrintfError_invalidFormatString;
-            }
-            // We've already incremented the ptr; don't do it again
-
-            // The 'l' specifier for the OpenCL printf expects 64 bits
-            // integers, check if the system's long are actually 64 bits wide
-            // and if not upgrade the format specifier to 'll'.
-            //
-            // FIXME: This only works for host based devices, which is fine for
-            // our current printf implementation, but it should really be
-            // removed once we have a proper printf implementation.
-            if (sizeof(long) != 8) {
-              specifier_string += 'l';
-            }
-
+          } else {
+            specifier_string += 'h';
+            // We've already incremented the ptr and we found nothing; don't
+            // do it again
             consume_next_char = false;
-            has_used_l_length_modifier = true;
-            break;
+          }
+          break;
+        case 'l':
+          specifier_string += *fmt;
+          // Check ahead to see if the user is using the invalid 'll' length
+          // modifier
+          if (IncrementPtr(&fmt)) {
+            LLVM_DEBUG(dbgs() << "Unexpected \\0 character in format string \""
+                              << str.c_str() << "\"");
+            return kPrintfError_invalidFormatString;
+          }
+          if (*fmt == 'l') {
+            LLVM_DEBUG(dbgs()
+                       << "The 'll' length specifier is invalid in OpenCL "
+                          "printf\n  > "
+                       << str.c_str() << "\"");
+            return kPrintfError_invalidFormatString;
+          }
+          // We've already incremented the ptr; don't do it again
+
+          // The 'l' specifier for the OpenCL printf expects 64 bits
+          // integers, check if the system's long are actually 64 bits wide
+          // and if not upgrade the format specifier to 'll'.
+          //
+          // FIXME: This only works for host based devices, which is fine for
+          // our current printf implementation, but it should really be
+          // removed once we have a proper printf implementation.
+          if (sizeof(long) != 8) {
+            specifier_string += 'l';
+          }
+
+          consume_next_char = false;
+          has_used_l_length_modifier = true;
+          break;
         }
         if (consume_next_char) {
           if (IncrementPtr(&fmt)) {
@@ -349,27 +348,27 @@ EnumPrintfError ScalarizeAndCheckFormatString(const std::string &str,
       specifier_string += *fmt;
 
       switch (*fmt) {
-        default:
-          break;
-        case 'n':
-          // The 'n' conversion specifier is not supported by OpenCL C.
+      default:
+        break;
+      case 'n':
+        // The 'n' conversion specifier is not supported by OpenCL C.
+        LLVM_DEBUG(
+            dbgs() << "The 'n' conversion specifier is invalid in OpenCL "
+                      "printf\n  > "
+                   << str.c_str() << "\"");
+        return kPrintfError_invalidFormatString;
+      case 's': // Intentional fall-through
+      case 'c':
+        // The 'l' length modifier followed by the 'c' or 's' conversion
+        // specifiers is not supported by OpenCL C.
+        if (has_used_l_length_modifier) {
           LLVM_DEBUG(dbgs()
-                     << "The 'n' conversion specifier is invalid in OpenCL "
-                        "printf\n  > "
+                     << "The 'l' length modifier followed by the 'c' or "
+                        "'s' conversion is invalid in OpenCL printf\n  > "
                      << str.c_str() << "\"");
           return kPrintfError_invalidFormatString;
-        case 's':  // Intentional fall-through
-        case 'c':
-          // The 'l' length modifier followed by the 'c' or 's' conversion
-          // specifiers is not supported by OpenCL C.
-          if (has_used_l_length_modifier) {
-            LLVM_DEBUG(dbgs()
-                       << "The 'l' length modifier followed by the 'c' or "
-                          "'s' conversion is invalid in OpenCL printf\n  > "
-                       << str.c_str() << "\"");
-            return kPrintfError_invalidFormatString;
-          }
-          break;
+        }
+        break;
       }
 
       // Output the %specifier for each element of the vector,
@@ -389,4 +388,4 @@ EnumPrintfError ScalarizeAndCheckFormatString(const std::string &str,
 
   return kPrintfError_success;
 }
-}  // namespace vecz
+} // namespace vecz
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarization_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarization_pass.cpp
index 09502d2756243..fcb0dfca9e621 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarization_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarization_pass.cpp
@@ -180,7 +180,7 @@ struct OperandTracer {
   }
 };
 
-}  // namespace
+} // namespace
 
 PreservedAnalyses ScalarizationPass::run(llvm::Function &F,
                                          llvm::FunctionAnalysisManager &AM) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
index 63b33c02f3a44..af44c92bfd780 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp
@@ -49,7 +49,7 @@ namespace {
 /// The current limitation is due to the masks being used in the SimdPackets
 /// being stored as uint64_t.
 const unsigned MAX_SIMD_WIDTH = 64;
-}  // namespace
+} // namespace
 
 using namespace vecz;
 using namespace llvm;
@@ -212,9 +212,9 @@ Value *Scalarizer::scalarizeOperands(Instruction *I) {
     if (BC->getSrcTy()->isVectorTy() && !BC->getDestTy()->isVectorTy()) {
       // In the SimdPacket we use a mask that is stored as a uint64_t. Due to
       // that, there is a limit on the vector size that Vecz can handle.
-      VECZ_ERROR_IF(
-          multi_llvm::getVectorNumElements(BC->getSrcTy()) > MAX_SIMD_WIDTH,
-          "The SIMD width is too large");
+      VECZ_ERROR_IF(multi_llvm::getVectorNumElements(BC->getSrcTy()) >
+                        MAX_SIMD_WIDTH,
+                    "The SIMD width is too large");
       return scalarizeOperandsBitCast(BC);
     }
   }
@@ -337,54 +337,54 @@ Value *Scalarizer::scalarizeReduceIntrinsic(IntrinsicInst *Intrin) {
   bool isHandled = true;
   Instruction::BinaryOps BinOpcode;
   switch (Intrin->getIntrinsicID()) {
-    default:
-      isHandled = false;
-      break;
-    case Intrinsic::vector_reduce_and:
-      BinOpcode = Instruction::And;
-      break;
-    case Intrinsic::vector_reduce_or:
-      BinOpcode = Instruction::Or;
-      break;
-    case Intrinsic::vector_reduce_xor:
-      BinOpcode = Instruction::Xor;
-      break;
-    case Intrinsic::vector_reduce_add:
-      // TODO: Need to handle FP reduce_add (Instruction::FAdd)
-      if (!Intrin->getType()->isFloatTy()) {
-        BinOpcode = Instruction::Add;
-      } else {
-        isHandled = false;
-      }
-      break;
-    case Intrinsic::vector_reduce_mul:
-      // TODO: Need to handle FP reduce_mul (Instruction::FMul)
-      if (!Intrin->getType()->isFloatTy()) {
-        BinOpcode = Instruction::Mul;
-      } else {
-        isHandled = false;
-      }
-      break;
-    case Intrinsic::vector_reduce_fadd:
-      // TODO: Need to handle FP reduce_add
-      isHandled = false;
-      break;
-    case Intrinsic::vector_reduce_fmul:
-      // TODO: Need to handle FP reduce_mul
-      isHandled = false;
-      break;
-    case Intrinsic::vector_reduce_fmax:
-    case Intrinsic::vector_reduce_smax:
-    case Intrinsic::vector_reduce_umax:
-      // TODO: Need to handle Int (signed/unsigned) Max and FP Max
+  default:
+    isHandled = false;
+    break;
+  case Intrinsic::vector_reduce_and:
+    BinOpcode = Instruction::And;
+    break;
+  case Intrinsic::vector_reduce_or:
+    BinOpcode = Instruction::Or;
+    break;
+  case Intrinsic::vector_reduce_xor:
+    BinOpcode = Instruction::Xor;
+    break;
+  case Intrinsic::vector_reduce_add:
+    // TODO: Need to handle FP reduce_add (Instruction::FAdd)
+    if (!Intrin->getType()->isFloatTy()) {
+      BinOpcode = Instruction::Add;
+    } else {
       isHandled = false;
-      break;
-    case Intrinsic::vector_reduce_fmin:
-    case Intrinsic::vector_reduce_smin:
-    case Intrinsic::vector_reduce_umin:
-      // TODO: Need to handle Int (signed/unsigned) Min and FP Min
+    }
+    break;
+  case Intrinsic::vector_reduce_mul:
+    // TODO: Need to handle FP reduce_mul (Instruction::FMul)
+    if (!Intrin->getType()->isFloatTy()) {
+      BinOpcode = Instruction::Mul;
+    } else {
       isHandled = false;
-      break;
+    }
+    break;
+  case Intrinsic::vector_reduce_fadd:
+    // TODO: Need to handle FP reduce_add
+    isHandled = false;
+    break;
+  case Intrinsic::vector_reduce_fmul:
+    // TODO: Need to handle FP reduce_mul
+    isHandled = false;
+    break;
+  case Intrinsic::vector_reduce_fmax:
+  case Intrinsic::vector_reduce_smax:
+  case Intrinsic::vector_reduce_umax:
+    // TODO: Need to handle Int (signed/unsigned) Max and FP Max
+    isHandled = false;
+    break;
+  case Intrinsic::vector_reduce_fmin:
+  case Intrinsic::vector_reduce_smin:
+  case Intrinsic::vector_reduce_umin:
+    // TODO: Need to handle Int (signed/unsigned) Min and FP Min
+    isHandled = false;
+    break;
   }
   // If it's an intrinsic we don't handle here, return nullptr and fallback
   // to simple gathering of any scalarized operands.
@@ -550,50 +550,50 @@ SimdPacket *Scalarizer::scalarize(Value *V, PacketMask PM) {
   // Figure out what kind of instruction it is and try to scalarize it.
   SimdPacket *Result = nullptr;
   switch (Ins->getOpcode()) {
-    default:
-      if (Ins->isBinaryOp()) {
-        Result = scalarizeBinaryOp(cast<BinaryOperator>(V), PM);
-      } else if (Ins->isCast()) {
-        Result = scalarizeCast(cast<CastInst>(V), PM);
-      } else if (Ins->isUnaryOp()) {
-        Result = scalarizeUnaryOp(cast<UnaryOperator>(V), PM);
-      }
-      break;
-    case Instruction::GetElementPtr:
-      Result = scalarizeGEP(cast<GetElementPtrInst>(V), PM);
-      break;
-    case Instruction::Store:
-      Result = scalarizeStore(cast<StoreInst>(V), PM);
-      break;
-    case Instruction::Load:
-      Result = scalarizeLoad(cast<LoadInst>(V), PM);
-      break;
-    case Instruction::Call:
-      Result = scalarizeCall(cast<CallInst>(V), PM);
-      break;
-    case Instruction::ICmp:
-      Result = scalarizeICmp(cast<ICmpInst>(V), PM);
-      break;
-    case Instruction::FCmp:
-      Result = scalarizeFCmp(cast<FCmpInst>(V), PM);
-      break;
-    case Instruction::Select:
-      Result = scalarizeSelect(cast<SelectInst>(V), PM);
-      break;
-    case Instruction::ShuffleVector:
-      Result = scalarizeShuffleVector(cast<ShuffleVectorInst>(V), PM);
-      break;
-    case Instruction::InsertElement:
-      Result = scalarizeInsertElement(cast<InsertElementInst>(V), PM);
-      break;
-    case Instruction::PHI:
-      Result = scalarizePHI(cast<PHINode>(V), PM);
-      break;
-      // Freeze instruction is not available in LLVM versions prior 10.0
-      // and not used in LLVM versions prior to 11.0
-    case Instruction::Freeze:
-      Result = scalarizeFreeze(cast<FreezeInst>(V), PM);
-      break;
+  default:
+    if (Ins->isBinaryOp()) {
+      Result = scalarizeBinaryOp(cast<BinaryOperator>(V), PM);
+    } else if (Ins->isCast()) {
+      Result = scalarizeCast(cast<CastInst>(V), PM);
+    } else if (Ins->isUnaryOp()) {
+      Result = scalarizeUnaryOp(cast<UnaryOperator>(V), PM);
+    }
+    break;
+  case Instruction::GetElementPtr:
+    Result = scalarizeGEP(cast<GetElementPtrInst>(V), PM);
+    break;
+  case Instruction::Store:
+    Result = scalarizeStore(cast<StoreInst>(V), PM);
+    break;
+  case Instruction::Load:
+    Result = scalarizeLoad(cast<LoadInst>(V), PM);
+    break;
+  case Instruction::Call:
+    Result = scalarizeCall(cast<CallInst>(V), PM);
+    break;
+  case Instruction::ICmp:
+    Result = scalarizeICmp(cast<ICmpInst>(V), PM);
+    break;
+  case Instruction::FCmp:
+    Result = scalarizeFCmp(cast<FCmpInst>(V), PM);
+    break;
+  case Instruction::Select:
+    Result = scalarizeSelect(cast<SelectInst>(V), PM);
+    break;
+  case Instruction::ShuffleVector:
+    Result = scalarizeShuffleVector(cast<ShuffleVectorInst>(V), PM);
+    break;
+  case Instruction::InsertElement:
+    Result = scalarizeInsertElement(cast<InsertElementInst>(V), PM);
+    break;
+  case Instruction::PHI:
+    Result = scalarizePHI(cast<PHINode>(V), PM);
+    break;
+    // Freeze instruction is not available in LLVM versions prior 10.0
+    // and not used in LLVM versions prior to 11.0
+  case Instruction::Freeze:
+    Result = scalarizeFreeze(cast<FreezeInst>(V), PM);
+    break;
   }
 
   if (Result) {
@@ -690,13 +690,13 @@ void Scalarizer::scalarizeDI(Instruction *Original, const SimdPacket *Packet,
     DebugLoc DILoc;
 
     switch (DVR->getType()) {
-      case DbgVariableRecord::LocationType::Value:
-      case DbgVariableRecord::LocationType::Declare:
-        DILocal = DVR->getVariable();
-        DILoc = DVR->getDebugLoc();
-        break;
-      default:
-        continue;
+    case DbgVariableRecord::LocationType::Value:
+    case DbgVariableRecord::LocationType::Declare:
+      DILocal = DVR->getVariable();
+      DILoc = DVR->getDebugLoc();
+      break;
+    default:
+      continue;
     }
 
     // Create new DbgVariableRecord across enabled SIMD lanes
@@ -953,21 +953,21 @@ SimdPacket *Scalarizer::scalarizeCast(CastInst *CastI, PacketMask PM) {
   // Make sure we support the cast operation.
   const CastInst::CastOps Opc = CastI->getOpcode();
   switch (Opc) {
-    default:
-      return nullptr;
-    case CastInst::BitCast:
-      return scalarizeBitCast(cast<BitCastInst>(CastI), PM);
-    case CastInst::Trunc:
-    case CastInst::ZExt:
-    case CastInst::SExt:
-    case CastInst::FPToUI:
-    case CastInst::FPToSI:
-    case CastInst::UIToFP:
-    case CastInst::SIToFP:
-    case CastInst::FPTrunc:
-    case CastInst::FPExt:
-    case CastInst::AddrSpaceCast:
-      break;
+  default:
+    return nullptr;
+  case CastInst::BitCast:
+    return scalarizeBitCast(cast<BitCastInst>(CastI), PM);
+  case CastInst::Trunc:
+  case CastInst::ZExt:
+  case CastInst::SExt:
+  case CastInst::FPToUI:
+  case CastInst::FPToSI:
+  case CastInst::UIToFP:
+  case CastInst::SIToFP:
+  case CastInst::FPTrunc:
+  case CastInst::FPExt:
+  case CastInst::AddrSpaceCast:
+    break;
   }
 
   // Scalarize the source value.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/simplify_infinite_loop_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/simplify_infinite_loop_pass.cpp
index f94e60a3645a1..d73bdfb33df85 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/simplify_infinite_loop_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/simplify_infinite_loop_pass.cpp
@@ -24,9 +24,10 @@
 
 using namespace llvm;
 
-PreservedAnalyses vecz::SimplifyInfiniteLoopPass::run(
-    Loop &L, LoopAnalysisManager &, LoopStandardAnalysisResults &AR,
-    LPMUpdater &) {
+PreservedAnalyses
+vecz::SimplifyInfiniteLoopPass::run(Loop &L, LoopAnalysisManager &,
+                                    LoopStandardAnalysisResults &AR,
+                                    LPMUpdater &) {
   bool modified = false;
 
   SmallVector<BasicBlock *, 1> loopExitBlocks;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/ternary_transform_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/ternary_transform_pass.cpp
index a794e4abe524c..b4ceb56dc2cd3 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/ternary_transform_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/ternary_transform_pass.cpp
@@ -206,7 +206,7 @@ void Transform(SelectInst *Select, VectorizationContext &Ctx) {
 
   IRCleanup::deleteInstructionNow(Select);
 }
-}  // namespace
+} // namespace
 
 PreservedAnalyses TernaryTransformPass::run(llvm::Function &F,
                                             llvm::FunctionAnalysisManager &AM) {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/uniform_reassociation_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/uniform_reassociation_pass.cpp
index 7d0faee34c9cf..753ec2176b38f 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/uniform_reassociation_pass.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/uniform_reassociation_pass.cpp
@@ -83,11 +83,11 @@ void updatePHIs(BasicBlock &BB, BasicBlock *original, BasicBlock *extra) {
   }
 }
 
-}  // namespace
+} // namespace
 
 namespace vecz {
 class Reassociator {
- public:
+public:
   Reassociator() {}
 
   /// @brief perform the Branch Split transformation
@@ -97,7 +97,7 @@ class Reassociator {
   /// @returns true iff any branches were split
   bool run(llvm::Function &F, llvm::FunctionAnalysisManager &AM);
 
- private:
+private:
   /// @brief classification of a binary operand according to whether its
   ///        operands are Uniform, Varying, both (Varying Op Uniform), or non-
   ///        canonically both (i.e. Uniform Op Varying).
@@ -355,4 +355,4 @@ PreservedAnalyses UniformReassociationPass::run(Function &F,
   PA.preserve<LoopAnalysis>();
   return PA;
 }
-}  // namespace vecz
+} // namespace vecz
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
index 2ab49546fea1e..b22b7f1816f30 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
@@ -65,7 +65,7 @@ bool isLegalMaskedScatter(const TargetTransformInfo &TTI, Type *Ty,
                           unsigned Alignment, unsigned) {
   return TTI.isLegalMaskedScatter(Ty, Align(Alignment));
 }
-}  // namespace
+} // namespace
 
 // NOTE the TargetMachine is allowed to be null here; it isn't used in the
 // implementation at present, but if it gets used in future it needs to be
@@ -94,9 +94,8 @@ Value *TargetInfo::createLoad(IRBuilder<> &B, Type *Ty, Value *Ptr,
       const auto Legality =
           isVPLoadLegal(F, Ty, Alignment, PtrTy->getAddressSpace());
       if (!Legality.isVPLegal()) {
-        emitVeczRemarkMissed(F,
-                             "Could not create a VP load as the target "
-                             "reported it would be illegal");
+        emitVeczRemarkMissed(F, "Could not create a VP load as the target "
+                                "reported it would be illegal");
         VECZ_FAIL();
       }
       auto *Mask = createAllTrueMask(B, multi_llvm::getVectorElementCount(Ty));
@@ -161,9 +160,8 @@ Value *TargetInfo::createStore(IRBuilder<> &B, Value *Data, Value *Ptr,
       const auto Legality =
           isVPStoreLegal(F, VecTy, Alignment, PtrTy->getAddressSpace());
       if (!Legality.isVPLegal()) {
-        emitVeczRemarkMissed(F,
-                             "Could not create a VP store as the target "
-                             "reported it would be illegal");
+        emitVeczRemarkMissed(F, "Could not create a VP store as the target "
+                                "reported it would be illegal");
         VECZ_FAIL();
       }
       auto *Mask =
@@ -241,9 +239,8 @@ Value *TargetInfo::createMaskedLoad(IRBuilder<> &B, Type *Ty, Value *Ptr,
       VECZ_FAIL_IF(!Mask);
       return B.CreateMaskedLoad(Ty, Ptr, Align(Alignment), Mask);
     } else {
-      emitVeczRemarkMissed(F,
-                           "Could not create a masked load as the target "
-                           "reported it would be illegal");
+      emitVeczRemarkMissed(F, "Could not create a masked load as the target "
+                              "reported it would be illegal");
       VECZ_FAIL();
     }
   }
@@ -347,9 +344,8 @@ Value *TargetInfo::createMaskedStore(IRBuilder<> &B, Value *Data, Value *Ptr,
       VECZ_FAIL_IF(!Mask);
       return B.CreateMaskedStore(Data, Ptr, Align(Alignment), Mask);
     } else {
-      emitVeczRemarkMissed(F,
-                           "Could not create a masked store as the target "
-                           "reported it would be illegal");
+      emitVeczRemarkMissed(F, "Could not create a masked store as the target "
+                              "reported it would be illegal");
       VECZ_FAIL();
     }
   }
@@ -520,9 +516,8 @@ Value *TargetInfo::createMaskedGatherLoad(IRBuilder<> &B, Type *Ty, Value *Ptr,
         }
       }
     } else {
-      emitVeczRemarkMissed(F,
-                           "Could not create a masked gather as the target "
-                           "reported it would be illegal");
+      emitVeczRemarkMissed(F, "Could not create a masked gather as the target "
+                              "reported it would be illegal");
       VECZ_FAIL();
     }
   }
@@ -621,9 +616,8 @@ Value *TargetInfo::createMaskedScatterStore(IRBuilder<> &B, Value *Data,
         }
       }
     } else {
-      emitVeczRemarkMissed(F,
-                           "Could not create a masked scatter as the target "
-                           "reported it would be illegal");
+      emitVeczRemarkMissed(F, "Could not create a masked scatter as the target "
+                              "reported it would be illegal");
       VECZ_FAIL();
     }
   }
@@ -909,23 +903,27 @@ TargetInfo::VPMemOpLegality TargetInfo::checkMemOpLegality(
   return {isVPLegal, isMaskLegal};
 }
 
-TargetInfo::VPMemOpLegality TargetInfo::isVPLoadLegal(
-    const Function *F, Type *Ty, unsigned Alignment, unsigned AddrSpace) const {
+TargetInfo::VPMemOpLegality
+TargetInfo::isVPLoadLegal(const Function *F, Type *Ty, unsigned Alignment,
+                          unsigned AddrSpace) const {
   return checkMemOpLegality(F, isLegalMaskedLoad, Ty, Alignment, AddrSpace);
 }
 
-TargetInfo::VPMemOpLegality TargetInfo::isVPStoreLegal(
-    const Function *F, Type *Ty, unsigned Alignment, unsigned AddrSpace) const {
+TargetInfo::VPMemOpLegality
+TargetInfo::isVPStoreLegal(const Function *F, Type *Ty, unsigned Alignment,
+                           unsigned AddrSpace) const {
   return checkMemOpLegality(F, isLegalMaskedStore, Ty, Alignment, AddrSpace);
 }
 
-TargetInfo::VPMemOpLegality TargetInfo::isVPGatherLegal(
-    const Function *F, Type *Ty, unsigned Alignment, unsigned AddrSpace) const {
+TargetInfo::VPMemOpLegality
+TargetInfo::isVPGatherLegal(const Function *F, Type *Ty, unsigned Alignment,
+                            unsigned AddrSpace) const {
   return checkMemOpLegality(F, isLegalMaskedGather, Ty, Alignment, AddrSpace);
 }
 
-TargetInfo::VPMemOpLegality TargetInfo::isVPScatterLegal(
-    const Function *F, Type *Ty, unsigned Alignment, unsigned AddrSpace) const {
+TargetInfo::VPMemOpLegality
+TargetInfo::isVPScatterLegal(const Function *F, Type *Ty, unsigned Alignment,
+                             unsigned AddrSpace) const {
   return checkMemOpLegality(F, isLegalMaskedScatter, Ty, Alignment, AddrSpace);
 }
 
@@ -1060,7 +1058,7 @@ bool TargetInfo::optimizeInterleavedGroup(IRBuilder<> &B,
   FixedVectorType *VecTy = nullptr;
   if (Kind == eInterleavedStore || Kind == eMaskedInterleavedStore) {
     VecTy = cast<FixedVectorType>(Op0->getOperand(0)->getType());
-  } else {  // eInterleavedLoad || eMaskedInterleavedLoad
+  } else { // eInterleavedLoad || eMaskedInterleavedLoad
     VecTy = cast<FixedVectorType>(Op0->getType());
   }
 
@@ -1308,22 +1306,22 @@ bool TargetInfo::canPacketize(const llvm::Value *, ElementCount) const {
   return true;
 }
 
-std::unique_ptr<TargetInfo> vecz::createTargetInfoFromTargetMachine(
-    TargetMachine *tm) {
+std::unique_ptr<TargetInfo>
+vecz::createTargetInfoFromTargetMachine(TargetMachine *tm) {
   // The TargetMachine is allowed to be null
   if (tm) {
     const Triple &TT(tm->getTargetTriple());
     switch (TT.getArch()) {
-      case Triple::arm:
-        return createTargetInfoArm(tm);
-      case Triple::aarch64:
-        return createTargetInfoAArch64(tm);
-      case Triple::riscv32:
-      case Triple::riscv64:
-        return createTargetInfoRISCV(tm);
-      default:
-        // Just use the generic TargetInfo unless we know better
-        break;
+    case Triple::arm:
+      return createTargetInfoArm(tm);
+    case Triple::aarch64:
+      return createTargetInfoAArch64(tm);
+    case Triple::riscv32:
+    case Triple::riscv64:
+      return createTargetInfoRISCV(tm);
+    default:
+      // Just use the generic TargetInfo unless we know better
+      break;
     }
   }
   return std::make_unique<TargetInfo>(tm);
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_arm.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_arm.cpp
index 148db144756f3..bae66eb789260 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_arm.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_arm.cpp
@@ -29,7 +29,7 @@ using namespace llvm;
 namespace vecz {
 
 class TargetInfoArm final : public TargetInfo {
- public:
+public:
   TargetInfoArm(TargetMachine *tm) : TargetInfo(tm) {}
 
   ~TargetInfoArm() = default;
@@ -43,7 +43,7 @@ class TargetInfoArm final : public TargetInfo {
                                 ArrayRef<Value *> masks, Value *baseAddress,
                                 int stride) const override;
 
- private:
+private:
   bool canOptimizeInterleavedGroupImpl(const Instruction &val,
                                        InterleavedOperation kind, int stride,
                                        unsigned groupSize,
@@ -51,7 +51,7 @@ class TargetInfoArm final : public TargetInfo {
 };
 
 class TargetInfoAArch64 final : public TargetInfo {
- public:
+public:
   TargetInfoAArch64(TargetMachine *tm) : TargetInfo(tm) {}
 
   ~TargetInfoAArch64() = default;
@@ -65,7 +65,7 @@ class TargetInfoAArch64 final : public TargetInfo {
                                 ArrayRef<Value *> masks, Value *baseAddress,
                                 int stride) const override;
 
- private:
+private:
   bool canOptimizeInterleavedGroupImpl(const Instruction &val,
                                        InterleavedOperation kind, int stride,
                                        unsigned groupSize,
@@ -80,7 +80,7 @@ std::unique_ptr<TargetInfo> createTargetInfoAArch64(TargetMachine *tm) {
   return std::make_unique<TargetInfoAArch64>(tm);
 }
 
-}  // namespace vecz
+} // namespace vecz
 
 bool TargetInfoArm::canOptimizeInterleavedGroup(const Instruction &val,
                                                 InterleavedOperation kind,
@@ -99,32 +99,32 @@ bool TargetInfoArm::canOptimizeInterleavedGroupImpl(const Instruction &val,
   Type *dataType = nullptr;
   if (kind == eInterleavedStore) {
     switch (stride) {
-      default:
-        break;
-      case 2:
-        IntrID = Intrinsic::arm_neon_vst2;
-        break;
-      case 3:
-        IntrID = Intrinsic::arm_neon_vst3;
-        break;
-      case 4:
-        IntrID = Intrinsic::arm_neon_vst4;
-        break;
+    default:
+      break;
+    case 2:
+      IntrID = Intrinsic::arm_neon_vst2;
+      break;
+    case 3:
+      IntrID = Intrinsic::arm_neon_vst3;
+      break;
+    case 4:
+      IntrID = Intrinsic::arm_neon_vst4;
+      break;
     }
     dataType = val.getOperand(0)->getType();
   } else if (kind == eInterleavedLoad) {
     switch (stride) {
-      default:
-        break;
-      case 2:
-        IntrID = Intrinsic::arm_neon_vld2;
-        break;
-      case 3:
-        IntrID = Intrinsic::arm_neon_vld3;
-        break;
-      case 4:
-        IntrID = Intrinsic::arm_neon_vld4;
-        break;
+    default:
+      break;
+    case 2:
+      IntrID = Intrinsic::arm_neon_vld2;
+      break;
+    case 3:
+      IntrID = Intrinsic::arm_neon_vld3;
+      break;
+    case 4:
+      IntrID = Intrinsic::arm_neon_vld4;
+      break;
     }
     dataType = val.getType();
   } else {
@@ -208,7 +208,7 @@ bool TargetInfoArm::optimizeInterleavedGroup(IRBuilder<> &B,
   FixedVectorType *VecTy = nullptr;
   if (kind == eInterleavedStore) {
     VecTy = cast<FixedVectorType>(Op0->getOperand(0)->getType());
-  } else {  // eInterleavedLoad
+  } else { // eInterleavedLoad
     VecTy = cast<FixedVectorType>(Op0->getType());
   }
 
@@ -267,32 +267,32 @@ bool TargetInfoAArch64::canOptimizeInterleavedGroupImpl(
   Type *dataType = nullptr;
   if (kind == eInterleavedStore) {
     switch (stride) {
-      default:
-        break;
-      case 2:
-        IntrID = Intrinsic::aarch64_neon_st2;
-        break;
-      case 3:
-        IntrID = Intrinsic::aarch64_neon_st3;
-        break;
-      case 4:
-        IntrID = Intrinsic::aarch64_neon_st4;
-        break;
+    default:
+      break;
+    case 2:
+      IntrID = Intrinsic::aarch64_neon_st2;
+      break;
+    case 3:
+      IntrID = Intrinsic::aarch64_neon_st3;
+      break;
+    case 4:
+      IntrID = Intrinsic::aarch64_neon_st4;
+      break;
     }
     dataType = val.getOperand(0)->getType();
   } else if (kind == eInterleavedLoad) {
     switch (stride) {
-      default:
-        break;
-      case 2:
-        IntrID = Intrinsic::aarch64_neon_ld2;
-        break;
-      case 3:
-        IntrID = Intrinsic::aarch64_neon_ld3;
-        break;
-      case 4:
-        IntrID = Intrinsic::aarch64_neon_ld4;
-        break;
+    default:
+      break;
+    case 2:
+      IntrID = Intrinsic::aarch64_neon_ld2;
+      break;
+    case 3:
+      IntrID = Intrinsic::aarch64_neon_ld3;
+      break;
+    case 4:
+      IntrID = Intrinsic::aarch64_neon_ld4;
+      break;
     }
     dataType = val.getType();
   } else {
@@ -374,7 +374,7 @@ bool TargetInfoAArch64::optimizeInterleavedGroup(
   FixedVectorType *VecTy = nullptr;
   if (kind == eInterleavedStore) {
     VecTy = cast<FixedVectorType>(Op0->getOperand(0)->getType());
-  } else {  // eInterleavedLoad
+  } else { // eInterleavedLoad
     VecTy = cast<FixedVectorType>(Op0->getType());
   }
 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp
index 66906b1cdac34..8c320bd324ffa 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp
@@ -30,7 +30,7 @@ using namespace llvm;
 namespace vecz {
 
 class TargetInfoRISCV final : public TargetInfo {
- public:
+public:
   TargetInfoRISCV(TargetMachine *tm) : TargetInfo(tm) {}
 
   ~TargetInfoRISCV() = default;
@@ -43,16 +43,18 @@ class TargetInfoRISCV final : public TargetInfo {
       llvm::Instruction *extract, llvm::Type *narrowTy, llvm::Value *src,
       llvm::Value *index, llvm::Value *evl) const override;
 
-  llvm::Value *createOuterScalableBroadcast(
-      llvm::IRBuilder<> &builder, llvm::Value *vector, llvm::Value *VL,
-      ElementCount factor) const override {
+  llvm::Value *
+  createOuterScalableBroadcast(llvm::IRBuilder<> &builder, llvm::Value *vector,
+                               llvm::Value *VL,
+                               ElementCount factor) const override {
     return createScalableBroadcast(builder, vector, VL, factor,
                                    /* URem */ true);
   }
 
-  llvm::Value *createInnerScalableBroadcast(
-      llvm::IRBuilder<> &builder, llvm::Value *vector, llvm::Value *VL,
-      ElementCount factor) const override {
+  llvm::Value *
+  createInnerScalableBroadcast(llvm::IRBuilder<> &builder, llvm::Value *vector,
+                               llvm::Value *VL,
+                               ElementCount factor) const override {
     return createScalableBroadcast(builder, vector, VL, factor,
                                    /* URem */ false);
   }
@@ -73,7 +75,7 @@ class TargetInfoRISCV final : public TargetInfo {
                                    llvm::Value *insert,
                                    llvm::Value *evl) const override;
 
- private:
+private:
   bool isOperationLegal(llvm::Intrinsic::ID ID,
                         llvm::ArrayRef<llvm::Type *> Tys) const;
 
@@ -122,7 +124,7 @@ std::unique_ptr<TargetInfo> createTargetInfoRISCV(TargetMachine *tm) {
   return std::make_unique<TargetInfoRISCV>(tm);
 }
 
-}  // namespace vecz
+} // namespace vecz
 
 bool TargetInfoRISCV::canPacketize(const llvm::Value *Val,
                                    ElementCount Width) const {
@@ -170,31 +172,31 @@ bool TargetInfoRISCV::canPacketize(const llvm::Value *Val,
 bool TargetInfoRISCV::isOperationLegal(llvm::Intrinsic::ID ID,
                                        llvm::ArrayRef<llvm::Type *> Tys) const {
   switch (ID) {
-    case Intrinsic::RISCVIntrinsics::riscv_vrgather_vv:
-    case Intrinsic::RISCVIntrinsics::riscv_vrgather_vv_mask:
-      // riscv_vrgather_vv[_mask](RetTy, _IdxTy)
-      // We only need to check the return type here, as it should be greater or
-      // equal to the index type.
-      assert(Tys.size() == 1 &&
-             "Only the return type is needed to check vrgather_vv intrinsics");
-      return isVectorTypeLegal(Tys.front());
-    case Intrinsic::RISCVIntrinsics::riscv_vrgatherei16_vv:
-    case Intrinsic::RISCVIntrinsics::riscv_vrgatherei16_vv_mask: {
-      constexpr unsigned MaxVectorSize = MaxLegalVectorTypeBits / 16;
-      // riscv_vrgatherei16_vv[_mask](RetTy, _IdxTy)
-      // Case similar to that of riscv_vrgather_vv[_mask], but we also need to
-      // check that the vector size is no greater than MaxLegalVectorTypeSize /
-      // 16, as the index type will always be i16.
-      assert(
-          Tys.size() == 1 &&
-          "Only the return type is needed to check vrgatherei16_vv intrinsics");
-      auto *const RetTy = Tys.front();
-      return isVectorTypeLegal(RetTy) &&
-             multi_llvm::getVectorElementCount(RetTy).getKnownMinValue() <=
-                 MaxVectorSize;
-    }
-    default:
-      break;
+  case Intrinsic::RISCVIntrinsics::riscv_vrgather_vv:
+  case Intrinsic::RISCVIntrinsics::riscv_vrgather_vv_mask:
+    // riscv_vrgather_vv[_mask](RetTy, _IdxTy)
+    // We only need to check the return type here, as it should be greater or
+    // equal to the index type.
+    assert(Tys.size() == 1 &&
+           "Only the return type is needed to check vrgather_vv intrinsics");
+    return isVectorTypeLegal(Tys.front());
+  case Intrinsic::RISCVIntrinsics::riscv_vrgatherei16_vv:
+  case Intrinsic::RISCVIntrinsics::riscv_vrgatherei16_vv_mask: {
+    constexpr unsigned MaxVectorSize = MaxLegalVectorTypeBits / 16;
+    // riscv_vrgatherei16_vv[_mask](RetTy, _IdxTy)
+    // Case similar to that of riscv_vrgather_vv[_mask], but we also need to
+    // check that the vector size is no greater than MaxLegalVectorTypeSize /
+    // 16, as the index type will always be i16.
+    assert(
+        Tys.size() == 1 &&
+        "Only the return type is needed to check vrgatherei16_vv intrinsics");
+    auto *const RetTy = Tys.front();
+    return isVectorTypeLegal(RetTy) &&
+           multi_llvm::getVectorElementCount(RetTy).getKnownMinValue() <=
+               MaxVectorSize;
+  }
+  default:
+    break;
   }
   llvm_unreachable("Don't know how to check whether this intrinsic is legal.");
 }
@@ -242,8 +244,8 @@ llvm::Value *getIntrinsicVL(llvm::IRBuilderBase &B, llvm::Value *VL,
 ///
 /// @param[in] vs2Ty Type of the source vector.
 /// @param[in] isMasked Whether the intrinsic should be masked.
-std::pair<llvm::Intrinsic::RISCVIntrinsics, unsigned> getGatherIntrinsic(
-    llvm::Type *vs2Ty, bool isMasked = false) {
+std::pair<llvm::Intrinsic::RISCVIntrinsics, unsigned>
+getGatherIntrinsic(llvm::Type *vs2Ty, bool isMasked = false) {
   assert(!vs2Ty->isPtrOrPtrVectorTy() &&
          "Cannot get gather intrinsic for a vector of pointers");
 
@@ -281,7 +283,7 @@ llvm::Intrinsic::RISCVIntrinsics getSlideUpIntrinsic(llvm::Type *vs2Ty) {
   return Opc;
 }
 
-}  // namespace
+} // namespace
 
 llvm::Value *TargetInfoRISCV::createScalableExtractElement(
     llvm::IRBuilder<> &B, vecz::VectorizationContext &Ctx,
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_choices.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_choices.cpp
index cb9feaea7ddbf..0c16da1c10106 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_choices.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_choices.cpp
@@ -61,7 +61,7 @@ static const VectorizationChoices::ChoiceInfo choicesArray[] = {
      "purposes)"},
 };
 
-}  // namespace
+} // namespace
 
 namespace vecz {
 
@@ -169,4 +169,4 @@ void VectorizationChoices::printChoicesParseError(StringRef Input,
   }
   errs() << "^\n";
 }
-}  // namespace vecz
+} // namespace vecz
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
index 57c25642c6c8e..a90ce8d767048 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
@@ -65,8 +65,8 @@ VectorizationContext::VectorizationContext(llvm::Module &target,
                                            compiler::utils::BuiltinInfo &bi)
     : VTI(vti), Module(target), BI(bi), DL(&Module.getDataLayout()) {}
 
-TargetTransformInfo VectorizationContext::getTargetTransformInfo(
-    Function &F) const {
+TargetTransformInfo
+VectorizationContext::getTargetTransformInfo(Function &F) const {
   auto *const TM = targetInfo().getTargetMachine();
   if (TM) {
     return TM->getTargetTransformInfo(F);
@@ -125,8 +125,9 @@ bool VectorizationContext::canExpandBuiltin(const Function *ScalarFn) const {
   return true;
 }
 
-VectorizationResult &VectorizationContext::getOrCreateBuiltin(
-    llvm::Function &F, unsigned SimdWidth) {
+VectorizationResult &
+VectorizationContext::getOrCreateBuiltin(llvm::Function &F,
+                                         unsigned SimdWidth) {
   compiler::utils::BuiltinInfo &BI = builtins();
   const auto Cached = VectorizedBuiltins.find(&F);
   if (Cached != VectorizedBuiltins.end()) {
@@ -178,8 +179,9 @@ VectorizationResult &VectorizationContext::getOrCreateBuiltin(
   return result;
 }
 
-VectorizationResult VectorizationContext::getVectorizedFunction(
-    Function &callee, ElementCount factor) {
+VectorizationResult
+VectorizationContext::getVectorizedFunction(Function &callee,
+                                            ElementCount factor) {
   VectorizationResult result;
   if (factor.isScalable()) {
     // We can't vectorize builtins by a scalable factor yet.
@@ -233,7 +235,7 @@ Function *VectorizationContext::getOrCreateMaskedFunction(CallInst *CI) {
   if (!F) {
     F = dyn_cast<Function>(CI->getCalledOperand()->stripPointerCasts());
   }
-  VECZ_FAIL_IF(!F);  // TODO: Support indirect function calls.
+  VECZ_FAIL_IF(!F); // TODO: Support indirect function calls.
   LLVMContext &ctx = F->getContext();
 
   // We will handle printf statements, but handling every possible vararg
@@ -542,27 +544,27 @@ Function *VectorizationContext::getOrCreateMaskedAtomicFunction(
   // Mangle ordering
   auto mangleOrdering = [&O](AtomicOrdering Ordering) {
     switch (Ordering) {
-      case AtomicOrdering::Acquire:
-        O << "acquire";
-        return;
-      case AtomicOrdering::AcquireRelease:
-        O << "acqrel";
-        return;
-      case AtomicOrdering::Monotonic:
-        O << "monotonic";
-        return;
-      case AtomicOrdering::NotAtomic:
-        O << "notatomic";
-        return;
-      case AtomicOrdering::Release:
-        O << "release";
-        return;
-      case AtomicOrdering::SequentiallyConsistent:
-        O << "seqcst";
-        return;
-      case AtomicOrdering::Unordered:
-        O << "unordered";
-        return;
+    case AtomicOrdering::Acquire:
+      O << "acquire";
+      return;
+    case AtomicOrdering::AcquireRelease:
+      O << "acqrel";
+      return;
+    case AtomicOrdering::Monotonic:
+      O << "monotonic";
+      return;
+    case AtomicOrdering::NotAtomic:
+      O << "notatomic";
+      return;
+    case AtomicOrdering::Release:
+      O << "release";
+      return;
+    case AtomicOrdering::SequentiallyConsistent:
+      O << "seqcst";
+      return;
+    case AtomicOrdering::Unordered:
+      O << "unordered";
+      return;
     }
 
     O << static_cast<unsigned>(Ordering);
@@ -599,8 +601,8 @@ Function *VectorizationContext::getOrCreateMaskedAtomicFunction(
 }
 
 namespace {
-std::optional<std::tuple<bool, RecurKind, bool>> isSubgroupScan(
-    StringRef fnName, Type *const ty) {
+std::optional<std::tuple<bool, RecurKind, bool>>
+isSubgroupScan(StringRef fnName, Type *const ty) {
   compiler::utils::Lexer L(fnName);
   if (!L.Consume(VectorizationContext::InternalBuiltinPrefix)) {
     return std::nullopt;
@@ -650,7 +652,7 @@ std::optional<std::tuple<bool, RecurKind, bool>> isSubgroupScan(
   }
   return std::nullopt;
 }
-}  // namespace
+} // namespace
 
 bool VectorizationContext::defineInternalBuiltin(Function *F) {
   assert(F->isDeclaration() && "builtin is already defined");
@@ -1056,9 +1058,9 @@ bool VectorizationContext::emitMaskedAtomicBody(
   Value *RetVal = nullptr;
   Value *RetSuccessVal = nullptr;
 
-  auto CreateLoopBody = [&MA, &F, &ExitBB, PtrArg, ValArg, MaskArg, &RetVal,
-                         &RetSuccessVal, IsVector, IsCmpXchg](
-                            BasicBlock *BB, Value *Idx, ArrayRef<Value *> IVs,
+  auto CreateLoopBody =
+      [&MA, &F, &ExitBB, PtrArg, ValArg, MaskArg, &RetVal, &RetSuccessVal,
+       IsVector, IsCmpXchg](BasicBlock *BB, Value *Idx, ArrayRef<Value *> IVs,
                             MutableArrayRef<Value *> IVsNext) -> BasicBlock * {
     IRBuilder<> IRB(BB);
 
@@ -1169,8 +1171,9 @@ bool VectorizationContext::emitMaskedAtomicBody(
   return true;
 }
 
-Function *VectorizationContext::getInternalVectorEquivalent(
-    Function *ScalarFn, unsigned SimdWidth) {
+Function *
+VectorizationContext::getInternalVectorEquivalent(Function *ScalarFn,
+                                                  unsigned SimdWidth) {
   // Handle masked memory loads and stores.
   if (!ScalarFn) {
     return nullptr;
@@ -1195,8 +1198,8 @@ bool VectorizationContext::insertMaskedFunction(llvm::Function *F,
   return result.second;
 }
 
-llvm::Function *VectorizationContext::getOriginalMaskedFunction(
-    llvm::Function *F) {
+llvm::Function *
+VectorizationContext::getOriginalMaskedFunction(llvm::Function *F) {
   auto Iter = MaskedFunctionsMap.find(F);
   if (Iter != MaskedFunctionsMap.end()) {
     return dyn_cast_or_null<llvm::Function>(Iter->second);
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_helpers.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_helpers.cpp
index 24c892f7b606a..29d505d28369b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_helpers.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_helpers.cpp
@@ -120,9 +120,9 @@ void cloneOpenCLNamedMetadataHelper(const VectorizationUnit &VU,
 /// into the vectorized function.
 ///
 /// @param[in,out] ValueMap Map to update with the arguments.
-SmallVector<Instruction *, 2> createArgumentPlaceholders(
-    const VectorizationUnit &VU, Function *VecFunc,
-    ValueToValueMapTy &ValueMap) {
+SmallVector<Instruction *, 2>
+createArgumentPlaceholders(const VectorizationUnit &VU, Function *VecFunc,
+                           ValueToValueMapTy &ValueMap) {
   SmallVector<Instruction *, 2> Placeholders;
   const auto &Arguments = VU.arguments();
   unsigned i = 0u;
@@ -145,7 +145,7 @@ SmallVector<Instruction *, 2> createArgumentPlaceholders(
   return Placeholders;
 }
 
-}  // namespace
+} // namespace
 
 namespace vecz {
 std::string getVectorizedFunctionName(StringRef ScalarName, ElementCount VF,
@@ -261,7 +261,8 @@ Function *cloneFunctionToVector(const VectorizationUnit &VU) {
 static DILocation *getDILocation(unsigned Line, unsigned Column, MDNode *Scope,
                                  MDNode *InlinedAt = nullptr) {
   // If no scope is available, this is an unknown location.
-  if (!Scope) return DebugLoc();
+  if (!Scope)
+    return DebugLoc();
   return DILocation::get(Scope->getContext(), Line, Column, Scope, InlinedAt,
                          /*ImplicitCode*/ false);
 }
@@ -339,4 +340,4 @@ void cloneOpenCLMetadata(const VectorizationUnit &VU) {
   cloneOpenCLNamedMetadataHelper(VU, "opencl.kernel_wg_size_info");
 }
 
-}  // namespace vecz
+} // namespace vecz
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_heuristics.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_heuristics.cpp
index 7084bb6a4211d..afa45e4c325c0 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_heuristics.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_heuristics.cpp
@@ -33,7 +33,7 @@ namespace {
 class Heuristics {
   enum class BrClauseKind { None = 0, True, False };
 
- public:
+public:
   Heuristics(llvm::Function &F, VectorizationContext &Ctx, ElementCount VF,
              unsigned SimdDimIdx)
       : F(F), Ctx(Ctx), SimdWidth(VF), SimdDimIdx(SimdDimIdx) {}
@@ -49,7 +49,7 @@ class Heuristics {
   /// @return Whether we should vectorize the function or not.
   bool shouldVectorize();
 
- private:
+private:
   /// @brief Passthrough to CmpInst.
   ///
   /// @param[in] Comp The instruction to inspect.
@@ -81,8 +81,9 @@ class Heuristics {
   /// @param[in] Pred The kind of comparison.
   ///
   /// @return The branch's path not to vectorize, if any.
-  BrClauseKind shouldVectorizeVisitCmpOperands(
-      const llvm::Value *RHS, llvm::CmpInst::Predicate Pred) const;
+  BrClauseKind
+  shouldVectorizeVisitCmpOperands(const llvm::Value *RHS,
+                                  llvm::CmpInst::Predicate Pred) const;
 
   /// @brief The function to analyze.
   llvm::Function &F;
@@ -97,8 +98,9 @@ class Heuristics {
   unsigned SimdDimIdx;
 };
 
-Heuristics::BrClauseKind Heuristics::shouldVectorizeVisitCmpOperands(
-    const Value *RHS, CmpInst::Predicate Pred) const {
+Heuristics::BrClauseKind
+Heuristics::shouldVectorizeVisitCmpOperands(const Value *RHS,
+                                            CmpInst::Predicate Pred) const {
   // If we have an `EQ` comparison, the single lane computation happens on
   // the true successor.
   if (Pred == CmpInst::Predicate::ICMP_EQ) {
@@ -122,38 +124,38 @@ Heuristics::BrClauseKind Heuristics::shouldVectorizeVisitCmpOperands(
     // If we have a branch whose condition only applies for at most half of the
     // simd width, it is not worth vectorizing it.
     switch (Pred) {
-      default:
-        break;
-      // If we have a `GT` or `GE` comparison, if the constant we compare the
-      // opencl builtin against is greater than half of the simd width, we will
-      // not take the true branch as often as the false branch.
-      case CmpInst::Predicate::ICMP_UGT:
-      case CmpInst::Predicate::ICMP_UGE:
-      case CmpInst::Predicate::ICMP_SGT:
-      case CmpInst::Predicate::ICMP_SGE:
-        if (SimdWidth.isScalable()) {
-          return BrClauseKind::True;
-        } else if (Val->getValue().sgt(SimdWidth.getFixedValue() / 2)) {
-          return BrClauseKind::True;
-        } else if (Val->getValue().slt(SimdWidth.getFixedValue() / 2)) {
-          return BrClauseKind::False;
-        }
-        break;
-      // If we have an `LT` or `LE` comparison, if the constant we compare the
-      // opencl builtin against is smaller than half of the simd width, we will
-      // not take the true branch as often as the false branch.
-      case CmpInst::Predicate::ICMP_ULT:
-      case CmpInst::Predicate::ICMP_ULE:
-      case CmpInst::Predicate::ICMP_SLT:
-      case CmpInst::Predicate::ICMP_SLE:
-        if (SimdWidth.isScalable()) {
-          return BrClauseKind::False;
-        } else if (Val->getValue().slt(SimdWidth.getFixedValue() / 2)) {
-          return BrClauseKind::True;
-        } else if (Val->getValue().sgt(SimdWidth.getFixedValue() / 2)) {
-          return BrClauseKind::False;
-        }
-        break;
+    default:
+      break;
+    // If we have a `GT` or `GE` comparison, if the constant we compare the
+    // opencl builtin against is greater than half of the simd width, we will
+    // not take the true branch as often as the false branch.
+    case CmpInst::Predicate::ICMP_UGT:
+    case CmpInst::Predicate::ICMP_UGE:
+    case CmpInst::Predicate::ICMP_SGT:
+    case CmpInst::Predicate::ICMP_SGE:
+      if (SimdWidth.isScalable()) {
+        return BrClauseKind::True;
+      } else if (Val->getValue().sgt(SimdWidth.getFixedValue() / 2)) {
+        return BrClauseKind::True;
+      } else if (Val->getValue().slt(SimdWidth.getFixedValue() / 2)) {
+        return BrClauseKind::False;
+      }
+      break;
+    // If we have an `LT` or `LE` comparison, if the constant we compare the
+    // opencl builtin against is smaller than half of the simd width, we will
+    // not take the true branch as often as the false branch.
+    case CmpInst::Predicate::ICMP_ULT:
+    case CmpInst::Predicate::ICMP_ULE:
+    case CmpInst::Predicate::ICMP_SLT:
+    case CmpInst::Predicate::ICMP_SLE:
+      if (SimdWidth.isScalable()) {
+        return BrClauseKind::False;
+      } else if (Val->getValue().slt(SimdWidth.getFixedValue() / 2)) {
+        return BrClauseKind::True;
+      } else if (Val->getValue().sgt(SimdWidth.getFixedValue() / 2)) {
+        return BrClauseKind::False;
+      }
+      break;
     }
   }
 
@@ -225,8 +227,8 @@ const Value *Heuristics::shouldVectorizeVisitCmpOperand(
   return (Cache[Val] = nullptr);
 }
 
-Heuristics::BrClauseKind Heuristics::shouldVectorizeVisitCmp(
-    const CmpInst *Cmp) const {
+Heuristics::BrClauseKind
+Heuristics::shouldVectorizeVisitCmp(const CmpInst *Cmp) const {
   // The following two calls return either a CallInst, a ConstantInt, or
   // nullptr otherwise. If it returns a CallInst, it necessarily is a call to
   // get_{global|local}_id, because otherwise we don't care.
@@ -256,8 +258,8 @@ Heuristics::BrClauseKind Heuristics::shouldVectorizeVisitCmp(
   return vectorize;
 }
 
-Heuristics::BrClauseKind Heuristics::shouldVectorizeVisitBr(
-    const Value *Comp) const {
+Heuristics::BrClauseKind
+Heuristics::shouldVectorizeVisitBr(const Value *Comp) const {
   // If we are visiting a binary operator, inspect both its operands to
   // perhaps find CmpInsts.
   // E.g.: %and = and ...
@@ -381,7 +383,7 @@ bool Heuristics::shouldVectorize() {
 
   return true;
 }
-}  // namespace
+} // namespace
 
 namespace vecz {
 bool shouldVectorize(llvm::Function &F, VectorizationContext &Ctx,
@@ -389,4 +391,4 @@ bool shouldVectorize(llvm::Function &F, VectorizationContext &Ctx,
   Heuristics VH(F, Ctx, VF, SimdDimIdx);
   return VH.shouldVectorize();
 }
-}  // namespace vecz
+} // namespace vecz
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_unit.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_unit.cpp
index 22fdd0cee503d..6516d2f593982 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_unit.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_unit.cpp
@@ -44,15 +44,9 @@ VectorizationUnit::VectorizationUnit(Function &F, ElementCount Width,
                                      unsigned Dimension,
                                      VectorizationContext &Ctx,
                                      const VectorizationChoices &Ch)
-    : Ctx(Ctx),
-      Choices(Ch),
-      ScalarFn(&F),
-      VectorizedFn(nullptr),
-      SimdWidth(ElementCount()),
-      LocalSize(0),
-      AutoSimdWidth(false),
-      SimdDimIdx(Dimension),
-      FnFlags(eFunctionNoFlag) {
+    : Ctx(Ctx), Choices(Ch), ScalarFn(&F), VectorizedFn(nullptr),
+      SimdWidth(ElementCount()), LocalSize(0), AutoSimdWidth(false),
+      SimdDimIdx(Dimension), FnFlags(eFunctionNoFlag) {
   // Gather information about the function's arguments.
   for (Argument &Arg : F.args()) {
     VectorizerTargetArgument TargetArg;
@@ -148,8 +142,9 @@ void VectorizationUnit::setVectorizedFunction(llvm::Function *NewFunction) {
   }
 }
 
-vecz::internal::AnalysisFailResult VectorizationUnit::setFailed(
-    const char *remark, const llvm::Function *F, const llvm::Value *V) {
+vecz::internal::AnalysisFailResult
+VectorizationUnit::setFailed(const char *remark, const llvm::Function *F,
+                             const llvm::Value *V) {
   setFlag(eFunctionVectorizationFailed);
   emitVeczRemarkMissed(F ? F : &function(), V, remark);
   return vecz::internal::AnalysisFailResult();
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorizer.cpp
index f5576eddc0de4..a9c44e44b2cd4 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorizer.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorizer.cpp
@@ -45,12 +45,13 @@ using namespace vecz;
 using namespace llvm;
 
 namespace {
-static cl::opt<bool> VeczDumpReport(
-    "vecz-dump-report", cl::desc("report the post-vectorization status"));
+static cl::opt<bool>
+    VeczDumpReport("vecz-dump-report",
+                   cl::desc("report the post-vectorization status"));
 // static cl options allow us to access these options from other cpp files,
 // such as vectorization_unit.cpp
 
-}  // namespace
+} // namespace
 
 // Statistics
 STATISTIC(VeczSuccess, "Number of kernels successfully vectorized [ID#V80]");
@@ -238,7 +239,7 @@ void collectStatistics(VectorizationUnit &VU, Function *Scalar,
                         ScalarVectorInsts * MaxScalarVectorWidth)) /
           VeczInstructions;
 }
-}  // namespace
+} // namespace
 
 VectorizationUnit *vecz::createVectorizationUnit(VectorizationContext &Ctx,
                                                  Function *Kernel,
@@ -349,13 +350,13 @@ bool vecz::createVectorizedFunctionMetadata(VectorizationUnit &vu) {
   const compiler::utils::VectorizationInfo info{
       finalVF, dim, vu.choices().vectorPredication()};
 
-  if (vectorizedFn && vectorizedFn != fn) {  // success
+  if (vectorizedFn && vectorizedFn != fn) { // success
     // Link the original function to the vectorized one.
     compiler::utils::linkOrigToVeczFnMetadata(*fn, *vectorizedFn, info);
 
     // Link the vectorized function back to the original one.
     compiler::utils::linkVeczToOrigFnMetadata(*vectorizedFn, *fn, info);
-  } else {  // fail or bail
+  } else { // fail or bail
     compiler::utils::encodeVectorizationFailedMetadata(*fn, info);
   }
   return vectorizedFn;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vecz_pass_builder.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vecz_pass_builder.cpp
index 6fd9fdcb83bbf..bcbeabbf9766b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vecz_pass_builder.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vecz_pass_builder.cpp
@@ -112,11 +112,11 @@ void VeczPassMachinery::registerPasses() {
 
 void VeczPassMachinery::addClassToPassNames() {
   {
-#define MODULE_PASS(NAME, CREATE_PASS) \
+#define MODULE_PASS(NAME, CREATE_PASS)                                         \
   PIC.addClassToPassName(decltype(CREATE_PASS)::name(), NAME);
-#define FUNCTION_PASS(NAME, CREATE_PASS) \
+#define FUNCTION_PASS(NAME, CREATE_PASS)                                       \
   PIC.addClassToPassName(decltype(CREATE_PASS)::name(), NAME);
-#define LOOP_PASS(NAME, CREATE_PASS) \
+#define LOOP_PASS(NAME, CREATE_PASS)                                           \
   PIC.addClassToPassName(decltype(CREATE_PASS)::name(), NAME);
 #include "passes.def"
   }
@@ -153,21 +153,21 @@ void VeczPassMachinery::registerPassCallbacks() {
   PB.registerPipelineParsingCallback(
       [](StringRef Name, ModulePassManager &PM,
          ArrayRef<PassBuilder::PipelineElement>) {
-#define MODULE_PASS(NAME, CREATE_PASS) \
-  if (Name == NAME) {                  \
-    PM.addPass(CREATE_PASS);           \
-    return true;                       \
+#define MODULE_PASS(NAME, CREATE_PASS)                                         \
+  if (Name == NAME) {                                                          \
+    PM.addPass(CREATE_PASS);                                                   \
+    return true;                                                               \
   }
-#define FUNCTION_PASS(NAME, CREATE_PASS)                        \
-  if (Name == NAME) {                                           \
-    PM.addPass(createModuleToFunctionPassAdaptor(CREATE_PASS)); \
-    return true;                                                \
+#define FUNCTION_PASS(NAME, CREATE_PASS)                                       \
+  if (Name == NAME) {                                                          \
+    PM.addPass(createModuleToFunctionPassAdaptor(CREATE_PASS));                \
+    return true;                                                               \
   }
-#define LOOP_PASS(NAME, CREATE_PASS)                    \
-  if (Name == NAME) {                                   \
-    PM.addPass(createModuleToFunctionPassAdaptor(       \
-        createFunctionToLoopPassAdaptor(CREATE_PASS))); \
-    return true;                                        \
+#define LOOP_PASS(NAME, CREATE_PASS)                                           \
+  if (Name == NAME) {                                                          \
+    PM.addPass(createModuleToFunctionPassAdaptor(                              \
+        createFunctionToLoopPassAdaptor(CREATE_PASS)));                        \
+    return true;                                                               \
   }
 #include "passes.def"
         return false;
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp
index 2b722b490582e..5c4a4f228db00 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp
@@ -52,39 +52,39 @@
 #include "vecz/pass.h"
 #include "vecz/vecz_target_info.h"
 
-static llvm::cl::opt<std::string> InputFilename(
-    llvm::cl::Positional, llvm::cl::desc("<input .bc file>"),
-    llvm::cl::init("-"));
+static llvm::cl::opt<std::string>
+    InputFilename(llvm::cl::Positional, llvm::cl::desc("<input .bc file>"),
+                  llvm::cl::init("-"));
 
-static llvm::cl::opt<std::string> OutputFilename(
-    "o", llvm::cl::desc("Override output filename"),
-    llvm::cl::value_desc("filename"));
-static llvm::cl::opt<bool, false> WriteTextual(
-    "S", llvm::cl::desc("Write module as text"));
+static llvm::cl::opt<std::string>
+    OutputFilename("o", llvm::cl::desc("Override output filename"),
+                   llvm::cl::value_desc("filename"));
+static llvm::cl::opt<bool, false>
+    WriteTextual("S", llvm::cl::desc("Write module as text"));
 
-static llvm::cl::list<std::string> KernelNameSpecs(
-    "k", llvm::cl::desc("Kernel to vectorize"), llvm::cl::ZeroOrMore,
-    llvm::cl::value_desc("name"));
+static llvm::cl::list<std::string>
+    KernelNameSpecs("k", llvm::cl::desc("Kernel to vectorize"),
+                    llvm::cl::ZeroOrMore, llvm::cl::value_desc("name"));
 
-static llvm::cl::opt<unsigned> SIMDDimIdx(
-    "d", llvm::cl::desc("Dimension index to vectorize on"), llvm::cl::init(0),
-    llvm::cl::value_desc("dimension"));
+static llvm::cl::opt<unsigned>
+    SIMDDimIdx("d", llvm::cl::desc("Dimension index to vectorize on"),
+               llvm::cl::init(0), llvm::cl::value_desc("dimension"));
 
-static llvm::cl::opt<unsigned> SIMDWidth(
-    "w", llvm::cl::desc("Width to vectorize to"), llvm::cl::init(0),
-    llvm::cl::value_desc("width"));
+static llvm::cl::opt<unsigned>
+    SIMDWidth("w", llvm::cl::desc("Width to vectorize to"), llvm::cl::init(0),
+              llvm::cl::value_desc("width"));
 
 static llvm::cl::opt<bool> FailQuietly(
     "vecz-fail-quietly",
     llvm::cl::desc("don't return an error code on vectorization failure"));
 
-static llvm::cl::opt<bool> ChoicesHelp(
-    "vecz-choices-help",
-    llvm::cl::desc("see information about available choices"));
+static llvm::cl::opt<bool>
+    ChoicesHelp("vecz-choices-help",
+                llvm::cl::desc("see information about available choices"));
 
-static llvm::cl::opt<bool> VeczAuto(
-    "vecz-auto",
-    llvm::cl::desc("run the vectorizer if it is found to be useful"));
+static llvm::cl::opt<bool>
+    VeczAuto("vecz-auto",
+             llvm::cl::desc("run the vectorizer if it is found to be useful"));
 
 static llvm::cl::opt<unsigned, 0> VeczSimdWidth(
     "vecz-simd-width",
@@ -96,27 +96,29 @@ static llvm::cl::opt<llvm::cl::boolOrDefault> VeczScalable(
 
 // Allow the passing of Vecz Choices string on the command line. This is parsed
 // after the choices environment variable, thus overriding it.
-static llvm::cl::opt<std::string> ChoicesString(
-    "vecz-choices", llvm::cl::desc("Set vecz choices"));
+static llvm::cl::opt<std::string>
+    ChoicesString("vecz-choices", llvm::cl::desc("Set vecz choices"));
 
-static llvm::cl::opt<bool> VeczCollectStats(
-    "vecz-llvm-stats", llvm::cl::desc("enable reporting LLVM statistics"));
+static llvm::cl::opt<bool>
+    VeczCollectStats("vecz-llvm-stats",
+                     llvm::cl::desc("enable reporting LLVM statistics"));
 
-static llvm::cl::opt<std::string> UserTriple(
-    "vecz-target-triple", llvm::cl::desc("the target triple"));
+static llvm::cl::opt<std::string>
+    UserTriple("vecz-target-triple", llvm::cl::desc("the target triple"));
 static llvm::cl::opt<std::string> UserCPU("vecz-target-mcpu",
                                           llvm::cl::desc("Set the CPU model"));
-static llvm::cl::opt<std::string> CPUFeatures(
-    "vecz-target-features", llvm::cl::desc("Set the CPU feature string"));
+static llvm::cl::opt<std::string>
+    CPUFeatures("vecz-target-features",
+                llvm::cl::desc("Set the CPU feature string"));
 static llvm::cl::opt<bool> DoubleSupport(
     "vecz-double-support", llvm::cl::init(true),
     llvm::cl::desc(
         "Assume the target has double-precision floating point support"));
 
-static llvm::cl::list<unsigned> SGSizes(
-    "device-sg-sizes",
-    llvm::cl::desc("Comma-separated list of supported sub-group sizes"),
-    llvm::cl::CommaSeparated);
+static llvm::cl::list<unsigned>
+    SGSizes("device-sg-sizes",
+            llvm::cl::desc("Comma-separated list of supported sub-group sizes"),
+            llvm::cl::CommaSeparated);
 
 static llvm::TargetMachine *initLLVMTarget(llvm::StringRef triple_string,
                                            llvm::StringRef cpu_model,
@@ -203,9 +205,9 @@ static vecz::VeczPassOptions getDefaultPassOptions() {
 // <simd_width> ::= '@' <number>
 // <scalable_spec> ::= 's'
 // <predicated_spec> ::= 'p'
-static bool parsePassOptionsSwitch(
-    const llvm::StringRef spec, llvm::StringRef &name,
-    llvm::SmallVectorImpl<vecz::VeczPassOptions> &opts) {
+static bool
+parsePassOptionsSwitch(const llvm::StringRef spec, llvm::StringRef &name,
+                       llvm::SmallVectorImpl<vecz::VeczPassOptions> &opts) {
   auto pair = spec.split(':');
   name = pair.first;
   auto vals = pair.second;

From 477f0cd0024e7cc3a92ea56684cba6ec599012c1 Mon Sep 17 00:00:00 2001
From: Colin Davidson <colin.davidson@codeplay.com>
Date: Fri, 5 Sep 2025 13:43:44 +0100
Subject: [PATCH 182/182] [NATIVE_CPU][SYCL] Switch to using native_cpu
 compiler pipeline inline

This brings the Native CPU compiler pipeline files directly under the ownership
of intel/llvm. This removes the direct dependence on the oneAPI Construction
Kit, although the history of those files still exists under intel/llvm and the
originals still exist at
https://github.com/uxlfoundation/oneapi-construction-kit.

This is the post-merge update to the oneAPI Construction Kit move of the
compiler pipeline files for Native CPU and replaces the original FetchContent
method.

`llvm/lib/SYCLNativeCPUUtils/CMakeLists.txt` was updated to remove the fetch
content and the ugly install workaround. The lit tests for the pipeline are
missing, as these rely on a non-ported tool, `muxc`. The CMakeLists.txt files
were updated to use LLVM macros across the `compiler_pipeline` and `vecz`
ported code, and `NATIVECPU_USE_OCK` was changed to be the default. The lit
test cfg files were updated to fit with LLVM, and a small change was made to
avoid use of `undef`.

A brief explanation of how the files were imported, and the limitations of
that import, can be found in
`llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_passes.rst`.
---
 .github/workflows/sycl-linux-build.yml        |   3 +-
 llvm/lib/SYCLNativeCPUUtils/CMakeLists.txt    |  88 +++---------
 .../compiler_passes/CMakeLists.txt            |   2 +
 .../compiler_passes/compiler_passes.rst       |  63 ++++++++
 .../compiler_pipeline/CMakeLists.txt          |  32 +++++
 .../compiler_passes/vecz/CMakeLists.txt       | 135 ++++++++++++++++++
 .../compiler_passes/vecz/test/CMakeLists.txt  |   1 +
 .../vecz/test/lit/CMakeLists.txt              |  26 ++++
 .../compiler_passes/vecz/test/lit/lit.cfg.py  |  37 +++++
 .../vecz/test/lit/lit.site.cfg.py.in          |  22 +++
 .../test/lit/llvm/packetization_debug_info.ll |   4 +-
 .../compiler_passes/vecz/tools/CMakeLists.txt |  14 ++
 12 files changed, 352 insertions(+), 75 deletions(-)
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/CMakeLists.txt
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_passes.rst
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/CMakeLists.txt
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/CMakeLists.txt
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/CMakeLists.txt
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/CMakeLists.txt
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/lit.cfg.py
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/lit.site.cfg.py.in
 create mode 100644 llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/CMakeLists.txt

diff --git a/.github/workflows/sycl-linux-build.yml b/.github/workflows/sycl-linux-build.yml
index 33c2269fb360b..fb593ee00b3eb 100644
--- a/.github/workflows/sycl-linux-build.yml
+++ b/.github/workflows/sycl-linux-build.yml
@@ -202,8 +202,7 @@ jobs:
           --ci-defaults ${{ inputs.build_configure_extra_args }} \
           -DCMAKE_C_COMPILER_LAUNCHER=ccache \
           -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-          -DLLVM_INSTALL_UTILS=ON \
-          -DNATIVECPU_USE_OCK=Off
+          -DLLVM_INSTALL_UTILS=ON
     - name: Compile
       id: build
       # Emulate default value for manual dispatch as we've run out of available arguments.
diff --git a/llvm/lib/SYCLNativeCPUUtils/CMakeLists.txt b/llvm/lib/SYCLNativeCPUUtils/CMakeLists.txt
index fb2bf7703ab10..dce987133970b 100644
--- a/llvm/lib/SYCLNativeCPUUtils/CMakeLists.txt
+++ b/llvm/lib/SYCLNativeCPUUtils/CMakeLists.txt
@@ -1,3 +1,16 @@
+set(OCK_LIBS)
+option(NATIVECPU_USE_OCK "Use the oneAPI Construction Kit for Native CPU" ON)
+
+# Don't use OCK compiler_passes if Native CPU is not enabled.
+if(NOT "native_cpu" IN_LIST SYCL_ENABLE_BACKENDS)
+  set(NATIVECPU_USE_OCK Off CACHE BOOL "Use the oneAPI Construction Kit for Native CPU" FORCE)
+endif()
+
+if(NATIVECPU_USE_OCK)
+  add_subdirectory(compiler_passes EXCLUDE_FROM_ALL)
+  set(OCK_LIBS NativeCPUPipeline NativeCPUVecz)
+endif()
+
 add_llvm_component_library(LLVMSYCLNativeCPUUtils
   PipelineSYCLNativeCPU.cpp
   PrepareSYCLNativeCPU.cpp
@@ -17,80 +30,13 @@ add_llvm_component_library(LLVMSYCLNativeCPUUtils
   TargetParser
   TransformUtils
   ipo
-  )
+  ${OCK_LIBS}
+)
 
-option(NATIVECPU_USE_OCK "Use the oneAPI Construction Kit for Native CPU" ON)
-
-# Don't fetch OCK if Native CPU is not enabled.
-if(NOT "native_cpu" IN_LIST SYCL_ENABLE_BACKENDS)
-  set(NATIVECPU_USE_OCK Off CACHE BOOL "Use the oneAPI Construction Kit for Native CPU" FORCE)
-endif()
 
 if(NATIVECPU_USE_OCK)
-  set(OCK_SEARCH_LOC "oneapi-construction-kit/compiler_passes")
-  if(NOT FETCHCONTENT_SOURCE_DIR_ONEAPI-CK)
-    find_path(OCK_SOURCE_DIR ${OCK_SEARCH_LOC} PATHS ${CMAKE_PREFIX_PATH})
-  endif()
-  if(OCK_SOURCE_DIR)
-    message(STATUS "Found system source location of oneAPI Construction Kit in ${OCK_SOURCE_DIR}")
-    set(OCK_SOURCE_DIR "${OCK_SOURCE_DIR}/${OCK_SEARCH_LOC}")
-    set(OCK_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/oneapi-construction-kit")
-  else()
-    set(OCK_GIT_REPO "https://github.com/uxlfoundation/oneapi-construction-kit.git")
-    # commit d0a32d701e34b3285de7ce776ea36abfec673df7
-    # Merge: a9f848e0e8 56473a8c25
-    # Author: Harald van Dijk <harald.vandijk@codeplay.com>
-    # Date:   Mon Jun 30 12:24:46 2025 +0100
-    # 
-    #     Merge pull request #878 from hvdijk/specify-fuse-ld-lld
-    #     
-    #     [RefSi] Explicitly specify -fuse-ld=lld.
-    set(OCK_GIT_TAG d0a32d701e34b3285de7ce776ea36abfec673df7)
-   
-    include(FetchContent)
-    FetchContent_Declare(oneapi-ck
-     GIT_REPOSITORY "${OCK_GIT_REPO}"
-     GIT_TAG "${OCK_GIT_TAG}"
-    )
-    FetchContent_GetProperties(oneapi-ck)
-    if(NOT oneapi-ck_POPULATED)
-      if(FETCHCONTENT_SOURCE_DIR_ONEAPI-CK)
-        message(STATUS "Using specified oneAPI Construction Kit repo location at ${FETCHCONTENT_SOURCE_DIR_ONEAPI-CK}")
-      else()
-        message(STATUS "Cloning oneAPI Construction Kit from ${OCK_GIT_REPO}, tag ${OCK_GIT_TAG}")
-      endif()
-      FetchContent_Populate(oneapi-ck)
-      message(STATUS "oneAPI Construction Kit cloned in ${oneapi-ck_SOURCE_DIR}")
-      set(OCK_SOURCE_DIR ${oneapi-ck_SOURCE_DIR}/compiler_passes)
-      set(OCK_BINARY_DIR ${oneapi-ck_BINARY_DIR})
-    endif()
-  endif()
-
-  set(CA_ENABLE_API "cl" CACHE STRING "" FORCE)
-  add_subdirectory(
-    ${OCK_SOURCE_DIR}
-    ${OCK_BINARY_DIR} EXCLUDE_FROM_ALL)
-
-  install(TARGETS compiler-pipeline
-  EXPORT;LLVMExports
-          LIBRARY DESTINATION lib${LLVM_LIBDIR_SUFFIX} COMPONENT compiler-pipeline
-          ARCHIVE DESTINATION lib${LLVM_LIBDIR_SUFFIX} COMPONENT compiler-pipeline
-          RUNTIME DESTINATION lib${LLVM_LIBDIR_SUFFIX} COMPONENT compiler-pipeline)
-  set_property(GLOBAL APPEND PROPERTY LLVM_EXPORTS compiler-pipeline)
-  install(TARGETS vecz
-  EXPORT;LLVMExports
-          LIBRARY DESTINATION lib${LLVM_LIBDIR_SUFFIX} COMPONENT vecz
-          ARCHIVE DESTINATION lib${LLVM_LIBDIR_SUFFIX} COMPONENT vecz
-          RUNTIME DESTINATION lib${LLVM_LIBDIR_SUFFIX} COMPONENT vecz)
-  set_property(GLOBAL APPEND PROPERTY LLVM_EXPORTS vecz)
-  install(TARGETS multi_llvm EXPORT;LLVMExports)
-  set_property(GLOBAL APPEND PROPERTY LLVM_EXPORTS multi_llvm)
   target_compile_definitions(LLVMSYCLNativeCPUUtils PRIVATE  NATIVECPU_USE_OCK)
   target_include_directories(LLVMSYCLNativeCPUUtils PRIVATE 
-    ${oneapi-ck_SOURCE_DIR}/modules/compiler/multi_llvm/include
-    ${oneapi-ck_SOURCE_DIR}/modules/cargo/include
-    ${oneapi-ck_SOURCE_DIR}/modules/compiler/vecz/include
-    ${oneapi-ck_SOURCE_DIR}/modules/compiler/utils/include)
-  target_link_libraries(LLVMSYCLNativeCPUUtils PRIVATE compiler-pipeline vecz)
-
+    ${CMAKE_CURRENT_SOURCE_DIR}/compiler_passes/compiler_pipeline/include
+    ${CMAKE_CURRENT_SOURCE_DIR}/compiler_passes/vecz/include)
 endif()
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/CMakeLists.txt b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/CMakeLists.txt
new file mode 100644
index 0000000000000..de47b25e03a30
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_subdirectory(compiler_pipeline)
+add_subdirectory(vecz)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_passes.rst b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_passes.rst
new file mode 100644
index 0000000000000..cdfe3a9c79034
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_passes.rst
@@ -0,0 +1,63 @@
+Compiler passes
+===============
+
+Introduction
+------------
+
+Files under this directory are integrated from the `oneAPI Construction Kit`_
+using `git-filter-repo`. They are used by Native CPU to help create a pipeline for
+turning a base kernel into something which can be executed across multiple work
+items, including auto-vectorization.
+
+These files are largely from the sub-directories
+**modules/compiler/compiler_pipeline**, **modules/compiler/vecz** and
+**modules/compiler/multi_llvm**. Only files that are used have been integrated
+and the **CMake** files have been updated to fit in with LLVM components.
+
+These sub-directories are used as follows:
+
+* **compiler_pipeline** provides the passes to build a pipeline from the initial
+  kernel, including generating work item loops, handling local memory,
+  handling metadata and calling the vectorizer **vecz**.
+
+* **vecz** provides a full function vectorizer, which generates a copy of the
+  original function but vectorized across the work group, taking into account
+  subgroups.
+
+* **multi_llvm**. This provides some support for these functions to work across
+  multiple LLVM versions. Although this is not strictly needed in LLVM, it has
+  been integrated to allow the integration to go smoothly, without changing files
+  directly. Note this is header only and exists under
+  **compiler_pipeline/include/multi_llvm**.
+
+**compiler_pipeline** and **vecz** will be documented under `sycl/docs`. Note
+that there are several limitations in the code that are a result of the initial
+integration. These should be addressed over time for maintainability; fixing
+them is not necessary for correctness or performance.
+
+General limitations
+-------------------
+
+To simplify the integration and reduce risk, most of the files were integrated
+with no changes at all. This means there are currently the following limitations:
+
+* The namespace in **compiler_pipeline** is **compiler/utils**, the namespace in
+  multi_llvm is **multi_llvm** and the namespace in **vecz** is **vecz**. These should
+  be updated to reflect being under **LLVM**.
+* include files should ideally be moved to under **llvm/include** but remain under
+  these directories after the integration.
+* **vecz** has a test tool **veczc** and associated **lit** tests. This tool, if
+  required, should be moved under **llvm/tools** or **llvm/test**. It also
+  requires the `NATIVE_CPU_BUILD_VECZ_TEST_TOOLS` **CMake** option to build. The
+  tests can be run using the target `check-sycl-vecz`.
+* **compiler_pipeline** has lit tests for the passes which have not been integrated.
+  This is because they use a tool **muxc**, but these passes should be
+  able to be tested using **opt**. These lit tests can be found in the
+  `pipeline pass tests`_.
+* There are many integrated files that are unlikely to have any code coverage, but
+  because they are referred to in other files which we do need, they exist here.
+  These should be pruned over time as a better understanding is gained of what is
+  essential.
+
+.. _oneAPI Construction Kit: https://github.com/uxlfoundation/oneapi-construction-kit
+.. _pipeline pass tests: https://github.com/uxlfoundation/oneapi-construction-kit/tree/main/modules/compiler/test/lit/passes
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/CMakeLists.txt b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/CMakeLists.txt
new file mode 100644
index 0000000000000..90981a1718dac
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/CMakeLists.txt
@@ -0,0 +1,32 @@
+add_llvm_component_library(LLVMNativeCPUPipeline
+  source/attributes.cpp
+  source/barrier_regions.cpp
+  source/builtin_info.cpp
+  source/cl_builtin_info.cpp
+  source/define_mux_builtins_pass.cpp
+  source/dma.cpp
+  source/encode_kernel_metadata_pass.cpp
+  source/group_collective_helpers.cpp
+  source/mangling.cpp
+  source/metadata.cpp
+  source/mux_builtin_info.cpp
+  source/pass_functions.cpp
+  source/optimal_builtin_replacement_pass.cpp
+  source/pass_machinery.cpp
+  source/prepare_barriers_pass.cpp
+  source/replace_local_module_scope_variables_pass.cpp
+  source/scheduling.cpp
+  source/sub_group_analysis.cpp
+  source/target_extension_types.cpp
+  source/work_item_loops_pass.cpp
+
+  LINK_COMPONENTS
+  Passes
+  Core
+)
+
+# TODO: Relocate these headers under the LLVM include tree and find out why
+# ADDITIONAL_HEADER_DIRS does not pick them up.
+target_include_directories(LLVMNativeCPUPipeline
+  PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/CMakeLists.txt b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/CMakeLists.txt
new file mode 100644
index 0000000000000..7aa151998effa
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/CMakeLists.txt
@@ -0,0 +1,135 @@
+set(VECZ_PUBLIC_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include)
+set(VECZ_PRIVATE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/source)
+set(VECZ_PRIVATE_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/source/include)
+
+set(COMMON_SRCS
+  ${VECZ_PUBLIC_INCLUDE_DIR}/vecz/pass.h
+  ${VECZ_PUBLIC_INCLUDE_DIR}/vecz/vecz_choices.h
+  ${VECZ_PUBLIC_INCLUDE_DIR}/vecz/vecz_target_info.h
+  ${VECZ_PRIVATE_INCLUDE_DIR}/analysis/control_flow_analysis.h
+  ${VECZ_PRIVATE_INCLUDE_DIR}/analysis/divergence_analysis.h
+  ${VECZ_PRIVATE_INCLUDE_DIR}/analysis/instantiation_analysis.h
+  ${VECZ_PRIVATE_INCLUDE_DIR}/analysis/liveness_analysis.h
+  ${VECZ_PRIVATE_INCLUDE_DIR}/analysis/packetization_analysis.h
+  ${VECZ_PRIVATE_INCLUDE_DIR}/analysis/simd_width_analysis.h
+  ${VECZ_PRIVATE_INCLUDE_DIR}/analysis/stride_analysis.h
+  ${VECZ_PRIVATE_INCLUDE_DIR}/analysis/uniform_value_analysis.h
+  ${VECZ_PRIVATE_INCLUDE_DIR}/analysis/vectorizable_function_analysis.h
+  ${VECZ_PRIVATE_INCLUDE_DIR}/analysis/vectorization_unit_analysis.h
+  ${VECZ_PRIVATE_INCLUDE_DIR}/transform/common_gep_elimination_pass.h
+  ${VECZ_PRIVATE_INCLUDE_DIR}/transform/control_flow_conversion_pass.h
+  ${VECZ_PRIVATE_INCLUDE_DIR}/transform/inline_post_vectorization_pass.h
+  ${VECZ_PRIVATE_INCLUDE_DIR}/transform/instantiation_pass.h
+  ${VECZ_PRIVATE_INCLUDE_DIR}/transform/interleaved_group_combine_pass.h
+  ${VECZ_PRIVATE_INCLUDE_DIR}/transform/packetization_helpers.h
+  ${VECZ_PRIVATE_INCLUDE_DIR}/transform/packetization_pass.h
+  ${VECZ_PRIVATE_INCLUDE_DIR}/transform/packetizer.h
+  ${VECZ_PRIVATE_INCLUDE_DIR}/transform/passes.h
+  ${VECZ_PRIVATE_INCLUDE_DIR}/transform/printf_scalarizer.h
+  ${VECZ_PRIVATE_INCLUDE_DIR}/transform/scalarization_pass.h
+  ${VECZ_PRIVATE_INCLUDE_DIR}/transform/scalarizer.h
+  ${VECZ_PRIVATE_INCLUDE_DIR}/transform/ternary_transform_pass.h
+  ${VECZ_PRIVATE_INCLUDE_DIR}/control_flow_boscc.h
+  ${VECZ_PRIVATE_INCLUDE_DIR}/control_flow_roscc.h
+  ${VECZ_PRIVATE_INCLUDE_DIR}/debugging.h
+  ${VECZ_PRIVATE_INCLUDE_DIR}/ir_cleanup.h
+  ${VECZ_PRIVATE_INCLUDE_DIR}/llvm_helpers.h
+  ${VECZ_PRIVATE_INCLUDE_DIR}/memory_operations.h
+  ${VECZ_PRIVATE_INCLUDE_DIR}/offset_info.h
+  ${VECZ_PRIVATE_INCLUDE_DIR}/reachability.h
+  ${VECZ_PRIVATE_INCLUDE_DIR}/simd_packet.h
+  ${VECZ_PRIVATE_INCLUDE_DIR}/vectorization_context.h
+  ${VECZ_PRIVATE_INCLUDE_DIR}/vectorization_helpers.h
+  ${VECZ_PRIVATE_INCLUDE_DIR}/vectorization_heuristics.h
+  ${VECZ_PRIVATE_INCLUDE_DIR}/vectorization_unit.h
+  ${VECZ_PRIVATE_INCLUDE_DIR}/vectorizer.h
+  ${VECZ_PRIVATE_INCLUDE_DIR}/vecz_pass_builder.h
+  ${VECZ_PRIVATE_SOURCE_DIR}/analysis/control_flow_analysis.cpp
+  ${VECZ_PRIVATE_SOURCE_DIR}/analysis/divergence_analysis.cpp
+  ${VECZ_PRIVATE_SOURCE_DIR}/analysis/instantiation_analysis.cpp
+  ${VECZ_PRIVATE_SOURCE_DIR}/analysis/liveness_analysis.cpp
+  ${VECZ_PRIVATE_SOURCE_DIR}/analysis/packetization_analysis.cpp
+  ${VECZ_PRIVATE_SOURCE_DIR}/analysis/simd_width_analysis.cpp
+  ${VECZ_PRIVATE_SOURCE_DIR}/analysis/stride_analysis.cpp
+  ${VECZ_PRIVATE_SOURCE_DIR}/analysis/uniform_value_analysis.cpp
+  ${VECZ_PRIVATE_SOURCE_DIR}/analysis/vectorizable_function_analysis.cpp
+  ${VECZ_PRIVATE_SOURCE_DIR}/analysis/vectorization_unit_analysis.cpp
+  ${VECZ_PRIVATE_SOURCE_DIR}/transform/basic_mem2reg_pass.cpp
+  ${VECZ_PRIVATE_SOURCE_DIR}/transform/builtin_inlining_pass.cpp
+  ${VECZ_PRIVATE_SOURCE_DIR}/transform/common_gep_elimination_pass.cpp
+  ${VECZ_PRIVATE_SOURCE_DIR}/transform/control_flow_conversion_pass.cpp
+  ${VECZ_PRIVATE_SOURCE_DIR}/transform/inline_post_vectorization_pass.cpp
+  ${VECZ_PRIVATE_SOURCE_DIR}/transform/loop_rotate_custom_pass.cpp
+  ${VECZ_PRIVATE_SOURCE_DIR}/transform/instantiation_pass.cpp
+  ${VECZ_PRIVATE_SOURCE_DIR}/transform/interleaved_group_combine_pass.cpp
+  ${VECZ_PRIVATE_SOURCE_DIR}/transform/packetization_helpers.cpp
+  ${VECZ_PRIVATE_SOURCE_DIR}/transform/packetization_pass.cpp
+  ${VECZ_PRIVATE_SOURCE_DIR}/transform/packetizer.cpp
+  ${VECZ_PRIVATE_SOURCE_DIR}/transform/passes.cpp
+  ${VECZ_PRIVATE_SOURCE_DIR}/transform/pre_linearize_pass.cpp
+  ${VECZ_PRIVATE_SOURCE_DIR}/transform/printf_scalarizer.cpp
+  ${VECZ_PRIVATE_SOURCE_DIR}/transform/remove_intptr_pass.cpp
+  ${VECZ_PRIVATE_SOURCE_DIR}/transform/scalarization_pass.cpp
+  ${VECZ_PRIVATE_SOURCE_DIR}/transform/scalarizer.cpp
+  ${VECZ_PRIVATE_SOURCE_DIR}/transform/simplify_infinite_loop_pass.cpp
+  ${VECZ_PRIVATE_SOURCE_DIR}/transform/squash_small_vectors_pass.cpp
+  ${VECZ_PRIVATE_SOURCE_DIR}/transform/ternary_transform_pass.cpp
+  ${VECZ_PRIVATE_SOURCE_DIR}/transform/uniform_reassociation_pass.cpp
+  ${VECZ_PRIVATE_SOURCE_DIR}/control_flow_boscc.cpp
+  ${VECZ_PRIVATE_SOURCE_DIR}/control_flow_roscc.cpp
+  ${VECZ_PRIVATE_SOURCE_DIR}/debugging.cpp
+  ${VECZ_PRIVATE_SOURCE_DIR}/ir_cleanup.cpp
+  ${VECZ_PRIVATE_SOURCE_DIR}/llvm_helpers.cpp
+  ${VECZ_PRIVATE_SOURCE_DIR}/memory_operations.cpp
+  ${VECZ_PRIVATE_SOURCE_DIR}/offset_info.cpp
+  ${VECZ_PRIVATE_SOURCE_DIR}/pass.cpp
+  ${VECZ_PRIVATE_SOURCE_DIR}/reachability.cpp
+  ${VECZ_PRIVATE_SOURCE_DIR}/simd_packet.cpp
+  ${VECZ_PRIVATE_SOURCE_DIR}/vector_target_info.cpp
+  ${VECZ_PRIVATE_SOURCE_DIR}/vector_target_info_arm.cpp
+  ${VECZ_PRIVATE_SOURCE_DIR}/vector_target_info_riscv.cpp
+  ${VECZ_PRIVATE_SOURCE_DIR}/vectorization_choices.cpp
+  ${VECZ_PRIVATE_SOURCE_DIR}/vectorization_context.cpp
+  ${VECZ_PRIVATE_SOURCE_DIR}/vectorization_helpers.cpp
+  ${VECZ_PRIVATE_SOURCE_DIR}/vectorization_heuristics.cpp
+  ${VECZ_PRIVATE_SOURCE_DIR}/vectorization_unit.cpp
+  ${VECZ_PRIVATE_SOURCE_DIR}/vectorizer.cpp
+  ${VECZ_PRIVATE_SOURCE_DIR}/vecz_pass_builder.cpp
+)
+
+if(MSVC)
+  # Disable: unreferenced formal parameter.
+  list(REMOVE_ITEM VECZ_COMPILE_OPTIONS -we4100)
+  list(APPEND VECZ_COMPILE_OPTIONS -wd4100)
+endif()
+
+add_llvm_component_library(LLVMNativeCPUVecz
+  ${COMMON_SRCS}
+  LINK_COMPONENTS
+  NativeCPUPipeline
+  support
+  core
+  analysis
+  instcombine
+  aggressiveinstcombine
+  transformutils
+  scalaropts
+  ipo
+  passes
+  )
+
+target_include_directories(LLVMNativeCPUVecz
+  PUBLIC $<BUILD_INTERFACE:${VECZ_PUBLIC_INCLUDE_DIR}>
+  PRIVATE $<BUILD_INTERFACE:${VECZ_PRIVATE_INCLUDE_DIR}>
+)
+target_compile_options(LLVMNativeCPUVecz PRIVATE ${VECZ_COMPILE_OPTIONS})
+target_compile_definitions(LLVMNativeCPUVecz PRIVATE
+  ${VECZ_COMPILE_DEFINITIONS})
+
+# The vecz test tools are off by default; enabling them builds veczc and the
+# lit tests that are run through the check-sycl-vecz target.
+option(NATIVE_CPU_BUILD_VECZ_TEST_TOOLS "Build vecz test and tools" OFF)
+if(NATIVE_CPU_BUILD_VECZ_TEST_TOOLS)
+  add_subdirectory(tools)
+  add_subdirectory(test)
+endif()
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/CMakeLists.txt b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/CMakeLists.txt
new file mode 100644
index 0000000000000..b47f8f35b3df2
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(lit)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/CMakeLists.txt b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/CMakeLists.txt
new file mode 100644
index 0000000000000..7f67eb3a1a873
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/CMakeLists.txt
@@ -0,0 +1,26 @@
+configure_lit_site_cfg(
+  ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in
+  ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py
+  MAIN_CONFIG
+  ${CMAKE_CURRENT_SOURCE_DIR}/lit.cfg.py
+  PATHS
+  "CMAKE_OSX_SYSROOT"
+  "LLVM_SOURCE_DIR"
+  "LLVM_BINARY_DIR"
+  "LLVM_TOOLS_DIR"
+  "LLVM_LIBS_DIR"
+  "SHLIBDIR"
+  )
+
+# TODO: Consider adding to check-sycl if this is ever moved under llvm/test
+# Add a target to invoke tests via Ninja/make.
+add_lit_testsuite(check-sycl-vecz-tests "Running SYCL vecz lit tests"
+    "${CMAKE_CURRENT_BINARY_DIR}"
+
+    DEPENDS
+    veczc
+    FileCheck
+)
+
+add_custom_target(check-sycl-vecz)
+add_dependencies(check-sycl-vecz check-sycl-vecz-tests)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/lit.cfg.py b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/lit.cfg.py
new file mode 100644
index 0000000000000..0c0a2590b6274
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/lit.cfg.py
@@ -0,0 +1,37 @@
+# Copyright (C) Codeplay Software Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+# Exceptions; you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""Python configuration file for lit."""
+
+import os
+import lit.formats
+from lit.llvm import llvm_config
+
+
+# Name of the test suite.
+config.name = "LLVM"
+
+# File extensions for testing.
+config.suffixes = [".hlsl", ".ll"]
+
+# The test format used to interpret tests.
+config.test_format = lit.formats.ShTest(execute_external=False)
+
+# The root path where tests are located.
+config.test_source_root = os.path.dirname(__file__)
+
+llvm_config.with_environment(
+    "PATH", os.path.abspath(config.llvm_tools_dir), append_path=True
+)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/lit.site.cfg.py.in b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/lit.site.cfg.py.in
new file mode 100644
index 0000000000000..785ee42143601
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/lit.site.cfg.py.in
@@ -0,0 +1,22 @@
+"""Python configuration file for lit."""
+
+@LIT_SITE_CFG_IN_HEADER@
+
+import os
+from lit.llvm.subst import ToolSubst
+from lit.llvm import llvm_config
+
+config.test_exec_root = r"@CURRENT_BINARY_DIR@"
+
+# Paths to helper utilities
+config.tools = [ ToolSubst('veczc') ]
+
+config.targets = frozenset('@LLVM_TARGETS_TO_BUILD@'.split(';'))
+
+config.llvm_tools_dir = lit_config.substitute(path(r"@LLVM_TOOLS_DIR@"))
+
+import lit.llvm
+lit.llvm.initialize(lit_config, config)
+
+# Let the main config do the real work.
+lit_config.load_config(config, os.path.join('@CMAKE_CURRENT_SOURCE_DIR@', "lit.cfg.py"))
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_debug_info.ll
index 456414d1c22ff..2d953c5daa499 100644
--- a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_debug_info.ll
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_debug_info.ll
@@ -51,7 +51,7 @@ entry:
   call void @llvm.dbg.declare(metadata i64* %tid, metadata !14, metadata !29), !dbg !31
   %call = call i64 @__mux_get_global_id(i32 0) #3, !dbg !31
   store i64 %call, i64* %tid, align 8, !dbg !31
-; CHECK: #dbg_value(i32 {{undef|poison}}, [[DI_A:![0-9]+]], !DIExpression(),
+; CHECK: #dbg_value(i32 poison, [[DI_A:![0-9]+]], !DIExpression(),
 ; CHECK-SAME: [[A_LOC:![0-9]+]]
   call void @llvm.dbg.declare(metadata i32* %a, metadata !19, metadata !29), !dbg !32
   %0 = load i64, i64* %tid, align 8, !dbg !32
@@ -59,7 +59,7 @@ entry:
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %1, i64 %0, !dbg !32
   %2 = load i32, i32 addrspace(1)* %arrayidx, align 4, !dbg !32
   store i32 %2, i32* %a, align 4, !dbg !32
-; CHECK: #dbg_value(i32 {{undef|poison}}, [[DI_B:![0-9]+]], !DIExpression(),
+; CHECK: #dbg_value(i32 poison, [[DI_B:![0-9]+]], !DIExpression(),
 ; CHECK-SAME: [[B_LOC:![0-9]+]]
   call void @llvm.dbg.declare(metadata i32* %b, metadata !20, metadata !29), !dbg !33
   %3 = load i64, i64* %tid, align 8, !dbg !33
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/CMakeLists.txt b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/CMakeLists.txt
new file mode 100644
index 0000000000000..921204b382aa0
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/CMakeLists.txt
@@ -0,0 +1,14 @@
+# veczc links against every LLVM component plus the enabled targets; the LTO
+# and OptRemarks entries are removed from that list before linking.
+llvm_map_components_to_libnames(llvm_libs all ${LLVM_TARGETS_TO_BUILD})
+list(REMOVE_ITEM llvm_libs LTO OptRemarks)
+
+add_llvm_tool(veczc ${CMAKE_CURRENT_SOURCE_DIR}/source/veczc.cpp)
+target_compile_options(veczc PRIVATE ${VECZ_COMPILE_OPTIONS})
+target_compile_definitions(veczc PRIVATE ${VECZ_COMPILE_DEFINITIONS})
+target_include_directories(veczc PRIVATE
+  ${CMAKE_CURRENT_SOURCE_DIR}/include
+  ${CMAKE_CURRENT_SOURCE_DIR}/../../compiler_pipeline/include
+  ${CMAKE_CURRENT_SOURCE_DIR}/../../vecz/include
+)
+target_link_libraries(veczc PUBLIC ${llvm_libs})